--- title: "NFL Scrape Test Icing Kickers" author: "Sam Silva" date: "11/20/2016" output: html_document: default word_document: default --- ```{r, echo = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.path = "README-", cache = TRUE, tidy = TRUE ) require(pacman) ``` ```{r loadpacs, echo=TRUE} # Must install the devtools package using the below commented out code # install.packages("devtools") p_load(devtools) #devtools::install_github(repo = "maksimhorowitz/nflscrapR") # Load the package p_load(nflscrapR) ``` Get the game information for all NFL games from 2009 to 2015. NFL.com doesn't want to serve 2007 and 2008. Change the `` to ``` at the beginning and end of the chunk to make it executable. The chunk can be made non-executable once the dataframe has been created and saved for reloading. ```{r grabgames, echo=TRUE} # NFL.COM seems not to want to serve years 2007 and 2008 years <- c(2009:2015) for (i in 1:length(years)){ print(years[i]) assign(paste0("gameids", years[i]), extracting_gameids(years[i], playoffs = FALSE)) } ``` Now grab all the play information for all of the games that were found above. Change the `` to ``` at the beginning and end of the chunk to make it executable. The chunk can be made non-executable once the dataframe has been created and saved for reloading. ```{r, getplays, echo=TRUE} years <- c(2009:2015) rm(plays) print(years[1]) gameids <- eval(as.symbol(paste0("gameids", years[1]))) plays <- game_play_by_play(gameids[1]) for(i in 2:length(gameids)){ tempplays <- game_play_by_play(gameids[i]) plays <- rbind(plays, tempplays) rm(tempplays) print(c(years[1], paste(i/length(gameids)*100,"%"))) } for(j in 2:length(years)){ print(years[j]) gameids <- eval(as.symbol(paste0("gameids", years[j]))) for(i in 1:length(gameids)){ tempplays <- game_play_by_play(gameids[i]) plays <- rbind(plays, tempplays) rm(tempplays) print(c(years[j], paste(i/length(gameids)*100,"%"))) } } dim(plays) head(plays) write.csv(plays,"plays.csv") ``` We can create a new dataframe that contains "Timeout" and "Field Goal" events. Change the `` to ``` at the beginning and end of the chunk to make it executable. The chunk can be made non-executable once the dataframe has been created and saved for reloading. ```{r subsetFGandTO, echo=TRUE} table(plays$PlayType) FGsTOs <- subset(plays, PlayType %in% c("Timeout", "Field Goal")) # FGsTOs <- plays[plays$PlayType %in% c("Timeout", "Field Goal"),] table(FGsTOs$PlayType) write.csv(FGsTOs,"FGsTOs.csv") ``` Determine if the kicker was iced and subset the data to get only field goals (FGs). ```{r subsetFG, echo=TRUE} p_load(dplyr) p_load(stringr) ### Comment out the next line if you are building the plays above ### Change the file location to reflect where it is stored #load("D:/Fall16/FYS/NFLScrape/PlayByPlay/FYSNFLproject20161120.RData") #load("/Users/samuelsilva/Desktop/FYSNFLproject20161120.Rdata") ### Using plays temp = plays temp$desc1 = toupper(lag(temp$desc,1)) temp$desc2 = toupper(lag(temp$desc,2)) temp$time1 = lag(temp$time,1) temp$time2 = lag(temp$time,2) temp$PlayType1 = lag(temp$PlayType,1) temp$PlayType2 = lag(temp$PlayType,2) temp$DefensiveTeam = toupper(temp$DefensiveTeam) temp$Iced1 = rep(NA,nrow(temp)) temp$Iced1 = (temp$PlayType1=="Timeout" & temp$PlayType=="Field Goal") & (temp$time1==temp$time) & !is.na(str_locate(temp$desc1,temp$DefensiveTeam)[,1]) table(temp$Iced1) temp$Iced2 = (temp$PlayType2=="Timeout" & temp$PlayType=="Field Goal") & (temp$time2==temp$time) & !is.na(str_locate(temp$desc2,temp$DefensiveTeam)[,2]) temp$Iced = temp$Iced1 | temp$Iced2 xtabs(~Iced2, data=temp) xtabs(~Iced1+Iced2, data=temp) xtabs(~FieldGoalResult+(Iced1 | Iced2), data=temp) temp$obsNo = 1:nrow(temp) FGs = subset(temp, temp$PlayType=="Field Goal") FGs$FieldGoalDistance = as.numeric(FGs$FieldGoalDistance) FGs$FieldGoalGood = factor(FGs$FieldGoalResult=="Good") FGs$Iced = factor(FGs$Iced) FGs$Iced1 = factor(FGs$Iced1) FGs$Iced2 = factor(FGs$Iced2) xtabs(~FieldGoalGood+Iced, data=FGs) # Recode TRUE/FALSE as No/Yes FGs$Iced=factor(FGs$Iced, levels = c(FALSE,TRUE), labels = c("No","Yes")) FGs$FieldGoalGood=factor(FGs$FieldGoalGood, levels = c(FALSE,TRUE), labels = c("No","Yes")) #write.csv(FGs,"d:/fall16/fys/nflscrape/playbyplay/FGs.csv") # Remove temp since it is a bigger version of plays with subseting variables rm(temp) save.image("FYSNFLproject20161216.RData") ``` Add kicker ```{r addkicker, echo=TRUE} p_load(stringr) #tmpx = FGs[c(1,7,15),"desc"] getkicker <- function(x=tmpx){ n <- length(x) kicker <- rep(NA,n) #print(paste("kicker=",kicker)) for (i in 1:n){ #print(paste("i=",i," of ",n)) tmp <- strsplit(x[i], split="formation) ") tmpn <- length(tmp[[1]]) #print(paste("tmp=", tmp)) #print(paste("tmpn=",tmpn)) if (tmpn==1){ tmpk <- strsplit(tmp[[1]][1], split=") ") #print(paste("tmpk1=",tmpk)) tmpk <- strsplit(tmpk[[1]][2],split=" ") #print(paste("tmpk2=",tmpk)) kicker[i] <- tmpk[[1]][1] #print(paste("kicker=",kicker)) }else{ tmp <- strsplit(tmp[[1]][2], split=" ") #print(paste("tmp=",tmp)) kicker[i] <- tmp[[1]][1] #print(paste("kicker=",kicker)) } # else } # for return(str_trim(chartr(",."," ",kicker),"both")) } #getkicker #getkicker(tmpx) FGs$kicker = getkicker(FGs$desc) FGs$A.Vinatieri = factor(FGs$kicker=="A Vinatieri",levels = c(FALSE,TRUE),labels = c("No","Yes")) FGs$D.Bailey = factor(FGs$kicker=="D Bailey",levels = c(FALSE,TRUE),labels = c("No","Yes")) FGs$M.Crosby = factor(FGs$kicker=="M Crosby",levels = c(FALSE,TRUE),labels = c("No","Yes")) xtabs(~kicker+M.Crosby, data=FGs) ``` Load packages needed to fit the logist regression models (lrm) and to generate predicted probability plots and nomograms. ```{r loadpacks, echo=TRUE} # Load a few packages p_load(Hmisc) p_load(xtable) p_load(lattice) p_load(rms) ### Modern R replacement for the Design package ``` ## Fitting Logistic Models ### Distance The method of fitting models by finding the observed log-odds that is demonstrated above is tedious. A model to test for the difference in odds of **FieldGoalGood** = "Y" as determined by **FieldGoalDistance** may be fitted using the **lrm** function. ```{r lrmdistance, echo=TRUE} # Define the data.frame removing variables that won't be used. FG = FGs[,c(1:19, 49:50, 62:65, 67:ncol(FGs))] dd = datadist(FG) options(datadist="dd") if ("FG" %in% search()) detach(FG) attach(FG) # Fit a logistic model of Hit as a function of Distance FG.lrm.dist=lrm(FieldGoalGood~FieldGoalDistance, x=TRUE, y=TRUE) FG.lrm.dist # Get a little information about the quality of the model anova(FG.lrm.dist) ``` Since the coefficient of **FieldGoalDistance** is negative, we see that the (log) odds of making a shot (**FieldGoalGood** = "Y") decreases as **FieldGoalDistance** increases. This can most easily be seen using the odds ratios. Comparisons against the minimum distance show the effect of moving farther back. E.g. a FG from 60 yards is about 0.015 (0.010, 0.020) as likely to be made as one taken from 18 yrds. ```{r oddsdist, echo=TRUE} summary(FG.lrm.dist) summary(FieldGoalDistance, na.rm=TRUE) min(FieldGoalDistance, na.rm=TRUE) summary(FG.lrm.dist, FieldGoalDistance=c(min(FieldGoalDistance, na.rm=TRUE),30)) summary(FG.lrm.dist, FieldGoalDistance=c(min(FieldGoalDistance, na.rm=TRUE),40)) summary(FG.lrm.dist, FieldGoalDistance=c(min(FieldGoalDistance, na.rm=TRUE),50)) summary(FG.lrm.dist, FieldGoalDistance=c(min(FieldGoalDistance, na.rm=TRUE),60)) ``` The probability of making the FG at different distances may be plotted. ```{r plotdist, echo=TRUE} print(plot(Predict(FG.lrm.dist, fun=plogis), ylab="Probability of Making a Kick", xlab="Yards")) ``` We see that as the kick is taken from farther back, the probability of their making the FG decreases. A nomogram may be helpful in estimating these probabilities. ```{r nomogramdist, echo=TRUE} nom = nomogram(FG.lrm.dist, fun=plogis) print(plot(nom)) ``` ### Icing A model to test for the difference in odds of **FieldGoalGood** = "Y" as determined by **Iced** may be fitted using the **lrm** function. ```{r lrmhand, echo=TRUE} xtabs(~Iced, data=FGs) FG = FGs[,c(1:19, 49:50, 62:65, 67:ncol(FGs))] dd = datadist(FG) options(datadist="dd") if ("FG" %in% search()) detach(FG) attach(FG) FG.lrm.Iced=lrm(FieldGoalGood~Iced, x=TRUE, y=TRUE) FG.lrm.Iced anova(FG.lrm.Iced) summary(FG.lrm.Iced, Iced=c("No")) summary(FG.lrm.Iced, Iced=c("Yes")) plot(Predict(FG.lrm.Iced, fun=plogis), ylab="Probability of Making a Kick") nom = nomogram(FG.lrm.Iced, fun=plogis) plot(nom) ``` ### Kicker ```{r lrmkicker, echo=TRUE} ### Code to the next comment subsets the data for kickers with ge 20 attempts ### who were iced more than 4 times kickertable = table(FGs$kicker) kickertable FGs$kickerge20 = FGs$kicker %in% names(kickertable[kickertable >= 20]) kickericedtable = table(FGs$kicker, FGs$Iced) kickericedtable FGs$kickericed = FGs$kicker %in% dimnames(kickericedtable[kickericedtable[,2] > 4,])[[1]] xtabs(~kickericed+Iced, data=FGs) FG = subset(FGs, FGs$kickericed) xtabs(~kicker+Iced, data=FG) ### End of subsetting # Refit distance to see if things changed # Define the data.frame using subsetted data.frame removing variables that # won't be used. FG = FG[,c(1:19, 49:50, 62:65, 67:ncol(FG))] dd = datadist(FG) options(datadist="dd") while("FG" %in% search()) { detach(FG) } attach(FG) # Fit a logistic model of Hit as a function of Distance FG.lrm.dist=lrm(FieldGoalGood~FieldGoalDistance, x=TRUE, y=TRUE) FG.lrm.dist # Get a little information about the quality of the model anova(FG.lrm.dist) # Odds ratios summary(FG.lrm.dist) summary(FieldGoalDistance, na.rm=TRUE) min(FieldGoalDistance, na.rm=TRUE) summary(FG.lrm.dist, FieldGoalDistance=c(min(FieldGoalDistance, na.rm=TRUE),30)) summary(FG.lrm.dist, FieldGoalDistance=c(min(FieldGoalDistance, na.rm=TRUE),40)) summary(FG.lrm.dist, FieldGoalDistance=c(min(FieldGoalDistance, na.rm=TRUE),50)) summary(FG.lrm.dist, FieldGoalDistance=c(min(FieldGoalDistance, na.rm=TRUE),60)) print(plot(Predict(FG.lrm.dist, fun=plogis), ylab="Probability of Making a Kick", xlab="Yards")) # Nomogram nom = nomogram(FG.lrm.dist, fun=plogis) print(plot(nom)) # Refit Iced model FG.lrm.Iced=lrm(FieldGoalGood~Iced, x=TRUE, y=TRUE) FG.lrm.Iced anova(FG.lrm.Iced) summary(FG.lrm.Iced, Iced=c("No")) summary(FG.lrm.Iced, Iced=c("Yes")) plot(Predict(FG.lrm.Iced, fun=plogis), ylab="Probability of Making a Kick") nom = nomogram(FG.lrm.Iced, fun=plogis) plot(nom) # Fit kicker model FG.lrm.kicker=lrm(FieldGoalGood~kicker, x=TRUE, y=TRUE) FG.lrm.kicker anova(FG.lrm.kicker) summary(FG.lrm.kicker, kicker=c("A Vinatieri")) summary(FG.lrm.kicker, kicker=c("M Crosby")) plot(Predict(FG.lrm.kicker, fun=plogis), ylab="Probability of Making a Kick") nom = nomogram(FG.lrm.kicker, fun=plogis) plot(nom) # Fit kicker*Iced model FG.lrm.kickerIced=lrm(FieldGoalGood~kicker*Iced, x=TRUE, y=TRUE) FG.lrm.kickerIced anova(FG.lrm.kickerIced) summary(FG.lrm.kickerIced, kicker=c("A Vinatieri"), Iced=c("No")) plot(Predict(FG.lrm.kickerIced, fun=plogis, kicker=c("A Vinatieri","M Crosby"), Iced=c("No","Yes")), ylab="Probability of Making a Kick") nom = nomogram(FG.lrm.kickerIced, fun=plogis) plot(nom) # Fit kicker*FieldGoalDistance model FG.lrm.kickerDist=lrm(FieldGoalGood~kicker*FieldGoalDistance, x=TRUE, y=TRUE) FG.lrm.kickerDist anova(FG.lrm.kickerDist) summary(FG.lrm.kickerDist, kicker=c("A Vinatieri")) plot(Predict(FG.lrm.kickerDist, fun=plogis, kicker=c("A Vinatieri","M Crosby")), ylab="Probability of Making a Kick") nom = nomogram(FG.lrm.kickerDist, fun=plogis) plot(nom) # Fit Iced*FieldGoalDistance model FG.lrm.IcedDist=lrm(FieldGoalGood~Iced*FieldGoalDistance, x=TRUE, y=TRUE) FG.lrm.IcedDist anova(FG.lrm.IcedDist) summary(FG.lrm.IcedDist, Iced=c("No")) plot(Predict(FG.lrm.IcedDist, fun=plogis), ylab="Probability of Making a Kick") nom = nomogram(FG.lrm.IcedDist, fun=plogis) plot(nom) ```