## ----nomessages, echo = FALSE------------------------------------------------- # set some default options for chunks knitr::opts_chunk$set( warning = FALSE, # avoid warnings and messages in the output message = FALSE, collapse = TRUE, # collapse all output into a single block tidy = FALSE, # don't tidy our code-- assume we do it ourselves fig.height = 5, fig.width = 5 ) options(digits=4) # number of digits to display in output; can override with chunk option R.options=list(digits=) par(mar=c(3,3,1,1)+.1) set.seed(1234) # reproducibility ## ----load-packages------------------------------------------------------------ library(Lahman) library(ggplot2) library(dplyr) library(car) ## ----Batting-names------------------------------------------------------------ data("Batting", package="Lahman") # load the data str(Batting) # take a look at the structure of the complete data set, as it is ## ----Batting-filter----------------------------------------------------------- Batting <- Batting %>% select(yearID, AB, SO, HR) %>% # select the variables that we need group_by(yearID) %>% # group by year, so that each row is one year summarise_each(funs(sum)) # we want the sum of AB, HR, and SO in the other rows FullBatting<- Batting %>% # create a new variable that has SO rate and HR rate filter(yearID >= 1950) %>% # select the years from 1900+ mutate(SO_rate = (SO/AB)*100, HR_rate = (HR/AB)*100) #add SO rate and HR rate as percentages to our data frame some(FullBatting) # look at a set of random observations ## ----------------------------------------------------------------------------- dim(FullBatting) # show the dimensions of the data frame ## ----------------------------------------------------------------------------- sum(FullBatting$SO) # find the sum of strikeout column ## ----------------------------------------------------------------------------- mean(FullBatting$SO_rate) # find the mean of the strikeout rate column ## ----------------------------------------------------------------------------- sum(FullBatting$HR) # find the sum of home run column ## ----------------------------------------------------------------------------- mean(FullBatting$HR_rate) # find the mean of the home run rate column ## ----------------------------------------------------------------------------- corr <- cor.test(FullBatting$SO_rate, FullBatting$HR_rate) corr # find the correlation between strikeout rate and home run rate ## ----------------------------------------------------------------------------- Model_Totals <- lm(SO_rate~HR_rate, data=FullBatting) summary(Model_Totals) # look at the model totals ## ----------------------------------------------------------------------------- plot <- ggplot(FullBatting, aes(x= SO_rate, y= HR_rate))+ geom_point()+ xlab("Strikeout Rate") + ylab("Home Run Rate") + ggtitle("Relationship Between Strikeouts and Home Runs") plot + stat_smooth(method= "lm") ##stat_smooth fits the model and then we plot the linear regression model