Set Up

library(tidyverse)
## ── Attaching packages ───────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.1     ✔ dplyr   0.7.4
## ✔ tidyr   0.7.2     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0
## ── Conflicts ──────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Load the source data into R, starting with the player records in Master.csv
player_data <- readr::read_csv(file = "Master.csv")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   birthYear = col_integer(),
##   birthMonth = col_integer(),
##   birthDay = col_integer(),
##   deathYear = col_integer(),
##   deathMonth = col_integer(),
##   deathDay = col_integer(),
##   weight = col_integer(),
##   height = col_integer(),
##   debut = col_date(format = ""),
##   finalGame = col_date(format = "")
## )
## See spec(...) for full column specifications.
# Pitching records from Pitching.csv (column types are guessed by readr)
pitching_data <- readr::read_csv(file = "Pitching.csv")
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   playerID = col_character(),
##   teamID = col_character(),
##   lgID = col_character(),
##   BAOpp = col_double(),
##   ERA = col_double(),
##   IBB = col_character(),
##   SH = col_character(),
##   SF = col_character(),
##   GIDP = col_character()
## )
## See spec(...) for full column specifications.
# Salary records from Salaries.csv
salary_data <- readr::read_csv(file = "Salaries.csv")
## Parsed with column specification:
## cols(
##   yearID = col_integer(),
##   teamID = col_character(),
##   lgID = col_character(),
##   playerID = col_character(),
##   salary = col_integer()
## )
# Per-year inflation index from inflation.csv
inflation_index <- readr::read_csv(file = "inflation.csv")
## Parsed with column specification:
## cols(
##   year = col_integer(),
##   inflation2015 = col_double()
## )

Graph 1

# Convert yearID to a factor so that ggplot draws one box per year
pitching_data[["yearID"]] <- as.factor(pitching_data[["yearID"]])
# Box plot of the ERA distribution for each year
ggplot(data = pitching_data, mapping = aes(x = yearID, y = ERA)) +
  geom_boxplot()
## Warning: Removed 90 rows containing non-finite values (stat_boxplot).

Graph 2

# Group the pitching data by year and compute the ERA quartiles, median, and
# range for each year. Missing ERA values are dropped before each statistic.
summary_ERA <- pitching_data %>%
  group_by(yearID) %>%
  summarize(
    Q1 = quantile(ERA, .25, na.rm = TRUE),
    median = median(ERA, na.rm = TRUE),
    Q3 = quantile(ERA, .75, na.rm = TRUE),
    min = min(ERA, na.rm = TRUE),
    max = max(ERA, na.rm = TRUE)
  )

# Change yearID back to a numeric variable so it can sit on a continuous axis.
# Going through as.character() recovers the year itself; calling as.numeric()
# directly on a factor would return the level codes instead.
summary_ERA$yearID <- as.numeric(as.character(summary_ERA$yearID))

# Line graph of the change in median ERA over the years
ggplot(summary_ERA, aes(x = yearID, y = median)) +
  geom_line()

Graph 3

# The ribbon spans the first to third quartile of ERA for each year;
# the line on top of it traces the median ERA
ggplot(summary_ERA, aes(x = yearID)) +
  geom_ribbon(aes(ymin = Q1, ymax = Q3), fill = "lightgreen") +
  geom_line(aes(y = median), color = "darkblue")

Graph 4

# Filter pitching data to include only rows with at least 10 games (G >= 10)
pitching_data_10 <- filter(pitching_data, G >= 10)

# For each year, count the filtered rows and compute the proportion with an
# ERA of 3 or lower and the proportion with an ERA of 6 or higher.
# NOTE: `count` is the total number of rows in the year, including any with a
# missing ERA, so such rows count toward the denominator but toward neither
# numerator.
summary_ERA_10 <- pitching_data_10 %>%
  group_by(yearID) %>%
  summarize(
    count = n(),
    low_ERA = sum(ERA <= 3, na.rm = TRUE),
    median = median(ERA, na.rm = TRUE),
    high_ERA = sum(ERA >= 6, na.rm = TRUE),
    low_ERA_proportion = low_ERA / count,
    high_ERA_proportion = high_ERA / count
  )

# Make yearID numeric again; as.character() first so that a factor yields the
# year itself rather than its level code
summary_ERA_10$yearID <- as.numeric(as.character(summary_ERA_10$yearID))

# Graph: yearly proportion of pitchers with a low ERA vs. a high ERA
ggplot(summary_ERA_10, aes(x = yearID)) +
  geom_line(aes(y = low_ERA_proportion, color = "Low ERA")) +
  geom_line(aes(y = high_ERA_proportion, color = "High ERA")) +
  scale_color_manual(
    name = "ERA",
    values = c("Low ERA" = "darkblue", "High ERA" = "red"),
    # Pin the break order explicitly. Without `breaks`, the legend keys are
    # ordered alphabetically ("High ERA" first), so the positional `labels`
    # below would be attached to the wrong lines.
    breaks = c("Low ERA", "High ERA"),
    labels = c("3 or under", "6 or higher")
  ) +
  theme_classic() +
  labs(title = "Proportion of Pitchers (pitching at least 10 games) \n With Low and High ERAs by Year") +
  xlab("Year") +
  ylab("Proportion")

Graph 5

# Create a new variable flagging whether each player was born in the USA.
# Rows with a missing birthCountry get NA for usa_born.
player_data_2 <- mutate(
  player_data,
  usa_born = ifelse(birthCountry == "USA", "Born in USA", "Born outside USA")
)

# Join data: attach player attributes to every salary row with a matching
# playerID (salary rows without a match are dropped)
summary_salary_players <- inner_join(salary_data, player_data_2, by = "playerID")

# Create summary salary statistics for each year and birthplace group.
# na.rm must be spelled exactly: a misspelled argument is silently absorbed by
# median()'s `...` and missing salaries would then turn the median into NA.
summary_usa_born <- summarize(
  group_by(summary_salary_players, yearID, usa_born),
  median = median(salary, na.rm = TRUE),
  Q1 = quantile(salary, .25, na.rm = TRUE),
  Q3 = quantile(salary, .75, na.rm = TRUE)
)
                              
# Rename the first column of the inflation table to yearID so that it shares a
# key name with the salary summary
colnames(inflation_index)[1] <- "yearID"

# Attach the inflation multiplier to each year/birthplace summary row; years
# absent from the inflation table are dropped by the inner join
summary_salary_inner <- summary_usa_born %>%
  inner_join(inflation_index, by = "yearID")
# Force the multiplier for 2015 to exactly 1
# (presumably the index is expressed in 2015 dollars -- confirm against the data)
summary_salary_inner[summary_salary_inner$yearID == 2015, "inflation2015"] <- 1

# Scale the median and both quartiles by the inflation multiplier
summary_salary_players2 <- summary_salary_inner %>%
  mutate(
    median_inflation_adjusted = median * inflation2015,
    Q1_inflation_adjusted = Q1 * inflation2015,
    Q3_inflation_adjusted = Q3 * inflation2015
  )

# Graph: inflation-adjusted salary quartile band and median by birthplace group.
# The *_inflation_adjusted columns are plotted (not the raw Q1/Q3/median) so
# the figure matches its "Adjusted for Inflation" axis label.
ggplot(summary_salary_players2, aes(x = yearID)) +
  geom_ribbon(
    aes(
      ymin = Q1_inflation_adjusted,
      ymax = Q3_inflation_adjusted,
      fill = usa_born
    ),
    alpha = .4
  ) +
  geom_line(aes(y = median_inflation_adjusted, color = usa_born), size = 1.2) +
  scale_y_continuous(labels = scales::dollar) +
  scale_color_discrete(name = "Median Salary") +
  scale_fill_discrete(name = "Middle 50% of Earners") +
  theme_minimal() +
  labs(title = "Salaries of Middle 50% of Earners in Major League Baseball") +
  xlab("Year") +
  ylab("Annual Salary (Adjusted for Inflation)")

# 1. Join player data and salary data (inner join)
# 2. compute summary salary statistics for each year and birth status
#      summarize(group_by(country_data, yearID, usa_born))
#      median_salary = median(salary, na.rm=TRUE)
#      Q1_salary=
#      Q3_salary=
# 3.   adjust for inflation