Set Up
library(tidyverse)
## ── Attaching packages ───────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.1 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ──────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Read the player master table (one row per player) from disk
player_data <- read_csv(file = "Master.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## birthYear = col_integer(),
## birthMonth = col_integer(),
## birthDay = col_integer(),
## deathYear = col_integer(),
## deathMonth = col_integer(),
## deathDay = col_integer(),
## weight = col_integer(),
## height = col_integer(),
## debut = col_date(format = ""),
## finalGame = col_date(format = "")
## )
## See spec(...) for full column specifications.
# Read the pitching statistics table from disk
pitching_data <- read_csv(file = "Pitching.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## playerID = col_character(),
## teamID = col_character(),
## lgID = col_character(),
## BAOpp = col_double(),
## ERA = col_double(),
## IBB = col_character(),
## SH = col_character(),
## SF = col_character(),
## GIDP = col_character()
## )
## See spec(...) for full column specifications.
# Read the salary table from disk
salary_data <- read_csv(file = "Salaries.csv")
## Parsed with column specification:
## cols(
## yearID = col_integer(),
## teamID = col_character(),
## lgID = col_character(),
## playerID = col_character(),
## salary = col_integer()
## )
# Read the inflation table (columns: year, inflation2015) from disk
inflation_index <- read_csv(file = "inflation.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## inflation2015 = col_double()
## )
Graph 1
# Convert yearID to a factor so that each year gets its own box
pitching_data <- mutate(pitching_data, yearID = as.factor(yearID))
# Box plot of the ERA distribution for each year
ggplot(pitching_data, aes(x = yearID, y = ERA)) +
  geom_boxplot()
## Warning: Removed 90 rows containing non-finite values (stat_boxplot).
Graph 2
# Summarize ERA per year (first quartile, median, third quartile, minimum and
# maximum, ignoring missing ERA values), then turn yearID back into a number
# so it can be used as a continuous x axis.
summary_ERA <- pitching_data %>%
  group_by(yearID) %>%
  summarize(
    Q1 = quantile(ERA, 0.25, na.rm = TRUE),
    median = median(ERA, na.rm = TRUE),
    Q3 = quantile(ERA, 0.75, na.rm = TRUE),
    min = min(ERA, na.rm = TRUE),
    max = max(ERA, na.rm = TRUE)
  ) %>%
  mutate(yearID = as.numeric(as.character(yearID)))
# Line graph of the median ERA over time
ggplot(summary_ERA, aes(x = yearID, y = median)) +
  geom_line()
Graph 3
# Ribbon spanning the first to third ERA quartile for each year, with the
# median ERA drawn on top as a line
ggplot(summary_ERA, aes(x = yearID)) +
  geom_ribbon(aes(ymin = Q1, ymax = Q3), fill = "lightgreen") +
  geom_line(aes(y = median), color = "darkblue")
Graph 4
# Keep only the pitching rows with at least 10 games pitched (G >= 10)
pitching_data_10 <- filter(pitching_data, G >= 10)
# Per year: the number of rows, how many have an ERA of 3 or less, the median
# ERA, how many have an ERA of 6 or more, and the low/high counts as a share of
# all rows in that year. Rows with a missing ERA are skipped by the sums but
# are still included in `count`.
summary_ERA_10 <- pitching_data_10 %>%
  group_by(yearID) %>%
  summarize(
    count = n(),
    low_ERA = sum(ERA <= 3, na.rm = TRUE),
    median = median(ERA, na.rm = TRUE),
    high_ERA = sum(ERA >= 6, na.rm = TRUE),
    low_ERA_proportion = low_ERA / count,
    high_ERA_proportion = high_ERA / count
  )
# Make yearID numeric (it may be a factor at this point) for a continuous axis
summary_ERA_10$yearID <- as.numeric(as.character(summary_ERA_10$yearID))
# Line graph of both proportions over time.
# `breaks` is given explicitly because a discrete scale otherwise orders its
# breaks alphabetically ("High ERA", "Low ERA") and applies `labels` in that
# order, which would attach "3 or under" to the high-ERA line.
ggplot(summary_ERA_10, aes(x = yearID)) +
  geom_line(aes(y = low_ERA_proportion, color = "Low ERA")) +
  geom_line(aes(y = high_ERA_proportion, color = "High ERA")) +
  scale_color_manual(
    values = c("Low ERA" = "darkblue", "High ERA" = "red"),
    name = "ERA",
    breaks = c("Low ERA", "High ERA"),
    labels = c("3 or under", "6 or higher")
  ) +
  theme_classic() +
  labs(title = "Proportion of Pitchers (pitching at least 10 games) \n With Low and High ERAs by Year") +
  xlab("Year") +
  ylab("Proportion")
Graph 5
# Flag each player as born inside or outside the USA
# (a missing birthCountry yields NA)
player_data_2 <- mutate(
  player_data,
  usa_born = ifelse(birthCountry == "USA", "Born in USA", "Born outside USA")
)
# Attach the player attributes to every salary record
summary_salary_players <- inner_join(salary_data, player_data_2, by = "playerID")
# Median and quartiles of salary for each year and birthplace group.
# The argument must be spelled `na.rm`; a misspelled name is silently absorbed
# by `...` in median() and missing values would not be removed.
summary_usa_born <- summary_salary_players %>%
  group_by(yearID, usa_born) %>%
  summarize(
    median = median(salary, na.rm = TRUE),
    Q1 = quantile(salary, 0.25, na.rm = TRUE),
    Q3 = quantile(salary, 0.75, na.rm = TRUE)
  ) %>%
  ungroup()
# Rename the first column of the inflation table (year) so it matches the
# join key used by the salary summary
names(inflation_index)[1] <- "yearID"
# Attach the inflation factor to every year/birthplace row
summary_salary_inner <- inner_join(summary_usa_born, inflation_index, by = "yearID")
# Force the 2015 factor to 1 for any 2015 rows present after the join
summary_salary_inner[summary_salary_inner$yearID == 2015, "inflation2015"] <- 1
# Scale the salary statistics by the inflation factor
summary_salary_players2 <- mutate(
  summary_salary_inner,
  median_inflation_adjusted = median * inflation2015,
  Q1_inflation_adjusted = Q1 * inflation2015,
  Q3_inflation_adjusted = Q3 * inflation2015
)
# Ribbon between the adjusted quartiles with the adjusted median on top, one
# colour per birthplace group. The adjusted columns are plotted so the figure
# agrees with its "Adjusted for Inflation" axis label.
ggplot(summary_salary_players2, aes(x = yearID)) +
  geom_ribbon(
    aes(
      ymin = Q1_inflation_adjusted,
      ymax = Q3_inflation_adjusted,
      fill = usa_born
    ),
    alpha = 0.4
  ) +
  geom_line(aes(y = median_inflation_adjusted, color = usa_born), size = 1.2) +
  scale_y_continuous(labels = scales::dollar) +
  scale_color_discrete(name = "Median Salary") +
  scale_fill_discrete(name = "Middle 50% of Earners") +
  theme_minimal() +
  labs(title = "Salaries of Middle 50% of Earners in Major League Baseball") +
  xlab("Year") +
  ylab("Annual Salary (Adjusted for Inflation)")
# 1. Join player data & salary data (inner join)
# 2. compute summary salary statistics for each year and birth status
# summarize(group_by(country_data, yearID, usa_born))
# median_salary = median(salary, na.rm=TRUE)
# Q1_salary=
# Q3_salary=
# 3. adjust inflation