# Read in Data
library(tidyverse)
library(knitr)
library(readxl)
library(zoo)
url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
covid = read_csv(url)
pop = PopulationEstimates <- read_excel("../data/PopulationEstimates.xls",
skip = 2)
Question 1
# Set up data by county/create a New Cases column
data = covid %>%
filter(state == "California") %>%
group_by(county) %>%
mutate(newCases = cases - lag(cases)) %>%
ungroup()
today = data %>% filter(date == max(date))
# Calculate total California Cases for end analysis
sum(today$cases)
sum(today$newCases)
# Make Tables - Cumulative/New Cases
most_new_cases = today %>%
slice_max(newCases, n = 5) %>%
select(county, newCases)
most_cumulative_cases = data %>%
filter(date == max(date)) %>%
slice_max(cases, n = 5) %>%
select(county, cases)
# Refine Tables - cumulative/new cases
library(kableExtra)
knitr::kable(most_new_cases,
caption = "California Counties: Most New Cases",
col.names = c("County", "New Cases"),
format.args = list(big.mark = ",")) %>%
kableExtra::kable_styling("striped", full_width = TRUE, font_size = 14)
California Counties: Most New Cases
County
|
New Cases
|
Los Angeles
|
1,042
|
Sacramento
|
394
|
San Diego
|
361
|
Santa Clara
|
289
|
Riverside
|
205
|
knitr::kable(most_cumulative_cases,
caption = "California Counties: Most Cumulative Cases",
col.names = c("County", "Cumulative Cases"),
format.args = list(big.mark = ",")) %>%
kableExtra::kable_styling("striped", full_width = TRUE, font_size = 14)
California Counties: Most Cumulative Cases
County
|
Cumulative Cases
|
Los Angeles
|
252,066
|
Riverside
|
55,073
|
Orange
|
51,758
|
San Bernardino
|
50,385
|
San Diego
|
42,032
|
# Join population to covid data/filter to include only last 14 days
perCap = pop %>%
select(fips = "FIPStxt", state = "State", pop2019 = "POP_ESTIMATE_2019") %>%
right_join(data, by = "fips") %>%
mutate(cumulative_cases_percapita = (cases/pop2019), new_cases_percapita = (newCases/pop2019)) %>%
filter(date >= max(date) - 13)
# Make Percapita Tables
cum_per_cap = perCap %>%
filter(date == max(date)) %>%
slice_max(cases, n = 5) %>%
select(county, cumulative_cases_percapita)
new_per_cap = perCap %>%
filter(date == max(date)) %>%
slice_max(cases, n = 5) %>%
select(county, new_cases_percapita)
# Refine Tables - Per Capita
knitr::kable(cum_per_cap,
caption = "California Counties: Most Cumulative Cases Per Capita",
col.names = c("County", "Cumulative Cases Per Capita"),
format.args = list(big.mark = ",")) %>%
kableExtra::kable_styling("striped", full_width = TRUE, font_size = 14)
California Counties: Most Cumulative Cases Per Capita
County
|
Cumulative Cases Per Capita
|
Los Angeles
|
0.0251084
|
Riverside
|
0.0222918
|
Orange
|
0.0162982
|
San Bernardino
|
0.0231115
|
San Diego
|
0.0125907
|
knitr::kable(new_per_cap,
caption = "California Counties: Most New Cases Per Capita",
col.names = c("County", "New Cases Per Capita"),
format.args = list(big.mark = ",")) %>%
kableExtra::kable_styling("striped", full_width = TRUE, font_size = 14)
California Counties: Most New Cases Per Capita
County
|
New Cases Per Capita
|
Los Angeles
|
0.0001038
|
Riverside
|
0.0000830
|
Orange
|
0.0000501
|
San Bernardino
|
0.0000803
|
San Diego
|
0.0001081
|
# Calculate number of new cases in last 14 days per 100000 people/number of safe counties
safe = perCap %>%
group_by(county, pop2019) %>%
summarise(total_newCases = sum(newCases, na.rm = TRUE)) %>%
mutate(total_safecounties = total_newCases/(pop2019/100000)) %>%
filter(total_safecounties < 100)
Results:
Most recent counts show that in California, there are 7.5712510^{5} total COVID-19 cases, 4108 new cases, and 17 safe counties, where safe counties are those with fewer than 100 new cases per 100,000 residents over the past 14 days.
Question 2
# Facet plot 1
data2 = covid %>%
filter(state %in% c("New York","California", "Louisiana", "Florida")) %>%
group_by(state, date) %>%
summarize(cases = sum(cases)) %>%
ungroup() %>%
group_by(state) %>%
mutate(newCases = cases - lag(cases),
roll7 = zoo::rollmean(newCases, 7, fill = NA, align = 'right')) %>%
ungroup() %>%
filter(newCases > 0)
ggplot(data = data2, aes(x = date)) +
geom_col(aes(y = newCases), col = NA, fill = "red") +
geom_line(aes(y = roll7), col = "darkred", size = 1) +
ggthemes::theme_wsj() +
labs(title = paste("New Cases")) +
theme(plot.background = element_rect(fill = "white"),
panel.background = element_rect(fill = "white"),
plot.title = element_text(size = 14, face = 'bold')) +
facet_grid(~state, scales = "free_y")
# Facet plot 2
data3 = pop %>%
select(pop2019 = "POP_ESTIMATE_2019", areaname = "Area_Name") %>%
right_join(data2, by = c("areaname" = "state")) %>%
mutate(cases_per_capita = (newCases/pop2019), roll7 = zoo::rollmean(cases_per_capita, 7, fill = NA, align = 'right')) %>%
ungroup() %>%
filter(cases_per_capita > 0)
ggplot(data = data3, aes(x = date)) +
geom_col(aes(y = cases_per_capita), col = NA, fill = "lightblue") +
geom_line(aes(y = roll7), col = "blue", size = 1) +
ggthemes::theme_wsj() +
labs(title = paste("New Cases Per Capita")) +
theme(plot.background = element_rect(fill = "white"),
panel.background = element_rect(fill = "white"),
plot.title = element_text(size = 14, face = 'bold')) +
facet_grid(~areaname, scales = "free_y")
These two graphs displaying the total new cases and the total new cases per capita in California, Florida, Louisiana, and New York show the importance of scale in data analysis. The first graph makes it seem as though California, Florida, and New York all reach relatively close maximums and that Louisiana has significantly fewer cases than the other states. After analyzing the per capita data in the second graph, however, these assumptions prove to be false. Louisiana’s new cases per capita are comparable to those of Florida and New York and California has about half the number of new cases per capita as the other states.