Load the Data

library(tidyverse)
#> ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
#> ✔ ggplot2 3.3.3     ✔ purrr   0.3.4
#> ✔ tibble  3.1.1     ✔ dplyr   1.0.5
#> ✔ tidyr   1.1.3     ✔ stringr 1.4.0
#> ✔ readr   1.4.0     ✔ forcats 0.5.1
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
library(covdata)
#> 
#> Attaching package: 'covdata'
#> The following object is masked from 'package:datasets':
#> 
#>     uspop
library(ggrepel)

Data for the United States come from a variety of sources:

COVID Tracking Project data

covus
#> # A tibble: 664,960 x 7
#>    date       state fips  data_quality_gra… measure       count measure_label   
#>    <date>     <chr> <chr> <lgl>             <chr>         <dbl> <chr>           
#>  1 2021-03-07 AK    02    NA                positive      56886 Positive Tests  
#>  2 2021-03-07 AK    02    NA                probable_cas…    NA Probable Cases  
#>  3 2021-03-07 AK    02    NA                negative         NA Negative Tests  
#>  4 2021-03-07 AK    02    NA                pending          NA Pending Tests   
#>  5 2021-03-07 AK    02    NA                hospitalized…    33 Currently Hospi…
#>  6 2021-03-07 AK    02    NA                hospitalized…  1293 Cumulative Hosp…
#>  7 2021-03-07 AK    02    NA                in_icu_curre…    NA Currently in ICU
#>  8 2021-03-07 AK    02    NA                in_icu_cumul…    NA Cumulative in I…
#>  9 2021-03-07 AK    02    NA                on_ventilato…     2 Currently on Ve…
#> 10 2021-03-07 AK    02    NA                on_ventilato…    NA Cumulative on V…
#> # … with 664,950 more rows

Draw a log-linear graph of cumulative reported US cases.

## Which n states are leading the count of positive cases or deaths?
top_n_states <- function(data, n = 5, measure = c("positive", "death")) {
  meas <- match.arg(measure)
  data %>%
  group_by(state) %>%
  filter(measure == meas, date == max(date)) %>%
  drop_na() %>%
  ungroup() %>%
  top_n(n, wt = count) %>%
  pull(state)
}

state_cols <- c("gray70", "#195F90FF", "#D76500FF", "#238023FF", "#AB1F20FF", "#7747A3FF", 
                "#70453CFF", "#D73EA8FF", "#666666FF", "#96971BFF", "#1298A6FF", "#6F9BD6FF", 
                "#FF952DFF", "#66CF51FF", "#FF4945FF", "#A07DBAFF", "#AC7368FF", "#EF69A2FF", 
                "#9F9F9FFF", "#CACA56FF", "#61C3D5FF")

covus %>%
  group_by(state) %>%
  mutate(core = case_when(state %nin% top_n_states(covus) ~ "",
                          TRUE ~ state),
         end_label = ifelse(date == max(date), core, NA)) %>%
  arrange(date) %>%
  filter(measure == "positive", date > "2020-03-09") %>%
  ggplot(aes(x = date, y = count, group = state, color = core, label = end_label)) + 
  geom_line(size = 0.5) + 
  geom_text_repel(segment.color = NA, nudge_x = 0.2, nudge_y = 0.1) + 
  scale_color_manual(values = state_cols) + 
  scale_x_date(date_breaks = "1 month", date_labels = "%b" ) + 
  scale_y_continuous(trans = "log2",
                     labels = scales::comma_format(accuracy = 1),
                     breaks = 2^c(seq(1, 22, 1))) +
  guides(color = FALSE) + 
  labs(title = "COVID-19 Cumulative Recorded Cases by US State",
       subtitle = paste("Data as of", format(max(covus$date), "%A, %B %e, %Y")),
       x = "Date", y = "Count of Cases (log 2 scale)", 
       caption = "Data: COVID Tracking Project, http://covidtracking.com | Graph: @kjhealy") + 
  theme_minimal()
#> Warning: Transformation introduced infinite values in continuous y-axis

#> Warning: Transformation introduced infinite values in continuous y-axis
#> Warning: Removed 22 row(s) containing missing values (geom_path).
#> Warning: Removed 20242 rows containing missing values (geom_text_repel).

Calculating daily counts

The COVID Tracking Project reports cumulative counts for key measures such as positive tests and deaths. For example, for New York State:


measures <- c("positive", "negative", "death")

covus %>%
  filter(measure %in% measures, state == "NY") %>%
  select(date, state, measure, count) %>%
  pivot_wider(names_from = measure, values_from = count)
#> # A tibble: 371 x 5
#>    date       state positive negative death
#>    <date>     <chr>    <dbl>    <dbl> <dbl>
#>  1 2021-03-07 NY     1681169       NA 39029
#>  2 2021-03-06 NY     1674380       NA 38970
#>  3 2021-03-05 NY     1666733       NA 38891
#>  4 2021-03-04 NY     1657777       NA 38796
#>  5 2021-03-03 NY     1650184       NA 38735
#>  6 2021-03-02 NY     1642480       NA 38660
#>  7 2021-03-01 NY     1636680       NA 38577
#>  8 2021-02-28 NY     1630445       NA 38497
#>  9 2021-02-27 NY     1622865       NA 38407
#> 10 2021-02-26 NY     1614724       NA 38321
#> # … with 361 more rows

To calculate daily counts from these cumulative measures, use lag().


covus %>%
  filter(measure %in% measures, state == "NY") %>%
  select(date, state, measure, count) %>%
  pivot_wider(names_from = measure, values_from = count) %>%
  mutate(across(positive:death, ~.x - lag(.x, order_by = date), 
                .names = "daily_{col}"))
#> # A tibble: 371 x 8
#>    date       state positive negative death daily_positive daily_negative
#>    <date>     <chr>    <dbl>    <dbl> <dbl>          <dbl>          <dbl>
#>  1 2021-03-07 NY     1681169       NA 39029           6789             NA
#>  2 2021-03-06 NY     1674380       NA 38970           7647             NA
#>  3 2021-03-05 NY     1666733       NA 38891           8956             NA
#>  4 2021-03-04 NY     1657777       NA 38796           7593             NA
#>  5 2021-03-03 NY     1650184       NA 38735           7704             NA
#>  6 2021-03-02 NY     1642480       NA 38660           5800             NA
#>  7 2021-03-01 NY     1636680       NA 38577           6235             NA
#>  8 2021-02-28 NY     1630445       NA 38497           7580             NA
#>  9 2021-02-27 NY     1622865       NA 38407           8141             NA
#> 10 2021-02-26 NY     1614724       NA 38321           8204             NA
#> # … with 361 more rows, and 1 more variable: daily_death <dbl>

Draw a graph of the weekly rolling average death rate, by state


state_pops <- uspop %>%
  filter(sex_id == "totsex", hisp_id == "tothisp") %>%
  select(state_abbr, statefips, pop, state) %>%
  rename(name = state, 
         state = state_abbr, fips = statefips) %>%
  mutate(state = replace(state, fips == "11", "DC"))

## Using a convenience function to do something similar
## to the lambda version above
get_daily_count <- function(count, date){
  count - lag(count, order_by = date)
}


covus %>%
  filter(measure == "death", state %in% unique(state_pops$state)) %>%
  group_by(state) %>%
  mutate(
    deaths_daily = get_daily_count(count, date), 
    deaths7 = slider::slide_dbl(deaths_daily, mean, .before = 7, .after = 0, na.rm = TRUE)) %>%
  left_join(state_pops) %>%
  filter(date > lubridate::ymd("2020-03-15")) %>%
  ggplot(mapping = aes(x = date, y = (deaths7/pop)*1e5)) + 
  geom_line(size = 0.5) + 
  scale_y_continuous(labels = scales::comma_format(accuracy = 1)) + 
  facet_wrap(~ name, ncol = 4) +
  labs(x = "Date", 
       y = "Deaths per 100,000 Population (Seven Day Rolling Average)", 
       title = "Average Death Rate from COVID-19: US States and Washington, DC", 
       subtitle = paste("COVID Tracking Project data as of", format(max(covus$date), "%A, %B %e, %Y")), 
       caption = "Kieran Healy @kjhealy / Data: https://www.covidtracking.com/") +
  theme_minimal()
#> Joining, by = c("state", "fips")

Draw a graph of the weekly rolling average death rate, by state, with free y-axes in the panels


covus %>%
  filter(measure == "death", state %in% unique(state_pops$state)) %>%
  group_by(state) %>%
  mutate(
    deaths_daily = get_daily_count(count, date), 
    deaths7 = slider::slide_dbl(deaths_daily, mean, .before = 7, .after = 0, na.rm = TRUE)) %>%
  left_join(state_pops) %>%
  filter(date > lubridate::ymd("2020-03-15")) %>%
  ggplot(mapping = aes(x = date, y = (deaths7/pop)*1e5)) + 
  geom_line(size = 0.5) + 
  facet_wrap(~ name, ncol = 4, scales = "free_y") +
  labs(x = "Date", 
       y = "Deaths per 100,000 Population (Seven Day Rolling Average)", 
       title = "Average Death Rate from COVID-19: US States and Washington, DC. Free scales.", 
       subtitle = paste("COVID Tracking Project data as of", format(max(covus$date), "%A, %B %e, %Y")), 
       caption = "Kieran Healy @kjhealy / Data: https://www.covidtracking.com/") +
  theme_minimal()
#> Joining, by = c("state", "fips")

Draw a graph of cumulative reported US deaths, aggregated to the national level

covus %>%
  filter(measure == "death") %>%
  group_by(date) %>%
  summarize(count = sum(count, na.rm = TRUE)) %>%
  ggplot(aes(x = date, y = count)) + 
  geom_line(size = 0.75) + 
  scale_x_date(date_breaks = "3 months", date_labels = "%b %Y" ) + 
  scale_y_continuous(labels = scales::comma, breaks = seq(0, 500000, 100000)) + 
  labs(title = "COVID-19 Cumulative Recorded Deaths in the United States",
       subtitle = paste("Data as of", format(max(covus$date), "%A, %B %e, %Y"), "Recorded counts underestimate total mortality."),
       x = "Date", y = "Count of Deaths", 
       caption = "Data: COVID Tracking Project, http://covidtracking.com | Graph: @kjhealy") + 
  theme_minimal()

State-Level and County-Level (Cumulative) Data from the New York Times

State-level table

nytcovstate
#> # A tibble: 23,169 x 5
#>    date       state      fips  cases deaths
#>    <date>     <chr>      <chr> <dbl>  <dbl>
#>  1 2020-01-21 Washington 53        1      0
#>  2 2020-01-22 Washington 53        1      0
#>  3 2020-01-23 Washington 53        1      0
#>  4 2020-01-24 Illinois   17        1      0
#>  5 2020-01-24 Washington 53        1      0
#>  6 2020-01-25 California 06        1      0
#>  7 2020-01-25 Illinois   17        1      0
#>  8 2020-01-25 Washington 53        1      0
#>  9 2020-01-26 Arizona    04        1      0
#> 10 2020-01-26 California 06        2      0
#> # … with 23,159 more rows

County-level table

nytcovcounty
#> # A tibble: 1,264,543 x 6
#>    date       county      state      fips  cases deaths
#>    <date>     <chr>       <chr>      <chr> <dbl>  <dbl>
#>  1 2020-01-21 Snohomish   Washington 53061     1      0
#>  2 2020-01-22 Snohomish   Washington 53061     1      0
#>  3 2020-01-23 Snohomish   Washington 53061     1      0
#>  4 2020-01-24 Cook        Illinois   17031     1      0
#>  5 2020-01-24 Snohomish   Washington 53061     1      0
#>  6 2020-01-25 Orange      California 06059     1      0
#>  7 2020-01-25 Cook        Illinois   17031     1      0
#>  8 2020-01-25 Snohomish   Washington 53061     1      0
#>  9 2020-01-26 Maricopa    Arizona    04013     1      0
#> 10 2020-01-26 Los Angeles California 06037     1      0
#> # … with 1,264,533 more rows

Draw a log-linear graph of cumulative US cases by county.

nytcovcounty %>%
  mutate(uniq_name = paste(county, state)) %>% # Can't use FIPS because of how the NYT bundled cities
  group_by(uniq_name) %>%
  mutate(days_elapsed = date - min(date)) %>%
  ggplot(aes(x = days_elapsed, y = cases, group = uniq_name)) + 
  geom_line(size = 0.25, color = "gray20") + 
  scale_y_log10(labels = scales::label_number_si()) + 
  guides(color = FALSE) + 
  facet_wrap(~ state, ncol = 5) + 
  labs(title = "COVID-19 Cumulative Recorded Cases by US County",
       subtitle = paste("New York is bundled into a single area in this data.\nData as of", format(max(nytcovcounty$date), "%A, %B %e, %Y")),
       x = "Days since first case", y = "Count of Cases (log 10 scale)", 
       caption = "Data: The New York Times | Graph: @kjhealy") + 
  theme_minimal()
#> Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
#> Warning: Transformation introduced infinite values in continuous y-axis