Overview

The goal of this tutorial is to use multiple R(t) packages to fit to a simple, simulated outbreak with a known R(t), serial interval, generation interval, and reporting delay probability mass function (PMF). We will use the summrt package to generate standardized outputs, plot results, and quantitatively evaluate the accuracy in R(t) estimation.

Eventually we will expand this to additional vignettes that will fit to and evaluate more complex simulated and real datasets (this might require not evaluating just accuracy and reliability in R(t) but also nowcasting and forecasting expected observations and comparing to the true observations).

Load the simulated data

Load the dataset we will be fitting the R(t) estimation packages to in this vignette.

This will eventually use data from the rtdata package that will be documented and will describe the specific epidemiological use case this data scenario is meant to represent. In this case, we are going to fit to data on the number of reported cases, with a known discrete generation interval probability mass function (PMF) and reporting delay PMF that are also provided as data.

One of the goals of this evaluation using the summrt package is to standardize date/time indexing. All R(t) estimates from each package should be lined up such that the time vector returned alongside the estimates follows the same indexing.

# We will eventually replace this with package data specific to the dataset. 
# E.g. this might be our baseline infections, onset, report data
url <- "https://raw.githubusercontent.com/cmilando/RtEval/main/all_data.RDS"
all_data <- readRDS(url(url))

ggplot(all_data$rt) + 
  geom_line(aes(x = Day, y = Rt)) +
  geom_hline(aes(yintercept = 1), linetype = "dashed") + 
  xlab("Day") + ylab("R(t)") + 
  scale_y_continuous(trans = "log") + 
  theme_bw() + ggtitle("Simulated R(t)")

ggplot(all_data$generation) +
  geom_bar(aes(x = Day, y = Px), stat = "identity") + 
  xlab("Day") + ylab("Generation interval PMF") +
  theme_bw() 

ggplot(all_data$reporting_delay) +
  geom_bar(aes(x = Day, y = Px), stat = "identity") +
  xlab("Day") + ylab("Reporting delay PMF") +
  theme_bw()

ggplot(all_data$cases) +
  geom_bar(aes(x = day, y = daily_reports), stat = "identity") +
  xlab("Day") + ylab("Reported cases") + 
  theme_bw()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).

Use each package to estimate R(t)

Fit each of the packages to the dataset. See the package specific vignettes for more of a walk through for the decisions made for each package.

EpiNow2

incidence_df = data.frame(
  date = lubridate::make_date(2020, 3, 19) + 1:nrow(all_data$cases),
  confirm = as.vector(all_data$cases$daily_reports)
)

gi_pmf <- NonParametric(pmf = all_data$serial$Px)
sym_report_delay_pmf <- NonParametric(pmf = all_data$reporting_delay$Px)
incubation_pmf <- NonParametric(pmf = all_data$incubation$Px)

start.time <- Sys.time()
EpiNow2_obj <- epinow(
  data = incidence_df,
  generation_time = generation_time_opts(gi_pmf),
  delays = delay_opts(incubation_pmf + sym_report_delay_pmf),
  backcalc = backcalc_opts(prior = 'reports'),
  rt = rt_opts(rw = 1),
  stan = stan_opts(chains = 4, cores = 4)
)
## WARN [2024-09-26 16:31:00] epinow: There were 59 divergent transitions after warmup. See
## https://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## to find out why this is a problem and how to eliminate them. - 
## WARN [2024-09-26 16:31:00] epinow: There were 2 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 15. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded - 
## WARN [2024-09-26 16:31:00] epinow: Examine the pairs() plot to diagnose sampling problems
##  - 
## WARN [2024-09-26 16:31:01] epinow: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess -
end.time <- Sys.time()

run.time.epinow <- end.time-start.time

EpiEstim

# reporting data with dates indexed by integers starting at 0
incidence_df <- data.frame(
  dates = all_data$cases$day,
  I = as.vector(all_data$cases$daily_reports)
)
colnames(incidence_df) <- c('dates', 'I')

# Serial interval from data PMF-has leading 0 and is vector of probabilities
si_distr <- as.matrix(all_data$serial$Px)
if (all_data$serial$Day[1] == 1) si_distr <- c(0, si_distr)
si_distr
##  [1] 0.0000 0.0610 0.1540 0.2198 0.2178 0.1536 0.1122 0.0486 0.0224 0.0078
## [11] 0.0022 0.0004 0.0002
# Estimate R DAILY
start.time <- Sys.time()
EpiEstim_obj <- EpiEstim::estimate_R(
  incid = incidence_df,
  method = "non_parametric_si",
  config = make_config(list(
    si_distr = si_distr,
    t_start = 2:nrow(incidence_df),
    t_end = 2:nrow(incidence_df)
  )),
  backimputation_window = 10
)
end.time <- Sys.time()

run.time.epiestim <- end.time-start.time

rtestim

rr <- 2:nrow(all_data$cases)
start.time <- Sys.time()
rtestim_obj <- cv_estimate_rt(
  observed_counts = all_data$cases$daily_reports[rr],
  x = all_data$cases$day[rr],
  delay_distn = all_data$serial$Px
)
end.time <- Sys.time()
run.time.rtestim <- end.time-start.time

EpiLPS

si_spec <- Idist(probs = all_data$serial$Px)
incidence = all_data$cases$daily_reports
which(is.na(incidence))
## [1] 1
incidence[1] <- 0
start.time <- Sys.time()
EpiLPS_obj <- estimR(incidence = incidence, si = si_spec$pvec)
end.time <- Sys.time()
run.time.epilps <- end.time-start.time

Call the SummRt package to standardize the outputs

std_EpiEstim <- summarize_rtestimate(EpiEstim_obj)
std_EpiNow2 <- summarize_rtestimate(EpiNow2_obj)
std_rtestim <- summarize_rtestimate(rtestim_obj)
std_EpiLPS <- summarize_rtestimate(EpiLPS_obj)

# Put them all together into one dataframe
convert_to_df <- function(output_summrt){
  df <- tibble::tibble(output_summrt$estimates) |>
  dplyr::mutate(package = output_summrt$package)
}

df <- tibble::tibble() |>
  bind_rows(convert_to_df(std_EpiNow2)) |>
  bind_rows(convert_to_df(std_rtestim)) |>
  bind_rows(convert_to_df(std_EpiLPS)) |>
  bind_rows(convert_to_df(std_EpiEstim)) |>
  dplyr::left_join(
    all_data$rt,
    by = c("date" = "Day")
  )

Plot outputs

We’re going to make a plot of the four R(t) estimates that result from the summRt output standardization, overlaid onto one another. Eventually we will write functionality to combine these things nicely and plot them a few different ways. For now will just show how to do this via accessing the elements directly.

ggplot(data = df) +
    geom_line(aes(x = date,
                y = Rt),
            linetype = "dashed") +
  geom_line(
    aes(x = date,
                y = median,
                color = package))+
  geom_ribbon(
    aes(x = date,
                  ymin = lb,
                  ymax = ub,
              fill = package),
              alpha = 0.1) +
    scale_y_continuous(trans = "log10",
                     limits = c(0.1, 10)) +
    theme_bw() +
    xlab('Time (days)') +
    ylab('R(t)')
## Warning in scale_y_continuous(trans = "log10", limits = c(0.1, 10)): log-10
## transformation introduced infinite values.
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_line()`).

# Also plot them faceted
ggplot(data = df) +
    geom_line(aes(x = date,
                y = Rt),
            linetype = "dashed") +
  geom_line(
    aes(x = date,
                y = median,
                color = package))+
  geom_ribbon(
    aes(x = date,
                  ymin = lb,
                  ymax = ub,
              fill = package),
              alpha = 0.1) +
    scale_y_continuous(trans = "log10",
                     limits = c(0.1, 10)) +
    facet_wrap(~package) + 
    theme_bw() +
    xlab('Time (days)') +
    ylab('R(t)')
## Warning in scale_y_continuous(trans = "log10", limits = c(0.1, 10)): log-10 transformation introduced infinite values.
## Removed 3 rows containing missing values or values outside the scale range
## (`geom_line()`).

Score the output of the R(t) model

Run times and MAE for each model

mae.epinow <- get_mae(pd = std_EpiNow2,                                                   rt_sim=all_data$rt)
## Warning in pd$estimates$median - rt_sim$Rt_calc: longer object length is not a
## multiple of shorter object length
mae.epiestim <- get_mae(pd = std_EpiEstim,                                                   rt_sim=all_data$rt)
mae.rtestim <- get_mae(pd = std_rtestim,                                                   rt_sim=all_data$rt)
mae.epilps <- get_mae(pd = std_EpiLPS,                                                   rt_sim=all_data$rt)
## Warning in pd$estimates$median - rt_sim$Rt_calc: longer object length is not a
## multiple of shorter object length
run.time.obj <- data.frame(Package=c("EpiNow2","EpiEstim","rtestim","EpiLPS"),
                Run.Time=round(c(run.time.epinow,
                                 run.time.epiestim,
                                 run.time.rtestim,
                                 run.time.epilps),3),
                MAE=round(c(mae.epinow,
                            mae.epiestim,
                            mae.rtestim,
                            mae.epilps),3))

run.time.obj %>%
  kbl() %>%
  kable_styling()
Package Run.Time MAE
EpiNow2 1367.700 secs 0.759
EpiEstim 0.022 secs 1.133
rtestim 0.568 secs 0.516
EpiLPS 0.095 secs 2.368

Compare scores from each package

Visual comparison of scores for each package

Discussion/Interpretation

Which packages perform best on this dataset in what regimes? E.g. EpiEstim most accurately captures perturbation, EpiNow2 performs best at nowcasting/forecasting observations