The goal of this tutorial is to use multiple R(t) packages to fit to
a simple, simulated outbreak with a known R(t), serial interval,
generation interval, and reporting delay probability mass function
(PMF). We will use the summrt
package to generate
standardized outputs, plot results, and quantitatively evaluate the
accuracy in R(t) estimation.
Eventually we will expand this to additional vignettes that will fit to and evaluate more complex simulated and real datasets (this might require not evaluating just accuracy and reliability in R(t) but also nowcasting and forecasting expected observations and comparing to the true observations).
Load the dataset we will be fitting the R(t) estimation packages to in this vignette.
This will eventually use data from the rtdata
package
that will be documented and will describe the specific epidemiological
use case this data scenario is meant to represent. In this case, we are
going to fit to data on the number of reported cases, with a known
discrete generation interval probability mass function (PMF) and
reporting delay PMF that are also provided as data.
One of the goals of this evaluation using the summrt
package is to standardize date/time indexing. All R(t) estimates from
each package should be lined up such that the time vector returned
alongside the estimates follows the same indexing.
# We will eventually replace this with package data specific to the dataset.
# E.g. this might be our baseline infections, onset, report data
url <- "https://raw.githubusercontent.com/cmilando/RtEval/main/all_data.RDS"
all_data <- readRDS(url(url))
ggplot(all_data$rt) +
geom_line(aes(x = Day, y = Rt)) +
geom_hline(aes(yintercept = 1), linetype = "dashed") +
xlab("Day") + ylab("R(t)") +
scale_y_continuous(trans = "log") +
theme_bw() + ggtitle("Simulated R(t)")
ggplot(all_data$generation) +
geom_bar(aes(x = Day, y = Px), stat = "identity") +
xlab("Day") + ylab("Generation interval PMF") +
theme_bw()
ggplot(all_data$reporting_delay) +
geom_bar(aes(x = Day, y = Px), stat = "identity") +
xlab("Day") + ylab("Reporting delay PMF") +
theme_bw()
ggplot(all_data$cases) +
geom_bar(aes(x = day, y = daily_reports), stat = "identity") +
xlab("Day") + ylab("Reported cases") +
theme_bw()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_bar()`).
Fit each of the packages to the dataset. See the package specific vignettes for more of a walk through for the decisions made for each package.
incidence_df = data.frame(
date = lubridate::make_date(2020, 3, 19) + 1:nrow(all_data$cases),
confirm = as.vector(all_data$cases$daily_reports)
)
gi_pmf <- NonParametric(pmf = all_data$serial$Px)
sym_report_delay_pmf <- NonParametric(pmf = all_data$reporting_delay$Px)
incubation_pmf <- NonParametric(pmf = all_data$incubation$Px)
start.time <- Sys.time()
EpiNow2_obj <- epinow(
data = incidence_df,
generation_time = generation_time_opts(gi_pmf),
delays = delay_opts(incubation_pmf + sym_report_delay_pmf),
backcalc = backcalc_opts(prior = 'reports'),
rt = rt_opts(rw = 1),
stan = stan_opts(chains = 4, cores = 4)
)
## WARN [2024-09-26 16:31:00] epinow: There were 59 divergent transitions after warmup. See
## https://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## to find out why this is a problem and how to eliminate them. -
## WARN [2024-09-26 16:31:00] epinow: There were 2 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 15. See
## https://mc-stan.org/misc/warnings.html#maximum-treedepth-exceeded -
## WARN [2024-09-26 16:31:00] epinow: Examine the pairs() plot to diagnose sampling problems
## -
## WARN [2024-09-26 16:31:01] epinow: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## https://mc-stan.org/misc/warnings.html#bulk-ess -
end.time <- Sys.time()
run.time.epinow <- end.time-start.time
# reporting data with dates indexed by integers starting at 0
incidence_df <- data.frame(
dates = all_data$cases$day,
I = as.vector(all_data$cases$daily_reports)
)
colnames(incidence_df) <- c('dates', 'I')
# Serial interval from data PMF-has leading 0 and is vector of probabilities
si_distr <- as.matrix(all_data$serial$Px)
if (all_data$serial$Day[1] == 1) si_distr <- c(0, si_distr)
si_distr
## [1] 0.0000 0.0610 0.1540 0.2198 0.2178 0.1536 0.1122 0.0486 0.0224 0.0078
## [11] 0.0022 0.0004 0.0002
# Estimate R DAILY
start.time <- Sys.time()
EpiEstim_obj <- EpiEstim::estimate_R(
incid = incidence_df,
method = "non_parametric_si",
config = make_config(list(
si_distr = si_distr,
t_start = 2:nrow(incidence_df),
t_end = 2:nrow(incidence_df)
)),
backimputation_window = 10
)
end.time <- Sys.time()
run.time.epiestim <- end.time-start.time
rr <- 2:nrow(all_data$cases)
start.time <- Sys.time()
rtestim_obj <- cv_estimate_rt(
observed_counts = all_data$cases$daily_reports[rr],
x = all_data$cases$day[rr],
delay_distn = all_data$serial$Px
)
end.time <- Sys.time()
run.time.rtestim <- end.time-start.time
si_spec <- Idist(probs = all_data$serial$Px)
incidence = all_data$cases$daily_reports
which(is.na(incidence))
## [1] 1
incidence[1] <- 0
start.time <- Sys.time()
EpiLPS_obj <- estimR(incidence = incidence, si = si_spec$pvec)
end.time <- Sys.time()
run.time.epilps <- end.time-start.time
SummRt
package to standardize the outputsstd_EpiEstim <- summarize_rtestimate(EpiEstim_obj)
std_EpiNow2 <- summarize_rtestimate(EpiNow2_obj)
std_rtestim <- summarize_rtestimate(rtestim_obj)
std_EpiLPS <- summarize_rtestimate(EpiLPS_obj)
# Put them all together into one dataframe
convert_to_df <- function(output_summrt){
df <- tibble::tibble(output_summrt$estimates) |>
dplyr::mutate(package = output_summrt$package)
}
df <- tibble::tibble() |>
bind_rows(convert_to_df(std_EpiNow2)) |>
bind_rows(convert_to_df(std_rtestim)) |>
bind_rows(convert_to_df(std_EpiLPS)) |>
bind_rows(convert_to_df(std_EpiEstim)) |>
dplyr::left_join(
all_data$rt,
by = c("date" = "Day")
)
We’re going to make a plot of the four R(t) estimates that result
from the summRt
output standardization, overlaid onto one
another. Eventually we will write functionality to combine these things
nicely and plot them a few different ways. For now will just show how to
do this via accessing the elements directly.
ggplot(data = df) +
geom_line(aes(x = date,
y = Rt),
linetype = "dashed") +
geom_line(
aes(x = date,
y = median,
color = package))+
geom_ribbon(
aes(x = date,
ymin = lb,
ymax = ub,
fill = package),
alpha = 0.1) +
scale_y_continuous(trans = "log10",
limits = c(0.1, 10)) +
theme_bw() +
xlab('Time (days)') +
ylab('R(t)')
## Warning in scale_y_continuous(trans = "log10", limits = c(0.1, 10)): log-10
## transformation introduced infinite values.
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Also plot them faceted
ggplot(data = df) +
geom_line(aes(x = date,
y = Rt),
linetype = "dashed") +
geom_line(
aes(x = date,
y = median,
color = package))+
geom_ribbon(
aes(x = date,
ymin = lb,
ymax = ub,
fill = package),
alpha = 0.1) +
scale_y_continuous(trans = "log10",
limits = c(0.1, 10)) +
facet_wrap(~package) +
theme_bw() +
xlab('Time (days)') +
ylab('R(t)')
## Warning in scale_y_continuous(trans = "log10", limits = c(0.1, 10)): log-10 transformation introduced infinite values.
## Removed 3 rows containing missing values or values outside the scale range
## (`geom_line()`).
Run times and MAE for each model
mae.epinow <- get_mae(pd = std_EpiNow2, rt_sim=all_data$rt)
## Warning in pd$estimates$median - rt_sim$Rt_calc: longer object length is not a
## multiple of shorter object length
mae.epiestim <- get_mae(pd = std_EpiEstim, rt_sim=all_data$rt)
mae.rtestim <- get_mae(pd = std_rtestim, rt_sim=all_data$rt)
mae.epilps <- get_mae(pd = std_EpiLPS, rt_sim=all_data$rt)
## Warning in pd$estimates$median - rt_sim$Rt_calc: longer object length is not a
## multiple of shorter object length
run.time.obj <- data.frame(Package=c("EpiNow2","EpiEstim","rtestim","EpiLPS"),
Run.Time=round(c(run.time.epinow,
run.time.epiestim,
run.time.rtestim,
run.time.epilps),3),
MAE=round(c(mae.epinow,
mae.epiestim,
mae.rtestim,
mae.epilps),3))
run.time.obj %>%
kbl() %>%
kable_styling()
Package | Run.Time | MAE |
---|---|---|
EpiNow2 | 1367.700 secs | 0.759 |
EpiEstim | 0.022 secs | 1.133 |
rtestim | 0.568 secs | 0.516 |
EpiLPS | 0.095 secs | 2.368 |
Visual comparison of scores for each package
Which packages perform best on this dataset in what regimes? E.g. EpiEstim most accurately captures perturbation, EpiNow2 performs best at nowcasting/forecasting observations