Assuming this is just toy data to give the flavor of the problem and you have more observations, I would try
- fitting a heteroscedasticity-robust Poisson model of streams on episode age
- calculating the expected number of streams at each age, along with a 95% CI
This should look something like this:
R Code
# Uncomment the following lines to install the required packages if
# not already installed
# install.packages(c("tidyverse", "lubridate", "sandwich", "emmeans", "ggrepel"))
# Load necessary libraries
library(tidyverse)
library(lubridate)
library(sandwich)
library(emmeans)
library(ggrepel)
# Input data
data <- tibble(
episode = c("The Shadow Realm", "The Forbidden Tome",
"The Devil's Tea",
"The Sorcerer's Shoes", "The Enchanted Forest",
"The Echoing Abyss",
"The Lost Civilization", "The Dimensional Rift",
"The Crystal Maze",
"The Scepter of Zalthar"), # Episode titles
date = as_date(c("2024-03-08", "2024-03-01", "2024-02-23",
"2024-02-16", "2024-02-09", "2024-02-02",
"2024-01-26", "2024-01-19", "2024-01-12",
"2024-01-05")), # Episode release dates
streams = c(5987, 6315, 9584, 8996, 7564, 7982, 8456, 8834, 9215,
9763) # Number of streams
) %>%
mutate(age_in_days = as.numeric(ymd("2024-07-11") - date),
# Calculate age of each episode in days
outlier = ifelse(streams > 8900 & age_in_days < 150, episode, NA))
# Identify outliers based on streams and age
# Poisson regression model with robust standard errors
model <- glm(streams ~ age_in_days, data = data,
family = quasipoisson()) # Fit Poisson regression model
robust_vcov <- vcovHC(model, type = "HC1")
# Calculate robust variance-covariance matrix (HC1)
# Generate new data for prediction
new_data <- tibble(age_in_days = 125:188)
# Create a new data frame for age_in_days range
# Predictions with confidence intervals using emmeans
emm <- emmeans(model, ~ age_in_days, at = list(age_in_days = 125:188),
vcov = robust_vcov, type = "response")
emm_summary <- summary(emm)
# Print the column names of emm_summary to check
print(colnames(emm_summary))
# Convert emm_summary to a data frame and rename columns based on actual names
emm_df <- as.data.frame(emm_summary)
colnames(emm_df) <- c("age_in_days", "fitted", "SE", "df", "conf.low",
"conf.high")
# Ensure new_data matches emmeans summary
new_data <- left_join(new_data, emm_df, by = "age_in_days")
# Merge new_data with emm_df
# Plot
ggplot(new_data, aes(x = age_in_days, y = fitted)) +
geom_line(color = "blue") + # Plot fitted values as a line
geom_ribbon(aes(ymin = conf.low, ymax = conf.high),
linetype = "dashed", alpha = 0.1) +
# Add confidence interval ribbon
geom_point(data = data, aes(x = age_in_days, y = streams)) +
# Plot actual data points
geom_text_repel(data = data, aes(x = age_in_days, y = streams,
label = outlier),
na.rm = TRUE, box.padding = 0.35,
point.padding = 0.3, angle = 0) +
# Label outliers with episode names
theme_minimal() + # Apply minimal theme
labs(x = "Age in Days", y = "Streams") + # Add axis labels
theme(legend.position = "none") # Remove legend
Stata Code
Same results, but I cannot quite match the CI from R, probably due to different HC and small sample corrections. Differences should be less stark with a larger sample.
clear
input str30 episode str10 date int streams
"The Shadow Realm" "2024-03-08" 5987
"The Forbidden Tome" "2024-03-01" 6315
"The Devil's Tea" "2024-02-23" 9584
"The Sorcerer's Shoes" "2024-02-16" 8996
"The Enchanted Forest" "2024-02-09" 7564
"The Echoing Abyss" "2024-02-02" 7982
"The Lost Civilization" "2024-01-26" 8456
"The Dimensional Rift" "2024-01-19" 8834
"The Crystal Maze" "2024-01-12" 9215
"The Scepter of Zalthar" "2024-01-05" 9763
end
gen age_in_days = td(11jul2024) - date(date,"YMD")
gen outlier = episode if streams > 8900 & age_in_days < 150
glm streams age_in_days, vce(robust) family(poisson) irls // irr
margins, at(age_in_days = (125(1)188)) post
marginsplot, addplot(scatter streams age_in_days, ms(Oh) mlab(outlier) mlabangle(20) mlabpos(2)) recastci(rline) ciopt(lpattern(dash)) recast(line) legend(off)