library(tidyverse)
library(lubridate)
library(lme4)
library(survival)
library(survminer)
library(gridExtra)
library(ggpubr)
library(ggeffects)

Load anonymous data (no comment texts, open comments, usernames or email addresses)

# reddit data
discussion_data <- read_csv("../data/anon/discussions_anon.csv")
sample <- read_csv("../data/anon/sample_anon.csv")
# preprocessed survey data
scaled_sample <- read_csv("../data/anon/scaled_sample.csv")

Social Feedback

Construct lagged dataframe

#arrange by day
daily_discussion_data <- discussion_data %>% 
  arrange(created_comment) %>% 
  mutate(day = as.Date(created_comment)) %>% 
  group_by(ParticipantID, day) %>% 
  dplyr::summarize(n_comments = n(), 
                   mean_score = mean(score_comment), 
                   mean_toxicity = mean(comment_toxicity, na.rm=T), 
                   sum_score = sum(score_comment, na.rm=T))

# Pad out the daily_discussion_data dataframe to have a row for each ParticipantID-day combination
daily_discussion_data_complete <- daily_discussion_data %>% 
  ungroup %>% 
  complete(ParticipantID, day)
daily_discussion_data_complete$n_comments[which(is.na(daily_discussion_data_complete$n_comments))] <- 0

# Construct lagged data structure
# Note: This loop is inefficient, but has the virtue of making explicit exactly how the lagged mean score is calculated

lagged_data <- daily_discussion_data_complete
pb <- txtProgressBar(min = 0, max = nrow(lagged_data), style = 3)
for (i in 1:nrow(lagged_data)) {
  setTxtProgressBar(pb, i)
  if (lagged_data$day[i] <= (lagged_data %>% filter(ParticipantID == lagged_data$ParticipantID[i]) %>% filter(n_comments!=0) %>% pull(day) %>% min)){
    lagged_data$mean_score_lag[i] <- NA
  } else {
    lags <- lagged_data %>% filter(ParticipantID == lagged_data$ParticipantID[i] & day < lagged_data$day[i]) %>% dplyr::summarize(mean_score = mean(mean_score, na.rm = T), sum_score = sum(sum_score, na.rm = T))
    lagged_data$mean_score_lag[i] <- lags %>% pull(mean_score)
  }
}

#augment with survey data
lagged_data <- left_join(lagged_data, sample, by = "ParticipantID") %>% 
  mutate(condition = as.factor(condition))

# using next_day_comments / comment_again
lagged_data <- lagged_data %>%
  group_by(ParticipantID) %>%
  arrange(day) %>%
  mutate(next_day_comments = lead(n_comments),
         comment_again = ifelse(next_day_comments > 0, 1, 0))

model_mean <- glmer(next_day_comments ~ mean_score_lag + polinterest + (1 | ParticipantID) + (1 | subreddit),
               data = lagged_data, 
               family = poisson(link = "log"),
               na.action = na.exclude)
summary(model_mean)

model_prop_mean <- glmer(comment_again ~ mean_score_lag + polinterest + (1 | ParticipantID) + (1 | subreddit),
               data = lagged_data, 
               family = binomial(link = "logit"),
               na.action = na.exclude)
summary(model_prop_mean)

# For model_mean (Poisson GLMM)
coef_est_mean <- fixef(model_mean)["mean_score_lag"]
se_est_mean <- summary(model_mean)$coefficients["mean_score_lag", "Std. Error"]
lower_ci_mean <- coef_est_mean - 1.96 * se_est_mean
upper_ci_mean <- coef_est_mean + 1.96 * se_est_mean

coef_df_mean <- data.frame(term = "Mean score (lagged)",
                           estimate = coef_est_mean,
                           lower = lower_ci_mean,
                           upper = upper_ci_mean)

coef_plot_mean <- ggplot(coef_df_mean, aes(x = term, y = estimate)) +
  geom_point(size = 4, color = "black") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0, size = 1,color = "black") +
  labs(x = "",
       y = "Number of next-day comments",
       title = "Social feedback") +
  ylim(-0.05,0.22)+
  theme_bw(base_size = 13) +
  theme(axis.title.y = element_text(size = 10),
        axis.text.x  = element_text(size = 10),
        plot.title   = element_text(size = 12),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

# For model_prop_mean (Binomial GLMM)
coef_est_prop <- fixef(model_prop_mean)["mean_score_lag"]
se_est_prop <- summary(model_prop_mean)$coefficients["mean_score_lag", "Std. Error"]
lower_ci_prop <- coef_est_prop - 1.96 * se_est_prop
upper_ci_prop <- coef_est_prop + 1.96 * se_est_prop

coef_df_prop <- data.frame(term = "Mean score (lagged)",
                           estimate = coef_est_prop,
                           lower = lower_ci_prop,
                           upper = upper_ci_prop)

coef_plot_prop <- ggplot(coef_df_prop, aes(x = term, y = estimate)) +
  geom_point(size = 4, color = "black") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0, size = 1, color = "black") +
  labs(x = "",
       y = "Propensity to comment again",
       title = "") +
  theme_bw(base_size = 13) +
  ylim(-0.05,0.22)+
  theme(axis.title.y = element_text(size = 10),
        axis.text.x  = element_text(size = 10),
        plot.title   = element_text(size = 12),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

combi_plot <- grid.arrange(coef_plot_mean, coef_plot_prop, ncol = 2)

ggsave("../output/social_feedback_coef_plot.pdf",combi_plot, width = 4, height = 3)


# plotting raw data
feedback_plot <- ggplot() +
  geom_vline(xintercept = 1, linetype = "dashed")+
  geom_point(data = lagged_data, 
             aes(x = mean_score_lag, y = next_day_comments),
             color = "#6699FF", alpha = 0.4, size = 1.5) +
  labs(x = "Mean score received on previous comments (lagged)",
       y = "Number of next-day comments",
       title = "") +
  annotate("text", x = 10, y = 22, label = "Start value of score (no votes) = 1")+
  theme_bw() +
  ylim(c(0,22)) +
  theme(axis.title.x = element_text(size = 10),
        axis.title.y = element_text(size = 10),
        axis.text = element_text(size = 13),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

feedback_plot

ggsave("../output/social_feedback_raw.pdf", feedback_plot, width = 5, height = 3)

Modeling Drop-out with Survival Models

# observation period's end (e.g., 30 days after start)
end_of_observation <- as.Date(min(discussion_data$created_comment)) + 30

# Prepare the survival data
survival_data <- discussion_data %>%
  group_by(ParticipantID) %>%
  summarize(discussion_start = min(created_comment),
    last_comment_date = max(created_comment),
    time = as.numeric(pmin(last_comment_date, end_of_observation) - discussion_start),
    status = ifelse(last_comment_date <= end_of_observation, 1, 0))%>%
  left_join(.,scaled_sample,by = "ParticipantID")

# h(t) hazard's function: risk of dying at time t, given the covariates. Covariates > 0 increase in hazard, # covariates < 0 decrease in hazard / negative predictor for dropout.    
res.cox <- coxph(Surv(time,status) ~ condition + group_toxicity, data = survival_data) 
summary(res.cox)

## Call:
## coxph(formula = Surv(time, status) ~ condition + group_toxicity, 
##     data = survival_data)
## 
##   n= 331, number of events= 324 
##    (4 observations deleted due to missingness)
## 
##                        coef exp(coef) se(coef)      z Pr(>|z|)   
## conditionincentives -0.4491    0.6382   0.1370 -3.277  0.00105 **
## conditionmoderation -0.3222    0.7246   0.1564 -2.060  0.03940 * 
## group_toxicity       0.3088    1.3618   0.1335  2.314  0.02070 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                     exp(coef) exp(-coef) lower .95 upper .95
## conditionincentives    0.6382     1.5669    0.4879    0.8349
## conditionmoderation    0.7246     1.3801    0.5333    0.9845
## group_toxicity         1.3618     0.7343    1.0483    1.7691
## 
## Concordance= 0.538  (se = 0.019 )
## Likelihood ratio test= 14.01  on 3 df,   p=0.003
## Wald test            = 14.06  on 3 df,   p=0.003
## Score (logrank) test = 14.23  on 3 df,   p=0.003

Cox coefficient plot

cox_summary <- summary(res.cox)$coefficients

coef_df <- data.frame(term = rownames(cox_summary),
  estimate = cox_summary[, "coef"],
  se = cox_summary[, "se(coef)"]) %>%
  mutate(lower = estimate - 1.96 * se,
         upper = estimate + 1.96 * se)

coef_df <- coef_df %>% 
  filter(term %in% c("conditionincentives", "conditionmoderation", "group_toxicity"))

cox_coef_plot <- ggplot(coef_df, aes(x = estimate, y = term, color = term)) +
  geom_point(size = 4) +
  geom_errorbarh(aes(xmin = lower, xmax = upper), height = 0, size = 1) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
  labs(x = "Cox prop. hazards model coefficient, 95% CIs", 
       title = "Discussion drop-out", y = "",
       subtitle = "Survival analysis for participation") +
  scale_y_discrete(labels = c("conditionincentives" = "Incentives",
                              "conditionmoderation" = "Moderation",
                              "group_toxicity" = "Group \n Toxicity")) +
  scale_color_manual(values = c(
    "conditionincentives" = "#C5701A",
    "conditionmoderation" = "#6699FF",
    "group_toxicity" = "black"
  )) +
  theme_bw()+
  coord_flip()+
  guides(color = "none")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

cox_coef_plot

ggsave("../output/cox_coef_plot.pdf", cox_coef_plot, width = 3, height = 4.5)

Kaplan-Meier survival curves for visualization

# for different groups
surv_fit <- survfit(Surv(time, status) ~ condition, data = survival_data)

surv_group_plot <- ggsurvplot(surv_fit,
  data = survival_data,
  ggtheme = theme_bw()+ theme(panel.grid = element_blank()),
  surv.median.line = "hv",
  pval = TRUE,
  pval.size = 3,
  palette = c("grey","#C5701A","#6699FF"), 
  conf.int = TRUE,  
  #risk.table = TRUE,
  #              tables.height = 0.2,
  #              tables.theme = theme_cleantable(),
  legend.title = "Condition", 
  legend.labs = c("Control", "Incentives", "Moderation"), 
  xlab = "Days of discussion phase",  
  xlim = c(1, 28) ,
  ylab = "Probability for continued participation")

surv_group_plot

ggexport(plotlist =list(surv_group_plot),filename = "../output/group_survival.pdf", width = 4, height = 4)

# split along toxicity
toxicity_threshold <- quantile(survival_data$group_toxicity, 0.8, na.rm = TRUE)

survival_data <- survival_data %>%
  mutate(toxicity_group = ifelse(group_toxicity > toxicity_threshold, "High", "Low"))

surv_fit <- survfit(Surv(time, status) ~ toxicity_group, data = survival_data)

surv_tox_plot <- ggsurvplot(surv_fit,
  data = survival_data,
  ggtheme = theme_bw()+ theme(panel.grid = element_blank()),
  surv.median.line = "hv",
  pval = TRUE,
  pval.size = 3,
  palette = c("black","grey"), 
  conf.int = TRUE,   
  legend.title = "Group toxicity", # Customize legend title
  legend.labs = c("High", "Low"), 
  xlab = "Days of discussion phase",  
  xlim = c(1, 28) ,
  ylab = "Probability for continued participation")

surv_tox_plot

ggexport(plotlist = list(surv_tox_plot), filename = "../output/tox_survival.pdf", width = 4, height = 4)

Temporal Models

Reddit Communities Field Experiment

2025-03-24

Modeling Drop-out with Survival Models

Cox coefficient plot

Kaplan-Meier survival curves for visualization