library(tidyverse)
library(lubridate)
library(lme4)
library(survival)
library(survminer)
library(gridExtra)
library(ggpubr)
library(ggeffects)
Load anonymous data (no comment texts, open comments, usernames or email addresses)
# reddit data
discussion_data <- read_csv("../data/anon/discussions_anon.csv")
sample <- read_csv("../data/anon/sample_anon.csv")
# preprocessed survey data
scaled_sample <- read_csv("../data/anon/scaled_sample.csv")
# observation period's end (e.g., 30 days after start)
end_of_observation <- as.Date(min(discussion_data$created_comment)) + 30
# Prepare the survival data
survival_data <- discussion_data %>%
group_by(ParticipantID) %>%
summarize(discussion_start = min(created_comment),
last_comment_date = max(created_comment),
time = as.numeric(pmin(last_comment_date, end_of_observation) - discussion_start),
status = ifelse(last_comment_date <= end_of_observation, 1, 0))%>%
left_join(.,scaled_sample,by = "ParticipantID")
# h(t) hazard's function: risk of dying at time t, given the covariates. Covariates > 0 increase in hazard, # covariates < 0 decrease in hazard / negative predictor for dropout.
res.cox <- coxph(Surv(time,status) ~ condition + group_toxicity, data = survival_data)
summary(res.cox)
## Call:
## coxph(formula = Surv(time, status) ~ condition + group_toxicity,
## data = survival_data)
##
## n= 331, number of events= 324
## (4 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## conditionincentives -0.4491 0.6382 0.1370 -3.277 0.00105 **
## conditionmoderation -0.3222 0.7246 0.1564 -2.060 0.03940 *
## group_toxicity 0.3088 1.3618 0.1335 2.314 0.02070 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## conditionincentives 0.6382 1.5669 0.4879 0.8349
## conditionmoderation 0.7246 1.3801 0.5333 0.9845
## group_toxicity 1.3618 0.7343 1.0483 1.7691
##
## Concordance= 0.538 (se = 0.019 )
## Likelihood ratio test= 14.01 on 3 df, p=0.003
## Wald test = 14.06 on 3 df, p=0.003
## Score (logrank) test = 14.23 on 3 df, p=0.003
cox_summary <- summary(res.cox)$coefficients
coef_df <- data.frame(term = rownames(cox_summary),
estimate = cox_summary[, "coef"],
se = cox_summary[, "se(coef)"]) %>%
mutate(lower = estimate - 1.96 * se,
upper = estimate + 1.96 * se)
coef_df <- coef_df %>%
filter(term %in% c("conditionincentives", "conditionmoderation", "group_toxicity"))
cox_coef_plot <- ggplot(coef_df, aes(x = estimate, y = term, color = term)) +
geom_point(size = 4) +
geom_errorbarh(aes(xmin = lower, xmax = upper), height = 0, size = 1) +
geom_vline(xintercept = 0, linetype = "dashed", color = "black") +
labs(x = "Cox prop. hazards model coefficient, 95% CIs",
title = "Discussion drop-out", y = "",
subtitle = "Survival analysis for participation") +
scale_y_discrete(labels = c("conditionincentives" = "Incentives",
"conditionmoderation" = "Moderation",
"group_toxicity" = "Group \n Toxicity")) +
scale_color_manual(values = c(
"conditionincentives" = "#C5701A",
"conditionmoderation" = "#6699FF",
"group_toxicity" = "black"
)) +
theme_bw()+
coord_flip()+
guides(color = "none")+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
cox_coef_plot
ggsave("../output/cox_coef_plot.pdf", cox_coef_plot, width = 3, height = 4.5)
# for different groups
surv_fit <- survfit(Surv(time, status) ~ condition, data = survival_data)
surv_group_plot <- ggsurvplot(surv_fit,
data = survival_data,
ggtheme = theme_bw()+ theme(panel.grid = element_blank()),
surv.median.line = "hv",
pval = TRUE,
pval.size = 3,
palette = c("grey","#C5701A","#6699FF"),
conf.int = TRUE,
#risk.table = TRUE,
# tables.height = 0.2,
# tables.theme = theme_cleantable(),
legend.title = "Condition",
legend.labs = c("Control", "Incentives", "Moderation"),
xlab = "Days of discussion phase",
xlim = c(1, 28) ,
ylab = "Probability for continued participation")
surv_group_plot
ggexport(plotlist =list(surv_group_plot),filename = "../output/group_survival.pdf", width = 4, height = 4)
# split along toxicity
toxicity_threshold <- quantile(survival_data$group_toxicity, 0.8, na.rm = TRUE)
survival_data <- survival_data %>%
mutate(toxicity_group = ifelse(group_toxicity > toxicity_threshold, "High", "Low"))
surv_fit <- survfit(Surv(time, status) ~ toxicity_group, data = survival_data)
surv_tox_plot <- ggsurvplot(surv_fit,
data = survival_data,
ggtheme = theme_bw()+ theme(panel.grid = element_blank()),
surv.median.line = "hv",
pval = TRUE,
pval.size = 3,
palette = c("black","grey"),
conf.int = TRUE,
legend.title = "Group toxicity", # Customize legend title
legend.labs = c("High", "Low"),
xlab = "Days of discussion phase",
xlim = c(1, 28) ,
ylab = "Probability for continued participation")
surv_tox_plot
ggexport(plotlist = list(surv_tox_plot), filename = "../output/tox_survival.pdf", width = 4, height = 4)
Social Feedback
Construct lagged dataframe