library(tidyverse)
library(readr)
library(gridExtra)
library(haven)
library(RColorBrewer)
library(estimatr)
library(ggridges)
library(DescTools)
library(lme4)
library(brms)
library(purrr)
library(tableone)
library(pscl)
library(ggcorrplot)
library(kableExtra)
library(lme4)
library(forcats)
library(glmmTMB)

Import data (no comment texts, open comments, usernames or email addresses)

# reddit data
user_data <- read_csv("../data/anon/users_anon.csv")
discussion_data <- read_csv("../data/anon/discussions_anon.csv")
external_data <- read_csv("../data/anon/external_reddit_anon.csv")

# survey data
pre_survey <- read_csv("../data/anon/pre_survey_anon.csv")
post_surveys <- read_csv("../data/anon/post_surveys_anon.csv")

# sample (combined data)
sample <- read_csv("../data/anon/sample_anon.csv")

Preprocessing, combining pre and post surveys

# average group toxicity
group_toxicity_dat <- discussion_data%>%
  mutate(subreddit_w1 = subreddit)%>%
  group_by(subreddit_w1)%>%
  mutate(group_toxicity = mean(comment_toxicity, na.rm=T))%>%
  select(subreddit_w1, group_toxicity)%>%
  slice(1)%>%
  ungroup()

# any check-in surveys answered
any_check <- post_surveys%>%
  group_by(ParticipantID)%>%
  mutate(any_checkin = 1)%>%
  slice(1)%>%
  pull(ParticipantID)

post_1 <- post_surveys%>%filter(survey_week == 1)
post_2 <- post_surveys%>%filter(survey_week == 2)
post_3 <- post_surveys%>%filter(survey_week == 3)
post_4 <- post_surveys%>%filter(survey_week == 4)

# pre-post data (combining pre and post surveys) 
full_data <- left_join(sample, post_1, by = "ParticipantID", suffix = c("_w1" , "_w2"))%>%
  left_join(., post_2, by = "ParticipantID", suffix = c("" , "_w3"))%>%
  left_join(., post_3, by = "ParticipantID", suffix = c("" , "_w4"))%>%
  left_join(., post_4, by = "ParticipantID", suffix = c("" , "_w5"))%>%
  left_join(.,group_toxicity_dat, by = "subreddit_w1")%>%
  mutate(active = ifelse(is.na(comment_count), 0, 1),
         condition = factor(condition),
         active2 = ifelse(!ParticipantID %in% any_check & active == 0, "inactive", active))

write_csv(full_data, file = "../data/anon/full_data_waves.csv")

Data Scaling for models https://onlinelibrary.wiley.com/doi/10.1002/sim.3107

# scale data for better comparison of coefficients
gelman_scale <- function(x){
  ( (x - mean(x, na.rm=T)) / (2*sd(x, na.rm=TRUE)))
}

scaled_sample <- full_data%>%
  # generation of check-in variables (average over check-ins)
  mutate(mean_issue_distance_abs = rowMeans(select(., starts_with("issue_distance_abs")), na.rm = TRUE),
         mean_discussion_constructive = rowMeans(select(., starts_with("discussion_percep_2")), na.rm = TRUE),
         mean_discussion_enjoyable = rowMeans(select(., starts_with("discussion_percep_3")), na.rm = TRUE),
         mean_discussion_toxic = rowMeans(select(., starts_with("discussion_percep_1")), na.rm = TRUE),
         mean_group_knowledgeable = rowMeans(select(., starts_with("group_percep_3")), na.rm = TRUE), 
         mean_group_polarized = rowMeans(select(., starts_with("group_percep_2")), na.rm = TRUE),
         mean_group_respectful = rowMeans(select(., starts_with("group_percep_1")), na.rm = TRUE))%>%
  # scale selected variables with Gelman (2008) method by dividing by two standard deviations
  select(ParticipantID, condition, subreddit_w1, comment_count, comment_mean_tox,comment_mean_score,
         ex_comment_mean_score, 
         active, active2, gender, age, education, comments_online_w1,  group_toxicity,
         leftright_w1, polinterest_w1, affective_polarization_w1, ex_comment_mean_tox,
         affective_pol_1_w1,affective_pol_2_w1,comment_karma,
         pol_cynicism_w1, efficacy_w1, trust_general_1_w1,
         trust_general_2_w1,trust_general_3_w1,trust_general_4_w1,
         mean_issue_distance_abs, mean_discussion_constructive, 
         mean_discussion_enjoyable, mean_discussion_toxic, 
         mean_group_knowledgeable, mean_group_polarized, 
         mean_group_respectful)%>%
  mutate(across(c(age:mean_group_respectful), gelman_scale))%>%
  mutate(comment_count = ifelse(is.na(comment_count),0,comment_count),
         gender_male = factor(ifelse(gender == 1, 1, 0)))

write_csv(scaled_sample, file = "../data/anon/scaled_sample.csv")

Participation Prediction

Zero Hurdle Models for Count data

https://search.r-project.org/CRAN/refmans/pscl/html/hurdle.html

# correspondence between perceptions and external measures
cor.test(scaled_sample$mean_discussion_toxic, scaled_sample$group_toxicity)

## 
##  Pearson's product-moment correlation
## 
## data:  scaled_sample$mean_discussion_toxic and scaled_sample$group_toxicity
## t = 2.8732, df = 364, p-value = 0.004302
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.04712583 0.24764617
## sample estimates:
##       cor 
## 0.1489165

present_sample <- scaled_sample%>%
  filter(active2 != "inactive")%>%
  # also include external measure of group toxicity
  mutate(group_toxicity = gelman_scale(group_toxicity)) 

models <- list(
  hurdle(comment_count ~ gender_male, data = present_sample),
  hurdle(comment_count ~ age, data = present_sample),
  hurdle(comment_count ~ education, data = present_sample),
  hurdle(comment_count ~ comments_online_w1, data = present_sample),
  hurdle(comment_count ~ comment_karma, data = present_sample),
  hurdle(comment_count ~ leftright_w1, data = present_sample),
  hurdle(comment_count ~ polinterest_w1, data = present_sample),
  hurdle(comment_count ~ affective_polarization_w1, data = present_sample),
  hurdle(comment_count ~ trust_general_1_w1, data = present_sample),
  hurdle(comment_count ~ trust_general_2_w1, data = present_sample),
  hurdle(comment_count ~ trust_general_3_w1, data = present_sample),
  hurdle(comment_count ~ trust_general_4_w1, data = present_sample),
  hurdle(comment_count ~ mean_issue_distance_abs, data = present_sample),
  hurdle(comment_count ~ mean_group_respectful, data = present_sample),
  hurdle(comment_count ~ mean_group_knowledgeable, data = present_sample),
  hurdle(comment_count ~ mean_group_polarized, data = present_sample),
  hurdle(comment_count ~ mean_discussion_enjoyable, data = present_sample),
  hurdle(comment_count ~ mean_discussion_constructive, data = present_sample),
  hurdle(comment_count ~ mean_discussion_toxic, data = present_sample),
  hurdle(comment_count ~ mean_discussion_toxic + group_toxicity, data = present_sample)
  #hurdle(comment_count ~ group_toxicity, data = present_sample) #"Average group toxicity"
)

predictors <- c(
  "Gender (male)", "Age", "Education", "Commenting online", "Comment karma", "Political orientation",
  "Political interest", "Affective polarization", "Trust in politics","Trust in media",
  "Trust in science", "Trust in others", "Issue distance",
  "Group respectful", "Group knowledgeable", "Group polarized",
  "Discussion enjoyable",  "Discussion constructive","Discussion toxic", "Discussion toxic contr.",
  "Write toxic external"
   )

model_frames <- list()

for (i in seq_along(models)) {
  model_summary <- summary(models[[i]])
  
  count_model_coef <- as.data.frame(model_summary$coefficients$count)
  count_model_coef$Predictor <- predictors[i]
  count_model_coef$Model <- "Count Model"
  count_model_coef <- count_model_coef %>%
    filter(row.names(.) != "(Intercept)") %>%
    rename(Coefficient = Estimate, SE = `Std. Error`, pval = `Pr(>|z|)`)
  
  zero_hurdle_coef <- as.data.frame(model_summary$coefficients$zero)
  zero_hurdle_coef$Predictor <- predictors[i]
  zero_hurdle_coef$Model <- "Zero Hurdle Model"
  zero_hurdle_coef <- zero_hurdle_coef %>%
    filter(row.names(.) != "(Intercept)") %>%
    rename(Coefficient = Estimate, SE = `Std. Error`, pval = `Pr(>|z|)`)
  
  model_frames[[i]] <- rbind(zero_hurdle_coef, count_model_coef)
}

combined_df <- do.call(rbind, model_frames)%>%
  filter(!row.names(.) %in% c("group_toxicity","group_toxicity1"))

combined_df <- combined_df %>%
  mutate(Predictor = fct_inorder(rev(Predictor))) %>%
  group_by(Model) %>%
  mutate(AlphaValue = abs(Coefficient) / max(abs(Coefficient), na.rm = TRUE)) %>%
  ungroup()

interval2 <- -qnorm((1 - 0.95) / 2)

comment_plot_pred_active_hurdle_ext <- ggplot(combined_df, aes(x = rev(Predictor), alpha = abs(AlphaValue))) + 
  geom_hline(yintercept = 0, colour = gray(1 / 2), lty = 2) + 
  geom_pointrange(aes(y = Coefficient, ymin = Coefficient - SE * interval2,
                      ymax = Coefficient + SE * interval2), colour = "black",
                  lwd = 1, position = position_dodge(width = 2 / 3)) + 
  guides(alpha = "none") +
  geom_vline(xintercept=8.5)+
  theme_bw(base_size = 10) + 
  coord_flip() +
  ggtitle("Prediction of user-level discussion participation") +
  xlab("") +
  ylab("Zero hurdle models for count data regression") +
  facet_wrap(~ forcats::fct_rev(Model), ncol = 2, scales = "free_x")+
  theme(#panel.grid.major = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank())  

ggsave("../output/comment_plot_pred_hurdle_with_external.pdf", comment_plot_pred_active_hurdle_ext, width = 6, height = 4)

comment_plot_pred_active_hurdle_ext

Random effects with glmmTMB

glmmTMB(comment_count ~ gender_male + (1 + gender_male | subreddit_w1), data = present_sample, family = hurdle_poisson(link = "log"))

## Formula:          
## comment_count ~ gender_male + (1 + gender_male | subreddit_w1)
## Data: present_sample
##       AIC       BIC    logLik -2*log(L)  df.resid 
##        NA        NA        NA        NA       417 
## Random-effects (co)variances:
## 
## Conditional model:
##  Groups       Name         Std.Dev.  Corr 
##  subreddit_w1 (Intercept)  8.295e-08      
##               gender_male1 2.070e-01 0.22 
## 
## Number of obs: 423 / Conditional model: subreddit_w1, 6
## 
## Dispersion parameter for hurdle_poisson family ():   18 
## 
## Fixed Effects:
## 
## Conditional model:
##  (Intercept)  gender_male1  
##       2.4631        0.2695

glmmTMB(comment_count ~ age + (1 + age | subreddit_w1), data = present_sample, family = hurdle_poisson(link = "log"))

## Formula:          comment_count ~ age + (1 + age | subreddit_w1)
## Data: present_sample
##       AIC       BIC    logLik -2*log(L)  df.resid 
##  3668.033  3692.317 -1828.017  3656.033       417 
## Random-effects (co)variances:
## 
## Conditional model:
##  Groups       Name        Std.Dev. Corr  
##  subreddit_w1 (Intercept) 0.03630        
##               age         0.05996  -1.00 
## 
## Number of obs: 423 / Conditional model: subreddit_w1, 6
## 
## Dispersion parameter for hurdle_poisson family (): 18.2 
## 
## Fixed Effects:
## 
## Conditional model:
## (Intercept)          age  
##     2.60763      0.02067

glmmTMB(comment_count ~ education + (1 + education | subreddit_w1), data = present_sample, family = hurdle_poisson(link = "log"))

## Formula:          comment_count ~ education + (1 + education | subreddit_w1)
## Data: present_sample
##       AIC       BIC    logLik -2*log(L)  df.resid 
##        NA        NA        NA        NA       417 
## Random-effects (co)variances:
## 
## Conditional model:
##  Groups       Name        Std.Dev.  Corr 
##  subreddit_w1 (Intercept) 7.000e-05      
##               education   5.565e-05 0.78 
## 
## Number of obs: 423 / Conditional model: subreddit_w1, 6
## 
## Dispersion parameter for hurdle_poisson family (): 18.2 
## 
## Fixed Effects:
## 
## Conditional model:
## (Intercept)    education  
##     2.60874     -0.01315

glmmTMB(comment_count ~ comments_online_w1 + (1 + comments_online_w1 | subreddit_w1), data = present_sample, family = hurdle_poisson(link = "log"))

## Formula:          
## comment_count ~ comments_online_w1 + (1 + comments_online_w1 |  
##     subreddit_w1)
## Data: present_sample
##       AIC       BIC    logLik -2*log(L)  df.resid 
##  3661.755  3686.039 -1824.877  3649.755       417 
## Random-effects (co)variances:
## 
## Conditional model:
##  Groups       Name               Std.Dev.  Corr  
##  subreddit_w1 (Intercept)        8.243e-05       
##               comments_online_w1 5.689e-05 -0.99 
## 
## Number of obs: 423 / Conditional model: subreddit_w1, 6
## 
## Dispersion parameter for hurdle_poisson family (): 18.1 
## 
## Fixed Effects:
## 
## Conditional model:
##        (Intercept)  comments_online_w1  
##             2.5867              0.3485

glmmTMB(comment_count ~ comment_karma + (1 + comment_karma | subreddit_w1), data = present_sample, family = hurdle_poisson(link = "log"))

## Formula:          
## comment_count ~ comment_karma + (1 + comment_karma | subreddit_w1)
## Data: present_sample
##       AIC       BIC    logLik -2*log(L)  df.resid 
##        NA        NA        NA        NA       416 
## Random-effects (co)variances:
## 
## Conditional model:
##  Groups       Name          Std.Dev.  Corr 
##  subreddit_w1 (Intercept)   6.693e-05      
##               comment_karma 6.098e-07 0.54 
## 
## Number of obs: 422 / Conditional model: subreddit_w1, 6
## 
## Dispersion parameter for hurdle_poisson family (): 18.1 
## 
## Fixed Effects:
## 
## Conditional model:
##   (Intercept)  comment_karma  
##         2.597          0.108

glmmTMB(comment_count ~ leftright_w1 + (1 + leftright_w1 | subreddit_w1), data = present_sample, family = hurdle_poisson(link = "log"))

## Formula:          
## comment_count ~ leftright_w1 + (1 + leftright_w1 | subreddit_w1)
## Data: present_sample
##       AIC       BIC    logLik -2*log(L)  df.resid 
##  3665.500  3689.785 -1826.750  3653.500       417 
## Random-effects (co)variances:
## 
## Conditional model:
##  Groups       Name         Std.Dev. Corr  
##  subreddit_w1 (Intercept)  0.0500         
##               leftright_w1 0.3145   -1.00 
## 
## Number of obs: 423 / Conditional model: subreddit_w1, 6
## 
## Dispersion parameter for hurdle_poisson family (): 18.1 
## 
## Fixed Effects:
## 
## Conditional model:
##  (Intercept)  leftright_w1  
##      2.58254       0.04445

glmmTMB(comment_count ~ polinterest_w1 + (1 + polinterest_w1 | subreddit_w1), data = present_sample, family = hurdle_poisson(link = "log"))

## Formula:          
## comment_count ~ polinterest_w1 + (1 + polinterest_w1 | subreddit_w1)
## Data: present_sample
##       AIC       BIC    logLik -2*log(L)  df.resid 
##  3661.391  3685.675 -1824.695  3649.391       417 
## Random-effects (co)variances:
## 
## Conditional model:
##  Groups       Name           Std.Dev. Corr 
##  subreddit_w1 (Intercept)    0.0649        
##               polinterest_w1 0.2477   1.00 
## 
## Number of obs: 423 / Conditional model: subreddit_w1, 6
## 
## Dispersion parameter for hurdle_poisson family ():   18 
## 
## Fixed Effects:
## 
## Conditional model:
##    (Intercept)  polinterest_w1  
##         2.5861          0.3447

glmmTMB(comment_count ~ affective_polarization_w1 + (1 + affective_polarization_w1 | subreddit_w1), data = present_sample, family = hurdle_poisson(link = "log"))

## Formula:          
## comment_count ~ affective_polarization_w1 + (1 + affective_polarization_w1 |  
##     subreddit_w1)
## Data: present_sample
##       AIC       BIC    logLik -2*log(L)  df.resid 
##  3293.792  3317.512 -1640.896  3281.792       379 
## Random-effects (co)variances:
## 
## Conditional model:
##  Groups       Name                      Std.Dev. Corr  
##  subreddit_w1 (Intercept)               0.08959        
##               affective_polarization_w1 0.14672  -1.00 
## 
## Number of obs: 385 / Conditional model: subreddit_w1, 6
## 
## Dispersion parameter for hurdle_poisson family (): 17.1 
## 
## Fixed Effects:
## 
## Conditional model:
##               (Intercept)  affective_polarization_w1  
##                    2.5735                    -0.1926

glmmTMB(comment_count ~ trust_general_1_w1 + (1 + trust_general_1_w1 | subreddit_w1), data = present_sample, family = hurdle_poisson(link = "log"))

## Formula:          
## comment_count ~ trust_general_1_w1 + (1 + trust_general_1_w1 |  
##     subreddit_w1)
## Data: present_sample
##       AIC       BIC    logLik -2*log(L)  df.resid 
##  3654.137  3678.421 -1821.068  3642.137       417 
## Random-effects (co)variances:
## 
## Conditional model:
##  Groups       Name               Std.Dev. Corr  
##  subreddit_w1 (Intercept)        0.08099        
##               trust_general_1_w1 0.47882  -1.00 
## 
## Number of obs: 423 / Conditional model: subreddit_w1, 6
## 
## Dispersion parameter for hurdle_poisson family (): 17.7 
## 
## Fixed Effects:
## 
## Conditional model:
##        (Intercept)  trust_general_1_w1  
##             2.5452              0.3086

glmmTMB(comment_count ~ mean_discussion_toxic + group_toxicity + (1 + mean_discussion_toxic | subreddit_w1), data = present_sample, family = hurdle_poisson(link = "log"))

## Formula:          
## comment_count ~ mean_discussion_toxic + group_toxicity + (1 +  
##     mean_discussion_toxic | subreddit_w1)
## Data: present_sample
##       AIC       BIC    logLik -2*log(L)  df.resid 
##  3197.520  3224.838 -1591.760  3183.520       359 
## Random-effects (co)variances:
## 
## Conditional model:
##  Groups       Name                  Std.Dev. Corr  
##  subreddit_w1 (Intercept)           0.1208         
##               mean_discussion_toxic 0.3039   -1.00 
## 
## Number of obs: 366 / Conditional model: subreddit_w1, 6
## 
## Dispersion parameter for hurdle_poisson family (): 18.6 
## 
## Fixed Effects:
## 
## Conditional model:
##           (Intercept)  mean_discussion_toxic         group_toxicity  
##                2.7002                 0.0922                 0.2199

Geometric distribution (instead of default poisson)

models <- list(
  hurdle(comment_count ~ gender_male, data = present_sample, dist = "geometric"),
  hurdle(comment_count ~ age, data = present_sample, dist = "geometric"),
  hurdle(comment_count ~ education, data = present_sample, dist = "geometric"),
  hurdle(comment_count ~ comments_online_w1, data = present_sample, dist = "geometric"),
  hurdle(comment_count ~ comment_karma, data = present_sample, dist = "geometric"),
  hurdle(comment_count ~ leftright_w1, data = present_sample, dist = "geometric"),
  hurdle(comment_count ~ polinterest_w1, data = present_sample, dist = "geometric"),
  hurdle(comment_count ~ affective_polarization_w1, data = present_sample, dist = "geometric"),
  hurdle(comment_count ~ trust_general_1_w1, data = present_sample, dist = "geometric"),
  hurdle(comment_count ~ trust_general_2_w1, data = present_sample, dist = "geometric"),
  hurdle(comment_count ~ trust_general_3_w1, data = present_sample, dist = "geometric"),
  hurdle(comment_count ~ trust_general_4_w1, data = present_sample, dist = "geometric"),
  hurdle(comment_count ~ mean_issue_distance_abs, data = present_sample, dist = "geometric"),
  hurdle(comment_count ~ mean_group_respectful, data = present_sample, dist = "geometric"),
  hurdle(comment_count ~ mean_group_knowledgeable, data = present_sample, dist = "geometric"),
  hurdle(comment_count ~ mean_group_polarized, data = present_sample, dist = "geometric"),
  hurdle(comment_count ~ mean_discussion_enjoyable, data = present_sample, dist = "geometric"),
  hurdle(comment_count ~ mean_discussion_toxic, data = present_sample, dist = "geometric"),
  hurdle(comment_count ~ mean_discussion_constructive, data = present_sample, dist = "geometric")
)

predictors <- c(
  "Gender (male)", "Age", "Education", "Commenting online", "Comment karma", "Political orientation",
  "Political interest", "Affective polarization", "Trust in politics", "Trust in media",
  "Trust in science", "Trust in others", "Issue distance",
  "Group respectful", "Group knowledgeable", "Group polarized",
  "Discussion enjoyable", "Discussion toxic", "Discussion constructive"
)

model_frames <- list()

for (i in seq_along(models)) {
  model_summary <- summary(models[[i]])
  
  count_model_coef <- as.data.frame(model_summary$coefficients$count)
  count_model_coef$Predictor <- predictors[i]
  count_model_coef$Model <- "Count Model"  
  count_model_coef <- count_model_coef %>% 
    filter(row.names(.) != "(Intercept)") %>%   
    rename(Coefficient = Estimate, SE = `Std. Error`, pval = `Pr(>|z|)`)
  
  zero_hurdle_coef <- as.data.frame(model_summary$coefficients$zero)
  zero_hurdle_coef$Predictor <- predictors[i]
  zero_hurdle_coef$Model <- "Zero Hurdle Model"  
  zero_hurdle_coef <- zero_hurdle_coef %>% 
    filter(row.names(.) != "(Intercept)") %>%   
    rename(Coefficient = Estimate, SE = `Std. Error`, pval = `Pr(>|z|)`)
  
  model_frames[[i]] <- rbind(zero_hurdle_coef, count_model_coef)
}

combined_df <- do.call(rbind, model_frames)

combined_df <- combined_df %>%
  mutate(Predictor = fct_inorder(rev(Predictor))) %>%
  group_by(Model) %>%
  mutate(AlphaValue = abs(Coefficient) / max(abs(Coefficient), na.rm = TRUE)) %>%
  ungroup()

comment_plot_pred_active_hurdle_geometric <- ggplot(combined_df, aes(x = rev(Predictor), alpha = abs(AlphaValue))) + 
  geom_hline(yintercept = 0, colour = gray(1 / 2), lty = 2) + 
  geom_pointrange(aes(y = Coefficient, ymin = Coefficient - SE * interval2,
                      ymax = Coefficient + SE * interval2), colour = "black",
                  lwd = 1, position = position_dodge(width = 2 / 3)) + 
  guides(alpha = "none") +
  theme_bw(base_size = 10) + 
  coord_flip() +
  ggtitle("Prediction of user-level discussion participation") +
  xlab("") +
  ylab("Zero hurdle models for count data regression (geometric, binomial)") +
  facet_wrap(~ Model, ncol = 2, scales = "free_x")  +
  theme(#panel.grid.major = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank())  

ggsave("../output/comment_plot_pred_hurdle_geometric.pdf", comment_plot_pred_active_hurdle_geometric, width = 6, height = 4)

comment_plot_pred_active_hurdle_geometric

Negative Binomial

models <- list(
  hurdle(comment_count ~ gender_male, data = present_sample, dist = "negbin"),
  hurdle(comment_count ~ age, data = present_sample, dist = "negbin"),
  hurdle(comment_count ~ education, data = present_sample, dist = "negbin"),
  hurdle(comment_count ~ comments_online_w1, data = present_sample, dist = "negbin"),
  hurdle(comment_count ~ comment_karma, data = present_sample, dist = "negbin"),
  hurdle(comment_count ~ leftright_w1, data = present_sample, dist = "negbin"),
  hurdle(comment_count ~ polinterest_w1, data = present_sample, dist = "negbin"),
  hurdle(comment_count ~ affective_polarization_w1, data = present_sample, dist = "negbin"),
  hurdle(comment_count ~ trust_general_1_w1, data = present_sample, dist = "negbin"),
  hurdle(comment_count ~ trust_general_2_w1, data = present_sample, dist = "negbin"),
  hurdle(comment_count ~ trust_general_3_w1, data = present_sample, dist = "negbin"),
  hurdle(comment_count ~ trust_general_4_w1, data = present_sample, dist = "negbin"),
  hurdle(comment_count ~ mean_issue_distance_abs, data = present_sample, dist = "negbin"),
  hurdle(comment_count ~ mean_group_respectful, data = present_sample, dist = "negbin"),
  hurdle(comment_count ~ mean_group_knowledgeable, data = present_sample, dist = "negbin"),
  hurdle(comment_count ~ mean_group_polarized, data = present_sample, dist = "negbin"),
  hurdle(comment_count ~ mean_discussion_enjoyable, data = present_sample, dist = "negbin"),
  hurdle(comment_count ~ mean_discussion_toxic, data = present_sample, dist = "negbin"),
  hurdle(comment_count ~ mean_discussion_constructive, data = present_sample, dist = "negbin")
)

model_frames <- list()

for (i in seq_along(models)) {
  model_summary <- summary(models[[i]])
  
  count_model_coef <- as.data.frame(model_summary$coefficients$count)
  count_model_coef$Predictor <- predictors[i]
  count_model_coef$Model <- "Count Model"  
  count_model_coef <- count_model_coef %>% 
    filter(row.names(.) != "(Intercept)") %>% 
    rename(Coefficient = Estimate, SE = `Std. Error`, pval = `Pr(>|z|)`)
  
  zero_hurdle_coef <- as.data.frame(model_summary$coefficients$zero)
  zero_hurdle_coef$Predictor <- predictors[i]
  zero_hurdle_coef$Model <- "Zero Hurdle Model" 
  zero_hurdle_coef <- zero_hurdle_coef %>% 
    filter(row.names(.) != "(Intercept)") %>% 
    rename(Coefficient = Estimate, SE = `Std. Error`, pval = `Pr(>|z|)`)
  
  model_frames[[i]] <- rbind(zero_hurdle_coef, count_model_coef)
}

combined_df <- do.call(rbind, model_frames)

combined_df <- combined_df %>%
  rownames_to_column()%>%
  filter(!str_detect(rowname,"theta"))%>%
  mutate(Predictor = fct_inorder(rev(Predictor))) %>%
  group_by(Model) %>%
  mutate(AlphaValue = abs(Coefficient) / max(abs(Coefficient), na.rm = TRUE)) %>%
  ungroup()

comment_plot_pred_active_hurdle_negbin <- ggplot(combined_df, aes(x = rev(Predictor), alpha = abs(AlphaValue))) + 
  geom_hline(yintercept = 0, colour = gray(1 / 2), lty = 2) + 
  geom_pointrange(aes(y = Coefficient, ymin = Coefficient - SE * interval2,
                      ymax = Coefficient + SE * interval2), colour = "black",
                  lwd = 1, position = position_dodge(width = 2 / 3)) + 
  guides(alpha = "none") +
  theme_bw(base_size = 10) + 
  coord_flip() +
  ggtitle("Prediction of user-level discussion participation") +
  xlab("") +
  ylab("Zero hurdle models for count data regression (negbin, binomial)") +
  facet_wrap(~ Model, ncol = 2, scales = "free_x") +
  theme(#panel.grid.major = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank())  

# Save the plot with "negbin" in the file name
ggsave("../output/comment_plot_pred_hurdle_negbin.pdf", comment_plot_pred_active_hurdle_negbin, width = 6, height = 4)

# Display the plot
comment_plot_pred_active_hurdle_negbin

Binary Active vs. Passive

models <- list(
  glm(active ~ gender_male, data = present_sample),
  glm(active ~ age, data = present_sample),
  glm(active ~ education, data = present_sample),
  glm(active ~ comments_online_w1, data = present_sample),
  glm(active ~ comment_karma, data = present_sample),
  glm(active ~ leftright_w1, data = present_sample),
  glm(active ~ polinterest_w1, data = present_sample),
  glm(active ~ affective_polarization_w1, data = present_sample),
  glm(active ~ trust_general_1_w1, data = present_sample),
  glm(active ~ trust_general_2_w1, data = present_sample),
  glm(active ~ trust_general_3_w1, data = present_sample),
  glm(active ~ trust_general_4_w1, data = present_sample),
  glm(active ~ mean_issue_distance_abs, data = present_sample),
  glm(active ~ mean_group_respectful, data = present_sample),
  glm(active ~ mean_group_knowledgeable, data = present_sample),
  glm(active ~ mean_group_polarized, data = present_sample),
  glm(active ~ mean_discussion_enjoyable, data = present_sample),
  glm(active ~ mean_discussion_toxic, data = present_sample),
  glm(active ~ mean_discussion_constructive, data = present_sample)
)

model_frames <- list()

for (i in seq_along(models)) {
  model_frames[[i]] <- data.frame(
    Coefficient = summary(models[[i]])$coef[-1, 1], 
    pval = summary(models[[i]])$coef[,4],
    SE = summary(models[[i]])$coef[-1, 2],          
    Predictor = predictors[i]                     
  )
}

combined_df <- do.call(rbind, model_frames)
combined_df <- combined_df%>%
  mutate(Significance = ifelse(pval < 0.05, "sig.", "insig."),
         Predictor = fct_inorder(rev(Predictor)))%>%
  group_by(Predictor)%>%
  slice(2) 

comment_plot_pred_binary <- ggplot(combined_df, aes(x = rev(Predictor), alpha = abs(Coefficient))) + 
  geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) + 
  geom_pointrange(aes(y = Coefficient, ymin = Coefficient - SE*interval2,
                      ymax = Coefficient + SE*interval2), colour = "black",
                  lwd = 1, position = position_dodge(width = 2/3)) + 
  guides(alpha = "none")+
  theme_bw(base_size=10)+ 
  coord_flip()+
  ggtitle("Comment activity binary", subtitle = "Excluding inactive users")+
  xlab("")+
  ylab("Change in log odds")+
  theme(#panel.grid.major = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank())  

comment_plot_pred_binary

ggsave("../output/comment_plot_pred_binary.pdf",comment_plot_pred_binary, width = 4, height = 4)

Binary Active vs. Passive - Random Effects

predictors <- c("gender_male", "age", "education", "comments_online_w1", "comment_karma",
                "leftright_w1", "polinterest_w1", "affective_polarization_w1", 
                "trust_general_1_w1", "trust_general_2_w1", "trust_general_3_w1", 
                "trust_general_4_w1", "mean_issue_distance_abs", "mean_group_respectful",
                "mean_group_knowledgeable", "mean_group_polarized", 
                "mean_discussion_enjoyable","mean_discussion_constructive", "mean_discussion_toxic")

predictor_labels <- c(
  "Gender (male)", "Age", "Education", "Commenting online", "Comment karma", "Political orientation",
  "Political interest", "Affective polarization", "Trust in politics","Trust in media",
  "Trust in science", "Trust in others", "Issue distance",
  "Group respectful", "Group knowledgeable", "Group polarized",
  "Discussion enjoyable",  "Discussion constructive","Discussion toxic"
   )

models <- lapply(predictors, function(pred) {
  formula <- as.formula(paste("active ~", pred, "+ (1 +", pred," | subreddit_w1)"))
  glmer(formula, data = present_sample, family = binomial)
})

model_frames <- list()

for (i in seq_along(models)) {
  # Extract fixed effects summary; skip the intercept (row 1)
  coefs <- summary(models[[i]])$coefficients
  
  model_frames[[i]] <- data.frame(
    Coefficient = coefs[-1, "Estimate"],   # exclude intercept
    SE = coefs[-1, "Std. Error"],
    pval = coefs[-1, "Pr(>|z|)"],
    Predictor = predictor_labels[i]
  )
}

combined_df <- do.call(rbind, model_frames) %>%
  mutate(Significance = ifelse(pval < 0.05, "sig.", "insig."),
         Predictor = fct_inorder(rev(Predictor))) %>%
  group_by(Predictor) %>%
  slice(1)  # slice(1) because we have only one predictor per model

interval2 <- 1.96  # for 95% CI

comment_plot_pred_binary_random <- ggplot(combined_df, aes(x = rev(Predictor), alpha = abs(Coefficient))) + 
  geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) + 
  geom_pointrange(aes(y = Coefficient, ymin = Coefficient - SE*interval2,
                      ymax = Coefficient + SE*interval2), colour = "black",
                  lwd = 1, position = position_dodge(width = 2/3)) + 
  geom_vline(xintercept = 7.5)+
  guides(alpha = "none")+
  theme_bw(base_size=10)+ 
  coord_flip()+
  ggtitle("Zero model | silent vs. active", subtitle = "Excluding join-only users, community random effects")+
  xlab("")+
  ylab("Change in log odds")+
  theme(#panel.grid.major = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank())  

comment_plot_pred_binary_random

ggsave("../output/comment_plot_pred_binary_random.pdf", comment_plot_pred_binary_random, width = 5, height = 5)

Manual poisson models for count data (active only)

present_sample <- scaled_sample%>%filter(active == 1)

models <- list(
  glm(comment_count ~ gender_male, family = poisson(link = "log"), data = present_sample),
  glm(comment_count ~ age, family = poisson(link = "log"), data = present_sample),
  glm(comment_count ~ education, family = poisson(link = "log"), data = present_sample),
  glm(comment_count ~ comments_online_w1, family = poisson(link = "log"), data = present_sample),
  glm(comment_count ~ comment_karma, family = poisson(link = "log"), data = present_sample),
  glm(comment_count ~ leftright_w1, family = poisson(link = "log"), data = present_sample),
  glm(comment_count ~ polinterest_w1, family = poisson(link = "log"), data = present_sample),
  glm(comment_count ~ affective_polarization_w1, family = poisson(link = "log"), data = present_sample),
  glm(comment_count ~ trust_general_1_w1, family = poisson(link = "log"), data = present_sample),
  glm(comment_count ~ trust_general_2_w1, family = poisson(link = "log"), data = present_sample),
  glm(comment_count ~ trust_general_3_w1, family = poisson(link = "log"), data = present_sample),
  glm(comment_count ~ trust_general_4_w1, family = poisson(link = "log"), data = present_sample),
  glm(comment_count ~ mean_issue_distance_abs, family = poisson(link = "log"), data = present_sample),
  glm(comment_count ~ mean_group_respectful, family = poisson(link = "log"), data = present_sample),
  glm(comment_count ~ mean_group_knowledgeable, family = poisson(link = "log"), data = present_sample),
  glm(comment_count ~ mean_group_polarized, family = poisson(link = "log"), data = present_sample),
  glm(comment_count ~ mean_discussion_enjoyable, family = poisson(link = "log"), data = present_sample),
  glm(comment_count ~ mean_discussion_toxic, family = poisson(link = "log"), data = present_sample),
  glm(comment_count ~ mean_discussion_constructive, family = poisson(link = "log"), data = present_sample)
)

model_frames <- list()

for (i in seq_along(models)) {
  model_summary <- summary(models[[i]])
    model_frames[[i]] <- data.frame(
    Coefficient = model_summary$coef[-1, 1],        
    pval = model_summary$coef[-1, 4],               
    SE = model_summary$coef[-1, 2],                 
    Predictor = predictors[i]                       
  )
}


combined_df <- do.call(rbind, model_frames)
combined_df <- combined_df %>%
  mutate(Predictor = fct_inorder(rev(Predictor)))

comment_plot_pred_active <- ggplot(combined_df, aes(x = rev(Predictor), alpha = abs(Coefficient))) + 
  geom_hline(yintercept = 0, colour = gray(1 / 2), lty = 2) + 
  geom_pointrange(aes(y = Coefficient, ymin = Coefficient - SE * interval2,
                      ymax = Coefficient + SE * interval2), colour = "black",
                  lwd = 1 , position = position_dodge(width = 2 / 3)) + 
  guides(alpha = "none") +
  theme_bw(base_size = 10) + 
  coord_flip() +
  ggtitle("Comment activity", subtitle = "Poisson prediction of comment count") +
  xlab("") +
  ylab("Change in log number of comments")+
  theme(#panel.grid.major = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank())  
comment_plot_pred_active

ggsave("../output/comment_plot_pred_poisson.pdf", comment_plot_pred_active, width = 4, height = 4)

Manual poisson models for count data (active only) - Random Effects

# Filter for active users
present_sample <- scaled_sample %>% filter(active == 1)

# Fit Poisson models with random effects
models <- lapply(predictors, function(pred) {
  formula <- as.formula(paste("comment_count ~", pred, "+ (1 +", pred," | subreddit_w1)"))
  glmer(formula, data = present_sample, family = poisson(link = "log"))
})

# Extract model summaries
model_frames <- list()

for (i in seq_along(models)) {
  coefs <- summary(models[[i]])$coefficients
  
  model_frames[[i]] <- data.frame(
    Coefficient = coefs[-1, "Estimate"],   # Exclude intercept
    SE = coefs[-1, "Std. Error"],
    pval = coefs[-1, "Pr(>|z|)"],
    Predictor = predictor_labels[i]
  )
}

# Combine results
combined_df <- do.call(rbind, model_frames) %>%
  mutate(Predictor = fct_inorder(rev(Predictor)))

# Define interval for confidence intervals
interval2 <- 1.96  # 95% CI

# Create the plot
comment_plot_pred_active_random <- ggplot(combined_df, aes(x = rev(Predictor), alpha = abs(Coefficient))) + 
  geom_hline(yintercept = 0, colour = gray(1 / 2), lty = 2) + 
  geom_vline(xintercept = 7.5)+
  geom_pointrange(aes(y = Coefficient, ymin = Coefficient - SE * interval2,
                      ymax = Coefficient + SE * interval2), colour = "black",
                  lwd = 1 , position = position_dodge(width = 2 / 3)) + 
  guides(alpha = "none") +
  theme_bw(base_size = 10) + 
  coord_flip() +
  ggtitle("Count model | comment activity", subtitle = "Poisson model | subreddit random effects") +
  xlab("") +
  ylab("Change in log number of comments") +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank())  

# Display and save the plot
comment_plot_pred_active_random

ggsave("../output/comment_plot_pred_poisson_random.pdf", comment_plot_pred_active_random, width = 5, height = 5)

Linear models (Excluding 0 and inactive)

present_sample <- scaled_sample%>%filter(active == 1)

mean(present_sample$comment_count)

## [1] 17.35347

sd(present_sample$comment_count)

## [1] 18.97236

max(present_sample$comment_count)

## [1] 114

models <- list(
  lm_robust(comment_count ~ gender_male, data = present_sample),
  lm_robust(comment_count ~ age, data = present_sample),
  lm_robust(comment_count ~ education, data = present_sample),
  lm_robust(comment_count ~ comments_online_w1, data = present_sample),
  lm_robust(comment_count ~ comment_karma, data = present_sample),
  lm_robust(comment_count ~ leftright_w1, data = present_sample),
  lm_robust(comment_count ~ polinterest_w1, data = present_sample),
  lm_robust(comment_count ~ affective_polarization_w1, data = present_sample),
  lm_robust(comment_count ~ trust_general_1_w1, data = present_sample),
  lm_robust(comment_count ~ trust_general_2_w1, data = present_sample),
  lm_robust(comment_count ~ trust_general_3_w1, data = present_sample),
  lm_robust(comment_count ~ trust_general_4_w1, data = present_sample),
  lm_robust(comment_count ~ mean_issue_distance_abs, data = present_sample),
  lm_robust(comment_count ~ mean_group_respectful, data = present_sample),
  lm_robust(comment_count ~ mean_group_knowledgeable, data = present_sample),
  lm_robust(comment_count ~ mean_group_polarized, data = present_sample),
  lm_robust(comment_count ~ mean_discussion_enjoyable, data = present_sample),
  lm_robust(comment_count ~ mean_discussion_toxic, data = present_sample),
  lm_robust(comment_count ~ mean_discussion_constructive, data = present_sample)
)

model_frames <- list()

for (i in seq_along(models)) {
  model_frames[[i]] <- data.frame(
    Coefficient = summary(models[[i]])$coef[-1, 1],  
    pval = summary(models[[i]])$coef[,4],
    SE = summary(models[[i]])$coef[-1, 2],         
    Predictor = predictors[i]                       
  )
}

combined_df <- do.call(rbind, model_frames)
combined_df <- combined_df%>%
  mutate(Significance = ifelse(pval < 0.05, "sig.", "insig."),
         Predictor = fct_inorder(rev(Predictor)))%>%
  group_by(Predictor)%>%
  slice(2) 


comment_plot_pred_active_ex0 <- ggplot(combined_df, aes(x = rev(Predictor), alpha = abs(Coefficient))) + 
  geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) + 
  geom_pointrange(aes(y = Coefficient, ymin = Coefficient - SE*interval2,
                      ymax = Coefficient + SE*interval2), colour = "black",
                  lwd = 1/2, position = position_dodge(width = 2/3)) + 
  guides(alpha = "none")+
  theme_bw(base_size=10)+ 
  coord_flip()+
  ggtitle("Comment activity", subtitle = "Linear prediction of comment count")+
  xlab("")+
  ylab("Scaled beta coefficients, HC2 robust SE")+
  theme(#panel.grid.major = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank())  

comment_plot_pred_active_ex0

ggsave("../output/comment_plot_pred_active_excl0.pdf",comment_plot_pred_active_ex0, width = 4, height = 4)

Linear models to predict Scores (Excluding 0 and inactive)

present_sample <- scaled_sample%>%filter(active == 1)

mean(present_sample$comment_mean_score)

## [1] 2.607417

sd(present_sample$comment_mean_score)

## [1] 1.722391

max(present_sample$comment_mean_score)

## [1] 18

models <- list(
  lm_robust(comment_mean_score ~ gender_male, data = present_sample),
  lm_robust(comment_mean_score ~ age, data = present_sample),
  lm_robust(comment_mean_score ~ education, data = present_sample),
  lm_robust(comment_mean_score ~ comments_online_w1, data = present_sample),
  lm_robust(comment_mean_score ~ comment_karma, data = present_sample),
  lm_robust(comment_mean_score ~ leftright_w1, data = present_sample),
  lm_robust(comment_mean_score ~ polinterest_w1, data = present_sample),
  lm_robust(comment_mean_score ~ affective_polarization_w1, data = present_sample),
  lm_robust(comment_mean_score ~ trust_general_1_w1, data = present_sample),
  lm_robust(comment_mean_score ~ trust_general_2_w1, data = present_sample),
  lm_robust(comment_mean_score ~ trust_general_3_w1, data = present_sample),
  lm_robust(comment_mean_score ~ trust_general_4_w1, data = present_sample),
  lm_robust(comment_mean_score ~ mean_issue_distance_abs, data = present_sample),
  lm_robust(comment_mean_score ~ mean_group_respectful, data = present_sample),
  lm_robust(comment_mean_score ~ mean_group_knowledgeable, data = present_sample),
  lm_robust(comment_mean_score ~ mean_group_polarized, data = present_sample),
  lm_robust(comment_mean_score ~ mean_discussion_enjoyable, data = present_sample),
  lm_robust(comment_mean_score ~ mean_discussion_toxic, data = present_sample),
  lm_robust(comment_mean_score ~ mean_discussion_constructive, data = present_sample)
)

model_frames <- list()

for (i in seq_along(models)) {
  model_frames[[i]] <- data.frame(
    Coefficient = summary(models[[i]])$coef[-1, 1],  
    pval = summary(models[[i]])$coef[,4],
    SE = summary(models[[i]])$coef[-1, 2],         
    Predictor = predictors[i]                       
  )
}

combined_df <- do.call(rbind, model_frames)
combined_df <- combined_df%>%
  mutate(Significance = ifelse(pval < 0.05, "sig.", "insig."),
         Predictor = fct_inorder(rev(Predictor)))%>%
  group_by(Predictor)%>%
  slice(2) 

comment_plot_pred_active_ex0_score <- ggplot(combined_df, aes(x = rev(Predictor), alpha = abs(Coefficient))) + 
  geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) + 
  geom_pointrange(aes(y = Coefficient, ymin = Coefficient - SE*interval2,
                      ymax = Coefficient + SE*interval2), colour = "black",
                  lwd = 1/2, position = position_dodge(width = 2/3)) + 
  guides(alpha = "none")+
  theme_bw(base_size=10)+ 
  coord_flip()+
  ggtitle("Comment scores", subtitle = "Linear prediction of comment scores")+
  xlab("")+
  ylab("Scaled beta coefficients, HC2 robust SE")+
  theme(#panel.grid.major = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank())  

comment_plot_pred_active_ex0_score

ggsave("../output/comment_plot_pred_active_excl0_score.pdf",comment_plot_pred_active_ex0_score, width = 4, height = 4)

Hurdle models for different experimental conditions (start RQ2)

present_sample <- scaled_sample%>%filter(active2 != "inactive")

# filtered data for control condition
scaled_sample_c <- present_sample %>% filter(condition == "control")

# models for control condition
models_control <- list(
  hurdle(comment_count ~ gender_male, data = scaled_sample_c),
  hurdle(comment_count ~ age, data = scaled_sample_c),
  hurdle(comment_count ~ education, data = scaled_sample_c),
  hurdle(comment_count ~ comments_online_w1, data = scaled_sample_c),
  hurdle(comment_count ~ comment_karma, data = scaled_sample_c),
  hurdle(comment_count ~ leftright_w1, data = scaled_sample_c),
  hurdle(comment_count ~ polinterest_w1, data = scaled_sample_c),
  hurdle(comment_count ~ affective_polarization_w1, data = scaled_sample_c),
  hurdle(comment_count ~ trust_general_1_w1, data = scaled_sample_c),
  hurdle(comment_count ~ trust_general_2_w1, data = scaled_sample_c),
  hurdle(comment_count ~ trust_general_3_w1, data = scaled_sample_c),
  hurdle(comment_count ~ trust_general_4_w1, data = scaled_sample_c),
  hurdle(comment_count ~ mean_issue_distance_abs, data = scaled_sample_c),
  hurdle(comment_count ~ mean_group_respectful, data = scaled_sample_c),
  hurdle(comment_count ~ mean_group_knowledgeable, data = scaled_sample_c),
  hurdle(comment_count ~ mean_group_polarized, data = scaled_sample_c),
  hurdle(comment_count ~ mean_discussion_enjoyable, data = scaled_sample_c),
  hurdle(comment_count ~ mean_discussion_toxic, data = scaled_sample_c),
  hurdle(comment_count ~ mean_discussion_constructive, data = scaled_sample_c)
)

model_frames_control <- list()

for (i in seq_along(models_control)) {
  model_summary <- summary(models_control[[i]])
  
  count_model_coef <- as.data.frame(model_summary$coefficients$count)
  count_model_coef$Predictor <- predictors[i]
  count_model_coef$Model <- "Count Model"  
  count_model_coef <- count_model_coef %>% 
    filter(row.names(.) != "(Intercept)") %>% 
    rename(Coefficient = Estimate, SE = `Std. Error`, pval = `Pr(>|z|)`)
  
  zero_hurdle_coef <- as.data.frame(model_summary$coefficients$zero)
  zero_hurdle_coef$Predictor <- predictors[i]
  zero_hurdle_coef$Model <- "Zero Hurdle Model"  
  zero_hurdle_coef <- zero_hurdle_coef %>% 
    filter(row.names(.) != "(Intercept)") %>% 
    rename(Coefficient = Estimate, SE = `Std. Error`, pval = `Pr(>|z|)`)
  
  model_frames_control[[i]] <- rbind(zero_hurdle_coef, count_model_coef)
}

combined_df_control <- bind_rows(model_frames_control) %>%
  mutate(Significance = ifelse(pval < 0.05, "sig.", "insig."),
         Predictor = fct_inorder(rev(Predictor))) %>%
  group_by(Model) %>%
  mutate(AlphaValue = abs(Coefficient) / max(abs(Coefficient), na.rm = TRUE)) %>%
  ungroup()

comment_plot_pred_c <- ggplot(combined_df_control, aes(x = rev(Predictor), alpha = AlphaValue)) +
  geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) +
  geom_pointrange(aes(y = Coefficient, ymin = Coefficient - SE * interval2,
                      ymax = Coefficient + SE * interval2), colour = "grey", lwd = 1) +
  guides(alpha = "none") +
  theme_bw(base_size = 10) +
  coord_flip() +
  facet_wrap(~ Model, ncol = 2, scales = "free_x") +
  ggtitle("Control condition", subtitle = "Prediction of user-level discussion participation") +
  xlab("") +
  ylab("Zero hurdle models for count data regression")+
  theme(#panel.grid.major = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank())  

# filtered data for moderation condition
scaled_sample_m <- present_sample %>% filter(condition == "moderation")

# Models for moderation condition
models_moderation <- list(
  hurdle(comment_count ~ gender_male, data = scaled_sample_m),
  hurdle(comment_count ~ age, data = scaled_sample_m),
  hurdle(comment_count ~ education, data = scaled_sample_m),
  hurdle(comment_count ~ comments_online_w1, data = scaled_sample_m),
  hurdle(comment_count ~ comment_karma, data = scaled_sample_m),
  hurdle(comment_count ~ leftright_w1, data = scaled_sample_m),
  hurdle(comment_count ~ polinterest_w1, data = scaled_sample_m),
  hurdle(comment_count ~ affective_polarization_w1, data = scaled_sample_m),
  hurdle(comment_count ~ trust_general_1_w1, data = scaled_sample_m),
  hurdle(comment_count ~ trust_general_2_w1, data = scaled_sample_m),
  hurdle(comment_count ~ trust_general_3_w1, data = scaled_sample_m),
  hurdle(comment_count ~ trust_general_4_w1, data = scaled_sample_m),
  hurdle(comment_count ~ mean_issue_distance_abs, data = scaled_sample_m),
  hurdle(comment_count ~ mean_group_respectful, data = scaled_sample_m),
  hurdle(comment_count ~ mean_group_knowledgeable, data = scaled_sample_m),
  hurdle(comment_count ~ mean_group_polarized, data = scaled_sample_m),
  hurdle(comment_count ~ mean_discussion_enjoyable, data = scaled_sample_m),
  hurdle(comment_count ~ mean_discussion_toxic, data = scaled_sample_m),
  hurdle(comment_count ~ mean_discussion_constructive, data = scaled_sample_m)
)


model_frames_moderation <- list()
for (i in seq_along(models_moderation)) {
  model_summary <- summary(models_moderation[[i]])
  
    count_model_coef <- as.data.frame(model_summary$coefficients$count)
  count_model_coef$Predictor <- predictors[i]
  count_model_coef$Model <- "Count Model"   
  count_model_coef <- count_model_coef %>% 
    filter(row.names(.) != "(Intercept)") %>%   
    rename(Coefficient = Estimate, SE = `Std. Error`, pval = `Pr(>|z|)`)
  
  zero_hurdle_coef <- as.data.frame(model_summary$coefficients$zero)
  zero_hurdle_coef$Predictor <- predictors[i]
  zero_hurdle_coef$Model <- "Zero Hurdle Model"   
  zero_hurdle_coef <- zero_hurdle_coef %>% 
    filter(row.names(.) != "(Intercept)") %>%   
    rename(Coefficient = Estimate, SE = `Std. Error`, pval = `Pr(>|z|)`)
  
  model_frames_moderation[[i]] <- rbind(zero_hurdle_coef, count_model_coef)
}

combined_df_moderation <- bind_rows(model_frames_moderation) %>%
  mutate(Significance = ifelse(pval < 0.05, "sig.", "insig."),
         Predictor = fct_inorder(rev(Predictor))) %>%
  group_by(Model) %>%
  mutate(AlphaValue = abs(Coefficient) / max(abs(Coefficient), na.rm = TRUE)) %>%
  ungroup()

comment_plot_pred_m <- ggplot(combined_df_moderation, aes(x = rev(Predictor), alpha = AlphaValue)) +
  geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) +
  geom_pointrange(aes(y = Coefficient, ymin = Coefficient - SE * interval2,
                      ymax = Coefficient + SE * interval2), colour = "#6699FF", lwd = 1) +
  guides(alpha = "none") +
  theme_bw(base_size = 10) +
  coord_flip() +
  facet_wrap(~ Model, ncol = 2, scales = "free_x") +
  ggtitle("Moderation condition", subtitle = "Prediction of user-level discussion participation") +
  xlab("") +
  ylab("Zero hurdle models for count data regression")+
  theme(#panel.grid.major = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank())  


# filtered data for incentives condition
scaled_sample_i <- present_sample %>% filter(condition == "incentives")

# Models for incentives condition
models_incentives <- list(
  hurdle(comment_count ~ gender_male, data = scaled_sample_i),
  hurdle(comment_count ~ age, data = scaled_sample_i),
  hurdle(comment_count ~ education, data = scaled_sample_i),
  hurdle(comment_count ~ comments_online_w1, data = scaled_sample_i),
  hurdle(comment_count ~ comment_karma, data = scaled_sample_i),
  hurdle(comment_count ~ leftright_w1, data = scaled_sample_i),
  hurdle(comment_count ~ polinterest_w1, data = scaled_sample_i),
  hurdle(comment_count ~ affective_polarization_w1, data = scaled_sample_i),
  hurdle(comment_count ~ trust_general_1_w1, data = scaled_sample_i),
  hurdle(comment_count ~ trust_general_2_w1, data = scaled_sample_i),
  hurdle(comment_count ~ trust_general_3_w1, data = scaled_sample_i),
  hurdle(comment_count ~ trust_general_4_w1, data = scaled_sample_i),
  hurdle(comment_count ~ mean_issue_distance_abs, data = scaled_sample_i),
  hurdle(comment_count ~ mean_group_respectful, data = scaled_sample_i),
  hurdle(comment_count ~ mean_group_knowledgeable, data = scaled_sample_i),
  hurdle(comment_count ~ mean_group_polarized, data = scaled_sample_i),
  hurdle(comment_count ~ mean_discussion_enjoyable, data = scaled_sample_i),
  hurdle(comment_count ~ mean_discussion_toxic, data = scaled_sample_i),
  hurdle(comment_count ~ mean_discussion_constructive, data = scaled_sample_i)
)


model_frames_incentives <- list()
for (i in seq_along(models_incentives)) {
  model_summary <- summary(models_incentives[[i]])
  
  count_model_coef <- as.data.frame(model_summary$coefficients$count)
  count_model_coef$Predictor <- predictors[i]
  count_model_coef$Model <- "Count Model"   
  count_model_coef <- count_model_coef %>% 
    filter(row.names(.) != "(Intercept)") %>%   
    rename(Coefficient = Estimate, SE = `Std. Error`, pval = `Pr(>|z|)`)
  
  zero_hurdle_coef <- as.data.frame(model_summary$coefficients$zero)
  zero_hurdle_coef$Predictor <- predictors[i]
  zero_hurdle_coef$Model <- "Zero Hurdle Model"   
  zero_hurdle_coef <- zero_hurdle_coef %>% 
    filter(row.names(.) != "(Intercept)") %>%   
    rename(Coefficient = Estimate, SE = `Std. Error`, pval = `Pr(>|z|)`)
  
  model_frames_incentives[[i]] <- rbind(zero_hurdle_coef, count_model_coef)
}

combined_df_incentives <- bind_rows(model_frames_incentives) %>%
  mutate(Significance = ifelse(pval < 0.05, "sig.", "insig."),
         Predictor = fct_inorder(rev(Predictor))) %>%
  group_by(Model) %>%
  mutate(AlphaValue = abs(Coefficient) / max(abs(Coefficient), na.rm = TRUE)) %>%
  ungroup()

comment_plot_pred_i <- ggplot(combined_df_incentives, aes(x = rev(Predictor), alpha = AlphaValue)) +
  geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) +
  geom_pointrange(aes(y = Coefficient, ymin = Coefficient - SE * interval2,
                      ymax = Coefficient + SE * interval2), colour = "#C5701A", lwd = 1) +
  guides(alpha = "none") +
  theme_bw(base_size = 10) +
  coord_flip() +
  facet_wrap(~ Model, ncol = 2, scales = "free_x") +
  ggtitle("Incentives condition", subtitle = "Prediction of user-level discussion participation") +
  xlab("") +
  ylab("Zero hurdle models for count data regression")+
  theme(#panel.grid.major = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank())  

combi_conditions <- grid.arrange(comment_plot_pred_c, comment_plot_pred_m, comment_plot_pred_i, nrow = 3)

ggsave("../output/combi_comment_plot_pred_conditions_hurdle.pdf", combi_conditions, width = 8, height = 12)

Hurdle models for different conditions simple for main manuscript

present_sample <- scaled_sample%>%filter(active2 != "inactive")

# filtered data for control condition
scaled_sample_c <- present_sample %>% filter(condition == "control")

models_control <- list(
  hurdle(comment_count ~ gender_male, data = scaled_sample_c),
  hurdle(comment_count ~ polinterest_w1, data = scaled_sample_c),
  hurdle(comment_count ~ mean_group_knowledgeable, data = scaled_sample_c),
  hurdle(comment_count ~ mean_discussion_toxic, data = scaled_sample_c)
)

predictors <- c("Gender (male)",  "Political interest", 
                "Group knowledgeable", "Discussion toxic")

model_frames_control <- list()

for (i in seq_along(models_control)) {
  model_summary <- summary(models_control[[i]])
  
  count_model_coef <- as.data.frame(model_summary$coefficients$count)
  count_model_coef$Predictor <- predictors[i]
  count_model_coef$Model <- "Count Model"   
  count_model_coef <- count_model_coef %>% 
    filter(row.names(.) != "(Intercept)") %>%   
    rename(Coefficient = Estimate, SE = `Std. Error`, pval = `Pr(>|z|)`)
  
  zero_hurdle_coef <- as.data.frame(model_summary$coefficients$zero)
  zero_hurdle_coef$Predictor <- predictors[i]
  zero_hurdle_coef$Model <- "Zero Hurdle Model"   
  zero_hurdle_coef <- zero_hurdle_coef %>% 
    filter(row.names(.) != "(Intercept)") %>%   
    rename(Coefficient = Estimate, SE = `Std. Error`, pval = `Pr(>|z|)`)
  
  model_frames_control[[i]] <- rbind(zero_hurdle_coef, count_model_coef)
}

combined_df_control <- bind_rows(model_frames_control) %>%
  mutate(Significance = ifelse(pval < 0.05, "sig.", "insig."),
         Predictor = fct_inorder(rev(Predictor))) %>%
  group_by(Model) %>%
  mutate(AlphaValue = abs(Coefficient) / max(abs(Coefficient), na.rm = TRUE)) %>%
  ungroup()

comment_plot_pred_c <- ggplot(combined_df_control, aes(x = rev(Predictor), alpha = AlphaValue)) +
  geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) +
  geom_pointrange(aes(y = Coefficient, ymin = Coefficient - SE * interval2,
                      ymax = Coefficient + SE * interval2), colour = "grey", lwd = 1) +
  guides(alpha = "none") +
  theme_bw(base_size = 10) +
  coord_flip() +
  ggtitle("Control condition", subtitle = "Prediction of discussion participation") +
  xlab("") +
  ylab("")+
  facet_wrap(~ forcats::fct_rev(Model), ncol = 2, scales = "free_x")+
  theme(#panel.grid.major = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank()) 

# filtered data for moderation condition
scaled_sample_m <- present_sample %>% filter(condition == "moderation")

models_moderation <- list(
  hurdle(comment_count ~ gender_male, data = scaled_sample_m),
  hurdle(comment_count ~ polinterest_w1, data = scaled_sample_m),
  hurdle(comment_count ~ mean_group_knowledgeable, data = scaled_sample_m),
  hurdle(comment_count ~ mean_discussion_toxic, data = scaled_sample_m)
)


model_frames_moderation <- list()
for (i in seq_along(models_moderation)) {
  model_summary <- summary(models_moderation[[i]])
  
    count_model_coef <- as.data.frame(model_summary$coefficients$count)
  count_model_coef$Predictor <- predictors[i]
  count_model_coef$Model <- "Count Model"   
  count_model_coef <- count_model_coef %>% 
    filter(row.names(.) != "(Intercept)") %>%   
    rename(Coefficient = Estimate, SE = `Std. Error`, pval = `Pr(>|z|)`)
  
  zero_hurdle_coef <- as.data.frame(model_summary$coefficients$zero)
  zero_hurdle_coef$Predictor <- predictors[i]
  zero_hurdle_coef$Model <- "Zero Hurdle Model"   
  zero_hurdle_coef <- zero_hurdle_coef %>% 
    filter(row.names(.) != "(Intercept)") %>%   
    rename(Coefficient = Estimate, SE = `Std. Error`, pval = `Pr(>|z|)`)
  
  model_frames_moderation[[i]] <- rbind(zero_hurdle_coef, count_model_coef)
}

combined_df_moderation <- bind_rows(model_frames_moderation) %>%
  mutate(Significance = ifelse(pval < 0.05, "sig.", "insig."),
         Predictor = fct_inorder(rev(Predictor))) %>%
  group_by(Model) %>%
  mutate(AlphaValue = abs(Coefficient) / max(abs(Coefficient), na.rm = TRUE)) %>%
  ungroup()

comment_plot_pred_m <- ggplot(combined_df_moderation, aes(x = rev(Predictor))) +
  geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) +
  geom_pointrange(aes(y = Coefficient, ymin = Coefficient - SE * interval2,
                      ymax = Coefficient + SE * interval2), colour = "#6699FF", lwd = 1) +
  guides(alpha = "none") +
  theme_bw(base_size = 10) +
  coord_flip() +
  ggtitle("Moderation condition", subtitle = "Prediction of discussion participation") +
  xlab("") +
  ylab("")+
  facet_wrap(~ forcats::fct_rev(Model), ncol = 2, scales = "free_x")+
  theme(#panel.grid.major = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank()) 


scaled_sample_i <- present_sample %>% filter(condition == "incentives")

models_incentives <- list(
  hurdle(comment_count ~ gender_male, data = scaled_sample_i),
  hurdle(comment_count ~ polinterest_w1, data = scaled_sample_i),
  hurdle(comment_count ~ mean_group_knowledgeable, data = scaled_sample_i),
  hurdle(comment_count ~ mean_discussion_toxic, data = scaled_sample_i)
)


model_frames_incentives <- list()
for (i in seq_along(models_incentives)) {
  model_summary <- summary(models_incentives[[i]])
  
  count_model_coef <- as.data.frame(model_summary$coefficients$count)
  count_model_coef$Predictor <- predictors[i]
  count_model_coef$Model <- "Count Model"   
  count_model_coef <- count_model_coef %>% 
    filter(row.names(.) != "(Intercept)") %>%   
    rename(Coefficient = Estimate, SE = `Std. Error`, pval = `Pr(>|z|)`)
  
  zero_hurdle_coef <- as.data.frame(model_summary$coefficients$zero)
  zero_hurdle_coef$Predictor <- predictors[i]
  zero_hurdle_coef$Model <- "Zero Hurdle Model"   
  zero_hurdle_coef <- zero_hurdle_coef %>% 
    filter(row.names(.) != "(Intercept)") %>%   
    rename(Coefficient = Estimate, SE = `Std. Error`, pval = `Pr(>|z|)`)
  
  model_frames_incentives[[i]] <- rbind(zero_hurdle_coef, count_model_coef)
}

combined_df_incentives <- bind_rows(model_frames_incentives) %>%
  mutate(Significance = ifelse(pval < 0.05, "sig.", "insig."),
         Predictor = fct_inorder(rev(Predictor))) %>%
  group_by(Model) %>%
  mutate(AlphaValue = abs(Coefficient) / max(abs(Coefficient), na.rm = TRUE)) %>%
  ungroup()

comment_plot_pred_i <- ggplot(combined_df_incentives, aes(x = rev(Predictor))) +
  geom_hline(yintercept = 0, colour = gray(1/2), lty = 2) +
  geom_pointrange(aes(y = Coefficient, ymin = Coefficient - SE * interval2,
                      ymax = Coefficient + SE * interval2), colour = "#C5701A", lwd = 1) +
  guides(alpha = "none") +
  theme_bw(base_size = 10) +
  coord_flip() +
  ggtitle("Incentives condition",subtitle = "Prediction of discussion participation") +
  xlab("") +
  ylab("")+
  facet_wrap(~ forcats::fct_rev(Model), ncol = 2, scales = "free_x")+
  theme(#panel.grid.major = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank()) 

combi_conditions <- grid.arrange(comment_plot_pred_m, comment_plot_pred_i, nrow = 1)

ggsave("../output/combi_comment_plot_pred_conditions_hurdle_simple.pdf", combi_conditions, width = 10, height = 3)

Baseline Balance

# variables balance table
covariates <- c("age", "gender", "education","time_online_w1" , "social_media_w1" , "comments_online_w1" ,
 "leftright_w1"                  ,  "polinterest_w1"                , "affective_pol_1_w1"             ,
 "affective_pol_2_w1"            ,  "issue_attitudes_loan_w1"       , "issue_attitudes_airbnb_w1"      ,
 "issue_attitudes_minwage_w1"    ,  "issue_attitudes_ukraine_w1"    , "issue_attitudes_ubi_w1"         ,
 "issue_attitudes_climate_w1"    ,  "issue_attitudes_fur_w1"        , "issue_attitudes_renewable_w1"   ,
 "issue_attitudes_gaza_w1"       ,  "issue_attitudes_vegetarian_w1" , "issue_attitudes_gender_w1"      ,
 "issue_attitudes_sexwork_w1"    ,  "issue_attitudes_vaccine_w1"    , "issue_attitudes_healthcare_w1"  ,
 "issue_attitudes_immigration_w1",  "issue_attitudes_guns_w1"       , "issue_attitudes_deathpenalty_w1",
 "issue_attitudes_bodycams_w1"   ,  "issue_attitudes_ai_w1"         , "issue_attitudes_socialmedia_w1" ,
 "issue_knowledge_loan_w1"       ,  "issue_knowledge_airbnb_w1"     , "issue_knowledge_minwage_w1"     ,
 "issue_knowledge_ukraine_w1"    ,  "issue_knowledge_ubi_w1"        , "issue_knowledge_climate_w1"     ,
 "issue_knowledge_fur_w1"        ,  "issue_knowledge_renewable_w1"  , "issue_knowledge_mideast_w1"     ,
 "issue_knowledge_vegetarian_w1" ,  "issue_knowledge_gender_w1"     , "issue_knowledge_sexwork_w1"     ,
 "issue_knowledge_vaccine_w1"    ,  "issue_knowledge_healthcare_w1" , "issue_knowledge_immigration_w1" ,
 "issue_knowledge_guns_w1"       ,  "issue_knowledge_deathpenalty_w1", "issue_knowledge_bodycams_w1"    ,
 "issue_knowledge_ai_w1"         ,  "issue_knowledge_socialmedia_w1", "political_efficacy_1_w1"        ,
 "political_efficacy_2_w1"       ,  "political_efficacy_3_w1"       , "trust_general_1_w1"             ,
 "trust_general_2_w1"            ,  "trust_general_3_w1"            , "trust_general_4_w1"             ,
 "pol_cynicism_w1", "affective_polarization_w1"  ,     "efficacy_w1" ,"comment_karma","comment_count" ,                  "comment_mean_lenght"   ,   "comment_mean_score"  ,     "comment_mean_tox"  ,  "ex_comment_count",  "ex_comment_mean_score"  ,  "ex_comment_mean_tox"  ,    "ex_comment_mean_lenght"  )

table1 <- CreateTableOne(vars = covariates, strata = "condition", data = full_data)

latex_table <- print(table1, smd = TRUE, printToggle = FALSE)
rownames(latex_table) <- gsub("\\(mean \\(SD\\)\\)", "", rownames(latex_table))

knitr::kable(latex_table[, -grep("test", colnames(latex_table))], 
                      format = "html", booktabs = TRUE)

	control	incentives	moderation	p	SMD
n	181	169	170
age	43.46 (13.90)	43.03 (13.69)	43.77 (13.71)	0.883	0.036
gender	1.62 (0.59)	1.54 (0.61)	1.56 (0.60)	0.442	0.089
education	4.28 (1.35)	4.25 (1.38)	4.27 (1.31)	0.982	0.013
time_online_w1	3.93 (0.50)	3.89 (0.44)	3.92 (0.53)	0.743	0.056
social_media_w1	3.77 (0.64)	3.81 (0.60)	3.83 (0.60)	0.628	0.066
comments_online_w1	3.48 (1.08)	3.59 (1.00)	3.68 (1.02)	0.189	0.130
leftright_w1	3.73 (2.06)	3.85 (2.04)	4.16 (2.18)	0.150	0.134
polinterest_w1	3.60 (0.62)	3.60 (0.56)	3.61 (0.55)	0.971	0.017
affective_pol_1_w1	53.20 (25.74)	50.31 (27.37)	48.66 (27.65)	0.278	0.113
affective_pol_2_w1	16.51 (21.50)	15.16 (21.11)	16.46 (19.67)	0.795	0.043
issue_attitudes_loan_w1	1.96 (1.43)	2.20 (1.55)	2.41 (1.58)	0.020	0.201
issue_attitudes_airbnb_w1	3.12 (1.46)	3.15 (1.44)	3.08 (1.49)	0.904	0.033
issue_attitudes_minwage_w1	5.24 (1.19)	5.20 (1.28)	4.87 (1.51)	0.019	0.179
issue_attitudes_ukraine_w1	4.49 (1.61)	4.61 (1.48)	4.36 (1.70)	0.354	0.105
issue_attitudes_ubi_w1	1.98 (1.33)	2.21 (1.50)	2.45 (1.53)	0.012	0.213
issue_attitudes_climate_w1	5.19 (1.24)	5.02 (1.50)	5.02 (1.42)	0.425	0.084
issue_attitudes_fur_w1	3.08 (1.62)	3.18 (1.57)	2.91 (1.48)	0.254	0.120
issue_attitudes_renewable_w1	1.41 (0.93)	1.43 (0.90)	1.61 (1.20)	0.158	0.120
issue_attitudes_gaza_w1	4.81 (1.44)	4.92 (1.45)	4.68 (1.51)	0.320	0.109
issue_attitudes_vegetarian_w1	1.75 (0.97)	1.89 (1.16)	1.83 (1.07)	0.458	0.089
issue_attitudes_gender_w1	2.74 (1.63)	2.92 (1.70)	3.06 (1.73)	0.208	0.126
issue_attitudes_sexwork_w1	2.40 (1.37)	2.25 (1.30)	2.49 (1.50)	0.282	0.115
issue_attitudes_vaccine_w1	3.53 (1.55)	3.62 (1.60)	3.35 (1.68)	0.279	0.113
issue_attitudes_healthcare_w1	1.69 (1.29)	1.85 (1.44)	2.03 (1.51)	0.075	0.163
issue_attitudes_immigration_w1	3.31 (1.50)	3.21 (1.51)	3.48 (1.65)	0.260	0.117
issue_attitudes_guns_w1	4.89 (1.50)	4.63 (1.66)	4.45 (1.73)	0.042	0.179
issue_attitudes_deathpenalty_w1	2.12 (1.46)	2.08 (1.49)	2.26 (1.62)	0.502	0.079
issue_attitudes_bodycams_w1	5.70 (0.65)	5.59 (0.82)	5.62 (0.70)	0.385	0.097
issue_attitudes_ai_w1	2.44 (1.29)	2.51 (1.26)	2.44 (1.38)	0.861	0.035
issue_attitudes_socialmedia_w1	3.24 (1.53)	3.32 (1.50)	3.34 (1.44)	0.803	0.044
issue_knowledge_loan_w1	3.05 (0.81)	3.11 (0.78)	3.08 (0.80)	0.764	0.052
issue_knowledge_airbnb_w1	2.53 (0.76)	2.64 (0.92)	2.58 (0.79)	0.425	0.092
issue_knowledge_minwage_w1	3.18 (0.68)	3.20 (0.72)	3.11 (0.73)	0.439	0.089
issue_knowledge_ukraine_w1	2.86 (0.79)	3.01 (0.75)	3.02 (0.74)	0.086	0.140
issue_knowledge_ubi_w1	2.70 (0.75)	2.85 (0.71)	2.74 (0.76)	0.126	0.142
issue_knowledge_climate_w1	3.24 (0.73)	3.33 (0.71)	3.30 (0.65)	0.440	0.089
issue_knowledge_fur_w1	2.31 (0.80)	2.28 (0.84)	2.29 (0.94)	0.962	0.020
issue_knowledge_renewable_w1	2.94 (0.72)	3.07 (0.72)	2.98 (0.77)	0.261	0.115
issue_knowledge_mideast_w1	2.82 (0.72)	2.95 (0.74)	2.85 (0.75)	0.257	0.112
issue_knowledge_vegetarian_w1	2.76 (0.79)	2.79 (0.94)	2.81 (0.85)	0.857	0.039
issue_knowledge_gender_w1	3.13 (0.80)	3.17 (0.81)	3.11 (0.86)	0.760	0.053
issue_knowledge_sexwork_w1	2.64 (0.80)	2.62 (0.92)	2.56 (0.82)	0.713	0.056
issue_knowledge_vaccine_w1	2.92 (0.76)	3.05 (0.76)	2.98 (0.70)	0.291	0.112
issue_knowledge_healthcare_w1	3.18 (0.70)	3.28 (0.72)	3.17 (0.69)	0.304	0.101
issue_knowledge_immigration_w1	2.92 (0.74)	2.93 (0.74)	2.97 (0.76)	0.793	0.048
issue_knowledge_guns_w1	3.16 (0.72)	3.17 (0.71)	3.08 (0.73)	0.459	0.082
issue_knowledge_deathpenalty_w1	2.99 (0.78)	3.07 (0.81)	3.02 (0.73)	0.604	0.070
issue_knowledge_bodycams_w1	2.86 (0.73)	3.05 (0.77)	2.85 (0.73)	0.015	0.184
issue_knowledge_ai_w1	2.75 (0.71)	2.94 (0.76)	2.81 (0.74)	0.050	0.170
issue_knowledge_socialmedia_w1	3.24 (0.62)	3.25 (0.74)	3.25 (0.71)	0.988	0.011
political_efficacy_1_w1	2.08 (1.05)	2.03 (0.90)	2.08 (1.05)	0.865	0.036
political_efficacy_2_w1	4.54 (1.10)	4.67 (1.00)	4.64 (0.98)	0.429	0.089
political_efficacy_3_w1	4.76 (1.04)	4.82 (0.96)	4.78 (0.95)	0.846	0.041
trust_general_1_w1	1.77 (0.62)	1.84 (0.63)	1.81 (0.69)	0.629	0.069
trust_general_2_w1	2.08 (0.67)	2.08 (0.68)	2.06 (0.66)	0.941	0.024
trust_general_3_w1	3.62 (0.55)	3.63 (0.53)	3.49 (0.68)	0.058	0.152
trust_general_4_w1	2.40 (0.72)	2.46 (0.66)	2.32 (0.73)	0.195	0.131
pol_cynicism_w1	4.17 (0.99)	3.99 (1.05)	4.16 (1.08)	0.204	0.114
affective_polarization_w1	46.36 (28.59)	45.01 (28.05)	41.25 (30.20)	0.267	0.117
efficacy_w1	3.03 (0.35)	3.01 (0.30)	3.03 (0.35)	0.865	0.036
comment_karma	17451.63 (42386.03)	16819.56 (36107.33)	25077.38 (65773.47)	0.233	0.103
comment_count	16.64 (20.16)	17.38 (15.67)	18.10 (20.91)	0.850	0.050
comment_mean_lenght	426.23 (261.86)	432.31 (248.13)	398.90 (286.95)	0.618	0.083
comment_mean_score	3.09 (1.70)	2.37 (1.91)	2.33 (1.41)	0.001	0.303
comment_mean_tox	0.09 (0.05)	0.10 (0.06)	0.11 (0.07)	0.183	0.167
ex_comment_count	136.05 (169.42)	159.14 (164.32)	165.42 (176.26)	0.291	0.115
ex_comment_mean_score	6.22 (6.61)	7.64 (7.23)	8.63 (11.69)	0.059	0.187
ex_comment_mean_tox	0.12 (0.07)	0.13 (0.07)	0.12 (0.06)	0.672	0.071
ex_comment_mean_lenght	229.17 (173.18)	205.92 (136.93)	192.37 (124.69)	0.093	0.165

Display “Profiles”

profile_sample <- scaled_sample %>%
  mutate(group = case_when(comment_count == 0 ~ "Never Comment",
      comment_count > 0 & comment_count <= quantile(comment_count[comment_count > 0], 0.3) ~ "Infrequent Commenter",
      comment_count > quantile(comment_count[comment_count > 0], 0.7) ~ "Frequent Commenter",
      TRUE ~ NA_character_))

table(profile_sample$group)

## 
##   Frequent Commenter Infrequent Commenter        Never Comment 
##                   94                  106                  189

profile_sample <- profile_sample %>% filter(!is.na(group))%>%
  mutate(gender_male = as.numeric(as.character(gender_male)))%>%
  rename_with(~ gsub("_", ".", .x))

# Calculate means and confidence intervals for the specified variables by group
summary_table <- profile_sample %>%
  group_by(group) %>%
  summarise(across(c(gender.male, comments.online.w1, polinterest.w1, 
        mean.group.respectful, mean.group.knowledgeable, mean.group.polarized,
        mean.discussion.enjoyable, mean.discussion.toxic, mean.discussion.constructive),
      list(mean = ~mean(.x, na.rm = TRUE), 
           lower = ~mean(.x, na.rm = TRUE) - qt(0.975, df=n()-1) * (sd(.x, na.rm = TRUE)/sqrt(n())), 
           upper = ~mean(.x, na.rm = TRUE) + qt(0.975, df=n()-1) * (sd(.x, na.rm = TRUE)/sqrt(n())))))%>%
  pivot_longer(-group, names_to = c("variable", "value"), names_sep = "_", names_repair = "unique")%>%
  pivot_wider(names_from = value...3, values_from = value...4)

variable_labels <- c(
  gender.male = "Male Gender",
  comments.online.w1 = "Commenting Online",
  polinterest.w1 = "Political Interest",
  mean.group.respectful = "Group respectful",
  mean.group.knowledgeable = "Group knowledgeable",
  mean.group.polarized = "Group polarized",
  mean.discussion.enjoyable = "Discussions enjoyable",
  mean.discussion.toxic = "Discussions toxic",
  mean.discussion.constructive = "Discussions constructive"
)

summary_table$variable <- factor(summary_table$variable,
  levels = names(variable_labels), labels = variable_labels)

profiles <- ggplot(summary_table, aes(x = group, y = mean, fill = group)) +
  geom_bar(stat = "identity", position = position_dodge(), alpha = 0.8) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2, position = position_dodge(0.9)) +
  facet_wrap(~variable, scales = "free_y", ncol = 3) +  # Use facet_wrap with variable
  labs(title = "Profiles of three user groups", 
    x = "",
    y = "Mean Values by Group with 95% Confidence Intervals") +
  theme_minimal() +
  theme(legend.position = "bottom", axis.text.x=element_blank(),
    strip.text = element_text(size = 10)) +
  scale_fill_manual(values = c("#334c7f","#b2ccff","lightgrey"))

profiles

ggsave("../output/profiles_participation.pdf", profiles, width = 10, height = 6)

RQ1: Discussion Participation

Reddit Communities Field Experiment

2025-07-02

Participation Prediction

Zero Hurdle Models for Count data

Random effects with glmmTMB

Geometric distribution (instead of default poisson)

Negative Binomial

Binary Active vs. Passive

Binary Active vs. Passive - Random Effects

Manual poisson models for count data (active only)

Manual poisson models for count data (active only) - Random Effects

Linear models (Excluding 0 and inactive)

Linear models to predict Scores (Excluding 0 and inactive)

Hurdle models for different experimental conditions (start RQ2)

Hurdle models for different conditions simple for main manuscript

Baseline Balance

Display “Profiles”