library(tidyverse)
library(readr)
library(gridExtra)
library(haven)
library(RColorBrewer)
library(igraph)
library(kableExtra)
library(ggridges)
library(lubridate)

Load anonymous data (no comment texts, open comments, usernames or email addresses)

# reddit data
discussion_data <- read_csv("../data/anon/discussions_anon.csv")
external_data <- read_csv("../data/anon/external_reddit_anon.csv")
user_data <-  read_csv("../data/anon/users_anon.csv")

# survey data
pre_survey <- read_csv("../data/anon/pre_survey_anon.csv")
post_surveys <- read_csv("../data/anon/post_surveys_anon.csv")

Combine survey and reddit data

## Reddit metrics by user - add to sample
user_comments <- discussion_data%>%
  group_by(ParticipantID)%>%
  mutate(comment_count = n(),
         comment_mean_score = round(mean(score_comment,na.rm=T),3),
         comment_mean_tox = round(mean(comment_toxicity,na.rm=T),3),
         comment_mean_lenght = round(mean(length_comment_char,na.rm=T),3))%>%
  slice(1)%>%
  select(ParticipantID,comment_count,comment_mean_lenght,comment_mean_score,comment_mean_tox)

user_comments_external <- external_data%>%
  group_by(ParticipantID)%>%
  mutate(ex_comment_count = n(),
         ex_comment_mean_score = round(mean(score_comment,na.rm=T),3),
         ex_comment_mean_tox = round(mean(comment_toxicity,na.rm=T),3),
         ex_comment_mean_lenght = round(mean(length_comment_char,na.rm=T),3))%>%
  slice(1)%>%
  select(ParticipantID,ex_comment_count,ex_comment_mean_score,ex_comment_mean_tox,ex_comment_mean_lenght)

sample <- pre_survey%>%
  filter(on_reddit == 1)%>%
  left_join(., user_data, by = "ParticipantID")%>%
  left_join(., user_comments, by = "ParticipantID")%>%
  left_join(., user_comments_external, by = "ParticipantID")

write_csv(sample, file = "../data/anon/sample_anon.csv")   

Comment activity per user

# comments per user
part1 <- sample%>%
  mutate(comment_count = ifelse(is.na(comment_count),0,comment_count))%>%
  ggplot()+
  geom_bar(aes(comment_count), fill = "#6699FF")+
  theme_bw()+
  ylab("Number of users")+
  xlab("Number of comments per user")+
  labs(title = "Comment activity", subtitle = paste0("20 discussion issues | N(comments) = ", 
                                                     nrow(discussion_data), " | N(users) = ", nrow(sample)))+
  annotate(geom = "text", x = 20, hjust = 0,  y = 200, label = paste0("min = ",min(sample$comment_count,na.rm=T)))+
  annotate(geom = "text", x = 20, hjust = 0, y = 180, label = paste0("max = ",max(sample$comment_count,na.rm=T)))

any_check <- post_surveys%>%
  group_by(ParticipantID)%>%
  mutate(any_checkin = 1)%>%
  slice(1)%>%
  pull(ParticipantID)

sample%>% mutate(active = ifelse(is.na(comment_count),"silent","active"),
         active = ifelse(!ParticipantID %in% any_check & active == "silent", "join only", active))%>%count(active)
## # A tibble: 3 × 2
##   active        n
##   <chr>     <int>
## 1 active      331
## 2 join only    97
## 3 silent       92
bars <- sample%>%
  mutate(active = ifelse(is.na(comment_count),"silent","active"),
         active = ifelse(!ParticipantID %in% any_check & active == "silent", "join only", active),
         active = factor(active, levels = c("join only","silent","active")))%>%
  ggplot()+
  geom_bar(aes(active), fill = c("darkgrey","lightgrey","#6699FF"))+
  xlab("Participation group")+
  ylab("")+
  annotate(geom = "text", x = "join only",   y = 85, label = "97")+
  annotate(geom = "text", x = "silent",   y = 80, label = "92")+
  annotate(geom = "text", x = "active",   y = 319, label = "331")+
  theme_bw()+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
  
histog <- ggplot(sample)+
  geom_bar(aes(comment_count), fill = "#6699FF")+
  xlab("Number of comments written in experiment")+
  ylab("Number of users")+
  theme_bw()+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

dots <- ggplot(sample)+
  geom_point(aes(comment_count, log(comment_karma+1), colour = comment_mean_tox, 
                 size = comment_mean_lenght), alpha = 0.8)+
  #geom_smooth(method="lm", aes(comment_count,comment_mean_tox))+
  scale_colour_gradient(low = "#6699FF", high = "black")+
  xlab("Comments written in experiment")+
  ylab("Comment karma on Reddit (log-scale)")+
  theme_bw()+
  guides(colour = guide_legend(title="Toxicity"), size = guide_legend(title="Length"))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

combi2 <- grid.arrange(bars, histog, widths = c(3,4.5), top = "User-level discussion participation")

ggsave("../output/participation_dots.pdf", dots, width = 5, height = 4)
ggsave("../output/participation_total2.pdf", combi2, width = 6, height = 3)

Survey Sample

Demographics

#(1 = Man, 2 = Woman, 3 = Other)
table(sample$gender)
## 
##   1   2   3 
## 251 239  30
table(sample$condition)
## 
##    control incentives moderation 
##        181        169        170
table(sample$subreddit)
## 
## DiscussPolitics1 DiscussPolitics2 DiscussPolitics3 DiscussPolitics4 
##               80               85               87               82 
## DiscussPolitics5 DiscussPolitics6 
##               85              101
ageplot <- ggplot(sample)+
  geom_histogram(aes(age), fill = "#6699FF")+
  theme_bw()+ 
  ylab("Sample: Participants who joined subreddit")+
  xlab(paste0("Age in years (M = ",round(mean(sample$age),2),", SD = ",round(sd(sample$age),2),")"))+
  annotate(geom="text", x=60, y=30,hjust = 0, label= paste("Male: ",table(sample$gender)[1]))+
  annotate(geom="text", x=60, y=28,hjust = 0, label= paste("Female: ",table(sample$gender)[2]))+
  annotate(geom="text", x=60, y=26,hjust = 0, label= paste("Other: ",table(sample$gender)[3]))+
  annotate(geom="text", x=60, y=33,hjust = 0, label= paste("N: ", nrow(sample)),fontface =2)+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

educplot <- ggplot(sample)+
  geom_bar(aes(education), fill = "#6699FF")+
  coord_flip()+
  ylab("")+
  xlab("Level of formal education")+
  theme_bw()+
  annotate(geom="text", x = 1, y = 0, hjust = 0,  label = "No degree")+
  annotate(geom="text", x = 2, y = 0, hjust = 0, label = "High school")+
  annotate(geom="text", x = 3, y = 0, hjust = 0, label = "Some college education, no degree")+
  annotate(geom="text", x = 4, y = 0, hjust = 0, label = "Bachelor's degree")+
  annotate(geom="text", x = 5, y = 0, hjust = 0, label = "Some postgraduate education, no degree")+
  annotate(geom="text", x = 6, y = 0, hjust = 0, label = "Postgraduate degree")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

demo <- grid.arrange(ageplot, educplot, nrow = 1,
                     top = "Age and education across study sample")

ggsave("../output/demographics.pdf",demo, width = 10, height = 5)

Political variables pre-survey

int <- ggplot(sample, aes(polinterest))+
  geom_bar(fill = "#6699FF")+
  ggtitle("Political Interest")+
  xlab("")+
  ylab("")+
  theme_bw()+
  xlim("Not interested", "Sighly interested","Moderately interested", "Very interested")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

pol <- ggplot(sample)+
  geom_bar(aes(x = leftright, fill = ..x..))+
  theme_bw()+
  scale_fill_gradient2(low='#6699ff', mid='lightgrey', high='#bc3455', midpoint=6)+
  ggtitle("Political Orientation")+
  xlab("")+
  ylab("")+
  theme(legend.position = "none",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  xlim("far left", "", "", "", "", "", "", "",
       "", "", "far right")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

polint <- grid.arrange(int, pol, nrow = 1)

ggsave("../output/pol_lefright.pdf", polint, width = 12 , height = 5) 
ggsave("../output/lefright.pdf", pol, width = 4 , height = 4)

Reddit Data

Engagement metrics by political issue

Issue Toxicity Comments Views
The US should condemn Israel’s military actions in Gaza as acts of genocide and impose full sanctions. 0.191 435 5535
Prostitution should be illegal. 0.190 213 3134
Things like gender-neutral language and stating pronouns are silly issues. 0.183 538 3926
The death penalty should be reestablished US-wide. 0.149 388 4985
The US should provide financial and military aid to Ukraine. 0.136 263 3892
We need stricter gun control laws. 0.118 338 4472
Social media is a threat to democracy. 0.108 233 3460
Immigration should be regulated more strictly. 0.101 286 4235
Fur clothing should be banned. 0.096 230 3311
Police officers should wear body cameras. 0.092 313 3854
Climate change is one of the greatest threats to humanity. 0.084 223 2863
Employers should mandate vaccination. 0.073 258 3183
The government should not be responsible for providing universal health care. 0.073 220 3326
The government should not forgive student loan debt. 0.069 296 3850
Artificial intelligence should replace humans where possible. 0.068 293 4832
There should only be vegetarian food in cantines. 0.065 290 4683
A universal basic income would kill the economy. 0.064 225 3185
The federal minimum wage should be increased. 0.063 305 4321
The government should not invest in renewable energy. 0.060 198 3576
Airbnb should be banned in cities. 0.059 274 3043

Toxicity distributions by political issue

issue_order <- discussion_data%>%
  group_by(post_title)%>%
  summarise(Toxicity = round(mean(comment_toxicity, na.rm = T),3))%>%
  arrange(desc(Toxicity))%>%
  pull(post_title)

discussion_data$issues <- factor(discussion_data$post_title, levels = issue_order)

colour_values <- c("#6699ff", "#729dee", "#7ca1dd", "#86a5cc", "#91a9bb", "#9bada9", "#a6b298", "#d3d3d3", "#bfc7c7", "#b5b1b1", "#ab9b9b", "#a18585", "#976f6f", "#8d5959", "#844444", "#7b3232", "#732525", "#6a1a1a", "#621010", "#bc3455")

issues <- c("US foreign policy toward Israel and Gaza","Legality of prostitution", "Use of gender-neutral language and pronouns", "Death penalty reestablishment in the US", "US financial and military aid to Ukraine", "Gun control laws", "Impact of social media on democracy", "Immigration regulation", "Ban on fur clothing","Police accountability and body cameras", "Climate change and environmental threats","Mandatory vaccination policies by employers", "Universal healthcare responsibility","Government forgiveness of student loan debt", "Artificial intelligence replacing human labor", "Vegetarian food policies in public cantines", "Economic impact of universal basic income",  "Federal minimum wage increase", "Government investment in renewable energy", "Regulation of Airbnb in cities")

# colours, means
tox_ridge <- ggplot(discussion_data)+
  geom_density_ridges(aes(x = comment_toxicity, y = fct_rev(issues), fill = fct_rev(issues), 
                          alpha = 0.5))+
  theme(legend.position = "bottom", legend.direction="vertical", 
        legend.title = element_blank())+
  guides(alpha = "none")+
  ylab("")+
  xlab("")+
  scale_y_discrete(labels = rev(issues))+
  theme_ridges() + 
  scale_fill_manual(values = colour_values) +
  scale_x_continuous(limits = c(0, 0.6))+
  theme(legend.position = "none")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
tox_ridge

ggsave("../output/toxicity_issues_ridges.pdf", tox_ridge, width = 10, height = 6)

Reddit mertrics over time

views <- discussion_data%>%
  group_by(subreddit,date(created_post))%>%
  slice(1)%>%
  ggplot(aes(x = date(created_post), y = total_views, color = subreddit))+
  geom_smooth(method = loess,alpha=0.1)+
  geom_point(alpha = 0.3)+
  theme_bw()+
  scale_color_manual(values = c("#29c195","#6699ff","#424656","#a7aabd","#f96d86","#bc3455"))+
  ylab("")+
  guides(color = "none")+
  ggtitle("Views per post")+
  xlab("Date")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

comments <- discussion_data%>%
  group_by(subreddit,date(created_post))%>%
  slice(1)%>%
  ggplot(aes(x = date(created_post), y = `Collected comments`, color = subreddit))+
  geom_smooth(method = loess,alpha=0.1)+
  geom_point(alpha = 0.3)+
  theme_bw()+
  scale_color_manual(values = c("#29c195","#6699ff","#424656","#a7aabd","#f96d86","#bc3455"))+
  ylab("")+
  guides(color = "none")+
  ggtitle("Comments per post")+
  xlab("Date")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

comments_views <- discussion_data%>%
  mutate(comments_per_views = `Collected comments`/total_views)%>%
  group_by(subreddit,date(created_post))%>%
  slice(1)%>%
  ggplot(aes(x = date(created_post), y = comments_per_views, color = subreddit))+
  geom_smooth(method = loess,alpha=0.1)+
  geom_point(alpha = 0.3)+
  theme_bw()+
  scale_color_manual(values = c("#29c195","#6699ff","#424656","#a7aabd","#f96d86","#bc3455"))+
  ylab("")+
  ggtitle("Comments/views ratio")+
  xlab("Date")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

activity_combi <- grid.arrange(views, comments, comments_views, nrow=1, widths = c(3,3,4))

ggsave("../output/comments_views.pdf", activity_combi, width = 14, height = 5)
toxicity <- ggplot(discussion_data, aes(x = date(created_post), y = comment_toxicity, color = subreddit))+
  geom_smooth(method = loess,alpha=0.1)+
  geom_point(alpha = 0.1)+
  theme_bw()+
  scale_color_manual(values = c("#29c195","#6699ff","#424656","#a7aabd","#f96d86","#bc3455"))+
  ylab("")+
  guides(color = "none")+
  ggtitle("Comment toxicity")+
  xlab("Date")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

length <- ggplot(discussion_data, aes(x = date(created_post), y = length_comment_char, color = subreddit))+
  geom_smooth(method = loess,alpha=0.1)+
  geom_point(alpha = 0.1)+
  theme_bw()+
  scale_color_manual(values = c("#29c195","#6699ff","#424656","#a7aabd","#f96d86","#bc3455"))+
  ylab("")+
  guides(color = "none")+
  ggtitle("Comment length")+
  xlab("Date")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

score <- discussion_data%>%
  group_by(subreddit,date(created_post))%>%
  mutate(score_views = score_post/total_views)%>%
  slice(1)%>%
  ggplot(aes(x = date(created_post), y = score_views, color = subreddit))+
  geom_smooth(method = loess,alpha=0.1)+
  geom_point(alpha = 0.3)+
  theme_bw()+
  scale_color_manual(values = c("#29c195","#6699ff","#424656","#a7aabd","#f96d86","#bc3455"))+
  ylab("")+
  ggtitle("Post score/views ratio")+
  xlab("Date")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

characteristics_combi <- grid.arrange(toxicity, length, score, nrow=1, widths = c(3,3,4))

ggsave("../output/characteristics_combi.pdf", characteristics_combi, width = 14, height = 5)

Reddit activity by users - internal vs. external

comments_tox <- ggplot(sample,aes(x = comment_mean_tox, y = ex_comment_mean_tox, size = comment_count))+
  geom_point(alpha = 0.5, color = "#6699FF")+
  theme_bw()+
  xlab("Internal comment toxicity")+
  ylab("Average external comment toxicity")+
  guides(color = "none", size = "none")+
  geom_abline()+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

cor.test(sample$comment_mean_tox,sample$ex_comment_mean_tox)
## 
##  Pearson's product-moment correlation
## 
## data:  sample$comment_mean_tox and sample$ex_comment_mean_tox
## t = 3.8092, df = 284, p-value = 0.0001709
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1072295 0.3280655
## sample estimates:
##      cor 
## 0.220471
comments_length <- ggplot(sample,aes(x = comment_mean_lenght, y = ex_comment_mean_lenght,
                                     size = comment_count))+
  geom_point(alpha = 0.5, color = "#6699FF")+
  theme_bw()+
  xlab("Internal comment length")+
  ylab("Average external comment length")+
  guides(color = "none", size = "none")+
  geom_abline()+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

cor.test(sample$comment_mean_lenght,sample$ex_comment_mean_lenght)
## 
##  Pearson's product-moment correlation
## 
## data:  sample$comment_mean_lenght and sample$ex_comment_mean_lenght
## t = 10.417, df = 284, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4364171 0.6048829
## sample estimates:
##       cor 
## 0.5257867
cor.test(log(sample$comment_karma),sample$comment_count)
## 
##  Pearson's product-moment correlation
## 
## data:  log(sample$comment_karma) and sample$comment_count
## t = 2.481, df = 328, p-value = 0.0136
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.02817168 0.24016730
## sample estimates:
##       cor 
## 0.1357227
comment_tox_length <- grid.arrange(comments_tox, comments_length, nrow = 1)

ggsave("../output/comment_tox_length_narrow.pdf", comment_tox_length, width = 5 , height = 3)

Comment karma and platform seniority

# Posting a comment on an existing post and receiving upvotes results in "comment" karma
k1 <- ggplot(sample)+
  geom_histogram(aes(comment_karma), fill = "#6699FF")+
  theme_bw()+
  ylab("Number of users")+
  xlab("Comment Karma")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

k2 <- ggplot(sample)+
  geom_histogram(aes(log(comment_karma)), fill = "#6699FF")+
  theme_bw()+
  ylab("Number of users")+
  xlab("Comment Karma (Log-scale)")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

create <- ggplot(sample)+
  geom_histogram(aes(date(created)), fill = "lightgrey", bins = 100)+
  annotate("text", x = min(date(sample$created), na.rm = TRUE), y = Inf, label = paste("First:", min(date(sample$created), na.rm = T)), vjust = 2, hjust = 0) +
  annotate("text", x = max(date(sample$created), na.rm = TRUE), y = Inf, label = paste("Last:", max(date(sample$created), na.rm = T)), vjust = 2, hjust = 1)+
  theme_bw()+
  ylab("Number of accounts")+
  xlab("Date joined Reddit")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

kplot <- grid.arrange(k2,create, nrow = 1,
                      top = paste0("Distributions of platform seniority (N = ",nrow(sample),") Start of recruitment: 2024-05-09"))

ggsave("../output/platform_seniority.pdf", kplot, width = 8, height = 4)

External comments

internal_comments <- discussion_data %>% mutate(source = "internal")%>%
  select(ParticipantID,created_comment,comment_toxicity,length_comment_char, source)%>%
  filter(created_comment > as_datetime("2024-06-10") & created_comment < as_datetime("2024-07-06"))
  
external_comments <- external_data %>% mutate(source = "external")%>%
  select(ParticipantID,created_comment,comment_toxicity,length_comment_char, source)%>%
  filter(created_comment > as_datetime("2024-05-13"))

all_data <- bind_rows(internal_comments, external_comments)
all_data$created_comment <- as.Date(all_data$created_comment)

all_data <- all_data %>%
  mutate(week = floor_date(created_comment, unit = "week"))

daily_summary <- all_data %>%
  group_by(source, ParticipantID, created_comment) %>%
  dplyr::summarize(comment_volume = n(),  # Count of comments per user per day
    avg_toxicity = mean(comment_toxicity, na.rm = TRUE)) %>%
  ungroup()

study_start <- as.Date("2024-06-10")
study_end <- as.Date("2024-07-05")

int_ext <- ggplot(daily_summary, aes(x = created_comment, y = avg_toxicity)) +
  geom_point(data = subset(daily_summary, source == "external"),
             aes(size = comment_volume), color = "grey", alpha = 0.2) +
  geom_point(data = subset(daily_summary, source == "internal"),
             aes(size = comment_volume), color = "#6699FF", alpha = 0.5) +
  geom_smooth(data = subset(daily_summary, source == "external"),
              aes(y = avg_toxicity), color = "darkgrey", method = "loess", se = FALSE) +
  geom_smooth(data = subset(daily_summary, source == "internal"),
              aes(y = avg_toxicity), color = "#3d5b99", method = "loess", se = FALSE) +
  geom_vline(xintercept = as.numeric(study_start), linetype = "dashed", color = "black") +
  geom_vline(xintercept = as.numeric(study_end), linetype = "dashed", color = "black") +
  labs(x = "Date", y = "Average toxicity (per user per day)",
       size = "Number of Comments \n per user per day",
       title = "Daily Comment Toxicity and Volume per User (External vs Internal Data)") +
  scale_size_continuous(range = c(1, 5)) +  
  theme_minimal() +
  theme(legend.position = "right",
    plot.title = element_text(hjust = 0.5),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank())
int_ext

ggsave("../output/internal_external_toxicity.pdf", int_ext, width = 9, height = 4)
# Aggregate data by ParticipantID, day, and source

daily_summary_act <- all_data %>%
  group_by(source, ParticipantID, created_comment) %>%
  dplyr::summarize(comment_volume = n(), 
    avg_toxicity = mean(comment_toxicity, na.rm = TRUE)) %>%
  ungroup()

int_ext_act <- ggplot(daily_summary, aes(x = created_comment, y = comment_volume)) +
  geom_point(data = subset(daily_summary, source == "external"),
             aes(size = avg_toxicity, alpha = avg_toxicity), color = "grey") +
  geom_point(data = subset(daily_summary, source == "internal"),
             aes(size = avg_toxicity, alpha = avg_toxicity), color = "#6699FF") +
  geom_smooth(data = subset(daily_summary, source == "external"),
              aes(y = comment_volume), color = "darkgrey", method = "loess", se = FALSE) +
  geom_smooth(data = subset(daily_summary, source == "internal"),
              aes(y = comment_volume), color = "#3d5b99", method = "loess", se = FALSE) +
  geom_vline(xintercept = as.numeric(study_start), linetype = "dashed", color = "black") +
  geom_vline(xintercept = as.numeric(study_end), linetype = "dashed", color = "black") +
  labs(x = "Date", y = "Number of Comments per User per Day",
       size = "Average Toxicity",
       alpha = "Average Toxicity",
       title = "Daily Comment Volume and Toxicity per User (External vs Internal Data)") +
  scale_size_continuous(range = c(1, 5)) + 
  scale_alpha_continuous(range = c(0.2, 1)) + 
  theme_minimal() +
  theme(legend.position = "right",
    plot.title = element_text(hjust = 0.5))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

int_ext_act

ggsave("../output/internal_external_activity.pdf", int_ext_act, width = 9, height = 3)

``

Other survey descriptives

Online activity

a <- ggplot(sample)+
  geom_bar(aes(time_online), fill = "#6699FF")+
  theme_bw()+
  coord_flip()+
  xlab("")+
  ylab("")+
  labs(title = "Time online")+
  annotate(geom="text", x = 2, y = 0, hjust = 0, label = "Up to one hour per week")+
  annotate(geom="text", x = 3, y = 0, hjust = 0, label = "Up to one hour per day")+
  annotate(geom="text", x = 4, y = 0, hjust = 0, label = "Multiple hours per day")+
  annotate(geom="text", x = 5, y = 0, hjust = 0, label = "Almost the entire day")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())


b <- ggplot(sample)+
  geom_bar(aes(social_media), fill = "#6699FF")+
  theme_bw()+
  coord_flip()+
  xlab("")+
  labs(title = "Social media use")+
  annotate(geom="text", x = 2, y = 0, hjust = 0, label = "A couple of times per week")+
  annotate(geom="text", x = 3, y = 0, hjust = 0, label = "About once per day")+
  annotate(geom="text", x = 4, y = 0, hjust = 0, label = "Multiple times per day")+
  annotate(geom="text", x = 5, y = 0, hjust = 0, label = "Almost constantly")+
  ylab("")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

c <- ggplot(sample)+
  geom_bar(aes(comments_online), fill = "#6699FF")+
  theme_bw()+
  coord_flip()+
  xlab("")+
  ylab("")+
  labs(title = "Commenting online")+
  annotate(geom="text", x = 1, y = 0, hjust = 0, label = "Never")+
  annotate(geom="text", x = 2, y = 0, hjust = 0, label = "About once per month")+
  annotate(geom="text", x = 3, y = 0, hjust = 0, label = "About once per week")+
  annotate(geom="text", x = 4, y = 0, hjust = 0, label = "Almost daily")+
  annotate(geom="text", x = 5, y = 0, hjust = 0, label = "Multiple times per day")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

abc <- grid.arrange(a, b, c, nrow = 1)

ggsave("../output/online_activity.pdf", abc, width = 10 , height = 3) 
# affective polarization
parties <- ggplot(sample)+
  geom_density(aes(affective_pol_1),fill = "#6699ff", alpha = 0.5)+
  geom_density(aes(affective_pol_2),fill = "#bc3455", alpha = 0.5)+
  theme_bw()+
  ylab("")+
  annotate(geom = "text", x = 10, y = 0.04, hjust = 0,  label = "Rating of Republican Party")+
  annotate(geom = "text", x = 60, y = 0.02, hjust = 0, label = "Rating of Democratic Party")+
  xlab("Party Rating: 0 = negative, 100 = positive")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

dist <- ggplot(sample, aes(affective_polarization))+
  geom_density(fill = '#a7aabd', alpha = 0.5)+
  theme_bw()+
  xlab("Affective Polarization (In party - out party)")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

polint <- grid.arrange(parties, dist, nrow = 1, top = "Affective Polarization")

ggsave("../output/polarization.pdf", polint, width = 10 , height = 5)

Attitude grids

# trust matrix 
trust <- sample %>% 
  dplyr::select(trust_general_1:trust_general_4)%>%
  rename(`Politics` = trust_general_1,
         `Media` = trust_general_2,
         `Science` = trust_general_3,
         `People generally` = trust_general_4)%>%
  # change 1: convert haven_labelled variables to factors 
  as_factor() %>% 
  pivot_longer(
    cols = 1:4,
    names_to = "Variable",
    values_to = "Agreement"
  ) %>% 
  count(Variable, Agreement) %>% 
  ggplot(aes(y = n, x = Agreement)) +
  facet_wrap(. ~ Variable) +
  geom_col(fill = "#6699FF")+
  ylab("")+
  xlab("1: Not at all, 2: A little, 3: Quite a bit, 4: Very much")+
  ggtitle("Trust")+
  theme_bw()+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
trust

ggsave("../output/trust_matrix.pdf", trust, width = 4 , height = 5)
## issue knowledge 
knowledge <- sample %>% 
  dplyr::select(issue_knowledge_loan:issue_knowledge_socialmedia)%>%
  rename_at(1:20, list(~ substr(., 17, nchar(.))))%>%
  as_factor() %>% 
  pivot_longer( cols = 1:20,
    names_to = "Variable",
    values_to = "Knowledge") %>% 
  count(Variable, Knowledge) %>% 
  ggplot(aes(y = n, x = Knowledge)) +
  facet_wrap(. ~ Variable) +
  geom_col(fill = "#6699FF")+
  ylab("")+
  xlab("1: Nothing, 2: A little, 3: A moderate amoung, 4: A lot")+
  ggtitle("Issue Knowledge T1")+
  theme_bw()+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
knowledge

ggsave("../output/knowledge_matrix.pdf", knowledge, width = 10 , height = 8)
## issue attitudes 
attitudes <- sample %>% 
  dplyr::select(issue_attitudes_loan:issue_attitudes_socialmedia)%>%
  rename_at(1:20, list(~ substr(., 17, nchar(.))))%>%
  as_factor() %>% 
  pivot_longer(cols = 1:20,
    names_to = "Variable",
    values_to = "Attitude") %>% 
  count(Variable, Attitude) %>% 
  ggplot(aes(y = n, x = Attitude)) +
  facet_wrap(. ~ Variable) +
  geom_col(fill = "#6699FF")+
  ylab("")+
  xlab("1: Strongly disagree - 6: Strongly agree")+
  ggtitle("Issue Attitudes T1")+
  theme_bw()+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
attitudes

ggsave("../output/attitudes_matrix.pdf", attitudes, width = 10 , height = 8) 

Check-in Surveys

# exposure check
exp <- ggplot(post_surveys)+
  geom_density(aes(exposure_correct, fill = as.factor(survey_week)), alpha = 0.7)+
  theme_bw()+
  ylab("")+
  xlab("Exposure Check: TP - FP, 5 topics, 5 distractors")+
  scale_fill_manual(name = "Discussion week", values = rev(c("#6699FF","#70facb","#29c195","#008a62")))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

ggsave("../output/exposure_check.pdf", exp, width = 5, height = 3)

# issue distance
dist_abs <- ggplot(post_surveys)+
  geom_density(aes(issue_distance_abs, fill = as.factor(survey_week)), alpha = 0.7)+
  theme_bw()+
  ylab("")+
  xlab("Absolute Issue Distance: | self - others |, aggregate over 5 topics")+
  scale_fill_manual(name = "Discussion week", values = rev(c("#6699FF","#70facb","#29c195","#008a62")))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

plot <- grid.arrange(exp,dist_abs, nrow = 1)

ggsave("../output/issue_distance.pdf", dist_abs, width = 6, height = 4)
# discussion perception
perceptions <- post_surveys %>% 
  dplyr::select(discussion_percep_1:discussion_percep_3)%>%
  rename(`toxic` = discussion_percep_1,
         `constructive` = discussion_percep_2,
         `enjoyable` = discussion_percep_3)%>%
  as_factor() %>% 
  pivot_longer(cols = 1:3,
    names_to = "Variable",
    values_to = "Perception") %>% 
  count(Variable, Perception) %>% 
  ggplot(aes(y = n, x = Perception)) +
  facet_wrap(. ~ Variable) +
  geom_col(fill = "#6699FF")+
  ylab("")+
  xlab("1: Not at all, 2: A little, 3: Quite a bit, 4: Very much")+
  ggtitle("Discussion Perception")+
  theme_bw()+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

ggsave("../output/discussion_perception.pdf", perceptions, width = 6 , height = 4)
# group perception
perceptions_g <- post_surveys %>% 
  dplyr::select(group_percep_1:group_percep_3)%>%
  rename(`respectful` = group_percep_1,
         `polarized` = group_percep_2,
         `knowledgeable` = group_percep_3)%>%
  as_factor() %>% 
  pivot_longer(cols = 1:3,
    names_to = "Variable",
    values_to = "Perception") %>% 
  count(Variable, Perception) %>% 
  ggplot(aes(y = n, x = Perception)) +
  facet_wrap(. ~ Variable) +
  geom_col(fill = "#6699FF")+
  ylab("")+
  xlab("1: Not at all, 2: A little, 3: Quite a bit, 4: Very much")+
  ggtitle("Group Perception")+
  theme_bw()+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

plot <- grid.arrange(perceptions,perceptions_g, nrow = 1)

ggsave("../output/group_perception.pdf", perceptions_g, width = 6 , height = 4)
# participation
participation <- post_surveys %>% 
  dplyr::select(participation_4:participation_1)%>%
  rename(`checked topics` = participation_4,
         `read some discussions` = participation_5,
         `followed discussions` = participation_6,
         `voted some` = participation_7,
         `voted many` = participation_8,
         `wrote few comments` = participation_9,
         `commented everything` = participation_10,
         `did not participate` = participation_1)%>%
  as_factor() %>% 
  pivot_longer(cols = 1:8,
    names_to = "Variable",
    values_to = "Participation") %>%
  group_by(Variable)%>%
  summarise(Participation = sum(Participation, na.rm = T))%>%
  mutate(Variable = factor(Variable, levels = c("did not participate", "checked topics",
                                                   "read some discussions",
                                                   "followed discussions",
                                                   "voted some",
                                                   "voted many",
                                                   "wrote few comments",
                                                   "commented everything")))%>%
  ggplot(aes(y = Participation, x = reorder(Variable,Participation))) +
  geom_col(fill = "#6699FF")+
  ylab("")+
  xlab("")+
  ggtitle("Self-reported participation")+
  theme_bw()+
  coord_flip()+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

ggsave("../output/participation_check.pdf", participation, width = 6 , height = 4)

Motives for participation (and barriers)

# motives 
motives <- post_surveys %>% 
  dplyr::select(post_motives_20:post_motives_51)%>%
  rename(`inform others` = post_motives_20,
         `entertain others` = post_motives_39,
         `express my opinion` = post_motives_40,
         `provoke others` = post_motives_41,
         `express my emotions` = post_motives_42,
         `connect with others` = post_motives_43,
         `deceive others` = post_motives_44,
         `gain attention` = post_motives_45,
         `prove a point` = post_motives_46,
         `cause chaos` = post_motives_47,
         `bring attention to a topic` = post_motives_48,
         `pursuade others` = post_motives_49,
         `surprise or shock others` = post_motives_50,
         `help researchers` = post_motives_51)%>%
  pivot_longer(cols = 1:14,
    names_to = "Motives",
    values_to = "Variable") %>%
  group_by(Motives)%>%
  summarise(Variable = sum(Variable, na.rm = T))%>%
  mutate(Motives = factor(Motives))%>%
  ggplot(aes(x = reorder(Motives,Variable), y = Variable)) +
  geom_col(fill = "#6699FF")+
  ylab("")+
  xlab("")+
  ggtitle("Motives for writing comments")+
  theme_bw()+
  coord_flip()+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

ggsave("../output/motives.pdf", motives, width = 5 , height = 4)
# negative motives / barriers to participation
barriers <- post_surveys %>% 
  dplyr::select(non_participation_2:non_participation_5,non_participation_1)%>%
  rename(`no barriers` = non_participation_1,
         `personally triggered` = non_participation_2,
         `could not add anything` = non_participation_3,
         `intimidated by sophistication` = non_participation_4,
         `afraid of backlash` = non_participation_5)%>%
  pivot_longer(cols = 1:5,
    names_to = "Barriers",
    values_to = "Variable") %>%
  group_by(Barriers)%>%
  summarise(Variable = sum(Variable, na.rm = T))%>%
  mutate(Barriers = factor(Barriers))%>%
  ggplot(aes(y = Variable, x = reorder(Barriers,Variable))) +
  geom_col(fill = "#6699FF")+
  ylab("")+
  xlab("")+
  ggtitle("Barriers to writing comments")+
  theme_bw()+
  coord_flip()+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

plot <- grid.arrange(participation,motives,barriers, nrow = 1)

ggsave("../output/barriers.pdf", barriers, width = 5 , height = 2)
## different to normal Reddit experience 
differences <- post_surveys %>% 
  dplyr::select(different_to_real_1:different_to_real_6)%>%
  na.omit()%>%
  rename(`toxicity discussion` = different_to_real_1,
         `constructiveness discussion` = different_to_real_2,
         `enjoyability discussion` = different_to_real_3,
         `knowleadgeability group` = different_to_real_4,
         `polarization group` = different_to_real_5,
         `respectfulness group` = different_to_real_6)%>%
  as_factor() %>% 
  pivot_longer(cols = 1:6,
    names_to = "Variable",
    values_to = "Perception") %>% 
  count(Variable, Perception) %>% 
  mutate(Variable = factor(Variable, levels = c("toxicity discussion",
                                                "constructiveness discussion",
                                                "enjoyability discussion",
                                                "knowleadgeability group",
                                                "polarization group",
                                                "respectfulness group")))%>%
  ggplot(aes(y = n, x = Perception)) +
  facet_wrap(. ~ Variable) +
  geom_col(fill = "#6699FF")+
  #coord_flip()+
  ylab("")+
  xlab("1: Very similar, 2: Somewhat similar, 3: Somewhat different, 4: Very different")+
  ggtitle("Perceived difference to typical experience on Reddit")+
  theme_bw()+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
differences

ggsave("../output/differences_perception.pdf", differences, width = 6 , height = 5) 

Relative change in commenting activity

#colors
colors <- c(control = "gray", incentives = "#C5701A", moderation = "#6699FF")

#adjust alphas to .7
colors <- c(control = adjustcolor("gray", alpha.f = .7), incentives = adjustcolor("#C5701A", alpha.f = .7), moderation = adjustcolor("#6699FF", alpha.f = .7))

comment_data_ranges <- daily_summary_act %>% group_by(ParticipantID) %>% dplyr::summarize(earliest_comment = min(created_comment), latest_comment = max(created_comment))

#plot hist of earliest and latest comment dates
h1 <- hist(comment_data_ranges$earliest_comment, breaks = 20, main = "Earliest comment date", xlab = "Date")

h2 <- hist(comment_data_ranges$latest_comment, breaks = 20, main = "Latest comment date", xlab = "Date")

#calculate number of days between earliest comment and study start
days_prestudy_period <- study_start - min(comment_data_ranges$earliest_comment)

#calculate mean number of comments per day in the pre-study period
mean_prestudy_daily_comments <- daily_summary_act %>%
  filter(created_comment < study_start) %>%
  group_by(ParticipantID) %>%
  dplyr::summarize(mean_daily_comments_prestudy = sum(comment_volume)/as.numeric(days_prestudy_period))

#calculate mean number of comments-per day in the study period, within study
mean_duringstudy_int_daily_comments <- daily_summary_act %>% filter(source == "internal") %>%
  filter(created_comment >= study_start & created_comment <= study_end) %>%
  group_by(ParticipantID) %>%
  dplyr::summarize(mean_daily_comments_instudy = sum(comment_volume)/as.numeric(study_end - study_start))

#calculate mean number of comments-per day in the study period, outside study
mean_duringstudy_ext_daily_comments <- daily_summary_act %>% filter(source == "external") %>%
  filter(created_comment >= study_start & created_comment <= study_end) %>%
  group_by(ParticipantID) %>%
  dplyr::summarize(mean_daily_comments_extstudy = sum(comment_volume)/as.numeric(study_end - study_start))

#join the two dataframes
mean_daily_comments <- left_join(mean_prestudy_daily_comments, mean_duringstudy_int_daily_comments, by = "ParticipantID")
mean_daily_comments <- left_join(mean_daily_comments, mean_duringstudy_ext_daily_comments, by = "ParticipantID")

daily_summary_act_complete <- daily_summary_act %>%
  complete(source, ParticipantID, created_comment = seq.Date(min(created_comment), max(created_comment), by = "day"), fill = list(comment_volume = 0, avg_toxicity = NA))

mean_duringstudy_int_commentprob <- daily_summary_act_complete %>% filter(source == "internal") %>%
  filter(created_comment >= study_start & created_comment <= study_end) %>%
  group_by(ParticipantID) %>%
  dplyr::summarize(duringstudy_int_daily_commentprob = sum(comment_volume!=0)/as.numeric(study_end - study_start))

mean_duringstudy_ext_commentprob <- daily_summary_act_complete %>% filter(source == "external") %>%
  filter(created_comment >= study_start & created_comment <= study_end) %>%
  group_by(ParticipantID) %>%
  dplyr::summarize(duringstudy_ext_daily_commentprob = sum(comment_volume!=0)/as.numeric(study_end - study_start))

mean_prestudy_commentprob <- daily_summary_act_complete %>%
  filter(created_comment < study_start) %>%
  group_by(ParticipantID) %>%
  dplyr::summarize(prestudy_daily_commentprob = sum(comment_volume!=0)/as.numeric(days_prestudy_period))

mean_daily_comments <- left_join(mean_daily_comments, mean_duringstudy_int_commentprob, by = "ParticipantID")
mean_daily_comments <- left_join(mean_daily_comments, mean_duringstudy_ext_commentprob, by = "ParticipantID")
mean_daily_comments <- left_join(mean_daily_comments, mean_prestudy_commentprob, by = "ParticipantID")

# join mean_daily_comments to sample
s <- left_join(sample, mean_daily_comments, by = "ParticipantID")

#come back and use S for this
daily_summary_act %>% pull(created_comment) %>% hist(breaks = "day") 

# bar chart of comment count by day, stacked by source=="internal" vs source=="external"
barplot(table(daily_summary_act$source, daily_summary_act$created_comment), beside = T, col = c("grey50", "#6699FF"), legend = T, border = NA)

# break down by condition
control <- s %>% filter(condition == "control")
moderation <- s %>% filter(condition == "moderation")
incentives <- s %>% filter(condition == "incentives")

# copy of 3-gang version subset by condition, not logged
this_xlim = c(0,12)
this_ylim = c(0,5)

par(pty="s", mfrow = c(1,3), cex = .6)
plot((control$mean_daily_comments_prestudy), (control$mean_daily_comments_instudy), xlab = "Mean daily comments pre-study", ylab = "Mean daily comments in study", main = "Control", pch = 16, col = colors['control'], xlim = this_xlim, ylim = this_ylim)
abline(0,1)
plot((moderation$mean_daily_comments_prestudy), (moderation$mean_daily_comments_instudy), xlab = "Mean daily comments pre-study", ylab = "Mean daily comments in study", main = "Moderation", pch = 16, col = colors['moderation'], xlim = this_xlim, ylim = this_ylim)
abline(0,1)
plot((incentives$mean_daily_comments_prestudy), (incentives$mean_daily_comments_instudy), xlab = "Mean daily comments pre-study", ylab = "Mean daily comments in study", main = "Incentives", pch = 16, col = colors['incentives'], xlim = this_xlim, ylim = this_ylim)
abline(0,1)

#plot ratio by prestudy
s <- s %>% mutate(prestudy_instudy_comment_ratio = mean_daily_comments_instudy/mean_daily_comments_prestudy)

control <- s %>% filter(condition == "control")
moderation <- s %>% filter(condition == "moderation")
incentives <- s %>% filter(condition == "incentives")

par(cex = .8)
plot(control$mean_daily_comments_prestudy, control$prestudy_instudy_comment_ratio, xlab = "Mean daily comments pre-study", ylab = "In-Study/Pre-Study", main = "Mean daily comments in study / pre-study ", pch = 16, col = colors['control'])
points(moderation$mean_daily_comments_prestudy, moderation$prestudy_instudy_comment_ratio, pch = 16, col = colors['moderation'])
points(incentives$mean_daily_comments_prestudy, incentives$prestudy_instudy_comment_ratio, pch = 16, col = colors['incentives'])

#plot difference by prestudy
s <- s %>% mutate(prestudy_instudy_comment_difference = mean_daily_comments_instudy-mean_daily_comments_prestudy)

control <- s %>% filter(condition == "control")
moderation <- s %>% filter(condition == "moderation")
incentives <- s %>% filter(condition == "incentives")

par(cex = .8)
plot(control$mean_daily_comments_prestudy, control$prestudy_instudy_comment_difference, xlab = "Mean daily comments pre-study", ylab = "In-Study - Pre-Study", main = "Mean daily comments pre-study vs in study", pch = 16, col = colors['control'])
points(moderation$mean_daily_comments_prestudy, moderation$prestudy_instudy_comment_difference, pch = 16, col = colors['moderation'])
points(incentives$mean_daily_comments_prestudy, incentives$prestudy_instudy_comment_difference, pch = 16, col = colors['incentives'])
abline(h = 0, lty = 2)
abline(0,-1)

#### Make Raw non-diff versions
#keep
#pdf("ws_outputs/comment_raw_by_prestudy_quantile.pdf", width = 8, height = 10)

s <- s %>% 
  mutate(prestudy_quantile_bin = cut(mean_daily_comments_prestudy, breaks = quantile(mean_daily_comments_prestudy, probs = seq(0, 1, by = 0.25), na.rm = TRUE)))

# plot s subsets
control <- s %>% filter(condition == "control") %>% 
  select(prestudy_quantile_bin, mean_daily_comments_instudy) %>% 
  drop_na()
moderation <- s %>% 
  filter(condition == "moderation") %>% 
  select(prestudy_quantile_bin, mean_daily_comments_instudy) %>% 
  drop_na()
incentives <- s %>% 
  filter(condition == "incentives") %>% 
  select(prestudy_quantile_bin, mean_daily_comments_instudy) %>% 
  drop_na()

#fill out the missing bins in control
bins <- s %>% pull(prestudy_quantile_bin) %>% unique()
#remove NA values and put in order
bins <- bins[!is.na(bins)]
bins <- bins[order(bins)]
control <- control %>% complete(prestudy_quantile_bin = bins)

par(mfrow = c(3,1))
this_ylim = c(0,4)
#this_ylim = NULL

this_xlab = "Pre-Study Mean Daily Comments"
this_ylab = "In-Study Mean Daily Comments"
this_main = "Pre-Study vs In-Study Mean Daily Comments"

boxplot(mean_daily_comments_instudy ~ prestudy_quantile_bin, data = control, xlab =this_xlab, ylab = this_ylab, main = this_main, pch = 16, col = colors['control'], ylim =this_ylim)
#abline(h = 0, lty = 2)
#abline(0,-1)

plot(mean_daily_comments_instudy ~ prestudy_quantile_bin, data = moderation, xlab = this_xlab, ylab = this_ylab, main = this_main, pch = 16, col = colors['moderation'], ylim = this_ylim)
#abline(h = 0, lty = 2)
#abline(0,-1)

plot(mean_daily_comments_instudy ~ prestudy_quantile_bin, data = incentives, xlab = this_xlab, ylab = this_ylab, main = this_main, pch = 16, col = colors['incentives'], ylim = this_ylim)
#abline(h = 0, lty = 2)
#abline(0,-1)

legend("topright", legend = c("Control", "Moderation", "Incentives"), col = c(colors['control'], colors['moderation'], colors['incentives']), pch = 15, bty = "n")

#dev.off()