library(tidyverse)
library(readxl)
library(here)
library(kableExtra)
library(DT)
library(data.table)
DESCRIPTIVE STATISTICS
COMPARING MEANS
CATEGORICAL ANALYSIS
ANOVA
LINEAR REGRESSION
# sneak peek
glimpse(all_raw[sample(nrow(all_raw)),1:8])
## Rows: 4,310,422
## Columns: 8
## $ DATE <chr> "2021-12-17", "2022-03-09", "2022-01-14", "2022-03-20", "2~
## $ TIME <chr> "14:52:59", "15:23:37", "13:27:26", "15:38:42", "11:30:47"~
## $ TITLE <chr> "Katni autobus Neoplan Auwarter, 1990.g. preuređen u kampe~
## $ FROM <chr> "aukcije.hr", "fashion.hr", "vecernji.hr", "forum.hr", "fo~
## $ AUTHOR <chr> "aukcije.hr", NA, NA, NA, NA, "anonymous_user", NA, NA, "@~
## $ URL <chr> "https://www.facebook.com/112885299395/posts/1015996338445~
## $ URL_PHOTO <chr> "https://mediatoolkit.com/img/50x50,sc,scuMDJuov0rsDwLyfK9~
## $ SOURCE_TYPE <chr> "facebook", "web", "comment", "forum", "forum", "instagram~
# size of the data
dim(all_raw)
## [1] 4310422 47
# time range
range(all_raw$DATE)
## [1] "2021-11-07" "2022-03-27"
# how much activity
== "web", .N] all_raw[SOURCE_TYPE
## [1] 1803578
# how many authors
== "web", length(unique(all_raw$AUTHOR))] all_raw[SOURCE_TYPE
## [1] 123952
# how many domains? (note: all_raw$FROM in j refers to the full object, so the i filter is ignored here)
all_raw[SOURCE_TYPE == "web", length(unique(all_raw$FROM))]
## [1] 87001
# how many domains within the web subset?
all_raw[SOURCE_TYPE == "web",
        .(Number = length(unique(FROM)))]
## Number
## 1: 4142
# how many CRO domains?
== "web" & grepl(".hr", unique(all_raw$FROM)),
all_raw[SOURCE_TYPE Number = length(unique(FROM)))] .(
## Number
## 1: 1809
# some domains?
web[,.(UniqueDomains = unique(FROM))] %>% sample_n(5)
## UniqueDomains
## 1: restaurant-hotel.com.hr
## 2: teleskop.hr
## 3: prevoditelj-teksta.com
## 4: uik.hr
## 5: novaknjiga.com
# some CRO domains?
str_detect(web$FROM, "hr"),.(UniqueDomains = unique(FROM))] %>% sample_n(5) web[
## UniqueDomains
## 1: harta.hr
## 2: lika-express.hr
## 3: lccshop.hr
## 4: knjiznica-bjelovar.hr
## 5: muo.hr
# check all domains
str_detect(web$FROM, ".hr"),.N,
web[order(-N),] %>%
FROM][datatable(., rownames = FALSE, options = list(pageLength = 5, scrollX=T))
str_detect(web$FROM, ".hr"),.N,FROM][N >= 50,][order(-N),] %>%
web[ggplot(., aes(N)) +
geom_histogram(bins = 10, color = "#000000", fill = "#0099F8") +
geom_vline(aes(xintercept = mean(N)), color = "#000000", size = 1.25) +
geom_vline(aes(xintercept = mean(N) + sd(N)), color = "#000000", size = 1, linetype = "dashed") +
geom_vline(aes(xintercept = mean(N) - sd(N)), color = "#000000", size = 1, linetype = "dashed") +
labs(
title = "Histogram of portal activity in Croatia",
subtitle = "Made by LSMA",
caption = "Source: Mediatoolkit dataset",
x = "Number of articles",
y = "Count"
+
) theme_classic() +
theme(
plot.title = element_text(color = "#0099F8", size = 16, face = "bold"),
plot.subtitle = element_text(size = 10, face = "bold"),
plot.caption = element_text(face = "italic")
-> gg1 )
# just another view
str_detect(web$FROM, ".hr"),.N,FROM][, All := sum(N)][, avg := (N / All)*100][order(-N),] web[
## FROM N All avg
## 1: index.hr 63592 1429761 4.447736e+00
## 2: jutarnji.hr 52188 1429761 3.650121e+00
## 3: slobodnadalmacija.hr 52180 1429761 3.649561e+00
## 4: 24sata.hr 46103 1429761 3.224525e+00
## 5: glasistre.hr 44296 1429761 3.098140e+00
## ---
## 2488: unison.hr 1 1429761 6.994176e-05
## 2489: knjizaraum.hr 1 1429761 6.994176e-05
## 2490: dive-in.com.hr 1 1429761 6.994176e-05
## 2491: dom-zdravlja-metkovic.hr 1 1429761 6.994176e-05
## 2492: civz.hr 1 1429761 6.994176e-05
# check overall descriptives (number of articles)
str_detect(web$FROM, ".hr"),.N,FROM][,.(mean = mean(N),stdev = sd(N), total = sum(N))] web[
## mean stdev total
## 1: 573.7404 3438.654 1429761
# check ranking by No. of articles
str_detect(web$FROM, ".hr"),
web[REACH = sum(REACH, na.rm = T),
.(.N,VIRALITY = sum(VIRALITY, na.rm = T),
LIKE = sum(LIKE_COUNT, na.rm = T),
COMMENT = sum(COMMENT_COUNT, na.rm = T)),
order(-N),] %>%
FROM][datatable(., rownames = FALSE, options = list(pageLength = 5, scrollX=T) )
# check overall descriptives (number of comments)
str_detect(web$FROM, ".hr"),
web[COMMENT = sum(COMMENT_COUNT, na.rm = T)),FROM][,.(mean = mean(COMMENT),
.(stdev = sd(COMMENT),
total = sum(COMMENT))]
## mean stdev total
## 1: 10411.01 161745.8 25944235
# check ranking by No. of comments
str_detect(web$FROM, ".hr"),
web[REACH = sum(REACH, na.rm = T),
.(.N,VIRALITY = sum(VIRALITY, na.rm = T),
LIKE = sum(LIKE_COUNT, na.rm = T),
COMMENT = sum(COMMENT_COUNT, na.rm = T)),
order(-COMMENT),] %>%
FROM][datatable(., rownames = FALSE, options = list(pageLength = 5, scrollX=T))
# check overall descriptives (number of likes)
str_detect(web$FROM, ".hr"),
web[LIKE = sum(LIKE_COUNT, na.rm = T)),FROM][,.(mean = mean(LIKE),
.(stdev = sd(LIKE),
total = sum(LIKE))]
## mean stdev total
## 1: 27785.09 384659.3 69240439
# check ranking by No. of likes
str_detect(web$FROM, ".hr"),
web[REACH = sum(REACH, na.rm = T),
.(.N,VIRALITY = sum(VIRALITY, na.rm = T),
LIKE = sum(LIKE_COUNT, na.rm = T),
COMMENT = sum(COMMENT_COUNT, na.rm = T)),
order(-LIKE),] %>%
FROM][datatable(., rownames = FALSE, options = list(pageLength = 5, scrollX=T))
# check overall descriptives (reach)
str_detect(web$FROM, ".hr"),
web[REACH = sum(REACH, na.rm = T)),FROM][,.(mean = mean(REACH),
.(stdev = sd(REACH),
total = sum(REACH))]
## mean stdev total
## 1: 2280700 33341915 5683505057
# check ranking by reach
str_detect(web$FROM, ".hr"),
web[REACH = sum(REACH, na.rm = T),
.(.N,VIRALITY = sum(VIRALITY, na.rm = T),
LIKE = sum(LIKE_COUNT, na.rm = T),
COMMENT = sum(COMMENT_COUNT, na.rm = T)),
order(-REACH),] %>%
FROM][datatable(., rownames = FALSE, options = list(pageLength = 5, scrollX=T))
Let's compare the average activity of the top 100 portals selected by different criteria (comments, likes, reach).
We first have to subset these rankings:
# select data to object
webRanking <- web[str_detect(web$FROM, ".hr"),
                  .(.N, REACH = sum(REACH, na.rm = T),
                    VIRALITY = sum(VIRALITY, na.rm = T),
                    LIKE = sum(LIKE_COUNT, na.rm = T),
                    COMMENT = sum(COMMENT_COUNT, na.rm = T)),
                  FROM]
# filter 100 biggest by criteria
portals100 <- webRanking[head(order(-N),100),.(FROM,N)]
reach100   <- webRanking[head(order(-REACH),100),.(FROM,N, REACH)]
like100    <- webRanking[head(order(-LIKE),100),.(FROM,N, LIKE)]
comment100 <- webRanking[head(order(-COMMENT),100),.(FROM,N, COMMENT)]
Now let's use a one-sample t-test to check whether the 100 biggest portals publish the average number of articles (482; see above).
# library from the lsr book
library(lsr)
# do the test
oneSampleTTest(portals100$N, mu=482 )
##
## One sample t-test
##
## Data variable: portals100$N
##
## Descriptive statistics:
## N
## mean 11070.230
## std dev. 13375.141
##
## Hypotheses:
## null: population mean equals 482
## alternative: population mean not equal to 482
##
## Test results:
## t-statistic: 7.916
## degrees of freedom: 99
## p-value: <.001
##
## Other information:
## two-sided 95% confidence interval: [8416.312, 13724.148]
## estimated effect size (Cohen's d): 0.792
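The same one-sample test can also be run with base R's t.test(), which reports the identical test in a more compact format; a minimal equivalent using the objects created above:
# base R equivalent of the one-sample t-test above (mu is the hypothesised mean)
t.test(portals100$N, mu = 482)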
Our data are not independent, so we use a paired test:
# first check reach vs. like
t.test(
x = reach100$N, # variable 1 is the number of articles for 100 biggest portals by reach
y = like100$N, # variable 2 is the number of articles for 100 biggest portals by like
paired = TRUE # paired test
)
##
## Paired t-test
##
## data: reach100$N and like100$N
## t = 0.30265, df = 99, p-value = 0.7628
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1482.387 2015.987
## sample estimates:
## mean of the differences
## 266.8
# second check reach vs. comment
t.test(
x = reach100$N,
y = comment100$N,
paired = TRUE
)
##
## Paired t-test
##
## data: reach100$N and comment100$N
## t = 0.16114, df = 99, p-value = 0.8723
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1840.034 2165.314
## sample estimates:
## mean of the differences
## 162.64
# third check comment vs. like
t.test(
x = comment100$N,
y = like100$N,
paired = TRUE
)
##
## Paired t-test
##
## data: comment100$N and like100$N
## t = 0.10904, df = 99, p-value = 0.9134
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1791.231 1999.551
## sample estimates:
## mean of the differences
## 104.16
# fourth check comment vs. reach
t.test(
x = comment100$N,
y = reach100$N,
paired = TRUE # paired test
)
##
## Paired t-test
##
## data: comment100$N and reach100$N
## t = -0.16114, df = 99, p-value = 0.8723
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2165.314 1840.034
## sample estimates:
## mean of the differences
## -162.64
Running independent t-tests requires some indexing:
# filter 100 - 200 biggest by criteria
portals200 <- webRanking[order(-N),.(FROM,N)] %>% slice(101:201)
reach200   <- webRanking[order(-REACH),.(FROM,N, REACH)] %>% slice(101:201)
like200    <- webRanking[order(-LIKE),.(FROM,N, LIKE)] %>% slice(101:201)
comment200 <- webRanking[order(-COMMENT),.(FROM,N, COMMENT)] %>% slice(101:201)
Now we can test whether the activity (mean number of published articles) of the first and second hundred biggest portals is the same:
# first we need to check the data to choose the right test
mean(portals100$N)
## [1] 11070.23
mean(portals200$N)
## [1] 1499.683
sd(portals100$N)
## [1] 13375.14
sd(portals200$N)
## [1] 415.8958
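The very different standard deviations above already suggest unequal variances, which is why the Welch version is used below. For a formal check one can run an F test of equal variances; a minimal sketch, using the objects created above:
# F test of equal variances; a small p-value argues for var.equal = FALSE (Welch)
var.test(portals100$N, portals200$N)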
# then run the test
t.test(
x = portals100$N,
y = portals200$N,
var.equal = FALSE, # Welch test
paired = FALSE # independent test
)
##
## Welch Two Sample t-test
##
## data: portals100$N and portals200$N
## t = 7.1521, df = 99.19, p-value = 1.484e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 6915.421 12225.672
## sample estimates:
## mean of x mean of y
## 11070.230 1499.683
# second check for the reach
mean(reach100$N)
## [1] 9063.16
mean(reach200$N)
## [1] 1466.485
sd(reach100$N)
## [1] 13160.93
sd(reach200$N)
## [1] 1422.395
# then run the test
t.test(
x = reach100$N,
y = reach200$N,
var.equal = TRUE, # Student test
paired = FALSE # independent test
)
##
## Two Sample t-test
##
## data: reach100$N and reach200$N
## t = 5.7672, df = 199, p-value = 3.043e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 4999.151 10194.199
## sample estimates:
## mean of x mean of y
## 9063.160 1466.485
To see how trustworthy these tests are, one could also check the distributional properties (normality) with a histogram, a Q-Q plot, and/or a Shapiro-Wilk test.
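A minimal sketch of such checks for the article counts of the top-100 portals (base R only; shapiro.test() accepts at most 5000 observations, which is not a problem for 100 values):
# visual checks: histogram and normal Q-Q plot of the article counts
hist(portals100$N, breaks = 20, main = "Top 100 portals", xlab = "Number of articles")
qqnorm(portals100$N)
qqline(portals100$N)
# formal check: a small p-value indicates a departure from normality
shapiro.test(portals100$N)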
Here we have two (famous) tests: Goodness of Fit and Independence of association.
To run the GOF test we will use the full dataset (not just web) to see whether all of the networks are equally important. Let's explore the data once again:
# check the activity per network
table(all_raw$SOURCE_TYPE)
##
## comment facebook forum instagram reddit twitter web youtube
## 111494 430719 964321 84816 258684 454078 1803578 202732
# run the GOF test
goodnessOfFitTest(as.factor(all_raw$SOURCE_TYPE))
##
## Chi-square test against specified probabilities
##
## Data variable: as.factor(all_raw$SOURCE_TYPE)
##
## Hypotheses:
## null: true probabilities are as specified
## alternative: true probabilities differ from those specified
##
## Descriptives:
## observed freq. expected freq. specified prob.
## comment 111494 538802.8 0.125
## facebook 430719 538802.8 0.125
## forum 964321 538802.8 0.125
## instagram 84816 538802.8 0.125
## reddit 258684 538802.8 0.125
## twitter 454078 538802.8 0.125
## web 1803578 538802.8 0.125
## youtube 202732 538802.8 0.125
##
## Test results:
## X-squared statistic: 4416624
## degrees of freedom: 7
## p-value: <.001
A different kind of probability specification is also possible. For example, we may want to test whether the probability of web activity in the whole social media space equals 40%. To complete the example, let's first do some data tweaking:
# create new column for web and everything else
all_raw[, Network := if_else(SOURCE_TYPE != "web", "other", "web")][, Network := as.factor(Network)]
# check the size of web vs. other media
table(all_raw$Network)
##
## other web
## 2506844 1803578
Now, let's run the test:
# specify the probabilities
probs = c(other = 0.60, web = 0.40)
probs
## other web
## 0.6 0.4
# run the test
goodnessOfFitTest( all_raw$Network, p = probs )
##
## Chi-square test against specified probabilities
##
## Data variable: all_raw$Network
##
## Hypotheses:
## null: true probabilities are as specified
## alternative: true probabilities differ from those specified
##
## Descriptives:
## observed freq. expected freq. specified prob.
## other 2506844 2586253 0.6
## web 1803578 1724169 0.4
##
## Test results:
## X-squared statistic: 6095.518
## degrees of freedom: 1
## p-value: <.001
# another way to run the test (base R's chisq.test needs the table of counts, not the raw factor)
#chisq.test(table(all_raw$Network), p = probs)
For this test we need to have at least two factor variables. Let's make some new variables:
# package
library(anytime)
library(lubridate)
# convert chr date to date format
all_raw[,DTIME := anytime(paste(all_raw$DATE,all_raw$TIME))]
# make lubridate
all_raw[,DTIME := ymd_hms(paste(all_raw$DATE,all_raw$TIME))]
# create breaks
breaks <- hour(hm("00:00", "6:00", "12:00", "18:00", "23:59"))
# labels for the breaks
labels <- c("Night", "Morning", "Afternoon", "Evening")
# make a new variable
all_raw[, INTERDAYTIME := cut(x = hour(all_raw$DTIME),
                              breaks = breaks,
                              labels = labels,
                              include.lowest = TRUE)][
  , INTERDAYTIME := as.factor(INTERDAYTIME)][
  , SOURCE_TYPE := as.factor(SOURCE_TYPE)
]
Now, we should check what these variables look like:
# check the result
table(all_raw$INTERDAYTIME)
##
## Night Morning Afternoon Evening
## 339443 1478728 1522517 969734
# also peak into activity again (&again)
table(all_raw$SOURCE_TYPE)
##
## comment facebook forum instagram reddit twitter web youtube
## 111494 430719 964321 84816 258684 454078 1803578 202732
# we are interested in the cross-tabulated view
xtabs(~ INTERDAYTIME + SOURCE_TYPE, all_raw)
## SOURCE_TYPE
## INTERDAYTIME comment facebook forum instagram reddit twitter web youtube
## Night 13296 13445 74389 3475 39531 48144 128644 18519
## Morning 33713 178120 250544 32437 71936 138001 721982 51995
## Afternoon 37772 158794 356845 31684 78824 150027 632794 75777
## Evening 26713 80360 282543 17220 68393 117906 320158 56441
Finally, let's run the test:
associationTest( formula = ~ INTERDAYTIME + SOURCE_TYPE, data = all_raw )
##
## Chi-square test of categorical association
##
## Variables: INTERDAYTIME, SOURCE_TYPE
##
## Hypotheses:
## null: variables are independent of one another
## alternative: some contingency exists between variables
##
## Observed contingency table:
## SOURCE_TYPE
## INTERDAYTIME comment facebook forum instagram reddit twitter web youtube
## Night 13296 13445 74389 3475 39531 48144 128644 18519
## Morning 33713 178120 250544 32437 71936 138001 721982 51995
## Afternoon 37772 158794 356845 31684 78824 150027 632794 75777
## Evening 26713 80360 282543 17220 68393 117906 320158 56441
##
## Expected contingency table under the null hypothesis:
## SOURCE_TYPE
## INTERDAYTIME comment facebook forum instagram reddit twitter web youtube
## Night 8780 33919 75940 6679 20371 35758 142031 15965
## Morning 38249 147762 330819 29097 88744 155775 618733 69549
## Afternoon 39382 152138 340615 29959 91372 160388 637056 71609
## Evening 25083 96901 216947 19081 58197 102156 405759 45609
##
## Test results:
## X-squared statistic: 145422.5
## degrees of freedom: 21
## p-value: <.001
##
## Other information:
## estimated effect size (Cramer's v): 0.106
We are interested in differences in web activity with respect to the intraday period. Let's first select the data of interest and check the relevant descriptives.
# first take relevant data from web
anova <- all_raw[SOURCE_TYPE == "web",.(INTERDAYTIME, LIKE_COUNT, COMMENT_COUNT, REACH)]
# summarise data
anova[,.(LIKE = mean(LIKE_COUNT, na.rm = TRUE),
         COMMENT = mean(COMMENT_COUNT, na.rm = TRUE),
         REACH = mean(REACH, na.rm = TRUE)),
      INTERDAYTIME]
## INTERDAYTIME LIKE COMMENT REACH
## 1: Evening 54.11983 19.375075 4234.318
## 2: Afternoon 47.53877 17.191580 4191.948
## 3: Morning 51.86043 18.361232 4848.696
## 4: Night 24.46538 7.718923 4136.323
Now we can run the ANOVA test(s):
# test for likes
summary(aov( formula = LIKE_COUNT ~ INTERDAYTIME, data = anova))
## Df Sum Sq Mean Sq F value Pr(>F)
## INTERDAYTIME 3 9.299e+07 30997021 88.35 <2e-16 ***
## Residuals 1803211 6.327e+11 350848
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 363 observations deleted due to missingness
# test for comments
summary(aov( formula = COMMENT_COUNT ~ INTERDAYTIME, data = anova))
## Df Sum Sq Mean Sq F value Pr(>F)
## INTERDAYTIME 3 1.400e+07 4665442 162.5 <2e-16 ***
## Residuals 1803211 5.178e+10 28716
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 363 observations deleted due to missingness
# test for reach
summary(aov( formula = REACH ~ INTERDAYTIME, data = anova))
## Df Sum Sq Mean Sq F value Pr(>F)
## INTERDAYTIME 3 1.843e+11 6.144e+10 15.87 2.56e-10 ***
## Residuals 1803474 6.980e+15 3.871e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 100 observations deleted due to missingness
The usual pipeline consists of the following elements:
# select relevant data
reg <- all_raw[SOURCE_TYPE == "web",.(REACH, INTERDAYTIME, LIKE_COUNT, COMMENT_COUNT, TITLE, .N)][,TITLE := nchar(TITLE)]
# check descriptives
reg[,.(meanREACH = mean(REACH, na.rm = TRUE),
       meanLIKE = mean(LIKE_COUNT, na.rm = TRUE),
       meanCOMMENT = mean(COMMENT_COUNT, na.rm = TRUE),
       No = mean(N),
       meanTIT = mean(TITLE, na.rm = TRUE)
)]
## meanREACH meanLIKE meanCOMMENT No meanTIT
## 1: 4458.414 48.79083 17.37161 1803578 72.41959
Let's also make some visual inspections:
ggplot(reg, aes(x = log(REACH))) +
geom_histogram(bins = 50, color = "#000000", fill = "#0099F8") +
labs(
title = "Distribution of REACH metric",
subtitle = "Made by LSMA",
caption = "Source: Mediatoolkit dataset",
x = "REACH (log scale)",
y = "Count"
) +
theme_classic() +
theme(
plot.title = element_text(color = "#0099F8", size = 16, face = "bold"),
plot.subtitle = element_text(size = 10, face = "bold"),
plot.caption = element_text(face = "italic")
)
ggplot(reg, aes(x = log(LIKE_COUNT))) +
geom_histogram(bins = 50, color = "#000000", fill = "#0099F8") +
labs(
title = "Distribution of LIKE_COUNT metric",
subtitle = "Made by LSMA",
caption = "Source: Mediatoolkit dataset",
x = "LIKE (log scale)",
y = "Count"
) +
theme_classic() +
theme(
plot.title = element_text(color = "#0099F8", size = 16, face = "bold"),
plot.subtitle = element_text(size = 10, face = "bold"),
plot.caption = element_text(face = "italic")
)
ggplot(reg, aes(x = log(COMMENT_COUNT))) +
geom_histogram(bins = 50, color = "#000000", fill = "#0099F8") +
labs(
title = "Distribution of COMMENT_COUNT metric",
subtitle = "Made by LSMA",
caption = "Source: Mediatoolkit dataset",
x = "COMMENT (log scale)",
y = "Count"
) +
theme_classic() +
theme(
plot.title = element_text(color = "#0099F8", size = 16, face = "bold"),
plot.subtitle = element_text(size = 10, face = "bold"),
plot.caption = element_text(face = "italic")
)
ggplot(reg, aes(x = INTERDAYTIME, y = log(REACH))) +
geom_boxplot() +
labs(x = "INTERDAYTIME", y = "REACH") +
labs(
title = "Distribution of REACH by time of the day",
subtitle = "Made by LSMA",
caption = "Source: Mediatoolkit dataset",
x = "INTERDAY",
y = "Reach (log scale)"
) +
theme_classic() +
theme(
plot.title = element_text(color = "#0099F8", size = 16, face = "bold"),
plot.subtitle = element_text(size = 10, face = "bold"),
plot.caption = element_text(face = "italic")
)
Also, let's see how the relationships look:
ggplot(reg, aes(x = REACH, y = LIKE_COUNT)) +
geom_point() +
labs(x = "REACH", y = "LIKE_COUNT") +
geom_smooth(method = "lm", se = FALSE)
ggplot(reg, aes(x = REACH, y = COMMENT_COUNT)) +
geom_point() +
labs(x = "REACH", y = "COMMENT_COUNT") +
geom_smooth(method = "lm", se = FALSE)
Now we can fit a first regression model:
# run the model1
model1 <- lm(REACH ~ COMMENT_COUNT, data = reg)
# output content
summary(model1)
##
## Call:
## lm(formula = REACH ~ COMMENT_COUNT, data = reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11532340 -345 -211 887 12207675
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 345.8384 35.5965 9.716 <2e-16 ***
## COMMENT_COUNT 236.7576 0.2089 1133.145 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47550 on 1803213 degrees of freedom
## (363 observations deleted due to missingness)
## Multiple R-squared: 0.4159, Adjusted R-squared: 0.4159
## F-statistic: 1.284e+06 on 1 and 1803213 DF, p-value: < 2.2e-16
# run the model2
model2 <- lm(REACH ~ LIKE_COUNT, data = reg)
# output content
summary(model2)
##
## Call:
## lm(formula = REACH ~ LIKE_COUNT, data = reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9201845 -703 -645 352 11695363
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 702.92949 31.63060 22.22 <2e-16 ***
## LIKE_COUNT 76.97697 0.05322 1446.48 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42330 on 1803213 degrees of freedom
## (363 observations deleted due to missingness)
## Multiple R-squared: 0.5371, Adjusted R-squared: 0.5371
## F-statistic: 2.092e+06 on 1 and 1803213 DF, p-value: < 2.2e-16
# run the model3
model3 <- lm(REACH ~ TITLE, data = reg)
# output content
summary(model3)
##
## Call:
## lm(formula = REACH ~ TITLE, data = reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6033 -4222 -3613 -1850 15016763
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3168.682 114.529 27.67 <2e-16 ***
## TITLE 17.901 1.446 12.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 62270 on 1799990 degrees of freedom
## (3586 observations deleted due to missingness)
## Multiple R-squared: 8.516e-05, Adjusted R-squared: 8.461e-05
## F-statistic: 153.3 on 1 and 1799990 DF, p-value: < 2.2e-16
# run the model4
model4 <- lm(REACH ~ INTERDAYTIME, data = reg)
# output content
summary(model4)
##
## Call:
## lm(formula = REACH ~ INTERDAYTIME, data = reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4849 -4219 -3908 -1646 15015852
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4136.32 173.46 23.846 < 2e-16 ***
## INTERDAYTIMEMorning 712.37 188.28 3.784 0.000155 ***
## INTERDAYTIMEAfternoon 55.62 190.27 0.292 0.770026
## INTERDAYTIMEEvening 97.99 205.38 0.477 0.633261
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 62210 on 1803474 degrees of freedom
## (100 observations deleted due to missingness)
## Multiple R-squared: 2.64e-05, Adjusted R-squared: 2.474e-05
## F-statistic: 15.87 on 3 and 1803474 DF, p-value: 2.564e-10
Combine all variables together in a multiple regression model:
# run the model5
model5 <- lm(REACH ~ LIKE_COUNT + COMMENT_COUNT + TITLE + INTERDAYTIME,
             data = reg)
# output content
summary(model5)
##
## Call:
## lm(formula = REACH ~ LIKE_COUNT + COMMENT_COUNT + TITLE + INTERDAYTIME,
## data = reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7025092 -931 37 1322 11439598
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.505e+03 1.298e+02 27.01 <2e-16 ***
## LIKE_COUNT 5.881e+01 7.526e-02 781.41 <2e-16 ***
## COMMENT_COUNT 8.756e+01 2.631e-01 332.72 <2e-16 ***
## TITLE -2.361e+01 9.586e-01 -24.63 <2e-16 ***
## INTERDAYTIMEMorning -1.590e+03 1.251e+02 -12.70 <2e-16 ***
## INTERDAYTIMEAfternoon -1.888e+03 1.265e+02 -14.93 <2e-16 ***
## INTERDAYTIMEEvening -2.414e+03 1.365e+02 -17.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41120 on 1799725 degrees of freedom
## (3846 observations deleted due to missingness)
## Multiple R-squared: 0.5641, Adjusted R-squared: 0.5641
## F-statistic: 3.882e+05 on 6 and 1799725 DF, p-value: < 2.2e-16
The procedure doesn't end with fitting the model. We also need to choose the best model and assess its quality. The main criterion for a regression model is R^2.
library(moderndive)
# Get fitted values & residuals, compute R^2 from the residuals for model1
get_regression_points(model1) %>%
summarize(r_squared = 1 - var(residual) / var(REACH))
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.416
# Get fitted values & residuals, compute R^2 from the residuals for model2
get_regression_points(model2) %>%
summarize(r_squared = 1 - var(residual) / var(REACH))
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.537
# Get fitted values & residuals, compute R^2 from the residuals for model3
get_regression_points(model3) %>%
summarize(r_squared = 1 - var(residual) / var(REACH))
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.0000852
# Get fitted values & residuals, compute R^2 from the residuals for model4
get_regression_points(model4) %>%
summarize(r_squared = 1 - var(residual) / var(REACH))
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.0000264
# Get fitted values & residuals, compute R^2 from the residuals for model5
get_regression_points(model5) %>%
summarize(r_squared = 1 - var(residual) / var(REACH))
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.564
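To put the five models side by side, the same R^2 values can also be read directly from the lm summaries; a minimal sketch, assuming model1 through model5 are still in the session:
# collect (adjusted) R^2 for all five fitted models in one matrix
sapply(list(model1 = model1, model2 = model2, model3 = model3,
            model4 = model4, model5 = model5),
       function(m) c(r_squared = summary(m)$r.squared,
                     adj_r_squared = summary(m)$adj.r.squared))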