library(tidyverse)
library(readxl)
library(here)
library(kableExtra)
library(DT)
library(data.table)
DESCRIPTIVE STATISTICS
COMPARING MEANS
CATEGORICAL ANALYSIS
ANOVA
LINEAR REGRESSION
# sneak peek
glimpse(all_raw[sample(nrow(all_raw)),1:8])
## Rows: 4,310,422
## Columns: 8
## $ DATE <chr> "2021-12-17", "2022-03-09", "2022-01-14", "2022-03-20", "2~
## $ TIME <chr> "14:52:59", "15:23:37", "13:27:26", "15:38:42", "11:30:47"~
## $ TITLE <chr> "Katni autobus Neoplan Auwarter, 1990.g. preuređen u kampe~
## $ FROM <chr> "aukcije.hr", "fashion.hr", "vecernji.hr", "forum.hr", "fo~
## $ AUTHOR <chr> "aukcije.hr", NA, NA, NA, NA, "anonymous_user", NA, NA, "@~
## $ URL <chr> "https://www.facebook.com/112885299395/posts/1015996338445~
## $ URL_PHOTO <chr> "https://mediatoolkit.com/img/50x50,sc,scuMDJuov0rsDwLyfK9~
## $ SOURCE_TYPE <chr> "facebook", "web", "comment", "forum", "forum", "instagram~
# size of the data
dim(all_raw)
## [1] 4310422 47
# time range
range(all_raw$DATE)
## [1] "2021-11-07" "2022-03-27"
# how much activity
== "web", .N] all_raw[SOURCE_TYPE
## [1] 1803578
# how many authors
== "web", length(unique(all_raw$AUTHOR))] all_raw[SOURCE_TYPE
## [1] 123952
# how many domains? (note: all_raw$FROM in j refers to the full object, so the i filter is ignored here)
all_raw[SOURCE_TYPE == "web", length(unique(all_raw$FROM))]
## [1] 87001
# how many domains within the web subset?
all_raw[SOURCE_TYPE == "web",
        .(Number = length(unique(FROM)))]
## Number
## 1: 4142
# how many CRO domains?
== "web" & grepl(".hr", unique(all_raw$FROM)),
all_raw[SOURCE_TYPE Number = length(unique(FROM)))] .(
## Number
## 1: 1809
# some domains?
web[,.(UniqueDomains = unique(FROM))] %>% sample_n(5)
## UniqueDomains
## 1: restaurant-hotel.com.hr
## 2: teleskop.hr
## 3: prevoditelj-teksta.com
## 4: uik.hr
## 5: novaknjiga.com
# some CRO domains?
str_detect(web$FROM, "hr"),.(UniqueDomains = unique(FROM))] %>% sample_n(5) web[
## UniqueDomains
## 1: harta.hr
## 2: lika-express.hr
## 3: lccshop.hr
## 4: knjiznica-bjelovar.hr
## 5: muo.hr
# check all domains
str_detect(web$FROM, ".hr"),.N,
web[order(-N),] %>%
FROM][datatable(., rownames = FALSE, options = list(pageLength = 5, scrollX=T))
str_detect(web$FROM, ".hr"),.N,FROM][N >= 50,][order(-N),] %>%
web[ggplot(., aes(N)) +
geom_histogram(bins = 10, color = "#000000", fill = "#0099F8") +
geom_vline(aes(xintercept = mean(N)), color = "#000000", size = 1.25) +
geom_vline(aes(xintercept = mean(N) + sd(N)), color = "#000000", size = 1, linetype = "dashed") +
geom_vline(aes(xintercept = mean(N) - sd(N)), color = "#000000", size = 1, linetype = "dashed") +
labs(
title = "Histogram of portal activity in Croatia",
subtitle = "Made by LSMA",
caption = "Source: Mediatoolkit dataset",
x = "Number of articles",
y = "Count"
+
) theme_classic() +
theme(
plot.title = element_text(color = "#0099F8", size = 16, face = "bold"),
plot.subtitle = element_text(size = 10, face = "bold"),
plot.caption = element_text(face = "italic")
-> gg1 )
# just another view
str_detect(web$FROM, ".hr"),.N,FROM][, All := sum(N)][, avg := (N / All)*100][order(-N),] web[
## FROM N All avg
## 1: index.hr 63592 1429761 4.447736e+00
## 2: jutarnji.hr 52188 1429761 3.650121e+00
## 3: slobodnadalmacija.hr 52180 1429761 3.649561e+00
## 4: 24sata.hr 46103 1429761 3.224525e+00
## 5: glasistre.hr 44296 1429761 3.098140e+00
## ---
## 2488: unison.hr 1 1429761 6.994176e-05
## 2489: knjizaraum.hr 1 1429761 6.994176e-05
## 2490: dive-in.com.hr 1 1429761 6.994176e-05
## 2491: dom-zdravlja-metkovic.hr 1 1429761 6.994176e-05
## 2492: civz.hr 1 1429761 6.994176e-05
# check overall descriptives (number of articles)
str_detect(web$FROM, ".hr"),.N,FROM][,.(mean = mean(N),stdev = sd(N), total = sum(N))] web[
## mean stdev total
## 1: 573.7404 3438.654 1429761
# check ranking by No. of articles
str_detect(web$FROM, ".hr"),
web[REACH = sum(REACH, na.rm = T),
.(.N,VIRALITY = sum(VIRALITY, na.rm = T),
LIKE = sum(LIKE_COUNT, na.rm = T),
COMMENT = sum(COMMENT_COUNT, na.rm = T)),
order(-N),] %>%
FROM][datatable(., rownames = FALSE, options = list(pageLength = 5, scrollX=T) )
# check overall descriptives (number of comments)
str_detect(web$FROM, ".hr"),
web[COMMENT = sum(COMMENT_COUNT, na.rm = T)),FROM][,.(mean = mean(COMMENT),
.(stdev = sd(COMMENT),
total = sum(COMMENT))]
## mean stdev total
## 1: 10411.01 161745.8 25944235
# check ranking by No. of comments
str_detect(web$FROM, ".hr"),
web[REACH = sum(REACH, na.rm = T),
.(.N,VIRALITY = sum(VIRALITY, na.rm = T),
LIKE = sum(LIKE_COUNT, na.rm = T),
COMMENT = sum(COMMENT_COUNT, na.rm = T)),
order(-COMMENT),] %>%
FROM][datatable(., rownames = FALSE, options = list(pageLength = 5, scrollX=T))
# check overall descriptives (number of likes)
str_detect(web$FROM, ".hr"),
web[LIKE = sum(LIKE_COUNT, na.rm = T)),FROM][,.(mean = mean(LIKE),
.(stdev = sd(LIKE),
total = sum(LIKE))]
## mean stdev total
## 1: 27785.09 384659.3 69240439
# check ranking by No. of likes
str_detect(web$FROM, ".hr"),
web[REACH = sum(REACH, na.rm = T),
.(.N,VIRALITY = sum(VIRALITY, na.rm = T),
LIKE = sum(LIKE_COUNT, na.rm = T),
COMMENT = sum(COMMENT_COUNT, na.rm = T)),
order(-LIKE),] %>%
FROM][datatable(., rownames = FALSE, options = list(pageLength = 5, scrollX=T))
# check overall descriptives (reach)
str_detect(web$FROM, ".hr"),
web[REACH = sum(REACH, na.rm = T)),FROM][,.(mean = mean(REACH),
.(stdev = sd(REACH),
total = sum(REACH))]
## mean stdev total
## 1: 2280700 33341915 5683505057
# check ranking by reach
str_detect(web$FROM, ".hr"),
web[REACH = sum(REACH, na.rm = T),
.(.N,VIRALITY = sum(VIRALITY, na.rm = T),
LIKE = sum(LIKE_COUNT, na.rm = T),
COMMENT = sum(COMMENT_COUNT, na.rm = T)),
order(-REACH),] %>%
FROM][datatable(., rownames = FALSE, options = list(pageLength = 5, scrollX=T))
Let's compare the average activity of the top 100 portals selected by different criteria (comments, likes, reach).
We first have to subset these rankings:
# select data to object
webRanking <- web[str_detect(web$FROM, ".hr"),
                  .(.N, REACH = sum(REACH, na.rm = T),
                    VIRALITY = sum(VIRALITY, na.rm = T),
                    LIKE = sum(LIKE_COUNT, na.rm = T),
                    COMMENT = sum(COMMENT_COUNT, na.rm = T)),
                  FROM]
# filter 100 biggest by criteria
portals100 <- webRanking[head(order(-N),100),.(FROM,N)]
reach100   <- webRanking[head(order(-REACH),100),.(FROM,N, REACH)]
like100    <- webRanking[head(order(-LIKE),100),.(FROM,N, LIKE)]
comment100 <- webRanking[head(order(-COMMENT),100),.(FROM,N, COMMENT)]
Now let's use a one-sample t-test to check whether the 100 biggest portals publish the average number of articles (482; see above).
# library from the lsr book
library(lsr)
# do the test
oneSampleTTest(portals100$N, mu=482 )
##
## One sample t-test
##
## Data variable: portals100$N
##
## Descriptive statistics:
## N
## mean 11070.230
## std dev. 13375.141
##
## Hypotheses:
## null: population mean equals 482
## alternative: population mean not equal to 482
##
## Test results:
## t-statistic: 7.916
## degrees of freedom: 99
## p-value: <.001
##
## Other information:
## two-sided 95% confidence interval: [8416.312, 13724.148]
## estimated effect size (Cohen's d): 0.792
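The same one-sample test can also be run with base R's t.test(), which reports the identical test in a more compact format; a minimal equivalent using the objects created above:
# base R equivalent of the one-sample t-test above (mu is the hypothesised mean)
t.test(portals100$N, mu = 482)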
Our data are not independent, so we use a paired test:
# first check reach vs. like
t.test(
x = reach100$N, # variable 1 is the number of articles for 100 biggest portals by reach
y = like100$N, # variable 2 is the number of articles for 100 biggest portals by like
paired = TRUE # paired test
)
##
## Paired t-test
##
## data: reach100$N and like100$N
## t = 0.30265, df = 99, p-value = 0.7628
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1482.387 2015.987
## sample estimates:
## mean of the differences
## 266.8
# second check reach vs. comment
t.test(
x = reach100$N,
y = comment100$N,
paired = TRUE
)
##
## Paired t-test
##
## data: reach100$N and comment100$N
## t = 0.16114, df = 99, p-value = 0.8723
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1840.034 2165.314
## sample estimates:
## mean of the differences
## 162.64
# third check comment vs. like
t.test(
x = comment100$N,
y = like100$N,
paired = TRUE
)
##
## Paired t-test
##
## data: comment100$N and like100$N
## t = 0.10904, df = 99, p-value = 0.9134
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1791.231 1999.551
## sample estimates:
## mean of the differences
## 104.16
# fourth check comment vs. reach
t.test(
x = comment100$N,
y = reach100$N,
paired = TRUE # paired test
)
##
## Paired t-test
##
## data: comment100$N and reach100$N
## t = -0.16114, df = 99, p-value = 0.8723
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2165.314 1840.034
## sample estimates:
## mean of the differences
## -162.64
Running independent t-tests requires some indexing:
# filter 100 - 200 biggest by criteria
portals200 <- webRanking[order(-N),.(FROM,N)] %>% slice(101:201)
reach200   <- webRanking[order(-REACH),.(FROM,N, REACH)] %>% slice(101:201)
like200    <- webRanking[order(-LIKE),.(FROM,N, LIKE)] %>% slice(101:201)
comment200 <- webRanking[order(-COMMENT),.(FROM,N, COMMENT)] %>% slice(101:201)
Now we can test whether the activity (mean number of published articles) of the first and second hundred biggest portals is the same:
# first we need to check the data to choose the right test
mean(portals100$N)
## [1] 11070.23
mean(portals200$N)
## [1] 1499.683
sd(portals100$N)
## [1] 13375.14
sd(portals200$N)
## [1] 415.8958
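The very different standard deviations above already suggest unequal variances, which is why the Welch version is used below. For a formal check one can run an F test of equal variances; a minimal sketch, using the objects created above:
# F test of equal variances; a small p-value argues for var.equal = FALSE (Welch)
var.test(portals100$N, portals200$N)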
# then run the test
t.test(
x = portals100$N,
y = portals200$N,
var.equal = FALSE, # Welch test
paired = FALSE # independent test
)
##
## Welch Two Sample t-test
##
## data: portals100$N and portals200$N
## t = 7.1521, df = 99.19, p-value = 1.484e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 6915.421 12225.672
## sample estimates:
## mean of x mean of y
## 11070.230 1499.683
# second check for the reach
mean(reach100$N)
## [1] 9063.16
mean(reach200$N)
## [1] 1466.485
sd(reach100$N)
## [1] 13160.93
sd(reach200$N)
## [1] 1422.395
# then run the test
t.test(
x = reach100$N,
y = reach200$N,
var.equal = TRUE, # Student test
paired = FALSE # independent test
)
##
## Two Sample t-test
##
## data: reach100$N and reach200$N
## t = 5.7672, df = 199, p-value = 3.043e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 4999.151 10194.199
## sample estimates:
## mean of x mean of y
## 9063.160 1466.485
To see how trustworthy these tests are, one could also check the distributional properties (normality) with a histogram, a Q-Q plot, and/or a Shapiro-Wilk test.
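A minimal sketch of such checks for the article counts of the top-100 portals (base R only; shapiro.test() accepts at most 5000 observations, which is not a problem for 100 values):
# visual checks: histogram and normal Q-Q plot of the article counts
hist(portals100$N, breaks = 20, main = "Top 100 portals", xlab = "Number of articles")
qqnorm(portals100$N)
qqline(portals100$N)
# formal check: a small p-value indicates a departure from normality
shapiro.test(portals100$N)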
Here we have two (famous) tests: Goodness of Fit and Independence of association.
To run the GOF test we will use the full dataset (not just web) to see whether all of the networks are equally important. Let's explore the data once again:
# check the activity per network
table(all_raw$SOURCE_TYPE)
##
## comment facebook forum instagram reddit twitter web youtube
## 111494 430719 964321 84816 258684 454078 1803578 202732
# run the GOF test
goodnessOfFitTest(as.factor(all_raw$SOURCE_TYPE))
##
## Chi-square test against specified probabilities
##
## Data variable: as.factor(all_raw$SOURCE_TYPE)
##
## Hypotheses:
## null: true probabilities are as specified
## alternative: true probabilities differ from those specified
##
## Descriptives:
## observed freq. expected freq. specified prob.
## comment 111494 538802.8 0.125
## facebook 430719 538802.8 0.125
## forum 964321 538802.8 0.125
## instagram 84816 538802.8 0.125
## reddit 258684 538802.8 0.125
## twitter 454078 538802.8 0.125
## web 1803578 538802.8 0.125
## youtube 202732 538802.8 0.125
##
## Test results:
## X-squared statistic: 4416624
## degrees of freedom: 7
## p-value: <.001
A different kind of probability specification is also possible. For example, we may want to test whether the probability of web activity in the whole social media space equals 40%. To complete the example, let's first do some data tweaking:
# create new column for web and everything else
all_raw[, Network := if_else(SOURCE_TYPE != "web", "other", "web")][, Network := as.factor(Network)]
# check the size of web vs. other media
table(all_raw$Network)
##
## other web
## 2506844 1803578
Now, let's run the test:
# specify the probabilities
probs = c(other = 0.60, web = 0.40)
probs
## other web
## 0.6 0.4
# run the test
goodnessOfFitTest( all_raw$Network, p = probs )
##
## Chi-square test against specified probabilities
##
## Data variable: all_raw$Network
##
## Hypotheses:
## null: true probabilities are as specified
## alternative: true probabilities differ from those specified
##
## Descriptives:
## observed freq. expected freq. specified prob.
## other 2506844 2586253 0.6
## web 1803578 1724169 0.4
##
## Test results:
## X-squared statistic: 6095.518
## degrees of freedom: 1
## p-value: <.001
# another way to run the test (base R's chisq.test needs the table of counts, not the raw factor)
#chisq.test(table(all_raw$Network), p = probs)
For this test we need to have at least two factor variables. Let's make some new variables:
# package
library(anytime)
library(lubridate)
# convert chr date to date format
all_raw[,DTIME := anytime(paste(all_raw$DATE,all_raw$TIME))]
# make lubridate
all_raw[,DTIME := ymd_hms(paste(all_raw$DATE,all_raw$TIME))]
# create breaks
breaks <- hour(hm("00:00", "6:00", "12:00", "18:00", "23:59"))
# labels for the breaks
labels <- c("Night", "Morning", "Afternoon", "Evening")
# make a new variable
all_raw[, INTERDAYTIME := cut(x = hour(all_raw$DTIME),
                              breaks = breaks,
                              labels = labels,
                              include.lowest = TRUE)][
  , INTERDAYTIME := as.factor(INTERDAYTIME)][
  , SOURCE_TYPE := as.factor(SOURCE_TYPE)
]
Now, we should check what these variables look like:
# check the result
table(all_raw$INTERDAYTIME)
##
## Night Morning Afternoon Evening
## 339443 1478728 1522517 969734
# also peak into activity again (&again)
table(all_raw$SOURCE_TYPE)
##
## comment facebook forum instagram reddit twitter web youtube
## 111494 430719 964321 84816 258684 454078 1803578 202732
# we are interested in the cross-tabulated view
xtabs(~ INTERDAYTIME + SOURCE_TYPE, all_raw)
## SOURCE_TYPE
## INTERDAYTIME comment facebook forum instagram reddit twitter web youtube
## Night 13296 13445 74389 3475 39531 48144 128644 18519
## Morning 33713 178120 250544 32437 71936 138001 721982 51995
## Afternoon 37772 158794 356845 31684 78824 150027 632794 75777
## Evening 26713 80360 282543 17220 68393 117906 320158 56441
Finally, let's run the test:
associationTest( formula = ~ INTERDAYTIME + SOURCE_TYPE, data = all_raw )
##
## Chi-square test of categorical association
##
## Variables: INTERDAYTIME, SOURCE_TYPE
##
## Hypotheses:
## null: variables are independent of one another
## alternative: some contingency exists between variables
##
## Observed contingency table:
## SOURCE_TYPE
## INTERDAYTIME comment facebook forum instagram reddit twitter web youtube
## Night 13296 13445 74389 3475 39531 48144 128644 18519
## Morning 33713 178120 250544 32437 71936 138001 721982 51995
## Afternoon 37772 158794 356845 31684 78824 150027 632794 75777
## Evening 26713 80360 282543 17220 68393 117906 320158 56441
##
## Expected contingency table under the null hypothesis:
## SOURCE_TYPE
## INTERDAYTIME comment facebook forum instagram reddit twitter web youtube
## Night 8780 33919 75940 6679 20371 35758 142031 15965
## Morning 38249 147762 330819 29097 88744 155775 618733 69549
## Afternoon 39382 152138 340615 29959 91372 160388 637056 71609
## Evening 25083 96901 216947 19081 58197 102156 405759 45609
##
## Test results:
## X-squared statistic: 145422.5
## degrees of freedom: 21
## p-value: <.001
##
## Other information:
## estimated effect size (Cramer's v): 0.106
We are interested in differences in web activity with respect to the intraday period. Let's first select the data of interest and check the relevant descriptives.
# first take relevant data from web
anova <- all_raw[SOURCE_TYPE == "web",.(INTERDAYTIME, LIKE_COUNT, COMMENT_COUNT, REACH)]
# summarise data
anova[,.(LIKE = mean(LIKE_COUNT, na.rm = TRUE),
         COMMENT = mean(COMMENT_COUNT, na.rm = TRUE),
         REACH = mean(REACH, na.rm = TRUE)),
      INTERDAYTIME]
## INTERDAYTIME LIKE COMMENT REACH
## 1: Evening 54.11983 19.375075 4234.318
## 2: Afternoon 47.53877 17.191580 4191.948
## 3: Morning 51.86043 18.361232 4848.696
## 4: Night 24.46538 7.718923 4136.323
Now we can run the ANOVA test(s):
# test for likes
summary(aov( formula = LIKE_COUNT ~ INTERDAYTIME, data = anova))
## Df Sum Sq Mean Sq F value Pr(>F)
## INTERDAYTIME 3 9.299e+07 30997021 88.35 <2e-16 ***
## Residuals 1803211 6.327e+11 350848
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 363 observations deleted due to missingness
# test for comments
summary(aov( formula = COMMENT_COUNT ~ INTERDAYTIME, data = anova))
## Df Sum Sq Mean Sq F value Pr(>F)
## INTERDAYTIME 3 1.400e+07 4665442 162.5 <2e-16 ***
## Residuals 1803211 5.178e+10 28716
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 363 observations deleted due to missingness
# test for reach
summary(aov( formula = REACH ~ INTERDAYTIME, data = anova))
## Df Sum Sq Mean Sq F value Pr(>F)
## INTERDAYTIME 3 1.843e+11 6.144e+10 15.87 2.56e-10 ***
## Residuals 1803474 6.980e+15 3.871e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 100 observations deleted due to missingness
The usual pipeline consists of the following elements:
# select relevant data
reg <- all_raw[SOURCE_TYPE == "web",.(REACH, INTERDAYTIME, LIKE_COUNT, COMMENT_COUNT, TITLE, .N)][,TITLE := nchar(TITLE)]
# check descriptives
reg[,.(meanREACH = mean(REACH, na.rm = TRUE),
       meanLIKE = mean(LIKE_COUNT, na.rm = TRUE),
       meanCOMMENT = mean(COMMENT_COUNT, na.rm = TRUE),
       No = mean(N),
       meanTIT = mean(TITLE, na.rm = TRUE)
)]
## meanREACH meanLIKE meanCOMMENT No meanTIT
## 1: 4458.414 48.79083 17.37161 1803578 72.41959
Let's also make some visual inspections:
ggplot(reg, aes(x = log(REACH))) +
geom_histogram(bins = 50, color = "#000000", fill = "#0099F8") +
labs(
title = "Distribution of REACH metric",
subtitle = "Made by LSMA",
caption = "Source: Mediatoolkit dataset",
x = "REACH (log scale)",
y = "Count"
) +
theme_classic() +
theme(
plot.title = element_text(color = "#0099F8", size = 16, face = "bold"),
plot.subtitle = element_text(size = 10, face = "bold"),
plot.caption = element_text(face = "italic")
)
ggplot(reg, aes(x = log(LIKE_COUNT))) +
geom_histogram(bins = 50, color = "#000000", fill = "#0099F8") +
labs(
title = "Distribution of LIKE_COUNT metric",
subtitle = "Made by LSMA",
caption = "Source: Mediatoolkit dataset",
x = "LIKE (log scale)",
y = "Count"
) +
theme_classic() +
theme(
plot.title = element_text(color = "#0099F8", size = 16, face = "bold"),
plot.subtitle = element_text(size = 10, face = "bold"),
plot.caption = element_text(face = "italic")
)
ggplot(reg, aes(x = log(COMMENT_COUNT))) +
geom_histogram(bins = 50, color = "#000000", fill = "#0099F8") +
labs(
title = "Distribution of COMMENT_COUNT metric",
subtitle = "Made by LSMA",
caption = "Source: Mediatoolkit dataset",
x = "COMMENT (log scale)",
y = "Count"
) +
theme_classic() +
theme(
plot.title = element_text(color = "#0099F8", size = 16, face = "bold"),
plot.subtitle = element_text(size = 10, face = "bold"),
plot.caption = element_text(face = "italic")
)
ggplot(reg, aes(x = INTERDAYTIME, y = log(REACH))) +
geom_boxplot() +
labs(x = "INTERDAYTIME", y = "REACH") +
labs(
title = "Distribution of REACH by time of the day",
subtitle = "Made by LSMA",
caption = "Source: Mediatoolkit dataset",
x = "INTERDAY",
y = "Reach (log scale)"
) +
theme_classic() +
theme(
plot.title = element_text(color = "#0099F8", size = 16, face = "bold"),
plot.subtitle = element_text(size = 10, face = "bold"),
plot.caption = element_text(face = "italic")
)
Also, let's see how the relationships look:
ggplot(reg, aes(x = REACH, y = LIKE_COUNT)) +
geom_point() +
labs(x = "REACH", y = "LIKE_COUNT") +
geom_smooth(method = "lm", se = FALSE)
ggplot(reg, aes(x = REACH, y = COMMENT_COUNT)) +
geom_point() +
labs(x = "REACH", y = "COMMENT_COUNT") +
geom_smooth(method = "lm", se = FALSE)
Now we can fit a first regression model:
# run the model1
model1 <- lm(REACH ~ COMMENT_COUNT, data = reg)
# output content
summary(model1)
##
## Call:
## lm(formula = REACH ~ COMMENT_COUNT, data = reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11532340 -345 -211 887 12207675
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 345.8384 35.5965 9.716 <2e-16 ***
## COMMENT_COUNT 236.7576 0.2089 1133.145 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47550 on 1803213 degrees of freedom
## (363 observations deleted due to missingness)
## Multiple R-squared: 0.4159, Adjusted R-squared: 0.4159
## F-statistic: 1.284e+06 on 1 and 1803213 DF, p-value: < 2.2e-16
# run the model2
model2 <- lm(REACH ~ LIKE_COUNT, data = reg)
# output content
summary(model2)
##
## Call:
## lm(formula = REACH ~ LIKE_COUNT, data = reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9201845 -703 -645 352 11695363
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 702.92949 31.63060 22.22 <2e-16 ***
## LIKE_COUNT 76.97697 0.05322 1446.48 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42330 on 1803213 degrees of freedom
## (363 observations deleted due to missingness)
## Multiple R-squared: 0.5371, Adjusted R-squared: 0.5371
## F-statistic: 2.092e+06 on 1 and 1803213 DF, p-value: < 2.2e-16
# run the model3
model3 <- lm(REACH ~ TITLE, data = reg)
# output content
summary(model3)
##
## Call:
## lm(formula = REACH ~ TITLE, data = reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6033 -4222 -3613 -1850 15016763
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3168.682 114.529 27.67 <2e-16 ***
## TITLE 17.901 1.446 12.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 62270 on 1799990 degrees of freedom
## (3586 observations deleted due to missingness)
## Multiple R-squared: 8.516e-05, Adjusted R-squared: 8.461e-05
## F-statistic: 153.3 on 1 and 1799990 DF, p-value: < 2.2e-16
# run the model4
model4 <- lm(REACH ~ INTERDAYTIME, data = reg)
# output content
summary(model4)
##
## Call:
## lm(formula = REACH ~ INTERDAYTIME, data = reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4849 -4219 -3908 -1646 15015852
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4136.32 173.46 23.846 < 2e-16 ***
## INTERDAYTIMEMorning 712.37 188.28 3.784 0.000155 ***
## INTERDAYTIMEAfternoon 55.62 190.27 0.292 0.770026
## INTERDAYTIMEEvening 97.99 205.38 0.477 0.633261
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 62210 on 1803474 degrees of freedom
## (100 observations deleted due to missingness)
## Multiple R-squared: 2.64e-05, Adjusted R-squared: 2.474e-05
## F-statistic: 15.87 on 3 and 1803474 DF, p-value: 2.564e-10
Combine all variables together in a multiple regression model:
# run the model5
model5 <- lm(REACH ~ LIKE_COUNT + COMMENT_COUNT + TITLE + INTERDAYTIME,
             data = reg)
# output content
summary(model5)
##
## Call:
## lm(formula = REACH ~ LIKE_COUNT + COMMENT_COUNT + TITLE + INTERDAYTIME,
## data = reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7025092 -931 37 1322 11439598
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.505e+03 1.298e+02 27.01 <2e-16 ***
## LIKE_COUNT 5.881e+01 7.526e-02 781.41 <2e-16 ***
## COMMENT_COUNT 8.756e+01 2.631e-01 332.72 <2e-16 ***
## TITLE -2.361e+01 9.586e-01 -24.63 <2e-16 ***
## INTERDAYTIMEMorning -1.590e+03 1.251e+02 -12.70 <2e-16 ***
## INTERDAYTIMEAfternoon -1.888e+03 1.265e+02 -14.93 <2e-16 ***
## INTERDAYTIMEEvening -2.414e+03 1.365e+02 -17.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 41120 on 1799725 degrees of freedom
## (3846 observations deleted due to missingness)
## Multiple R-squared: 0.5641, Adjusted R-squared: 0.5641
## F-statistic: 3.882e+05 on 6 and 1799725 DF, p-value: < 2.2e-16
The procedure doesn't end with fitting the model. We also need to choose the best model and assess its quality. The main criterion for a regression model is R^2.
library(moderndive)
# Get fitted values & residuals, compute R^2 from the residuals for model1
get_regression_points(model1) %>%
summarize(r_squared = 1 - var(residual) / var(REACH))
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.416
# Get fitted values & residuals, compute R^2 from the residuals for model2
get_regression_points(model2) %>%
summarize(r_squared = 1 - var(residual) / var(REACH))
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.537
# Get fitted values & residuals, compute R^2 from the residuals for model3
get_regression_points(model3) %>%
summarize(r_squared = 1 - var(residual) / var(REACH))
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.0000852
# Get fitted values & residuals, compute R^2 from the residuals for model4
get_regression_points(model4) %>%
summarize(r_squared = 1 - var(residual) / var(REACH))
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.0000264
# Get fitted values & residuals, compute R^2 from the residuals for model5
get_regression_points(model5) %>%
summarize(r_squared = 1 - var(residual) / var(REACH))
## # A tibble: 1 x 1
## r_squared
## <dbl>
## 1 0.564
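To put the five models side by side, the same R^2 values can also be read directly from the lm summaries; a minimal sketch, assuming model1 through model5 are still in the session:
# collect (adjusted) R^2 for all five fitted models in one matrix
sapply(list(model1 = model1, model2 = model2, model3 = model3,
            model4 = model4, model5 = model5),
       function(m) c(r_squared = summary(m)$r.squared,
                     adj_r_squared = summary(m)$adj.r.squared))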