if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  tidyverse,
  gutenbergr,
  magrittr,
  tidytext,
  scales,
  igraph,
  ggraph,
  topicmodels,
  textdata
)
authors <- rbind(
  gutenberg_works(author == "Twain, Mark"),
  gutenberg_works(author == "Carroll, Lewis"),
  gutenberg_works(author == "Dumas, Alexandre")
)
books <- gutenberg_download(c("11", "12", "74", "76", "86", "3176", "1184")) %>%
  left_join(authors, by = "gutenberg_id") %>%
  select(gutenberg_id, text, author, title)
data("stop_words")
books_cleansed <- books %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")
head(books_cleansed %>% filter(gutenberg_id == 74))
## # A tibble: 6 x 4
## gutenberg_id author title word
## <int> <chr> <chr> <chr>
## 1 74 Twain, Mark The Adventures of Tom Sawyer adventures
## 2 74 Twain, Mark The Adventures of Tom Sawyer tom
## 3 74 Twain, Mark The Adventures of Tom Sawyer sawyer
## 4 74 Twain, Mark The Adventures of Tom Sawyer mark
## 5 74 Twain, Mark The Adventures of Tom Sawyer twain
## 6 74 Twain, Mark The Adventures of Tom Sawyer samuel
twain_words <- books_cleansed %>%
  filter(author == "Twain, Mark") %>%
  count(word, sort = TRUE) %>%
  head(15)
twain_words
## # A tibble: 15 x 2
## word n
## <chr> <int>
## 1 time 1192
## 2 tom 952
## 3 day 616
## 4 people 557
## 5 king 526
## 6 night 458
## 7 hundred 452
## 8 sir 421
## 9 head 377
## 10 jim 369
## 11 chapter 343
## 12 found 342
## 13 half 342
## 14 dead 336
## 15 hand 326
ggplot(twain_words, aes(n, reorder(word, n))) + geom_col()
frequency <- books_cleansed %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  select(-n) %>%
  spread(author, proportion) %>% # 'spread'/'gather' have been superseded by 'pivot_wider'/'pivot_longer'
  gather(author, proportion, `Dumas, Alexandre`:`Carroll, Lewis`)
ggplot(frequency, aes(x = proportion, y = `Twain, Mark`, color = abs(`Twain, Mark` - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  facet_wrap(~author, ncol = 2) +
  theme(legend.position = "none") +
  labs(y = "Mark Twain", x = NULL)
## Warning: Removed 41573 rows containing missing values (geom_point).
## Warning: Removed 41575 rows containing missing values (geom_text).
Each panel of this plot compares the frequencies of words used by Mark Twain with those used by one of the other two authors; words close to the dashed diagonal occur with a similar relative frequency in both.
Twain’s word choice is slightly more strongly correlated with that of Dumas’ translations than with Carroll’s works.
cor.test(data = frequency %>% filter(author == "Carroll, Lewis"),
         ~ proportion + `Twain, Mark`)
##
## Pearson's product-moment correlation
##
## data: proportion and Twain, Mark
## t = 29.05, df = 2959, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4425662 0.4986378
## sample estimates:
## cor
## 0.4710777
cor.test(data = frequency %>% filter(author == "Dumas, Alexandre"),
         ~ proportion + `Twain, Mark`)
##
## Pearson's product-moment correlation
##
## data: proportion and Twain, Mark
## t = 55.112, df = 9788, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4713817 0.5016203
## sample estimates:
## cor
## 0.4866467
get_sentiments("bing")
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
get_sentiments("afinn")
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
get_sentiments("nrc")
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
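The three lexicons differ in format: Bing gives binary positive/negative labels, AFINN gives numeric scores from -5 to 5, and NRC tags words with emotions as well as positive/negative. They also differ in coverage; note, for instance, that “abandon” is scored by AFINN and NRC above, but has no entry in Bing (whose alphabetical listing jumps from “2-faces” to “abnormal”). A quick check:
get_sentiments("bing") %>% filter(word == "abandon")  # 0 rows: no entry in the Bing lexicon
get_sentiments("afinn") %>% filter(word == "abandon") # scored -2
get_sentiments("nrc") %>% filter(word == "abandon")   # tagged fear, negative and sadness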
twain_chapters <- books %>%
  filter(author == "Twain, Mark") %>%
  group_by(gutenberg_id) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(as.numeric(str_detect(text, regex("^chapter [\\divxlc]",
                                                            ignore_case = TRUE))))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
twain_chapters %>%
  filter(title == "A Connecticut Yankee in King Arthur's Court") %>%
  inner_join(get_sentiments("nrc")) %>%
  filter(sentiment == "joy") %>%
  count(word, sort = TRUE) %>%
  head(10)
## Joining, by = "word"
## # A tibble: 10 x 2
## word n
## <chr> <int>
## 1 good 246
## 2 church 57
## 3 found 57
## 4 kind 56
## 5 pay 51
## 6 pretty 49
## 7 wages 49
## 8 true 47
## 9 young 47
## 10 god 42
The assignment of the label “joy” to these words is very sensitive to context. “good”, “kind” and “pretty” are probably fairly robustly associated with this sentiment (though still far from perfectly), but the rest can just as easily appear in neutral or negative sentences.
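One way to check this (just a sketch, reusing the books table from above and picking “pay” as an example word) is to sample a few raw lines containing the word and judge the context directly:
books %>%
  filter(title == "A Connecticut Yankee in King Arthur's Court",
         str_detect(text, regex("\\bpay\\b", ignore_case = TRUE))) %>%
  slice_sample(n = 5) %>% # a handful of randomly chosen matching lines
  pull(text)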
mark_twain_sentiment <- twain_chapters %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(title, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
ggplot(mark_twain_sentiment, aes(index, sentiment, color = title)) +
  geom_col() +
  facet_wrap(~ title, scales = "free") +
  theme(legend.position = "none")
huckleberry_finn <- filter(twain_chapters, title == "Adventures of Huckleberry Finn")
afinn <- huckleberry_finn %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
bing_and_nrc <- bind_rows(
  huckleberry_finn %>%
    inner_join(get_sentiments("bing"), by = "word") %>%
    mutate(method = "Bing et al."),
  huckleberry_finn %>%
    inner_join(get_sentiments("nrc") %>%
                 filter(sentiment %in% c("positive", "negative")),
               by = "word") %>%
    mutate(method = "NRC")
) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
huck_finn_sentiment <- bind_rows(afinn, bing_and_nrc)
ggplot(huck_finn_sentiment, aes(index, sentiment, color = method)) +
  geom_col() +
  facet_wrap(~ method, scales = "free", ncol = 1) +
  theme(legend.position = "none")
twograms <- books %>%
  filter(author == "Twain, Mark") %>%
  unnest_tokens(twogram, text, token = "ngrams", n = 2) %>%
  count(twogram, sort = TRUE)
head(twograms, 10)
## # A tibble: 10 x 2
## twogram n
## <chr> <int>
## 1 <NA> 9960
## 2 of the 3003
## 3 in the 2355
## 4 it was 1465
## 5 and the 1274
## 6 to the 1193
## 7 was a 834
## 8 of a 806
## 9 in a 719
## 10 it is 716
# note: the <NA> "bigram" produced by blank lines survives this step;
# it is dropped below, before the graph is built
twograms_cleansed <- twograms %>%
  separate(twogram, c("word1", "word2")) %>%
  anti_join(stop_words, by = c("word1" = "word")) %>%
  anti_join(stop_words, by = c("word2" = "word"))
head(twograms_cleansed, 10)
## # A tibble: 10 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 <NA> <NA> 9960
## 2 tom sawyer 65
## 3 aunt polly 56
## 4 sir launcelot 48
## 5 injun joe 47
## 6 mary jane 42
## 7 aunt sally 38
## 8 holy land 33
## 9 hundred yards 33
## 10 hundred feet 31
(graph <- twograms_cleansed %>%
   drop_na() %>% # drop the NA "bigrams" so they do not become a spurious NA->NA edge
   filter(n > 10) %>%
   graph_from_data_frame())
## IGRAPH a57d475 DN-- 106 70 --
## + attr: name (v/c), n (e/n)
## + edges from a57d475 (vertex names):
## [1] tom ->sawyer aunt ->polly sir ->launcelot
## [4] injun ->joe mary ->jane aunt ->sally
## [7] holy ->land hundred ->yards hundred ->feet
## [10] sir ->marhaus sunday ->school centuries->ago
## [13] st ->peter sir ->kay sir ->sagramor
## [16] ten ->minutes holy ->sepulchre miss ->watson
## [19] muff ->potter thousand ->dollars dead ->sea
## [22] mars ->tom miss ->mary
## + ... omitted several edges
ggraph(graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)+
theme_graph()
by_chapter <- books %>%
  filter(author != "Dumas, Alexandre") %>%
  group_by(title) %>%
  mutate(chapter = cumsum(as.numeric(str_detect(text, regex("^chapter ", ignore_case = TRUE))))) %>%
  ungroup() %>%
  filter(chapter > 0) %>%
  unite(document, title, chapter)
# split into words
by_chapter_word <- by_chapter %>%
  unnest_tokens(word, text)
# find document-word counts
word_counts <- by_chapter_word %>%
  anti_join(stop_words, by = "word") %>%
  count(document, word, sort = TRUE) %>%
  ungroup()
word_counts %>% head(10)
## # A tibble: 10 x 3
## document word n
## <chr> <chr> <int>
## 1 Adventures of Huckleberry Finn_51 de 96
## 2 Through the Looking-Glass_9 queen 89
## 3 Adventures of Huckleberry Finn_51 en 72
## 4 Through the Looking-Glass_9 alice 72
## 5 The Adventures of Tom Sawyer_66 tom 58
## 6 Through the Looking-Glass_6 alice 57
## 7 A Connecticut Yankee in King Arthur's Court_9 sir 56
## 8 Through the Looking-Glass_8 knight 55
## 9 Through the Looking-Glass_5 alice 54
## 10 The Adventures of Tom Sawyer_41 tom 53
# LDA expects a document-term matrix (DTM), so cast the word counts into one
chapters_dtm <- word_counts %>%
  cast_dtm(document, word, n)
chapters_dtm
## <<DocumentTermMatrix (documents: 286, terms: 22628)>>
## Non-/sparse entries: 126405/6345203
## Sparsity : 98%
## Maximal term length: 70
## Weighting : term frequency (tf)
chapters_lda <- LDA( # named 'chapters_lda' so the object does not mask the LDA() function
  chapters_dtm,
  k = 6, # number of different books
  control = list(seed = 1234)
)
LDA_wordprob <- tidy(chapters_lda, matrix = "beta")
(LDA_mostprob <- LDA_wordprob %>%
   group_by(topic) %>%
   slice_max(beta, n = 5))
## # A tibble: 30 x 3
## # Groups: topic [6]
## topic term beta
## <int> <chr> <dbl>
## 1 1 time 0.00728
## 2 1 king 0.00643
## 3 1 people 0.00430
## 4 1 ye 0.00389
## 5 1 hundred 0.00364
## 6 2 alice 0.0423
## 7 2 queen 0.0127
## 8 2 time 0.00774
## 9 2 it’s 0.00763
## 10 2 don’t 0.00722
## # ... with 20 more rows
ggplot(LDA_mostprob, aes(reorder(term, beta), beta, fill = topic)) +
  geom_col() +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  theme(legend.position = "none")
(LDA_pertopicprob <- tidy(chapters_lda, matrix = "gamma"))
## # A tibble: 1,716 x 3
## document topic gamma
## <chr> <int> <dbl>
## 1 Adventures of Huckleberry Finn_51 1 0.0000118
## 2 Through the Looking-Glass_9 1 0.0000138
## 3 The Adventures of Tom Sawyer_66 1 0.0000185
## 4 Through the Looking-Glass_6 1 0.0000193
## 5 A Connecticut Yankee in King Arthur's Court_9 1 0.0000256
## 6 Through the Looking-Glass_8 1 0.0000155
## 7 Through the Looking-Glass_5 1 0.0000203
## 8 The Adventures of Tom Sawyer_41 1 0.0000143
## 9 A Connecticut Yankee in King Arthur's Court_43 1 0.0000145
## 10 Alice's Adventures in Wonderland_7 1 0.0000265
## # ... with 1,706 more rows
chapters_gamma <- LDA_pertopicprob %>%
  separate(document, c("title", "chapter"), sep = "_", convert = TRUE)
chapters_gamma %>% head(10)
## # A tibble: 10 x 4
## title chapter topic gamma
## <chr> <int> <int> <dbl>
## 1 Adventures of Huckleberry Finn 51 1 0.0000118
## 2 Through the Looking-Glass 9 1 0.0000138
## 3 The Adventures of Tom Sawyer 66 1 0.0000185
## 4 Through the Looking-Glass 6 1 0.0000193
## 5 A Connecticut Yankee in King Arthur's Court 9 1 0.0000256
## 6 Through the Looking-Glass 8 1 0.0000155
## 7 Through the Looking-Glass 5 1 0.0000203
## 8 The Adventures of Tom Sawyer 41 1 0.0000143
## 9 A Connecticut Yankee in King Arthur's Court 43 1 0.0000145
## 10 Alice's Adventures in Wonderland 7 1 0.0000265
(highest_gamma <- chapters_gamma %>%
   group_by(title, chapter) %>%
   slice_max(gamma, n = 1) %>% # 'top_n' has been superseded by 'slice_max'
   ungroup())
## # A tibble: 286 x 4
## title chapter topic gamma
## <chr> <int> <int> <dbl>
## 1 A Connecticut Yankee in King Arthur's Court 1 1 0.429
## 2 A Connecticut Yankee in King Arthur's Court 2 1 0.425
## 3 A Connecticut Yankee in King Arthur's Court 3 3 0.402
## 4 A Connecticut Yankee in King Arthur's Court 4 3 0.701
## 5 A Connecticut Yankee in King Arthur's Court 5 1 0.975
## 6 A Connecticut Yankee in King Arthur's Court 6 1 0.750
## 7 A Connecticut Yankee in King Arthur's Court 7 1 0.563
## 8 A Connecticut Yankee in King Arthur's Court 8 1 1.00
## 9 A Connecticut Yankee in King Arthur's Court 9 3 1.00
## 10 A Connecticut Yankee in King Arthur's Court 10 3 1.00
## # ... with 276 more rows
book_topics <- highest_gamma %>%
  count(title, topic) %>%
  group_by(title) %>%
  slice_max(n, n = 1) %>% # again preferring 'slice_max' over the superseded 'top_n'
  ungroup() %>%
  transmute(consensus = title, topic)
highest_gamma %>%
  inner_join(book_topics, by = "topic") %>%
  filter(title != consensus)
## # A tibble: 55 x 5
## title chapter topic gamma consensus
## <chr> <int> <int> <dbl> <chr>
## 1 A Connecticut Yankee in Kin~ 21 4 0.536 The Adventures of Tom Sawyer
## 2 A Connecticut Yankee in Kin~ 33 4 0.697 The Adventures of Tom Sawyer
## 3 Adventures of Huckleberry F~ 4 4 0.981 The Adventures of Tom Sawyer
## 4 Adventures of Huckleberry F~ 5 4 0.984 The Adventures of Tom Sawyer
## 5 Adventures of Huckleberry F~ 6 4 0.990 The Adventures of Tom Sawyer
## 6 Adventures of Huckleberry F~ 9 4 0.981 The Adventures of Tom Sawyer
## 7 Adventures of Huckleberry F~ 11 4 0.639 The Adventures of Tom Sawyer
## 8 Adventures of Huckleberry F~ 24 5 0.671 The Innocents Abroad
## 9 Adventures of Huckleberry F~ 27 4 0.989 The Adventures of Tom Sawyer
## 10 Adventures of Huckleberry F~ 29 1 0.824 A Connecticut Yankee in Kin~
## # ... with 45 more rows
topic_assignment <- augment(chapters_lda) %>% count(document, .topic, term)
slice_sample(topic_assignment, n = 20)
## # A tibble: 20 x 4
## document .topic term n
## <chr> <dbl> <chr> <int>
## 1 The Innocents Abroad_58 1 minute 1
## 2 Alice's Adventures in Wonderland_2 2 soft 1
## 3 The Adventures of Tom Sawyer_46 4 opportunity 1
## 4 The Innocents Abroad_18 1 window 1
## 5 Adventures of Huckleberry Finn_86 6 buy 1
## 6 The Innocents Abroad_25 3 knocked 1
## 7 Adventures of Huckleberry Finn_58 6 swift 1
## 8 Adventures of Huckleberry Finn_61 6 families 1
## 9 The Innocents Abroad_42 5 existed 1
## 10 The Innocents Abroad_43 5 holy 1
## 11 A Connecticut Yankee in King Arthur's Court_33 4 dinner 1
## 12 The Innocents Abroad_29 3 shamefully 1
## 13 The Innocents Abroad_43 5 superbly 1
## 14 A Connecticut Yankee in King Arthur's Court_3 1 conquered 1
## 15 The Adventures of Tom Sawyer_55 4 whipped 1
## 16 The Innocents Abroad_28 3 beds 1
## 17 Through the Looking-Glass_8 2 continued 1
## 18 Adventures of Huckleberry Finn_66 6 call 1
## 19 A Connecticut Yankee in King Arthur's Court_35 1 laid 1
## 20 A Connecticut Yankee in King Arthur's Court_16 1 suspicion 1
assignments <- topic_assignment %>%
  separate(document, c("title", "chapter"), sep = "_", convert = TRUE) %>%
  inner_join(book_topics, by = c(".topic" = "topic"))
assignments %>%
  slice_sample(n = 20)
## # A tibble: 20 x 6
## title chapter .topic term n consensus
## <chr> <int> <dbl> <chr> <int> <chr>
## 1 The Innocents Abroad 7 5 educated 1 The Innocents Abroad
## 2 A Connecticut Yankee ~ 11 5 scolloped 1 The Innocents Abroad
## 3 Adventures of Huckleb~ 67 6 ashamed 1 Adventures of Hucklebe~
## 4 The Innocents Abroad 34 4 cools 1 The Adventures of Tom ~
## 5 The Adventures of Tom~ 70 4 that’s 1 The Adventures of Tom ~
## 6 The Innocents Abroad 58 1 swarm 1 A Connecticut Yankee i~
## 7 The Adventures of Tom~ 52 4 rare 1 The Adventures of Tom ~
## 8 Through the Looking-G~ 7 2 tiny 1 Alice's Adventures in ~
## 9 The Innocents Abroad 21 1 hell 1 A Connecticut Yankee i~
## 10 A Connecticut Yankee ~ 11 5 expediti~ 1 The Innocents Abroad
## 11 The Innocents Abroad 8 5 dare 1 The Innocents Abroad
## 12 A Connecticut Yankee ~ 5 1 step 1 A Connecticut Yankee i~
## 13 A Connecticut Yankee ~ 44 1 hundred 1 A Connecticut Yankee i~
## 14 Alice's Adventures in~ 2 2 pair 1 Through the Looking-Gl~
## 15 The Adventures of Tom~ 43 4 _tempora~ 1 The Adventures of Tom ~
## 16 A Connecticut Yankee ~ 24 1 greek 1 A Connecticut Yankee i~
## 17 Through the Looking-G~ 6 2 remember 1 Through the Looking-Gl~
## 18 Alice's Adventures in~ 8 2 timidly 1 Through the Looking-Gl~
## 19 The Innocents Abroad 60 5 complain~ 1 The Innocents Abroad
## 20 Adventures of Huckleb~ 64 6 hinge 1 Adventures of Hucklebe~
assignments %>%
  count(title, consensus, wt = n) %>%
  group_by(title) %>%
  mutate(percent = n / sum(n)) %>%
  ggplot(aes(consensus, title, fill = percent)) +
  geom_tile() +
  scale_fill_gradient2(high = "red", labels = percent_format()) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.grid = element_blank()) +
  labs(x = "Book words were assigned to",
       y = "Book words came from",
       fill = "% of assignments")
Most of the books are assigned very well (in particular, “Adventures of Huckleberry Finn” and “The Adventures of Tom Sawyer” are almost perfectly assigned), but the model fails to differentiate at all between “Through the Looking-Glass” and “Alice’s Adventures in Wonderland” (though at least it does not confuse them with the other books).
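To put rough numbers on that confusion, here is a quick sketch (reusing the assignments table from above) of the share of word assignments for the two Carroll books, broken down by consensus book:
# share of word assignments for the two Carroll books, by consensus book
assignments %>%
  filter(title %in% c("Alice's Adventures in Wonderland",
                      "Through the Looking-Glass")) %>%
  count(title, consensus, wt = n) %>%
  group_by(title) %>%
  mutate(percent = n / sum(n)) %>%
  arrange(title, desc(percent))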