Libraries and Data

if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse,
               gutenbergr,
               magrittr,
               tidytext,
               scales,
               igraph,
               ggraph,
               topicmodels,
               textdata)

authors <- rbind(
  gutenberg_works(author == "Twain, Mark"),
  gutenberg_works(author == "Carroll, Lewis"),
  gutenberg_works(author == "Dumas, Alexandre")
)

books <- gutenberg_download(c("11", "12", "74", "76", "86", "3176", "1184")) %>%
  left_join(authors, by = "gutenberg_id") %>%
  select(gutenberg_id, text, author, title)

Tokens and Stop Words

Question 1
data("stop_words")

books_cleansed <- unnest_tokens(books, word, text) %>%
  anti_join(stop_words, by = "word")

head(books_cleansed %>% filter(gutenberg_id == 74))
## # A tibble: 6 x 4
##   gutenberg_id author      title                        word      
##          <int> <chr>       <chr>                        <chr>     
## 1           74 Twain, Mark The Adventures of Tom Sawyer adventures
## 2           74 Twain, Mark The Adventures of Tom Sawyer tom       
## 3           74 Twain, Mark The Adventures of Tom Sawyer sawyer    
## 4           74 Twain, Mark The Adventures of Tom Sawyer mark      
## 5           74 Twain, Mark The Adventures of Tom Sawyer twain     
## 6           74 Twain, Mark The Adventures of Tom Sawyer samuel
Question 2
twain_words <- books_cleansed %>% filter(author == "Twain, Mark") %>%
  count(word, sort = TRUE) %>%
  head(15)
twain_words
## # A tibble: 15 x 2
##    word        n
##    <chr>   <int>
##  1 time     1192
##  2 tom       952
##  3 day       616
##  4 people    557
##  5 king      526
##  6 night     458
##  7 hundred   452
##  8 sir       421
##  9 head      377
## 10 jim       369
## 11 chapter   343
## 12 found     342
## 13 half      342
## 14 dead      336
## 15 hand      326
Question 3
ggplot(twain_words, aes(n, reorder(word, n))) +
  geom_col() + labs(x = "count", y = NULL)

Question 4
frequency <- books_cleansed %>% 
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>% 
  select(-n) %>% 
  spread(author, proportion) %>% 
  gather(author, proportion, `Dumas, Alexandre`:`Carroll, Lewis`)
Question 5
ggplot(frequency, aes(x = proportion, y = `Twain, Mark`, color = abs(`Twain, Mark` - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  facet_wrap(~author, ncol = 2) +
  theme(legend.position="none") +
  labs(y = "Mark Twain", x = NULL)
## Warning: Removed 41573 rows containing missing values (geom_point).
## Warning: Removed 41575 rows containing missing values (geom_text).

This plot compares the relative word frequencies of Mark Twain with those of each of the other two authors: words close to the dashed diagonal are used with similar frequency by both authors, while words far from it are characteristic of one of them.
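
As a complement to the plot, a quick sketch like the following (not one of the assignment questions; it only reuses the frequency table built above) lists the words whose relative frequencies are most similar for Twain and Carroll:

# words closest to the diagonal, i.e. used with the most similar
# relative frequency by both authors (log10 distance as a rough measure)
frequency %>%
  filter(author == "Carroll, Lewis", !is.na(proportion), !is.na(`Twain, Mark`)) %>%
  mutate(dist = abs(log10(proportion) - log10(`Twain, Mark`))) %>%
  arrange(dist) %>%
  head(10)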

Question 6

Twain’s word choice is slightly more strongly correlated with Dumas’ translated works (r ≈ 0.487) than with Carroll’s (r ≈ 0.471), as the two tests below show.

cor.test(data = frequency %>% filter(author == "Carroll, Lewis"),
         ~ proportion + `Twain, Mark`)
## 
##  Pearson's product-moment correlation
## 
## data:  proportion and Twain, Mark
## t = 29.05, df = 2959, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4425662 0.4986378
## sample estimates:
##       cor 
## 0.4710777
cor.test(data = frequency %>% filter(author == "Dumas, Alexandre"),
         ~ proportion + `Twain, Mark`)
## 
##  Pearson's product-moment correlation
## 
## data:  proportion and Twain, Mark
## t = 55.112, df = 9788, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4713817 0.5016203
## sample estimates:
##       cor 
## 0.4866467

Sentiment Analysis

Question 1
get_sentiments("bing")
## # A tibble: 6,786 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ... with 6,776 more rows
get_sentiments("afinn")
## # A tibble: 2,477 x 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ... with 2,467 more rows
get_sentiments("nrc")
## # A tibble: 13,901 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ... with 13,891 more rows
Question 2
twain_chapters <- books %>%
  filter(author == "Twain, Mark") %>%
  group_by(gutenberg_id) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(as.numeric(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE))))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
Question 3
twain_chapters %>%
  filter(title == "A Connecticut Yankee in King Arthur's Court") %>%
  inner_join(get_sentiments("nrc")) %>%
  filter(sentiment == "joy") %>%
  count(word, sort = TRUE) %>%
  head(10)
## Joining, by = "word"
## # A tibble: 10 x 2
##    word       n
##    <chr>  <int>
##  1 good     246
##  2 church    57
##  3 found     57
##  4 kind      56
##  5 pay       51
##  6 pretty    49
##  7 wages     49
##  8 true      47
##  9 young     47
## 10 god       42

The assignment of the label “joy” to these words is very sensitive to context. “Good”, “kind” and “pretty” are relatively robustly associated with this sentiment (although still far from perfectly), but words such as “found”, “pay”, “wages” and “church” appear just as readily in neutral or negative sentences.
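
One way to gauge this ambiguity (a small sketch, beyond the assignment questions) is to look up which other NRC categories the same words carry; many NRC words have several, sometimes opposing, labels:

# all NRC labels attached to a handful of the "joy" words above
get_sentiments("nrc") %>%
  filter(word %in% c("good", "church", "found", "pay", "pretty", "wages")) %>%
  arrange(word, sentiment)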

Question 4
mark_twain_sentiment <- twain_chapters %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(title, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
Question 5
ggplot(mark_twain_sentiment, aes(index, sentiment, fill = title)) +
  geom_col() +
  facet_wrap(~ title, scales = "free") + 
  theme(legend.position = "none")

The problems of sentiment analysis

Question 1
huckleberry_finn <- filter(twain_chapters, title == "Adventures of Huckleberry Finn")
Question 2
afinn <- huckleberry_finn %>% 
  inner_join(get_sentiments("afinn"), by = "word") %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")

bing_and_nrc <- bind_rows(huckleberry_finn %>% 
                            inner_join(get_sentiments("bing"), by = "word") %>%
                            mutate(method = "Bing et al."),
                          huckleberry_finn %>% 
                            inner_join(get_sentiments("nrc") %>% 
                                         filter(sentiment %in% c("positive", 
                                                                 "negative")), by = "word") %>%
                            mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

huck_finn_sentiment <- bind_rows(afinn, bing_and_nrc)
Question 3
ggplot(huck_finn_sentiment, aes(index, sentiment, fill = method)) +
  geom_col() +
  facet_wrap(~ method, scales = "free", ncol = 1) + 
  theme(legend.position = "none")

n-grams

Question 1
# note: lines with fewer than two words (e.g. blank lines) yield NA bigrams,
# which is why <NA> tops the count below
twograms <- books %>% filter(author == "Twain, Mark") %>%
  unnest_tokens(twogram, text, token = "ngrams", n = 2) %>%
  count(twogram, sort = TRUE)
Question 2
head(twograms, 10)
## # A tibble: 10 x 2
##    twogram     n
##    <chr>   <int>
##  1 <NA>     9960
##  2 of the   3003
##  3 in the   2355
##  4 it was   1465
##  5 and the  1274
##  6 to the   1193
##  7 was a     834
##  8 of a      806
##  9 in a      719
## 10 it is     716
Question 3
twograms_cleansed <- twograms %>% separate(twogram, c("word1", "word2")) %>%
  anti_join(stop_words, by = c("word1" = "word")) %>%
  anti_join(stop_words, by = c("word2" = "word"))
# the NA row survives the anti_joins, since NA matches no stop word

head(twograms_cleansed, 10)
## # A tibble: 10 x 3
##    word1   word2         n
##    <chr>   <chr>     <int>
##  1 <NA>    <NA>       9960
##  2 tom     sawyer       65
##  3 aunt    polly        56
##  4 sir     launcelot    48
##  5 injun   joe          47
##  6 mary    jane         42
##  7 aunt    sally        38
##  8 holy    land         33
##  9 hundred yards        33
## 10 hundred feet         31
Question 4
(graph <- twograms_cleansed %>% filter(n > 10) %>% graph_from_data_frame())
## IGRAPH a57d475 DN-- 107 71 -- 
## + attr: name (v/c), n (e/n)
## + edges from a57d475 (vertex names):
##  [1] NA       ->NA        tom      ->sawyer    aunt     ->polly    
##  [4] sir      ->launcelot injun    ->joe       mary     ->jane     
##  [7] aunt     ->sally     holy     ->land      hundred  ->yards    
## [10] hundred  ->feet      sir      ->marhaus   sunday   ->school   
## [13] centuries->ago       st       ->peter     sir      ->kay      
## [16] sir      ->sagramor  ten      ->minutes   holy     ->sepulchre
## [19] miss     ->watson    muff     ->potter    thousand ->dollars  
## [22] dead     ->sea       mars     ->tom       miss     ->mary     
## + ... omitted several edges
Question 5
ggraph(graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_graph()

Topic Modeling - Latent Dirichlet Allocation (LDA)

Question 1
by_chapter <- books %>% filter(author != "Dumas, Alexandre") %>%
  group_by(title) %>%
  mutate(chapter = cumsum(as.numeric(str_detect(text, regex("^chapter ", ignore_case = TRUE))))) %>%
  ungroup() %>%
  filter(chapter > 0) %>%
  unite(document, title, chapter)

# split into words
by_chapter_word <- by_chapter %>%
  unnest_tokens(word, text)

# find document-word counts
word_counts <- by_chapter_word %>%
  anti_join(stop_words, by = "word") %>%
  count(document, word, sort = TRUE) %>%
  ungroup()

word_counts %>% head(10) 
## # A tibble: 10 x 3
##    document                                      word       n
##    <chr>                                         <chr>  <int>
##  1 Adventures of Huckleberry Finn_51             de        96
##  2 Through the Looking-Glass_9                   queen     89
##  3 Adventures of Huckleberry Finn_51             en        72
##  4 Through the Looking-Glass_9                   alice     72
##  5 The Adventures of Tom Sawyer_66               tom       58
##  6 Through the Looking-Glass_6                   alice     57
##  7 A Connecticut Yankee in King Arthur's Court_9 sir       56
##  8 Through the Looking-Glass_8                   knight    55
##  9 Through the Looking-Glass_5                   alice     54
## 10 The Adventures of Tom Sawyer_41               tom       53
# a document-term matrix (DTM) is the input format LDA() expects
chapters_dtm <- word_counts %>%
  cast_dtm(document, word, n)

chapters_dtm
## <<DocumentTermMatrix (documents: 286, terms: 22628)>>
## Non-/sparse entries: 126405/6345203
## Sparsity           : 98%
## Maximal term length: 70
## Weighting          : term frequency (tf)
Question 2
chapters_lda <- LDA(  # avoid calling the object 'LDA', which would shadow the function
  chapters_dtm,
  k = 6, # number of different books
  control = list(seed = 1234)
)
Question 3
LDA_wordprob <- tidy(chapters_lda, matrix = "beta")
Question 4
(LDA_mostprob <- LDA_wordprob %>%
  group_by(topic) %>%
  slice_max(beta, n = 5))
## # A tibble: 30 x 3
## # Groups:   topic [6]
##    topic term       beta
##    <int> <chr>     <dbl>
##  1     1 time    0.00728
##  2     1 king    0.00643
##  3     1 people  0.00430
##  4     1 ye      0.00389
##  5     1 hundred 0.00364
##  6     2 alice   0.0423 
##  7     2 queen   0.0127 
##  8     2 time    0.00774
##  9     2 it’s    0.00763
## 10     2 don’t   0.00722
## # ... with 20 more rows
Question 5
# tidytext::reorder_within() would order the terms within each facet;
# reorder() sorts them globally, which is close enough here
ggplot(LDA_mostprob, aes(reorder(term, beta), beta, fill = factor(topic))) +
  geom_col() +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  theme(legend.position = "none")

Question 6
(LDA_pertopicprob <- tidy(chapters_lda, matrix = "gamma"))
## # A tibble: 1,716 x 3
##    document                                       topic     gamma
##    <chr>                                          <int>     <dbl>
##  1 Adventures of Huckleberry Finn_51                  1 0.0000118
##  2 Through the Looking-Glass_9                        1 0.0000138
##  3 The Adventures of Tom Sawyer_66                    1 0.0000185
##  4 Through the Looking-Glass_6                        1 0.0000193
##  5 A Connecticut Yankee in King Arthur's Court_9      1 0.0000256
##  6 Through the Looking-Glass_8                        1 0.0000155
##  7 Through the Looking-Glass_5                        1 0.0000203
##  8 The Adventures of Tom Sawyer_41                    1 0.0000143
##  9 A Connecticut Yankee in King Arthur's Court_43     1 0.0000145
## 10 Alice's Adventures in Wonderland_7                 1 0.0000265
## # ... with 1,706 more rows
Question 7
chapters_gamma <- LDA_pertopicprob %>%
  separate(document, c("title", "chapter"), sep = "_", convert = TRUE)

chapters_gamma %>% head(10)
## # A tibble: 10 x 4
##    title                                       chapter topic     gamma
##    <chr>                                         <int> <int>     <dbl>
##  1 Adventures of Huckleberry Finn                   51     1 0.0000118
##  2 Through the Looking-Glass                         9     1 0.0000138
##  3 The Adventures of Tom Sawyer                     66     1 0.0000185
##  4 Through the Looking-Glass                         6     1 0.0000193
##  5 A Connecticut Yankee in King Arthur's Court       9     1 0.0000256
##  6 Through the Looking-Glass                         8     1 0.0000155
##  7 Through the Looking-Glass                         5     1 0.0000203
##  8 The Adventures of Tom Sawyer                     41     1 0.0000143
##  9 A Connecticut Yankee in King Arthur's Court      43     1 0.0000145
## 10 Alice's Adventures in Wonderland                  7     1 0.0000265
Question 8
(highest_gamma <- chapters_gamma %>%
  group_by(title, chapter) %>%
  slice_max(gamma, n = 1) %>%   # 'top_n' has been superseded by 'slice_max'
  ungroup())
## # A tibble: 286 x 4
##    title                                       chapter topic gamma
##    <chr>                                         <int> <int> <dbl>
##  1 A Connecticut Yankee in King Arthur's Court       1     1 0.429
##  2 A Connecticut Yankee in King Arthur's Court       2     1 0.425
##  3 A Connecticut Yankee in King Arthur's Court       3     3 0.402
##  4 A Connecticut Yankee in King Arthur's Court       4     3 0.701
##  5 A Connecticut Yankee in King Arthur's Court       5     1 0.975
##  6 A Connecticut Yankee in King Arthur's Court       6     1 0.750
##  7 A Connecticut Yankee in King Arthur's Court       7     1 0.563
##  8 A Connecticut Yankee in King Arthur's Court       8     1 1.00 
##  9 A Connecticut Yankee in King Arthur's Court       9     3 1.00 
## 10 A Connecticut Yankee in King Arthur's Court      10     3 1.00 
## # ... with 276 more rows
Question 9
book_topics <- highest_gamma %>%
  count(title, topic) %>%
  group_by(title) %>%
  slice_max(n, n = 1) %>%   # again, 'top_n' has been superseded by 'slice_max'
  ungroup() %>%
  transmute(consensus = title, topic)

highest_gamma %>%
  inner_join(book_topics, by = "topic") %>%
  filter(title != consensus)
## # A tibble: 55 x 5
##    title                        chapter topic gamma consensus                   
##    <chr>                          <int> <int> <dbl> <chr>                       
##  1 A Connecticut Yankee in Kin~      21     4 0.536 The Adventures of Tom Sawyer
##  2 A Connecticut Yankee in Kin~      33     4 0.697 The Adventures of Tom Sawyer
##  3 Adventures of Huckleberry F~       4     4 0.981 The Adventures of Tom Sawyer
##  4 Adventures of Huckleberry F~       5     4 0.984 The Adventures of Tom Sawyer
##  5 Adventures of Huckleberry F~       6     4 0.990 The Adventures of Tom Sawyer
##  6 Adventures of Huckleberry F~       9     4 0.981 The Adventures of Tom Sawyer
##  7 Adventures of Huckleberry F~      11     4 0.639 The Adventures of Tom Sawyer
##  8 Adventures of Huckleberry F~      24     5 0.671 The Innocents Abroad        
##  9 Adventures of Huckleberry F~      27     4 0.989 The Adventures of Tom Sawyer
## 10 Adventures of Huckleberry F~      29     1 0.824 A Connecticut Yankee in Kin~
## # ... with 45 more rows
Question 10
topic_assignment <- augment(chapters_lda) %>% count(document, .topic, term)
slice_sample(topic_assignment, n = 20)
## # A tibble: 20 x 4
##    document                                       .topic term            n
##    <chr>                                           <dbl> <chr>       <int>
##  1 The Innocents Abroad_58                             1 minute          1
##  2 Alice's Adventures in Wonderland_2                  2 soft            1
##  3 The Adventures of Tom Sawyer_46                     4 opportunity     1
##  4 The Innocents Abroad_18                             1 window          1
##  5 Adventures of Huckleberry Finn_86                   6 buy             1
##  6 The Innocents Abroad_25                             3 knocked         1
##  7 Adventures of Huckleberry Finn_58                   6 swift           1
##  8 Adventures of Huckleberry Finn_61                   6 families        1
##  9 The Innocents Abroad_42                             5 existed         1
## 10 The Innocents Abroad_43                             5 holy            1
## 11 A Connecticut Yankee in King Arthur's Court_33      4 dinner          1
## 12 The Innocents Abroad_29                             3 shamefully      1
## 13 The Innocents Abroad_43                             5 superbly        1
## 14 A Connecticut Yankee in King Arthur's Court_3       1 conquered       1
## 15 The Adventures of Tom Sawyer_55                     4 whipped         1
## 16 The Innocents Abroad_28                             3 beds            1
## 17 Through the Looking-Glass_8                         2 continued       1
## 18 Adventures of Huckleberry Finn_66                   6 call            1
## 19 A Connecticut Yankee in King Arthur's Court_35      1 laid            1
## 20 A Connecticut Yankee in King Arthur's Court_16      1 suspicion       1
Question 11
assignments <- topic_assignment %>%
  separate(document, c("title", "chapter"), sep = "_", convert = TRUE) %>%
  inner_join(book_topics, by = c(".topic" = "topic"))

assignments %>% 
  slice_sample(n = 20)
## # A tibble: 20 x 6
##    title                  chapter .topic term          n consensus              
##    <chr>                    <int>  <dbl> <chr>     <int> <chr>                  
##  1 The Innocents Abroad         7      5 educated      1 The Innocents Abroad   
##  2 A Connecticut Yankee ~      11      5 scolloped     1 The Innocents Abroad   
##  3 Adventures of Huckleb~      67      6 ashamed       1 Adventures of Hucklebe~
##  4 The Innocents Abroad        34      4 cools         1 The Adventures of Tom ~
##  5 The Adventures of Tom~      70      4 that’s        1 The Adventures of Tom ~
##  6 The Innocents Abroad        58      1 swarm         1 A Connecticut Yankee i~
##  7 The Adventures of Tom~      52      4 rare          1 The Adventures of Tom ~
##  8 Through the Looking-G~       7      2 tiny          1 Alice's Adventures in ~
##  9 The Innocents Abroad        21      1 hell          1 A Connecticut Yankee i~
## 10 A Connecticut Yankee ~      11      5 expediti~     1 The Innocents Abroad   
## 11 The Innocents Abroad         8      5 dare          1 The Innocents Abroad   
## 12 A Connecticut Yankee ~       5      1 step          1 A Connecticut Yankee i~
## 13 A Connecticut Yankee ~      44      1 hundred       1 A Connecticut Yankee i~
## 14 Alice's Adventures in~       2      2 pair          1 Through the Looking-Gl~
## 15 The Adventures of Tom~      43      4 _tempora~     1 The Adventures of Tom ~
## 16 A Connecticut Yankee ~      24      1 greek         1 A Connecticut Yankee i~
## 17 Through the Looking-G~       6      2 remember      1 Through the Looking-Gl~
## 18 Alice's Adventures in~       8      2 timidly       1 Through the Looking-Gl~
## 19 The Innocents Abroad        60      5 complain~     1 The Innocents Abroad   
## 20 Adventures of Huckleb~      64      6 hinge         1 Adventures of Hucklebe~
Question 12
assignments %>%
  count(title, consensus, wt = n) %>%
  group_by(title) %>%
  mutate(percent = n / sum(n)) %>%
  ggplot(aes(consensus, title, fill = percent)) +
  geom_tile() +
  scale_fill_gradient2(high = "red", label = percent_format()) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.grid = element_blank()) +
  labs(x = "Book words were assigned to",
       y = "Book words came from",
       fill = "% of assignments")

Question 13

Most of the books are assigned very well (in particular, “Adventures of Huckleberry Finn” and “The Adventures of Tom Sawyer” are almost perfectly assigned), but the model fails to differentiate at all between “Through the Looking-Glass” and “Alice’s Adventures in Wonderland” (though at least it does not confuse them with other books). This is unsurprising, since the two Carroll books share their protagonist and much of their vocabulary.
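
This can be quantified directly (a small sketch, beyond the assignment questions, reusing the assignments table from Question 11) as the share of each book’s word assignments that land on the book’s own consensus topic:

# per-book share of word assignments that agree with the book's consensus topic
assignments %>%
  count(title, consensus, wt = n) %>%
  group_by(title) %>%
  mutate(percent = n / sum(n)) %>%
  filter(title == consensus)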