Install/load packages
# ------------------------------------------------------------
# Install missing packages and load them
# ------------------------------------------------------------
# List of required packages
packages <- c(
"data.table", # fast tables and grouped ops
"text2vec", # tokenization utilities
"stopwords", # multilingual stopword lists
"ggplot2", # plotting
"glmnet", # logistic regression (optional)
"pROC", # AUC calculation (optional)
"sentimentr" # sentiment analysis
)
# Identify which packages are not yet installed
new_packages <- packages[!(packages %in% installed.packages()[, "Package"])]
# Install any packages that are missing
if (length(new_packages)) {
install.packages(new_packages, dependencies = TRUE)
}
# Load all packages
invisible(lapply(packages, library, character.only = TRUE))
Tiny corpus (3 reviews, ~40–50 words total)
# set mini_corpus = FALSE to use a larger dataset (100 movie reviews)
mini_corpus <- TRUE
# Mini corpus designed for transparency in class
if (mini_corpus) {
docs <- data.table(
doc_id = c("r1","r2","r3"),
text = c(
"Amazing camera and battery life; great photos, fast performance. Highly recommend this phone.",
"Battery drains quickly, camera mediocre, app crashes, not good value. I regret buying this phone.",
"Camera is good for daylight, battery ok, screen sharp; low-light noise, but overall decent value."
)
)
docs
} else {
data("movie_review", package = "text2vec") # load a small movie-review dataset that ships with {text2vec}
docs <- as.data.table(movie_review)
docs <- docs[, .(id, review)][1:100]
setnames(docs, c("doc_id", "text"))
head(docs)
}
## doc_id
## <char>
## 1: r1
## 2: r2
## 3: r3
## text
## <char>
## 1: Amazing camera and battery life; great photos, fast performance. Highly recommend this phone.
## 2: Battery drains quickly, camera mediocre, app crashes, not good value. I regret buying this phone.
## 3: Camera is good for daylight, battery ok, screen sharp; low-light noise, but overall decent value.
numdocs <- nrow(docs) # number of documents (used below to cap how many rows we print)
Tokenize → clean → keep unigrams
# Preprocessing decisions (keep simple for pedagogy):
# - lowercase words so 'Amazing' and 'amazing' match
# - tokenize into unigrams (one word at a time)
# - remove stopwords and tokens that are only punctuation
prep_fun <- tolower
tok_fun <- word_tokenizer
stops <- stopwords("en")
# Tokenize each document into a character vector of tokens
tokens <- tok_fun(prep_fun(docs$text))
# Clean tokens: strip punctuation, drop empties, remove stopwords
tokens <- lapply(tokens, function(x) {
x <- gsub("[^a-z0-9']+", "", x) # keep letters/numbers/apostrophes
x <- x[nchar(x) > 0] # drop empty tokens
x[!x %in% stops] # remove stopwords
})
# Inspect tokens per doc
data.table(doc_id = docs$doc_id, tokens = I(tokens))[1:min(numdocs, 10)]
## doc_id tokens
## <char> <AsIs>
## 1: r1 amazing,....
## 2: r2 battery,....
## 3: r3 camera, ....
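The printed list column truncates each token vector. To inspect the cleaning step more closely on the mini corpus, you can index the list directly and compare against the raw tokenization (a quick check; the exact words removed depend on the stopword list shipped with {stopwords}):
# Full cleaned token vector for the first review
tokens[[1]]
# Which raw lowercased tokens did the cleaning step drop? (here: the stopwords)
setdiff(tok_fun(prep_fun(docs$text[1]))[[1]], tokens[[1]])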
Build counts (Document–Term Matrix, DTM) as a long table
# Unnest tokens into a long table of (doc_id, term) rows
long <- data.table(
doc_id = rep(docs$doc_id, lengths(tokens)),
term = unlist(tokens)
)
# Count term frequency n per (doc, term)
tf_counts <- long[, .(n = .N), by = .(doc_id, term)]
tf_counts[order(doc_id, -n)][1:min(numdocs, 10)]
## doc_id term n
## <char> <char> <int>
## 1: r1 amazing 1
## 2: r1 camera 1
## 3: r1 battery 1
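The long (doc_id, term, n) table is already a document–term matrix, just stored in long form. If you want the familiar wide matrix with one row per document and one column per term, you can cast the counts with data.table::dcast() (a sketch; the 0 fill and the number of columns shown are choices, not requirements):
# Optional: cast counts to a wide document-term matrix (0 = term absent from doc)
dtm_wide <- dcast(tf_counts, doc_id ~ term, value.var = "n", fill = 0)
dtm_wide[, 1:min(6, ncol(dtm_wide)), with = FALSE] # peek at the first few columns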
Compute TF, DF, IDF, and TF–IDF (by hand)
# N = number of documents
N <- nrow(docs)
# For each doc: total token count (denominator for TF)
tf_counts[, total_terms := sum(n), by = doc_id]
# TF (term frequency within doc) = n / total_terms
tf_counts[, tf := n / total_terms]
# DF (document frequency) = number of docs containing the term at least once
df <- unique(tf_counts[, .(doc_id, term)])[, .(df = .N), by = term]
# Merge DF into TF table, compute IDF and TF–IDF
# IDF (plain) = log(N / df); TF–IDF = tf * idf
dt <- merge(tf_counts, df, by = "term")
dt[, idf := log(N / df)]
dt[, tf_idf := tf * idf]
# Order to see most distinctive terms per document
dt <- dt[order(doc_id, -tf_idf)]
dt[1:min(numdocs, 10)]
## term doc_id n total_terms tf df idf tf_idf
## <char> <char> <int> <int> <num> <int> <num> <num>
## 1: amazing r1 1 11 0.09090909 1 1.098612 0.09987384
## 2: fast r1 1 11 0.09090909 1 1.098612 0.09987384
## 3: great r1 1 11 0.09090909 1 1.098612 0.09987384
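As a sanity check on the mini corpus, redo one cell by hand: "amazing" appears once among the 11 kept tokens of r1 and in 1 of the 3 documents, so tf = 1/11, idf = log(3/1), and their product should match the tf_idf value printed above:
# Hand-check: TF-IDF of 'amazing' in r1 (mini corpus only)
(1 / 11) * log(3 / 1) # ~0.0999, matching the tf_idf column above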
Visualize: Top-k TF–IDF terms per document
# Choose top-k TF–IDF terms per doc to summarize each review
topk <- 3
top_dt <- dt[, head(.SD[order(-tf_idf)], topk), by = doc_id] # head() avoids NA rows if a doc has fewer than topk terms
top_dt[1:min(numdocs * topk, 20)]
## doc_id term n total_terms tf df idf tf_idf
## <char> <char> <int> <int> <num> <int> <num> <num>
## 1: r1 amazing 1 11 0.09090909 1 1.098612 0.09987384
## 2: r1 fast 1 11 0.09090909 1 1.098612 0.09987384
## 3: r1 great 1 11 0.09090909 1 1.098612 0.09987384
## 4: r2 app 1 12 0.08333333 1 1.098612 0.09155102
## 5: r2 buying 1 12 0.08333333 1 1.098612 0.09155102
## 6: r2 crashes 1 12 0.08333333 1 1.098612 0.09155102
## 7: r3 daylight 1 13 0.07692308 1 1.098612 0.08450864
## 8: r3 decent 1 13 0.07692308 1 1.098612 0.08450864
## 9: r3 light 1 13 0.07692308 1 1.098612 0.08450864
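Since ggplot2 is already loaded, one simple way to visualize this table is a faceted bar chart of the top-k terms, one panel per document (a minimal sketch; bar order within a panel follows the global term ordering, which is good enough for a quick look):
# Bar chart of top-k TF-IDF terms, faceted by document
ggplot(top_dt, aes(x = reorder(term, tf_idf), y = tf_idf)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ doc_id, scales = "free_y") +
  labs(x = NULL, y = "TF-IDF", title = "Top TF-IDF terms per document")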