Install/load packages
# ------------------------------------------------------------
# Install missing packages and load them
# ------------------------------------------------------------
# List of required packages
packages <- c(
"data.table", # fast tables and grouped ops
"text2vec", # tokenization utilities
"stopwords", # multilingual stopword lists
"ggplot2", # plotting
"glmnet", # logistic regression (optional)
"pROC", # AUC calculation (optional)
"sentimentr" # sentiment analysis
)
# Identify which packages are not yet installed
new_packages <- packages[!(packages %in% installed.packages()[, "Package"])]
# Install any packages that are missing
if (length(new_packages)) {
install.packages(new_packages, dependencies = TRUE)
}
# Load all packages
invisible(lapply(packages, library, character.only = TRUE))
Tiny corpus (3 reviews, ~40–50 words total)
# set mini_corpus = FALSE to use a larger dataset (100 movie reviews)
mini_corpus <- TRUE
# Mini corpus designed for transparency in class
if (mini_corpus) {
docs <- data.table(
doc_id = c("r1","r2","r3"),
text = c(
"Amazing camera and battery life; great photos, fast performance. Highly recommend this phone.",
"Battery drains quickly, camera mediocre, app crashes, not good value. I regret buying this phone.",
"Camera is good for daylight, battery ok, screen sharp; low-light noise, but overall decent value."
)
)
docs
} else {
data("movie_review", package = "text2vec") # load a small movie-review dataset that ships with {text2vec}
docs <- as.data.table(movie_review)
docs <- docs[, .(id, review)][1:100]
setnames(docs, c("doc_id", "text"))
head(docs)
}
## doc_id
## <char>
## 1: r1
## 2: r2
## 3: r3
## text
## <char>
## 1: Amazing camera and battery life; great photos, fast performance. Highly recommend this phone.
## 2: Battery drains quickly, camera mediocre, app crashes, not good value. I regret buying this phone.
## 3: Camera is good for daylight, battery ok, screen sharp; low-light noise, but overall decent value.
numdocs <- nrow(docs) # number of documents (used below to cap how many rows we print)
Tokenize → clean → keep unigrams
# Preprocessing decisions (keep simple for pedagogy):
# - lowercase words so 'Amazing' and 'amazing' match
# - tokenize into unigrams (one word at a time)
# - remove stopwords and tokens that are only punctuation
prep_fun <- tolower
tok_fun <- word_tokenizer
stops <- stopwords("en")
# Tokenize each document into a character vector of tokens
tokens <- tok_fun(prep_fun(docs$text))
# Clean tokens: strip punctuation, drop empties, remove stopwords
tokens <- lapply(tokens, function(x) {
x <- gsub("[^a-z0-9']+", "", x) # keep letters/numbers/apostrophes
x <- x[nchar(x) > 0] # drop empty tokens
x[!x %in% stops] # remove stopwords
})
# Inspect tokens per doc
data.table(doc_id = docs$doc_id, tokens = I(tokens))[1:min(numdocs, 10)]
## doc_id tokens
## <char> <AsIs>
## 1: r1 amazing,....
## 2: r2 battery,....
## 3: r3 camera, ....
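The printed list column truncates each token vector. To inspect the cleaning step more closely on the mini corpus, you can index the list directly and compare against the raw tokenization (a quick check; the exact words removed depend on the stopword list shipped with {stopwords}):
# Full cleaned token vector for the first review
tokens[[1]]
# Which raw lowercased tokens did the cleaning step drop? (here: the stopwords)
setdiff(tok_fun(prep_fun(docs$text[1]))[[1]], tokens[[1]])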
Build counts (Document–Term Matrix, DTM) as a long table
# Unnest tokens into a long table of (doc_id, term) rows
long <- data.table(
doc_id = rep(docs$doc_id, lengths(tokens)),
term = unlist(tokens)
)
# Count term frequency n per (doc, term)
tf_counts <- long[, .(n = .N), by = .(doc_id, term)]
tf_counts[order(doc_id, -n)][1:min(numdocs, 10)]
## doc_id term n
## <char> <char> <int>
## 1: r1 amazing 1
## 2: r1 camera 1
## 3: r1 battery 1
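The long (doc_id, term, n) table is already a document–term matrix, just stored in long form. If you want the familiar wide matrix with one row per document and one column per term, you can cast the counts with data.table::dcast() (a sketch; the 0 fill and the number of columns shown are choices, not requirements):
# Optional: cast counts to a wide document-term matrix (0 = term absent from doc)
dtm_wide <- dcast(tf_counts, doc_id ~ term, value.var = "n", fill = 0)
dtm_wide[, 1:min(6, ncol(dtm_wide)), with = FALSE] # peek at the first few columns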
Compute TF, DF, IDF, and TF–IDF (by hand)
# N = number of documents
N <- nrow(docs)
# For each doc: total token count (denominator for TF)
tf_counts[, total_terms := sum(n), by = doc_id]
# TF (term frequency within doc) = n / total_terms
tf_counts[, tf := n / total_terms]
# DF (document frequency) = number of docs containing the term at least once
df <- unique(tf_counts[, .(doc_id, term)])[, .(df = .N), by = term]
# Merge DF into TF table, compute IDF and TF–IDF
# IDF (plain) = log(N / df); TF–IDF = tf * idf
dt <- merge(tf_counts, df, by = "term")
dt[, idf := log(N / df)]
dt[, tf_idf := tf * idf]
# Order to see most distinctive terms per document
dt <- dt[order(doc_id, -tf_idf)]
dt[1:min(numdocs, 10)]
## term doc_id n total_terms tf df idf tf_idf
## <char> <char> <int> <int> <num> <int> <num> <num>
## 1: amazing r1 1 11 0.09090909 1 1.098612 0.09987384
## 2: fast r1 1 11 0.09090909 1 1.098612 0.09987384
## 3: great r1 1 11 0.09090909 1 1.098612 0.09987384
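As a sanity check on the mini corpus, redo one cell by hand: "amazing" appears once among the 11 kept tokens of r1 and in 1 of the 3 documents, so tf = 1/11, idf = log(3/1), and their product should match the tf_idf value printed above:
# Hand-check: TF-IDF of 'amazing' in r1 (mini corpus only)
(1 / 11) * log(3 / 1) # ~0.0999, matching the tf_idf column above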
Visualize: Top-k TF–IDF terms per document
# Choose top-k TF–IDF terms per doc to summarize each review
topk <- 3
top_dt <- dt[, head(.SD[order(-tf_idf)], topk), by = doc_id] # head() avoids NA rows if a doc has fewer than topk terms
top_dt[1:min(numdocs * topk, 20)]
## doc_id term n total_terms tf df idf tf_idf
## <char> <char> <int> <int> <num> <int> <num> <num>
## 1: r1 amazing 1 11 0.09090909 1 1.098612 0.09987384
## 2: r1 fast 1 11 0.09090909 1 1.098612 0.09987384
## 3: r1 great 1 11 0.09090909 1 1.098612 0.09987384
## 4: r2 app 1 12 0.08333333 1 1.098612 0.09155102
## 5: r2 buying 1 12 0.08333333 1 1.098612 0.09155102
## 6: r2 crashes 1 12 0.08333333 1 1.098612 0.09155102
## 7: r3 daylight 1 13 0.07692308 1 1.098612 0.08450864
## 8: r3 decent 1 13 0.07692308 1 1.098612 0.08450864
## 9: r3 light 1 13 0.07692308 1 1.098612 0.08450864
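Since ggplot2 is already loaded, one simple way to visualize this table is a faceted bar chart of the top-k terms, one panel per document (a minimal sketch; bar order within a panel follows the global term ordering, which is good enough for a quick look):
# Bar chart of top-k TF-IDF terms, faceted by document
ggplot(top_dt, aes(x = reorder(term, tf_idf), y = tf_idf)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ doc_id, scales = "free_y") +
  labs(x = NULL, y = "TF-IDF", title = "Top TF-IDF terms per document")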