Predicting spam messages is a machine learning algorithm that most email providers need to include to protect their users. Five classification methods are used to identify spam emails from non-spam using word frequency. Logistic lasso regression proves to be the most reliable and accurate method.
The “Spam Mails Dataset” is publicly available on Kaggle and contains the modified text of 500 spam emails and 2,500 non-spam emails (Garne, 2019). Data cleaning was conducted using a guide by Shreyas Khades (n.d.), and the steps included stemming (retrieving the root of a word), removing common stop words (such as “the” and “to”), and creating predictor variables of word frequencies. One disadvantage of the data was the number of spam emails that were either empty or in a different language, which changed the ratio between spam and non-spam emails we were able to use.
# Loading email data set: consists of "text data" and "dummy variable"
email_df <- data.table::fread( "spam_or_not_spam.csv", header = T )
# Randomizing data
set.seed(46234)
email_df <- email_df[ sample( 1:nrow( email_df ) ), ]
# Factoring dummy variable
email_df$label <- factor( email_df$label )library(DT)
email_df2 = data.frame(
Type = ifelse(email_df$label == 1, "Spam", "Non-Spam"),
"Raw Email Text" = email_df$email
)
datatable(email_df2, rownames = TRUE, filter="top", options = list(pageLength = 10, scrollX=T) )# datatable(..., class = 'white-space: nowrap')library(pacman)
p_load(
# Basics
here, skimr, dplyr, stringr, fastverse, disk.frame,
# Visualizing
ggplot2, ggthemes, ggthemes, wordcloud, RColorBrewer,
# Text Processing
tm, SnowballC,
# Modelling
e1071,naivebayes, tidymodels, gridExtra, caret, ranger,
# Knitting
knitr, kableExtra, DT, shiny, equatiomatic
)To retrieve the root of a word (eg, doing -> do), options are “stemming” &“lemmatization”.
STEMMING: faster but maybe not as effective
LEMMATIZATION: slower but more effective
More on this here
VectorSource(): creates one document for each email
Vcorpus(): creates a volatile corpus from these individual emails
email_corpus <- VCorpus(
VectorSource(
email_df$email
)
)
# Using `tm` package to stem email content
email_corpus <- tm::tm_map( email_corpus,
tm::stemDocument )
# Removing puctuations
email_corpus = tm_map( email_corpus,
removePunctuation )
# Removing stopwords
email_corpus <- tm_map( email_corpus,
removeWords, stopwords( "en" ) )
# Removing two most frequent stopwords: "NUMBER", "URL"
email_corpus <- tm_map( email_corpus,
removeWords, c("NUMBER", "number", "url", "URL") )
# DocumentTermMatrix(): tokenize the email corpus.
email_dtm <- tm::DocumentTermMatrix( sample( email_corpus, length( email_corpus ) ) )Using wordclouds, visualize text data after cleaning and pre-processing
# Preprocessed data for visualizations
reverse_email <- data.frame(
text = sapply( email_corpus, as.character ),
stringsAsFactors = FALSE,
type = email_df$label
)# All emails
wordcloud( reverse_email$text,
max.words = 150,
colors = brewer.pal( 7, "Dark2" ),
random.order = FALSE ) # Subsetting to spam == 1
spam <- reverse_email %>% filter( type == 1 )
# layout(matrix(c(1, 2), nrow=2), heights=c(1, 8))
# par(mar=rep(0, 4))
# plot.new()
# text(x=0.5, y=0.5, cex = 1.5, offset = 0.5, "Most frequent words in spam")
wordcloud( spam$text,
max.words = 150,
colors = brewer.pal( 7, "Dark2" ),
random.order = FALSE,
main = "Spam")# Subsetting to spam == 0
ham <- reverse_email %>% filter( type == 0 )
wordcloud( ham$text,
max.words = 150,
colors = brewer.pal( 7, "Dark2" ),
random.order = FALSE,
main = "Non-Spam") Earlier we randomly sorted the data, so we can use the indices to split 80% training and 20% testing.
# Split 80% training, 20% testing
email_dtm_train <- email_dtm[ 1:2400, ]
email_dtm_test <- email_dtm[2401:3000, ]
# Add labels for convenience
email_train_label <- email_df[ 1:2400, ]$label
email_test_label <- email_df[2401:3000, ]$label# Create table
prop_table = data.frame(c(prop.table( table( email_train_label ) )[[2]], #Train
prop.table( table( email_train_label ) )[[1]]),
c(prop.table( table( email_test_label ) )[[2]], # Test
prop.table( table( email_test_label ) )[[1]])
)
# Add table headings
rownames(prop_table) = c("Spam", "Non-Spam")
names(prop_table) = c("Train", "Test")
# View table
kable(prop_table, digits = 3) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Train | Test | |
|---|---|---|
| Spam | 0.17 | 0.152 |
| Non-Spam | 0.83 | 0.848 |
There are currently 25,050 variables, which is most likely way too many! So, define a threshold (eg. 1 == 1%) and reduce the number of features used.
Goal: Eliminate words that appear in only 10% of records in the training data
min_freq <- round( email_dtm$nrow * ( ( threshold = 10.0 ) / 100 ), 0 ) # using 10%# Create vector of most frequent words
freq_words <- findFreqTerms( x = email_dtm,
lowfreq = min_freq )
# Filter the DTM
email_dtm_freq_train <- email_dtm_train[ , freq_words]
email_dtm_freq_test <- email_dtm_test[ , freq_words]# Create table
freq_words_plus_1 = c(freq_words, "")
freq_words_dt = data.frame(
c(freq_words_plus_1[seq( 1+0*1,16*1)]),
c(freq_words_plus_1[seq(1*16+1,16*2)]),
c(freq_words_plus_1[seq(2*16+1,16*3)]),
c(freq_words_plus_1[seq(3*16+1,16*4)]),
c(freq_words_plus_1[seq(4*16+1,16*5)]),
c(freq_words_plus_1[seq(5*16+1,16*6)]),
c(freq_words_plus_1[seq(6*16+1,16*7)]),
c(freq_words_plus_1[seq(7*16+1,16*8)]),
c(freq_words_plus_1[seq(8*16+1,16*9)]),
c(freq_words_plus_1[seq(9*16+1,16*10)]),
c(freq_words_plus_1[seq(10*16+1,16*11)]),
c(freq_words_plus_1[seq(11*16+1,16*12)]),
c(freq_words_plus_1[seq(12*16+1,16*13)])
)
names(freq_words_dt)[] = ""
# View table
kable(freq_words_dt, digits = 3) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
column_spec (1:13,border_left = T, border_right = T) %>%
row_spec(1:16, extra_css = c("border-bottom: 1px solid",
"border-top: 1px solid")) %>%
add_header_above()| actual | can | differ | found | instal | long | much | onli | problem | right | site | test | want |
| address | case | doe | free | interest | look | must | order | process | rpm | softwar | thank | way |
| also | chang | doesn | get | internet | lot | name | origin | product | run | someth | thing | web |
| american | check | don | give | invest | made | nation | packag | program | said | spam | think | week |
| ani | click | good | issu | need | part | provid | say | spamassassin | time | well | ||
| anoth | code | end | got | just | make | net | peopl | put | secur | sponsor | today | whi |
| anyon | com | error | govern | keep | manag | network | perl | rate | see | start | trade | will |
| back | come | even | great | know | mani | never | person | razor | seem | state | tri | window |
| base | compani | everi | group | last | market | new | phone | read | send | still | two | within |
| becaus | comput | exmh | help | life | may | next | place | real | sent | subject | type | without |
| befor | countri | file | high | like | mean | now | pleas | realli | sep | support | unit | work |
| best | current | find | home | line | messag | numbertnumber | point | receiv | server | sure | unsubscrib | world |
| better | data | first | hyperlink | link | might | offer | post | releas | servic | system | use | write |
| build | date | follow | idea | linux | million | old | power | remov | set | take | user | wrote |
| busi | day | fork | includ | list | money | onc | price | report | show | talk | veri | year |
| call | develop | form | inform | live | month | one | probabl | requir | sinc | technolog | version |
# Simple dummy transformation fn.
convert_values <- function(x){
x = ifelse( x > 0, "Yes", "No" )
}
# Declaring final `train` and `test` datasets
email_train <- apply( email_dtm_freq_train, MARGIN = 2,
convert_values )
email_test <- apply( email_dtm_freq_test, MARGIN = 2,
convert_values )# View data
datatable(email_train, rownames = FALSE, filter="none", options = list(pageLength = 5, scrollX=T) )The five machine learning methods used for predictions were the Naive Bayes classifier, lasso regression, logistic regression, logistic lasso regression, and a random forest. In lasso, logistic lasso, and logistic regressions, the penalty was tuned to minimize the mean squared error (MSE). For the random forest, we used 200 trees.
# Function to create confusion matrix
## Source https://stackoverflow.com/questions/23891140/r-how-to-visualize-confusion-matrix-using-the-caret-package
# Draw confusion matric
draw_confusion_matrix <- function(cm) {
total <- sum(cm$table)
res <- as.numeric(cm$table)
# Generate color gradients. Palettes come from RColorBrewer.
greenPalette <- c("#F7FCF5","#E5F5E0","#C7E9C0","#A1D99B","#74C476","#41AB5D","#238B45","#006D2C","#00441B")
redPalette <- c("#FFF5F0","#FEE0D2","#FCBBA1","#FC9272","#FB6A4A","#EF3B2C","#CB181D","#A50F15","#67000D")
getColor <- function (greenOrRed = "green", amount = 0) {
if (amount == 0)
return("#FFFFFF")
palette <- greenPalette
if (greenOrRed == "red")
palette <- redPalette
colorRampPalette(palette)(100)[10 + ceiling(90 * amount / total)]
}
# Set the basic layout
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title('Confusion Matrix', cex.main=2)
# Create the matrix
classes = colnames(cm$table)
rect(150, 430, 240, 370, col=getColor("green", res[1]))
text(195, 435, "Non-Spam", cex=1.2)
rect(250, 430, 340, 370, col=getColor("red", res[3]))
text(295, 435, "Spam", cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col=getColor("red", res[2]))
rect(250, 305, 340, 365, col=getColor("green", res[4]))
text(140, 400, "Non-Spam", cex=1.2, srt=90)
text(140, 335, "Spam", cex=1.2, srt=90)
# Add in the cm results
text(195, 400, res[1], cex=1.6, font=2, col='black')
text(195, 335, res[2], cex=1.6, font=2, col='black')
text(295, 400, res[3], cex=1.6, font=2, col='black')
text(295, 335, res[4], cex=1.6, font=2, col='black')
# Add in the specifics
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "Metrics", xaxt='n', yaxt='n')
text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
# Add in accuracy information
text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
} # Create model from the training dataset
bayes_classifier <- e1071::naiveBayes( email_train,
email_train_label )
# Predicting on test data
email_test_pred <- predict( bayes_classifier,
email_test )# Display results
naive_bayes_results = confusionMatrix( data = email_test_pred,
reference = email_test_label,
positive = "1",
dnn = c("Prediction", "Actual") )
draw_confusion_matrix(naive_bayes_results)# Splitting for 5-fold cross-validation
folds <- email_train %>% vfold_cv(v = 5)
# Defining Lambdas (from Lecture 005)
lambdas <- data.frame( penalty = c( 0, 10^seq( from = 5, to = -2, length = 100 ) ) )
# Defining the recipe
data_recipe <- recipe(
email_train_label ~ .,
data = email_train
) %>%
update_role(V1, new_role = 'id variable') %>%
step_dummy(all_nominal(), - all_outcomes())
# Defining Lasso Model
lasso <- linear_reg(
penalty = tune(),
mixture = 1) %>%
set_engine("glmnet")
# Setting up workflow
workflow_lasso <- workflow() %>%
add_recipe( data_recipe ) %>%
add_model( lasso )
# Parallelize
doParallel::registerDoParallel(cores = 4)
# CV
lasso_cv <- workflow_lasso %>%
tune_grid(
resamples = folds,
grid = lambdas,
metrics = metric_set(rmse, mae)
)# Find best models
## Source: juliasilge.com/blog/lasso-the-office/
# Graph results
lasso_cv %>% collect_metrics() %>%
ggplot(aes(penalty, mean, color = .metric)) +
geom_errorbar(aes(
ymin = mean - std_err,
ymax = mean + std_err
),
alpha = 0.5
) +
geom_line(size = 1.5) +
facet_wrap(~.metric, scales = "free", nrow = 2) +
theme(legend.position = "none") + theme_base() +
scale_x_log10()# Get best penalties
best_lasso_mae = lasso_cv %>% show_best(metric = "mae") %>% filter(penalty == min(penalty))
best_lasso_rmse = lasso_cv %>% show_best(metric = "rmse") %>% filter(penalty == min(penalty))
best_lasso = rbind(best_lasso_mae, best_lasso_rmse)
# View in table
names(best_lasso) = c("Penalty", "Metric", "Estimator", "Mean", "n", "Standard Error", ".config")
kable(best_lasso, digits = 3) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Penalty | Metric | Estimator | Mean | n | Standard Error | .config |
|---|---|---|---|---|---|---|
| 0.031 | mae | standard | 0.283 | 5 | 0.002 | Preprocessor1_Model009 |
| 0.023 | rmse | standard | 0.376 | 5 | 0.004 | Preprocessor1_Model007 |
# Get test metrics
## Define best model - according to MAE
best_lasso_workflow = workflow_lasso %>%
finalize_workflow(select_best(lasso_cv, metric = "mae")) %>%
fit(data = email_train)
best_lasso = best_lasso_workflow %>% extract_fit_parsnip()
## Clean test data
email_test_clean = recipe(
email_test_label ~ .,
data = email_test
) %>%
update_role(V1, new_role = 'id variable') %>%
step_dummy(all_nominal(), - all_outcomes()) %>%
prep() %>% juice()
## Make predictions
lasso_predictions = predict(best_lasso, email_test_clean)
lasso_predictions = lasso_predictions %>% mutate(prediction = ifelse(.pred < 0.5, 0, 1))
email_test_clean$predictions = lasso_predictions$prediction
## Calculate accuracy
email_test_clean = email_test_clean %>% mutate(accurate = ifelse(predictions == email_test_label, 1, 0))
acc = sum(email_test_clean$accurate) / nrow(email_test_clean)
## Calculate sensitivity
tp = 0 # our model predicts not-spam for all, so no true positives
fn = email_test_clean %>% filter(email_test_label == 1) %>% nrow()
sens = tp / (tp + fn)
## Calculate specificity
fp = 0 # our model predicts not-spam for all, so no false positives
tn = email_test_clean %>% filter(email_test_label == 0) %>% nrow()
spec = tn / (tn + fp)
## Make table
lasso_table = tibble(
metric = c("accuracy", "sensitivity", "specificity"),
value = c(acc, sens, spec)
)
lasso_table %>% kable(digits = 3) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| metric | value |
|---|---|
| accuracy | 0.848 |
| sensitivity | 0.000 |
| specificity | 1.000 |
# Set seed
set.seed(9753)
# Converting outcome variable into character vector
email_train <- email_train %>%
mutate(
outcome_as_vector = ifelse(email_train_label == 1, "Yes", "No")
)
# Split for 5-fold cross-validation
folds <- vfold_cv(email_train, v = 5)
# Defining the recipe
data_recipe <- recipe(
outcome_as_vector ~ .,
data = email_train
) %>%
step_rm(email_train_label) %>%
update_role(V1, new_role = 'id variable') %>%
step_dummy(all_nominal(), - all_outcomes()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
# Defining Lambdas (from Lecture 005)
lambdas <- data.frame( penalty = c( 0, 10^seq( from = 5, to = -2, length = 100 ) ) )
# Defining Lasso Model
log_lasso <- logistic_reg(
mode = 'classification',
penalty = tune(),
mixture = 1) %>%
set_engine("glmnet")
# Setting up workflow
workflow_lasso <- workflow() %>%
add_recipe( data_recipe ) %>%
add_model( log_lasso )
# CV
log_lasso_cv <- workflow_lasso %>%
tune_grid(
resamples = folds,
metrics = metric_set(yardstick::accuracy,
yardstick::roc_auc,
yardstick::sens,
yardstick::spec,
yardstick::precision),
grid = grid_latin_hypercube(penalty(), size = 5),
control = control_grid(parallel_over = 'resamples')
)# Find test metrics
log_lasso_cv_results = log_lasso_cv %>% collect_metrics() %>% group_by(.metric) %>% summarize(mean_accuracy = mean(mean, na.rm = T))
log_lasso_cv_results[1] = c("Accuracy", "Precision", "Area Under the Curve", "Sensitivity", "Specificity")
names(log_lasso_cv_results) = c("Metric", "Mean")
## View table
kable(log_lasso_cv_results, digits = 3) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Metric | Mean |
|---|---|
| Accuracy | 0.809 |
| Precision | 0.831 |
| Area Under the Curve | 0.508 |
| Sensitivity | 0.967 |
| Specificity | 0.040 |
# Set seed
set.seed(9753)
# Split for 5-fold cross-validation
folds <- vfold_cv(email_train, v = 5)
# Defining the recipe
data_recipe <- recipe(
outcome_as_vector ~ .,
data = email_train
) %>%
step_rm(email_train_label) %>%
update_role(V1, new_role = 'id variable') %>%
step_dummy(all_nominal(), - all_outcomes()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
# Define the model - simple LogReg because no penalty
model_logistic <- logistic_reg(
mode = 'classification') %>%
set_engine('glm')
# Define the workflow
workflow_logistic <- workflow() %>%
add_recipe(data_recipe) %>%
add_model(model_logistic)
# CV
cv_logistic <- workflow_logistic %>%
fit_resamples(
resamples = folds,
metrics = metric_set(yardstick::accuracy,
yardstick::roc_auc,
yardstick::sens,
yardstick::spec,
yardstick::precision)
)# Get test metrics
log_reg_results = cv_logistic %>% collect_metrics() %>% select(.metric, mean)
log_reg_results[1] = c("Accuracy", "Precision", "Area Under the Curve", "Sensitivity", "Specificity")
names(log_reg_results) = c("Metric", "Mean")
# View in table
kable(log_reg_results, digits = 3) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Metric | Mean |
|---|---|
| Accuracy | 0.797 |
| Precision | 0.832 |
| Area Under the Curve | 0.511 |
| Sensitivity | 0.947 |
| Specificity | 0.071 |
# Train the model - using 200 trees
random_forest <- train(
x = email_train_rf,
y = email_train_label_rf,
method = "ranger",
num.trees = 200,
importance = "impurity",
trControl = trainControl(method = "cv",
number = 3,
verboseIter = TRUE
)
)## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2, splitrule = gini, min.node.size = 1 on full training set
# Check variable importances
top_25words = varImp(random_forest, scale = TRUE)$importance %>%
rownames_to_column() %>%
arrange(-Overall) %>%
top_n(25)
# Plot variable importance
ggplot(data = top_25words,
aes(x=reorder(rowname, Overall),
y = Overall)) +
geom_bar(stat = "identity") +
theme_base() +
theme(axis.text.x=element_text(angle=50, hjust=1))+
xlab("Top 25 Predictive Words (stemmed)")+
ylab("Frequency of Word") +
labs(title = "Most Predictive Words") +
theme(plot.title = element_text(hjust = 0.5))# Re-declaring test data
email_test <- apply( email_dtm_freq_test, MARGIN = 2,
convert_values )
email_test <- cbind( email_test,
email_test_label )
# Predict on test data
predictions <- predict(random_forest, email_test)
# View test metrics in confusion matrix
random_forest_results = confusionMatrix( data = predictions,
reference = email_test_label,
positive = "1",
dnn = c("Prediction", "Actual") )
draw_confusion_matrix(random_forest_results)The crucial metrics in the spam email context are test accuracy (ACC) and sensitivity (SENS). ACC can help determine whether a model is performing well, but it is not the only measure of a good predictor. SENS is key because clicking on spam is dangerous, so a spam email that is predicted not-spam (a false negative) has consequences. Naive Bayes produced a test ACC of 0.788 and a SENS of 0.04. Lasso regression produces an ACC of 0.848 and a SENS of 0. Logistic lasso returns 0.809 ACC and 0.967 SENS. Logistic regression returns 0.797 ACC and 0.947 SENS. Finally, the random forest returns 0.847 ACC and 0 SENS. While each model has its advantages and disadvantages, balancing test accuracy and sensitivity, logistic lasso is the best predictor of spam.
# Creating df with metrics of all models
comparing_acc_table = data.frame(
c(
"Naive Bayes",
"Lasso",
"Logistic Lasso",
"Logistic",
"Random Forest"
),
c(
naive_bayes_results[["overall"]][["Accuracy"]],
acc,
log_lasso_cv_results$Mean[1],
log_reg_results$Mean[1],
random_forest_results[["overall"]][["Accuracy"]]
),
c(
naive_bayes_results[["byClass"]][["Sensitivity"]],
sens,
log_lasso_cv_results$Mean[4],
log_reg_results$Mean[4],
random_forest_results[["byClass"]][["Sensitivity"]]
),
c(
naive_bayes_results[["byClass"]][["Specificity"]],
spec,
log_lasso_cv_results$Mean[5],
log_reg_results$Mean[5],
random_forest_results[["byClass"]][["Specificity"]]
),
c(
naive_bayes_results[["byClass"]][["Precision"]],
# ifelse((tp/(fp+tp))) == NaN, 0, (tp/(fp+tp)),
0, #hardcoded
log_lasso_cv_results$Mean[2],
log_reg_results$Mean[2],
random_forest_results[["byClass"]][["Precision"]]
)
)
names(comparing_acc_table) = c("Method", "Accuracy", "Sensitivity", "Specificity", "Precision")
# View table
kable(comparing_acc_table, digits = 3) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))| Method | Accuracy | Sensitivity | Specificity | Precision |
|---|---|---|---|---|
| Naive Bayes | 0.788 | 0.044 | 0.921 | 0.091 |
| Lasso | 0.848 | 0.000 | 1.000 | 0.000 |
| Logistic Lasso | 0.809 | 0.967 | 0.040 | 0.831 |
| Logistic | 0.797 | 0.947 | 0.071 | 0.832 |
| Random Forest | 0.847 | 0.000 | 0.998 | 0.000 |