# Load required packages: data.table (fread), stargazer (regression tables),
# pROC (ROC/AUC), and ggplot2 (plots)
library(data.table)
library(stargazer)
library(pROC)
library(ggplot2)

# Load the ad-click data (one row per impression) from CSV into a data.table
df <- fread("../data/ad_click_data.csv")
# Quick peek at the first rows to sanity-check columns and types
head(df)
## click device ad_position user_segment pages_viewed time_on_site topic_match
## <int> <char> <char> <char> <int> <int> <int>
## 1: 0 mobile top returning 6 67 0
## 2: 1 desktop bottom returning 7 146 0
## 3: 0 mobile middle new 4 82 0
## 4: 0 desktop middle returning 1 37 1
## 5: 1 desktop top returning 3 95 1
## 6: 0 mobile top returning 2 89 0
## past_ctr_hi
## <int>
## 1: 0
## 2: 1
## 3: 0
## 4: 0
## 5: 0
## 6: 0
# Summary stats: distribution of numeric vars and levels of factors
summary(df)
## click device ad_position user_segment
## Min. :0.0000 Length:5000 Length:5000 Length:5000
## 1st Qu.:0.0000 Class :character Class :character Class :character
## Median :0.0000 Mode :character Mode :character Mode :character
## Mean :0.3178
## 3rd Qu.:1.0000
## Max. :1.0000
## pages_viewed time_on_site topic_match past_ctr_hi
## Min. : 1.000 Min. : 12.00 Min. :0.0000 Min. :0.0000
## 1st Qu.: 4.000 1st Qu.: 48.00 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 5.000 Median : 66.00 Median :0.0000 Median :0.0000
## Mean : 5.003 Mean : 75.38 Mean :0.4422 Mean :0.2934
## 3rd Qu.: 6.000 3rd Qu.: 93.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :13.000 Max. :350.00 Max. :1.0000 Max. :1.0000
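Before splitting, it is worth confirming the class balance explicitly; the mean of click above (~0.32) already suggests non-clicks dominate, which matters when reading accuracy later. A quick added check:
# Share of non-clicks vs. clicks; roughly 68% / 32%, so classes are imbalanced
prop.table(table(df$click))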
# Split dataset into train (70%) and test (30%) to evaluate generalization.
# Fix the RNG seed so the random split is reproducible (seed value arbitrary)
set.seed(42)
# Randomly select ~30% of rows to form the test set (holdout)
test_idx <- sample(nrow(df), size = floor(0.3 * nrow(df)))
# Training set: all rows NOT in the test index
train <- df[-test_idx, ]
# Test set: rows reserved for honest model evaluation
test <- df[test_idx, ]
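As an added sanity check (not part of the original steps), confirm the split is roughly 70/30 and that the click rate looks similar in both sets:
# Row counts and click rates should be comparable across train and test
nrow(train); nrow(test)
mean(train$click); mean(test$click)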
# Fit a logistic regression to predict clicks using selected predictors.
# Here we use "pages_viewed" and "ad_position" as features.
logit_fit <- glm(
click ~ pages_viewed + ad_position,
data = train,
family = binomial()
)
# Nicely formatted regression table to inspect coefficient signs and magnitudes
stargazer(logit_fit, type = "text", single.row = TRUE, no.space = TRUE)
##
## =============================================
## Dependent variable:
## ---------------------------
## click
## ---------------------------------------------
## pages_viewed 0.125*** (0.019)
## ad_positionmiddle 0.408*** (0.128)
## ad_positiontop 0.916*** (0.121)
## Constant -2.025*** (0.149)
## ---------------------------------------------
## Observations 3,500
## Log Likelihood -2,126.933
## Akaike Inf. Crit. 4,261.867
## =============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
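To make the table concrete: the fitted model implies log-odds of roughly -2.025 + 0.125*pages_viewed + 0.408*(middle) + 0.916*(top). As a hedged sketch using the rounded coefficients, a hypothetical top-position impression with 5 pages viewed works out to about a 0.38 click probability:
# Manual prediction from the rounded coefficients (hypothetical impression)
eta <- -2.025 + 0.125 * 5 + 0.916  # linear predictor (log-odds), top position
plogis(eta)                        # inverse logit: ~0.38
# Cross-check against predict(); exact coefficients differ slightly from rounding
predict(logit_fit,
        newdata = data.frame(pages_viewed = 5, ad_position = "top"),
        type = "response")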
# Predict probability of clicking on the test set.
test$p_hat <- predict(logit_fit, newdata = test, type = "response")
# Classify each impression as "predicted click" (1) if p >= 0.5, else 0.
test$y_hat <- as.integer(test$p_hat >= 0.5)
# Inspect true label vs predicted probability and classification for a few rows
head(test[, c("click","p_hat","y_hat")])
## click p_hat y_hat
## <int> <num> <int>
## 1: 1 0.4420035 0
## 2: 0 0.2466424 0
## 3: 0 0.3814320 0
## 4: 0 0.2186011 0
## 5: 0 0.3814320 0
## 6: 0 0.4420035 0
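Note how the fitted probabilities cluster below 0.5; a quick look at their distribution (an added check) foreshadows why the 0.5 cutoff will flag very few impressions as clicks:
# Most predicted probabilities sit below the 0.5 cutoff
summary(test$p_hat)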
# Confusion matrix elements
# True Positives: predicted click and actually clicked
TP <- sum(test$y_hat==1 & test$click==1, na.rm = TRUE)
# False Positives: predicted click but actually no click
FP <- sum(test$y_hat==1 & test$click==0, na.rm = TRUE)
# False Negatives: predicted no click but actually clicked
FN <- sum(test$y_hat==0 & test$click==1, na.rm = TRUE)
# True Negatives: predicted no click and actually no click
TN <- sum(test$y_hat==0 & test$click==0, na.rm = TRUE)
print(paste("TP:", TP, "FP:", FP, "FN:", FN, "TN:", TN))
## [1] "TP: 16 FP: 14 FN: 460 TN: 1010"
# Compute standard metrics
# Accuracy: overall fraction of correct predictions (can be misleading with class imbalance)
accuracy <- (TP + TN) / (TP + FP + FN + TN) # overall correctness
# Precision: among predicted clicks, how many were real clicks
precision <- ifelse((TP + FP)==0, NA, TP/(TP+FP)) # quality of positive predictions
# Recall (sensitivity): among real clicks, how many we correctly predicted
recall <- TP/(TP+FN) # coverage of actual positives
# F1: harmonic mean of precision and recall (balances the two)
f1 <- ifelse(is.na(precision) | (precision+recall)==0, NA, 2*precision*recall/(precision+recall)) # balance between precision and recall
data.frame(
Threshold = 0.50,
Accuracy = round(accuracy, 3),
Precision = round(precision, 3),
Recall = round(recall, 3),
F1 = round(f1, 3)
)
## Threshold Accuracy Precision Recall F1
## 1 0.5 0.684 0.533 0.034 0.063
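At the 0.5 cutoff, accuracy looks acceptable but recall is near zero. A short sketch (not in the original assignment) sweeps lower thresholds to expose the precision/recall trade-off:
# Recompute precision and recall across a grid of cutoffs
thresholds <- c(0.30, 0.35, 0.40, 0.45, 0.50)
t(sapply(thresholds, function(th) {
  y  <- as.integer(test$p_hat >= th)
  tp <- sum(y == 1 & test$click == 1)
  fp <- sum(y == 1 & test$click == 0)
  fn <- sum(y == 0 & test$click == 1)
  c(Threshold = th,
    Precision = round(ifelse(tp + fp == 0, NA, tp / (tp + fp)), 3),
    Recall    = round(tp / (tp + fn), 3))
}))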
# ROC curve plots tradeoff between True Positive Rate and False Positive Rate
# Build ROC object from true labels and predicted probabilities
roc_obj <- roc(response = test$click, predictor = test$p_hat, quiet = TRUE)
# False Positive Rate (x-axis) derived from specificities (1 - TNR)
fpr <- 1 - roc_obj$specificities # False Positive Rate
# True Positive Rate (y-axis), also called recall
tpr <- roc_obj$sensitivities # True Positive Rate
# Plot ROC curve with the random-guess diagonal as a baseline
ggplot(data = data.frame(FPR = fpr, TPR = tpr), aes(x = FPR, y = TPR)) +
geom_line(linewidth = 1) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
labs(title = "ROC Curve", x = "False Positive Rate (FPR)", y = "True Positive Rate (TPR)") +
theme_minimal(base_size = 13)
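pROC can also report the cutoff that maximizes sensitivity + specificity (Youden's J), often a more sensible operating point than 0.5 for an imbalanced outcome:
# "best" with Youden's J returns the threshold plus its specificity/sensitivity
coords(roc_obj, x = "best", best.method = "youden")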
# AUC: single-number summary of ROC (probability a random positive ranks above a random negative)
auc_val <- as.numeric(auc(roc_obj))
round(auc_val, 3)
## [1] 0.619
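That ranking interpretation can be verified empirically (a hedged check, not in the original): compare p_hat across all positive/negative pairs, counting ties as half:
# Probability a random clicked impression outranks a random non-clicked one
pos <- test$p_hat[test$click == 1]
neg <- test$p_hat[test$click == 0]
mean(outer(pos, neg, ">") + 0.5 * outer(pos, neg, "==")) # should match the AUC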
# Convert log-odds coefficients into odds ratios for manager-friendly interpretation
or <- exp(coef(logit_fit))
round(or, 3)
## (Intercept) pages_viewed ad_positionmiddle ad_positiontop
## 0.132 1.133 1.503 2.498
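To convey uncertainty alongside the point estimates, Wald confidence intervals can be exponentiated onto the odds-ratio scale (a standard add-on):
# 95% Wald intervals on the odds-ratio scale
round(exp(confint.default(logit_fit)), 3)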
Questions
If "ad_positiontop" has an odds ratio of ~2.5, how should we interpret this?
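A hedged worked example (not an official answer key): holding pages_viewed fixed, a top-position ad has about 2.5 times the odds of a click relative to the bottom-position baseline; the lift on the probability scale depends on the baseline:
# Bottom position, pages_viewed = 5: odds ~0.25, i.e. probability ~0.20
odds_bottom <- exp(-2.025 + 0.125 * 5)
# Top position multiplies the odds by the ~2.5 odds ratio
odds_top <- odds_bottom * 2.498
odds_top / (1 + odds_top) # implied click probability ~0.38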