f <- "customer_clustering_data.csv"
df <- fread(f)
head(df)
## CustomerID Age Income Online_Spend InStore_Spend
## <int> <int> <int> <int> <int>
## 1: 1 23 37241 1212 239
## 2: 2 24 35265 1073 181
## 3: 3 31 39611 584 520
## 4: 4 25 45250 1539 431
## 5: 5 26 32545 762 273
## 6: 6 32 23454 1422 354
print(skim(df))
## ── Data Summary ────────────────────────
## Values
## Name df
## Number of rows 198
## Number of columns 5
## Key NULL
## _______________________
## Column type frequency:
## numeric 5
## ________________________
## Group variables None
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd p0 p25 p50
## 1 CustomerID 0 1 99.5 57.3 1 50.2 99.5
## 2 Age 0 1 35.4 9.92 17 27 34
## 3 Income 0 1 57681. 19976. 23454 37633. 58478.
## 4 Online_Spend 0 1 1169. 741. 1 502. 1088
## 5 InStore_Spend 0 1 1617. 1052. 147 356 1938
## p75 p100 hist
## 1 149. 198 ▇▇▇▇▇
## 2 43 59 ▆▇▇▅▂
## 3 71885 105041 ▇▅▇▃▂
## 4 1648. 3475 ▇▇▅▃▁
## 5 2474. 3615 ▇▁▅▇▂
Let’s plot the scatter plot of Online vs In-Store Spend, income, and age distribution
p1 <- ggplot(df, aes(Online_Spend, InStore_Spend)) +
geom_point(alpha = 0.6) +
labs(title = "Online vs In-Store Spend", x = "Online Spend", y = "In-Store Spend") +
theme_minimal()
p2 <- ggplot(df, aes(Income)) +
geom_histogram(bins = 30, fill = "grey70", color = "white") +
labs(title = "Income Distribution", x = "Income", y = "Count") +
theme_minimal()
p3 <- ggplot(df, aes(Age)) +
geom_histogram(bins = 30, fill = "grey70", color = "white") +
labs(title = "Age Distribution", x = "Age", y = "Count") +
theme_minimal()
p1
p2
p3
SOLUTION: Yes: clusters around high in-store / low online; high online / low in-store; and high both (omni-channel).
features <- df[, .(Online_Spend, InStore_Spend, Age, Income)]
X <- scale(features) # mean=0, sd=1
summary(X)
## Online_Spend InStore_Spend Age Income
## Min. :-1.5776 Min. :-1.3977 Min. :-1.8528 Min. :-1.71338
## 1st Qu.:-0.9005 1st Qu.:-1.1990 1st Qu.:-0.8447 1st Qu.:-1.00358
## Median :-0.1099 Median : 0.3050 Median :-0.1390 Median : 0.03987
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.6469 3rd Qu.: 0.8143 3rd Qu.: 0.7683 3rd Qu.: 0.71104
## Max. : 3.1130 Max. : 1.8993 Max. : 2.3813 Max. : 2.37081
SOLUTION: K-means uses Euclidean distance; unscaled high-variance features (Income/Spend) would dominate distance and clustering.
wss <- sapply(2:10, function(k){
kmeans(X, centers = k, nstart = 25)$tot.withinss
})
## plot the within-cluster sum of squares
elbow_plot <- data.frame(k = 2:10, wss = wss)
ggplot(elbow_plot, aes(x = k, y = wss)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = 2:10) +
labs(title = "Elbow Method: Total Within-Cluster Sum of Squares vs. Number of Clusters",
x = "Number of Clusters (k)",
y = "Total Within-Cluster Sum of Squares") +
theme_minimal()
SOLUTION Elbow typically around k ≈ 3 for this dataset.
k <- 3
km <- kmeans(X, centers = k, nstart = 50)
df$cluster <- factor(km$cluster)
# Cluster visualization in original spend space
ggplot(df, aes(Online_Spend, InStore_Spend, color = cluster)) +
geom_point(size = 2, alpha = 0.8) +
labs(title = paste("K-means Clusters (k =", k, ")"),
x = "Online Spend", y = "In-Store Spend") +
theme_minimal() +
theme(legend.position = "top")
SOLUTION (1) Online-heavy low in-store; (2) In-store-heavy low online; (3) High both (omni-channel, likely highest value).
profile = df[, .(Online_Spend = mean(Online_Spend),
InStore_Spend = mean(InStore_Spend),
Age = mean(Age),
Income = mean(Income)),
by = cluster]
profile
## cluster Online_Spend InStore_Spend Age Income
## <fctr> <num> <num> <num> <num>
## 1: 1 1184.9552 305.6269 25.14925 35594.22
## 2: 2 417.7059 2453.2353 45.92647 59542.09
## 3: 3 1964.2063 2109.7460 34.87302 79161.30
SOLUTION Omni-channel (high online & in-store) likely most valuable (cross-sell, loyalty). Online-heavy: digital promotions; In-store-heavy: local offers/merchandising.
pca_result <- prcomp(X, center = FALSE, scale. = FALSE)
# check variance explained
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.4944 1.1599 0.49753 0.41693
## Proportion of Variance 0.5583 0.3363 0.06188 0.04346
## Cumulative Proportion 0.5583 0.8947 0.95654 1.00000
# Keep first two principal components (scores) and add cluster labels
pca_data <- data.frame(pca_result$x[, 1:2], cluster = df$cluster)
# Extract loadings which tell you how each original variable contributes to each PC.
pca_loadings <- as.data.frame(pca_result$rotation[, 1:2]) # 2 columns: PC1, PC2
pca_loadings$variable <- rownames(pca_loadings) # keep var names for plotting
pca_loadings[,1:2]
## PC1 PC2
## Online_Spend -0.1261202 -0.81186777
## InStore_Spend 0.6324303 -0.02334112
## Age 0.5819806 0.28345536
## Income 0.4954031 -0.50988133
SOLUTION PC1: positive loadings on In-Store Spend, Age, Income; PC2: negative loading on Online_Spend, positive on age, negative on income.
#plot loadings
# Scale loadings for better visibility in plots
arrow_scale <- 5 # tweak if arrows are too short/long
# Plot loadings alone (annotated with variables names)
p_loadings <- ggplot(pca_loadings, aes(x = PC1 * arrow_scale, y = PC2 * arrow_scale, label = variable)) +
geom_segment(aes(x = 0, y = 0, xend = PC1 * arrow_scale, yend = PC2 * arrow_scale),
arrow = arrow(length = unit(0.3, "cm")), color = "blue") +
geom_text(size = 5) +
labs(
title = "PCA Loadings: Store attributes",
x = "\nPrincipal Component 1",
y = "Principal Component 2\n"
) +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray") +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray") +
theme_minimal() +
coord_cartesian(xlim = c(-2, 8), ylim = c(-10, 4)) # adjust as needed
p_loadings
SOLUTION Yes: PC1 contrasts high in-store spend, age, income vs low; PC2 contrasts high online spend vs low.
# plot the PCA scores with clusters
ggplot(pca_data, aes(PC1, PC2, color = cluster)) +
geom_point(size = 2, alpha = 0.8) +
labs(title = "PCA Projection of Customers",
x = "Principal Component 1",
y = "Principal Component 2") +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray") +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray") +
theme_minimal() +
theme(legend.position = "top")
SOLUTION Yes—first two PCs (largely spend contrasts) separate the three groups visually.
arrow_scale <- 2 # tweak if arrows are too short/long
biplot <- ggplot() +
geom_point(data = pca_data, aes(x = PC1, y = PC2, color = cluster), size = 2, alpha = 0.8) +
geom_segment(data = pca_loadings,
aes(x = 0, y = 0, xend = PC1*arrow_scale, yend = PC2*arrow_scale),
arrow = arrow(length = unit(0.2, "cm")),
color = "blue") +
geom_text(data = pca_loadings,
aes(x = PC1 * 1.4, y = PC2 * 1.4, label = variable),
size = 4) +
labs(
title = "Biplot of PCA",
x = "\nPrincipal Component 1",
y = "Principal Component 2\n"
) +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray") +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray") +
theme_minimal()
biplot
- How do the clusters relate to the original features based on the
biplot? Do you see a correspondence with the clustering means table
above (part 6)?
SOLUTION Yes: Cluster 1 (in-store heavy) aligns with high InStore_Spend; Cluster 2 (online heavy) aligns with high Online_Spend; Cluster 3 (omni-channel) aligns with high both.
SOLUTION Omni-channel (high both) likely most valuable (cross-sell, loyalty). Online-heavy: digital promotions; In-store-heavy: local offers/merchandising.