1 Load the Data

# Read the marketing EDA dataset from disk, then print a compact,
# column-by-column preview of it.
df <- fread("data/marketing_eda.csv")
df %>%
  glimpse()
## Rows: 1,000
## Columns: 9
## $ CustomerID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ Age        <int> 54, 18, 42, 27, 53, 35, 64, 41, 24, 53, 42, 54, 63, 37, 43,…
## $ Gender     <chr> "M", "F", "F", "F", "F", "M", "F", "M", "M", "F", "M", "F",…
## $ Device     <chr> "Mobile", "Mobile", "Mobile", "Desktop", "Mobile", "Desktop…
## $ Channel    <chr> "Social", "Search", "Search", "Social", "Social", "Video", …
## $ Ad_Spend   <dbl> 718.60, 233.00, 122.51, 198.78, 145.19, 125.74, 421.93, 189…
## $ Clicks     <int> 95, 34, 18, 19, 19, 9, 47, 25, 23, 15, 41, 2, 33, 66, 2, 17…
## $ Purchases  <int> 6, 1, 0, 1, 4, 0, 0, 4, 2, 1, 1, 0, 2, 2, 0, 0, 0, 2, 4, 0,…
## $ Revenue    <dbl> 149.16, 22.22, 0.00, 13.22, 150.48, 0.00, 0.00, 82.48, 177.…

2 Numerical Variation: Ad Spend

# Histogram of ad spend with reference lines for the mean (dashed) and the
# median (solid).
# The two statistics are passed as fixed xintercept values instead of being
# mapped inside aes(): a value mapped in aes() is repeated for every row of
# df, so the identical line would be drawn once per row.
# na.rm = TRUE keeps the reference lines visible if Ad_Spend contains NAs.
ggplot(df, aes(Ad_Spend)) +
  geom_histogram(bins = 30, fill = "grey70", color = "white") +
  geom_vline(xintercept = mean(df$Ad_Spend, na.rm = TRUE),
             linetype = "dashed") +
  geom_vline(xintercept = median(df$Ad_Spend, na.rm = TRUE)) +
  labs(subtitle = "Dashed = mean, solid = median",
       x = "Ad Spend ($)", y = "Count") +
  theme_minimal()

Questions:

  • Is the distribution symmetric or skewed?
  • What does the fact that the mean is larger than the median tell you?
  • What marketing strategy might create this shape?

2.1 Ad Spend, Log Scale

# Histogram of ad spend with the x axis drawn on a log10 scale.
ggplot(data = df, mapping = aes(x = Ad_Spend)) +
  geom_histogram(bins = 30, colour = "white", fill = "grey70") +
  scale_x_log10() +
  labs(
    x = "Ad Spend ($, log10)",
    y = "Count",
    title = "Ad Spend on Log Scale"
  ) +
  theme_minimal()

Questions:

  • Is the distribution symmetric or skewed?
  • Does the log scale make it easier to see what is “typical”? Why?

3 Categorical Variation: Ad Channel Mix

# Bar chart of row counts per channel: bars are ordered by count and each
# bar is labelled with that channel's share of all rows.
df %>%
  count(Channel) %>%
  mutate(p = prop.table(n)) %>%
  ggplot(mapping = aes(x = reorder(Channel, n), y = n)) +
  geom_col(fill = "steelblue") +
  geom_text(
    mapping = aes(label = percent(p, accuracy = 0.1)),
    size = 3.5,
    vjust = -0.3
  ) +
  labs(x = "Channel", y = "Count", title = "Channel Mix") +
  theme_minimal()

Questions:

  • Is the data balanced across channels?
  • Which channels dominate?
  • Does the distribution make intuitive sense to you? Why or why not?

4 Identifying Outliers in Ad Spend

# Boxplot of raw ad spend, with the y axis formatted as dollar amounts.
ggplot(data = df, mapping = aes(y = Ad_Spend)) +
  geom_boxplot(fill = "lightblue") +
  scale_y_continuous(labels = dollar_format()) +
  labs(
    y = "Ad Spend ($)",
    title = "Boxplot of Ad Spend"
  ) +
  theme_minimal()

Questions:

  • Are there many outliers?
  • Are outliers generally large or small?
  • Should we drop outliers for a future analysis?

5 Identifying Outliers in Ad Spend (log scale)

# Boxplot of log-transformed ad spend.
# log1p(x) computes log(x + 1), so a spend of 0 maps to 0 instead of -Inf.
# The plotted values are log units, not dollars, so the axis is left with
# plain numeric labels (a dollar formatter here would show e.g. "$6" for a
# log value of 6), and the title and axis label state the transformation.
ggplot(df, aes(y = log1p(Ad_Spend))) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot of Ad Spend (log scale)",
       y = "log(Ad Spend + 1)") +
  theme_minimal()

Questions:

  • Does the log scale help identify more subtle outliers?