# Make sure the pacman package manager is installed. requireNamespace() only
# checks availability (it does not attach pacman), which is all that is needed
# because p_load() is called with an explicit pacman:: prefix.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")

pacman::p_load(
  tidyverse,   # for data wrangling and visualization
  here        # for referencing folders and files
)

# Fix the RNG seed so the sample() draws in this script are reproducible.
set.seed(1203)

Load browser data

# Read browser-domains.csv from the project's data folder into web_raw.
web_raw <- read_csv(
  here("05-regression-regularization/data","browser-domains.csv")
)

Read in the actual website names (used below to relabel the site factor) and the total spend data

# Site-name file: tab-separated and read without a header row, so the first
# line of the file is treated as data.
sitenames_raw <- read_tsv(
  here("05-regression-regularization/data", "browser-sites.txt"),
  col_names = FALSE
)

# Total-spend file, read as CSV.
spend_raw <- read_csv(
  here("05-regression-regularization/data", "browser-totalspend.csv")
)

Data preprocessing

# Vector of site names. pull() is called without a column argument, so it
# relies on its default column choice (presumably the file holds a single
# column of names -- confirm).
sitenames <- sitenames_raw %>%
  pull()

# Clean the visit records:
#   * id becomes a factor,
#   * the numeric site code is relabelled with its name (code i -> sitenames[i]),
#   * visitpercent is each row's share, in percent, of its id's total visits.
web <- web_raw %>%
  mutate(
    id = as_factor(id),
    # seq_along() rather than 1:length(): yields an empty sequence (instead of
    # c(1, 0)) should sitenames ever have length zero.
    site_name = factor(site, levels = seq_along(sitenames), labels = sitenames)
  ) %>%
  group_by(id) %>%
  mutate(visitpercent = 100 * visits / sum(visits)) %>%
  ungroup()
# Add log_spend, the natural logarithm of the spend column.
spend <- mutate(spend_raw, log_spend = log(spend))
# Reshape to wide form: one row per id and one column per site name holding
# visitpercent. Rows are ordered by id before widening, and id/site
# combinations that do not occur in web are filled with 0.
web_wide <- pivot_wider(
  arrange(select(web, id, site_name, visitpercent), id),
  names_from = site_name,
  values_from = visitpercent,
  values_fill = list(visitpercent = 0)
)

Draw a random sample of 1000 observations by 250 features

# Random subsample positions. The column draw comes first and the row draw
# second; under a fixed seed the order of the two draws determines the result.
# 250 distinct positions out of 1..1000 (columns)
vars_idx <- sample.int(n = 1000, size = 250)
# 1000 distinct positions out of 1..10000 (rows)
obs_idx  <- sample.int(n = 10000, size = 1000)

Generate the response and feature matrices

# Feature matrix: drop the id column, keep the sampled columns (vars_idx are
# positions among the columns that remain after id is removed), keep the
# sampled rows, and convert the result to a matrix.
Xweb <- web_wide %>%
  select(-id) %>%
  # all_of() marks vars_idx as an external vector of positions; a bare
  # vars_idx inside select() is deprecated and would be ambiguous if a column
  # happened to be named "vars_idx".
  select(all_of(vars_idx)) %>%
  slice(obs_idx) %>%
  as.matrix()

# Response matrix: the log_spend column of spend, restricted to the sampled
# row positions and converted to a one-column matrix.
# NOTE(review): rows are matched to the feature matrix purely by position
# (the same obs_idx is applied to web_wide) -- confirm that spend is ordered
# by id the same way web_wide is.
Yweb <- as.matrix(slice(select(spend, log_spend), obs_idx))

Save processed data

# Write each matrix to its own .Rdata file in the data folder; the objects
# are named by string through save()'s list argument.
save(
  list = "Xweb",
  file = here("05-regression-regularization/data","Xweb.Rdata")
)

save(
  list = "Yweb",
  file = here("05-regression-regularization/data","Yweb.Rdata")
)