# Bootstrap the pacman package manager if it is not installed yet.
# requireNamespace() only tests whether the package is available; unlike
# require() it does not attach pacman to the search path, which is not needed
# because the single call below is namespace-qualified.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")

pacman::p_load(
  tidyverse,   # for data wrangling and visualization
  here        # for referencing folders and files
)

# Fix the RNG seed so the random row/column subsample drawn further down
# (vars_idx, obs_idx) is reproducible from run to run.
set.seed(1203)

# Load browser data

# Raw browsing table, read from the project's data folder. Later steps use
# its id, site and visits columns.
web_raw <- read_csv(
  here("05-regression-regularization/data", "browser-domains.csv")
)

# Read in the actual website names (used later to relabel the site factor) and the total-spend data

# Site-name lookup file. It is read with col_names = FALSE, so its first line
# is treated as data rather than as a header.
sitenames_raw <- read_tsv(
  here("05-regression-regularization/data", "browser-sites.txt"),
  col_names = FALSE
)

# Spend table. Later steps use its id and spend columns.
spend_raw <- read_csv(
  here("05-regression-regularization/data", "browser-totalspend.csv")
)

# Data preprocessing

# Vector of site names. pull() with no column argument extracts the last
# column of sitenames_raw.
# NOTE(review): presumably the file holds a single column with one name per
# site code, listed in code order -- confirm against browser-sites.txt.
sitenames <- sitenames_raw %>%
  pull()

# Browsing table with three derived columns:
#   id           - converted to a factor
#   site_name    - the site code relabelled with its name: code k maps to
#                  sitenames[k]; codes outside that range become NA
#   visitpercent - each row's visits as a percentage of the total visits
#                  recorded for the same id
web <- web_raw %>%
  mutate(
    id = as_factor(id),
    site_name = factor(
      site,
      # seq_along() rather than 1:length(): the latter yields c(1, 0) when
      # sitenames is empty
      levels = seq_along(sitenames),
      labels = sitenames
    )
  ) %>%
  group_by(id) %>%
  mutate(visitpercent = 100 * visits / sum(visits)) %>%
  ungroup()

# Spend table with the natural log of spend added and id converted to a
# factor, mirroring the id conversion applied to the browsing table.
# NOTE(review): log() returns -Inf for a spend of 0 -- confirm that spend is
# strictly positive in the input.
spend <- spend_raw %>%
  mutate(
    log_spend = log(spend),
    id = as_factor(id)
  )

# Wide layout: one row per id, one column per site name, cells holding
# visitpercent. id/site combinations that do not occur are filled with 0.
web_wide <- web %>%
  select(id, site_name, visitpercent) %>%
  arrange(id) %>%
  pivot_wider(
    names_from = site_name,
    values_from = visitpercent,
    values_fill = list(visitpercent = 0)
  )

# Random sample of 1200 observations x 200 features

# Positions of the sampled feature columns: 200 distinct values from 1..1000.
# Drawn first, so the random-number stream is consumed in the same order.
vars_idx <- sample.int(1000, size = 200)

# Positions of the sampled rows: 1200 distinct values from 1..10000.
obs_idx <- sample.int(10000, size = 1200)

# Generate the response (Yweb) and feature (Xweb) tables for the sample, and the full data set (browser)

# Feature table of the subsample: sampled rows x sampled site columns.
#
# The browsing shares are first joined onto the ids of `spend`, so the rows of
# Xweb follow the row order of `spend` and line up with Yweb below on every
# row. Slicing `web_wide` directly by position is only correct when `web_wide`
# (sorted by id) and `spend` (file order) happen to list the same ids in the
# same order. Ids without any row in `web_wide` get NA in the site columns,
# the same as in `browser` further down.
#
# all_of() marks vars_idx as an external vector of column positions; passing
# it bare to select() is deprecated. Positions are counted after id has been
# dropped, so they refer to site columns only.
Xweb <- spend %>%
  select(id) %>%
  left_join(web_wide, by = "id") %>%
  select(-id) %>%
  select(all_of(vars_idx)) %>%
  slice(obs_idx)

# Response of the subsample: log spend for the same sampled rows of `spend`.
Yweb <- spend %>%
  select(log_spend) %>%
  slice(obs_idx)

# Subsample with the response as the first column, followed by the features.
browser_sample <- bind_cols(Yweb, Xweb)

# Full data set: every row of `spend` with its browsing shares attached by id.
# The id and raw spend columns are dropped, which leaves log_spend as the
# response column next to the site columns.
browser <- spend %>%
  left_join(web_wide, by = "id") %>%
  select(-id, -spend)

# Save processed data

# Write the full data set to the project's data folder, without row names.
write.csv(
  x = browser,
  file = here("05-regression-regularization/data", "browser-all.csv"),
  row.names = FALSE
)

# Write the random subsample next to it, again without row names.
write.csv(
  x = browser_sample,
  file = here("05-regression-regularization/data", "browser-sample.csv"),
  row.names = FALSE
)