if (!require("pacman")) install.packages("pacman")
pacman::p_load(
tidyverse, # for data wrangling and visualization
here # for referencing folders and files
)
# Load browser data
web_raw <- here("05-regression-regularization/data","browser-domains.csv") %>%
read_csv()
# Read in the actual website names and relabel the site factor
sitenames_raw <- here("05-regression-regularization/data", "browser-sites.txt") %>%
read_tsv(col_names = FALSE)
spend_raw <- here("05-regression-regularization/data", "browser-totalspend.csv") %>%
read_csv()
# Data preprocessing
sitenames <- sitenames_raw %>%
pull()
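# Sanity check (assumption): every site code appearing in the browsing data
# should have a matching name in browser-sites.txt
stopifnot(max(web_raw$site) <= length(sitenames))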
web <- web_raw %>%
mutate(
id = as_factor(id),
site_name = factor(
site,
levels = 1:length(sitenames),
labels = sitenames
)
) %>%
group_by(id) %>%
mutate(visitpercent = 100 * visits / sum(visits)) %>%
ungroup()
spend <- spend_raw %>%
mutate(
log_spend = log(spend),
id = as_factor(id)
)
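# Sanity check (assumption): spending must be strictly positive for log() to be finite
stopifnot(all(spend$spend > 0))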
web_wide <- web %>%
select(id, site_name, visitpercent) %>%
arrange(id) %>%
pivot_wider(
names_from = site_name,
values_from = visitpercent,
values_fill = list(visitpercent = 0)
)
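# Fix the RNG seed so the random sub-sample below is reproducible
# (the seed value itself is an arbitrary choice)
set.seed(42)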
# Take a random sub-sample: 1,200 observations x 200 features
vars_idx <- sample(1:1000, 200)
obs_idx <- sample(1:10000, 1200)
# Generate the response and feature matrices
Xweb <- web_wide %>%
select(-id) %>%
select(all_of(vars_idx)) %>%
slice(obs_idx)
Yweb <- spend %>%
select(log_spend) %>%
slice(obs_idx)
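# Sanity check (assumption): Xweb and Yweb are matched row-by-row only if
# web_wide and spend list the households (id) in the same order
stopifnot(identical(
  as.character(web_wide$id[obs_idx]),
  as.character(spend$id[obs_idx])
))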
browser_sample <- bind_cols(Yweb, Xweb)
browser <- spend %>%
left_join(web_wide, by = "id") %>%
select(-id, -spend)
# Save processed data
write.csv(
browser,
file = here("05-regression-regularization/data","browser-all.csv"),
row.names = FALSE
)
write.csv(
browser_sample,
file = here("05-regression-regularization/data","browser-sample.csv"),
row.names = FALSE
)
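# Optional: read the saved sample back in to confirm it round-trips as expected
here("05-regression-regularization/data", "browser-sample.csv") %>%
  read_csv() %>%
  glimpse()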