Homework 3

In the last two assignments, you used a set of variables to predict an outcome. Now, try to see whether there are any clusters among those variables. That is, use the clustering techniques we talked about in class to try to uncover any structure underlying the predictors in your model. Describe what you find. This assignment is due December 22, 2020.

I use an GLRM to make a political profile of some variables that showed up as relevant in previous assignments. These variables are categorical, so when used in analysis, they increased the dimensionality of the data. These variables are related, so it would make sense to cluster them together.

## Importing data----
getwd()
anes16<-import("./ML/ANES/raw/anes_timeseries_2016_dta/anes_timeseries_2016.dta")

d1 <- anes16 %>%
  select(V161007, V161267x,V161342,V161310x,V161270,V161010d,
         V161268, V161115, V161217:V161223, V161326:V161329,
         V161522, ends_with('x')) %>% 
  rename(internet= V161007,
         int_home = V161326,
         age     = V161267x,
         gender  = V161342,
         race    = V161310x,
         education = V161270,
         region = V161010d,
         income = V161361x,
         trust_peple = V161219,
         gov_waste = V161217,
         unemp = V161142x,
         marital = V161268,
         Pres_for_rel = V161084x,
         occup = V161276x) %>% 
  # remove_noresponse() %>%
  mutate( internet = case_when(
    internet == 2 ~ 0,
    internet == 1 ~ 1),
    int_home = case_when(
      int_home == 2 ~ 0,
      int_home == 1 ~ 1)
  ) %>% 
  # filter(!is.na(int_home)) %>%
  filter(!is.na(internet)) %>%
  # drop_na(internet) %>% #I could have also used
  factorize() #%>%
# reduce_fct_levels()


# Wrangling data ----------------------------------------------------------

library(recipes)
blueprint <- recipe(~ ., data=d1) %>%
  step_nzv(all_predictors())  %>% 
  step_knnimpute(all_numeric(), neighbors = 6) %>%
  step_other(all_nominal(), threshold = 0.01,
             other = "other") %>%
  step_unknown(all_nominal()) %>%
  step_YeoJohnson(all_numeric()) %>% 
  step_scale(all_numeric()) %>%
  step_dummy(all_nominal(), one_hot = FALSE)
prep_data <- recipes::prep(blueprint, d1)
tmp_data <- bake(prep_data, 
                 new_data=d1)


# Select variables to cluster
pol_prof <- tmp_data %>%
  select(starts_with(c('trust_peple',
                       'gov_waste',
                       'Pres_for_re'))) #%>%
  # as.matrix()

library(h2o)
h2o.no_progress()  # turn off progress bars
h2o.init(max_mem_size = "5g")  # connect to H2O instance
pp_h <- as.h2o(pol_prof)
split <- h2o.splitFrame(pp_h, ratios = 0.6, seed = 123)
train <- split[[1]]
valid <- split[[2]]
# Create hyperparameter search grid
params <- expand.grid(
  regularization_x = c("None", "NonNegative", "L1"),
  regularization_y = c("None", "NonNegative", "L1"),
  gamma_x = seq(0, 1, by = .25),
  gamma_y = seq(0, 1, by = .25),
  error = 0,
  stringsAsFactors = FALSE
)

for(i in seq_len(nrow(params))) {
  # Create model
  glrm_pp <- h2o.glrm(
    training_frame = train,
    k = 2, 
    loss = "Quadratic",
    regularization_x = params$regularization_x[i], 
    regularization_y = params$regularization_y[i],
    gamma_x = params$gamma_x[i],
    gamma_y = params$gamma_y[i],
    transform = "STANDARDIZE", 
    max_runtime_secs = 1000,
    seed = 333
  )
  # Predict on validation set and extract error
  validate <- h2o.performance(glrm_pp, valid)
  params$error[i] <- validate@metrics$numerr
}

params %>% arrange(error) %>% head(10)

glrm_pp <- h2o.glrm(
  training_frame = pp_h,
  k = 2, 
  loss = "Quadratic",
  regularization_x = "None", 
  regularization_y = "NonNegative",
  gamma_x = 0,
  gamma_y = 0,
  transform = "STANDARDIZE", 
  max_runtime_secs = 1000,
  seed = 333
)

glrm_pp@model$importance

labels <- colnames(glrm_pp@model$archetypes)
t(glrm_pp@model$archetypes) %>% 
  as_tibble() %>% 
  mutate(label = labels) %>% 
  arrange(desc(Arch1),desc(Arch2))


p1 <- t(glrm_pp@model$archetypes) %>% 
  as.data.frame() %>% 
  mutate(feature = row.names(.)) %>%
  ggplot(aes(Arch1, reorder(feature, Arch1))) +
  geom_point()

p2 <- t(glrm_pp@model$archetypes) %>% 
  as.data.frame() %>% 
  mutate(feature = row.names(.)) %>%
  ggplot(aes(Arch2, reorder(feature, Arch2))) +
  geom_point()


gridExtra::grid.arrange(p1, p2, nrow = 1)

regularization_x	regularization_y	gamma_x	error
None	NonNegative	0.00	4970.378
NonNegative	NonNegative	0.00	4970.378
L1	NonNegative	0.00	4970.378
None	NonNegative	0.25	4970.378
NonNegative	NonNegative	0.25	4970.378
L1	NonNegative	0.25	4970.378
None	NonNegative	0.50	4970.378
NonNegative	NonNegative	0.50	4970.378
L1	NonNegative	0.50	4970.378
None	NonNegative	0.75	4970.378

	pc1	pc2
Standard deviation	1.0383576	0.7750300
Proportion of Variance	0.0898489	0.0500560
Cumulative Proportion	0.0898489	0.1399048

Archetypes of GLRM

It seems that the Archetypes are capturing two types of people. On one hand, they disapprove how the president was handling foreign relationships, consider government is wasting money somewhat, and trust people half and most the time. On the other, there’s people that think government does not waste very much, but never trusts people.

Nevertheless, in terms of the variation, the archetypes are not capturing a lot of variation.

Homework 3

Hans Martinez

December 22, 2020