########################################################
##### ResEcon 703: Topics in Advanced Econometrics #####
#####    Week 12: Individual-Level Coefficients    #####
#####         Matt Woerman, UMass Amherst          #####
########################################################

### Individual-Level Coefficients R Example ------------------------------------

### Load and look at dataset
## Load tidyverse and mlogit
library(tidyverse)
library(mlogit)
## Load dataset from mlogit package
data('HC', package = 'mlogit')
## Look at dataset
tibble(HC)

### Format dataset
## Pivot into a long dataset
hvac_long <- HC %>% 
  mutate(id = 1:n()) %>% 
  pivot_longer(c(starts_with('ich.'), starts_with('och.')), 
               names_to = c('cost', 'alt'), names_sep = '[.]',
               values_to = 'value') %>% 
  pivot_wider(names_from = cost, values_from = value) %>% 
  mutate(choice = (depvar == alt)) %>% 
  select(-depvar)
## Look at long dataset
tibble(hvac_long)
## Combine heating and cooling costs into one variable
hvac_clean <- hvac_long %>% 
  mutate(ac = 1 * (nchar(alt) == 3), 
         ic = ich + ac * icca,
         oc = och + ac * occa) %>% 
  select(id, alt, choice, ac, ic, oc, income) %>% 
  arrange(id, alt)
## Look at cleaned dataset
tibble(hvac_clean)
## Convert cleaned dataset to dfidx format
hvac_dfidx <- dfidx(hvac_clean, shape = 'long', 
                    choice = 'choice', idx = c('id', 'alt'))
## Look at data in dfidx format
tibble(hvac_dfidx)

### Model HVAC choice as a mixed logit using mlogit
## Model choice using ac, ic, and oc with normal random coefficients
model_1 <- mlogit(formula = choice ~ ac + ic + oc | 0 | 0, 
                  data = hvac_dfidx, 
                  reflevel = 'hpc',
                  rpar = c(ac = 'n', ic = 'n', oc = 'n'), 
                  R = 1000, seed = 703)
## Summarize model results
summary(model_1)

### Find the mean coefficient for each chosen alternative using mlogit
## Calculate mean coefficient for each household
coefs_1 <- model_1 %>% 
  fitted(type = 'parameters') %>% 
  as_tibble() %>% 
  rename(ac_coef = ac, ic_coef = ic, oc_coef = oc)
coefs_1
## Average coefficient over all households with each HVAC system
hvac_clean %>% 
  filter(choice == 1) %>% 
  cbind(coefs_1) %>% 
  group_by(alt) %>% 
  summarize(ac_coef = mean(ac_coef), 
            ic_coef = mean(ic_coef), 
            oc_coef = mean(oc_coef),
            .groups = 'drop')

### Model HVAC choice as a mixed logit coded by hand
## Set seed for replication
set.seed(703)
## Draw standard normal random variables for each household
draws_hh <- map(1:250, ~ tibble(ac_draw = rnorm(100),
                                ic_draw = rnorm(100),
                                oc_draw = rnorm(100)))
## Split data into list by household
data_hh <- hvac_clean %>% 
  group_by(id) %>% 
  group_split()
## Function to simulate choice probabilities for an individual household
sim_probs_ind <- function(params, draws_ind, data_ind){
  ## Select relevant variables and convert into a matrix [J x K]
  data_matrix <- data_ind %>% 
    select(ac, ic, oc) %>% 
    as.matrix()
  ## Transform random coefficients based on parameters [R x K]
  coef_matrix <- draws_ind %>% 
    mutate(ac_coef = params[1] + params[4] * ac_draw,
           ic_coef = params[2] + params[5] * ic_draw,
           oc_coef = params[3] + params[6] * oc_draw) %>% 
    select(ac_coef, ic_coef, oc_coef) %>% 
    as.matrix()
  ## Calculate representative utility for each alternative in each draw [R x J]
  utility <- (coef_matrix %*% t(data_matrix)) %>% 
    pmin(700) %>% 
    pmax(-700)
  ## Sum the exponential of utility over alternatives [R x 1]
  prob_denom <- utility %>% 
    exp() %>% 
    rowSums()
  ## Calculate the conditional probability for each alternative and draw [R x J]
  cond_prob <- exp(utility) / prob_denom
  ## Calculate simulated choice probabilities as means over all draws [1 x J]
  sim_prob <- colMeans(cond_prob)
  ## Add simulated probability to initial dataset
  data_ind_out <- data_ind %>% 
    mutate(prob = sim_prob)
  ## Return initial dataset with simulated probability variable
  return(data_ind_out)
}
## Function to calculate simulated log-likelihood
sim_ll_fn <- function(params, draws_list, data_list){
  ## Simulate probabilities for each individual household
  data_sim_ind <- map2(.x = draws_list, .y = data_list,
                       .f = ~ sim_probs_ind(params = params, 
                                            draws_ind = .x, 
                                            data_ind = .y))
  ## Combine individual datasets into one
  data_sim <- data_sim_ind %>% 
    bind_rows()
  ## Calculate log of simulated probability for the chosen alternative
  data_sim <- data_sim %>% 
    filter(choice == TRUE) %>% 
    mutate(log_prob = log(prob))
  ## Calculate the simulated log-likelihood
  sim_ll <- sum(data_sim$log_prob)
  ## Return the negative of simulated log-likelihood
  return(-sim_ll)
}
## Maximize the log-likelihood function
model_2 <- optim(par = c(6.53, -0.17, -1.04, 0, 0, 0), fn = sim_ll_fn,
                 draws_list = draws_hh, data_list = data_hh, 
                 method = 'BFGS', hessian = TRUE,
                 control = list(trace = 1, REPORT = 5))
## Report model results
model_2

### Find the mean coefficient for each chosen alternative coded by hand
## Function to simulate individual coefficients for one individual
calc_mean_coefs <- function(params, draws_ind, data_ind){
  ## Select relevant variables and convert into a matrix [J x K]
  data_matrix <- data_ind %>% 
    select(ac, ic, oc) %>% 
    as.matrix()
  ## Transform random draws into coefficients based on parameters
  coef <- draws_ind %>% 
    mutate(ac_coef = params[1] + params[4] * ac_draw,
           ic_coef = params[2] + params[5] * ic_draw,
           oc_coef = params[3] + params[6] * oc_draw) %>% 
    select(ac_coef, ic_coef, oc_coef)
  ## Convert coefficients tibble to a matrix [R x K]
  coef_matrix <- as.matrix(coef)
  ## Calculate representative utility for each alternative in each draw [R x J]
  utility <- (coef_matrix %*% t(data_matrix)) %>% 
    pmin(700) %>% 
    pmax(-700)
  ## Sum the exponential of utility over alternatives [R x 1]
  prob_denom <- utility %>% 
    exp() %>% 
    rowSums()
  ## Calculate the conditional probability for each alternative and draw [R x J]
  cond_prob <- exp(utility) / prob_denom
  ## Calculate the numerator of the draw weights as prob of chosen alt [R x 1]
  weights_num <- c(cond_prob %*% data_ind$choice)
  ## Calculate the draw weights [R x 1]
  weights <- weights_num / sum(weights_num)
  ## Add draw weights to dataset of coefficients
  coef <- coef %>%
    mutate(weight = weights)
  ## Calculate weighted mean for each coefficient
  coef_means <- coef %>%
    summarize(ac_coef_mean = sum(ac_coef * weight),
              ic_coef_mean = sum(ic_coef * weight),
              oc_coef_mean = sum(oc_coef * weight))
  ## Add individual coefficient means to initial dataset
  data_ind_out <- data_ind %>% 
    bind_cols(coef_means)
  ## Return initial dataset with simulated probability variable
  return(data_ind_out)
}
## Calculate mean coefficients for each individual
data_2_ind <- map2(.x = draws_hh, .y = data_hh,
                   .f = ~ calc_mean_coefs(params = model_2$par, 
                                          draws_ind = .x, 
                                          data_ind = .y))
## Combine list of data into one tibble
data_2 <- data_2_ind %>% 
  bind_rows()
## Calculate mean coefficients by chosen alternative
data_2 %>%
  filter(choice == 1) %>% 
  group_by(alt) %>% 
  summarize(ac_coef = mean(ac_coef_mean),
            ic_coef = mean(ic_coef_mean),
            oc_coef = mean(oc_coef_mean),
            .groups = 'drop')