Load packages

if (!require("pacman")) install.packages("pacman")

pacman::p_load(
  glmnet,   # for estimating ridge and lasso
  leaps,    # for best subset selection
  MASS,     # for simulating multivariate normal
  tidyverse # for wrangling and plotting
)

Set seed for replication

set.seed(1203)

Set parameters

n <- 100 # sample size
p <- 100 # number of features
s <- 3   # number of features with non-zero coefficients

Generate data

Independent features

Generate response and features

X_ind <- matrix(rnorm(n * p), ncol = p) # i.i.d. standard normal features
beta  <- c(rep(5, s), rep(0, p - s))    # only the first s coefficients are non-zero
Y_ind <- X_ind %*% beta + rnorm(n)      # linear signal plus N(0, 1) noise
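
Note that with p = n = 100, ordinary least squares is already degenerate: adding an intercept gives 101 parameters for 100 observations, so lm() interpolates the data and leaves at least one coefficient unestimable. A quick check (fit_ols is just a scratch name here):

fit_ols <- lm(Y_ind ~ X_ind)
sum(is.na(coef(fit_ols)))  # at least one coefficient cannot be estimated
summary(fit_ols)$r.squared # R^2 of 1: the fit memorizes the data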

Dependent features

Variance-covariance matrix

Sigma <- matrix(
  c(1,    0.9, 0.2,
    0.9,  1,   0,
    0.2,  0,   1  ),
  nrow = s,
  ncol = s
)

cov2cor(Sigma)
     [,1] [,2] [,3]
[1,]  1.0  0.9  0.2
[2,]  0.9  1.0  0.0
[3,]  0.2  0.0  1.0
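
The first two features are strongly correlated (0.9), the first and third are weakly correlated (0.2), and the second and third are uncorrelated.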

Generate response and features

X_1   <- mvrnorm(n = n, mu = rep(0, s), Sigma = Sigma) # correlated signal features
X_2   <- matrix(rnorm(n * (p - s)), ncol = p - s)      # independent noise features
X_dep <- cbind(X_1, X_2)
beta  <- c(rep(5, s), rep(0, p - s))
Y_dep <- X_dep %*% beta + rnorm(n)
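
As a sanity check, the empirical correlations of the simulated signal features should be close to Sigma:

round(cor(X_1), 2)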

Estimate regularization path

Independent features

Regularization path

fit_lasso <- glmnet(X_ind, Y_ind)
plot(fit_lasso, xvar = "lambda", label = TRUE)

fit_ridge <- glmnet(X_ind, Y_ind, alpha = 0)
plot(fit_ridge, xvar = "lambda", label = TRUE)

Tuning \(\lambda\)

cv_lasso <- cv.glmnet(X_ind, Y_ind)
plot(cv_lasso) # CV error as a function of log(lambda)

cv_ridge <- cv.glmnet(X_ind, Y_ind, alpha = 0)
plot(cv_ridge)
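
The cv.glmnet object stores the \(\lambda\) minimizing the cross-validated error (lambda.min) and the largest \(\lambda\) within one standard error of that minimum (lambda.1se). A minimal sketch of extracting the tuned coefficients (b_hat is a scratch name):

cv_lasso$lambda.min
cv_lasso$lambda.1se

b_hat <- coef(cv_lasso, s = "lambda.1se")[-1, 1] # drop the intercept
which(b_hat != 0) # with independent features, expect the first three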

Dependent features

Regularization path

fit_lasso <- glmnet(X_dep, Y_dep)
plot(fit_lasso, xvar = "lambda", label = TRUE)

fit_ridge <- glmnet(X_dep, Y_dep, alpha = 0)
plot(fit_ridge, xvar = "lambda", label = TRUE)
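
Compared with the independent case, the lasso tends to select only one variable from the highly correlated pair (features 1 and 2) and zero out the other, while ridge keeps both and shares the coefficient between them.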

Tuning \(\lambda\)

cv_lasso <- cv.glmnet(X_dep, Y_dep)
plot(cv_lasso)

cv_ridge <- cv.glmnet(X_dep, Y_dep, alpha = 0)
plot(cv_ridge)
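
The same extraction as before shows which of the three true predictors survive under correlation (b_hat is again a scratch name):

b_hat <- coef(cv_lasso, s = "lambda.1se")[-1, 1]
which(b_hat != 0) # the correlated pair may contribute only one selected feature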

Thresholding
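
With an orthonormal design, both estimators have closed forms in terms of the OLS estimate (standard textbook results; glmnet scales its penalty by \(1/n\), so its \(\lambda\) is not on exactly the same scale, but the picture is qualitatively identical):

\[
\hat{\beta}_j^{ridge} = \frac{\hat{\beta}_j^{OLS}}{1 + \lambda},
\qquad
\hat{\beta}_j^{lasso} = \operatorname{sign}\left(\hat{\beta}_j^{OLS}\right)\left(\left|\hat{\beta}_j^{OLS}\right| - \lambda\right)_{+}
\]

Ridge shrinks every coefficient by the same proportion, whereas the lasso soft-thresholds: any coefficient with \(|\hat{\beta}_j^{OLS}| \le \lambda\) is set exactly to zero. The identity design below makes this visible because \(\hat{\beta}^{OLS} = y\).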

Simulate data

y      <- seq(-0.5, 0.5, 0.1) # these values play the role of the OLS estimates
n      <- length(y)
X      <- diag(n)             # identity (orthonormal) design
lambda <- 0.1

OLS

ols   <- lm(y ~ X - 1) # no intercept; with X = I the OLS coefficients equal y
b_ols <- coef(ols)

Ridge

ridge   <- glmnet(X, y, alpha = 0, standardize = TRUE, intercept = FALSE)
b_ridge <- coef(ridge, s = lambda)[2:(n + 1), 1] # drop the intercept row

Lasso

lasso   <- glmnet(X, y, alpha = 1, standardize = TRUE, intercept = FALSE)
b_lasso <- coef(lasso, s = lambda)[2:(n + 1), 1] # drop the intercept row

Plot results

bs <- cbind(b_ols, b_lasso, b_ridge) %>%
  as_tibble()

bs %>%
  ggplot(aes(x = b_ols)) +
  geom_line(aes(y = b_ols,   color = "ols")) +
  geom_line(aes(y = b_lasso, color = "lasso")) +
  geom_line(aes(y = b_ridge, color = "ridge")) +
  labs(
    x = expression(beta^{OLS}),
    y = "coefficient estimate",
    color = "estimator"
  )
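
In the resulting plot, the OLS estimates lie on the 45-degree line, ridge traces a flatter line through the origin (proportional shrinkage), and the lasso is flat at zero in a band around the origin before rising linearly (soft thresholding).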

Best subset selection

subset <- regsubsets(x = X, y = y, intercept = FALSE, nvmax = 10)
summary(subset)
Subset selection object
11 Variables 
  Forced in Forced out
a     FALSE      FALSE
b     FALSE      FALSE
c     FALSE      FALSE
d     FALSE      FALSE
e     FALSE      FALSE
f     FALSE      FALSE
g     FALSE      FALSE
h     FALSE      FALSE
i     FALSE      FALSE
j     FALSE      FALSE
k     FALSE      FALSE
1 subsets of each size up to 10
Selection Algorithm: exhaustive
          a   b   c   d   e   f   g   h   i   j   k  
1  ( 1 )  "*" " " " " " " " " " " " " " " " " " " " "
2  ( 1 )  "*" " " " " " " " " " " " " " " " " " " "*"
3  ( 1 )  "*" "*" " " " " " " " " " " " " " " " " "*"
4  ( 1 )  "*" "*" " " " " " " " " " " " " " " "*" "*"
5  ( 1 )  "*" "*" " " " " " " " " " " " " "*" "*" "*"
6  ( 1 )  "*" "*" "*" " " " " " " " " " " "*" "*" "*"
7  ( 1 )  "*" "*" "*" " " " " " " " " "*" "*" "*" "*"
8  ( 1 )  "*" "*" "*" "*" " " " " " " "*" "*" "*" "*"
9  ( 1 )  "*" "*" "*" "*" " " " " "*" "*" "*" "*" "*"
10  ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*"
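
Best subset selection is a form of hard thresholding: a coefficient is either kept at its least-squares value within the chosen subset or dropped entirely. A sketch of picking the subset size by BIC and extracting the corresponding estimates (sub_summary and best_size are scratch names; coef.regsubsets takes the model size via id):

sub_summary <- summary(subset)
best_size   <- which.min(sub_summary$bic) # subset size with the lowest BIC
coef(subset, id = best_size)              # hard-thresholded estimates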