# Package setup ----
# Bootstrap pacman, then load everything else through it.
# requireNamespace() avoids attaching pacman just to test for it.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(
  glmnet,    # for estimating ridge and lasso
  leaps,     # for best subset selection
  MASS,      # for simulating multivariate normal
  tidyverse  # for wrangling and plotting
)
# Pasted console output from the original session (kept for reference):
# package 'leaps' successfully unpacked and MD5 sums checked
# The downloaded binary packages are in
#   C:\Users\internet\AppData\Local\Temp\RtmpkFs8Vi\downloaded_packages
# Simulation parameters ----
set.seed(1203)  # reproducibility
n <- 100  # sample size
p <- 100  # number of features
s <- 3    # number of features with non-zero coefficients

# Generate response and features (independent design) ----
# All p features are i.i.d. standard normal; only the first s enter the mean.
X_ind <- matrix(rnorm(n * p), ncol = p)
beta <- c(rep(5, s), rep(0, p - s))
Y_ind <- X_ind %*% beta + rnorm(n)  # unit-variance noise
# Variance-covariance matrix for the first s (correlated) features ----
Sigma <- matrix(
  c(
    1, 0.9, 0.2,
    0.9, 1, 0,
    0.2, 0, 1
  ),
  nrow = s,
  ncol = s
)
cov2cor(Sigma)
# Pasted console output from the original session:
#      [,1] [,2] [,3]
# [1,]  1.0  0.9  0.2
# [2,]  0.9  1.0  0.0
# [3,]  0.2  0.0  1.0

# Generate response and features (correlated design) ----
# First s features are multivariate normal with covariance Sigma;
# the remaining p - s features are independent standard normal.
X_1 <- mvrnorm(n = n, mu = rep(0, s), Sigma = Sigma)
X_2 <- matrix(rnorm(n * (p - s)), ncol = p - s)
X_dep <- cbind(X_1, X_2)
beta <- c(rep(5, s), rep(0, p - s))
Y_dep <- X_dep %*% beta + rnorm(n)
# Regularization paths: independent design ----
fit_lasso <- glmnet(X_ind, Y_ind)  # alpha = 1 (lasso) is the glmnet default
plot(fit_lasso, xvar = "lambda", label = TRUE)
fit_ridge <- glmnet(X_ind, Y_ind, alpha = 0)  # alpha = 0 gives ridge
plot(fit_ridge, xvar = "lambda", label = TRUE)

# Tuning lambda by cross-validation: independent design ----
# Note: plot.cv.glmnet plots the CV curve against log(lambda) by itself;
# xvar/label are plot.glmnet arguments and do not apply here.
cv_lasso <- cv.glmnet(X_ind, Y_ind)
plot(cv_lasso)
cv_ridge <- cv.glmnet(X_ind, Y_ind, alpha = 0)
plot(cv_ridge)

# Regularization paths: correlated design ----
fit_lasso <- glmnet(X_dep, Y_dep)
plot(fit_lasso, xvar = "lambda", label = TRUE)
fit_ridge <- glmnet(X_dep, Y_dep, alpha = 0)
plot(fit_ridge, xvar = "lambda", label = TRUE)

# Tuning lambda by cross-validation: correlated design ----
cv_lasso <- cv.glmnet(X_dep, Y_dep)
plot(cv_lasso)
cv_ridge <- cv.glmnet(X_dep, Y_dep, alpha = 0)
plot(cv_ridge)
# Simulate data on an identity design (X = I, orthonormal) ----
# With X = I the OLS estimate equals y itself, which makes the
# shrinkage behavior of ridge and lasso easy to visualize.
y <- seq(-0.5, 0.5, by = 0.1)
n <- length(y)
X <- diag(n)
lambda <- 0.1  # penalty level at which coefficients are extracted

# OLS ----
ols <- lm(y ~ X - 1)  # -1: no intercept
b_ols <- coef(ols)

# Ridge ----
ridge <- glmnet(X, y, alpha = 0, standardize = TRUE, intercept = FALSE)
b_ridge <- coef(ridge, s = lambda)[2:(n + 1), 1]  # row 1 is the intercept; drop it

# Lasso ----
lasso <- glmnet(X, y, alpha = 1, standardize = TRUE, intercept = FALSE)
b_lasso <- coef(lasso, s = lambda)[2:(n + 1), 1]
# Plot each estimator's coefficients against the OLS coefficients ----
# Since X = I, b_ols == y, so the x-axis ranges over the true signal;
# ridge shrinks proportionally while lasso soft-thresholds.
bs <- cbind(b_ols, b_lasso, b_ridge) %>%
  as_tibble()
bs %>%
  ggplot(aes(x = b_ols)) +
  geom_line(aes(y = b_ols, color = "ols")) +
  geom_line(aes(y = b_lasso, color = "lasso")) +
  geom_line(aes(y = b_ridge, color = "ridge")) +
  labs(
    x = expression(beta^{OLS}),
    y = "coefficient estimate"
  )
# Best subset selection ----
# Exhaustive search over subsets of the identity-design columns,
# up to models of size 10, with no intercept.
subset <- regsubsets(x = X, y = y, intercept = FALSE, nvmax = 10)
summary(subset)
# Pasted console output from the original session:
# Subset selection object
# 11 Variables
#   Forced in Forced out
# b     FALSE      FALSE
# c     FALSE      FALSE
# d     FALSE      FALSE
# e     FALSE      FALSE
# f     FALSE      FALSE
# g     FALSE      FALSE
# h     FALSE      FALSE
# i     FALSE      FALSE
# j     FALSE      FALSE
# k     FALSE      FALSE
# 1 subsets of each size up to 10
# Selection Algorithm: exhaustive
#           a   b   c   d   e   f   g   h   i   j   k
# 1  ( 1 )  "*" " " " " " " " " " " " " " " " " " " " "
# 2  ( 1 )  "*" " " " " " " " " " " " " " " " " " " "*"
# 3  ( 1 )  "*" "*" " " " " " " " " " " " " " " " " "*"
# 4  ( 1 )  "*" "*" " " " " " " " " " " " " " " "*" "*"
# 5  ( 1 )  "*" "*" " " " " " " " " " " " " "*" "*" "*"
# 6  ( 1 )  "*" "*" "*" " " " " " " " " " " "*" "*" "*"
# 7  ( 1 )  "*" "*" "*" " " " " " " " " "*" "*" "*" "*"
# 8  ( 1 )  "*" "*" "*" "*" " " " " " " "*" "*" "*" "*"
# 9  ( 1 )  "*" "*" "*" "*" " " " " "*" "*" "*" "*" "*"
# 10  ( 1 ) "*" "*" "*" "*" "*" " " "*" "*" "*" "*" "*"