# Setup ----------------------------------------------------------------------------------
    # Install 'pacman' package
    #install.packages("pacman")
    # Load 'pacman'
    library(pacman)
    # Load other packages
    p_load(tidyverse, caret, parallel, tictoc, here, magrittr)

# Parameters -----------------------------------------------------------------------------
    # 0. Set seed
    set.seed(12345)
    # 1. Set simulation parameters
    N = 1000
    T = 50

# Generate population: Indices -----------------------------------------------------------
  # Create the indices of our population
  pop_df = expand_grid(
    i = 1:N,
    t = 1:T
  )
  # Check our population tibble's dimensions
  pop_df %>% dim()

# Generate population: Disturbances, covariates, response --------------------------------
  # Option 1: Generate disturbances and x with serial correlation
  # Disturbance for (i,t): Correlated within time (not across individuals)
  pop_df %<>%
    group_by(i) %>%
    mutate(
      x = arima.sim(model = list(ar = c(0.9)), n = T) %>% as.numeric(),
      e_it = arima.sim(model = list(ar = c(0.9)), n = T) %>% as.numeric()
    ) %>%
    ungroup()
  # Disturbance for (i): Constant within individual
  pop_df %<>%
    group_by(i) %>%
    mutate(e_i = runif(n = 1, min = -100, max = 100)) %>%
    ungroup()
  # # Option 2: Generate x_i, e_i, e_it without serial correlation
  # pop_df %<>% mutate(
  #   x = rnorm(T * N),
  #   e_i = runif(T * N, min = -100, max = 100),
  #   e_it = rnorm(T * N)
  # )
# NOTE: Continue here...
  # Calculate 'y'
  pop_df %<>% mutate(
    y = e_i + 3 * x - 2 * x^2 + 0.1 * x^4 + e_it
  )
  # Drop disturbances
  pop_df %<>% select(i, t, y, x)

  # Plot a few observations' time series
  ggplot(data = pop_df %>% filter(i < 5), aes(x = t, y = y, color = as.factor(i))) +
  geom_line() +
  geom_point() +
  theme_minimal(base_size = 16) +
  theme(legend.position = "none")

  # Plot x,y for t=1:3
  ggplot(data = pop_df %>% filter(i < 10 & t < 4), aes(x = x, y = y, color = as.factor(i))) +
  geom_line() +
  geom_point() +
  theme_minimal(base_size = 16) +
  theme(legend.position = "none")

# One iteration --------------------------------------------------------------------------
  # Define sample size
  n_i = 50
  # Draw sample (will be an input of our function)
  i_sampled = sample.int(N, size = n_i)
  # Draw a sample
  sample_df = pop_df %>% filter(i %in% i_sampled)

  # Define number of folds
  k_folds = 5
  # k-fold CV
  cv_output = train(
    # The relationship: y as a function of x
    y ~ .,
    # The method of estimating the model (linear regression)
    method = "knn",
    # The training data (which will be used for cross validation)
    data = sample_df %>% select(y, x),
    # Controls for the model training: k-fold cross validation
    trControl = trainControl(method = "cv", number = k_folds),
    # Allow cross validation to choose best value of K (# nearest neighbors)
    tuneGrid = expand.grid(k = seq(1, 500, by = 5))
  )

  # Check output from train()
  cv_output
  # And plot it
  ggplot(cv_output) + theme_minimal()

  # If you want the results as a data frame
  cv_output$results
  cv_output$resample

  # Subset to unseen data
  test_df = pop_df %>% filter(i %>% is_in(i_sampled) %>% not())
  # Make predictions on
  predictions = predict(
    cv_output,
    newdata = test_df
  )
  # Calculate RMSE in full population
  rmse_true = mean((test_df$y - predictions)^2) %>% sqrt()
  rmse_true

  # Plot the true test MSE against the estimated MSE
  ggplot(cv_output) +
  geom_segment(
    data = cv_output$results %>% filter(RMSE == min(RMSE)),
    aes(x = k, xend = k, y = RMSE, yend = rmse_true),
    color = "orange",
    linetype = "longdash",
    size = 0.3
  ) +
  geom_point(
    data = cv_output$results %>% filter(RMSE == min(RMSE)),
    aes(x = k, y = RMSE),
    color = "orange",
    size = 3
  ) +
  theme_minimal(base_size = 16) +
  geom_hline(yintercept = rmse_true, color = "blue") +
  scale_y_continuous("RMSE") +
  scale_x_continuous("K (# of nearest neighbors)")

# Function: One iteration ----------------------------------------------------------------
  # Our function for a single iteration of the simulation
  sim_fun = function(iter, n_i = 50, k_folds = 5) {
    # Draw sample
    i_sampled = sample.int(N, size = n_i)
    # Draw a sample
    sample_df = pop_df %>% filter(i %in% i_sampled)
    # k-fold CV
    cv_output = train(
      # The relationship: y as a function of w and x
      y ~ .,
      # The method of estimating the model (linear regression)
      method = "knn",
      # The training data (which will be used for cross validation)
      data = sample_df %>% select(y, x),
      # Controls for the model training: k-fold cross validation
      trControl = trainControl(method = "cv", number = k_folds),
      # Allow cross validation to choose best value of K (# nearest neighbors)
      tuneGrid = expand.grid(k = seq(1, 200, by = 10))
    )
    # Find the estimated MSE
    mse_est = mean(cv_output$resample$RMSE^2)
    # Subset to unseen data
    test_df = pop_df %>% filter(i %>% is_in(i_sampled) %>% not())
    # Make predictions on true test data
    predictions = predict(
      cv_output,
      newdata = test_df
    )
    # Calculate RMSE in full population
    mse_true = mean((test_df$y - predictions)^2)
    # Output results
    data.frame(
      iter,
      k = cv_output$bestTune$k,
      mse_est,
      mse_true
    )
  }

# # Example: Run the function once ---------------------------------------------------------
#   # Set seed
#   set.seed(1)
#   # Run (and time) one iteration
#   tic()
#   sim_fun(iter = 1)
#   toc()

# Run the simulation ---------------------------------------------------------------------
  # Set seed
  set.seed(123)
  # Run simulation 1,000 times in parallel (and time)
  tic()
  sim_df = mclapply(
    X = 1:1e3,
    FUN = sim_fun,
    mc.cores = 11
  ) %>% bind_rows()
  toc()
  # Save dataset
  # write_csv(x = sim_df, path = "cv-sim-knn.csv")
  write_csv(x = sim_df, path = "cv-sim-knn-uncorrelated.csv")