Features Engineering

Author

Rami Krispin

Published

October 17, 2024

Data

Loading the saved data:

us_gas <- readRDS(file = "./data/us_gas.RDS")

head(us_gas)

  area_name area_code process            process_name       date value units
1  Arkansas        AR     VRS Residential Consumption 1993-01-01  8141  MMCF
2  Arkansas        AR     VRS Residential Consumption 1996-06-01  1202  MMCF
3  Arkansas        AR     VRS Residential Consumption 1997-10-01  1345  MMCF
4  Arkansas        AR     VRS Residential Consumption 2000-04-01  2439  MMCF
5  Arkansas        AR     VRS Residential Consumption 2003-04-01  3043  MMCF
6  Arkansas        AR     VRS Residential Consumption 2003-11-01  2065  MMCF
                                          description
1 Arkansas Natural Gas Residential Consumption (MMcf)
2 Arkansas Natural Gas Residential Consumption (MMcf)
3 Arkansas Natural Gas Residential Consumption (MMcf)
4 Arkansas Natural Gas Residential Consumption (MMcf)
5 Arkansas Natural Gas Residential Consumption (MMcf)
6 Arkansas Natural Gas Residential Consumption (MMcf)

Reformat the data.frame into a tsibble object:

ts_obj <- us_gas |>
    dplyr::select(date, area_name, process, value) |>
    dplyr::mutate(index = tsibble::yearmonth(date)) |>
    tsibble::as_tsibble(index = index, key = c(area_name, process))

ts_obj

# A tsibble: 99,013 x 5 [1M]
# Key:       area_name, process [314]
   date       area_name process value    index
   <date>     <chr>     <chr>   <int>    <mth>
 1 1989-01-01 Alabama   VCS      3434 1989 Jan
 2 1989-02-01 Alabama   VCS      3514 1989 Feb
 3 1989-03-01 Alabama   VCS      3395 1989 Mar
 4 1989-04-01 Alabama   VCS      2369 1989 Apr
 5 1989-05-01 Alabama   VCS      1720 1989 May
 6 1989-06-01 Alabama   VCS      1215 1989 Jun
 7 1989-07-01 Alabama   VCS      1673 1989 Jul
 8 1989-08-01 Alabama   VCS      1117 1989 Aug
 9 1989-09-01 Alabama   VCS      1189 1989 Sep
10 1989-10-01 Alabama   VCS      1382 1989 Oct
# ℹ 99,003 more rows

keys <- attributes(ts_obj)$key

Features Engineering

The next step is to collapse the time series into a feature table. This includes looping over each series and:

Check for missing values and impute if possible
Drop series that does not have a sufficient number of observations
Calculate a set of features for each series

We will use the tsfeatures library to create for each series a set of features such as:

Trend
AutoCorrelation features
Arch stat features
Nonlinearity measurment feature

features_df <- NULL
features_df <- lapply(1:nrow(keys), function(i) {
    d <- NULL
    d <- ts_obj |>
        dplyr::filter(
            area_name == keys$area_name[i],
            process == keys$process[i]
        )

    s <- TRUE
    # Check for missing values and zeros
    z <- which(d$value == 0)

    m <- which(is.na(d$value))
    if (length(m) > 0) {
        if (length(m) < nrow(d) * 0.1 && length(z) == 0) {
            if (any(diff(m) == 1)) {
                x <- m[which(diff(m) == 1)]
                for (n in x) {
                    d$value[n] <- (d$value[n - 12] + d$value[n - 24] + d$value[n - 36]) / 3
                }

                y <- which(is.na(d$value))
                if (length(y) > 0) {
                    for (n in y) {
                        if (n < nrow(d)) {
                            d$value[n] <- (d$value[n - 1] + d$value[n + 1]) / 2
                        } else {
                            d$value[n] <- (d$value[n - 12] + d$value[n - 24]) / 2
                        }
                    }
                }
            } else {
                for (n in m) {
                    if (n < nrow(d)) {
                        d$value[n] <- (d$value[n - 1] + d$value[n + 1]) / 2
                    } else {
                        d$value[n] <- (d$value[n - 12] + d$value[n - 24]) / 2
                    }
                }
            }
        } else {
            s <- FALSE
        }
    }

    if (s) {
        f <- tsfeatures::tsfeatures(d$value)
        f$arch_stat <- tsfeatures::arch_stat(d$value)
        f <- cbind(f, t(as.data.frame(tsfeatures::autocorr_features(d$value))))
        f$nonlinearity <- tsfeatures::nonlinearity(d$value)
        f <- cbind(f, t(as.data.frame(tsfeatures::pacf_features(d$value))))

        row.names(f) <- NULL
        f$area_name <- keys$area_name[i]
        f$process <- keys$process[i]
        f$nperiods <- NULL
        f$frequency <- NULL
        f$seasonal_period <- NULL
        f$success <- TRUE
    } else {
        f <- data.frame(success = FALSE)
    }

    return(f)
}) |>
    dplyr::bind_rows()

Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo

Warning in tsfeatures::tsfeatures(d$value): Some series are constant and cannot
be scaled, so scaling has been disabled (`scale = FALSE`).

Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action,  : 
  zero-variance series

Warning in firstmin_ac(x, acfv): No minimum was found.
Warning in firstmin_ac(x, acfv): Some series are constant and cannot be scaled,
so scaling has been disabled (`scale = FALSE`).

Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action,  : 
  zero-variance series

Warning in firstmin_ac(x, acfv): No minimum was found.
Warning in firstmin_ac(x, acfv): Some series are constant and cannot be scaled,
so scaling has been disabled (`scale = FALSE`).

Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action,  : 
  zero-variance series

Warning in firstmin_ac(x, acfv): No minimum was found.
Warning in firstmin_ac(x, acfv): Some series are constant and cannot be scaled,
so scaling has been disabled (`scale = FALSE`).

Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action,  : 
  zero-variance series

Warning in firstmin_ac(x, acfv): No minimum was found.

head(features_df)

        trend        spike  linearity  curvature    e_acf1   e_acf10   entropy
1 0.003057269 9.712750e-06 -0.9752748 -0.1708456 0.7523232 2.4133825 0.5646704
2 0.948030396 6.726470e-07 -7.2015516 -3.0015766 0.4296742 0.4436997 0.4504870
3 0.800618791 1.125195e-06 14.2253302 -3.1554623 0.5867871 0.8027259 0.3008994
4 0.836803825 6.086026e-07 14.8085737 -1.5848186 0.3569556 1.0992692 0.2368032
5 0.850064046 6.811054e-07 13.7873300  4.5584124 0.5766747 1.0069159 0.1810873
6 0.055090260 9.327703e-06 -4.6036276 -0.2804776 0.7735770 2.5141998 0.5991920
     x_acf1  x_acf10  diff1_acf1 diff1_acf10 diff2_acf1 diff2_acf10 arch_stat
1 0.7530430 2.389684  0.29888389  0.57436812 -0.3554251   0.1948968 0.3704131
2 0.9643920 7.019090 -0.18144404  0.07357371 -0.6279015   0.5012790 0.8498117
3 0.9053211 5.520940  0.02418422  0.32177527 -0.4643220   0.3928694 0.7123761
4 0.8877402 6.144797 -0.07201543  0.69852439 -0.4692256   0.4904081 0.5880839
5 0.9334901 6.614953 -0.30579509  0.33282521 -0.6917163   0.8276553 0.6707924
6 0.7845562 2.122313  0.45377070  0.79226036 -0.1257789   0.1624230 0.5199401
  embed2_incircle_1 embed2_incircle_2        ac_9 firstmin_ac      trev_num
1                 0                 0 -0.07868176           6 -7.229022e+07
2                 0                 0  0.72951504          74  8.126437e+00
3                 0                 0  0.68210089           4 -9.869577e+09
4                 0                 0  0.67427730           3 -9.078644e+10
5                 0                 0  0.77306245           6 -1.963808e+08
6                 0                 0 -0.04204759           6  8.863228e+08
  motiftwo_entro3 walker_propcross nonlinearity   x_pacf5 diff1x_pacf5
1       1.5666745        0.1807512    0.9068600 0.9851775   0.20617291
2       0.7702653        0.1264368    0.1082076 0.9606749   0.04383207
3       1.2237333        0.2624113    0.1502996 0.9320975   0.15499032
4       1.2732961        0.3120567    0.9465369 1.1507212   0.43904278
5       1.2211373        0.2021277    0.5132332 1.0126882   0.16944682
6       1.5166308        0.1666667    0.4290233 1.0753667   0.33320892
  diff2x_pacf5 area_name process success
1    0.2335997   Alabama     VCS    TRUE
2    0.6195178   Alabama     VDV    TRUE
3    0.5246779   Alabama     VEU    TRUE
4    0.7932931   Alabama     VGT    TRUE
5    0.6569552   Alabama     VIN    TRUE
6    0.1429389   Alabama     VRS    TRUE

nrow(features_df)

[1] 314

table(features_df$success)


FALSE  TRUE 
    6   308

Remove missing values and failed calculations:

features_clean <- na.omit(features_df)
table(features_clean$success)


TRUE 
 304

features_clean <- features_clean |>
    dplyr::filter(success) |>
    dplyr::select(-success)

Calculating the PCA and merging its first three components with the features table:

pca <- features_clean |>
    dplyr::select(-area_name, -process) |>
    prcomp(scale = TRUE)

features <- cbind(features_clean, as.data.frame(pca$x[, 1:3]))

head(features)

        trend        spike  linearity  curvature    e_acf1   e_acf10   entropy
1 0.003057269 9.712750e-06 -0.9752748 -0.1708456 0.7523232 2.4133825 0.5646704
2 0.948030396 6.726470e-07 -7.2015516 -3.0015766 0.4296742 0.4436997 0.4504870
3 0.800618791 1.125195e-06 14.2253302 -3.1554623 0.5867871 0.8027259 0.3008994
4 0.836803825 6.086026e-07 14.8085737 -1.5848186 0.3569556 1.0992692 0.2368032
5 0.850064046 6.811054e-07 13.7873300  4.5584124 0.5766747 1.0069159 0.1810873
6 0.055090260 9.327703e-06 -4.6036276 -0.2804776 0.7735770 2.5141998 0.5991920
     x_acf1  x_acf10  diff1_acf1 diff1_acf10 diff2_acf1 diff2_acf10 arch_stat
1 0.7530430 2.389684  0.29888389  0.57436812 -0.3554251   0.1948968 0.3704131
2 0.9643920 7.019090 -0.18144404  0.07357371 -0.6279015   0.5012790 0.8498117
3 0.9053211 5.520940  0.02418422  0.32177527 -0.4643220   0.3928694 0.7123761
4 0.8877402 6.144797 -0.07201543  0.69852439 -0.4692256   0.4904081 0.5880839
5 0.9334901 6.614953 -0.30579509  0.33282521 -0.6917163   0.8276553 0.6707924
6 0.7845562 2.122313  0.45377070  0.79226036 -0.1257789   0.1624230 0.5199401
  embed2_incircle_1 embed2_incircle_2        ac_9 firstmin_ac      trev_num
1                 0                 0 -0.07868176           6 -7.229022e+07
2                 0                 0  0.72951504          74  8.126437e+00
3                 0                 0  0.68210089           4 -9.869577e+09
4                 0                 0  0.67427730           3 -9.078644e+10
5                 0                 0  0.77306245           6 -1.963808e+08
6                 0                 0 -0.04204759           6  8.863228e+08
  motiftwo_entro3 walker_propcross nonlinearity   x_pacf5 diff1x_pacf5
1       1.5666745        0.1807512    0.9068600 0.9851775   0.20617291
2       0.7702653        0.1264368    0.1082076 0.9606749   0.04383207
3       1.2237333        0.2624113    0.1502996 0.9320975   0.15499032
4       1.2732961        0.3120567    0.9465369 1.1507212   0.43904278
5       1.2211373        0.2021277    0.5132332 1.0126882   0.16944682
6       1.5166308        0.1666667    0.4290233 1.0753667   0.33320892
  diff2x_pacf5 area_name process       PC1        PC2        PC3
1    0.2335997   Alabama     VCS -2.130900 -1.3036161  0.9384032
2    0.6195178   Alabama     VDV  4.626793  1.4217552  1.0713550
3    0.5246779   Alabama     VEU  2.536517  0.1059892 -1.3959387
4    0.7932931   Alabama     VGT  2.799819  0.3110900 -2.1326149
5    0.6569552   Alabama     VIN  4.075354  0.3044582 -1.6657347
6    0.1429389   Alabama     VRS -2.950452 -0.1183790  0.8885529

Scale the features table:

features_scale <- cbind(scale(features[, 1:25]), features[, c("area_name", "process")])

Calculate the K-means and merge it back to features table:

km2 <- kmeans(features_scale[, 1:25], centers = 2, nstart = 25)
km3 <- kmeans(features_scale[, 1:25], centers = 3, nstart = 25)
km4 <- kmeans(features_scale[, 1:25], centers = 4, nstart = 25)
km5 <- kmeans(features_scale[, 1:25], centers = 5, nstart = 25)


features$cluster2 <- km2[1]$cluster
features$cluster3 <- km3[1]$cluster
features$cluster4 <- km4[1]$cluster
features$cluster5 <- km5[1]$cluster

head(features)

        trend        spike  linearity  curvature    e_acf1   e_acf10   entropy
1 0.003057269 9.712750e-06 -0.9752748 -0.1708456 0.7523232 2.4133825 0.5646704
2 0.948030396 6.726470e-07 -7.2015516 -3.0015766 0.4296742 0.4436997 0.4504870
3 0.800618791 1.125195e-06 14.2253302 -3.1554623 0.5867871 0.8027259 0.3008994
4 0.836803825 6.086026e-07 14.8085737 -1.5848186 0.3569556 1.0992692 0.2368032
5 0.850064046 6.811054e-07 13.7873300  4.5584124 0.5766747 1.0069159 0.1810873
6 0.055090260 9.327703e-06 -4.6036276 -0.2804776 0.7735770 2.5141998 0.5991920
     x_acf1  x_acf10  diff1_acf1 diff1_acf10 diff2_acf1 diff2_acf10 arch_stat
1 0.7530430 2.389684  0.29888389  0.57436812 -0.3554251   0.1948968 0.3704131
2 0.9643920 7.019090 -0.18144404  0.07357371 -0.6279015   0.5012790 0.8498117
3 0.9053211 5.520940  0.02418422  0.32177527 -0.4643220   0.3928694 0.7123761
4 0.8877402 6.144797 -0.07201543  0.69852439 -0.4692256   0.4904081 0.5880839
5 0.9334901 6.614953 -0.30579509  0.33282521 -0.6917163   0.8276553 0.6707924
6 0.7845562 2.122313  0.45377070  0.79226036 -0.1257789   0.1624230 0.5199401
  embed2_incircle_1 embed2_incircle_2        ac_9 firstmin_ac      trev_num
1                 0                 0 -0.07868176           6 -7.229022e+07
2                 0                 0  0.72951504          74  8.126437e+00
3                 0                 0  0.68210089           4 -9.869577e+09
4                 0                 0  0.67427730           3 -9.078644e+10
5                 0                 0  0.77306245           6 -1.963808e+08
6                 0                 0 -0.04204759           6  8.863228e+08
  motiftwo_entro3 walker_propcross nonlinearity   x_pacf5 diff1x_pacf5
1       1.5666745        0.1807512    0.9068600 0.9851775   0.20617291
2       0.7702653        0.1264368    0.1082076 0.9606749   0.04383207
3       1.2237333        0.2624113    0.1502996 0.9320975   0.15499032
4       1.2732961        0.3120567    0.9465369 1.1507212   0.43904278
5       1.2211373        0.2021277    0.5132332 1.0126882   0.16944682
6       1.5166308        0.1666667    0.4290233 1.0753667   0.33320892
  diff2x_pacf5 area_name process       PC1        PC2        PC3 cluster2
1    0.2335997   Alabama     VCS -2.130900 -1.3036161  0.9384032        1
2    0.6195178   Alabama     VDV  4.626793  1.4217552  1.0713550        2
3    0.5246779   Alabama     VEU  2.536517  0.1059892 -1.3959387        2
4    0.7932931   Alabama     VGT  2.799819  0.3110900 -2.1326149        2
5    0.6569552   Alabama     VIN  4.075354  0.3044582 -1.6657347        2
6    0.1429389   Alabama     VRS -2.950452 -0.1183790  0.8885529        1
  cluster3 cluster4 cluster5
1        1        3        5
2        3        1        1
3        2        2        3
4        2        2        3
5        3        2        3
6        1        4        4

Save the features table:

saveRDS(features, file = "./data/features.RDS")

write.csv(features, "./data/features.csv", row.names = FALSE)