Features Engineering

Author

Rami Krispin

Published

March 13, 2025

Data

Loading the saved data:

us_gas <- readRDS(file = "./data/us_gas.RDS")

head(us_gas)
  area_name area_code process           process_name       date value units
1  Colorado        CO     VCS Commercial Consumption 2024-11-01  6936  MMCF
2  Colorado        CO     VCS Commercial Consumption 2024-01-01 10314  MMCF
3  Colorado        CO     VCS Commercial Consumption 2024-06-01  1949  MMCF
4  Colorado        CO     VCS Commercial Consumption 1989-01-01 10522  MMCF
5  Colorado        CO     VCS Commercial Consumption 1989-09-01  2303  MMCF
6  Colorado        CO     VCS Commercial Consumption 1990-10-01  2918  MMCF
                                                                                              description
1 Natural Gas Deliveries to Commercial Consumers (Including Vehicle Fuel through 1996) in Colorado (MMcf)
2 Natural Gas Deliveries to Commercial Consumers (Including Vehicle Fuel through 1996) in Colorado (MMcf)
3 Natural Gas Deliveries to Commercial Consumers (Including Vehicle Fuel through 1996) in Colorado (MMcf)
4 Natural Gas Deliveries to Commercial Consumers (Including Vehicle Fuel through 1996) in Colorado (MMcf)
5 Natural Gas Deliveries to Commercial Consumers (Including Vehicle Fuel through 1996) in Colorado (MMcf)
6 Natural Gas Deliveries to Commercial Consumers (Including Vehicle Fuel through 1996) in Colorado (MMcf)

Reformat the data.frame into a tsibble object:

ts_obj <- us_gas |>
    dplyr::select(date, area_name, process, value) |>
    dplyr::mutate(index = tsibble::yearmonth(date)) |>
    tsibble::as_tsibble(index = index, key = c(area_name, process))

ts_obj
# A tsibble: 100,583 x 5 [1M]
# Key:       area_name, process [314]
   date       area_name process value    index
   <date>     <chr>     <chr>   <int>    <mth>
 1 1989-01-01 Alabama   VCS      3434 1989 Jan
 2 1989-02-01 Alabama   VCS      3514 1989 Feb
 3 1989-03-01 Alabama   VCS      3395 1989 Mar
 4 1989-04-01 Alabama   VCS      2369 1989 Apr
 5 1989-05-01 Alabama   VCS      1720 1989 May
 6 1989-06-01 Alabama   VCS      1215 1989 Jun
 7 1989-07-01 Alabama   VCS      1673 1989 Jul
 8 1989-08-01 Alabama   VCS      1117 1989 Aug
 9 1989-09-01 Alabama   VCS      1189 1989 Sep
10 1989-10-01 Alabama   VCS      1382 1989 Oct
# ℹ 100,573 more rows
keys <- attributes(ts_obj)$key

Features Engineering

The next step is to collapse the time series into a feature table. This includes looping over each series and:

  • Check for missing values and impute if possible
  • Drop series that does not have a sufficient number of observations
  • Calculate a set of features for each series

We will use the tsfeatures library to create for each series a set of features such as:

  • Trend
  • AutoCorrelation features
  • Arch stat features
  • Nonlinearity measurment feature
features_df <- NULL
features_df <- lapply(1:nrow(keys), function(i) {
    d <- NULL
    d <- ts_obj |>
        dplyr::filter(
            area_name == keys$area_name[i],
            process == keys$process[i]
        )

    s <- TRUE
    # Check for missing values and zeros
    z <- which(d$value == 0)

    m <- which(is.na(d$value))
    if (length(m) > 0) {
        if (length(m) < nrow(d) * 0.1 && length(z) == 0) {
            if (any(diff(m) == 1)) {
                x <- m[which(diff(m) == 1)]
                for (n in x) {
                    d$value[n] <- (d$value[n - 12] + d$value[n - 24] + d$value[n - 36]) / 3
                }

                y <- which(is.na(d$value))
                if (length(y) > 0) {
                    for (n in y) {
                        if (n < nrow(d)) {
                            d$value[n] <- (d$value[n - 1] + d$value[n + 1]) / 2
                        } else {
                            d$value[n] <- (d$value[n - 12] + d$value[n - 24]) / 2
                        }
                    }
                }
            } else {
                for (n in m) {
                    if (n < nrow(d)) {
                        d$value[n] <- (d$value[n - 1] + d$value[n + 1]) / 2
                    } else {
                        d$value[n] <- (d$value[n - 12] + d$value[n - 24]) / 2
                    }
                }
            }
        } else {
            s <- FALSE
        }
    }

    if (s) {
        f <- tsfeatures::tsfeatures(d$value)
        f$arch_stat <- tsfeatures::arch_stat(d$value)
        f <- cbind(f, t(as.data.frame(tsfeatures::autocorr_features(d$value))))
        f$nonlinearity <- tsfeatures::nonlinearity(d$value)
        f <- cbind(f, t(as.data.frame(tsfeatures::pacf_features(d$value))))

        row.names(f) <- NULL
        f$area_name <- keys$area_name[i]
        f$process <- keys$process[i]
        f$nperiods <- NULL
        f$frequency <- NULL
        f$seasonal_period <- NULL
        f$success <- TRUE
    } else {
        f <- data.frame(success = FALSE)
    }

    return(f)
}) |>
    dplyr::bind_rows()
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
Warning in tsfeatures::tsfeatures(d$value): Some series are constant and cannot
be scaled, so scaling has been disabled (`scale = FALSE`).
Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action,  : 
  zero-variance series
Warning in firstmin_ac(x, acfv): No minimum was found.
Warning in firstmin_ac(x, acfv): Some series are constant and cannot be scaled,
so scaling has been disabled (`scale = FALSE`).
Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action,  : 
  zero-variance series
Warning in firstmin_ac(x, acfv): No minimum was found.
Warning in firstmin_ac(x, acfv): Some series are constant and cannot be scaled,
so scaling has been disabled (`scale = FALSE`).
Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action,  : 
  zero-variance series
Warning in firstmin_ac(x, acfv): No minimum was found.
Warning in firstmin_ac(x, acfv): Some series are constant and cannot be scaled,
so scaling has been disabled (`scale = FALSE`).
Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action,  : 
  zero-variance series
Warning in firstmin_ac(x, acfv): No minimum was found.
head(features_df)
        trend        spike linearity  curvature    e_acf1   e_acf10   entropy
1 0.003398831 9.524934e-06 -1.118874 -0.2623495 0.7508079 2.4240440 0.2459573
2 0.935352253 7.909344e-07 -7.458762 -2.7312904 0.5398311 0.6312546 0.4496333
3 0.801042782 1.055713e-06 14.326348 -3.1651073 0.5919222 0.7868258 0.2856361
4 0.837720165 5.833055e-07 14.846169 -1.6891853 0.3550440 1.1029172 0.2335578
5 0.849349395 6.548899e-07 14.073318  4.1932365 0.5751706 0.9701880 0.2109784
6 0.057609881 9.146451e-06 -4.809029 -0.3580087 0.7726904 2.5173058 0.2716254
     x_acf1  x_acf10  diff1_acf1 diff1_acf10 diff2_acf1 diff2_acf10 arch_stat
1 0.7521144 2.397351  0.29144579  0.57539570 -0.3681277   0.2093100 0.3694279
2 0.9648267 7.051760 -0.17417960  0.06775811 -0.6250807   0.4925355 0.8523744
3 0.9121012 5.506638  0.01943907  0.32086202 -0.4593933   0.3962221 0.7257865
4 0.8910733 6.156808 -0.07677648  0.70639299 -0.4699257   0.5049839 0.5968198
5 0.9289929 6.678439 -0.29188831  0.31939297 -0.6892386   0.8210981 0.6521779
6 0.7855201 2.112944  0.44964235  0.78850610 -0.1328695   0.1619485 0.5224504
  embed2_incircle_1 embed2_incircle_2        ac_9 firstmin_ac      trev_num
1                 0                 0 -0.08477898           6 -6.529396e+07
2                 0                 0  0.73322456          72  7.893855e+00
3                 0                 0  0.67652700           4 -2.424188e+10
4                 0                 0  0.67578292           3 -1.058054e+11
5                 0                 0  0.78486321           6 -1.562761e+08
6                 0                 0 -0.04709358           6  9.268957e+08
  motiftwo_entro3 walker_propcross nonlinearity   x_pacf5 diff1x_pacf5
1       1.5657273       0.18097448    0.9306287 0.9779110   0.20382194
2       0.7608726       0.09497207    0.1233921 0.9626648   0.04080339
3       1.2730113       0.26132404    0.2245167 0.9344324   0.16958371
4       1.2294644       0.31358885    0.9265772 1.1232687   0.47236137
5       1.1681461       0.20209059    0.5287830 0.9747106   0.16018364
6       1.5120847       0.16705336    0.4363147 1.0702491   0.32722026
  diff2x_pacf5 area_name process success
1    0.2414457   Alabama     VCS    TRUE
2    0.6166896   Alabama     VDV    TRUE
3    0.5340146   Alabama     VEU    TRUE
4    0.8097153   Alabama     VGT    TRUE
5    0.6393660   Alabama     VIN    TRUE
6    0.1420455   Alabama     VRS    TRUE
nrow(features_df)
[1] 314
table(features_df$success)

FALSE  TRUE 
    6   308 

Remove missing values and failed calculations:

features_clean <- na.omit(features_df)
table(features_clean$success)

TRUE 
 304 
features_clean <- features_clean |>
    dplyr::filter(success) |>
    dplyr::select(-success)

Calculating the PCA and merging its first three components with the features table:

pca <- features_clean |>
    dplyr::select(-area_name, -process) |>
    prcomp(scale = TRUE)

features <- cbind(features_clean, as.data.frame(pca$x[, 1:3]))

head(features)
        trend        spike linearity  curvature    e_acf1   e_acf10   entropy
1 0.003398831 9.524934e-06 -1.118874 -0.2623495 0.7508079 2.4240440 0.2459573
2 0.935352253 7.909344e-07 -7.458762 -2.7312904 0.5398311 0.6312546 0.4496333
3 0.801042782 1.055713e-06 14.326348 -3.1651073 0.5919222 0.7868258 0.2856361
4 0.837720165 5.833055e-07 14.846169 -1.6891853 0.3550440 1.1029172 0.2335578
5 0.849349395 6.548899e-07 14.073318  4.1932365 0.5751706 0.9701880 0.2109784
6 0.057609881 9.146451e-06 -4.809029 -0.3580087 0.7726904 2.5173058 0.2716254
     x_acf1  x_acf10  diff1_acf1 diff1_acf10 diff2_acf1 diff2_acf10 arch_stat
1 0.7521144 2.397351  0.29144579  0.57539570 -0.3681277   0.2093100 0.3694279
2 0.9648267 7.051760 -0.17417960  0.06775811 -0.6250807   0.4925355 0.8523744
3 0.9121012 5.506638  0.01943907  0.32086202 -0.4593933   0.3962221 0.7257865
4 0.8910733 6.156808 -0.07677648  0.70639299 -0.4699257   0.5049839 0.5968198
5 0.9289929 6.678439 -0.29188831  0.31939297 -0.6892386   0.8210981 0.6521779
6 0.7855201 2.112944  0.44964235  0.78850610 -0.1328695   0.1619485 0.5224504
  embed2_incircle_1 embed2_incircle_2        ac_9 firstmin_ac      trev_num
1                 0                 0 -0.08477898           6 -6.529396e+07
2                 0                 0  0.73322456          72  7.893855e+00
3                 0                 0  0.67652700           4 -2.424188e+10
4                 0                 0  0.67578292           3 -1.058054e+11
5                 0                 0  0.78486321           6 -1.562761e+08
6                 0                 0 -0.04709358           6  9.268957e+08
  motiftwo_entro3 walker_propcross nonlinearity   x_pacf5 diff1x_pacf5
1       1.5657273       0.18097448    0.9306287 0.9779110   0.20382194
2       0.7608726       0.09497207    0.1233921 0.9626648   0.04080339
3       1.2730113       0.26132404    0.2245167 0.9344324   0.16958371
4       1.2294644       0.31358885    0.9265772 1.1232687   0.47236137
5       1.1681461       0.20209059    0.5287830 0.9747106   0.16018364
6       1.5120847       0.16705336    0.4363147 1.0702491   0.32722026
  diff2x_pacf5 area_name process       PC1         PC2        PC3
1    0.2414457   Alabama     VCS -1.976452 -1.04772486  0.7012179
2    0.6166896   Alabama     VDV  4.464822  1.79678924  1.0697063
3    0.5340146   Alabama     VEU  2.396487  0.34041682 -1.3673512
4    0.8097153   Alabama     VGT  2.720845  0.63121447 -2.2712976
5    0.6393660   Alabama     VIN  3.911493  0.62041401 -1.5097302
6    0.1420455   Alabama     VRS -2.882772 -0.03653145  0.7291997

Scale the features table:

features_scale <- cbind(scale(features[, 1:25]), features[, c("area_name", "process")])

Calculate the K-means and merge it back to features table:

km2 <- kmeans(features_scale[, 1:25], centers = 2, nstart = 25)
km3 <- kmeans(features_scale[, 1:25], centers = 3, nstart = 25)
km4 <- kmeans(features_scale[, 1:25], centers = 4, nstart = 25)
km5 <- kmeans(features_scale[, 1:25], centers = 5, nstart = 25)


features$cluster2 <- km2[1]$cluster
features$cluster3 <- km3[1]$cluster
features$cluster4 <- km4[1]$cluster
features$cluster5 <- km5[1]$cluster

head(features)
        trend        spike linearity  curvature    e_acf1   e_acf10   entropy
1 0.003398831 9.524934e-06 -1.118874 -0.2623495 0.7508079 2.4240440 0.2459573
2 0.935352253 7.909344e-07 -7.458762 -2.7312904 0.5398311 0.6312546 0.4496333
3 0.801042782 1.055713e-06 14.326348 -3.1651073 0.5919222 0.7868258 0.2856361
4 0.837720165 5.833055e-07 14.846169 -1.6891853 0.3550440 1.1029172 0.2335578
5 0.849349395 6.548899e-07 14.073318  4.1932365 0.5751706 0.9701880 0.2109784
6 0.057609881 9.146451e-06 -4.809029 -0.3580087 0.7726904 2.5173058 0.2716254
     x_acf1  x_acf10  diff1_acf1 diff1_acf10 diff2_acf1 diff2_acf10 arch_stat
1 0.7521144 2.397351  0.29144579  0.57539570 -0.3681277   0.2093100 0.3694279
2 0.9648267 7.051760 -0.17417960  0.06775811 -0.6250807   0.4925355 0.8523744
3 0.9121012 5.506638  0.01943907  0.32086202 -0.4593933   0.3962221 0.7257865
4 0.8910733 6.156808 -0.07677648  0.70639299 -0.4699257   0.5049839 0.5968198
5 0.9289929 6.678439 -0.29188831  0.31939297 -0.6892386   0.8210981 0.6521779
6 0.7855201 2.112944  0.44964235  0.78850610 -0.1328695   0.1619485 0.5224504
  embed2_incircle_1 embed2_incircle_2        ac_9 firstmin_ac      trev_num
1                 0                 0 -0.08477898           6 -6.529396e+07
2                 0                 0  0.73322456          72  7.893855e+00
3                 0                 0  0.67652700           4 -2.424188e+10
4                 0                 0  0.67578292           3 -1.058054e+11
5                 0                 0  0.78486321           6 -1.562761e+08
6                 0                 0 -0.04709358           6  9.268957e+08
  motiftwo_entro3 walker_propcross nonlinearity   x_pacf5 diff1x_pacf5
1       1.5657273       0.18097448    0.9306287 0.9779110   0.20382194
2       0.7608726       0.09497207    0.1233921 0.9626648   0.04080339
3       1.2730113       0.26132404    0.2245167 0.9344324   0.16958371
4       1.2294644       0.31358885    0.9265772 1.1232687   0.47236137
5       1.1681461       0.20209059    0.5287830 0.9747106   0.16018364
6       1.5120847       0.16705336    0.4363147 1.0702491   0.32722026
  diff2x_pacf5 area_name process       PC1         PC2        PC3 cluster2
1    0.2414457   Alabama     VCS -1.976452 -1.04772486  0.7012179        1
2    0.6166896   Alabama     VDV  4.464822  1.79678924  1.0697063        2
3    0.5340146   Alabama     VEU  2.396487  0.34041682 -1.3673512        2
4    0.8097153   Alabama     VGT  2.720845  0.63121447 -2.2712976        2
5    0.6393660   Alabama     VIN  3.911493  0.62041401 -1.5097302        2
6    0.1420455   Alabama     VRS -2.882772 -0.03653145  0.7291997        1
  cluster3 cluster4 cluster5
1        2        4        1
2        1        1        4
3        1        2        3
4        1        2        3
5        1        2        3
6        2        4        1

Save the features table:

saveRDS(features, file = "./data/features.RDS")

write.csv(features, "./data/features.csv", row.names = FALSE)