This is just an initial exploratory step to see our raw data.
NOTE: Please click on the Code
button to see the codes.
Let’s download the data and load the packages.
setwd("~/Dropbox/RNS")
load("~/Dropbox/RNS2/DataON/finaldata.RData")
processed_data <- finaldata
This rmd file uses several packages. If you want to replicate the same results, please install them first.
library(tidyverse)
library(tidyquant)
library(recipes)
library(rsample)
library(knitr)
library(maps)
# Data Cleaning
library(janitor)
# EDA
library(skimr)
library(DataExplorer)
library(DT)
library(corrplot)
# ggplot2 Helpers
library(scales)
theme_set(theme_tq())
library(raster)
library(spdplyr)
library(dplyr)
library(RColorBrewer)
As usual, let’s take a glimpse()
of our data to see how we should proceed. Since the number of columns is 85, we divided it to 3 sections
glimpse(processed_data[,1:30])
## Rows: 5,066
## Columns: 30
## $ CD <chr> "Algoma", "Algoma", "Algoma", "Algoma…
## $ Episode <date> 2020-03-01, 2020-03-02, 2020-03-03, …
## $ all_day_bing_tiles_visited_relt <dbl> 0.04122, 0.06040, 0.06287, 0.03830, 0…
## $ all_day_ratio_single_tile_users <dbl> 0.27549, 0.20603, 0.19751, 0.19553, 0…
## $ pc3 <chr> "NA", "NA", "NA", "NA", "NA", "NA", "…
## $ cases <dbl> NA, NA, NA, NA, NA, NA, NA, 1, NA, NA…
## $ maleperc <dbl> NA, NA, NA, NA, NA, NA, NA, 0.0, NA, …
## $ delay_mean <dbl> NA, NA, NA, NA, NA, NA, NA, 39.0, NA,…
## $ CC <dbl> NA, NA, NA, NA, NA, NA, NA, 0.0, NA, …
## $ NoEpi <dbl> NA, NA, NA, NA, NA, NA, NA, 1.0, NA, …
## $ NoInfo <dbl> NA, NA, NA, NA, NA, NA, NA, 0, NA, NA…
## $ Unknown <dbl> NA, NA, NA, NA, NA, NA, NA, 0, NA, NA…
## $ OB <dbl> NA, NA, NA, NA, NA, NA, NA, 0, NA, NA…
## $ Travel <dbl> NA, NA, NA, NA, NA, NA, NA, 0.0, NA, …
## $ age <dbl> NA, NA, NA, NA, NA, NA, NA, 60, NA, N…
## $ city <dbl> NA, NA, NA, NA, NA, NA, NA, 23, NA, N…
## $ max_temperature <dbl> 3.2, 0.8, 3.6, -0.5, -0.9, -2.2, 3.9,…
## $ avg_hourly_temperature <dbl> -3.26, -2.56, -1.59, -2.35, -4.97, -6…
## $ avg_temperature <dbl> -3.90, -6.05, -3.55, -4.30, -7.90, -1…
## $ min_temperature <dbl> -11.0, -12.9, -10.7, -8.1, -14.9, -18…
## $ max_humidex <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ min_windchill <dbl> -16, -13, -13, -10, -17, -17, -21, -1…
## $ max_relative_humidity <dbl> 90, 94, 96, 89, 92, 87, 87, 86, 92, 9…
## $ avg_hourly_relative_humidity <dbl> 74.8, 80.0, 82.2, 80.8, 88.9, 71.0, 6…
## $ avg_relative_humidity <dbl> 72.5, 82.5, 76.0, 79.5, 87.0, 68.0, 6…
## $ min_relative_humidity <dbl> 55, 71, 56, 70, 82, 49, 34, 41, 55, 7…
## $ max_dew_point <dbl> -4.4, -3.1, -0.5, -3.0, -2.5, -4.4, -…
## $ avg_hourly_dew_point <dbl> -7.3, -5.6, -4.5, -5.2, -6.5, -10.9, …
## $ avg_dew_point <dbl> -8.6, -7.5, -6.2, -5.2, -9.2, -11.5, …
## $ min_dew_point <dbl> -12.8, -11.8, -11.8, -7.5, -16.0, -18…
glimpse(processed_data[,31:60])
## Rows: 5,066
## Columns: 30
## $ max_wind_speed <dbl> 28, 31, 19, 41, 27, 32, 21, 20, 28, 34, 1…
## $ avg_hourly_wind_speed <dbl> 15.54, 16.83, 10.71, 22.50, 16.83, 17.83,…
## $ avg_wind_speed <dbl> 18.0, 16.0, 11.0, 22.5, 15.0, 16.5, 12.5,…
## $ min_wind_speed <dbl> 8, 1, 3, 4, 3, 1, 4, 1, 0, 1, 5, 5, 10, 1…
## $ max_wind_gust <dbl> 48, 45, NA, 52, 42, 42, 33, 37, 39, 50, N…
## $ wind_gust_dir_10s <dbl> 12, 30, NA, 31, 8, 32, 19, 18, 31, 31, NA…
## $ max_pressure_sea <dbl> 101.74, 100.52, 100.45, 101.85, 101.89, 1…
## $ avg_hourly_pressure_sea <dbl> 100.96, 100.32, 100.01, 100.89, 101.60, 1…
## $ avg_pressure_sea <dbl> 100.95, 100.30, 100.04, 100.99, 101.57, 1…
## $ min_pressure_sea <dbl> 100.16, 100.09, 99.63, 100.13, 101.25, 10…
## $ max_pressure_station <dbl> 99.31, 98.15, 98.07, 99.45, 99.47, 100.89…
## $ avg_hourly_pressure_station <dbl> 98.57, 97.96, 97.65, 98.52, 99.19, 100.32…
## $ avg_pressure_station <dbl> 98.56, 97.95, 97.68, 98.61, 99.15, 100.07…
## $ min_pressure_station <dbl> 97.80, 97.75, 97.29, 97.78, 98.84, 99.25,…
## $ max_visibility <dbl> 32200, 40200, 40200, 32200, 32200, 40200,…
## $ avg_hourly_visibility <dbl> 32200.0, 31912.5, 24337.5, 25291.7, 13225…
## $ avg_visibility <dbl> 32200, 24950, 20500, 20150, 16700, 21900,…
## $ min_visibility <dbl> 32200, 9700, 800, 8100, 1200, 3600, 32200…
## $ max_health_index <dbl> 2.9, 3.0, 2.9, 2.8, 2.7, 2.7, 2.8, 4.0, 3…
## $ avg_hourly_health_index <dbl> 2.4, 2.5, 2.4, 2.3, 2.3, 2.3, 2.4, 2.8, 2…
## $ avg_health_index <dbl> 2.5, 2.6, 2.5, 2.5, 2.4, 2.3, 2.4, 3.0, 2…
## $ min_health_index <dbl> 2.0, 2.3, 2.0, 2.1, 2.1, 1.9, 2.0, 2.1, 2…
## $ heatdegdays <dbl> 21.9, 24.1, 21.6, 22.3, 25.9, 28.6, 25.5,…
## $ cooldegdays <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ growdegdays_5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ growdegdays_7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ growdegdays_10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ precipitation <dbl> 0.0, 0.0, 1.4, 0.8, 7.8, 0.5, 0.0, 0.0, 1…
## $ rain <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1…
## $ snow <dbl> 0.0, 0.0, 2.7, 0.8, 10.6, 1.2, 0.0, 0.0, …
glimpse(processed_data[,61:85])
## Rows: 5,066
## Columns: 25
## $ snow_on_ground <dbl> 72, 72, 72, 74, 73, 82, 82, 82, 74, 73,…
## $ sunrise <chr> "7:16:00", "7:14:00", "7:13:00", "7:11:…
## $ sunset <chr> "18:23:00", "18:24:00", "18:26:00", "18…
## $ daylight <dbl> 11.12, 11.17, 11.22, 11.27, 11.33, 11.3…
## $ sunrise_f <dbl> 7.27, 7.23, 7.22, 7.18, 7.15, 7.12, 7.0…
## $ sunset_f <dbl> 18.38, 18.40, 18.43, 18.45, 18.48, 18.5…
## $ min_uv_forecast <dbl> 2, 2, 2, 2, 1, 3, 4, 4, 3, 4, 1, 2, 1, …
## $ max_uv_forecast <dbl> 2, 3, 2, 2, 1, 3, 4, 4, 3, 4, 1, 2, 1, …
## $ min_high_temperature_forecast <dbl> 2, 1, 2, -1, -1, -1, 3, 8, 6, -1, 2, 3,…
## $ max_high_temperature_forecast <dbl> 2, 1, 2, 0, -1, -1, 3, 8, 8, -1, 2, 3, …
## $ min_low_temperature_forecast <dbl> -1, -8, -3, -8, -9, -17, -6, 2, -3, -9,…
## $ max_low_temperature_forecast <dbl> -1, -6, -3, -8, -9, -9, -4, 2, -3, -8, …
## $ solar_radiation <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ max_cloud_cover_4 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ avg_hourly_cloud_cover_4 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ avg_cloud_cover_4 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ min_cloud_cover_4 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ max_cloud_cover_8 <dbl> 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, …
## $ avg_hourly_cloud_cover_8 <dbl> 6.0, 5.8, 6.8, 7.2, 5.4, 3.7, 4.3, 4.3,…
## $ avg_cloud_cover_8 <dbl> 5.5, 5.0, 5.5, 6.5, 4.0, 4.0, 4.0, 4.5,…
## $ min_cloud_cover_8 <dbl> 3, 2, 3, 5, 0, 0, 0, 2, 2, 2, 5, 8, 6, …
## $ max_cloud_cover_10 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ avg_hourly_cloud_cover_10 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ avg_cloud_cover_10 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ min_cloud_cover_10 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
The visualization provided by plot_missing()
helps identify columns that may need attention.
The data columns are divided into three sections:
plot_missing(processed_data[,1:30],
title = "% of Missing Data (filtered to cols w/missing data)",
ggtheme = theme_tq())
plot_missing(processed_data[,31:60],
title = "% of Missing Data (filtered to cols w/missing data)",
ggtheme = theme_tq())
plot_missing(processed_data[,61:85],
title = "% of Missing Data (filtered to cols w/missing data)",
ggtheme = theme_tq())
skim()
will do the job:
skim(processed_data)
Name | processed_data |
Number of rows | 5066 |
Number of columns | 85 |
_______________________ | |
Column type frequency: | |
character | 4 |
Date | 1 |
numeric | 80 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
CD | 0 | 1 | 4 | 30 | 0 | 34 | 0 |
pc3 | 0 | 1 | 2 | 3 | 0 | 35 | 0 |
sunrise | 0 | 1 | 7 | 7 | 0 | 182 | 0 |
sunset | 0 | 1 | 8 | 8 | 0 | 239 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
Episode | 0 | 1 | 2020-03-01 | 2020-07-27 | 2020-05-14 | 149 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
all_day_bing_tiles_visited_relt | 0 | 1.00 | -0.13 | 0.19 | -0.66 | -0.27 | -0.11 | 0.01 | 0.46 | ▁▆▇▅▁ |
all_day_ratio_single_tile_users | 0 | 1.00 | 0.29 | 0.08 | 0.14 | 0.24 | 0.28 | 0.35 | 0.55 | ▂▇▅▂▁ |
cases | 2133 | 0.58 | 13.21 | 27.45 | 1.00 | 1.00 | 4.00 | 11.00 | 286.00 | ▇▁▁▁▁ |
maleperc | 2133 | 0.58 | 0.46 | 0.32 | 0.00 | 0.25 | 0.47 | 0.67 | 1.00 | ▆▅▇▃▅ |
delay_mean | 2133 | 0.58 | 6.62 | 6.09 | 0.00 | 3.00 | 5.00 | 8.04 | 87.00 | ▇▁▁▁▁ |
CC | 2133 | 0.58 | 0.30 | 0.32 | 0.00 | 0.00 | 0.24 | 0.50 | 1.00 | ▇▃▃▁▂ |
NoEpi | 2133 | 0.58 | 0.26 | 0.31 | 0.00 | 0.00 | 0.17 | 0.39 | 1.00 | ▇▃▂▁▂ |
NoInfo | 2133 | 0.58 | 0.03 | 0.13 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
Unknown | 2133 | 0.58 | 0.00 | 0.04 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
OB | 2133 | 0.58 | 0.30 | 0.34 | 0.00 | 0.00 | 0.17 | 0.50 | 1.00 | ▇▂▂▂▂ |
Travel | 2133 | 0.58 | 0.10 | 0.25 | 0.00 | 0.00 | 0.00 | 0.02 | 1.00 | ▇▁▁▁▁ |
age | 2133 | 0.58 | 44.67 | 14.29 | 20.00 | 34.42 | 44.35 | 53.33 | 90.00 | ▆▇▇▂▁ |
city | 2133 | 0.58 | 17.99 | 10.33 | 1.00 | 9.00 | 17.00 | 28.00 | 34.00 | ▆▆▅▅▇ |
max_temperature | 3 | 1.00 | 16.87 | 10.11 | -7.60 | 8.10 | 15.90 | 26.50 | 36.90 | ▂▇▇▇▆ |
avg_hourly_temperature | 218 | 0.96 | 11.87 | 9.30 | -14.70 | 3.85 | 11.47 | 20.85 | 28.70 | ▁▅▇▆▇ |
avg_temperature | 3 | 1.00 | 11.20 | 9.46 | -15.85 | 3.35 | 10.50 | 20.50 | 28.90 | ▁▅▇▆▇ |
min_temperature | 3 | 1.00 | 5.53 | 9.24 | -29.70 | -1.80 | 4.80 | 14.10 | 24.50 | ▁▁▇▆▆ |
max_humidex | 3253 | 0.36 | 33.11 | 4.28 | 25.00 | 30.00 | 33.00 | 36.00 | 44.00 | ▃▇▇▅▁ |
min_windchill | 3517 | 0.31 | -8.30 | 5.01 | -35.00 | -10.00 | -7.00 | -5.00 | 0.00 | ▁▁▂▇▇ |
max_relative_humidity | 32 | 0.99 | 90.02 | 9.90 | 30.00 | 85.00 | 93.00 | 98.00 | 100.00 | ▁▁▁▂▇ |
avg_hourly_relative_humidity | 32 | 0.99 | 69.16 | 13.27 | 22.70 | 60.40 | 69.40 | 79.00 | 100.00 | ▁▂▇▇▃ |
avg_relative_humidity | 32 | 0.99 | 68.80 | 11.13 | 21.50 | 62.00 | 69.00 | 76.50 | 100.00 | ▁▁▇▇▂ |
min_relative_humidity | 32 | 0.99 | 47.59 | 15.60 | 11.00 | 36.00 | 46.00 | 57.00 | 100.00 | ▂▇▇▃▁ |
max_dew_point | 32 | 0.99 | 8.63 | 9.24 | -21.70 | 0.72 | 8.10 | 17.60 | 25.30 | ▁▂▇▆▇ |
avg_hourly_dew_point | 32 | 0.99 | 5.38 | 9.69 | -23.40 | -2.60 | 4.40 | 14.97 | 23.00 | ▁▃▇▅▇ |
avg_dew_point | 32 | 0.99 | 5.26 | 9.64 | -23.60 | -2.80 | 4.20 | 14.80 | 23.00 | ▁▃▇▅▇ |
min_dew_point | 32 | 0.99 | 1.88 | 10.30 | -31.20 | -6.50 | 1.10 | 11.90 | 21.40 | ▁▂▇▆▆ |
max_wind_speed | 30 | 0.99 | 24.22 | 8.82 | 4.00 | 18.00 | 23.00 | 30.00 | 94.00 | ▇▇▁▁▁ |
avg_hourly_wind_speed | 30 | 0.99 | 13.42 | 5.82 | 2.12 | 9.17 | 12.46 | 16.58 | 43.17 | ▆▇▂▁▁ |
avg_wind_speed | 30 | 0.99 | 14.20 | 5.79 | 2.00 | 10.00 | 13.50 | 17.50 | 50.00 | ▆▇▂▁▁ |
min_wind_speed | 30 | 0.99 | 4.19 | 4.03 | 0.00 | 1.00 | 4.00 | 6.00 | 35.00 | ▇▂▁▁▁ |
max_wind_gust | 1224 | 0.76 | 42.60 | 11.26 | 27.00 | 34.00 | 40.00 | 48.00 | 160.00 | ▇▂▁▁▁ |
wind_gust_dir_10s | 1225 | 0.76 | 21.79 | 9.65 | 0.00 | 16.00 | 24.00 | 30.00 | 36.00 | ▃▃▅▇▇ |
max_pressure_sea | 151 | 0.97 | 101.86 | 0.71 | 99.62 | 101.42 | 101.79 | 102.21 | 104.59 | ▁▅▇▂▁ |
avg_hourly_pressure_sea | 151 | 0.97 | 101.53 | 0.74 | 99.07 | 101.10 | 101.54 | 101.92 | 104.30 | ▁▃▇▂▁ |
avg_pressure_sea | 151 | 0.97 | 101.53 | 0.72 | 99.29 | 101.09 | 101.53 | 101.89 | 104.24 | ▁▅▇▂▁ |
min_pressure_sea | 151 | 0.97 | 101.20 | 0.79 | 98.43 | 100.72 | 101.26 | 101.64 | 103.99 | ▁▃▇▂▁ |
max_pressure_station | 151 | 0.97 | 99.32 | 1.30 | 95.42 | 98.44 | 99.36 | 100.26 | 103.62 | ▁▅▇▃▁ |
avg_hourly_pressure_station | 151 | 0.97 | 99.01 | 1.31 | 94.84 | 98.15 | 99.05 | 99.95 | 103.35 | ▁▅▇▃▁ |
avg_pressure_station | 151 | 0.97 | 99.00 | 1.30 | 94.84 | 98.14 | 99.05 | 99.94 | 103.31 | ▁▃▇▃▁ |
min_pressure_station | 151 | 0.97 | 98.68 | 1.34 | 94.10 | 97.80 | 98.73 | 99.64 | 103.00 | ▁▃▇▅▁ |
max_visibility | 1194 | 0.76 | 22733.55 | 7484.95 | 12900.00 | 16100.00 | 24100.00 | 24100.00 | 80500.00 | ▇▁▁▁▁ |
avg_hourly_visibility | 1194 | 0.76 | 20331.38 | 5536.76 | 3825.00 | 16100.00 | 21230.40 | 24100.00 | 40200.00 | ▁▇▇▂▁ |
avg_visibility | 1194 | 0.76 | 18588.61 | 6447.86 | 6550.00 | 14450.00 | 16100.00 | 24100.00 | 52320.00 | ▅▇▁▁▁ |
min_visibility | 1194 | 0.76 | 14443.67 | 8811.92 | 0.00 | 4800.00 | 16100.00 | 24100.00 | 40200.00 | ▅▂▇▁▁ |
max_health_index | 1349 | 0.73 | 2.86 | 0.70 | 1.00 | 2.40 | 2.80 | 3.20 | 7.60 | ▂▇▂▁▁ |
avg_hourly_health_index | 1349 | 0.73 | 2.28 | 0.50 | 1.00 | 2.00 | 2.30 | 2.60 | 4.10 | ▂▇▇▂▁ |
avg_health_index | 1349 | 0.73 | 2.31 | 0.51 | 1.00 | 2.00 | 2.30 | 2.60 | 4.70 | ▂▇▆▁▁ |
min_health_index | 1349 | 0.73 | 1.76 | 0.47 | 1.00 | 1.40 | 1.80 | 2.10 | 3.40 | ▆▇▇▁▁ |
heatdegdays | 3 | 1.00 | 8.22 | 7.76 | 0.00 | 0.00 | 7.50 | 14.70 | 33.90 | ▇▃▅▁▁ |
cooldegdays | 3 | 1.00 | 1.42 | 2.43 | 0.00 | 0.00 | 0.00 | 2.50 | 10.90 | ▇▁▁▁▁ |
growdegdays_5 | 3 | 1.00 | 7.66 | 7.63 | 0.00 | 0.00 | 5.50 | 15.50 | 23.90 | ▇▂▂▃▁ |
growdegdays_7 | 3 | 1.00 | 6.40 | 6.92 | 0.00 | 0.00 | 3.50 | 13.50 | 21.90 | ▇▂▂▃▁ |
growdegdays_10 | 3 | 1.00 | 4.76 | 5.73 | 0.00 | 0.00 | 0.50 | 10.50 | 18.90 | ▇▁▂▂▁ |
precipitation | 819 | 0.84 | 2.15 | 5.66 | 0.00 | 0.00 | 0.00 | 1.30 | 86.60 | ▇▁▁▁▁ |
rain | 2930 | 0.42 | 1.99 | 5.64 | 0.00 | 0.00 | 0.00 | 1.00 | 86.60 | ▇▁▁▁▁ |
snow | 2927 | 0.42 | 0.15 | 0.83 | 0.00 | 0.00 | 0.00 | 0.00 | 12.20 | ▇▁▁▁▁ |
snow_on_ground | 3766 | 0.26 | 9.46 | 19.22 | 0.00 | 0.00 | 0.00 | 8.00 | 96.00 | ▇▁▁▁▁ |
daylight | 0 | 1.00 | 14.17 | 1.38 | 11.00 | 13.07 | 14.73 | 15.30 | 16.35 | ▃▃▃▇▆ |
sunrise_f | 0 | 1.00 | 6.23 | 0.68 | 5.15 | 5.70 | 6.00 | 6.77 | 8.40 | ▇▇▅▃▁ |
sunset_f | 0 | 1.00 | 20.40 | 0.79 | 17.77 | 19.90 | 20.63 | 21.00 | 22.05 | ▁▁▅▇▁ |
min_uv_forecast | 2 | 1.00 | 5.96 | 2.63 | 1.00 | 4.00 | 6.00 | 8.00 | 12.00 | ▆▇▅▇▂ |
max_uv_forecast | 2 | 1.00 | 6.44 | 2.57 | 1.00 | 4.00 | 6.00 | 9.00 | 12.00 | ▃▇▅▇▃ |
min_high_temperature_forecast | 1 | 1.00 | 16.68 | 10.40 | -12.00 | 8.00 | 16.00 | 27.00 | 36.00 | ▁▇▇▆▇ |
max_high_temperature_forecast | 1 | 1.00 | 17.02 | 10.34 | -8.00 | 8.00 | 16.00 | 27.00 | 36.00 | ▁▇▇▇▇ |
min_low_temperature_forecast | 0 | 1.00 | 6.32 | 9.42 | -24.00 | -1.00 | 6.00 | 15.00 | 25.00 | ▁▃▇▆▅ |
max_low_temperature_forecast | 0 | 1.00 | 7.10 | 9.27 | -21.00 | -1.00 | 7.00 | 16.00 | 25.00 | ▁▃▇▆▆ |
solar_radiation | 4416 | 0.13 | 18914.72 | 7667.59 | 1331.00 | 13180.25 | 19808.00 | 25243.25 | 32399.00 | ▂▅▆▇▆ |
max_cloud_cover_4 | 3506 | 0.31 | 3.45 | 1.15 | 0.00 | 4.00 | 4.00 | 4.00 | 4.00 | ▁▁▁▁▇ |
avg_hourly_cloud_cover_4 | 3506 | 0.31 | 1.84 | 1.26 | 0.00 | 0.70 | 1.80 | 2.90 | 4.00 | ▇▆▆▆▆ |
avg_cloud_cover_4 | 3506 | 0.31 | 1.87 | 0.78 | 0.00 | 2.00 | 2.00 | 2.00 | 4.00 | ▁▂▇▁▁ |
min_cloud_cover_4 | 3506 | 0.31 | 0.30 | 0.89 | 0.00 | 0.00 | 0.00 | 0.00 | 4.00 | ▇▁▁▁▁ |
max_cloud_cover_8 | 2533 | 0.50 | 7.32 | 1.44 | 1.00 | 7.00 | 8.00 | 8.00 | 8.00 | ▁▁▁▁▇ |
avg_hourly_cloud_cover_8 | 2533 | 0.50 | 4.95 | 2.00 | 0.00 | 3.60 | 5.10 | 6.60 | 8.00 | ▂▃▆▇▇ |
avg_cloud_cover_8 | 2533 | 0.50 | 4.48 | 1.47 | 0.50 | 4.00 | 4.00 | 5.00 | 8.00 | ▁▂▇▂▂ |
min_cloud_cover_8 | 2533 | 0.50 | 1.64 | 2.17 | 0.00 | 0.00 | 1.00 | 2.00 | 8.00 | ▇▂▁▁▁ |
max_cloud_cover_10 | 4768 | 0.06 | 8.10 | 3.27 | 0.00 | 7.25 | 10.00 | 10.00 | 10.00 | ▂▁▁▁▇ |
avg_hourly_cloud_cover_10 | 4768 | 0.06 | 4.37 | 3.07 | 0.00 | 1.40 | 4.30 | 7.05 | 10.00 | ▇▅▅▅▃ |
avg_cloud_cover_10 | 4768 | 0.06 | 4.38 | 2.05 | 0.00 | 3.62 | 5.00 | 5.00 | 10.00 | ▂▁▇▁▁ |
min_cloud_cover_10 | 4768 | 0.06 | 0.65 | 1.92 | 0.00 | 0.00 | 0.00 | 0.00 | 10.00 | ▇▁▁▁▁ |
skim(processed_data$all_day_bing_tiles_visited_relt)
Name | processed_data$all_day_bi… |
Number of rows | 5066 |
Number of columns | 1 |
_______________________ | |
Column type frequency: | |
numeric | 1 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
data | 0 | 1 | -0.13 | 0.19 | -0.66 | -0.27 | -0.11 | 0.01 | 0.46 | ▁▆▇▅▁ |
Let’s visualize the data in three sections.
datatable(processed_data[, 1:30], rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T) )
datatable(processed_data[, 31:60], rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T) )
datatable(processed_data[, 61:85], rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T) )
geo <- processed_data[,c(1,3,6:8,15,17,21)] %>%
dplyr::group_by(CD) %>%
dplyr::summarise_all(mean, na.rm = TRUE)
datatable(geo, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T) )
Case frequencies along with the percent of total.
processed_data %>%
janitor::tabyl(cases) %>%
arrange(desc(percent)) %>%
# formatting
janitor::adorn_totals() %>%
janitor::adorn_pct_formatting() %>%
rmarkdown::paged_table()
This section uses only Toronto but can be expanded to other regions. The case numbers are also lagged for 10 days.
A <- processed_data[,c(1,3,6:8,15,17,21)] %>%
dplyr::filter(CD == "Toronto", ) %>%
dplyr::mutate(lagged_cases = lag(cases, n = 10L))
B <- cor(A[,-1], use = "complete.obs")
B
## all_day_bing_tiles_visited_relt cases
## all_day_bing_tiles_visited_relt 1.0000000 -0.79205061
## cases -0.7920506 1.00000000
## maleperc 0.2399123 -0.07432073
## delay_mean -0.4381753 0.60027151
## age -0.4012677 0.44226471
## max_temperature 0.4174737 -0.31406409
## max_humidex 0.3086334 -0.28368514
## lagged_cases -0.7113276 0.87931153
## maleperc delay_mean age
## all_day_bing_tiles_visited_relt 0.239912329 -0.4381753 -0.40126765
## cases -0.074320734 0.6002715 0.44226471
## maleperc 1.000000000 -0.1038246 -0.26590062
## delay_mean -0.103824616 1.0000000 0.39967474
## age -0.265900621 0.3996747 1.00000000
## max_temperature -0.022524454 -0.0961471 0.06058689
## max_humidex -0.013659505 -0.1148618 -0.03511465
## lagged_cases -0.007234725 0.5566002 0.35654930
## max_temperature max_humidex lagged_cases
## all_day_bing_tiles_visited_relt 0.41747367 0.30863342 -0.711327634
## cases -0.31406409 -0.28368514 0.879311531
## maleperc -0.02252445 -0.01365951 -0.007234725
## delay_mean -0.09614710 -0.11486176 0.556600221
## age 0.06058689 -0.03511465 0.356549300
## max_temperature 1.00000000 0.85082857 -0.342176427
## max_humidex 0.85082857 1.00000000 -0.346667692
## lagged_cases -0.34217643 -0.34666769 1.000000000
library(GGally)
ggcorr(B)
An excellent source for learning making maps in R is provided by Nick Eubank and Claudia Engel in this link
Get the county level spatial data from ucdavis.edu:
can2<-getData('GADM', country="CAN", level=2) # counties
head(can2@data, 10)
## GID_0 NAME_0 GID_1 NAME_1 NL_NAME_1 GID_2 NAME_2 VARNAME_2
## 3 CAN Canada CAN.1_1 Alberta <NA> CAN.1.11_1 Division No. 1 <NA>
## 1 CAN Canada CAN.1_1 Alberta <NA> CAN.1.1_1 Division No. 10 <NA>
## 12 CAN Canada CAN.1_1 Alberta <NA> CAN.1.2_1 Division No. 11 <NA>
## 13 CAN Canada CAN.1_1 Alberta <NA> CAN.1.3_1 Division No. 12 <NA>
## 14 CAN Canada CAN.1_1 Alberta <NA> CAN.1.4_1 Division No. 13 <NA>
## 15 CAN Canada CAN.1_1 Alberta <NA> CAN.1.5_1 Division No. 14 <NA>
## 16 CAN Canada CAN.1_1 Alberta <NA> CAN.1.6_1 Division No. 15 <NA>
## 17 CAN Canada CAN.1_1 Alberta <NA> CAN.1.7_1 Division No. 16 <NA>
## 18 CAN Canada CAN.1_1 Alberta <NA> CAN.1.8_1 Division No. 17 <NA>
## 19 CAN Canada CAN.1_1 Alberta <NA> CAN.1.9_1 Division No. 18 <NA>
## NL_NAME_2 TYPE_2 ENGTYPE_2 CC_2 HASC_2
## 3 <NA> Census Division Census Division 01 CA.AB.ON
## 1 <NA> Census Division Census Division 10 CA.AB.TE
## 12 <NA> Census Division Census Division 11 CA.AB.EL
## 13 <NA> Census Division Census Division 12 CA.AB.TV
## 14 <NA> Census Division Census Division 13 CA.AB.TN
## 15 <NA> Census Division Census Division 14 CA.AB.FT
## 16 <NA> Census Division Census Division 15 CA.AB.FN
## 17 <NA> Census Division Census Division 16 CA.AB.ST
## 18 <NA> Census Division Census Division 17 CA.AB.SN
## 19 <NA> Census Division Census Division 18 CA.AB.ET
class(can2)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
county_on <- can2 %>%
filter(NAME_1 == "Ontario")
geo2 <- processed_data %>%
group_by(CD) %>%
summarize(maxcases = max(cases, na.rm = TRUE),
meanmob = mean(all_day_bing_tiles_visited_relt))
a <- unique(county_on$NAME_2)
b <- unique(geo2$CD)
#is.element(b, a)
w = dplyr::left_join(county_on, geo2, by = c("NAME_2" = "CD"))
#class(w)
#nrow(w)
my.palette <- brewer.pal(n = 12, name = "Accent")
spplot(w["maxcases"], main = "Ontario COVID19",
sub = "Maximum cases observed any day",
col = "transparent",
col.regions = my.palette,
cuts = 12)
my.palette <- brewer.pal(n = 12, name = "Accent")
spplot(w["meanmob"], main = "Ontario Mobility",
sub = "Average daily mobility",
col = "transparent",
col.regions = my.palette,
cuts = 12)
# Change these parameters
scale.parameter = 0.45 # scaling parameter. less than 1 is zooming in, more than 1 zooming out.
xshift = 6 # Shift to right in map units.
yshift = -6 # Shift to left in map units.
original.bbox = w@bbox # Pass bbox of your Spatial* Object.
# Just copy-paste the following
edges = original.bbox
edges[1, ] <- (edges[1, ] - mean(edges[1, ])) * scale.parameter + mean(edges[1,
]) + xshift
edges[2, ] <- (edges[2, ] - mean(edges[2, ])) * scale.parameter + mean(edges[2,
]) + yshift
# In `spplot`, set xlim to edges[1,] and ylim to edges[2,]
spplot(w, "maxcases", main = "Ontario COVID19",
sub = "Maximum cases observed any day",
col = "transparent",
xlim = edges[1, ], ylim = edges[2, ])