This is just an initial exploratory step to see our raw data.
NOTE: Please click on the Code button to see the codes.
Let’s download the data and load the packages.
setwd("~/Dropbox/RNS")
load("~/Dropbox/RNS2/DataON/finaldata.RData")
processed_data <- finaldata
This rmd file uses several packages. If you want to replicate the same results, please install them first.
library(tidyverse)
library(tidyquant)
library(recipes)
library(rsample)
library(knitr)
library(maps)
# Data Cleaning
library(janitor)
# EDA
library(skimr)
library(DataExplorer)
library(DT)
library(corrplot)
# ggplot2 Helpers
library(scales)
theme_set(theme_tq())
library(raster)
library(spdplyr)
library(dplyr)
library(RColorBrewer)
As usual, let’s take a glimpse() of our data to see how we should proceed. Since the number of columns is 85, we divided it to 3 sections
glimpse(processed_data[,1:30])
## Rows: 5,066
## Columns: 30
## $ CD                              <chr> "Algoma", "Algoma", "Algoma", "Algoma…
## $ Episode                         <date> 2020-03-01, 2020-03-02, 2020-03-03, …
## $ all_day_bing_tiles_visited_relt <dbl> 0.04122, 0.06040, 0.06287, 0.03830, 0…
## $ all_day_ratio_single_tile_users <dbl> 0.27549, 0.20603, 0.19751, 0.19553, 0…
## $ pc3                             <chr> "NA", "NA", "NA", "NA", "NA", "NA", "…
## $ cases                           <dbl> NA, NA, NA, NA, NA, NA, NA, 1, NA, NA…
## $ maleperc                        <dbl> NA, NA, NA, NA, NA, NA, NA, 0.0, NA, …
## $ delay_mean                      <dbl> NA, NA, NA, NA, NA, NA, NA, 39.0, NA,…
## $ CC                              <dbl> NA, NA, NA, NA, NA, NA, NA, 0.0, NA, …
## $ NoEpi                           <dbl> NA, NA, NA, NA, NA, NA, NA, 1.0, NA, …
## $ NoInfo                          <dbl> NA, NA, NA, NA, NA, NA, NA, 0, NA, NA…
## $ Unknown                         <dbl> NA, NA, NA, NA, NA, NA, NA, 0, NA, NA…
## $ OB                              <dbl> NA, NA, NA, NA, NA, NA, NA, 0, NA, NA…
## $ Travel                          <dbl> NA, NA, NA, NA, NA, NA, NA, 0.0, NA, …
## $ age                             <dbl> NA, NA, NA, NA, NA, NA, NA, 60, NA, N…
## $ city                            <dbl> NA, NA, NA, NA, NA, NA, NA, 23, NA, N…
## $ max_temperature                 <dbl> 3.2, 0.8, 3.6, -0.5, -0.9, -2.2, 3.9,…
## $ avg_hourly_temperature          <dbl> -3.26, -2.56, -1.59, -2.35, -4.97, -6…
## $ avg_temperature                 <dbl> -3.90, -6.05, -3.55, -4.30, -7.90, -1…
## $ min_temperature                 <dbl> -11.0, -12.9, -10.7, -8.1, -14.9, -18…
## $ max_humidex                     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ min_windchill                   <dbl> -16, -13, -13, -10, -17, -17, -21, -1…
## $ max_relative_humidity           <dbl> 90, 94, 96, 89, 92, 87, 87, 86, 92, 9…
## $ avg_hourly_relative_humidity    <dbl> 74.8, 80.0, 82.2, 80.8, 88.9, 71.0, 6…
## $ avg_relative_humidity           <dbl> 72.5, 82.5, 76.0, 79.5, 87.0, 68.0, 6…
## $ min_relative_humidity           <dbl> 55, 71, 56, 70, 82, 49, 34, 41, 55, 7…
## $ max_dew_point                   <dbl> -4.4, -3.1, -0.5, -3.0, -2.5, -4.4, -…
## $ avg_hourly_dew_point            <dbl> -7.3, -5.6, -4.5, -5.2, -6.5, -10.9, …
## $ avg_dew_point                   <dbl> -8.6, -7.5, -6.2, -5.2, -9.2, -11.5, …
## $ min_dew_point                   <dbl> -12.8, -11.8, -11.8, -7.5, -16.0, -18…
glimpse(processed_data[,31:60])
## Rows: 5,066
## Columns: 30
## $ max_wind_speed              <dbl> 28, 31, 19, 41, 27, 32, 21, 20, 28, 34, 1…
## $ avg_hourly_wind_speed       <dbl> 15.54, 16.83, 10.71, 22.50, 16.83, 17.83,…
## $ avg_wind_speed              <dbl> 18.0, 16.0, 11.0, 22.5, 15.0, 16.5, 12.5,…
## $ min_wind_speed              <dbl> 8, 1, 3, 4, 3, 1, 4, 1, 0, 1, 5, 5, 10, 1…
## $ max_wind_gust               <dbl> 48, 45, NA, 52, 42, 42, 33, 37, 39, 50, N…
## $ wind_gust_dir_10s           <dbl> 12, 30, NA, 31, 8, 32, 19, 18, 31, 31, NA…
## $ max_pressure_sea            <dbl> 101.74, 100.52, 100.45, 101.85, 101.89, 1…
## $ avg_hourly_pressure_sea     <dbl> 100.96, 100.32, 100.01, 100.89, 101.60, 1…
## $ avg_pressure_sea            <dbl> 100.95, 100.30, 100.04, 100.99, 101.57, 1…
## $ min_pressure_sea            <dbl> 100.16, 100.09, 99.63, 100.13, 101.25, 10…
## $ max_pressure_station        <dbl> 99.31, 98.15, 98.07, 99.45, 99.47, 100.89…
## $ avg_hourly_pressure_station <dbl> 98.57, 97.96, 97.65, 98.52, 99.19, 100.32…
## $ avg_pressure_station        <dbl> 98.56, 97.95, 97.68, 98.61, 99.15, 100.07…
## $ min_pressure_station        <dbl> 97.80, 97.75, 97.29, 97.78, 98.84, 99.25,…
## $ max_visibility              <dbl> 32200, 40200, 40200, 32200, 32200, 40200,…
## $ avg_hourly_visibility       <dbl> 32200.0, 31912.5, 24337.5, 25291.7, 13225…
## $ avg_visibility              <dbl> 32200, 24950, 20500, 20150, 16700, 21900,…
## $ min_visibility              <dbl> 32200, 9700, 800, 8100, 1200, 3600, 32200…
## $ max_health_index            <dbl> 2.9, 3.0, 2.9, 2.8, 2.7, 2.7, 2.8, 4.0, 3…
## $ avg_hourly_health_index     <dbl> 2.4, 2.5, 2.4, 2.3, 2.3, 2.3, 2.4, 2.8, 2…
## $ avg_health_index            <dbl> 2.5, 2.6, 2.5, 2.5, 2.4, 2.3, 2.4, 3.0, 2…
## $ min_health_index            <dbl> 2.0, 2.3, 2.0, 2.1, 2.1, 1.9, 2.0, 2.1, 2…
## $ heatdegdays                 <dbl> 21.9, 24.1, 21.6, 22.3, 25.9, 28.6, 25.5,…
## $ cooldegdays                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ growdegdays_5               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ growdegdays_7               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ growdegdays_10              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ precipitation               <dbl> 0.0, 0.0, 1.4, 0.8, 7.8, 0.5, 0.0, 0.0, 1…
## $ rain                        <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1…
## $ snow                        <dbl> 0.0, 0.0, 2.7, 0.8, 10.6, 1.2, 0.0, 0.0, …
glimpse(processed_data[,61:85])
## Rows: 5,066
## Columns: 25
## $ snow_on_ground                <dbl> 72, 72, 72, 74, 73, 82, 82, 82, 74, 73,…
## $ sunrise                       <chr> "7:16:00", "7:14:00", "7:13:00", "7:11:…
## $ sunset                        <chr> "18:23:00", "18:24:00", "18:26:00", "18…
## $ daylight                      <dbl> 11.12, 11.17, 11.22, 11.27, 11.33, 11.3…
## $ sunrise_f                     <dbl> 7.27, 7.23, 7.22, 7.18, 7.15, 7.12, 7.0…
## $ sunset_f                      <dbl> 18.38, 18.40, 18.43, 18.45, 18.48, 18.5…
## $ min_uv_forecast               <dbl> 2, 2, 2, 2, 1, 3, 4, 4, 3, 4, 1, 2, 1, …
## $ max_uv_forecast               <dbl> 2, 3, 2, 2, 1, 3, 4, 4, 3, 4, 1, 2, 1, …
## $ min_high_temperature_forecast <dbl> 2, 1, 2, -1, -1, -1, 3, 8, 6, -1, 2, 3,…
## $ max_high_temperature_forecast <dbl> 2, 1, 2, 0, -1, -1, 3, 8, 8, -1, 2, 3, …
## $ min_low_temperature_forecast  <dbl> -1, -8, -3, -8, -9, -17, -6, 2, -3, -9,…
## $ max_low_temperature_forecast  <dbl> -1, -6, -3, -8, -9, -9, -4, 2, -3, -8, …
## $ solar_radiation               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ max_cloud_cover_4             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ avg_hourly_cloud_cover_4      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ avg_cloud_cover_4             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ min_cloud_cover_4             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ max_cloud_cover_8             <dbl> 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, …
## $ avg_hourly_cloud_cover_8      <dbl> 6.0, 5.8, 6.8, 7.2, 5.4, 3.7, 4.3, 4.3,…
## $ avg_cloud_cover_8             <dbl> 5.5, 5.0, 5.5, 6.5, 4.0, 4.0, 4.0, 4.5,…
## $ min_cloud_cover_8             <dbl> 3, 2, 3, 5, 0, 0, 0, 2, 2, 2, 5, 8, 6, …
## $ max_cloud_cover_10            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ avg_hourly_cloud_cover_10     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ avg_cloud_cover_10            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ min_cloud_cover_10            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
The visualization provided by plot_missing() helps identify columns that may need attention.
The data columns are divided into three sections:
plot_missing(processed_data[,1:30],
             title   = "% of Missing Data (filtered to cols w/missing data)",
             ggtheme = theme_tq())
plot_missing(processed_data[,31:60],
             title   = "% of Missing Data (filtered to cols w/missing data)",
             ggtheme = theme_tq())
plot_missing(processed_data[,61:85],
             title   = "% of Missing Data (filtered to cols w/missing data)",
             ggtheme = theme_tq())
skim() will do the job:
skim(processed_data)
| Name | processed_data | 
| Number of rows | 5066 | 
| Number of columns | 85 | 
| _______________________ | |
| Column type frequency: | |
| character | 4 | 
| Date | 1 | 
| numeric | 80 | 
| ________________________ | |
| Group variables | None | 
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace | 
|---|---|---|---|---|---|---|---|
| CD | 0 | 1 | 4 | 30 | 0 | 34 | 0 | 
| pc3 | 0 | 1 | 2 | 3 | 0 | 35 | 0 | 
| sunrise | 0 | 1 | 7 | 7 | 0 | 182 | 0 | 
| sunset | 0 | 1 | 8 | 8 | 0 | 239 | 0 | 
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique | 
|---|---|---|---|---|---|---|
| Episode | 0 | 1 | 2020-03-01 | 2020-07-27 | 2020-05-14 | 149 | 
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist | 
|---|---|---|---|---|---|---|---|---|---|---|
| all_day_bing_tiles_visited_relt | 0 | 1.00 | -0.13 | 0.19 | -0.66 | -0.27 | -0.11 | 0.01 | 0.46 | ▁▆▇▅▁ | 
| all_day_ratio_single_tile_users | 0 | 1.00 | 0.29 | 0.08 | 0.14 | 0.24 | 0.28 | 0.35 | 0.55 | ▂▇▅▂▁ | 
| cases | 2133 | 0.58 | 13.21 | 27.45 | 1.00 | 1.00 | 4.00 | 11.00 | 286.00 | ▇▁▁▁▁ | 
| maleperc | 2133 | 0.58 | 0.46 | 0.32 | 0.00 | 0.25 | 0.47 | 0.67 | 1.00 | ▆▅▇▃▅ | 
| delay_mean | 2133 | 0.58 | 6.62 | 6.09 | 0.00 | 3.00 | 5.00 | 8.04 | 87.00 | ▇▁▁▁▁ | 
| CC | 2133 | 0.58 | 0.30 | 0.32 | 0.00 | 0.00 | 0.24 | 0.50 | 1.00 | ▇▃▃▁▂ | 
| NoEpi | 2133 | 0.58 | 0.26 | 0.31 | 0.00 | 0.00 | 0.17 | 0.39 | 1.00 | ▇▃▂▁▂ | 
| NoInfo | 2133 | 0.58 | 0.03 | 0.13 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ | 
| Unknown | 2133 | 0.58 | 0.00 | 0.04 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ | 
| OB | 2133 | 0.58 | 0.30 | 0.34 | 0.00 | 0.00 | 0.17 | 0.50 | 1.00 | ▇▂▂▂▂ | 
| Travel | 2133 | 0.58 | 0.10 | 0.25 | 0.00 | 0.00 | 0.00 | 0.02 | 1.00 | ▇▁▁▁▁ | 
| age | 2133 | 0.58 | 44.67 | 14.29 | 20.00 | 34.42 | 44.35 | 53.33 | 90.00 | ▆▇▇▂▁ | 
| city | 2133 | 0.58 | 17.99 | 10.33 | 1.00 | 9.00 | 17.00 | 28.00 | 34.00 | ▆▆▅▅▇ | 
| max_temperature | 3 | 1.00 | 16.87 | 10.11 | -7.60 | 8.10 | 15.90 | 26.50 | 36.90 | ▂▇▇▇▆ | 
| avg_hourly_temperature | 218 | 0.96 | 11.87 | 9.30 | -14.70 | 3.85 | 11.47 | 20.85 | 28.70 | ▁▅▇▆▇ | 
| avg_temperature | 3 | 1.00 | 11.20 | 9.46 | -15.85 | 3.35 | 10.50 | 20.50 | 28.90 | ▁▅▇▆▇ | 
| min_temperature | 3 | 1.00 | 5.53 | 9.24 | -29.70 | -1.80 | 4.80 | 14.10 | 24.50 | ▁▁▇▆▆ | 
| max_humidex | 3253 | 0.36 | 33.11 | 4.28 | 25.00 | 30.00 | 33.00 | 36.00 | 44.00 | ▃▇▇▅▁ | 
| min_windchill | 3517 | 0.31 | -8.30 | 5.01 | -35.00 | -10.00 | -7.00 | -5.00 | 0.00 | ▁▁▂▇▇ | 
| max_relative_humidity | 32 | 0.99 | 90.02 | 9.90 | 30.00 | 85.00 | 93.00 | 98.00 | 100.00 | ▁▁▁▂▇ | 
| avg_hourly_relative_humidity | 32 | 0.99 | 69.16 | 13.27 | 22.70 | 60.40 | 69.40 | 79.00 | 100.00 | ▁▂▇▇▃ | 
| avg_relative_humidity | 32 | 0.99 | 68.80 | 11.13 | 21.50 | 62.00 | 69.00 | 76.50 | 100.00 | ▁▁▇▇▂ | 
| min_relative_humidity | 32 | 0.99 | 47.59 | 15.60 | 11.00 | 36.00 | 46.00 | 57.00 | 100.00 | ▂▇▇▃▁ | 
| max_dew_point | 32 | 0.99 | 8.63 | 9.24 | -21.70 | 0.72 | 8.10 | 17.60 | 25.30 | ▁▂▇▆▇ | 
| avg_hourly_dew_point | 32 | 0.99 | 5.38 | 9.69 | -23.40 | -2.60 | 4.40 | 14.97 | 23.00 | ▁▃▇▅▇ | 
| avg_dew_point | 32 | 0.99 | 5.26 | 9.64 | -23.60 | -2.80 | 4.20 | 14.80 | 23.00 | ▁▃▇▅▇ | 
| min_dew_point | 32 | 0.99 | 1.88 | 10.30 | -31.20 | -6.50 | 1.10 | 11.90 | 21.40 | ▁▂▇▆▆ | 
| max_wind_speed | 30 | 0.99 | 24.22 | 8.82 | 4.00 | 18.00 | 23.00 | 30.00 | 94.00 | ▇▇▁▁▁ | 
| avg_hourly_wind_speed | 30 | 0.99 | 13.42 | 5.82 | 2.12 | 9.17 | 12.46 | 16.58 | 43.17 | ▆▇▂▁▁ | 
| avg_wind_speed | 30 | 0.99 | 14.20 | 5.79 | 2.00 | 10.00 | 13.50 | 17.50 | 50.00 | ▆▇▂▁▁ | 
| min_wind_speed | 30 | 0.99 | 4.19 | 4.03 | 0.00 | 1.00 | 4.00 | 6.00 | 35.00 | ▇▂▁▁▁ | 
| max_wind_gust | 1224 | 0.76 | 42.60 | 11.26 | 27.00 | 34.00 | 40.00 | 48.00 | 160.00 | ▇▂▁▁▁ | 
| wind_gust_dir_10s | 1225 | 0.76 | 21.79 | 9.65 | 0.00 | 16.00 | 24.00 | 30.00 | 36.00 | ▃▃▅▇▇ | 
| max_pressure_sea | 151 | 0.97 | 101.86 | 0.71 | 99.62 | 101.42 | 101.79 | 102.21 | 104.59 | ▁▅▇▂▁ | 
| avg_hourly_pressure_sea | 151 | 0.97 | 101.53 | 0.74 | 99.07 | 101.10 | 101.54 | 101.92 | 104.30 | ▁▃▇▂▁ | 
| avg_pressure_sea | 151 | 0.97 | 101.53 | 0.72 | 99.29 | 101.09 | 101.53 | 101.89 | 104.24 | ▁▅▇▂▁ | 
| min_pressure_sea | 151 | 0.97 | 101.20 | 0.79 | 98.43 | 100.72 | 101.26 | 101.64 | 103.99 | ▁▃▇▂▁ | 
| max_pressure_station | 151 | 0.97 | 99.32 | 1.30 | 95.42 | 98.44 | 99.36 | 100.26 | 103.62 | ▁▅▇▃▁ | 
| avg_hourly_pressure_station | 151 | 0.97 | 99.01 | 1.31 | 94.84 | 98.15 | 99.05 | 99.95 | 103.35 | ▁▅▇▃▁ | 
| avg_pressure_station | 151 | 0.97 | 99.00 | 1.30 | 94.84 | 98.14 | 99.05 | 99.94 | 103.31 | ▁▃▇▃▁ | 
| min_pressure_station | 151 | 0.97 | 98.68 | 1.34 | 94.10 | 97.80 | 98.73 | 99.64 | 103.00 | ▁▃▇▅▁ | 
| max_visibility | 1194 | 0.76 | 22733.55 | 7484.95 | 12900.00 | 16100.00 | 24100.00 | 24100.00 | 80500.00 | ▇▁▁▁▁ | 
| avg_hourly_visibility | 1194 | 0.76 | 20331.38 | 5536.76 | 3825.00 | 16100.00 | 21230.40 | 24100.00 | 40200.00 | ▁▇▇▂▁ | 
| avg_visibility | 1194 | 0.76 | 18588.61 | 6447.86 | 6550.00 | 14450.00 | 16100.00 | 24100.00 | 52320.00 | ▅▇▁▁▁ | 
| min_visibility | 1194 | 0.76 | 14443.67 | 8811.92 | 0.00 | 4800.00 | 16100.00 | 24100.00 | 40200.00 | ▅▂▇▁▁ | 
| max_health_index | 1349 | 0.73 | 2.86 | 0.70 | 1.00 | 2.40 | 2.80 | 3.20 | 7.60 | ▂▇▂▁▁ | 
| avg_hourly_health_index | 1349 | 0.73 | 2.28 | 0.50 | 1.00 | 2.00 | 2.30 | 2.60 | 4.10 | ▂▇▇▂▁ | 
| avg_health_index | 1349 | 0.73 | 2.31 | 0.51 | 1.00 | 2.00 | 2.30 | 2.60 | 4.70 | ▂▇▆▁▁ | 
| min_health_index | 1349 | 0.73 | 1.76 | 0.47 | 1.00 | 1.40 | 1.80 | 2.10 | 3.40 | ▆▇▇▁▁ | 
| heatdegdays | 3 | 1.00 | 8.22 | 7.76 | 0.00 | 0.00 | 7.50 | 14.70 | 33.90 | ▇▃▅▁▁ | 
| cooldegdays | 3 | 1.00 | 1.42 | 2.43 | 0.00 | 0.00 | 0.00 | 2.50 | 10.90 | ▇▁▁▁▁ | 
| growdegdays_5 | 3 | 1.00 | 7.66 | 7.63 | 0.00 | 0.00 | 5.50 | 15.50 | 23.90 | ▇▂▂▃▁ | 
| growdegdays_7 | 3 | 1.00 | 6.40 | 6.92 | 0.00 | 0.00 | 3.50 | 13.50 | 21.90 | ▇▂▂▃▁ | 
| growdegdays_10 | 3 | 1.00 | 4.76 | 5.73 | 0.00 | 0.00 | 0.50 | 10.50 | 18.90 | ▇▁▂▂▁ | 
| precipitation | 819 | 0.84 | 2.15 | 5.66 | 0.00 | 0.00 | 0.00 | 1.30 | 86.60 | ▇▁▁▁▁ | 
| rain | 2930 | 0.42 | 1.99 | 5.64 | 0.00 | 0.00 | 0.00 | 1.00 | 86.60 | ▇▁▁▁▁ | 
| snow | 2927 | 0.42 | 0.15 | 0.83 | 0.00 | 0.00 | 0.00 | 0.00 | 12.20 | ▇▁▁▁▁ | 
| snow_on_ground | 3766 | 0.26 | 9.46 | 19.22 | 0.00 | 0.00 | 0.00 | 8.00 | 96.00 | ▇▁▁▁▁ | 
| daylight | 0 | 1.00 | 14.17 | 1.38 | 11.00 | 13.07 | 14.73 | 15.30 | 16.35 | ▃▃▃▇▆ | 
| sunrise_f | 0 | 1.00 | 6.23 | 0.68 | 5.15 | 5.70 | 6.00 | 6.77 | 8.40 | ▇▇▅▃▁ | 
| sunset_f | 0 | 1.00 | 20.40 | 0.79 | 17.77 | 19.90 | 20.63 | 21.00 | 22.05 | ▁▁▅▇▁ | 
| min_uv_forecast | 2 | 1.00 | 5.96 | 2.63 | 1.00 | 4.00 | 6.00 | 8.00 | 12.00 | ▆▇▅▇▂ | 
| max_uv_forecast | 2 | 1.00 | 6.44 | 2.57 | 1.00 | 4.00 | 6.00 | 9.00 | 12.00 | ▃▇▅▇▃ | 
| min_high_temperature_forecast | 1 | 1.00 | 16.68 | 10.40 | -12.00 | 8.00 | 16.00 | 27.00 | 36.00 | ▁▇▇▆▇ | 
| max_high_temperature_forecast | 1 | 1.00 | 17.02 | 10.34 | -8.00 | 8.00 | 16.00 | 27.00 | 36.00 | ▁▇▇▇▇ | 
| min_low_temperature_forecast | 0 | 1.00 | 6.32 | 9.42 | -24.00 | -1.00 | 6.00 | 15.00 | 25.00 | ▁▃▇▆▅ | 
| max_low_temperature_forecast | 0 | 1.00 | 7.10 | 9.27 | -21.00 | -1.00 | 7.00 | 16.00 | 25.00 | ▁▃▇▆▆ | 
| solar_radiation | 4416 | 0.13 | 18914.72 | 7667.59 | 1331.00 | 13180.25 | 19808.00 | 25243.25 | 32399.00 | ▂▅▆▇▆ | 
| max_cloud_cover_4 | 3506 | 0.31 | 3.45 | 1.15 | 0.00 | 4.00 | 4.00 | 4.00 | 4.00 | ▁▁▁▁▇ | 
| avg_hourly_cloud_cover_4 | 3506 | 0.31 | 1.84 | 1.26 | 0.00 | 0.70 | 1.80 | 2.90 | 4.00 | ▇▆▆▆▆ | 
| avg_cloud_cover_4 | 3506 | 0.31 | 1.87 | 0.78 | 0.00 | 2.00 | 2.00 | 2.00 | 4.00 | ▁▂▇▁▁ | 
| min_cloud_cover_4 | 3506 | 0.31 | 0.30 | 0.89 | 0.00 | 0.00 | 0.00 | 0.00 | 4.00 | ▇▁▁▁▁ | 
| max_cloud_cover_8 | 2533 | 0.50 | 7.32 | 1.44 | 1.00 | 7.00 | 8.00 | 8.00 | 8.00 | ▁▁▁▁▇ | 
| avg_hourly_cloud_cover_8 | 2533 | 0.50 | 4.95 | 2.00 | 0.00 | 3.60 | 5.10 | 6.60 | 8.00 | ▂▃▆▇▇ | 
| avg_cloud_cover_8 | 2533 | 0.50 | 4.48 | 1.47 | 0.50 | 4.00 | 4.00 | 5.00 | 8.00 | ▁▂▇▂▂ | 
| min_cloud_cover_8 | 2533 | 0.50 | 1.64 | 2.17 | 0.00 | 0.00 | 1.00 | 2.00 | 8.00 | ▇▂▁▁▁ | 
| max_cloud_cover_10 | 4768 | 0.06 | 8.10 | 3.27 | 0.00 | 7.25 | 10.00 | 10.00 | 10.00 | ▂▁▁▁▇ | 
| avg_hourly_cloud_cover_10 | 4768 | 0.06 | 4.37 | 3.07 | 0.00 | 1.40 | 4.30 | 7.05 | 10.00 | ▇▅▅▅▃ | 
| avg_cloud_cover_10 | 4768 | 0.06 | 4.38 | 2.05 | 0.00 | 3.62 | 5.00 | 5.00 | 10.00 | ▂▁▇▁▁ | 
| min_cloud_cover_10 | 4768 | 0.06 | 0.65 | 1.92 | 0.00 | 0.00 | 0.00 | 0.00 | 10.00 | ▇▁▁▁▁ | 
skim(processed_data$all_day_bing_tiles_visited_relt)
| Name | processed_data$all_day_bi… | 
| Number of rows | 5066 | 
| Number of columns | 1 | 
| _______________________ | |
| Column type frequency: | |
| numeric | 1 | 
| ________________________ | |
| Group variables | None | 
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist | 
|---|---|---|---|---|---|---|---|---|---|---|
| data | 0 | 1 | -0.13 | 0.19 | -0.66 | -0.27 | -0.11 | 0.01 | 0.46 | ▁▆▇▅▁ | 
Let’s visualize the data in three sections.
datatable(processed_data[, 1:30], rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T) )
datatable(processed_data[, 31:60], rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T) )
datatable(processed_data[, 61:85], rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T) )
geo <- processed_data[,c(1,3,6:8,15,17,21)] %>% 
  dplyr::group_by(CD) %>%
  dplyr::summarise_all(mean, na.rm = TRUE)
datatable(geo, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T) )
Case frequencies along with the percent of total.
processed_data %>% 
  janitor::tabyl(cases) %>% 
  arrange(desc(percent)) %>% 
  
  # formatting
  janitor::adorn_totals() %>% 
  janitor::adorn_pct_formatting() %>% 
  rmarkdown::paged_table()
This section uses only Toronto but can be expanded to other regions. The case numbers are also lagged for 10 days.
A <- processed_data[,c(1,3,6:8,15,17,21)] %>% 
  dplyr::filter(CD == "Toronto", ) %>%
  dplyr::mutate(lagged_cases = lag(cases, n = 10L))
B <- cor(A[,-1], use = "complete.obs")
B
##                                 all_day_bing_tiles_visited_relt       cases
## all_day_bing_tiles_visited_relt                       1.0000000 -0.79205061
## cases                                                -0.7920506  1.00000000
## maleperc                                              0.2399123 -0.07432073
## delay_mean                                           -0.4381753  0.60027151
## age                                                  -0.4012677  0.44226471
## max_temperature                                       0.4174737 -0.31406409
## max_humidex                                           0.3086334 -0.28368514
## lagged_cases                                         -0.7113276  0.87931153
##                                     maleperc delay_mean         age
## all_day_bing_tiles_visited_relt  0.239912329 -0.4381753 -0.40126765
## cases                           -0.074320734  0.6002715  0.44226471
## maleperc                         1.000000000 -0.1038246 -0.26590062
## delay_mean                      -0.103824616  1.0000000  0.39967474
## age                             -0.265900621  0.3996747  1.00000000
## max_temperature                 -0.022524454 -0.0961471  0.06058689
## max_humidex                     -0.013659505 -0.1148618 -0.03511465
## lagged_cases                    -0.007234725  0.5566002  0.35654930
##                                 max_temperature max_humidex lagged_cases
## all_day_bing_tiles_visited_relt      0.41747367  0.30863342 -0.711327634
## cases                               -0.31406409 -0.28368514  0.879311531
## maleperc                            -0.02252445 -0.01365951 -0.007234725
## delay_mean                          -0.09614710 -0.11486176  0.556600221
## age                                  0.06058689 -0.03511465  0.356549300
## max_temperature                      1.00000000  0.85082857 -0.342176427
## max_humidex                          0.85082857  1.00000000 -0.346667692
## lagged_cases                        -0.34217643 -0.34666769  1.000000000
library(GGally)
ggcorr(B)
An excellent source for learning making maps in R is provided by Nick Eubank and Claudia Engel in this link
Get the county level spatial data from ucdavis.edu:
can2<-getData('GADM', country="CAN", level=2) # counties
head(can2@data, 10)
##    GID_0 NAME_0   GID_1  NAME_1 NL_NAME_1      GID_2          NAME_2 VARNAME_2
## 3    CAN Canada CAN.1_1 Alberta      <NA> CAN.1.11_1  Division No. 1      <NA>
## 1    CAN Canada CAN.1_1 Alberta      <NA>  CAN.1.1_1 Division No. 10      <NA>
## 12   CAN Canada CAN.1_1 Alberta      <NA>  CAN.1.2_1 Division No. 11      <NA>
## 13   CAN Canada CAN.1_1 Alberta      <NA>  CAN.1.3_1 Division No. 12      <NA>
## 14   CAN Canada CAN.1_1 Alberta      <NA>  CAN.1.4_1 Division No. 13      <NA>
## 15   CAN Canada CAN.1_1 Alberta      <NA>  CAN.1.5_1 Division No. 14      <NA>
## 16   CAN Canada CAN.1_1 Alberta      <NA>  CAN.1.6_1 Division No. 15      <NA>
## 17   CAN Canada CAN.1_1 Alberta      <NA>  CAN.1.7_1 Division No. 16      <NA>
## 18   CAN Canada CAN.1_1 Alberta      <NA>  CAN.1.8_1 Division No. 17      <NA>
## 19   CAN Canada CAN.1_1 Alberta      <NA>  CAN.1.9_1 Division No. 18      <NA>
##    NL_NAME_2          TYPE_2       ENGTYPE_2 CC_2   HASC_2
## 3       <NA> Census Division Census Division   01 CA.AB.ON
## 1       <NA> Census Division Census Division   10 CA.AB.TE
## 12      <NA> Census Division Census Division   11 CA.AB.EL
## 13      <NA> Census Division Census Division   12 CA.AB.TV
## 14      <NA> Census Division Census Division   13 CA.AB.TN
## 15      <NA> Census Division Census Division   14 CA.AB.FT
## 16      <NA> Census Division Census Division   15 CA.AB.FN
## 17      <NA> Census Division Census Division   16 CA.AB.ST
## 18      <NA> Census Division Census Division   17 CA.AB.SN
## 19      <NA> Census Division Census Division   18 CA.AB.ET
class(can2)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
county_on <- can2 %>% 
  filter(NAME_1 == "Ontario")
geo2 <- processed_data %>% 
  group_by(CD) %>%
  summarize(maxcases = max(cases, na.rm = TRUE),
            meanmob = mean(all_day_bing_tiles_visited_relt))
  
a <- unique(county_on$NAME_2)
b <- unique(geo2$CD)
#is.element(b, a)
w = dplyr::left_join(county_on, geo2, by = c("NAME_2" = "CD"))
#class(w)
#nrow(w)
my.palette <- brewer.pal(n = 12, name = "Accent")
spplot(w["maxcases"], main = "Ontario COVID19", 
       sub = "Maximum cases observed any day", 
       col = "transparent",
       col.regions = my.palette,
       cuts = 12)
my.palette <- brewer.pal(n = 12, name = "Accent")
spplot(w["meanmob"], main = "Ontario Mobility", 
       sub = "Average daily mobility", 
       col = "transparent",
       col.regions = my.palette,
       cuts = 12)
# Change these parameters
scale.parameter = 0.45  # scaling parameter. less than 1 is zooming in, more than 1 zooming out. 
xshift = 6  # Shift to right in map units. 
yshift = -6  # Shift to left in map units. 
original.bbox = w@bbox  # Pass bbox of your Spatial* Object. 
# Just copy-paste the following
edges = original.bbox
edges[1, ] <- (edges[1, ] - mean(edges[1, ])) * scale.parameter + mean(edges[1, 
]) + xshift
edges[2, ] <- (edges[2, ] - mean(edges[2, ])) * scale.parameter + mean(edges[2, 
]) + yshift
# In `spplot`, set xlim to edges[1,] and ylim to edges[2,]
spplot(w, "maxcases", main = "Ontario COVID19",
       sub = "Maximum cases observed any day", 
       col = "transparent",
       xlim = edges[1, ], ylim = edges[2, ])