This is just an initial exploratory step to see our raw data.
NOTE: Please click on the Code button to see the codes.

Load the Data and Packages

Let’s download the data and load the packages.

setwd("~/Dropbox/RNS")
load("~/Dropbox/RNS2/DataON/finaldata.RData")
processed_data <- finaldata

This rmd file uses several packages. If you want to replicate the same results, please install them first.

library(tidyverse)
library(tidyquant)
library(recipes)
library(rsample)
library(knitr)
library(maps)

# Data Cleaning
library(janitor)

# EDA
library(skimr)
library(DataExplorer)
library(DT)
library(corrplot)

# ggplot2 Helpers
library(scales)
theme_set(theme_tq())
library(raster)
library(spdplyr)
library(dplyr)
library(RColorBrewer)

Take a glimpse of the data

As usual, let’s take a glimpse() of our data to see how we should proceed. Since the number of columns is 85, we divided it to 3 sections


First (1:30)

glimpse(processed_data[,1:30])
## Rows: 5,066
## Columns: 30
## $ CD                              <chr> "Algoma", "Algoma", "Algoma", "Algoma…
## $ Episode                         <date> 2020-03-01, 2020-03-02, 2020-03-03, …
## $ all_day_bing_tiles_visited_relt <dbl> 0.04122, 0.06040, 0.06287, 0.03830, 0…
## $ all_day_ratio_single_tile_users <dbl> 0.27549, 0.20603, 0.19751, 0.19553, 0…
## $ pc3                             <chr> "NA", "NA", "NA", "NA", "NA", "NA", "…
## $ cases                           <dbl> NA, NA, NA, NA, NA, NA, NA, 1, NA, NA…
## $ maleperc                        <dbl> NA, NA, NA, NA, NA, NA, NA, 0.0, NA, …
## $ delay_mean                      <dbl> NA, NA, NA, NA, NA, NA, NA, 39.0, NA,…
## $ CC                              <dbl> NA, NA, NA, NA, NA, NA, NA, 0.0, NA, …
## $ NoEpi                           <dbl> NA, NA, NA, NA, NA, NA, NA, 1.0, NA, …
## $ NoInfo                          <dbl> NA, NA, NA, NA, NA, NA, NA, 0, NA, NA…
## $ Unknown                         <dbl> NA, NA, NA, NA, NA, NA, NA, 0, NA, NA…
## $ OB                              <dbl> NA, NA, NA, NA, NA, NA, NA, 0, NA, NA…
## $ Travel                          <dbl> NA, NA, NA, NA, NA, NA, NA, 0.0, NA, …
## $ age                             <dbl> NA, NA, NA, NA, NA, NA, NA, 60, NA, N…
## $ city                            <dbl> NA, NA, NA, NA, NA, NA, NA, 23, NA, N…
## $ max_temperature                 <dbl> 3.2, 0.8, 3.6, -0.5, -0.9, -2.2, 3.9,…
## $ avg_hourly_temperature          <dbl> -3.26, -2.56, -1.59, -2.35, -4.97, -6…
## $ avg_temperature                 <dbl> -3.90, -6.05, -3.55, -4.30, -7.90, -1…
## $ min_temperature                 <dbl> -11.0, -12.9, -10.7, -8.1, -14.9, -18…
## $ max_humidex                     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ min_windchill                   <dbl> -16, -13, -13, -10, -17, -17, -21, -1…
## $ max_relative_humidity           <dbl> 90, 94, 96, 89, 92, 87, 87, 86, 92, 9…
## $ avg_hourly_relative_humidity    <dbl> 74.8, 80.0, 82.2, 80.8, 88.9, 71.0, 6…
## $ avg_relative_humidity           <dbl> 72.5, 82.5, 76.0, 79.5, 87.0, 68.0, 6…
## $ min_relative_humidity           <dbl> 55, 71, 56, 70, 82, 49, 34, 41, 55, 7…
## $ max_dew_point                   <dbl> -4.4, -3.1, -0.5, -3.0, -2.5, -4.4, -…
## $ avg_hourly_dew_point            <dbl> -7.3, -5.6, -4.5, -5.2, -6.5, -10.9, …
## $ avg_dew_point                   <dbl> -8.6, -7.5, -6.2, -5.2, -9.2, -11.5, …
## $ min_dew_point                   <dbl> -12.8, -11.8, -11.8, -7.5, -16.0, -18…

Second (31:60)

glimpse(processed_data[,31:60])
## Rows: 5,066
## Columns: 30
## $ max_wind_speed              <dbl> 28, 31, 19, 41, 27, 32, 21, 20, 28, 34, 1…
## $ avg_hourly_wind_speed       <dbl> 15.54, 16.83, 10.71, 22.50, 16.83, 17.83,…
## $ avg_wind_speed              <dbl> 18.0, 16.0, 11.0, 22.5, 15.0, 16.5, 12.5,…
## $ min_wind_speed              <dbl> 8, 1, 3, 4, 3, 1, 4, 1, 0, 1, 5, 5, 10, 1…
## $ max_wind_gust               <dbl> 48, 45, NA, 52, 42, 42, 33, 37, 39, 50, N…
## $ wind_gust_dir_10s           <dbl> 12, 30, NA, 31, 8, 32, 19, 18, 31, 31, NA…
## $ max_pressure_sea            <dbl> 101.74, 100.52, 100.45, 101.85, 101.89, 1…
## $ avg_hourly_pressure_sea     <dbl> 100.96, 100.32, 100.01, 100.89, 101.60, 1…
## $ avg_pressure_sea            <dbl> 100.95, 100.30, 100.04, 100.99, 101.57, 1…
## $ min_pressure_sea            <dbl> 100.16, 100.09, 99.63, 100.13, 101.25, 10…
## $ max_pressure_station        <dbl> 99.31, 98.15, 98.07, 99.45, 99.47, 100.89…
## $ avg_hourly_pressure_station <dbl> 98.57, 97.96, 97.65, 98.52, 99.19, 100.32…
## $ avg_pressure_station        <dbl> 98.56, 97.95, 97.68, 98.61, 99.15, 100.07…
## $ min_pressure_station        <dbl> 97.80, 97.75, 97.29, 97.78, 98.84, 99.25,…
## $ max_visibility              <dbl> 32200, 40200, 40200, 32200, 32200, 40200,…
## $ avg_hourly_visibility       <dbl> 32200.0, 31912.5, 24337.5, 25291.7, 13225…
## $ avg_visibility              <dbl> 32200, 24950, 20500, 20150, 16700, 21900,…
## $ min_visibility              <dbl> 32200, 9700, 800, 8100, 1200, 3600, 32200…
## $ max_health_index            <dbl> 2.9, 3.0, 2.9, 2.8, 2.7, 2.7, 2.8, 4.0, 3…
## $ avg_hourly_health_index     <dbl> 2.4, 2.5, 2.4, 2.3, 2.3, 2.3, 2.4, 2.8, 2…
## $ avg_health_index            <dbl> 2.5, 2.6, 2.5, 2.5, 2.4, 2.3, 2.4, 3.0, 2…
## $ min_health_index            <dbl> 2.0, 2.3, 2.0, 2.1, 2.1, 1.9, 2.0, 2.1, 2…
## $ heatdegdays                 <dbl> 21.9, 24.1, 21.6, 22.3, 25.9, 28.6, 25.5,…
## $ cooldegdays                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ growdegdays_5               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ growdegdays_7               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ growdegdays_10              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ precipitation               <dbl> 0.0, 0.0, 1.4, 0.8, 7.8, 0.5, 0.0, 0.0, 1…
## $ rain                        <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1…
## $ snow                        <dbl> 0.0, 0.0, 2.7, 0.8, 10.6, 1.2, 0.0, 0.0, …

Third (61:85)

glimpse(processed_data[,61:85])
## Rows: 5,066
## Columns: 25
## $ snow_on_ground                <dbl> 72, 72, 72, 74, 73, 82, 82, 82, 74, 73,…
## $ sunrise                       <chr> "7:16:00", "7:14:00", "7:13:00", "7:11:…
## $ sunset                        <chr> "18:23:00", "18:24:00", "18:26:00", "18…
## $ daylight                      <dbl> 11.12, 11.17, 11.22, 11.27, 11.33, 11.3…
## $ sunrise_f                     <dbl> 7.27, 7.23, 7.22, 7.18, 7.15, 7.12, 7.0…
## $ sunset_f                      <dbl> 18.38, 18.40, 18.43, 18.45, 18.48, 18.5…
## $ min_uv_forecast               <dbl> 2, 2, 2, 2, 1, 3, 4, 4, 3, 4, 1, 2, 1, …
## $ max_uv_forecast               <dbl> 2, 3, 2, 2, 1, 3, 4, 4, 3, 4, 1, 2, 1, …
## $ min_high_temperature_forecast <dbl> 2, 1, 2, -1, -1, -1, 3, 8, 6, -1, 2, 3,…
## $ max_high_temperature_forecast <dbl> 2, 1, 2, 0, -1, -1, 3, 8, 8, -1, 2, 3, …
## $ min_low_temperature_forecast  <dbl> -1, -8, -3, -8, -9, -17, -6, 2, -3, -9,…
## $ max_low_temperature_forecast  <dbl> -1, -6, -3, -8, -9, -9, -4, 2, -3, -8, …
## $ solar_radiation               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ max_cloud_cover_4             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ avg_hourly_cloud_cover_4      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ avg_cloud_cover_4             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ min_cloud_cover_4             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ max_cloud_cover_8             <dbl> 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, …
## $ avg_hourly_cloud_cover_8      <dbl> 6.0, 5.8, 6.8, 7.2, 5.4, 3.7, 4.3, 4.3,…
## $ avg_cloud_cover_8             <dbl> 5.5, 5.0, 5.5, 6.5, 4.0, 4.0, 4.0, 4.5,…
## $ min_cloud_cover_8             <dbl> 3, 2, 3, 5, 0, 0, 0, 2, 2, 2, 5, 8, 6, …
## $ max_cloud_cover_10            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ avg_hourly_cloud_cover_10     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ avg_cloud_cover_10            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ min_cloud_cover_10            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…

Missing Data

The visualization provided by plot_missing() helps identify columns that may need attention.


The data columns are divided into three sections:

First

plot_missing(processed_data[,1:30],
             title   = "% of Missing Data (filtered to cols w/missing data)",
             ggtheme = theme_tq())

Second

plot_missing(processed_data[,31:60],
             title   = "% of Missing Data (filtered to cols w/missing data)",
             ggtheme = theme_tq())

Third

plot_missing(processed_data[,61:85],
             title   = "% of Missing Data (filtered to cols w/missing data)",
             ggtheme = theme_tq())

Summary

skim() will do the job:


Whole data

skim(processed_data)
Data summary
Name processed_data
Number of rows 5066
Number of columns 85
_______________________
Column type frequency:
character 4
Date 1
numeric 80
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
CD 0 1 4 30 0 34 0
pc3 0 1 2 3 0 35 0
sunrise 0 1 7 7 0 182 0
sunset 0 1 8 8 0 239 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
Episode 0 1 2020-03-01 2020-07-27 2020-05-14 149

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
all_day_bing_tiles_visited_relt 0 1.00 -0.13 0.19 -0.66 -0.27 -0.11 0.01 0.46 ▁▆▇▅▁
all_day_ratio_single_tile_users 0 1.00 0.29 0.08 0.14 0.24 0.28 0.35 0.55 ▂▇▅▂▁
cases 2133 0.58 13.21 27.45 1.00 1.00 4.00 11.00 286.00 ▇▁▁▁▁
maleperc 2133 0.58 0.46 0.32 0.00 0.25 0.47 0.67 1.00 ▆▅▇▃▅
delay_mean 2133 0.58 6.62 6.09 0.00 3.00 5.00 8.04 87.00 ▇▁▁▁▁
CC 2133 0.58 0.30 0.32 0.00 0.00 0.24 0.50 1.00 ▇▃▃▁▂
NoEpi 2133 0.58 0.26 0.31 0.00 0.00 0.17 0.39 1.00 ▇▃▂▁▂
NoInfo 2133 0.58 0.03 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
Unknown 2133 0.58 0.00 0.04 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
OB 2133 0.58 0.30 0.34 0.00 0.00 0.17 0.50 1.00 ▇▂▂▂▂
Travel 2133 0.58 0.10 0.25 0.00 0.00 0.00 0.02 1.00 ▇▁▁▁▁
age 2133 0.58 44.67 14.29 20.00 34.42 44.35 53.33 90.00 ▆▇▇▂▁
city 2133 0.58 17.99 10.33 1.00 9.00 17.00 28.00 34.00 ▆▆▅▅▇
max_temperature 3 1.00 16.87 10.11 -7.60 8.10 15.90 26.50 36.90 ▂▇▇▇▆
avg_hourly_temperature 218 0.96 11.87 9.30 -14.70 3.85 11.47 20.85 28.70 ▁▅▇▆▇
avg_temperature 3 1.00 11.20 9.46 -15.85 3.35 10.50 20.50 28.90 ▁▅▇▆▇
min_temperature 3 1.00 5.53 9.24 -29.70 -1.80 4.80 14.10 24.50 ▁▁▇▆▆
max_humidex 3253 0.36 33.11 4.28 25.00 30.00 33.00 36.00 44.00 ▃▇▇▅▁
min_windchill 3517 0.31 -8.30 5.01 -35.00 -10.00 -7.00 -5.00 0.00 ▁▁▂▇▇
max_relative_humidity 32 0.99 90.02 9.90 30.00 85.00 93.00 98.00 100.00 ▁▁▁▂▇
avg_hourly_relative_humidity 32 0.99 69.16 13.27 22.70 60.40 69.40 79.00 100.00 ▁▂▇▇▃
avg_relative_humidity 32 0.99 68.80 11.13 21.50 62.00 69.00 76.50 100.00 ▁▁▇▇▂
min_relative_humidity 32 0.99 47.59 15.60 11.00 36.00 46.00 57.00 100.00 ▂▇▇▃▁
max_dew_point 32 0.99 8.63 9.24 -21.70 0.72 8.10 17.60 25.30 ▁▂▇▆▇
avg_hourly_dew_point 32 0.99 5.38 9.69 -23.40 -2.60 4.40 14.97 23.00 ▁▃▇▅▇
avg_dew_point 32 0.99 5.26 9.64 -23.60 -2.80 4.20 14.80 23.00 ▁▃▇▅▇
min_dew_point 32 0.99 1.88 10.30 -31.20 -6.50 1.10 11.90 21.40 ▁▂▇▆▆
max_wind_speed 30 0.99 24.22 8.82 4.00 18.00 23.00 30.00 94.00 ▇▇▁▁▁
avg_hourly_wind_speed 30 0.99 13.42 5.82 2.12 9.17 12.46 16.58 43.17 ▆▇▂▁▁
avg_wind_speed 30 0.99 14.20 5.79 2.00 10.00 13.50 17.50 50.00 ▆▇▂▁▁
min_wind_speed 30 0.99 4.19 4.03 0.00 1.00 4.00 6.00 35.00 ▇▂▁▁▁
max_wind_gust 1224 0.76 42.60 11.26 27.00 34.00 40.00 48.00 160.00 ▇▂▁▁▁
wind_gust_dir_10s 1225 0.76 21.79 9.65 0.00 16.00 24.00 30.00 36.00 ▃▃▅▇▇
max_pressure_sea 151 0.97 101.86 0.71 99.62 101.42 101.79 102.21 104.59 ▁▅▇▂▁
avg_hourly_pressure_sea 151 0.97 101.53 0.74 99.07 101.10 101.54 101.92 104.30 ▁▃▇▂▁
avg_pressure_sea 151 0.97 101.53 0.72 99.29 101.09 101.53 101.89 104.24 ▁▅▇▂▁
min_pressure_sea 151 0.97 101.20 0.79 98.43 100.72 101.26 101.64 103.99 ▁▃▇▂▁
max_pressure_station 151 0.97 99.32 1.30 95.42 98.44 99.36 100.26 103.62 ▁▅▇▃▁
avg_hourly_pressure_station 151 0.97 99.01 1.31 94.84 98.15 99.05 99.95 103.35 ▁▅▇▃▁
avg_pressure_station 151 0.97 99.00 1.30 94.84 98.14 99.05 99.94 103.31 ▁▃▇▃▁
min_pressure_station 151 0.97 98.68 1.34 94.10 97.80 98.73 99.64 103.00 ▁▃▇▅▁
max_visibility 1194 0.76 22733.55 7484.95 12900.00 16100.00 24100.00 24100.00 80500.00 ▇▁▁▁▁
avg_hourly_visibility 1194 0.76 20331.38 5536.76 3825.00 16100.00 21230.40 24100.00 40200.00 ▁▇▇▂▁
avg_visibility 1194 0.76 18588.61 6447.86 6550.00 14450.00 16100.00 24100.00 52320.00 ▅▇▁▁▁
min_visibility 1194 0.76 14443.67 8811.92 0.00 4800.00 16100.00 24100.00 40200.00 ▅▂▇▁▁
max_health_index 1349 0.73 2.86 0.70 1.00 2.40 2.80 3.20 7.60 ▂▇▂▁▁
avg_hourly_health_index 1349 0.73 2.28 0.50 1.00 2.00 2.30 2.60 4.10 ▂▇▇▂▁
avg_health_index 1349 0.73 2.31 0.51 1.00 2.00 2.30 2.60 4.70 ▂▇▆▁▁
min_health_index 1349 0.73 1.76 0.47 1.00 1.40 1.80 2.10 3.40 ▆▇▇▁▁
heatdegdays 3 1.00 8.22 7.76 0.00 0.00 7.50 14.70 33.90 ▇▃▅▁▁
cooldegdays 3 1.00 1.42 2.43 0.00 0.00 0.00 2.50 10.90 ▇▁▁▁▁
growdegdays_5 3 1.00 7.66 7.63 0.00 0.00 5.50 15.50 23.90 ▇▂▂▃▁
growdegdays_7 3 1.00 6.40 6.92 0.00 0.00 3.50 13.50 21.90 ▇▂▂▃▁
growdegdays_10 3 1.00 4.76 5.73 0.00 0.00 0.50 10.50 18.90 ▇▁▂▂▁
precipitation 819 0.84 2.15 5.66 0.00 0.00 0.00 1.30 86.60 ▇▁▁▁▁
rain 2930 0.42 1.99 5.64 0.00 0.00 0.00 1.00 86.60 ▇▁▁▁▁
snow 2927 0.42 0.15 0.83 0.00 0.00 0.00 0.00 12.20 ▇▁▁▁▁
snow_on_ground 3766 0.26 9.46 19.22 0.00 0.00 0.00 8.00 96.00 ▇▁▁▁▁
daylight 0 1.00 14.17 1.38 11.00 13.07 14.73 15.30 16.35 ▃▃▃▇▆
sunrise_f 0 1.00 6.23 0.68 5.15 5.70 6.00 6.77 8.40 ▇▇▅▃▁
sunset_f 0 1.00 20.40 0.79 17.77 19.90 20.63 21.00 22.05 ▁▁▅▇▁
min_uv_forecast 2 1.00 5.96 2.63 1.00 4.00 6.00 8.00 12.00 ▆▇▅▇▂
max_uv_forecast 2 1.00 6.44 2.57 1.00 4.00 6.00 9.00 12.00 ▃▇▅▇▃
min_high_temperature_forecast 1 1.00 16.68 10.40 -12.00 8.00 16.00 27.00 36.00 ▁▇▇▆▇
max_high_temperature_forecast 1 1.00 17.02 10.34 -8.00 8.00 16.00 27.00 36.00 ▁▇▇▇▇
min_low_temperature_forecast 0 1.00 6.32 9.42 -24.00 -1.00 6.00 15.00 25.00 ▁▃▇▆▅
max_low_temperature_forecast 0 1.00 7.10 9.27 -21.00 -1.00 7.00 16.00 25.00 ▁▃▇▆▆
solar_radiation 4416 0.13 18914.72 7667.59 1331.00 13180.25 19808.00 25243.25 32399.00 ▂▅▆▇▆
max_cloud_cover_4 3506 0.31 3.45 1.15 0.00 4.00 4.00 4.00 4.00 ▁▁▁▁▇
avg_hourly_cloud_cover_4 3506 0.31 1.84 1.26 0.00 0.70 1.80 2.90 4.00 ▇▆▆▆▆
avg_cloud_cover_4 3506 0.31 1.87 0.78 0.00 2.00 2.00 2.00 4.00 ▁▂▇▁▁
min_cloud_cover_4 3506 0.31 0.30 0.89 0.00 0.00 0.00 0.00 4.00 ▇▁▁▁▁
max_cloud_cover_8 2533 0.50 7.32 1.44 1.00 7.00 8.00 8.00 8.00 ▁▁▁▁▇
avg_hourly_cloud_cover_8 2533 0.50 4.95 2.00 0.00 3.60 5.10 6.60 8.00 ▂▃▆▇▇
avg_cloud_cover_8 2533 0.50 4.48 1.47 0.50 4.00 4.00 5.00 8.00 ▁▂▇▂▂
min_cloud_cover_8 2533 0.50 1.64 2.17 0.00 0.00 1.00 2.00 8.00 ▇▂▁▁▁
max_cloud_cover_10 4768 0.06 8.10 3.27 0.00 7.25 10.00 10.00 10.00 ▂▁▁▁▇
avg_hourly_cloud_cover_10 4768 0.06 4.37 3.07 0.00 1.40 4.30 7.05 10.00 ▇▅▅▅▃
avg_cloud_cover_10 4768 0.06 4.38 2.05 0.00 3.62 5.00 5.00 10.00 ▂▁▇▁▁
min_cloud_cover_10 4768 0.06 0.65 1.92 0.00 0.00 0.00 0.00 10.00 ▇▁▁▁▁

A single column

skim(processed_data$all_day_bing_tiles_visited_relt)
Data summary
Name processed_data$all_day_bi…
Number of rows 5066
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 -0.13 0.19 -0.66 -0.27 -0.11 0.01 0.46 ▁▆▇▅▁

Tables

Let’s visualize the data in three sections.


First

datatable(processed_data[, 1:30], rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T) )

Second

datatable(processed_data[, 31:60], rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T) )

Third

datatable(processed_data[, 61:85], rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T) )

Temporal and Spatial summaries


Spatial

geo <- processed_data[,c(1,3,6:8,15,17,21)] %>% 
  dplyr::group_by(CD) %>%
  dplyr::summarise_all(mean, na.rm = TRUE)
datatable(geo, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T) )

Temporal

Case frequencies along with the percent of total.

processed_data %>% 
  janitor::tabyl(cases) %>% 
  arrange(desc(percent)) %>% 
  
  # formatting
  janitor::adorn_totals() %>% 
  janitor::adorn_pct_formatting() %>% 
  rmarkdown::paged_table()

Correlation

This section uses only Toronto but can be expanded to other regions. The case numbers are also lagged for 10 days.

A <- processed_data[,c(1,3,6:8,15,17,21)] %>% 
  dplyr::filter(CD == "Toronto", ) %>%
  dplyr::mutate(lagged_cases = lag(cases, n = 10L))

Table

B <- cor(A[,-1], use = "complete.obs")
B
##                                 all_day_bing_tiles_visited_relt       cases
## all_day_bing_tiles_visited_relt                       1.0000000 -0.79205061
## cases                                                -0.7920506  1.00000000
## maleperc                                              0.2399123 -0.07432073
## delay_mean                                           -0.4381753  0.60027151
## age                                                  -0.4012677  0.44226471
## max_temperature                                       0.4174737 -0.31406409
## max_humidex                                           0.3086334 -0.28368514
## lagged_cases                                         -0.7113276  0.87931153
##                                     maleperc delay_mean         age
## all_day_bing_tiles_visited_relt  0.239912329 -0.4381753 -0.40126765
## cases                           -0.074320734  0.6002715  0.44226471
## maleperc                         1.000000000 -0.1038246 -0.26590062
## delay_mean                      -0.103824616  1.0000000  0.39967474
## age                             -0.265900621  0.3996747  1.00000000
## max_temperature                 -0.022524454 -0.0961471  0.06058689
## max_humidex                     -0.013659505 -0.1148618 -0.03511465
## lagged_cases                    -0.007234725  0.5566002  0.35654930
##                                 max_temperature max_humidex lagged_cases
## all_day_bing_tiles_visited_relt      0.41747367  0.30863342 -0.711327634
## cases                               -0.31406409 -0.28368514  0.879311531
## maleperc                            -0.02252445 -0.01365951 -0.007234725
## delay_mean                          -0.09614710 -0.11486176  0.556600221
## age                                  0.06058689 -0.03511465  0.356549300
## max_temperature                      1.00000000  0.85082857 -0.342176427
## max_humidex                          0.85082857  1.00000000 -0.346667692
## lagged_cases                        -0.34217643 -0.34666769  1.000000000

Plot

library(GGally)
ggcorr(B)

Maps

An excellent source for learning making maps in R is provided by Nick Eubank and Claudia Engel in this link

Get the county level spatial data from ucdavis.edu:

can2<-getData('GADM', country="CAN", level=2) # counties
head(can2@data, 10)
##    GID_0 NAME_0   GID_1  NAME_1 NL_NAME_1      GID_2          NAME_2 VARNAME_2
## 3    CAN Canada CAN.1_1 Alberta      <NA> CAN.1.11_1  Division No. 1      <NA>
## 1    CAN Canada CAN.1_1 Alberta      <NA>  CAN.1.1_1 Division No. 10      <NA>
## 12   CAN Canada CAN.1_1 Alberta      <NA>  CAN.1.2_1 Division No. 11      <NA>
## 13   CAN Canada CAN.1_1 Alberta      <NA>  CAN.1.3_1 Division No. 12      <NA>
## 14   CAN Canada CAN.1_1 Alberta      <NA>  CAN.1.4_1 Division No. 13      <NA>
## 15   CAN Canada CAN.1_1 Alberta      <NA>  CAN.1.5_1 Division No. 14      <NA>
## 16   CAN Canada CAN.1_1 Alberta      <NA>  CAN.1.6_1 Division No. 15      <NA>
## 17   CAN Canada CAN.1_1 Alberta      <NA>  CAN.1.7_1 Division No. 16      <NA>
## 18   CAN Canada CAN.1_1 Alberta      <NA>  CAN.1.8_1 Division No. 17      <NA>
## 19   CAN Canada CAN.1_1 Alberta      <NA>  CAN.1.9_1 Division No. 18      <NA>
##    NL_NAME_2          TYPE_2       ENGTYPE_2 CC_2   HASC_2
## 3       <NA> Census Division Census Division   01 CA.AB.ON
## 1       <NA> Census Division Census Division   10 CA.AB.TE
## 12      <NA> Census Division Census Division   11 CA.AB.EL
## 13      <NA> Census Division Census Division   12 CA.AB.TV
## 14      <NA> Census Division Census Division   13 CA.AB.TN
## 15      <NA> Census Division Census Division   14 CA.AB.FT
## 16      <NA> Census Division Census Division   15 CA.AB.FN
## 17      <NA> Census Division Census Division   16 CA.AB.ST
## 18      <NA> Census Division Census Division   17 CA.AB.SN
## 19      <NA> Census Division Census Division   18 CA.AB.ET
class(can2)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
county_on <- can2 %>% 
  filter(NAME_1 == "Ontario")

geo2 <- processed_data %>% 
  group_by(CD) %>%
  summarize(maxcases = max(cases, na.rm = TRUE),
            meanmob = mean(all_day_bing_tiles_visited_relt))
  
a <- unique(county_on$NAME_2)
b <- unique(geo2$CD)

#is.element(b, a)

w = dplyr::left_join(county_on, geo2, by = c("NAME_2" = "CD"))
#class(w)
#nrow(w)

Ontario Maximum Covid-19 Cases

my.palette <- brewer.pal(n = 12, name = "Accent")
spplot(w["maxcases"], main = "Ontario COVID19", 
       sub = "Maximum cases observed any day", 
       col = "transparent",
       col.regions = my.palette,
       cuts = 12)

Ontario Average Mobility

my.palette <- brewer.pal(n = 12, name = "Accent")
spplot(w["meanmob"], main = "Ontario Mobility", 
       sub = "Average daily mobility", 
       col = "transparent",
       col.regions = my.palette,
       cuts = 12)

Ontario Maximum Covid-19 Cases - Zoomed

# Change these parameters
scale.parameter = 0.45  # scaling parameter. less than 1 is zooming in, more than 1 zooming out. 
xshift = 6  # Shift to right in map units. 
yshift = -6  # Shift to left in map units. 
original.bbox = w@bbox  # Pass bbox of your Spatial* Object. 

# Just copy-paste the following
edges = original.bbox

edges[1, ] <- (edges[1, ] - mean(edges[1, ])) * scale.parameter + mean(edges[1, 
]) + xshift
edges[2, ] <- (edges[2, ] - mean(edges[2, ])) * scale.parameter + mean(edges[2, 
]) + yshift

# In `spplot`, set xlim to edges[1,] and ylim to edges[2,]
spplot(w, "maxcases", main = "Ontario COVID19",
       sub = "Maximum cases observed any day", 
       col = "transparent",
       xlim = edges[1, ], ylim = edges[2, ])