This report provides annotation layer to the computational script ./reports/technique-demonstration/technique-demonstration.R
For the full technical report of this script, see its stitched output ./reports/technique-demonstration/stitched_output/technique-demonstration.html
# Attach these packages so their functions don't need to be qualified: http://r-pkgs.had.co.nz/namespace.html#search-path
library(ggplot2) #For graphing
library(magrittr) # Pipes
library(dplyr)
requireNamespace("dplyr", quietly=TRUE)
requireNamespace("TabularManifest") # devtools::install_github("Melinae/TabularManifest")
requireNamespace("knitr")
requireNamespace("scales") #For formating values in graphs
requireNamespace("RColorBrewer")
# link to the source of the location mapping
path_input_meta <- "./data-unshared/derived/0-ls_guide.rds"
path_input_micro <- "./data-unshared/derived/1-greeted.rds"
# path_input_micro <- "./data-unshared/derived/simulated_subsample.rds"
# test whether the file exists / the link is good
testit::assert("File does not exist", base::file.exists(path_input_meta))
testit::assert("File does not exist", base::file.exists(path_input_micro))
# declare where you will store the product of this script
# path_save <- "./data-unshared/derived/object.rds"
# See definitions of commonly used objects in:
source("./manipulation/object-glossary.R") # object definitions
Observations: 4,346,649
Variables: 35
$ person_id [3m[38;5;246m<int>[39m[23m 1, 2, 3, 4, 5, 6, ...
$ ABDERR [3m[38;5;246m<fct>[39m[23m Non-Aboriginal Ide...
$ ABIDENT [3m[38;5;246m<fct>[39m[23m Non-Aboriginal ide...
$ ADIFCLTY [3m[38;5;246m<fct>[39m[23m No, No, No, No, No...
$ CITSM [3m[38;5;246m<fct>[39m[23m Not a Canadian cit...
$ COWD [3m[38;5;246m<fct>[39m[23m "Paid Worker - Wor...
$ DISABFL [3m[38;5;246m<fct>[39m[23m "No", "No", "Yes, ...
$ DISABIL [3m[38;5;246m<fct>[39m[23m No difficulty with...
$ DVISMIN [3m[38;5;246m<fct>[39m[23m Not a visible mino...
$ FOL [3m[38;5;246m<fct>[39m[23m English only, Engl...
$ FPTIM [3m[38;5;246m<fct>[39m[23m NA, NA, NA, NA, NA...
$ GENSTPOB [3m[38;5;246m<fct>[39m[23m 1st generation - R...
$ HCDD [3m[38;5;246m<fct>[39m[23m "Bachelors degree"...
$ IMMDER [3m[38;5;246m<fct>[39m[23m Immigrants, Immigr...
$ LOINCA [3m[38;5;246m<fct>[39m[23m non-low income, no...
$ LOINCB [3m[38;5;246m<fct>[39m[23m non-low income, no...
$ MARST [3m[38;5;246m<fct>[39m[23m Legally married (a...
$ NOCSBRD [3m[38;5;246m<fct>[39m[23m "D Health occupati...
$ OLN [3m[38;5;246m<fct>[39m[23m Both English and F...
$ POBDER [3m[38;5;246m<fct>[39m[23m Born outside Cana...
$ SEX [3m[38;5;246m<fct>[39m[23m Female, Female, Fe...
$ TRMODE [3m[38;5;246m<fct>[39m[23m "Car, truck, van a...
$ RPAIR [3m[38;5;246m<fct>[39m[23m "Yes, major repair...
$ PR [3m[38;5;246m<fct>[39m[23m Ontario, Manitoba,...
$ RUINDFG [3m[38;5;246m<fct>[39m[23m Rural, Rural, Urba...
$ d_licoratio_da_bef [3m[38;5;246m<fct>[39m[23m 5th decile, 3rd ...
$ S_DEAD [3m[38;5;246m<fct>[39m[23m Not dead, Not dead...
$ EFCNT_PP_R [3m[38;5;246m<fct>[39m[23m 4 family members, ...
$ AGE_IMM_R_group [3m[38;5;246m<fct>[39m[23m 35 to < 40, 25 to ...
$ COD1 [3m[38;5;246m<fct>[39m[23m Did not die, Did n...
$ COD2 [3m[38;5;246m<fct>[39m[23m Did not die, Did n...
$ DPOB11N [3m[38;5;246m<fct>[39m[23m NA, NA, NA, NA, NA...
$ KID_group [3m[38;5;246m<fct>[39m[23m one or two childre...
$ YRIM_group [3m[38;5;246m<fct>[39m[23m 2002 or later, 200...
$ age_group [3m[38;5;246m<fct>[39m[23m 40 to 44, 30 to 34...
# create a function to subset a dataset in this context
# because data is heavy and makes development cumbersome
get_a_subsample <- function(d, sample_size, seed = 42){
# sample_size <- 20000
v_sample_ids <- sample(unique(d$person_id), sample_size)
d1 <- d %>%
dplyr::filter(person_id %in% v_sample_ids)
return(d1)
}
# how to use
# ds1 <- ds0 %>% get_a_subsample(10000)
where_to_store_graphs <- "./reports/technique-demonstration/prints/1/" # female marital educ3 poor_health
# where_to_store_graphs = "./reports/technique-demonstration/prints/2/" # marital educ3 poor_health first
# where_to_store_graphs = "./reports/technique-demonstration/prints/3/", # other collection of predictors
# define a function to print a graph onto disk as an image
# because some aspects of appearances are easier to control during printing, not graphing
quick_save <- function(g,name){
ggplot2::ggsave(
filename = paste0(name,".png"),
plot = g,
device = png,
path = where_to_store_graphs, # female marital educ poor_healt
width = 1600,
height = 1200,
# units = "cm",
dpi = 200,
limitsize = FALSE
)
}
# declare the dependent variable and define descriptive labels
dv_name <- "S_DEAD"
dv_label_prob <- "Alive in X years"
dv_label_odds <- "Odds(Dead)"
# select the predictors to evaluate graphically
# becasue we typically will have more predictors then we want to display
# these will define rows in the printed matrix of graphs
covar_order_values <- c("female","marital","educ3","poor_health") #for /prints/1/
# covar_order_values <- c("marital", "educ3","poor_health", "FOL") # for /prints/2/
library(dplyr)
# because we need to examine observed values of each predictor
ds0 %>% group_by(PR) %>% summarize(n = n())
# A tibble: 13 x 2
PR n
<fct> <int>
1 Newfoundland and Labrador 74281
2 Prince Edward Island 18372
3 Nova Scotia 128755
4 New Brunswick 105097
5 Quebec 1047824
6 Ontario 1604077
7 Manitoba 176994
8 Saskatchewan 150432
9 Alberta 444249
10 British Columbia 557466
11 Yukon 9239
12 Northwest Territories 15454
13 Nunavut 14409
# A tibble: 2 x 2
SEX n
<fct> <int>
1 Female 2242681
2 Male 2103968
# A tibble: 5 x 2
MARST n
<fct> <int>
1 Divorced 376637
2 Legally married (and not separated) 2304941
3 Separated, but still legally married 140009
4 Never legally married (single) 1264326
5 Widowed 260736
# A tibble: 13 x 2
HCDD n
<fct> <int>
1 None 9.02e5
2 High school graduation certificate or equivalency certificate 1.07e6
3 Other trades certificate or diploma 3.33e5
4 Registered apprenticeship certificate 1.84e5
5 College, CEGEP or other non-university certificate or diploma from a program of 3 months ~ 1.10e5
6 College, CEGEP or other non-university certificate or diploma from a program of 1 year to~ 3.90e5
7 College, CEGEP or other non-university certificate or diploma from a program of more than~ 3.16e5
8 University certificate or diploma below bachelor level 2.02e5
9 Bachelors degree 5.37e5
10 University certificate or diploma above bachelor level 8.94e4
11 Degree in medicine, dentistry, veterinary medicine or optometry 2.39e4
12 Masters degree 1.57e5
13 Earned doctorate degree 3.15e4
# A tibble: 4 x 2
ADIFCLTY n
<fct> <int>
1 No 3610703
2 Not stated 54059
3 Yes, often 282002
4 Yes, sometimes 399885
# A tibble: 4 x 2
DISABFL n
<fct> <int>
1 No 3359763
2 Not stated 45969
3 Yes, often 423068
4 Yes, sometimes 517849
# A tibble: 15 x 2
age_group n
<fct> <int>
1 19 to 24 370193
2 25 to 29 357121
3 30 to 34 370583
4 35 to 39 406491
5 40 to 44 479630
6 45 to 49 485782
7 50 to 54 437395
8 55 to 59 388211
9 60 to 64 295752
10 65 to 69 227771
11 70 to 74 192595
12 75 to 79 156532
13 80 to 84 106865
14 85 to 89 51146
15 90 and older 20582
# A tibble: 52 x 3
# Groups: FOL [4]
FOL PR n
<fct> <fct> <int>
1 English only Newfoundland and Labrador 73794
2 English only Prince Edward Island 17539
3 English only Nova Scotia 123550
4 English only New Brunswick 69961
5 English only Quebec 124818
6 English only Ontario 1486333
7 English only Manitoba 168102
8 English only Saskatchewan 147337
9 English only Alberta 429295
10 English only British Columbia 531644
# ... with 42 more rows
# as evident from above
# some variables are too granular for general analysis
# they need to be re-scaled in order to be more useful
ds1 <- ds0 %>%
dplyr::mutate(
# because `female` is less ambiguous then `sex`
female = car::recode(
SEX, "
'Female' = 'TRUE'
;'Male' = 'FALSE'
")
,female = factor(female, levels = c("FALSE","TRUE"))
# because `still legaly married` is more legal than human
,marital = car::recode(
MARST, "
'Divorced' = 'sep_divorced'
;'Legally married (and not separated)' = 'mar_cohab'
;'Separated, but still legally married' = 'sep_divorced'
;'Never legally married (single)' = 'single'
;'Widowed' = 'widowed'
")
,marital = factor(marital, levels = c(
"sep_divorced","widowed","single","mar_cohab"))
# because more than 5 categories is too fragmented
,educ5 = car::recode(
HCDD, "
'None' = 'less then high school'
;'High school graduation certificate or equivalency certificate' = 'high school'
;'Other trades certificate or diploma' = 'high school'
;'Registered apprenticeship certificate' = 'high school'
;'College, CEGEP or other non-university certificate or diploma from a program of 3 months to less than 1 year' = 'college'
;'College, CEGEP or other non-university certificate or diploma from a program of 1 year to 2 years' = 'college'
;'College, CEGEP or other non-university certificate or diploma from a program of more than 2 years' = 'college'
;'University certificate or diploma below bachelor level' = 'college'
;'Bachelors degree' = 'college'
;'University certificate or diploma above bachelor level' = 'graduate'
;'Degree in medicine, dentistry, veterinary medicine or optometry' = 'graduate'
;'Masters degree' = 'graduate'
;'Earned doctorate degree' = 'Dr.'
")
,educ5 = factor(educ5, levels = c(
"less then high school"
,"high school"
,"college"
,"graduate"
,"Dr."
)
)
# because even only 5 may be too granular for our purposes
,educ3 = car::recode(
HCDD, "
'None' = 'less than high school'
;'High school graduation certificate or equivalency certificate' = 'high school'
;'Other trades certificate or diploma' = 'high school'
;'Registered apprenticeship certificate' = 'more than high school'
;'College, CEGEP or other non-university certificate or diploma from a program of 3 months to less than 1 year' = 'more than high school'
;'College, CEGEP or other non-university certificate or diploma from a program of 1 year to 2 years' = 'more than high school'
;'College, CEGEP or other non-university certificate or diploma from a program of more than 2 years' = 'more than high school'
;'University certificate or diploma below bachelor level' = 'more than high school'
;'Bachelors degree' = 'more than high school'
;'University certificate or diploma above bachelor level' = 'more than high school'
;'Degree in medicine, dentistry, veterinary medicine or optometry' = 'more than high school'
;'Masters degree' = 'more than high school'
;'Earned doctorate degree' = 'more than high school'
")
,educ3 = factor(educ3, levels = c(
"less than high school"
, "high school"
, "more than high school"
)
)
# ADIFCLTY "Problems with ADL" (physical & cognitive)
# DISABFL "Problems with ADL" (physical & social)
# because this is what counts practically
,poor_health = ifelse(ADIFCLTY %in% c("Yes, often","Yes, sometimes")
&
DISABFL %in% c("Yes, often","Yes, sometimes"),
TRUE, FALSE
)
,poor_health = factor(poor_health, levels = c("TRUE","FALSE"))
# because interval floor is easer to display on the graph then `19 to 24`
,age_group_low = car::recode(
age_group,
"
'19 to 24' = '19'
;'25 to 29' = '25'
;'30 to 34' = '30'
;'35 to 39' = '35'
;'40 to 44' = '40'
;'45 to 49' = '45'
;'50 to 54' = '50'
;'55 to 59' = '55'
;'60 to 64' = '60'
;'65 to 69' = '65'
;'70 to 74' = '70'
;'75 to 79' = '75'
;'80 to 84' = '80'
;'85 to 89' = '85'
;'90 and older' = '90'
"
)
) %>%
# because easier to reference, expressed as interval's floor
dplyr::mutate(
age_group = age_group_low
) %>%
# because it needs to be sorted from lowest to highest ability
dplyr::mutate(
FOL = factor(FOL,levels = c(
"Neither English nor French"
,"French only"
,"English only"
,"Both English and French"
)
)
,OLN = factor(FOL,levels = c(
"Neither English nor French"
,"French only"
,"English only"
,"Both English and French"
)
)
)
ds1 %>% glimpse(50)
Observations: 4,346,649
Variables: 41
$ person_id <int> 1, 2, 3, 4, 5, 6, ...
$ ABDERR <fct> Non-Aboriginal Ide...
$ ABIDENT <fct> Non-Aboriginal ide...
$ ADIFCLTY <fct> No, No, No, No, No...
$ CITSM <fct> Not a Canadian cit...
$ COWD <fct> "Paid Worker - Wor...
$ DISABFL <fct> "No", "No", "Yes, ...
$ DISABIL <fct> No difficulty with...
$ DVISMIN <fct> Not a visible mino...
$ FOL <fct> English only, Engl...
$ FPTIM <fct> NA, NA, NA, NA, NA...
$ GENSTPOB <fct> 1st generation - R...
$ HCDD <fct> "Bachelors degree"...
$ IMMDER <fct> Immigrants, Immigr...
$ LOINCA <fct> non-low income, no...
$ LOINCB <fct> non-low income, no...
$ MARST <fct> Legally married (a...
$ NOCSBRD <fct> "D Health occupati...
$ OLN <fct> English only, Engl...
$ POBDER <fct> Born outside Cana...
$ SEX <fct> Female, Female, Fe...
$ TRMODE <fct> "Car, truck, van a...
$ RPAIR <fct> "Yes, major repair...
$ PR <fct> Ontario, Manitoba,...
$ RUINDFG <fct> Rural, Rural, Urba...
$ d_licoratio_da_bef <fct> 5th decile, 3rd ...
$ S_DEAD <fct> Not dead, Not dead...
$ EFCNT_PP_R <fct> 4 family members, ...
$ AGE_IMM_R_group <fct> 35 to < 40, 25 to ...
$ COD1 <fct> Did not die, Did n...
$ COD2 <fct> Did not die, Did n...
$ DPOB11N <fct> NA, NA, NA, NA, NA...
$ KID_group <fct> one or two childre...
$ YRIM_group <fct> 2002 or later, 200...
$ age_group <fct> 40, 30, 65, 19, 55...
$ female <fct> TRUE, TRUE, TRUE, ...
$ marital <fct> mar_cohab, mar_coh...
$ educ5 <fct> college, college, ...
$ educ3 <fct> more than high sch...
$ poor_health <fct> FALSE, FALSE, FALS...
$ age_group_low <fct> 40, 30, 65, 19, 55...
# # because we want/need to inspect newly created variables
ds1 %>% group_by(educ3) %>% summarize(n = n())
# A tibble: 3 x 2
educ3 n
<fct> <int>
1 less than high school 902326
2 high school 1403807
3 more than high school 2040516
# A tibble: 5 x 2
educ5 n
<fct> <int>
1 less then high school 902326
2 high school 1587347
3 college 1555485
4 graduate 269945
5 Dr. 31546
# A tibble: 4 x 2
FOL n
<fct> <int>
1 Neither English nor French 64633
2 French only 1032652
3 English only 3209323
4 Both English and French 40041
$levels
1 2 3
"Immigrants" "Non-permanent residents" "Non-immigrants"
$label
[1] "Immigration status"
$description
[1] "Immigration status: Indicates whether the respondent is a non-immigrant, an immigrant or a non-permanent resident."
$levels
1
"1st generation - Respondent born outside Canada"
2
"2nd generation - Respondent born in Canada of at least one foreign-born parent"
3
"3rd generation - Respondent born in Canada and both parents born in Canada"
$label
[1] "Generation in Canada"
$description
[1] "Generation status: Refers to the generational status of the respondent, that is, 1st generation, 2nd generation or 3rd generation or more.Generation status is derived from place of birth of respondent, place of birth of father and place of birth of mother."
# define the scope of the exploration
selected_provinces <- c("Alberta","British Columbia", "Ontario", "Quebec")
sample_size = 10000
# because we want to focus on a meaningful sample: first-generation immigrants
ds2 <- ds1 %>%
dplyr::filter(PR %in% selected_provinces) %>%
dplyr::filter(IMMDER == "Immigrants") %>%
dplyr::filter(GENSTPOB == "1st generation - Respondent born outside Canada") #%>%
# get_a_subsample(sample_size) # use when need get representative sample across provinces
#create samples of the same size from each province
dmls <- list() # dummy list (dmls) to populate during the loop
for(province_i in selected_provinces){
# province_i = "British Columbia" # for example
dmls[[province_i]] <- ds2 %>%
dplyr::filter(PR == province_i) %>%
get_a_subsample(sample_size) # see `define-utility-functions` chunk
}
lapply(dmls, names) # view the contents of the list object
$Alberta
[1] "person_id" "ABDERR" "ABIDENT" "ADIFCLTY"
[5] "CITSM" "COWD" "DISABFL" "DISABIL"
[9] "DVISMIN" "FOL" "FPTIM" "GENSTPOB"
[13] "HCDD" "IMMDER" "LOINCA" "LOINCB"
[17] "MARST" "NOCSBRD" "OLN" "POBDER"
[21] "SEX" "TRMODE" "RPAIR" "PR"
[25] "RUINDFG" "d_licoratio_da_bef" "S_DEAD" "EFCNT_PP_R"
[29] "AGE_IMM_R_group" "COD1" "COD2" "DPOB11N"
[33] "KID_group" "YRIM_group" "age_group" "female"
[37] "marital" "educ5" "educ3" "poor_health"
[41] "age_group_low"
$`British Columbia`
[1] "person_id" "ABDERR" "ABIDENT" "ADIFCLTY"
[5] "CITSM" "COWD" "DISABFL" "DISABIL"
[9] "DVISMIN" "FOL" "FPTIM" "GENSTPOB"
[13] "HCDD" "IMMDER" "LOINCA" "LOINCB"
[17] "MARST" "NOCSBRD" "OLN" "POBDER"
[21] "SEX" "TRMODE" "RPAIR" "PR"
[25] "RUINDFG" "d_licoratio_da_bef" "S_DEAD" "EFCNT_PP_R"
[29] "AGE_IMM_R_group" "COD1" "COD2" "DPOB11N"
[33] "KID_group" "YRIM_group" "age_group" "female"
[37] "marital" "educ5" "educ3" "poor_health"
[41] "age_group_low"
$Ontario
[1] "person_id" "ABDERR" "ABIDENT" "ADIFCLTY"
[5] "CITSM" "COWD" "DISABFL" "DISABIL"
[9] "DVISMIN" "FOL" "FPTIM" "GENSTPOB"
[13] "HCDD" "IMMDER" "LOINCA" "LOINCB"
[17] "MARST" "NOCSBRD" "OLN" "POBDER"
[21] "SEX" "TRMODE" "RPAIR" "PR"
[25] "RUINDFG" "d_licoratio_da_bef" "S_DEAD" "EFCNT_PP_R"
[29] "AGE_IMM_R_group" "COD1" "COD2" "DPOB11N"
[33] "KID_group" "YRIM_group" "age_group" "female"
[37] "marital" "educ5" "educ3" "poor_health"
[41] "age_group_low"
$Quebec
[1] "person_id" "ABDERR" "ABIDENT" "ADIFCLTY"
[5] "CITSM" "COWD" "DISABFL" "DISABIL"
[9] "DVISMIN" "FOL" "FPTIM" "GENSTPOB"
[13] "HCDD" "IMMDER" "LOINCA" "LOINCB"
[17] "MARST" "NOCSBRD" "OLN" "POBDER"
[21] "SEX" "TRMODE" "RPAIR" "PR"
[25] "RUINDFG" "d_licoratio_da_bef" "S_DEAD" "EFCNT_PP_R"
[29] "AGE_IMM_R_group" "COD1" "COD2" "DPOB11N"
[33] "KID_group" "YRIM_group" "age_group" "female"
[37] "marital" "educ5" "educ3" "poor_health"
[41] "age_group_low"
# OVERWRITE, making it a stratified sample across selected provinces (same size in each)
ds2 <- plyr::ldply(dmls,data.frame,.id = "PR")
# inspect the created data set
ds2 %>% dplyr::glimpse(50)
Observations: 40,000
Variables: 41
$ person_id <int> 1924, 1958, 2224, ...
$ ABDERR <fct> Non-Aboriginal Ide...
$ ABIDENT <fct> Non-Aboriginal ide...
$ ADIFCLTY <fct> "No", "Not stated"...
$ CITSM <fct> Canadian citizen b...
$ COWD <fct> "Paid Worker - Wor...
$ DISABFL <fct> "No", "Not stated"...
$ DISABIL <fct> No difficulty with...
$ DVISMIN <fct> Not a visible mino...
$ FOL <fct> English only, Engl...
$ FPTIM <fct> NA, NA, NA, NA, NA...
$ GENSTPOB <fct> 1st generation - R...
$ HCDD <fct> "University certif...
$ IMMDER <fct> Immigrants, Immigr...
$ LOINCA <fct> non-low income, no...
$ LOINCB <fct> non-low income, no...
$ MARST <fct> Never legally marr...
$ NOCSBRD <fct> "D Health occupati...
$ OLN <fct> English only, Engl...
$ POBDER <fct> Born outside Cana...
$ SEX <fct> Male, Female, Fema...
$ TRMODE <fct> "Car, truck, van a...
$ RPAIR <fct> "No, only regular ...
$ PR <fct> Alberta, Alberta, ...
$ RUINDFG <fct> Urban, Rural, Rura...
$ d_licoratio_da_bef <fct> 9th decile, 3rd ...
$ S_DEAD <fct> Not dead, Not dead...
$ EFCNT_PP_R <fct> 1 person, 2 family...
$ AGE_IMM_R_group <fct> 25 to < 30, 20 to ...
$ COD1 <fct> Did not die, Did n...
$ COD2 <fct> Did not die, Did n...
$ DPOB11N <fct> NA, NA, NA, NA, NA...
$ KID_group <fct> no children, no ch...
$ YRIM_group <fct> between 1996 and 1...
$ age_group <fct> 35, 50, 40, 45, 65...
$ female <fct> FALSE, TRUE, TRUE,...
$ marital <fct> single, mar_cohab,...
$ educ5 <fct> college, college, ...
$ educ3 <fct> more than high sch...
$ poor_health <fct> FALSE, FALSE, FALS...
$ age_group_low <fct> 35, 50, 40, 45, 65...
# A tibble: 4 x 2
PR n_people
<fct> <int>
1 Quebec 10000
2 Ontario 10000
3 Alberta 10000
4 British Columbia 10000
# for the sake of modularization, create a data set that will passed to modeling
ds_for_modeling <- ds2 %>%
# reorganize the order of variables to match your modeling preference
dplyr::select_(
"person_id", "PR", "S_DEAD"
,"age_group"
,"female", "marital", "educ3","poor_health", "FOL","OLN"
) %>%
# add transformation as needed
# na.omit() %>%
dplyr::rename_(
"dv" = dv_name # to ease serialization and string handling
)
# define the model equation
equation_string <- paste0(
"dv ~ -1 + PR + age_group + female + marital + educ3 + poor_health + FOL"
)
equation_formula <- as.formula(equation_string)
print(equation_formula, showEnv = FALSE)
dv ~ -1 + PR + age_group + female + marital + educ3 + poor_health +
FOL
# pass model specification to the estimation routine
model_solution <- glm(
equation_formula,
data = ds_for_modeling,
family = binomial(link="logit")
)
# inspect model results
equation_formula # because we must see equation!
dv ~ -1 + PR + age_group + female + marital + educ3 + poor_health +
FOL
Call:
glm(formula = equation_formula, family = binomial(link = "logit"),
data = ds_for_modeling)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.5971 0.0877 0.1606 0.3497 1.8711
Coefficients:
Estimate Std. Error z value Pr(>|z|)
PRQuebec 4.23372 0.43179 9.805 < 2e-16 ***
PROntario 4.45294 0.42980 10.361 < 2e-16 ***
PRAlberta 4.44406 0.43062 10.320 < 2e-16 ***
PRBritish Columbia 4.51151 0.43040 10.482 < 2e-16 ***
age_group25 -0.28567 0.54128 -0.528 0.597656
age_group30 -0.60315 0.50177 -1.202 0.229341
age_group35 -0.68898 0.47764 -1.442 0.149167
age_group40 -1.42822 0.43896 -3.254 0.001139 **
age_group45 -1.95440 0.42863 -4.560 5.12e-06 ***
age_group50 -2.36172 0.42378 -5.573 2.50e-08 ***
age_group55 -3.06274 0.41877 -7.314 2.60e-13 ***
age_group60 -3.65621 0.41735 -8.761 < 2e-16 ***
age_group65 -3.79381 0.41807 -9.075 < 2e-16 ***
age_group70 -3.98076 0.41803 -9.523 < 2e-16 ***
age_group75 -4.24202 0.41853 -10.136 < 2e-16 ***
age_group80 -4.50362 0.41964 -10.732 < 2e-16 ***
age_group85 -4.83316 0.42402 -11.398 < 2e-16 ***
age_group90 -5.09983 0.43801 -11.643 < 2e-16 ***
femaleTRUE 0.72169 0.04725 15.274 < 2e-16 ***
maritalwidowed -0.67246 0.08400 -8.006 1.19e-15 ***
maritalsingle -0.03629 0.11287 -0.321 0.747851
maritalmar_cohab 0.17580 0.07195 2.443 0.014551 *
educ3high school 0.19417 0.05682 3.417 0.000633 ***
educ3more than high school 0.51406 0.05438 9.453 < 2e-16 ***
poor_healthFALSE 1.12071 0.04533 24.722 < 2e-16 ***
FOLFrench only 0.13392 0.11287 1.186 0.235431
FOLEnglish only -0.23152 0.08086 -2.863 0.004192 **
FOLBoth English and French 0.12947 0.15588 0.831 0.406193
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 55452 on 40000 degrees of freedom
Residual deviance: 14903 on 39972 degrees of freedom
AIC: 14959
Number of Fisher Scoring iterations: 8
PRQuebec PROntario PRAlberta
4.23 4.45 4.44
PRBritish Columbia age_group25 age_group30
4.51 -0.29 -0.60
age_group35 age_group40 age_group45
-0.69 -1.43 -1.95
age_group50 age_group55 age_group60
-2.36 -3.06 -3.66
age_group65 age_group70 age_group75
-3.79 -3.98 -4.24
age_group80 age_group85 age_group90
-4.50 -4.83 -5.10
femaleTRUE maritalwidowed maritalsingle
0.72 -0.67 -0.04
maritalmar_cohab educ3high school educ3more than high school
0.18 0.19 0.51
poor_healthFALSE FOLFrench only FOLEnglish only
1.12 0.13 -0.23
FOLBoth English and French
0.13
# distill all possible combinations of predictors
# because we will create predictions for them
# using the coefficients from the model solution
ds_predicted <- ds_for_modeling %>%
dplyr::select_(
"PR"
,"age_group"
,"female"
,"educ3"
# ,"educ5"
,"marital"
,"poor_health"
,"FOL"
# ,"ONL"
) %>%
dplyr::distinct()
# compute predicted values of the criterion
# by applying model solution to all possible levels of predictors
#logged-odds of probability (ie, linear)
ds_predicted$dv_hat <- as.numeric(predict(model_solution, newdata=ds_predicted))
#probability (ie, s-curve), because we want to visualize probability
ds_predicted$dv_hat_p <- plogis(ds_predicted$dv_hat)
# save a modeling object to plat later
ls_model <- list(
"call" = equation_string
,"summary" = model_solution %>% summary()
,"coefficients" = model_solution %>% stats::coefficients()
,"predicted_values" = ds_predicted
)
# saveRDS(ls_model, "./data-public/derived/technique-demonstration/ls_model.rds")
# the script can be continutued in
# `./reports/technique-demonstrations/graphing-phase-demo.R`
# without relying on the raw data
# create a function that would assign color
# to the values of predictors based on informed expectation
assign_color <- function(color_group){
if( color_group == "female") {
palette_color <- c(
"TRUE" = reference_color
,"FALSE" = increased_risk_1
)
} else if( color_group %in% c("educ5") ) {
palette_color <- c(
"less than high school" = increased_risk_2
,"high school" = increased_risk_1
, "college" = reference_color
, "graduate" = descreased_risk_1
, "Dr." = descreased_risk_2
)
} else if( color_group %in% c("educ3") ) {
palette_color <- c(
"less than high school" = increased_risk_1
,"high school" = reference_color
,"more than high school" = descreased_risk_1
)
} else if( color_group %in% c("marital") ) {
palette_color <- c(
"mar_cohab" = descreased_risk_1
,"sep_divorced" = increased_risk_2
,"single" = reference_color
,"widowed" = increased_risk_1
)
} else if( color_group %in% c("poor_health") ) {
palette_color <- c(
"FALSE" = reference_color
,"TRUE" = increased_risk_2
)
} else if( color_group %in% c("FOL") ) {
palette_color <- c(
"Both English and French" = descreased_risk_1
,"English only" = reference_color
,"French only" = increased_risk_1
,"Neither English nor French" = increased_risk_2
)
} else if( color_group %in% c("OLN") ) {
palette_color <- c(
"Both English and French" = descreased_risk_2
,"English only" = reference_color
,"French only" = increased_risk_1
,"Neither English nor French" = increased_risk_2
)
} else {
stop("The palette for this variable is not defined.")
}
}
# shared graphing setting
common_alpha <- .7 # shared transparency
common_natural <- "grey90" # the "no-color" color
y_low = .2 # to remove white space
y_high = 1 # to remove white space
# load the custom graphing function, isolated in this script
base::source("./scripts/graphing/graph-logistic.R")
# color definitions are picked from
# http://colorbrewer2.org/#type=qualitative&scheme=Set1&n=7
list.files(where_to_store_graphs, full.names = TRUE)
[1] "./reports/technique-demonstration/prints/1/g0.png"
[2] "./reports/technique-demonstration/prints/1/g1.png"
[3] "./reports/technique-demonstration/prints/1/g2.png"
[4] "./reports/technique-demonstration/prints/1/g3.png"
[5] "./reports/technique-demonstration/prints/1/g4.png"
[6] "./reports/technique-demonstration/prints/1/g5.png"
[7] "./reports/technique-demonstration/prints/1/g6.png"
# 0 step : All colors are in
increased_risk_2 <- "#e41a1c" # red - further increased risk factor
increased_risk_1 <- "#ff7f00" # organge - increased risk factor
reference_color <- "#4daf4a" # green - REFERENCE category
descreased_risk_1 <-"#377eb8" # blue - descreased risk factor
descreased_risk_2 <- "#984ea3" # purple - further descrease in risk factor
g0 <- ds_predicted %>%
graph_logistic_point_complex_4(
x_name = "age_group"
,y_name = "dv_hat_p"
,covar_order = covar_order_values
,alpha_level = common_alpha
,y_title = dv_label_prob
,y_range = c(y_low, y_high)
)
g0 %>% print() # inspect
[1] "./reports/technique-demonstration/prints/1/g0.png"
[2] "./reports/technique-demonstration/prints/1/g1.png"
[3] "./reports/technique-demonstration/prints/1/g2.png"
[4] "./reports/technique-demonstration/prints/1/g3.png"
[5] "./reports/technique-demonstration/prints/1/g4.png"
[6] "./reports/technique-demonstration/prints/1/g5.png"
[7] "./reports/technique-demonstration/prints/1/g6.png"
# 1 step of color logic: no color is added
increased_risk_2 <- common_natural # red - further increased risk factor
increased_risk_1 <- common_natural # organge - increased risk factor
reference_color <- common_natural # green - REFERENCE category
descreased_risk_1 <-common_natural # blue - descreased risk factor
descreased_risk_2 <- common_natural # purple - further descrease in risk factor
# increased_risk_2 <- "#e41a1c" # red - further increased risk factor
# increased_risk_1 <- "#ff7f00" # organge - increased risk factor
# reference_color <- "#4daf4a" # green - REFERENCE category
# descreased_risk_1 <-"#377eb8" # blue - descreased risk factor
# descreased_risk_2 <- "#984ea3" # purple - further descrease in risk factor
g1 <- ds_predicted %>%
graph_logistic_point_complex_4(
x_name = "age_group"
,y_name = "dv_hat_p"
,covar_order = covar_order_values
,alpha_level = common_alpha
,y_title = dv_label_prob
,y_range = c(y_low, y_high)
)
g1 %>% print() # inspect
[1] "./reports/technique-demonstration/prints/1/g0.png"
[2] "./reports/technique-demonstration/prints/1/g1.png"
[3] "./reports/technique-demonstration/prints/1/g2.png"
[4] "./reports/technique-demonstration/prints/1/g3.png"
[5] "./reports/technique-demonstration/prints/1/g4.png"
[6] "./reports/technique-demonstration/prints/1/g5.png"
[7] "./reports/technique-demonstration/prints/1/g6.png"
# 2 step of color logic: add only the reference group
increased_risk_2 <- common_natural # red - further increased risk factor
increased_risk_1 <- common_natural # organge - increased risk factor
reference_color <- common_natural # green - REFERENCE category
descreased_risk_1 <-common_natural # blue - descreased risk factor
descreased_risk_2 <- common_natural # purple - further descrease in risk factor
# increased_risk_2 <- "#e41a1c" # red - further increased risk factor
# increased_risk_1 <- "#ff7f00" # organge - increased risk factor
reference_color <- "#4daf4a" # green - REFERENCE category
# descreased_risk_1 <-"#377eb8" # blue - descreased risk factor
# descreased_risk_2 <- "#984ea3" # purple - further descrease in risk factor
g2 <- ds_predicted %>%
graph_logistic_point_complex_4(
x_name = "age_group"
,y_name = "dv_hat_p"
,covar_order = covar_order_values
,alpha_level = common_alpha
,y_title = dv_label_prob
,y_range = c(y_low, y_high)
)
g2 %>% print() # inspect
[1] "./reports/technique-demonstration/prints/1/g0.png"
[2] "./reports/technique-demonstration/prints/1/g1.png"
[3] "./reports/technique-demonstration/prints/1/g2.png"
[4] "./reports/technique-demonstration/prints/1/g3.png"
[5] "./reports/technique-demonstration/prints/1/g4.png"
[6] "./reports/technique-demonstration/prints/1/g5.png"
[7] "./reports/technique-demonstration/prints/1/g6.png"
# 3 step of color logic: add moderately increased risk
increased_risk_2 <- common_natural # red - further increased risk factor
increased_risk_1 <- common_natural # organge - increased risk factor
reference_color <- common_natural # green - REFERENCE category
descreased_risk_1 <-common_natural # blue - descreased risk factor
descreased_risk_2 <- common_natural # purple - further descrease in risk factor
# increased_risk_2 <- "#e41a1c" # red - further increased risk factor
increased_risk_1 <- "#ff7f00" # organge - increased risk factor
# reference_color <- "#4daf4a" # green - REFERENCE category
# descreased_risk_1 <-"#377eb8" # blue - descreased risk factor
# descreased_risk_2 <- "#984ea3" # purple - further descrease in risk factor
g3 <- ds_predicted %>%
graph_logistic_point_complex_4(
x_name = "age_group"
,y_name = "dv_hat_p"
,covar_order = covar_order_values
,alpha_level = common_alpha
,y_title = dv_label_prob
,y_range = c(y_low, y_high)
)
g3 %>% print() # inspect
[1] "./reports/technique-demonstration/prints/1/g0.png"
[2] "./reports/technique-demonstration/prints/1/g1.png"
[3] "./reports/technique-demonstration/prints/1/g2.png"
[4] "./reports/technique-demonstration/prints/1/g3.png"
[5] "./reports/technique-demonstration/prints/1/g4.png"
[6] "./reports/technique-demonstration/prints/1/g5.png"
[7] "./reports/technique-demonstration/prints/1/g6.png"
# 4 step of color logic: add moderately decreased risk
increased_risk_2 <- common_natural # red - further increased risk factor
increased_risk_1 <- common_natural # organge - increased risk factor
reference_color <- common_natural # green - REFERENCE category
descreased_risk_1 <-common_natural # blue - descreased risk factor
descreased_risk_2 <- common_natural # purple - further descrease in risk factor
# increased_risk_2 <- "#e41a1c" # red - further increased risk factor
# increased_risk_1 <- "#ff7f00" # organge - increased risk factor
# reference_color <- "#4daf4a" # green - REFERENCE category
descreased_risk_1 <-"#377eb8" # blue - descreased risk factor
# descreased_risk_2 <- "#984ea3" # purple - further descrease in risk factor
g4 <- ds_predicted %>%
graph_logistic_point_complex_4(
x_name = "age_group"
,y_name = "dv_hat_p"
,covar_order = covar_order_values
,alpha_level = common_alpha
,y_title = dv_label_prob
,y_range = c(y_low, y_high)
)
g4 %>% print() # inspect
[1] "./reports/technique-demonstration/prints/1/g0.png"
[2] "./reports/technique-demonstration/prints/1/g1.png"
[3] "./reports/technique-demonstration/prints/1/g2.png"
[4] "./reports/technique-demonstration/prints/1/g3.png"
[5] "./reports/technique-demonstration/prints/1/g4.png"
[6] "./reports/technique-demonstration/prints/1/g5.png"
[7] "./reports/technique-demonstration/prints/1/g6.png"
# 5 step of color logic: add substantially increased risk
increased_risk_2 <- common_natural # red - further increased risk factor
increased_risk_1 <- common_natural # organge - increased risk factor
reference_color <- common_natural # green - REFERENCE category
descreased_risk_1 <-common_natural # blue - descreased risk factor
descreased_risk_2 <- common_natural # purple - further descrease in risk factor
increased_risk_2 <- "#e41a1c" # red - further increased risk factor
# increased_risk_1 <- "#ff7f00" # organge - increased risk factor
# reference_color <- "#4daf4a" # green - REFERENCE category
# descreased_risk_1 <-"#377eb8" # blue - descreased risk factor
# descreased_risk_2 <- "#984ea3" # purple - further descrease in risk factor
g5 <- ds_predicted %>%
graph_logistic_point_complex_4(
x_name = "age_group"
,y_name = "dv_hat_p"
,covar_order = covar_order_values
,alpha_level = common_alpha
,y_title = dv_label_prob
,y_range = c(y_low, y_high)
)
g5 %>% print() # inspect
[1] "./reports/technique-demonstration/prints/1/g0.png"
[2] "./reports/technique-demonstration/prints/1/g1.png"
[3] "./reports/technique-demonstration/prints/1/g2.png"
[4] "./reports/technique-demonstration/prints/1/g3.png"
[5] "./reports/technique-demonstration/prints/1/g4.png"
[6] "./reports/technique-demonstration/prints/1/g5.png"
[7] "./reports/technique-demonstration/prints/1/g6.png"
# 6 step of color logic: add substantially decreased risk
increased_risk_2 <- common_natural # red - further increased risk factor
increased_risk_1 <- common_natural # organge - increased risk factor
reference_color <- common_natural # green - REFERENCE category
descreased_risk_1 <-common_natural # blue - descreased risk factor
descreased_risk_2 <- common_natural # purple - further descrease in risk factor
# increased_risk_2 <- "#e41a1c" # red - further increased risk factor
# increased_risk_1 <- "#ff7f00" # organge - increased risk factor
# reference_color <- "#4daf4a" # green - REFERENCE category
# descreased_risk_1 <-"#377eb8" # blue - descreased risk factor
descreased_risk_2 <- "#984ea3" # purple - further descrease in risk factor
g6 <- ds_predicted %>%
graph_logistic_point_complex_4(
x_name = "age_group"
,y_name = "dv_hat_p"
,covar_order = covar_order_values
,alpha_level = common_alpha
,y_title = dv_label_prob
,y_range = c(y_low, y_high)
)
g6 %>% print() # inspect
[1] "./reports/technique-demonstration/prints/1/g0.png"
[2] "./reports/technique-demonstration/prints/1/g1.png"
[3] "./reports/technique-demonstration/prints/1/g2.png"
[4] "./reports/technique-demonstration/prints/1/g3.png"
[5] "./reports/technique-demonstration/prints/1/g4.png"
[6] "./reports/technique-demonstration/prints/1/g5.png"
[7] "./reports/technique-demonstration/prints/1/g6.png"
For the sake of documentation and reproducibility, the current report was rendered on a system using the following software.
Report rendered by an499583 at 2019-11-05, 11:39 -0500 in 78 seconds.
R version 3.5.2 (2018-12-20)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252
[3] LC_MONETARY=English_United States.1252 LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
attached base packages:
[1] grid stats graphics grDevices utils datasets methods base
other attached packages:
[1] knitr_1.23 dplyr_0.8.3 magrittr_1.5 RColorBrewer_1.1-2 dichromat_2.0-0
[6] ggplot2_3.1.1 extrafont_0.17
loaded via a namespace (and not attached):
[1] tidyselect_0.2.5 xfun_0.7 reshape2_1.4.3
[4] purrr_0.3.2 haven_2.1.0 carData_3.0-2
[7] colorspace_1.4-1 vctrs_0.1.0 htmltools_0.3.6
[10] yaml_2.2.0 utf8_1.1.4 rlang_0.4.0
[13] pillar_1.4.1 foreign_0.8-71 glue_1.3.1
[16] withr_2.1.2 readxl_1.3.1 jpeg_0.1-8
[19] plyr_1.8.4 stringr_1.4.0 munsell_0.5.0
[22] gtable_0.3.0 cellranger_1.1.0 zip_2.0.2
[25] evaluate_0.14 labeling_0.3 rio_0.5.16
[28] forcats_0.4.0 curl_3.3 fansi_0.4.0
[31] Rttf2pt1_1.3.7 Rcpp_1.0.1 backports_1.1.4
[34] scales_1.0.0 TabularManifest_0.1-16.9003 abind_1.4-5
[37] testit_0.9 digest_0.6.19 hms_0.4.2
[40] stringi_1.4.3 openxlsx_4.1.0.1 cowplot_0.9.4
[43] cli_1.1.0 tools_3.5.2 lazyeval_0.2.2
[46] tibble_2.1.3 crayon_1.3.4 extrafontdb_1.0
[49] car_3.0-3 pkgconfig_2.0.2 zeallot_0.1.0
[52] data.table_1.12.2 assertthat_0.2.1 rmarkdown_1.13
[55] rstudioapi_0.10 R6_2.4.0 compiler_3.5.2