This report was automatically generated with the R package knitr (version 1.20).
# This script imports the raw data described in the shared document
# https://drive.google.com/file/d/10idMxy8eX8nTHr6wr2Q40x4XOP3Y5ck7/view
# and prepares a data state that serves as the standard point of departure for any subsequent reproducible analytics
# Lines before the first chunk are invisible to Rmd/Rnw callers
# Run to stitch a tech report of this script (used only in RStudio)
# knitr::stitch_rmd(
# script = "./manipulation/0-greeter.R",
# output = "./manipulation/stitched-output/0-greeter.md"
# )
# this command is typically executed by the ./manipulation/governor.R
# Start clean: remove every object left in the workspace by a previous run.
rm(list = ls(all.names = TRUE))
# Per the original note, the line above is not called by knitr because it sits above the first chunk.
cat("\f") # form feed; clears the console when working in RStudio
library(magrittr) # attaches the %>% pipe
requireNamespace("dplyr", quietly = TRUE)
# locations of the two inputs: the raw micro data and the meta data guide
path_input_micro <- "./data-unshared/raw/ipdln_synth_final.csv"
path_input_meta  <- "./data-unshared/derived/ls_guide.rds"
# stop early if either input file is missing
testit::assert("File does not exist", base::file.exists(path_input_micro))
testit::assert("File does not exist", base::file.exists(path_input_meta))
# destination for the product of this script
path_save <- "./data-unshared/derived/0-greeted.rds"
# definitions of commonly used objects
source("./manipulation/object-glossary.R")
# Functions whose use is local to this script belong to this section;
# for commonly used functions see ./manipulation/function-support.R
# read the raw micro data and convert the result to a plain data frame
ds0 <- as.data.frame(readr::read_csv(path_input_micro))
## Parsed with column specification:
## cols(
## .default = col_integer()
## )
## See spec(...) for full column specifications.
# load the meta data object stored at path_input_meta
ls_guide <- readRDS(file = path_input_meta)
# quick look at the structure of the micro data (output limited to 50 characters wide)
dplyr::glimpse(ds0, width = 50)
## Observations: 4,346,649
## Variables: 34
## $ ABDERR_synth <int> 2, 2, 2, 2, ...
## $ ABIDENT_synth <int> 6, 6, 6, 6, ...
## $ ADIFCLTY_synth <int> 1, 1, 1, 1, ...
## $ CITSM_synth <int> 2, 2, 1, 1, ...
## $ COWD_synth <int> 4, 4, 7, 4, ...
## $ DISABFL_synth <int> 1, 1, 4, 1, ...
## $ DISABIL_synth <int> 9, 9, 14, 9,...
## $ DVISMIN_synth <int> 14, 14, 14, ...
## $ FOL_synth <int> 1, 1, 2, 1, ...
## $ FPTIM_synth <int> 1, 1, 3, 2, ...
## $ GENSTPOB_synth <int> 1, 1, 3, 3, ...
## $ HCDD_synth <int> 9, 8, 1, 2, ...
## $ IMMDER_synth <int> 1, 1, 3, 3, ...
## $ LOINCA_synth <int> 1, 1, 1, 1, ...
## $ LOINCB_synth <int> 1, 1, 1, 2, ...
## $ MARST_synth <int> 2, 2, 2, 4, ...
## $ NOCSBRD_synth <int> 4, 4, 11, 6,...
## $ OLN_synth <int> 3, 1, 2, 3, ...
## $ POBDER_synth <int> 3, 3, 1, 1, ...
## $ SEX_synth <int> 1, 1, 1, 1, ...
## $ TRMODE_synth <int> 2, 2, 9, 5, ...
## $ RPAIR_synth <int> 3, 1, 1, 2, ...
## $ PR_synth <int> 35, 46, 24, ...
## $ RUINDFG_synth <int> 1, 1, 2, 2, ...
## $ d_licoratio_da_bef_synth <int> 5, 3, 3, 2, ...
## $ S_DEAD_synth <int> 2, 2, 1, 2, ...
## $ EFCNT_PP_R_synth <int> 4, 5, 2, 4, ...
## $ AGE_IMM_R_group_synth <int> 8, 6, 15, 15...
## $ COD1_synth <int> 5, 5, 2, 5, ...
## $ COD2_synth <int> 14, 14, 13, ...
## $ DPOB11N_synth <int> 4, 2, 1, 1, ...
## $ KID_group_synth <int> 2, 3, 1, 2, ...
## $ YRIM_group_synth <int> 1, 1, 6, 6, ...
## $ age_group_synth <int> 5, 3, 10, 1,...
# remove the unnecessary suffix in the name of variables
# the pattern is anchored at the end of the name, so it can match at most once per name
names(ds0) <- sub(pattern = "_synth$", replacement = "", x = names(ds0))
# print the cleaned names
names(ds0)
## [1] "ABDERR" "ABIDENT" "ADIFCLTY"
## [4] "CITSM" "COWD" "DISABFL"
## [7] "DISABIL" "DVISMIN" "FOL"
## [10] "FPTIM" "GENSTPOB" "HCDD"
## [13] "IMMDER" "LOINCA" "LOINCB"
## [16] "MARST" "NOCSBRD" "OLN"
## [19] "POBDER" "SEX" "TRMODE"
## [22] "RPAIR" "PR" "RUINDFG"
## [25] "d_licoratio_da_bef" "S_DEAD" "EFCNT_PP_R"
## [28] "AGE_IMM_R_group" "COD1" "COD2"
## [31] "DPOB11N" "KID_group" "YRIM_group"
## [34] "age_group"
# inspect the micro data again, now with the shortened variable names
dplyr::glimpse(ds0, width = 50)
## Observations: 4,346,649
## Variables: 34
## $ ABDERR <int> 2, 2, 2, 2, 2, 2, ...
## $ ABIDENT <int> 6, 6, 6, 6, 6, 6, ...
## $ ADIFCLTY <int> 1, 1, 1, 1, 1, 1, ...
## $ CITSM <int> 2, 2, 1, 1, 1, 1, ...
## $ COWD <int> 4, 4, 7, 4, 4, 7, ...
## $ DISABFL <int> 1, 1, 4, 1, 1, 1, ...
## $ DISABIL <int> 9, 9, 14, 9, 9, 9,...
## $ DVISMIN <int> 14, 14, 14, 14, 14...
## $ FOL <int> 1, 1, 2, 1, 1, 2, ...
## $ FPTIM <int> 1, 1, 3, 2, 1, 3, ...
## $ GENSTPOB <int> 1, 1, 3, 3, 3, 3, ...
## $ HCDD <int> 9, 8, 1, 2, 5, 1, ...
## $ IMMDER <int> 1, 1, 3, 3, 3, 3, ...
## $ LOINCA <int> 1, 1, 1, 1, 1, 1, ...
## $ LOINCB <int> 1, 1, 1, 2, 1, 1, ...
## $ MARST <int> 2, 2, 2, 4, 2, 2, ...
## $ NOCSBRD <int> 4, 4, 11, 6, 4, 11...
## $ OLN <int> 3, 1, 2, 3, 1, 2, ...
## $ POBDER <int> 3, 3, 1, 1, 2, 1, ...
## $ SEX <int> 1, 1, 1, 1, 2, 2, ...
## $ TRMODE <int> 2, 2, 9, 5, 2, 9, ...
## $ RPAIR <int> 3, 1, 1, 2, 1, 1, ...
## $ PR <int> 35, 46, 24, 59, 48...
## $ RUINDFG <int> 1, 1, 2, 2, 2, 1, ...
## $ d_licoratio_da_bef <int> 5, 3, 3, 2, 9, 7, ...
## $ S_DEAD <int> 2, 2, 1, 2, 2, 2, ...
## $ EFCNT_PP_R <int> 4, 5, 2, 4, 4, 3, ...
## $ AGE_IMM_R_group <int> 8, 6, 15, 15, 15, ...
## $ COD1 <int> 5, 5, 2, 5, 5, 5, ...
## $ COD2 <int> 14, 14, 13, 14, 14...
## $ DPOB11N <int> 4, 2, 1, 1, 1, 1, ...
## $ KID_group <int> 2, 3, 1, 2, 2, 2, ...
## $ YRIM_group <int> 1, 1, 6, 6, 6, 6, ...
## $ age_group <int> 5, 3, 10, 1, 8, 11...
# augment the micro data with meta data
# function to augment micro data with meta data
# Convert every column of `d` that has value labels in the meta data into a
# labelled factor.
#
# For a column named <v>, the labels are looked up in `l$item[[<v>]]$levels`:
# the NAMES of that element are the raw codes found in the data and its VALUES
# are the labels shown to the reader. Raw values that do not match any code
# become NA (standard behaviour of `factor()`).
#
# Columns for which the meta data holds no levels are returned unchanged and
# are listed in a single warning. (Passing empty levels to `factor()` would
# silently turn the whole column into NA.)
#
# Returns `d` with the same columns, in the same order, under the same names.
augment_with_meta <- function(
  d, # a dataframe with the original raw data, prepared by the ./manipulation/0-greeter.R
  l  # a list object with organized meta data, prepared by the ./manipulation/0-metador.R
){
  variable_names <- names(d)
  # which columns have at least one level defined in the meta data?
  has_levels <- vapply(
    variable_names,
    function(name_i) length(l$item[[name_i]]$levels) > 0L,
    logical(1)
  )
  for(name_i in variable_names[has_levels]){
    labels_i <- l$item[[name_i]]$levels
    # assign in place: no temporary rename, so column order and names cannot change
    d[[name_i]] <- factor(
      d[[name_i]],
      levels = names(labels_i), # raw codes
      labels = labels_i         # human-readable labels
    )
  }
  if(any(!has_levels)){
    warning(
      "No meta data levels found for: ",
      paste(variable_names[!has_levels], collapse = ", "),
      ". These columns were left unchanged.",
      call. = FALSE
    )
  }
  return(d)
}
# usage: replace the raw codes in the micro data with labelled factors, then inspect
ds1 <- augment_with_meta(ds0, ls_guide)
dplyr::glimpse(ds1, width = 50)
## Observations: 4,346,649
## Variables: 34
## $ ABDERR <fct> Non-Aboriginal Ide...
## $ ABIDENT <fct> Non-Aboriginal ide...
## $ ADIFCLTY <fct> No, No, No, No, No...
## $ CITSM <fct> Not a Canadian cit...
## $ COWD <fct> Paid Worker - Work...
## $ DISABFL <fct> No, No, Yes, somet...
## $ DISABIL <fct> No difficulty with...
## $ DVISMIN <fct> Not a visible mino...
## $ FOL <fct> English , English ...
## $ FPTIM <fct> NA, NA, NA, NA, NA...
## $ GENSTPOB <fct> 1st generation - R...
## $ HCDD <fct> Bachelors degree, ...
## $ IMMDER <fct> Immigrants, Immigr...
## $ LOINCA <fct> non-low income, no...
## $ LOINCB <fct> non-low income, no...
## $ MARST <fct> Legally married (a...
## $ NOCSBRD <fct> D Health occupatio...
## $ OLN <fct> Both English and F...
## $ POBDER <fct> Born outside Cana...
## $ SEX <fct> Female, Female, Fe...
## $ TRMODE <fct> Car, truck, van as...
## $ RPAIR <fct> Yes, major repairs...
## $ PR <fct> Ontario, Manitoba,...
## $ RUINDFG <fct> Rural, Rural, Urba...
## $ d_licoratio_da_bef <fct> 5th decile, 3rd ...
## $ S_DEAD <fct> Not dead, Not dead...
## $ EFCNT_PP_R <fct> 4 family members, ...
## $ AGE_IMM_R_group <fct> NA, NA, NA, NA, NA...
## $ COD1 <fct> Did not die, Did n...
## $ COD2 <fct> Did not die, Did n...
## $ DPOB11N <fct> NA, NA, NA, NA, NA...
## $ KID_group <fct> one or two childre...
## $ YRIM_group <fct> 2002 or later, 200...
## $ age_group <fct> 40 to 44, 30 to 34...
# announce the destination before writing
cat("Save results to ", path_save)
## Save results to ./data-unshared/derived/0-greeted.rds
# store the augmented data as the product of this script
saveRDS(object = ds1, file = path_save)
The R session information (including the OS info, R version and all packages used):
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows >= 8 x64 (build 9200)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 magrittr_1.5
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.18 dplyr_0.7.6 assertthat_0.2.0 R6_2.2.2
## [5] evaluate_0.10.1 pillar_1.2.1 stringi_1.1.7 rlang_0.2.0
## [9] testit_0.7 tools_3.4.4 stringr_1.3.1 readr_1.1.1
## [13] glue_1.2.0 purrr_0.2.4 hms_0.4.1 yaml_2.1.19
## [17] compiler_3.4.4 pkgconfig_2.0.1 knitr_1.20 bindr_0.1.1
## [21] tidyselect_0.2.3 tibble_1.4.2
Sys.time()
## [1] "2018-09-05 14:47:01 PDT"