SatRday Chicago, April 27, 2019

Parfait Gasana

Data Analyst, Winston & Strawn
  @ParfaitG (GitHub)




R as an Extendable Environment



Relational Databases



Advantages of Relational Databases in Data Science



Programming Interfaces



R DBI Standard



Use Cases



CTA Bus/Rail Aggregation

library(DBI)
library(RPostgreSQL)

# CONNECT
conn <- dbConnect(RPostgreSQL::PostgreSQL(), host=config$pg_conn$host, dbname=config$pg_conn$name,
                  user=config$pg_conn$user, password=config$pg_conn$pwd, port=config$pg_conn$port)
# R AGGREGATION (MULTIPLE FUNCS)
# do.call(data.frame, 
#        aggregate(rides ~ route + routename, agg_csv,
#                  function(x) c(count=length(x), sum=sum(x), mean=mean(x),
#                                median=median(x), min=min(x), max=max(x))))

# POSTGRES AGGREGATION
sql <- 'SELECT rt.route_name, 
               COUNT(rd.rides) AS "count", 
               SUM(rd.rides) AS "sum", 
               AVG(rd.rides) AS "mean", 
               MEDIAN(rd.rides) AS "median",
               R_MEDIAN(rd.rides) AS "r_median",
               MIN(rd.rides) AS "min", 
               MAX(rd.rides) AS "max"
        FROM bus_routes rt
        INNER JOIN bus_rides rd ON rt.route_id = rd.route_id
        GROUP BY rt.route_name
        ORDER BY SUM(rd.rides) DESC
        LIMIT 10'

agg_sql <- dbGetQuery(conn, sql)

kable_styling(kable(agg_sql),
              bootstrap_options = c("striped", "hover", "condensed"))
route_name count sum mean median r_median min max
79th 6513 176238493 27059.50 27672 27672 7896 43081
Ashland 6513 156178845 23979.56 23676 23676 5570 45177
Chicago 6513 130620291 20055.32 21767 21767 4448 35893
Western 6513 129322319 19856.03 19553 19553 4985 37202
Cottage Grove 6513 128999766 19806.50 21181 21181 5489 31187
Belmont 6513 123065617 18895.38 20732 20732 3451 30025
Clark 6513 119459342 18341.68 19171 19171 3310 26896
King Drive 6513 119070638 18282.00 19402 19402 3993 29896
Halsted 6513 118284951 18161.36 19305 19305 2915 30605
Sheridan 6513 118178375 18145.00 18812 18812 2666 33236

Graphing

ggplot(agg_sql, aes(route_name, sum, fill=route_name)) + geom_col(position = "dodge") +
  labs(title="CTA Top 10 Bus Routes by Ridership", x="Year", y="Rides") +
  scale_y_continuous(expand = c(0, 0), label=comma, limit=c(0,2E8)) + guides(fill=FALSE) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))


Calculation + Aggregation

sql <- 'WITH station_agg AS
         (SELECT DATE_PART(\'year\', r.ride_date)::integer AS "year",
                 r.station_id,
                 r.station_name,
                 COUNT(r.rides)::numeric(20,5) AS "count", 
                 SUM(r.rides)::numeric(20,5) AS "sum", 
                 AVG(r.rides)::numeric(20,5) AS "mean", 
                 MEDIAN(r.rides)::numeric(20,5) AS "median",
                 MIN(r.rides)::numeric(20,5) AS "min", 
                 MAX(r.rides)::numeric(20,5) AS "max"
          FROM rail_rides r
          GROUP BY DATE_PART(\'year\', r.ride_date),
                   r.station_id,
                   r.station_name
          ),
                   
      merge_rail AS
         (SELECT s.*, 
                 r.rail_line,
                 (s."sum" / COUNT(*) OVER(PARTITION BY s.station_id, "year")) AS rail_total
          FROM station_agg s
          INNER JOIN rail_stations r ON s.station_id = r.station_id
         )
         
      SELECT m."year", m.rail_line, SUM(m.rail_total)  AS rail_total
      FROM merge_rail m
      GROUP BY m."year", m.rail_line
      ORDER BY m.rail_line, m."year"'
  
agg_sql <- dbGetQuery(conn, sql)

kable_styling(kable(agg_sql[sample(1:nrow(agg_sql), 20),]),
              bootstrap_options = c("striped", "hover", "condensed"))
year rail_line rail_total
115 2007 purple_exp 11685317
132 2006 red 56267405
25 2007 brown 15158712
24 2006 brown 15900764
85 2013 pink 10076391
91 2001 purple 2025669
74 2002 pink 5668260
112 2004 purple_exp 10812035
161 2017 yellow 1373701
107 2017 purple 2054181
38 2002 green 11275092
13 2013 blue 45304180
145 2001 yellow 1175412
51 2015 green 15502181
62 2008 orange 12915598
93 2003 purple 2016380
117 2009 purple_exp 12493455
128 2002 red 55489388
8 2008 blue 37294109
109 2001 purple_exp 11168731

Graphing

cta_palette <- c(blue="#00A1DE", brown="#62361B", green="#009B3A", orange="#F9461C", pink="#E27EA6",
                 purple="#522398", purple_exp="#8059BA", red="#C60C30", yellow="#F9E300")

ggplot(agg_sql, aes(year, rail_total, color=rail_line)) + 
  geom_line(stat="identity") + geom_point(stat="identity") +
  labs(title="CTA Rail Ridership By Year", x="Year", y="Rides") +
  scale_x_continuous("year", breaks=unique(agg_sql$year)) +
  scale_y_continuous(expand = c(0, 0), label=comma) +
  scale_color_manual(values = cta_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))


Modeling

CREATE MATERIALIZED VIEW Rail_Model_Data AS
    SELECT r.id, r.station_id, r.station_name, r.ride_date, r.day_type, r.rides AS raw, 
          (r.rides / COUNT(*) OVER(PARTITION BY r.station_id, r.ride_date)) AS rides,
          CASE 
               WHEN r.normalized_date BETWEEN '2099-01-01' AND '2099-03-19' THEN 'winter'
               WHEN r.normalized_date BETWEEN '2099-03-20' AND '2099-06-19' THEN 'spring'
               WHEN r.normalized_date BETWEEN '2099-06-20' AND '2099-09-19' THEN 'summer'
               WHEN r.normalized_date BETWEEN '2099-09-20' AND '2099-12-19' THEN 'fall'
               WHEN r.normalized_date BETWEEN '2099-12-20' AND '2099-12-31' THEN 'winter'
               ELSE NULL
           END As season,        
           REPLACE(REPLACE((regexp_split_to_array(s.location, '\s+'))[1], ',', ''), '(', '')::numeric AS latitude,
           REPLACE((regexp_split_to_array(s.location, '\s+'))[2], ')', '')::numeric AS longitude,
           s.rail_line, s.ada, s.direction,
           ue.ue_rate, g.gas_price, w.avg_temp, w.precipitation, w.snow_depth
    FROM 
       (
        SELECT id, station_id, station_name, day_type, rides, ride_date, 
               ride_date + (2099 - date_part('year', ride_date)  ||' year')::interval as normalized_date
        FROM rail_rides
       )r
    INNER JOIN rail_stations s ON s.station_id = r.station_id
    INNER JOIN unemployment_rates ue ON ue.ue_date = r.ride_date
    INNER JOIN gas_prices g ON g.gas_date = r.ride_date
    INNER JOIN weather_data w ON w.weather_date = r.ride_date
    ORDER BY r.ride_date, r.station_id;
    
REFRESH MATERIALIZED VIEW Rail_Model_Data;
rail_model_data <- dbGetQuery(conn, "SELECT * FROM rail_model_data")

scroll_box(kable_styling(kable(head(rail_model_data)),
                        bootstrap_options = c("striped", "hover", "condensed")), width = "100%")
id station_id station_name ride_date day_type raw rides season latitude longitude rail_line ada direction ue_rate gas_price avg_temp precipitation snow_depth
62 40010 Austin-Forest Park 2001-01-01 U 290 290 winter 41.87085 -87.77681 blue 0 W | E 5.3 1.487 14.5 0 17
61 40020 Harlem-Lake 2001-01-01 U 633 633 winter 41.88685 -87.80318 green 1 E | W 5.3 1.487 14.5 0 17
50 40030 Pulaski-Lake 2001-01-01 U 483 483 winter 41.88541 -87.72540 green 1 W | E 5.3 1.487 14.5 0 17
109 40040 Quincy/Wells 2001-01-01 U 374 93 winter 41.87872 -87.63374 pink 0 N 5.3 1.487 14.5 0 17
109 40040 Quincy/Wells 2001-01-01 U 374 93 winter 41.87872 -87.63374 brown 0 S 5.3 1.487 14.5 0 17
109 40040 Quincy/Wells 2001-01-01 U 374 93 winter 41.87872 -87.63374 orange 0 N 5.3 1.487 14.5 0 17
model <- lm(rides ~ day_type + season + latitude + longitude + rail_line + 
                    ue_rate + gas_price + avg_temp + precipitation + snow_depth, 
            data = rail_model_data)

# ANOVA
kable_styling(kable(xtable(anova(model))),
              bootstrap_options = c("striped", "hover", "condensed"))
Df Sum Sq Mean Sq F value Pr(>F)
day_type 2 445663599687 222831799844 64679.229601 0.0000000
season 3 12065643417 4021881139 1167.392508 0.0000000
latitude 1 86867071786 86867071786 25214.064082 0.0000000
longitude 1 319342686 319342686 92.692510 0.0000000
rail_line 8 1832581552014 229072694002 66490.713539 0.0000000
ue_rate 1 1655357962 1655357962 480.484732 0.0000000
gas_price 1 22101747909 22101747909 6415.260428 0.0000000
avg_temp 1 3204630785 3204630785 930.177158 0.0000000
precipitation 1 878964701 878964701 255.128576 0.0000000
snow_depth 1 8106972 8106972 2.353132 0.1250319
Residuals 1038214 3576837505172 3445183 NA NA
# POINT ESTIMATES
kable_styling(kable(xtable(summary(model))),
              bootstrap_options = c("striped", "hover", "condensed"))
Estimate Std. Error t value Pr(>|t|)
(Intercept) -67333.240996 3539.2418169 -19.024764 0.0000000
day_typeU -454.935197 6.6213391 -68.707430 0.0000000
day_typeW 1159.154756 5.2660141 220.119950 0.0000000
seasonspring -180.266785 5.3603614 -33.629595 0.0000000
seasonsummer -179.982024 6.5944919 -27.292781 0.0000000
seasonwinter -160.120620 6.1885199 -25.873815 0.0000000
latitude -3954.668735 36.5219229 -108.282051 0.0000000
longitude -2679.792012 43.6084183 -61.451254 0.0000000
rail_linebrown -1187.766815 7.2025790 -164.908543 0.0000000
rail_linegreen -2076.775862 6.8343576 -303.872871 0.0000000
rail_lineorange -1082.768528 8.2804788 -130.761584 0.0000000
rail_linepink -2215.792821 7.3581389 -301.134954 0.0000000
rail_linepurple -2043.636304 11.1396088 -183.456738 0.0000000
rail_linepurple_exp -1558.043094 7.6564149 -203.495122 0.0000000
rail_linered 1636.661666 6.9037116 237.069820 0.0000000
rail_lineyellow -1418.136925 17.5558352 -80.778665 0.0000000
ue_rate -7.639672 0.9665412 -7.904135 0.0000000
gas_price 188.159773 2.4375323 77.192729 0.0000000
avg_temp 5.141775 0.1658352 31.005329 0.0000000
precipitation -84.092077 5.2552649 -16.001492 0.0000000
snow_depth 1.848654 1.2051261 1.533992 0.1250319
graph_data <- data.frame(param = names(model$coefficients[-1]),
                         value = model$coefficients[-1],
                         row.names = NULL)

ggplot(graph_data) + geom_col(aes(x=param, y=value, fill=param), position = "dodge") +
  labs(title="CTA System Rail Regression Point Estimates", x="Parameters", y="Value") +
  guides(fill=FALSE) + ylim(-4000, 2000) + 
  scale_fill_manual(values = seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=45, vjust=0.95, hjust=0.95))

# DISCONNECT FROM DATABASE
dbDisconnect(conn)
## [1] TRUE


Divvy Time and Distance Calculation

options(connectionObserver = NULL)

library(odbc)

conn <- dbConnect(odbc::odbc(),
                  instance = config$db2_conn$instance,
                  driver = config$db2_conn$driver,
                  database = config$db2_conn$database,
                  server = config$db2_conn$server,
                  hostname = config$db2_conn$hostname,
                  port = config$db2_conn$port,
                  uid = config$db2_conn$uid,
                  pwd = config$db2_conn$pwd,
                  LongDataCompat = 1)
day_df <- dbGetQuery(conn, "SELECT t.START_TIME, t.STOP_TIME,
                                   t.TRIP_DURATION
                            FROM TRIPS t
                            WHERE DATE(t.START_TIME) = TO_DATE('2015-09-23', 'YYYY-MM-DD')
                            ORDER BY t.START_TIME")

kable_styling(kable(head(day_df, 20)),
              bootstrap_options = c("striped", "hover", "condensed"))
START_TIME STOP_TIME TRIP_DURATION
2015-09-23 00:00:00 2015-09-23 00:07:00 414
2015-09-23 00:00:00 2015-09-23 00:03:00 183
2015-09-23 00:01:00 2015-09-23 00:09:00 450
2015-09-23 00:01:00 2015-09-23 00:24:00 1350
2015-09-23 00:01:00 2015-09-23 00:07:00 402
2015-09-23 00:02:00 2015-09-23 00:15:00 793
2015-09-23 00:03:00 2015-09-23 00:27:00 1435
2015-09-23 00:04:00 2015-09-23 00:06:00 134
2015-09-23 00:05:00 2015-09-23 00:19:00 820
2015-09-23 00:06:00 2015-09-23 00:15:00 526
2015-09-23 00:07:00 2015-09-23 00:15:00 506
2015-09-23 00:08:00 2015-09-23 00:15:00 425
2015-09-23 00:09:00 2015-09-23 00:32:00 1372
2015-09-23 00:10:00 2015-09-23 00:24:00 881
2015-09-23 00:10:00 2015-09-23 00:35:00 1445
2015-09-23 00:10:00 2015-09-23 00:26:00 935
2015-09-23 00:11:00 2015-09-23 00:23:00 717
2015-09-23 00:13:00 2015-09-23 00:19:00 382
2015-09-23 00:14:00 2015-09-23 00:15:00 66
2015-09-23 00:16:00 2015-09-23 00:38:00 1316
ggplot(day_df, aes(START_TIME, TRIP_DURATION)) + geom_line(color=seaborn_palette[1]) +
  xlab("Start Time") + ylab("Trip Duration")

agg_sql <- dbGetQuery(conn, "WITH sub AS 
                                (SELECT t.FROM_STATION_ID, t.FROM_STATION_NAME,
                                        t.FROM_LATITUDE, t.FROM_LONGITUDE,
                                        ROUND(CASE 
                                                   WHEN t.TO_LATITUDE IS NOT NULL AND t.FROM_LATITUDE IS NOT NULL
                                                   THEN
                                                        3963.1 * (2 * ATAN(1) - 
                                                                  ASIN(SIN(RADIANS(t.FROM_LATITUDE)) * SIN(RADIANS(t.TO_LONGITUDE)) + 
                                                                       COS(RADIANS(t.FROM_LATITUDE)) * COS(RADIANS(t.TO_LONGITUDE)) * 
                                                                       COS(RADIANS(t.TO_LONGITUDE - t.FROM_LONGITUDE))
                                                                      ) 
                                                                 )
                                                   ELSE NULL
                                              END, 4) AS TRIP_DISTANCE
                                 FROM TRIPS t
                                 WHERE DATE(t.START_TIME) = TO_DATE('2015-09-23', 'YYYY-MM-DD'))
                            
                            SELECT FROM_STATION_ID, FROM_STATION_NAME, 
                                   FROM_LATITUDE, FROM_LONGITUDE,
                                   MIN(TRIP_DISTANCE) AS MIN_DISTANCE,
                                   MAX(TRIP_DISTANCE) AS MAX_DISTANCE
                            
                            FROM sub 
                            GROUP BY FROM_STATION_ID, FROM_STATION_NAME, 
                                     FROM_LATITUDE, FROM_LONGITUDE
                            ORDER BY MAX(TRIP_DISTANCE) DESC
                            FETCH FIRST 10 ROWS ONLY;")

kable_styling(kable(agg_sql), bootstrap_options = c("striped", "hover", "condensed"))
FROM_STATION_ID FROM_STATION_NAME FROM_LATITUDE FROM_LONGITUDE MIN_DISTANCE MAX_DISTANCE
467 Western Ave & Lunt Ave 42.00859 -87.69049 8969.148 8971.314
494 Kedzie Ave & Bryn Mawr Ave 41.98240 -87.70892 8969.882 8971.135
447 Glenwood Ave & Morse Ave 42.00797 -87.66550 8967.513 8971.132
466 Ridge Blvd & Touhy Ave 42.01213 -87.68291 8969.393 8970.895
294 Broadway & Berwyn Ave 41.97835 -87.65975 8963.826 8970.855
479 Drake Ave & Montrose Ave 41.96115 -87.71657 8969.369 8970.804
469 St. Louis Ave & Balmoral Ave 41.98039 -87.71611 8967.694 8970.700
450 Warren Park West 42.00201 -87.68973 8970.580 8970.580
451 Sheridan Rd & Loyola Ave 42.00104 -87.66120 8966.526 8970.579
325 Clark St & Winnemac Ave 41.97339 -87.66836 8964.613 8970.512

Mapping

divvy_from <- dbGetQuery(conn, "SELECT t.FROM_STATION_NAME, 
                                       t.FROM_LATITUDE, 
                                       t.FROM_LONGITUDE,
                                       SUM(t.TRIP_DURATION) AS Total_Duration,
                                       MIN(t.TRIP_DURATION) AS Min_Duration,
                                       AVG(t.TRIP_DURATION) AS Avg_Duration,
                                       MEDIAN(t.TRIP_DURATION) AS Median_Duration,
                                       MAX(t.TRIP_DURATION) AS Max_Duration
                                       
                                FROM TRIPS t
                                GROUP BY t.FROM_STATION_NAME,
                                         t.FROM_LATITUDE, 
                                         t.FROM_LONGITUDE
                                ORDER BY SUM(t.TRIP_DURATION) DESC
                                FETCH FIRST 10 ROWS ONLY;")

kable_styling(kable(divvy_from), bootstrap_options = c("striped", "hover", "condensed"))
FROM_STATION_NAME FROM_LATITUDE FROM_LONGITUDE TOTAL_DURATION MIN_DURATION AVG_DURATION MEDIAN_DURATION MAX_DURATION
Lake Shore Dr & Monroe St 41.88096 -87.61674 444455108 60 1803 1347 6403880
Streeter Dr & Grand Ave 41.89228 -87.61204 431638473 60 1956 1417 5444590
Michigan Ave & Oak St 41.90096 -87.62378 351053592 60 1815 1300 2549130
Millennium Park 41.88103 -87.62408 347331136 60 1820 1241 6785180
Theater on the Lake 41.92628 -87.63083 330660276 60 1524 1285 740561
Lake Shore Dr & North Blvd 41.91172 -87.62680 286215561 60 1452 1173 336578
Streeter Dr & Illinois St 41.89107 -87.61220 222454638 60 1615 1320 84073
Canal St & Adams St 41.87926 -87.63990 184661822 60 837 614 9961130
Michigan Ave & Washington St 41.88389 -87.62465 179427697 60 1160 649 1876990
Shedd Aquarium 41.86722 -87.61535 178996820 60 1675 1307 1389890
ggplot(transform(divvy_from, FROM_STATION_NAME=gsub("&", "\n&", FROM_STATION_NAME)),
                 aes(FROM_STATION_NAME, TOTAL_DURATION, fill=FROM_STATION_NAME)) +
  geom_col(position = "dodge") +
  labs(title="Divvy Top 10 Origination Stations", x="Station", y="Trip Duration (seconds)") +
  scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

divvy_to <- dbGetQuery(conn, "SELECT t.TO_STATION_NAME, 
                                     t.TO_LATITUDE, 
                                     t.TO_LONGITUDE,
                                     SUM(t.TRIP_DURATION) AS Total_Duration,
                                     MIN(t.TRIP_DURATION) AS Min_Duration,
                                     AVG(t.TRIP_DURATION) AS Avg_Duration,
                                     MEDIAN(t.TRIP_DURATION) AS Median_Duration,
                                     MAX(t.TRIP_DURATION) AS Max_Duration
                                   
                            FROM TRIPS t
                            WHERE TO_STATION_NAME <> 'DIVVY Map Frame B/C Station'
                            GROUP BY t.TO_STATION_NAME,
                                     t.TO_LATITUDE, 
                                     t.TO_LONGITUDE
                            ORDER BY SUM(t.TRIP_DURATION) DESC
                            FETCH FIRST 10 ROWS ONLY;")

kable_styling(kable(divvy_to), bootstrap_options = c("striped", "hover", "condensed"))
TO_STATION_NAME TO_LATITUDE TO_LONGITUDE TOTAL_DURATION MIN_DURATION AVG_DURATION MEDIAN_DURATION MAX_DURATION
Streeter Dr & Grand Ave 41.89228 -87.61204 469992965 60 1914 1432 297905
Lake Shore Dr & Monroe St 41.88096 -87.61674 396754905 60 1702 1318 731920
Theater on the Lake 41.92628 -87.63083 370722559 60 1598 1374 383963
Michigan Ave & Oak St 41.90096 -87.62378 369016876 60 1782 1306 2549130
Millennium Park 41.88103 -87.62408 344723608 60 1637 1157 566401
Lake Shore Dr & North Blvd 41.91172 -87.62680 337381528 60 1524 1295 269506
Streeter Dr & Illinois St 41.89107 -87.61220 264397622 60 1611 1325 85368
Michigan Ave & Washington St 41.88389 -87.62465 158901379 60 1003 572 3054250
Shedd Aquarium 41.86722 -87.61535 158356779 60 1563 1307 184015
Adler Planetarium 41.86610 -87.60727 153361126 60 1635 1357 524315
ggplot(transform(divvy_to, TO_STATION_NAME=gsub("&", "\n&", TO_STATION_NAME)), 
       aes(TO_STATION_NAME, TOTAL_DURATION, fill=TO_STATION_NAME)) +
  geom_col(position = "dodge") +
  labs(title="Divvy Top 10 Destination Stations", x="Station", y="Trip Duration (seconds)") +
  scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

library(jsonlite)

x <- toJSON(divvy_from[c("FROM_STATION_NAME", "FROM_LATITUDE", "FROM_LONGITUDE")], pretty=TRUE)

# EXPORT TO FILE
fileConn <- file("Divvy_From_Coords.json")
writeLines(x, fileConn)
close(fileConn)

x
## [
##   {
##     "FROM_STATION_NAME": "Lake Shore Dr & Monroe St",
##     "FROM_LATITUDE": 41.881,
##     "FROM_LONGITUDE": -87.6167
##   },
##   {
##     "FROM_STATION_NAME": "Streeter Dr & Grand Ave",
##     "FROM_LATITUDE": 41.8923,
##     "FROM_LONGITUDE": -87.612
##   },
##   {
##     "FROM_STATION_NAME": "Michigan Ave & Oak St",
##     "FROM_LATITUDE": 41.901,
##     "FROM_LONGITUDE": -87.6238
##   },
##   {
##     "FROM_STATION_NAME": "Millennium Park",
##     "FROM_LATITUDE": 41.881,
##     "FROM_LONGITUDE": -87.6241
##   },
##   {
##     "FROM_STATION_NAME": "Theater on the Lake",
##     "FROM_LATITUDE": 41.9263,
##     "FROM_LONGITUDE": -87.6308
##   },
##   {
##     "FROM_STATION_NAME": "Lake Shore Dr & North Blvd",
##     "FROM_LATITUDE": 41.9117,
##     "FROM_LONGITUDE": -87.6268
##   },
##   {
##     "FROM_STATION_NAME": "Streeter Dr & Illinois St",
##     "FROM_LATITUDE": 41.8911,
##     "FROM_LONGITUDE": -87.6122
##   },
##   {
##     "FROM_STATION_NAME": "Canal St & Adams St",
##     "FROM_LATITUDE": 41.8793,
##     "FROM_LONGITUDE": -87.6399
##   },
##   {
##     "FROM_STATION_NAME": "Michigan Ave & Washington St",
##     "FROM_LATITUDE": 41.8839,
##     "FROM_LONGITUDE": -87.6246
##   },
##   {
##     "FROM_STATION_NAME": "Shedd Aquarium",
##     "FROM_LATITUDE": 41.8672,
##     "FROM_LONGITUDE": -87.6154
##   }
## ]
x <- toJSON(divvy_to[c("TO_STATION_NAME", "TO_LATITUDE", "TO_LONGITUDE")], pretty=TRUE)

# EXPORT TO FILE
fileConn <- file("Divvy_To_Coords.json")
writeLines(x, fileConn)
close(fileConn)

x
## [
##   {
##     "TO_STATION_NAME": "Streeter Dr & Grand Ave",
##     "TO_LATITUDE": 41.8923,
##     "TO_LONGITUDE": -87.612
##   },
##   {
##     "TO_STATION_NAME": "Lake Shore Dr & Monroe St",
##     "TO_LATITUDE": 41.881,
##     "TO_LONGITUDE": -87.6167
##   },
##   {
##     "TO_STATION_NAME": "Theater on the Lake",
##     "TO_LATITUDE": 41.9263,
##     "TO_LONGITUDE": -87.6308
##   },
##   {
##     "TO_STATION_NAME": "Michigan Ave & Oak St",
##     "TO_LATITUDE": 41.901,
##     "TO_LONGITUDE": -87.6238
##   },
##   {
##     "TO_STATION_NAME": "Millennium Park",
##     "TO_LATITUDE": 41.881,
##     "TO_LONGITUDE": -87.6241
##   },
##   {
##     "TO_STATION_NAME": "Lake Shore Dr & North Blvd",
##     "TO_LATITUDE": 41.9117,
##     "TO_LONGITUDE": -87.6268
##   },
##   {
##     "TO_STATION_NAME": "Streeter Dr & Illinois St",
##     "TO_LATITUDE": 41.8911,
##     "TO_LONGITUDE": -87.6122
##   },
##   {
##     "TO_STATION_NAME": "Michigan Ave & Washington St",
##     "TO_LATITUDE": 41.8839,
##     "TO_LONGITUDE": -87.6246
##   },
##   {
##     "TO_STATION_NAME": "Shedd Aquarium",
##     "TO_LATITUDE": 41.8672,
##     "TO_LONGITUDE": -87.6154
##   },
##   {
##     "TO_STATION_NAME": "Adler Planetarium",
##     "TO_LATITUDE": 41.8661,
##     "TO_LONGITUDE": -87.6073
##   }
## ]

# DISCONNECT FROM DATABASE
dbDisconnect(conn)


Metra Data Normalization and Testing

library(RSQLite)

conn <- dbConnect(RSQLite::SQLite(), dbname=config$sqlite_conn$database)

By Year

sql <- "SELECT l.Line, 
               t.Line AS Short,
               CAST(strftime('%Y', Report_Month) AS INT) as Year,
               SUM(t.Late_Trains) AS Total_Late_Trains,
               AVG(t.Late_Trains) AS Avg_Late_Trains
        FROM Trains t
        INNER JOIN Lines l ON t.LineID = l.ID
        WHERE l.Line <> 'SYSTEM'
        GROUP BY l.Line,
                 t.Line,
                 strftime('%Y', t.Report_Month)"

agg_sql <- dbGetQuery(conn, sql)

kable_styling(kable(agg_sql[sample(1:nrow(agg_sql), 20),]),
              bootstrap_options = c("striped", "hover", "condensed"))
Line Short Year Total_Late_Trains Avg_Late_Trains
111 Union Pacific / Northwest UP-NW 2009 2053 28.513889
99 SouthWest Service SWS 2017 1205 20.083333
21 Metra Electric Blue Island Elec-SC 2009 705 9.791667
29 Metra Electric Blue Island Elec-SC 2017 729 10.125000
84 Rock Island District RI 2012 2594 36.027778
36 Metra Electric Main Line Elec-ML 2014 2032 28.222222
6 BNSF Railway BNSF 2014 7955 110.486111
14 Heritage Corridor Heritage 2012 204 5.368421
71 North Central Service NCS 2009 807 16.812500
108 Union Pacific / North UP-N 2016 1252 17.388889
113 Union Pacific / Northwest UP-NW 2011 2641 36.680556
23 Metra Electric Blue Island Elec-SC 2011 1245 17.291667
30 Metra Electric Blue Island Elec-SC 2018 565 7.847222
92 SouthWest Service SWS 2010 1385 23.083333
126 Union Pacific / West UP-W 2014 2773 38.513889
89 Rock Island District RI 2017 2736 38.000000
54 Milwaukee District / North Milw-N 2012 3064 42.555556
31 Metra Electric Main Line Elec-ML 2009 2027 28.152778
124 Union Pacific / West UP-W 2012 2265 31.458333
101 Union Pacific / North UP-N 2009 2868 39.833333
ggplot(agg_sql, aes(Year, Total_Late_Trains, color=Line)) + 
  geom_line(stat="identity") +
  labs(title="Metra Late Trains by Line", x="Year", y="Late Trains") +
  scale_x_continuous("year", breaks=unique(agg_sql$Year)) +
  scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
  scale_color_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))


Statistical Testing

T-Tests

lines_list <- split(agg_sql, agg_sql$Short)

nms <- lapply(combn(unique(agg_sql$Short), 2, simplify=FALSE), function(i) paste(i, collapse="_"))

res <- t(sapply(combn(unique(agg_sql$Short), 2, simplify=FALSE), function(i) {
  t <- t.test(lines_list[[i[1]]]$Total_Late_Trains, lines_list[[i[2]]]$Total_Late_Trains)
  c(statistic = t$statistic, p_value = t$p.value)
}))

row.names(res) <- nms

scroll_box(kable_styling(kable(res), bootstrap_options = c("striped", "hover", "condensed")),
           height="800px")
statistic.t p_value
BNSF_Heritage 9.8494756 0.0000034
BNSF_Elec-SC 8.7158417 0.0000076
BNSF_Elec-ML 6.1374028 0.0000957
BNSF_Elec-BI 9.3920454 0.0000054
BNSF_Milw-N 2.8910097 0.0111757
BNSF_Milw-W 4.9595192 0.0002727
BNSF_NCS 7.9789334 0.0000143
BNSF_RI 4.5415218 0.0006351
BNSF_SWS 7.5383993 0.0000207
BNSF_UP-N 4.9702928 0.0001637
BNSF_UP-NW 5.4107001 0.0002478
BNSF_UP-W 3.7401929 0.0021030
Heritage_Elec-SC -5.2621722 0.0000918
Heritage_Elec-ML -12.0629835 0.0000001
Heritage_Elec-BI -3.4987332 0.0027396
Heritage_Milw-N -10.3277589 0.0000017
Heritage_Milw-W -9.1513465 0.0000037
Heritage_NCS -7.7996898 0.0000020
Heritage_RI -11.0240760 0.0000006
Heritage_SWS -8.6861313 0.0000010
Heritage_UP-N -6.3532760 0.0001028
Heritage_UP-NW -13.6677426 0.0000000
Heritage_UP-W -9.5307864 0.0000033
Elec-SC_Elec-ML -7.6930971 0.0000013
Elec-SC_Elec-BI 3.0553272 0.0090725
Elec-SC_Milw-N -8.4524134 0.0000058
Elec-SC_Milw-W -6.7021299 0.0000291
Elec-SC_NCS -2.6856024 0.0153183
Elec-SC_RI -8.3005633 0.0000030
Elec-SC_SWS -3.9255453 0.0011167
Elec-SC_UP-N -4.5812945 0.0009268
Elec-SC_UP-NW -9.4434609 0.0000001
Elec-SC_UP-W -7.5259529 0.0000146
Elec-ML_Elec-BI 10.7051161 0.0000005
Elec-ML_Milw-N -4.3660738 0.0008571
Elec-ML_Milw-W -1.6151844 0.1279065
Elec-ML_NCS 5.2419772 0.0000733
Elec-ML_RI -2.7359672 0.0152246
Elec-ML_SWS 3.8518590 0.0012327
Elec-ML_UP-N -0.7261232 0.4813509
Elec-ML_UP-NW -1.8073859 0.0875269
Elec-ML_UP-W -3.2060881 0.0069770
Elec-BI_Milw-N -9.6182342 0.0000038
Elec-BI_Milw-W -8.2352811 0.0000119
Elec-BI_NCS -5.9428602 0.0000643
Elec-BI_RI -10.0649428 0.0000019
Elec-BI_SWS -7.0205384 0.0000176
Elec-BI_UP-N -5.6282342 0.0002825
Elec-BI_UP-NW -12.4136552 0.0000001
Elec-BI_UP-W -8.7676538 0.0000080
Milw-N_Milw-W 2.6700024 0.0162767
Milw-N_NCS 7.2630725 0.0000175
Milw-N_RI 2.0467572 0.0573187
Milw-N_SWS 6.5511394 0.0000355
Milw-N_UP-N 2.7933483 0.0120074
Milw-N_UP-NW 3.2584554 0.0063262
Milw-N_UP-W 1.0801680 0.2944043
Milw-W_NCS 5.1854922 0.0002195
Milw-W_RI -0.8225248 0.4216052
Milw-W_SWS 4.2973444 0.0008763
Milw-W_UP-N 0.4888439 0.6313060
Milw-W_UP-NW 0.2804970 0.7829139
Milw-W_UP-W -1.5653195 0.1356000
NCS_RI -6.6347574 0.0000189
NCS_SWS -1.3501689 0.1939515
NCS_UP-N -3.4515401 0.0055512
NCS_UP-NW -7.0452631 0.0000029
NCS_UP-W -6.2606259 0.0000586
RI_SWS 5.6476491 0.0000687
RI_UP-N 1.1936645 0.2499458
RI_UP-NW 1.3014342 0.2118233
RI_UP-W -0.8818209 0.3903462
SWS_UP-N -2.7957833 0.0169999
SWS_UP-NW -5.6470712 0.0000295
SWS_UP-W -5.5094534 0.0001466
UP-N_UP-NW -0.3290665 0.7474460
UP-N_UP-W -1.8187654 0.0857421
UP-NW_UP-W -2.0429687 0.0613004

By Month

sql <- "SELECT l.ID,
               l.Line, 
               t.Line AS Short,
               CAST(strftime('%m', Report_Month) AS INT) as Month_Num,
               SUM(t.Late_Trains) AS Total_Late_Trains,
               AVG(t.Late_Trains) AS Avg_Late_Trains
        FROM Trains t
        INNER JOIN Lines l ON t.LineID = l.ID
        WHERE l.Line <> 'SYSTEM'
        GROUP BY l.ID,
                 l.Line,
                 t.Line,
                 CAST(strftime('%m', Report_Month) AS INT)
        ORDER BY l.ID,
                 CAST(strftime('%m', Report_Month) AS INT)"

agg_sql <- transform(dbGetQuery(conn, sql), Month = factor(month.abb[Month_Num], levels=month.abb))

kable_styling(kable(agg_sql[sample(1:nrow(agg_sql), 20),]), 
              bootstrap_options = c("striped", "hover", "condensed"))
ID Line Short Month_Num Total_Late_Trains Avg_Late_Trains Month
36 3 Metra Electric Main Line Elec-ML 12 1599 29.611111 Dec
13 2 Heritage Corridor Heritage 1 336 10.500000 Jan
86 8 North Central Service NCS 2 1368 34.200000 Feb
106 9 Rock Island District RI 10 1605 26.750000 Oct
27 3 Metra Electric Main Line Elec-ML 3 1202 20.033333 Mar
16 2 Heritage Corridor Heritage 4 189 5.727273 Apr
100 9 Rock Island District RI 4 1661 27.683333 Apr
76 7 Milwaukee District / West Milw-W 4 1208 20.133333 Apr
152 13 Union Pacific / West UP-W 8 2792 46.533333 Aug
17 2 Heritage Corridor Heritage 5 354 10.727273 May
37 4 Metra Electric South Chicago Elec-BI 1 780 15.600000 Jan
153 13 Union Pacific / West UP-W 9 1857 30.950000 Sep
64 6 Milwaukee District / North Milw-N 4 1932 32.200000 Apr
79 7 Milwaukee District / West Milw-W 7 2300 38.333333 Jul
62 6 Milwaukee District / North Milw-N 2 3327 55.450000 Feb
115 10 SouthWest Service SWS 7 1186 23.254902 Jul
19 2 Heritage Corridor Heritage 7 378 11.454546 Jul
45 4 Metra Electric South Chicago Elec-BI 9 470 9.215686 Sep
114 10 SouthWest Service SWS 6 1608 32.160000 Jun
125 11 Union Pacific / North UP-N 5 1623 27.050000 May
ggplot(agg_sql, aes(Month, Total_Late_Trains, fill=Line)) + 
  geom_col(position = "dodge") +
  labs(title="Metra Late Trains by Month", x="Year", y="Late Trains") +
  scale_x_discrete("Month", breaks=unique(agg_sql$Month)) +
  scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
  scale_fill_manual(values=seaborn_palette) +
  facet_wrap(~Line, ncol=3, scales="free_y") +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

Correlations

sql <- "SELECT CAST(strftime('%Y', c.Report_Month) AS INT) AS Year,
               SUM(CASE WHEN c.Late_Cause = 'Accident' THEN c.Late_Trains ELSE NULL END) AS [Accident],
               SUM(CASE WHEN c.Late_Cause = 'Catenary Failure' THEN c.Late_Trains ELSE NULL END) AS [Catenary],               
               SUM(CASE WHEN c.Late_Cause = 'Freight Interference - Peak' THEN c.Late_Trains ELSE NULL END) AS [Freight Interference Peak],
               SUM(CASE WHEN c.Late_Cause = 'Freight Interference - Off-Peak' THEN c.Late_Trains ELSE NULL END) AS [Freight Interference Off Peak],  
               SUM(CASE WHEN c.Late_Cause = 'Human Error' THEN c.Late_Trains ELSE NULL END) AS [Human Error],                     
               SUM(CASE WHEN c.Late_Cause = 'Lift Deployment' THEN c.Late_Trains ELSE NULL END) AS [Lift Deployment],                
               SUM(CASE WHEN c.Late_Cause = 'Locomotive Failure' THEN c.Late_Trains ELSE NULL END) AS [Locomotive Failure],              
               SUM(CASE WHEN c.Late_Cause = 'Non-Locomotive Equipment Failure' THEN c.Late_Trains ELSE NULL END) AS [Non-Locomotive Equipment Failure], 
               SUM(CASE WHEN c.Late_Cause = 'Obstruction/Debris' THEN c.Late_Trains ELSE NULL END) AS [Obstruction Debris],              
               SUM(CASE WHEN c.Late_Cause = 'Other' THEN c.Late_Trains ELSE NULL END) AS [Other],                            
               SUM(CASE WHEN c.Late_Cause = 'Passenger Loading' THEN c.Late_Trains ELSE NULL END) AS [Passenger Loading],
               SUM(CASE WHEN c.Late_Cause = 'Passenger Train Interference' THEN c.Late_Trains ELSE NULL END) AS [Passenger Train Interference],     
               SUM(CASE WHEN c.Late_Cause = 'Sick, Injured, Unruly Passenger' THEN c.Late_Trains ELSE NULL END) AS [Sick Injured Unruly Passenger],
               SUM(CASE WHEN c.Late_Cause = 'Signal/Switch Failure' THEN c.Late_Trains ELSE NULL END) AS [Signal Switch Failure],           
               SUM(CASE WHEN c.Late_Cause = 'Track Work' THEN c.Late_Trains ELSE NULL END) AS [Track Work],                      
               SUM(CASE WHEN c.Late_Cause = 'Weather' THEN c.Late_Trains ELSE NULL END) AS [Weather]            FROM Causes c 
      GROUP BY CAST(strftime('%Y', c.Report_Month) AS INT);"

wide_sql <- dbGetQuery(conn, sql)

scroll_box(kable_styling(knitr::kable(wide_sql),
                         bootstrap_options = c("striped", "hover", "condensed")), width = "100%")
Year Accident Catenary Freight Interference Peak Freight Interference Off Peak Human Error Lift Deployment Locomotive Failure Non-Locomotive Equipment Failure Obstruction Debris Other Passenger Loading Passenger Train Interference Sick Injured Unruly Passenger Signal Switch Failure Track Work Weather
2009 522 116 688 1040 1058 510 1202 402 798 538 2736 608 788 2798 1616 2150
2010 1600 196 1754 3054 2212 1082 2476 1144 1468 1158 4130 1484 1670 5290 2746 2946
2011 1338 80 990 2272 1740 902 1320 486 802 792 4290 988 1000 3296 2758 3094
2012 932 162 496 1474 1294 500 1086 326 848 638 2364 440 874 2506 1806 1262
2013 1150 338 642 1706 1508 410 1202 468 782 526 2276 404 768 3274 1328 2194
2014 1370 144 1336 2334 1470 428 1658 828 1100 568 1509 490 732 2536 1964 3322
2015 898 302 726 1360 1318 330 1130 390 870 488 1084 260 574 1770 1188 1950
2016 1088 200 618 1066 1270 290 1704 382 892 484 1082 308 814 2782 2000 1108
2017 1268 66 718 1294 1796 496 1596 398 1090 640 1162 266 712 2446 1896 1194
2018 814 154 1112 1916 1990 640 1732 552 1230 562 1324 444 776 3478 1570 2108
scroll_box(kable_styling(kable(cor(wide_sql[-1], use = "complete.obs", method="pearson")),
                         bootstrap_options = c("striped", "hover", "condensed")), width = "100%")
Accident Catenary Freight Interference Peak Freight Interference Off Peak Human Error Lift Deployment Locomotive Failure Non-Locomotive Equipment Failure Obstruction Debris Other Passenger Loading Passenger Train Interference Sick Injured Unruly Passenger Signal Switch Failure Track Work Weather
Accident 1.0000000 -0.0571909 0.6196663 0.7373892 0.6300398 0.4951796 0.6159002 0.6666842 0.4974521 0.6643722 0.3652908 0.5278519 0.5798711 0.4952077 0.6901787 0.4391694
Catenary -0.0571909 1.0000000 -0.1357559 -0.0438564 -0.1705789 -0.3446550 -0.1504044 0.0015760 -0.1653271 -0.2195482 -0.2041612 -0.1828021 -0.0959763 -0.0188764 -0.5307573 -0.0582482
Freight Interference Peak 0.6196663 -0.1357559 1.0000000 0.9055098 0.7302826 0.7151507 0.8259202 0.9622278 0.8339600 0.7363075 0.4311431 0.7540010 0.7063508 0.7282518 0.5910244 0.7508904
Freight Interference Off Peak 0.7373892 -0.0438564 0.9055098 1.0000000 0.7529271 0.7904312 0.6569837 0.8892313 0.6659303 0.7972720 0.6107565 0.8029986 0.7435435 0.7480295 0.6527215 0.7935347
Human Error 0.6300398 -0.1705789 0.7302826 0.7529271 1.0000000 0.7518244 0.7422892 0.6521274 0.8059425 0.7214470 0.3432964 0.5811082 0.6256107 0.7313770 0.5249417 0.3568060
Lift Deployment 0.4951796 -0.3446550 0.7151507 0.7904312 0.7518244 1.0000000 0.5793972 0.6491509 0.5546727 0.9197200 0.8436129 0.9302116 0.8559500 0.8286376 0.7889907 0.5610627
Locomotive Failure 0.6159002 -0.1504044 0.8259202 0.6569837 0.7422892 0.5793972 1.0000000 0.8334991 0.9082875 0.6965826 0.2240091 0.6151823 0.7431315 0.7710464 0.5922982 0.3001438
Non-Locomotive Equipment Failure 0.6666842 0.0015760 0.9622278 0.8892313 0.6521274 0.6491509 0.8334991 1.0000000 0.8091589 0.7532443 0.4373013 0.7563912 0.7624001 0.7610562 0.5534812 0.6992606
Obstruction Debris 0.4974521 -0.1653271 0.8339600 0.6659303 0.8059425 0.5546727 0.9082875 0.8091589 1.0000000 0.6446795 0.0968578 0.5028209 0.6174117 0.6557449 0.4004296 0.2865157
Other 0.6643722 -0.2195482 0.7363075 0.7972720 0.7214470 0.9197200 0.6965826 0.7532443 0.6446795 1.0000000 0.7718366 0.9286207 0.9502399 0.8323155 0.8042321 0.4541852
Passenger Loading 0.3652908 -0.2041612 0.4311431 0.6107565 0.3432964 0.8436129 0.2240091 0.4373013 0.0968578 0.7718366 1.0000000 0.8907353 0.7599024 0.6769158 0.7213697 0.5713612
Passenger Train Interference 0.5278519 -0.1828021 0.7540010 0.8029986 0.5811082 0.9302116 0.6151823 0.7563912 0.5028209 0.9286207 0.8907353 1.0000000 0.9342033 0.8602439 0.7997392 0.6390272
Sick Injured Unruly Passenger 0.5798711 -0.0959763 0.7063508 0.7435435 0.6256107 0.8559500 0.7431315 0.7624001 0.6174117 0.9502399 0.7599024 0.9342033 1.0000000 0.9149180 0.7682354 0.3985369
Signal Switch Failure 0.4952077 -0.0188764 0.7282518 0.7480295 0.7313770 0.8286376 0.7710464 0.7610562 0.6557449 0.8323155 0.6769158 0.8602439 0.9149180 1.0000000 0.6286025 0.4424591
Track Work 0.6901787 -0.5307573 0.5910244 0.6527215 0.5249417 0.7889907 0.5922982 0.5534812 0.4004296 0.8042321 0.7213697 0.7997392 0.7682354 0.6286025 1.0000000 0.4417094
Weather 0.4391694 -0.0582482 0.7508904 0.7935347 0.3568060 0.5610627 0.3001438 0.6992606 0.2865157 0.4541852 0.5713612 0.6390272 0.3985369 0.4424591 0.4417094 1.0000000
sql <- "SELECT CAST(strftime('%Y', c.Report_Month) AS INT) AS Year,
               c.Late_Cause,
               SUM(c.Late_Trains) AS Total_Late_Trains
        FROM Causes c
        INNER JOIN Lines l ON c.LineID = l.ID
        WHERE c.Late_Cause IN 
              (SELECT DISTINCT sub_c.Late_Cause 
               FROM Causes AS sub_c 
               WHERE strftime('%Y', sub_c.Report_Month) = '2012')
          AND c.Late_Cause <> 'TOTAL TRAINS DELAYED'
        GROUP BY strftime('%Y', c.Report_Month),
                 c.Late_Cause"

agg_sql <- dbGetQuery(conn, sql)

kable_styling(knitr::kable(agg_sql[sample(1:nrow(agg_sql), 20),]),
              bootstrap_options = c("striped", "hover", "condensed"))
Year Late_Cause Total_Late_Trains
120 2016 Accident 1088
108 2015 Human Error 1318
7 2009 Lift Deployment 510
12 2009 Passenger Loading 2736
156 2018 Freight Interference - Off-Peak 1916
85 2013 Weather 2194
75 2013 Lift Deployment 410
125 2016 Human Error 1270
27 2010 Obstruction/Debris 1468
89 2014 Freight Interference - Peak 1336
139 2017 Freight Interference - Off-Peak 1294
57 2012 Human Error 1294
87 2014 Catenary Failure 144
33 2010 Track Work 2746
47 2011 Passenger Train Interference 988
65 2012 Sick, Injured, Unruly Passenger 874
82 2013 Sick, Injured, Unruly Passenger 768
1 2009 Accident 522
106 2015 Freight Interference - Peak 726
105 2015 Freight Interference - Off-Peak 1360
ggplot(subset(agg_sql, Late_Cause != 'Freight Interference - Total'),
              aes(Year, Total_Late_Trains, fill=Late_Cause)) + 
  geom_bar(position = "fill", stat = "identity") +
  labs(title="Metra Late Trains by Cause", x="Year", y="Late Trains") +
  guides(fill=guide_legend(title="Cause", ncol=4)) +
  scale_x_continuous("year", breaks=unique(agg_sql$Year)) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

# DISCONNECT FROM DATABASE
dbDisconnect(conn)


Conclusion: Relational Databases in R