Hackathon and Review

This session is designed to help you prepare for the hackathon. We’ll start with Q&A about the tasks, then review some of the skills you might need.

Part 1: Hackathon Q&A

Part 2: Skills Review

Data wrangling with dplyr

The dplyr verbs you’ll use constantly:

# select(): keep only the named columns
penguins |>
  select(species, bill_length_mm, body_mass_g)

# filter(): keep only the rows where all conditions are TRUE
penguins |>
  filter(species == "Adelie" & year == 2007)

# mutate(): add a new column (or overwrite an existing one)
penguins |>
  mutate(body_mass_kg = body_mass_g / 1000)

# group_by() + summarize(): collapse each group to a single summary row
penguins |>
  group_by(species) |>
  summarize(mean_mass = mean(body_mass_g, na.rm = TRUE), n = n())

# arrange(): sort rows, here from heaviest to lightest
penguins |>
  arrange(desc(body_mass_g))

Other useful functions:

# count(): shortcut for group_by() + summarize(n = n())
penguins |>
  count(species, island)

# case_when(): vectorized if-else for creating categories.
# Conditions are checked in order and the first one that is TRUE wins.
# The last condition is written out explicitly instead of using a catch-all
# `TRUE`, so penguins with a missing body mass get NA rather than being
# labelled "large".
penguins |>
  mutate(
    size = case_when(
      body_mass_g < 3500 ~ "small",
      body_mass_g < 5000 ~ "medium",
      body_mass_g >= 5000 ~ "large"
    )
  )

# across(): apply a function to multiple columns.
# Extra arguments (here na.rm = TRUE) belong inside an anonymous function;
# passing them through across()'s `...` is deprecated since dplyr 1.1.0.
penguins |>
  summarize(across(where(is.numeric), \(x) mean(x, na.rm = TRUE)))

Reshaping: pivot_longer – turn many columns into key-value pairs. pivot_wider – spread key-value pairs into multiple columns.

# pivot_longer(): stack the bill_* columns into name/value pairs.
# The `id` column records the original row, so the long data can later be
# spread back to one row per penguin.
penguins_long <- penguins |>
  select(species, starts_with("bill_")) |>
  mutate(id = row_number()) |>
  pivot_longer(cols = starts_with("bill_"), names_to = "measure", values_to = "value")

penguins_long |>
  head()

## # A tibble: 6 × 4
##   species    id measure        value
##   <fct>   <int> <chr>          <dbl>
## 1 Adelie      1 bill_length_mm  39.1
## 2 Adelie      1 bill_depth_mm   18.7
## 3 Adelie      2 bill_length_mm  39.5
## 4 Adelie      2 bill_depth_mm   17.4
## 5 Adelie      3 bill_length_mm  40.3
## 6 Adelie      3 bill_depth_mm   18

# pivot_wider(): turn each distinct value of `measure` back into its own
# column, filled from `value`, then show the first rows
head(
  pivot_wider(penguins_long, names_from = measure, values_from = value)
)

## # A tibble: 6 × 4
##   species    id bill_length_mm bill_depth_mm
##   <fct>   <int>          <dbl>         <dbl>
## 1 Adelie      1           39.1          18.7
## 2 Adelie      2           39.5          17.4
## 3 Adelie      3           40.3          18  
## 4 Adelie      4           NA            NA  
## 5 Adelie      5           36.7          19.3
## 6 Adelie      6           39.3          20.6

Writing functions

Recall the basic structure of a function:

# Template: a function is created with function() and assigned to a name;
# its inputs are listed in the parentheses.
function_name <- function(arguments) {
  # body: what the function does
  # last expression is returned (or use return())
  # (an explicit return() is only needed to exit early)
}

A simple example:

# Summarise a numeric vector.
# Returns a named numeric vector with elements `mean`, `sd` and `n`.
# Missing values are dropped when computing `mean` and `sd`; `n` is the full
# length of `x`, so it still counts them.
calculate_stats <- function(x) {
  avg <- mean(x, na.rm = TRUE)
  spread <- sd(x, na.rm = TRUE)
  size <- length(x)

  c(mean = avg, sd = spread, n = size)
}

# Apply it to one column. Note that `n` is the full length of the column, so
# rows with a missing bill length are still counted.
calculate_stats(penguins$bill_length_mm)

##       mean         sd          n 
##  43.921930   5.459584 344.000000

Input validation

Good functions check their inputs and fail with helpful messages:

# Skeleton showing how to validate inputs before doing any work.
#
# Arguments:
#   data     - a data frame.
#   var_name - the name of one column of `data`, given as a single string.
#
# Stops with an informative error as soon as a check fails. Errors are raised
# with call. = FALSE so the user sees the message without the internal call.
my_function <- function(data, var_name) {

  # Check data type
  if (!is.data.frame(data)) {
    stop("'data' must be a data frame", call. = FALSE)
  }

  # Check that var_name is exactly one string. Without this, a vector of
  # names would make the `if` below fail with an unhelpful
  # "the condition has length > 1" error.
  if (!is.character(var_name) || length(var_name) != 1L || is.na(var_name)) {
    stop("'var_name' must be a single column name (a string)", call. = FALSE)
  }

  # Check if column exists
  if (!var_name %in% names(data)) {
    stop(paste("Column", var_name, "not found in data"), call. = FALSE)
  }

  # Check if variable is the right type

  # ... rest of function
}

Passing column names to functions

Two main approaches in the tidyverse:

Approach 1: Column name as string + .data[[]]

# Mean of one column within each level of another, with both columns given
# as strings.
#   data      - a data frame.
#   group_col - name of the grouping column (string).
#   value_col - name of the column to average (string).
# Returns one row per group with the column `mean_value`; missing values are
# ignored. `.data[[name]]` looks the column up by its string name.
group_mean <- function(data, group_col, value_col) {
  grouped <- group_by(data, .data[[group_col]])
  summarize(grouped, mean_value = mean(.data[[value_col]], na.rm = TRUE))
}

# Column names are quoted
group_mean(penguins, "species", "bill_length_mm")

## # A tibble: 3 × 2
##   species   mean_value
##   <fct>          <dbl>
## 1 Adelie          38.8
## 2 Chinstrap       48.8
## 3 Gentoo          47.5

Approach 2: Bare column name + {{ }} (tidy evaluation)

# Mean of one column within each level of another, with both columns given
# as bare (unquoted) names.
#   data      - a data frame.
#   group_col - grouping column, unquoted.
#   value_col - column to average, unquoted.
# Returns one row per group with the column `mean_value`; missing values are
# ignored. `{{ }}` forwards the caller's column name into the dplyr verb.
group_mean2 <- function(data, group_col, value_col) {
  grouped <- group_by(data, {{ group_col }})
  summarize(grouped, mean_value = mean({{ value_col }}, na.rm = TRUE))
}

# Column names are not quoted
group_mean2(penguins, species, bill_length_mm)

## # A tibble: 3 × 2
##   species   mean_value
##   <fct>          <dbl>
## 1 Adelie          38.8
## 2 Chinstrap       48.8
## 3 Gentoo          47.5

Iteration with purrr

The map() family applies a function to each element of a list or vector:

map() returns a list
map_dbl() returns a numeric vector
map_chr() returns a character vector
map_dfr() returns a data frame (row-binding results); since purrr 1.0.0 it is superseded by map() followed by list_rbind()

Three ways to pass functions to map():

# 1. Named function: arguments after the function name are passed on to it
penguins |>
  select(where(is.numeric)) |>
  map(mean, na.rm = TRUE)

# 2. Anonymous function: \(x) is shorthand for function(x)
penguins |>
  select(where(is.numeric)) |>
  map(\(x) mean(x, na.rm = TRUE))

# 3. Formula shorthand (older style): .x stands for the current element
penguins |>
  select(where(is.numeric)) |>
  map(~ mean(.x, na.rm = TRUE))

Example: applying a custom function across multiple variables.

# Define the function first
#
# Summarise one column as a one-row data frame.
#   var  - column name, as a single string.
#   data - data frame containing the column. Defaults to `penguins`, so
#          existing calls such as get_stats("bill_length_mm") are unchanged;
#          passing it explicitly removes the reliance on a global object.
# Returns a data frame with columns `variable`, `mean` and `sd`; missing
# values are ignored in both statistics.
get_stats <- function(var, data = penguins) {
  values <- data[[var]]
  data.frame(
    variable = var,
    mean = mean(values, na.rm = TRUE),
    sd = sd(values, na.rm = TRUE)
  )
}

# Then call it once per variable name; map_dfr() stacks the one-row results
# into a single data frame
numeric_vars <- c(
  "bill_length_mm",
  "bill_depth_mm",
  "flipper_length_mm"
)
numeric_vars |>
  map_dfr(get_stats)

##            variable      mean        sd
## 1    bill_length_mm  43.92193  5.459584
## 2     bill_depth_mm  17.15117  1.974793
## 3 flipper_length_mm 200.91520 14.061714

Data visualization with ggplot2

The basic structure of a ggplot:

# Template only: each `...` is a placeholder to fill in, so this chunk is not
# meant to be run as written. Layers are added to the plot with `+`.
ggplot(data, aes(x = ..., y = ..., color = ..., fill = ...)) + # data and aesthetic mappings
  geom_...() + # geometry, e.g. geom_point(), geom_histogram()
  scale_...() + # scale adjustments
  labs(title = ..., x = ..., y = ..., caption = ...) + # titles, axis labels, caption
  theme_...() + # a complete theme, e.g. theme_minimal()
  theme(...) # individual tweaks, e.g. legend.position

Common plot types

library(gridExtra)

# Histogram: distribution of a single numeric variable
p1 <- penguins |>
  ggplot(aes(x = bill_length_mm)) +
  geom_histogram(binwidth = 2) +
  labs(
    title = "Histogram",
    subtitle = "Use for the distribution of a numeric variable",
    x = "Bill length (mm)",
    y = "Count"
  ) +
  theme_minimal(base_size = 10)

# Boxplot: one numeric variable compared across categories.
# The legend is hidden because the x axis already names each species.
p2 <- penguins |>
  ggplot(aes(x = species, y = body_mass_g, fill = species)) +
  geom_boxplot(alpha = 0.8) +
  labs(
    title = "Boxplot",
    subtitle = "Use to compare distributions across categories",
    x = "Species",
    y = "Body mass (g)"
  ) +
  theme_minimal(base_size = 10) +
  theme(legend.position = "none")

# Scatterplot: relationship between two numeric variables
p3 <- penguins |>
  ggplot(aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(alpha = 0.7) +
  labs(
    title = "Scatterplot",
    subtitle = "Use for relationships between two numeric variables",
    x = "Flipper length (mm)",
    y = "Body mass (g)"
  ) +
  theme_minimal(base_size = 10)

# Scatterplot by group: same relationship, with points colored by species
p4 <- penguins |>
  ggplot(aes(x = flipper_length_mm, y = body_mass_g, color = species)) +
  geom_point(alpha = 0.7) +
  labs(
    title = "Scatterplot by Group",
    subtitle = "Use when comparing numeric relationships across \ngroups",
    x = "Flipper length (mm)",
    y = "Body mass (g)",
    color = "Species"
  ) +
  theme_minimal(base_size = 10)

# Arrange the four plots in a grid with two columns
grid.arrange(p1, p2, p3, p4, ncol = 2)

Faceting

Another way to show groups is to split your plot into panels. An example:

# One histogram panel per species, stacked in a single column so the x axes
# line up. The legend is dropped because each panel is already labelled.
penguins |>
  ggplot(aes(x = bill_length_mm, fill = species)) +
  geom_histogram(binwidth = 2, color = "white") +
  facet_wrap(vars(species), ncol = 1) +
  labs(
    title = "Faceted Histogram",
    subtitle = "Use to compare the distribution of a numeric variable across groups",
    x = "Bill length (mm)",
    y = "Count"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

Making plots standalone

A good plot should be understandable without reading surrounding text. Be mindful of labels, titles, legends, and captions.

# A plot that can be read on its own: the title says what is shown, the
# subtitle gives context, axes carry units, the legend has a readable name,
# and the caption credits the data source.
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g, color = species)) +
  geom_point(alpha = 0.7) +
  labs(
    title = "Penguin Body Mass vs. Flipper Length",
    subtitle = "Data from Palmer Station, Antarctica (2007-2009)",
    x = "Flipper length (mm)",
    y = "Body mass (g)",
    color = "Species",
    # fixed a stray double space that showed up in the rendered caption
    caption = "Source: palmerpenguins package"
  ) +
  theme_minimal()

Tables

Sometimes you need a table to show the information clearly.

library(gt)

# Summarize the data: one row per species with rounded means and a count
penguin_summary <- penguins |>
  group_by(species) |>
  summarize(
    mean_mass_g = round(mean(body_mass_g, na.rm = TRUE)),
    mean_flipper_mm = round(mean(flipper_length_mm, na.rm = TRUE)),
    n = n()
  )

# Build the table: add a title, readable column labels, and a shaded
# background for the Adelie row
penguin_summary |>
  gt() |>
  tab_header(title = "Summary of Penguins") |>
  cols_label(
    species = "Species",
    mean_mass_g = "Mean Mass (g)",
    mean_flipper_mm = "Mean Flipper Length (mm)",
    n = "Count"
  ) |>
  tab_style(
    # The fill color was previously left blank (`color = `), which silently
    # fell back to cell_fill()'s default light gray. It is now written out;
    # change it to highlight the row in another color.
    style = cell_fill(color = "#D3D3D3"),
    locations = cells_body(rows = species == "Adelie")
  )

Species	Mean Mass (g)	Mean Flipper Length (mm)	Count
Summary of Penguins
Adelie	3701	190	152
Chinstrap	3733	196	68
Gentoo	5076	217	124

Modeling with broom and modelsummary

Running models

# Linear regression of body mass on flipper length and species, followed by
# the standard base R summary
model <- lm(
  body_mass_g ~ flipper_length_mm + species,
  data = penguins
)
summary(model)

## 
## Call:
## lm(formula = body_mass_g ~ flipper_length_mm + species, data = penguins)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -927.70 -254.82  -23.92  241.16 1191.68 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -4031.477    584.151  -6.901 2.55e-11 ***
## flipper_length_mm    40.705      3.071  13.255  < 2e-16 ***
## speciesChinstrap   -206.510     57.731  -3.577 0.000398 ***
## speciesGentoo       266.810     95.264   2.801 0.005392 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 375.5 on 338 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.7826, Adjusted R-squared:  0.7807 
## F-statistic: 405.7 on 3 and 338 DF,  p-value: < 2.2e-16

Tidying with broom

library(broom)

# coefficients as a tidy data frame: one row per model term, with its
# estimate, standard error, test statistic and p-value
tidy(model)

## # A tibble: 4 × 5
##   term              estimate std.error statistic  p.value
##   <chr>                <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)        -4031.     584.       -6.90 2.55e-11
## 2 flipper_length_mm     40.7      3.07     13.3  1.40e-32
## 3 speciesChinstrap    -207.      57.7      -3.58 3.98e- 4
## 4 speciesGentoo        267.      95.3       2.80 5.39e- 3

# model fit statistics: a single row per model (R-squared, sigma, AIC,
# BIC, ...), convenient for comparing models side by side
glance(model)

## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic   p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>     <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.783         0.781  376.      406. 1.25e-111     3 -2511. 5032. 5051.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

# add fitted values and residuals to data: one row per observation used in
# the fit, with extra columns prefixed by "." (.fitted, .resid, ...)
augment(model) |> head()

## # A tibble: 6 × 10
##   .rownames body_mass_g flipper_length_mm species .fitted .resid    .hat .sigma
##   <chr>           <int>             <int> <fct>     <dbl>  <dbl>   <dbl>  <dbl>
## 1 1                3750               181 Adelie    3336.  414.  0.0120    375.
## 2 2                3800               186 Adelie    3540.  260.  0.00767   376.
## 3 3                3250               195 Adelie    3906. -656.  0.00833   374.
## 4 5                3450               193 Adelie    3825. -375.  0.00724   376.
## 5 6                3650               190 Adelie    3703.  -52.5 0.00662   376.
## 6 7                3625               181 Adelie    3336.  289.  0.0120    376.
## # ℹ 2 more variables: .cooksd <dbl>, .std.resid <dbl>

Presenting with modelsummary

library(modelsummary)

# Fit three specifications of increasing complexity:
# flipper length only, plus species, plus their interaction
m1 <- lm(body_mass_g ~ flipper_length_mm, data = penguins)
m2 <- lm(body_mass_g ~ flipper_length_mm + species, data = penguins)
m3 <- lm(body_mass_g ~ flipper_length_mm * species, data = penguins)

# The list names become the column headers of the regression table
models <- list("Flipper only" = m1, "+ Species" = m2, "Interaction" = m3)

# Regression table: drop some goodness-of-fit rows and return a gt object
# so it can be styled further with gt functions
models |>
  modelsummary(output = "gt", gof_omit = "AIC|BIC|Log.Lik|RMSE") |>
  tab_options(table.align = "center")

	Flipper only	+ Species	Interaction
(Intercept)	-5780.831	-4031.477	-2535.837
	(305.815)	(584.151)	(879.468)
flipper_length_mm	49.686	40.705	32.832
	(1.518)	(3.071)	(4.627)
speciesChinstrap		-206.510	-501.359
		(57.731)	(1523.459)
speciesGentoo		266.810	-4251.444
		(95.264)	(1427.332)
flipper_length_mm × speciesChinstrap			1.742
			(7.856)
flipper_length_mm × speciesGentoo			21.791
			(6.941)
Num.Obs.	342	342	342
R2	0.759	0.783	0.790
R2 Adj.	0.758	0.781	0.786

# Coefficient plot for all three models; the intercepts are left out
models |>
  modelplot(coef_omit = "Intercept") +
  labs(title = "Coefficients") +
  theme_minimal()

This script was drafted by Killian Conyngham and Carol Sobral.