#load libraries
library(data.table)
library(scales)
library(ggplot2)
library(stargazer)
library(ggthemes)
library(here)
library(forcats)
# set the working directory to be the location of your data file
setwd("/Users/dproserp/Library/CloudStorage/Dropbox/teaching/mkt566-2025/w3/beer-case")
In this exercise, we will work with a dataset of beer reviews from RateBeer (you can download it here), and perform some exploratory data analysis.
Let’s start by exploring the data and familiarizing with its structure:
beer = fread("w3-ratebeer-sampled.csv.gz")
head(beer)
ncol(beer)
## [1] 12
nrow(beer)
## [1] 292680
Now, let’s focus on some specific variables:
SOLUTION: Yes, there are non-beer drinks in the dataset, such as “Cider” and “Mead”. We will need to filter these out later.
unique(beer$beer_style)
## [1] "Klsch" "Sour Ale/Wild Ale"
## [3] "Traditional Ale" "Belgian Ale"
## [5] "India Pale Ale (IPA)" "Brown Ale"
## [7] "Porter" "Sweet Stout"
## [9] "Golden Ale/Blond Ale" "Stout"
## [11] "Scotch Ale" "Saison"
## [13] "Dunkel" "American Pale Ale"
## [15] "Belgian White (Witbier)" "Belgian Strong Ale"
## [17] "Wheat Ale" "Abt/Quadrupel"
## [19] "Premium Lager" "Imperial/Strong Porter"
## [21] "Pale Lager" "Fruit Beer"
## [23] "Amber Ale" "English Pale Ale"
## [25] "Spice/Herb/Vegetable" "Pilsener"
## [27] "German Hefeweizen" "Premium Bitter/ESB"
## [29] "Barley Wine" "Doppelbock"
## [31] "Abbey Tripel" "Irish Ale"
## [33] "Cider" "Classic German Pilsener"
## [35] "Weizen Bock" "Dunkelweizen"
## [37] "Oktoberfest/Mrzen" "Dortmunder/Helles"
## [39] "Zwickel/Keller/Landbier" "Sak - Junmai"
## [41] "Sak - Nigori" "Sak - Daiginjo"
## [43] "Strong Pale Lager/Imperial Pils" "Heller Bock"
## [45] "Perry" "Cream Ale"
## [47] "Scottish Ale" "Mild Ale"
## [49] "Vienna" "California Common"
## [51] "Dunkler Bock" "Low Alcohol"
## [53] "Bohemian Pilsener" "Ice Cider/Perry"
## [55] "Bitter" "English Strong Ale"
## [57] "Abbey Dubbel" "Imperial/Double IPA"
## [59] "Foreign Stout" "American Strong Ale"
## [61] "Specialty Grain" "Malt Liquor"
## [63] "Black IPA" "Imperial Stout"
## [65] "Smoked" "Baltic Porter"
## [67] "Mead" "Schwarzbier"
## [69] "Altbier" "Old Ale"
## [71] "Dry Stout" "Bire de Garde"
## [73] "Berliner Weisse" "American Dark Lager"
## [75] "German Kristallweizen" "Eisbock"
## [77] "Sak - Ginjo" "Lambic - Fruit"
## [79] "Lambic - Unblended" "Sak - Taru"
## [81] "Lambic - Gueuze" "Sak - Namasak"
## [83] "Sak - Futsu-shu" "Lambic - Faro"
## [85] "Sak - Honjozo" "Sak - Tokubetsu"
## [87] "Sak - Genshu" "Sak - Infused"
## [89] "Sak - Koshu"
SOLUTION: Yes, there are some values that are not numeric, such as “_“. Also, there are beers with very high ABV values.
SOLUTION: The “_” character indicates that the ABV value is missing but it does not seem to be zero (since non-alcholic beer have ABV). We will need to handle these missing values appropriately.
SOLUTON: The rating variables (review_overall, review_taste, etc.) are currently stored as character strings, which prevents us from computing the mean directly. We need to convert them to numeric format first.
SOLUTION: The review_time variable is in Unix
timestamp format, which represents the number of seconds since January
1, 1970. We can convert it to a date format using the
as.POSIXct
function in R.
Now that we are familiar with the data, let’s try to visualize it and answer some questions. Before starting, remove non-beers drinks from the dataset, and convert all variables with prefix “review_” and beer_ABV to numerical variables.
# fix a few things
#conver timestamp to datatime
beer[, datetime:= as.POSIXct(review_time, origin = "1970-01-01", tz = "UTC")]
#remove non-beers drinks
beer = beer[!grepl("Sak|Cider|Mead|Kombucha|Wine|Liquor", beer_style)]
#convert all review_ variables to numeric
for (col in c("review_overall", "review_aroma", "review_appearance", "review_palate", "review_taste")) {
beer[, (col) := as.numeric(sub("/.*", "", get(col)))]
}
# conver abv to numeric
beer[, beer_ABV := as.numeric(beer_ABV)]
beer$beer_style <- fct_infreq(beer$beer_style)
ggplot(data = beer) +
geom_bar(mapping = aes(x = beer_style), fill = "blue", color = "black") +
labs(title = "Distribution of Beer Styles", x = "Beer Style", y = "Reviews") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size= 6),
plot.margin = margin(t = 10, r = 20, b = 40, l = 20)) +
scale_y_continuous(labels = comma_format())
ggplot(data = beer) +
geom_histogram(mapping = aes(x = beer_ABV), binwidth = 0.5, fill = "blue", color = "black") +
labs(title = "Distribution of ABV", x = "ABV", y = "Count") +
scale_x_continuous(breaks = seq(0, max(beer$beer_ABV, na.rm = TRUE), 2)) +
theme_minimal()
# Example: Find outliers in the ABV variable
Q1 <- quantile(beer$beer_ABV, 0.25, na.rm = TRUE)
Q3 <- quantile(beer$beer_ABV, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
# Outliers are values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
outliers = unique(beer[beer_ABV < lower_bound | beer_ABV > upper_bound, .(beer_name, beer_ABV)])
outliers
# beer with max ABV
max_abv_beer <- beer[which.max(beer$beer_ABV), .(beer_name, beer_style, beer_ABV)]
max_abv_beer
avg.abv = beer[, .(beer_ABV = mean(as.numeric(beer_ABV), na.rm = TRUE)), by = beer_style]
ggplot(data = avg.abv) +
geom_bar(mapping = aes(x = reorder(beer_style, -beer_ABV), y = beer_ABV), stat = "identity") +
labs(title = "Average ABV by Beer Style", x = "Beer Style", y = "Average ABV") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) +
scale_y_continuous(labels = scales::percent_format(scale = 1))
Next, let’s look at ratings:
avg.review = beer[, .(review_overall = mean(review_overall, na.rm = TRUE)), by = beer_style]
ggplot(data = avg.review, aes(x = reorder(beer_style, -review_overall), y = review_overall, group = 1)) +
geom_line() + geom_point() +
labs(title = "Average Review Overall by Beer Style", x = "Beer Style", y = "Average Review Overall") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) +
scale_y_continuous(labels = scales::number_format(scale = 1))
avg.taste = beer[, .(review_taste = mean(review_taste, na.rm = TRUE)), by = beer_style]
ggplot(data = avg.taste, aes(x = reorder(beer_style, -review_taste), y = review_taste, group = 1)) +
geom_line() + geom_point() +
labs(title = "Average Review Taste by Beer Style", x = "Beer Style", y = "Average Review Taste") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) +
scale_y_continuous(labels = scales::number_format(scale = 1))
SOLUTION: There are spikes concentrated in roung numbers (e.g., 2, 3, 4, 4.5, 5), which is due to ratings being round numbers and averages more like to have specific values that are round number of multiples of 0.05 in most cases
avg.brewer = beer[, .(review_overall = mean(review_overall, na.rm = TRUE), num_reviews = .N), by = beer_brewerId]
ggplot(data = avg.brewer, aes(x = review_overall)) +
geom_histogram(binwidth = 0.1, fill = "blue", color = "black") +
labs(title = "Distribution of Average Review Overall by Brewer", x = "Average Review Overall", y = "Count") +
scale_x_continuous() +
theme_minimal()
#Which are the best and worst brewer?
best.brewer = avg.brewer[review_overall == max(review_overall)]
worst.brewer = avg.brewer[review_overall == min(review_overall)]
best.brewer
worst.brewer
# with at least 10 reviews
best.brewer = avg.brewer[num_reviews >= 10][review_overall == max(review_overall)]
worst.brewer = avg.brewer[num_reviews >= 10][review_overall == min(review_overall)]
best.brewer
worst.brewer
Finally, let’s look at the relationship between beer styles and reviews:
avg.brewer.style = beer[, .(num_styles = uniqueN(beer_style), review_overall = mean(review_overall, na.rm = TRUE)), by = beer_brewerId]
ggplot(data = avg.brewer.style, aes(x = num_styles, y = review_overall)) +
geom_point() +
geom_smooth(method = "lm", color = "red") +
labs(title = "Number of Beer Styles vs. Average Review Overall", x = "Number of Beer Styles", y = "Average Review Overall") +
scale_x_continuous(breaks = seq(0, max(avg.brewer.style$num_styles, na.rm = TRUE), 5)) +
theme_minimal()
Alternatively:
# compute
temp = beer[, list(num_beer_style =length(unique(beer_style)),
avg_rating = mean(review_overall, na.rm=T)), by = beer_brewerId]
#compute average review_overall by num_beer_style
avg.brewer.style = temp[, .(avg_rating = mean(avg_rating, na.rm = TRUE)),
by = num_beer_style][order(-num_beer_style)]
ggplot(data = avg.brewer.style, aes(x = num_beer_style, y = avg_rating)) +
geom_line() + geom_point() +
labs(title = "Average Review Overall by Number of Beer Styles", x = "Number of Beer Styles", y = "Average Review Overall") +
scale_x_continuous(breaks = seq(0, max(avg.brewer.style$num_beer_style), 5)) +
scale_y_continuous(labels = scales::number_format(scale = 1)) +
theme_minimal()