#load libraries
library(data.table)
library(scales)
library(ggplot2)
library(stargazer)
library(ggthemes)
library(here)
library(forcats)
# set the working directory to be the location of your data file
# ("..." is a placeholder: replace it with the folder that holds the data file,
# which is read below by a relative path)
# NOTE(review): `here` is loaded above; building the path with here::here()
# would avoid changing the working directory — consider as an alternative.
setwd("...")
In this exercise, we will work with a dataset of beer reviews from RateBeer (you can download it here), and perform some exploratory data analysis.
Let’s start by exploring the data and familiarizing ourselves with its structure:
# Read the sampled RateBeer reviews (gzip-compressed CSV) into a data.table
beer <- fread("w3-ratebeer-sampled.csv.gz")
# Preview the first rows, then report the number of columns and of rows
head(beer)
ncol(beer)
nrow(beer)
Now, let’s focus on some specific variables:
SOLUTION: Yes, there are non-beer drinks in the dataset, such as “Cider” and “Mead”. We will need to filter these out later.
# List each distinct value of the beer_style column
unique(beer[["beer_style"]])
SOLUTION: Yes, there are some values that are not numeric, such as “_”. Also, there are beers with very high ABV values.
SOLUTION: The “_” character indicates that the ABV value is missing; it does not appear to mean zero, since non-alcoholic beers have their own ABV values. We will need to handle these missing values appropriately.
SOLUTION: The rating variables (review_overall, review_taste, etc.) are currently stored as character strings, which prevents us from computing the mean directly. We need to convert them to numeric format first.
SOLUTION: The review_time variable is in Unix timestamp format, which represents the number of seconds since January 1, 1970. We can convert it to a date format using the as.POSIXct function in R.
Now that we are familiar with the data, let’s try to visualize it and answer some questions. Before starting, remove non-beer drinks from the dataset, and convert all variables with prefix “review_” and beer_ABV to numerical variables.
# Clean the data
# Convert the Unix timestamp (seconds since 1970-01-01) to a UTC datetime
beer[, datetime := as.POSIXct(review_time, origin = "1970-01-01", tz = "UTC")]
# Remove non-beer drinks
# NOTE(review): the "Wine" and "Liquor" patterns would also drop styles such
# as "Barley Wine" or "Malt Liquor" if they are present — confirm intended.
beer <- beer[!grepl("Sak|Cider|Mead|Kombucha|Wine|Liquor", beer_style)]
# Convert all review_ rating variables to numeric, keeping only the text
# before the first "/" of each value
review_cols <- c(
  "review_overall", "review_aroma", "review_appearance",
  "review_palate", "review_taste"
)
beer[
  ,
  (review_cols) := lapply(.SD, function(x) as.numeric(sub("/.*", "", x))),
  .SDcols = review_cols
]
# Convert ABV to numeric; values that cannot be parsed as numbers become NA
beer[, beer_ABV := as.numeric(beer_ABV)]
# Order the beer_style factor levels by frequency, updating the column by
# reference rather than copying the whole table through `$<-`
beer[, beer_style := fct_infreq(beer_style)]
# Exercise stub: bar chart built from `beer`; the geom_bar(...) call is left
# unfinished in this template and must be completed before it will run.
ggplot(data = beer) +
geom_bar(...
# Exercise stub: histogram built from `beer`; the geom_histogram(...) call is
# likewise left unfinished.
ggplot(data = beer) +
geom_histogram(...
# Example: Find outliers in the ABV variable
# First and third quartiles of ABV, ignoring missing values
Q1 <- quantile(beer$beer_ABV, 0.25, na.rm = TRUE)
Q3 <- quantile(beer$beer_ABV, 0.75, na.rm = TRUE)
# Exercise stub: the right-hand side is left for the reader to fill in.
# NOTE(review): the name IQR is also a function in stats (stats::IQR) —
# consider a different variable name to avoid confusion.
IQR <- ...
# Mean ABV for each beer style (missing values ignored)
avg.abv <- beer[
  ,
  .(beer_ABV = mean(as.numeric(beer_ABV), na.rm = TRUE)),
  by = beer_style
]
# Exercise stub: the geom_bar(...) call is left unfinished in this template
ggplot(data = avg.abv) +
geom_bar(...
Next, let’s look at ratings:
# Mean overall rating for each beer style (missing ratings ignored)
avg.review <- beer[
  ,
  .(review_overall = mean(review_overall, na.rm = TRUE)),
  by = beer_style
]
# Styles on the x-axis are ordered from highest to lowest mean overall rating;
# the geom layer is an unfinished exercise stub
ggplot(
  data = avg.review,
  aes(x = reorder(beer_style, -review_overall), y = review_overall, group = 1)
) +
geom...
# Mean taste rating for each beer style (missing ratings ignored)
avg.taste <- beer[
  ,
  .(review_taste = mean(review_taste, na.rm = TRUE)),
  by = beer_style
]
# Styles on the x-axis are ordered from highest to lowest mean taste rating;
# the geom layer is an unfinished exercise stub
ggplot(
  data = avg.taste,
  aes(x = reorder(beer_style, -review_taste), y = review_taste, group = 1)
) +
geom...
# Per brewer: mean overall rating (missing ratings ignored) and review count
avg.brewer <- beer[
  ,
  .(
    review_overall = mean(review_overall, na.rm = TRUE),
    num_reviews = .N
  ),
  by = beer_brewerId
]
# Exercise stub: the geom_histogram(...) call is left unfinished
ggplot(data = avg.brewer, mapping = aes(x = review_overall)) +
geom_histogram(...
Finally, let’s look at the relationship between beer styles and reviews:
# Per brewer: number of distinct beer styles and mean overall rating
# (missing ratings ignored)
avg.brewer.style <- beer[
  ,
  .(
    num_styles = uniqueN(beer_style),
    review_overall = mean(review_overall, na.rm = TRUE)
  ),
  by = beer_brewerId
]
# Exercise stub: the geom_point(...) call is left unfinished
ggplot(
  data = avg.brewer.style,
  mapping = aes(x = num_styles, y = review_overall)
) +
geom_point(...