Tidy Tuesday Horror
Load the Data and Check Duplicates
library(tidyverse)
library(lubridate)
library(kableExtra)
library(ggridges)
# Read the Tidy Tuesday horror-movies CSV and drop rows that are exact
# duplicates across every column.
dat <- distinct(
  read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-22/horror_movies.csv")
)
# Collect every row whose title occurs more than once, sorted by title so
# that the repeated titles sit next to each other.
dup_title <- dat %>%
  filter(title %in% title[duplicated(title)]) %>%
  arrange(title)
# Rows sharing a title look like different films; confirm by listing any
# repeated-title rows whose plot text is also repeated.
filter(dup_title, duplicated(plot))
## # A tibble: 0 x 12
## # … with 12 variables: title <chr>, genres <chr>, release_date <chr>,
## # release_country <chr>, movie_rating <chr>, review_rating <dbl>,
## # movie_run_time <chr>, plot <chr>, cast <chr>, language <chr>,
## # filming_locations <chr>, budget <chr>
# Among the repeated titles, show the rows whose release date is shared with
# another row.
dup_title %>%
  filter(release_date %in% release_date[duplicated(release_date)])
## # A tibble: 2 x 12
## title genres release_date release_country movie_rating review_rating
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 The … Comed… 21-Jul-15 USA <NA> 5.2
## 2 The … Comed… 21-Jul-15 USA NOT RATED 3.6
## # … with 6 more variables: movie_run_time <chr>, plot <chr>, cast <chr>,
## # language <chr>, filming_locations <chr>, budget <chr>
# "The Jokesters" appears twice with the same release date but a different
# rating and run time, so it looks like a duplicate entry.
# For now, drop every row carrying that title.
dat <- filter(dat, title != "The Jokesters (2015)")
Genres
The genres column looked extremely messy, so here is some data-munging fun. Each film can be categorized into multiple genres.
# Reshape to long format: one row per film-genre pair, splitting the
# pipe-delimited genres column.
dat_long <- separate_rows(dat, genres, sep = "\\|")
# Strip the whitespace left around each genre after the split.
dat_long <- mutate(dat_long, genres = str_trim(genres, side = "both"))
# Sanity check of the split genres, including a count of missing values
# (only one film has no genre).
table(dat_long[["genres"]], useNA = "ifany")
##
## Action Adult Adventure Animation Biography Comedy
## 335 1 115 39 4 511
## Crime Drama Family Fantasy History Horror
## 120 529 11 229 6 3309
## Music Musical Mystery Reality-TV Romance Sci-Fi
## 5 13 453 1 99 308
## Sport Thriller War Western <NA>
## 4 1369 14 15 1
# Turn genres into a factor in two steps:
#   1. keep the 8 most frequent genres and lump the rest into "Other";
#   2. order the factor levels by descending frequency.
dat_long <- dat_long %>%
  mutate(
    genres = fct_lump(genres, n = 8),
    genres = fct_infreq(genres)
  )
Table: Number of Films per Genre
# Number of films per (lumped) genre, ignoring the film with no genre.
# The result has one row per genre with the tally in column `n`.
genre_count <- dat_long %>%
  filter(!is.na(genres)) %>%
  count(genres)
# Render the genre counts as a striped HTML table with white text.
# full_width uses the literal FALSE rather than the reassignable alias F.
kable(genre_count, format = "html", table.attr = "style = \"color: white;\"") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
genres | n |
---|---|
Horror | 3309 |
Thriller | 1369 |
Drama | 529 |
Comedy | 511 |
Mystery | 453 |
Other | 447 |
Action | 335 |
Sci-Fi | 308 |
Fantasy | 229 |
Bar Graph: Distribution of Genres
# Bar chart of films per genre; bar heights come straight from the
# precomputed counts in `n`.
ggplot(genre_count, aes(x = genres, y = n, fill = genres)) +
  geom_col() +
  # Thousands separators on the y axis.
  scale_y_continuous(labels = scales::comma) +
  # BuPu palette applied in reverse order.
  scale_fill_brewer(palette = "BuPu", direction = -1) +
  labs(x = "", y = "Number of Films") +
  theme_light() +
  # The fill only mirrors the x axis, so the legend is redundant.
  theme(legend.position = "none")
Review Rating by Release Year
Some release dates are in dmy (day-month-year) format, while others contain only the year. I extract the year from the parsed date and, for any value that didn’t parse, fall back to the value in the original release_date column. No missing values for year :)
# Derive a release year for every film.
#   date: release_date parsed as day-month-year; NA when it does not parse.
#   yr:   year of the parsed date as a character string, falling back to the
#         raw release_date text when parsing failed.
date_dat <- dat %>%
  mutate(
    date = dmy(release_date),
    yr = coalesce(as.character(year(date)), release_date)
  )
# Count missing vs. non-missing release years.
table(is.na(date_dat[["yr"]]))
##
## FALSE
## 3310
# Count missing vs. non-missing review ratings.
table(is.na(date_dat[["review_rating"]]))
##
## FALSE TRUE
## 3058 252
# Preview the first rows whose release_date did not parse as a full date,
# showing the raw value next to the derived date and year.
date_dat %>%
  filter(is.na(date)) %>%
  select(release_date, date, yr) %>%
  head(n = 6)
## # A tibble: 6 x 3
## release_date date yr
## <chr> <date> <chr>
## 1 2017 NA 2017
## 2 2013 NA 2013
## 3 2012 NA 2012
## 4 2013 NA 2013
## 5 2017 NA 2017
## 6 2017 NA 2017
# Boxplot of review ratings, one semi-transparent box per release year.
ggplot(date_dat, aes(x = yr, y = review_rating, fill = yr)) +
  geom_boxplot(alpha = 0.5) +
  theme_light() +
  # The fill only mirrors the x axis, so the legend is redundant.
  theme(legend.position = "none") +
  labs(x = "Release Year", y = "Review Rating")
Looks like there is a slight increase in ratings for newer films.
And here is a ridgeline plot :)
# Ridgeline plot: one semi-transparent density of review ratings per
# release year.
ggplot(date_dat, aes(x = review_rating, y = yr, fill = yr)) +
  geom_density_ridges(alpha = 0.5) +
  theme_light() +
  # The fill only mirrors the y axis, so the legend is redundant.
  theme(legend.position = "none") +
  labs(x = "Review Rating", y = "Release Year")