# Global knitr chunk options, set in one call: include every chunk in the
# knitted document and pass `comment = NA` for printed results.
knitr::opts_chunk$set(include = TRUE, comment = NA)
library(rvest)
library(httr)
library(magrittr)
library(tidyverse)
library(rio)
library(glue)
For this exercise, you will not work with a single dataset. Instead, you will practice new skills using both your NLSY data from last seminar and online data about movies.
Revisit your NLSY97 dataset from last week
Recode the schooltype variable into text values, corresponding to:
# Read the saved NLSY97 data from the RDS file in the working directory.
# NOTE(review): no recode of `schooltype` follows this load in the visible
# code -- confirm whether that step still needs to be written.
nlsy97 <- import(file = "nlsy97.rds")
Load the IMDB Top 250 Movies
# IMDB Top 250 Movies ----

# Download the chart page without any extra request headers and parse its
# HTML table(s) into a data frame.
top250_basic <- as.data.frame(
  html_table(read_html("https://www.imdb.com/chart/top/"))
)

# Standalone call: builds the Accept-Language request header (the same one
# passed to html_session() below); its value is not stored.
add_headers("Accept-Language" = "en-US, en;q=0.5")

# Download the same page again, this time sending the Accept-Language header.
top250_eng.pre <- as.data.frame(
  html_table(
    html_session(
      "https://www.imdb.com/chart/top/",
      add_headers("Accept-Language" = "en-US, en;q=0.5")
    )
  )
)

# Keep only the combined rank/title column and the rating column, renaming
# them to Title and Rating. The raw Title cell is then split into parts:
#   Ranking - digits followed by one character and a line break
#   Year    - first run of digits directly preceded by a punctuation character
#   Title   - the rest of the line after "<any character><line break>",
#             with surrounding whitespace trimmed
# Ranking and Year are extracted before Title is overwritten.
top250_eng <- top250_eng.pre %>%
  select(Title = Rank...Title, Rating = IMDb.Rating) %>%
  mutate(
    Ranking = str_extract(Title, "[0-9]+(?=(.\n))"),
    Year = str_extract(Title, "(?<=[:punct:])[:digit:]+"),
    Title = str_trim(str_extract(Title, "(?<=(.\n)).+"), side = "both")
  )

head(top250_eng)
Get the box office statistics for the top 500 all-time US box office earners
You may need to experiment with one of the pages first to ensure that you get the right dataframe from each iteration of the loop.
# Scrape pages 1 to 5 of Box Office Mojo's all-time domestic chart.
# Every <table> node on a page is parsed, and the sixth parsed table is the
# one kept for that page. map() returns the five tables as a list, so no
# list has to be grown inside a loop.
domesticgross <- map(1:5, function(i) {
  read_html(
    glue("https://www.boxofficemojo.com/alltime/domestic.htm?page={i}")
  ) %>%
    html_nodes(xpath = "//table") %>%
    html_table(fill = TRUE) %>%
    extract2(6)
})

topearners <- bind_rows(domesticgross)

# The first row of the stacked table is used as the header. Convert it to a
# plain character vector before assigning it, instead of relying on the
# implicit coercion of a one-row data frame to names.
colnames(topearners) <- as.character(unlist(topearners[1, ]))

# Drop every row that merely repeats the header, then keep and rename the
# three columns used later on.
topearners <- topearners %>%
  filter(Rank != "Rank") %>%
  rename(
    "Title" = "Title(click to view)",
    "Gross" = "Lifetime Gross"
  ) %>%
  select(Title, Studio, Gross)

head(topearners)
Create a dataset with both IMDB performance and earnings
# Combine the IMDB table with the box-office table, keeping only the films
# whose Title appears in both. Gross has every run of "$" and "," removed
# before conversion; Ranking, Gross and Year are converted to numeric, and
# the natural log of Gross is stored as logearnings.
expensive_movies <- top250_eng %>%
  inner_join(topearners, by = "Title") %>%
  mutate(
    Ranking = as.numeric(Ranking),
    Gross = as.numeric(str_replace_all(Gross, "[$,]+", "")),
    Year = as.numeric(Year),
    logearnings = log(Gross)
  )
#' Ordinary least squares coefficients via the normal equations
#'
#' Appends an intercept column named `Constant` to the regressors and solves
#' `(X'X) b = X'y` for `b`.
#'
#' @param depvar Numeric vector (or one-column matrix / data frame) holding
#'   the dependent variable.
#' @param indvars Data frame, tibble, or matrix of numeric regressors, one
#'   column per regressor. A bare numeric vector is treated as a single
#'   regressor named `value`. An existing column named `Constant` is
#'   overwritten with 1s.
#' @return A one-column numeric matrix with column name `"Estimate"` and one
#'   row per regressor (in input order), with `Constant` last unless a
#'   `Constant` column was already present.
myols <- function(depvar, indvars) {
  # A bare vector becomes a one-column design named `value`.
  if (is.atomic(indvars) && is.null(dim(indvars))) {
    design <- data.frame(value = indvars)
  } else {
    design <- as.data.frame(indvars)
  }
  # Add the intercept, or overwrite an existing `Constant` column in place.
  design[["Constant"]] <- 1

  x <- as.matrix(design)
  y <- as.matrix(depvar)

  if (!is.numeric(x) || !is.numeric(y)) {
    stop("`depvar` and `indvars` must be numeric.", call. = FALSE)
  }
  if (nrow(x) != nrow(y)) {
    stop(
      "`depvar` and `indvars` must have the same number of observations.",
      call. = FALSE
    )
  }

  # Solve the linear system directly instead of forming the explicit inverse
  # of X'X and multiplying by it.
  beta_myols <- solve(crossprod(x), crossprod(x, y))
  dimnames(beta_myols) <- list(colnames(x), "Estimate")
  beta_myols
}
# Regress log earnings on IMDB rating and release year.
myols(
  depvar = expensive_movies$logearnings,
  indvars = expensive_movies[, c("Rating", "Year")]
)
Estimate
Rating 0.197996254
Year 0.006279776
Constant 5.035941990