Problem to be created as R markdown file is attached as an image. The code in R for the given problem(attached as image) is as below: rm(list = ls()) library(xml2) library(dplyr) library(httr) library(stringr) # 1. Use the xml2 package to scrape data on NHL hockey teams user_agent <- "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Mobile Safari/537.36" # Initialize timer Start <- Sys.time() # Initialize variables to store results names <- c() years <- c() wins <- c() losses <- c() # Function to scrape a single page scrape_page <- function(url) { Sys.sleep(5) # Wait for 5 seconds page <- read_html(url, user_agent = user_agent) # Extract data team_names <- xml_text(xml_find_all(page, "//td[@class='name']")) team_years <- xml_text(xml_find_all(page, "//td[@class='year']")) team_wins <- xml_text(xml_find_all(page, "//td[@class='wins']")) team_losses <- xml_text(xml_find_all(page, "//td[@class='losses']")) # Append data to global variables names <<- c(names, team_names) years <<- c(years, team_years) wins <<- c(wins, team_wins) losses <<- c(losses, team_losses) } # Define the base URL for the pages base_url <- "https://www.scrapethissite.com/pages/forms/?page_num=" # Loop through the pages for (i in 1:24) { print(paste("Scraping page", i)) scrape_page(paste0(base_url, i)) Sys.sleep(sample(2:20, 1)) # Wait a random amount of time (2 - 20 seconds) between pages } # Create the data frame nhl <- data.frame( Team = names, Year = as.integer(years), Wins = as.integer(wins), Losses = as.integer(losses), stringsAsFactors = FALSE ) # Print results print(head(nhl)) View(nhl) # Save scraping time End <- Sys.time() Start End print(End - Start) # 2. Clean and transform nhl_data # Clean up the data (remove \n and whitespace) nhl$Team <- str_trim(str_replace_all(nhl$Team, "\n", "")) nhl$Year <- as.integer(str_trim(str_replace_all(nhl$Year, "\n", ""))) nhl$Wins <- as.integer(str_trim(str_replace_all(nhl$Wins, "\n", ""))) nhl$Losses <- as.integer(str_trim(str_replace_all(nhl$Losses, "\n", ""))) # Check for NA values which might have been introduced during conversion sum(is.na(nhl$Year)) # Should be 0 for a successful conversion sum(is.na(nhl$Wins)) # Should be 0 for a successful conversion sum(is.na(nhl$Losses)) # Should be 0 for a successful conversion # If there are any NA values, you might need to inspect the rows and handle them appropriately if (any(is.na(nhl))) { nhl[is.na(nhl)] <- 0 # Example: replacing NA values with 0 } # Print results print(head(nhl)) # 3. Merge vertically #install.packages("readxl") library(readxl) # Read Excel Data nhl_expanded <- read_excel("nhl_2012-2021.xlsx", skip = 1) # Clean Excel Data nhl_expanded$Team <- as.character(nhl_expanded$Team) nhl_expanded$Team <- str_trim(str_replace_all(nhl_expanded$Team, "\\*$", "")) nhl_expanded$Year <- as.integer(nhl_expanded$Season) nhl_expanded$Wins <- as.integer(nhl_expanded$W) nhl_expanded$Losses <- as.integer(nhl_expanded$L) library(stringr) # Remove trailing * from team names in nhl nhl$Team <- str_replace_all(nhl$Team, "\\*$", "") # Remove trailing * from team names in nhl_expanded nhl_expanded$Team <- str_replace_all(nhl_expanded$Team, "\\*$", "") library(dplyr) # Ensure nhl_expanded has the same columns as nhl, in the same order nhl_expanded <- nhl_expanded %>% select(names(nhl)) # Now you can use rbind nhl2 <- rbind(nhl, nhl_expanded) # Create Win % Calculated Column nhl2$Win_Percentage <- nhl2$Wins / (nhl2$Wins + nhl2$Losses) # Print results print(head(nhl2)) View(nhl2) # 4. Merge Horizontally install.packages("tidyverse") library(tidyverse) # Read arena data arena <- read.csv("nhl_hockey_arenas.csv", stringsAsFactors = FALSE) # Print unique team names to find spelling errors print(unique(arena$Team)) print(unique(nhl2$Team)) arena$Team <- gsub("Seattle Kracken", "Seattle Kraken", arena$Team) # Merge horizontally using base R's merge function nhl3 <- merge(x = nhl2, y = arena[, c("Team", "Arena.Name", "Arena.Location", "Seating.Capacity")], by = "Team", all.x = TRUE) # Rename columns names(nhl3)[names(nhl3) == "Arena.Name"] <- "Arena" names(nhl3)[names(nhl3) == "Arena.Location"] <- "Location" names(nhl3)[names(nhl3) == "Seating.Capacity"] <- "Capacity" # Print results print(head(nhl3)) View(nhl3) # 5. Write nhl3 to a new .csv file write.csv(nhl3, "hockey_data.csv",row.names = FALSE)
1. Create the given problem in R Markdown.
2. You will need to add details to complete the documentation for the rest of the
assignment.
a. Import from Excel and vertical merge the data. b. Import from .CSV and horizontally merge. c. Perform any necessary clean up as required in HW4. d. Your code and HTML output should be attractive, organized, and lead the reader through your process and conclusions.
Problem to be created as R markdown file is attached as an image. The code in R for the given problem(attached as image) is as below:
rm(list = ls())
library(xml2)
library(dplyr)
library(httr)
library(stringr)
# 1. Use the xml2 package to scrape data on NHL hockey teams
user_agent <- "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Mobile Safari/537.36"
# Initialize timer
Start <- Sys.time()
# Initialize variables to store results
names <- c()
years <- c()
wins <- c()
losses <- c()
# Function to scrape a single page
scrape_page <- function(url) {
Sys.sleep(5) # Wait for 5 seconds
page <- read_html(url, user_agent = user_agent)
# Extract data
team_names <- xml_text(xml_find_all(page, "//td[@class='name']"))
team_years <- xml_text(xml_find_all(page, "//td[@class='year']"))
team_wins <- xml_text(xml_find_all(page, "//td[@class='wins']"))
team_losses <- xml_text(xml_find_all(page, "//td[@class='losses']"))
# Append data to global variables
names <<- c(names, team_names)
years <<- c(years, team_years)
wins <<- c(wins, team_wins)
losses <<- c(losses, team_losses)
}
# Define the base URL for the pages
base_url <- "https://www.scrapethissite.com/pages/forms/?page_num="
# Loop through the pages
for (i in 1:24) {
print(paste("Scraping page", i))
scrape_page(paste0(base_url, i))
Sys.sleep(sample(2:20, 1)) # Wait a random amount of time (2 - 20 seconds) between pages
}
# Create the data frame
nhl <- data.frame(
Team = names,
Year = as.integer(years),
Wins = as.integer(wins),
Losses = as.integer(losses),
stringsAsFactors = FALSE
)
# Print results
print(head(nhl))
View(nhl)
# Save scraping time
End <- Sys.time()
Start
End
print(End - Start)
# 2. Clean and transform nhl_data
# Clean up the data (remove \n and whitespace)
nhl$Team <- str_trim(str_replace_all(nhl$Team, "\n", ""))
nhl$Year <- as.integer(str_trim(str_replace_all(nhl$Year, "\n", "")))
nhl$Wins <- as.integer(str_trim(str_replace_all(nhl$Wins, "\n", "")))
nhl$Losses <- as.integer(str_trim(str_replace_all(nhl$Losses, "\n", "")))
# Check for NA values which might have been introduced during conversion
sum(is.na(nhl$Year)) # Should be 0 for a successful conversion
sum(is.na(nhl$Wins)) # Should be 0 for a successful conversion
sum(is.na(nhl$Losses)) # Should be 0 for a successful conversion
# If there are any NA values, you might need to inspect the rows and handle them appropriately
if (any(is.na(nhl))) {
nhl[is.na(nhl)] <- 0 # Example: replacing NA values with 0
}
# Print results
print(head(nhl))
# 3. Merge vertically
#install.packages("readxl")
library(readxl)
# Read Excel Data
nhl_expanded <- read_excel("nhl_2012-2021.xlsx", skip = 1)
# Clean Excel Data
nhl_expanded$Team <- as.character(nhl_expanded$Team)
nhl_expanded$Team <- str_trim(str_replace_all(nhl_expanded$Team, "\\*$", ""))
nhl_expanded$Year <- as.integer(nhl_expanded$Season)
nhl_expanded$Wins <- as.integer(nhl_expanded$W)
nhl_expanded$Losses <- as.integer(nhl_expanded$L)
library(stringr)
# Remove trailing * from team names in nhl
nhl$Team <- str_replace_all(nhl$Team, "\\*$", "")
# Remove trailing * from team names in nhl_expanded
nhl_expanded$Team <- str_replace_all(nhl_expanded$Team, "\\*$", "")
library(dplyr)
# Ensure nhl_expanded has the same columns as nhl, in the same order
nhl_expanded <- nhl_expanded %>%
select(names(nhl))
# Now you can use rbind
nhl2 <- rbind(nhl, nhl_expanded)
# Create Win % Calculated Column
nhl2$Win_Percentage <- nhl2$Wins / (nhl2$Wins + nhl2$Losses)
# Print results
print(head(nhl2))
View(nhl2)
# 4. Merge Horizontally
install.packages("tidyverse")
library(tidyverse)
# Read arena data
arena <- read.csv("nhl_hockey_arenas.csv", stringsAsFactors = FALSE)
# Print unique team names to find spelling errors
print(unique(arena$Team))
print(unique(nhl2$Team))
arena$Team <- gsub("Seattle Kracken", "Seattle Kraken", arena$Team)
# Merge horizontally using base R's merge function
nhl3 <- merge(x = nhl2,
y = arena[, c("Team", "Arena.Name", "Arena.Location", "Seating.Capacity")],
by = "Team",
all.x = TRUE)
# Rename columns
names(nhl3)[names(nhl3) == "Arena.Name"] <- "Arena"
names(nhl3)[names(nhl3) == "Arena.Location"] <- "Location"
names(nhl3)[names(nhl3) == "Seating.Capacity"] <- "Capacity"
# Print results
print(head(nhl3))
View(nhl3)
# 5. Write nhl3 to a new .csv file
write.csv(nhl3, "hockey_data.csv",row.names = FALSE)
Step by step
Solved in 3 steps