Problem to be created as R markdown file is attached as an image. The code in R for the given problem(attached as image) is as below: rm(list = ls()) library(xml2) library(dplyr) library(httr) library(stringr) # 1. Use the xml2 package to scrape data on NHL hockey teams user_agent <- "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Mobile Safari/537.36" # Initialize timer Start <- Sys.time() # Initialize variables to store results names <- c() years <- c() wins <- c() losses <- c() # Function to scrape a single page scrape_page <- function(url) { Sys.sleep(5) # Wait for 5 seconds page <- read_html(url, user_agent = user_agent) # Extract data team_names <- xml_text(xml_find_all(page, "//td[@class='name']")) team_years <- xml_text(xml_find_all(page, "//td[@class='year']")) team_wins <- xml_text(xml_find_all(page, "//td[@class='wins']")) team_losses <- xml_text(xml_find_all(page, "//td[@class='losses']")) # Append data to global variables names <<- c(names, team_names) years <<- c(years, team_years) wins <<- c(wins, team_wins) losses <<- c(losses, team_losses) } # Define the base URL for the pages base_url <- "https://www.scrapethissite.com/pages/forms/?page_num=" # Loop through the pages for (i in 1:24) { print(paste("Scraping page", i)) scrape_page(paste0(base_url, i)) Sys.sleep(sample(2:20, 1)) # Wait a random amount of time (2 - 20 seconds) between pages } # Create the data frame nhl <- data.frame( Team = names, Year = as.integer(years), Wins = as.integer(wins), Losses = as.integer(losses), stringsAsFactors = FALSE ) # Print results print(head(nhl)) View(nhl) # Save scraping time End <- Sys.time() Start End print(End - Start) # 2. Clean and transform nhl_data # Clean up the data (remove \n and whitespace) nhl$Team <- str_trim(str_replace_all(nhl$Team, "\n", "")) nhl$Year <- as.integer(str_trim(str_replace_all(nhl$Year, "\n", ""))) nhl$Wins <- as.integer(str_trim(str_replace_all(nhl$Wins, "\n", ""))) nhl$Losses <- as.integer(str_trim(str_replace_all(nhl$Losses, "\n", ""))) # Check for NA values which might have been introduced during conversion sum(is.na(nhl$Year)) # Should be 0 for a successful conversion sum(is.na(nhl$Wins)) # Should be 0 for a successful conversion sum(is.na(nhl$Losses)) # Should be 0 for a successful conversion # If there are any NA values, you might need to inspect the rows and handle them appropriately if (any(is.na(nhl))) { nhl[is.na(nhl)] <- 0 # Example: replacing NA values with 0 } # Print results print(head(nhl)) # 3. Merge vertically #install.packages("readxl") library(readxl) # Read Excel Data nhl_expanded <- read_excel("nhl_2012-2021.xlsx", skip = 1) # Clean Excel Data nhl_expanded$Team <- as.character(nhl_expanded$Team) nhl_expanded$Team <- str_trim(str_replace_all(nhl_expanded$Team, "\\*$", "")) nhl_expanded$Year <- as.integer(nhl_expanded$Season) nhl_expanded$Wins <- as.integer(nhl_expanded$W) nhl_expanded$Losses <- as.integer(nhl_expanded$L) library(stringr) # Remove trailing * from team names in nhl nhl$Team <- str_replace_all(nhl$Team, "\\*$", "") # Remove trailing * from team names in nhl_expanded nhl_expanded$Team <- str_replace_all(nhl_expanded$Team, "\\*$", "") library(dplyr) # Ensure nhl_expanded has the same columns as nhl, in the same order nhl_expanded <- nhl_expanded %>% select(names(nhl)) # Now you can use rbind nhl2 <- rbind(nhl, nhl_expanded) # Create Win % Calculated Column nhl2$Win_Percentage <- nhl2$Wins / (nhl2$Wins + nhl2$Losses) # Print results print(head(nhl2)) View(nhl2) # 4. Merge Horizontally install.packages("tidyverse") library(tidyverse) # Read arena data arena <- read.csv("nhl_hockey_arenas.csv", stringsAsFactors = FALSE) # Print unique team names to find spelling errors print(unique(arena$Team)) print(unique(nhl2$Team)) arena$Team <- gsub("Seattle Kracken", "Seattle Kraken", arena$Team) # Merge horizontally using base R's merge function nhl3 <- merge(x = nhl2, y = arena[, c("Team", "Arena.Name", "Arena.Location", "Seating.Capacity")], by = "Team", all.x = TRUE) # Rename columns names(nhl3)[names(nhl3) == "Arena.Name"] <- "Arena" names(nhl3)[names(nhl3) == "Arena.Location"] <- "Location" names(nhl3)[names(nhl3) == "Seating.Capacity"] <- "Capacity" # Print results print(head(nhl3)) View(nhl3) # 5. Write nhl3 to a new .csv file write.csv(nhl3, "hockey_data.csv",row.names = FALSE)

Problem to be created as R markdown file is attached as an image. The code in R for the given problem(attached as image) is as below: rm(list = ls()) library(xml2) library(dplyr) library(httr) library(stringr) # 1. Use the xml2 package to scrape data on NHL hockey teams user_agent <- "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Mobile Safari/537.36" # Initialize timer Start <- Sys.time() # Initialize variables to store results names <- c() years <- c() wins <- c() losses <- c() # Function to scrape a single page scrape_page <- function(url) { Sys.sleep(5) # Wait for 5 seconds page <- read_html(url, user_agent = user_agent) # Extract data team_names <- xml_text(xml_find_all(page, "//td[@class='name']")) team_years <- xml_text(xml_find_all(page, "//td[@class='year']")) team_wins <- xml_text(xml_find_all(page, "//td[@class='wins']")) team_losses <- xml_text(xml_find_all(page, "//td[@class='losses']")) # Append data to global variables names <<- c(names, team_names) years <<- c(years, team_years) wins <<- c(wins, team_wins) losses <<- c(losses, team_losses) } # Define the base URL for the pages base_url <- "https://www.scrapethissite.com/pages/forms/?page_num=" # Loop through the pages for (i in 1:24) { print(paste("Scraping page", i)) scrape_page(paste0(base_url, i)) Sys.sleep(sample(2:20, 1)) # Wait a random amount of time (2 - 20 seconds) between pages } # Create the data frame nhl <- data.frame( Team = names, Year = as.integer(years), Wins = as.integer(wins), Losses = as.integer(losses), stringsAsFactors = FALSE ) # Print results print(head(nhl)) View(nhl) # Save scraping time End <- Sys.time() Start End print(End - Start) # 2. Clean and transform nhl_data # Clean up the data (remove \n and whitespace) nhl$Team <- str_trim(str_replace_all(nhl$Team, "\n", "")) nhl$Year <- as.integer(str_trim(str_replace_all(nhl$Year, "\n", ""))) nhl$Wins <- as.integer(str_trim(str_replace_all(nhl$Wins, "\n", ""))) nhl$Losses <- as.integer(str_trim(str_replace_all(nhl$Losses, "\n", ""))) # Check for NA values which might have been introduced during conversion sum(is.na(nhl$Year)) # Should be 0 for a successful conversion sum(is.na(nhl$Wins)) # Should be 0 for a successful conversion sum(is.na(nhl$Losses)) # Should be 0 for a successful conversion # If there are any NA values, you might need to inspect the rows and handle them appropriately if (any(is.na(nhl))) { nhl[is.na(nhl)] <- 0 # Example: replacing NA values with 0 } # Print results print(head(nhl)) # 3. Merge vertically #install.packages("readxl") library(readxl) # Read Excel Data nhl_expanded <- read_excel("nhl_2012-2021.xlsx", skip = 1) # Clean Excel Data nhl_expanded$Team <- as.character(nhl_expanded$Team) nhl_expanded$Team <- str_trim(str_replace_all(nhl_expanded$Team, "\\$", "")) nhl_expanded$Year <- as.integer(nhl_expanded$Season) nhl_expanded$Wins <- as.integer(nhl_expanded$W) nhl_expanded$Losses <- as.integer(nhl_expanded$L) library(stringr) # Remove trailing from team names in nhl nhl$Team <- str_replace_all(nhl$Team, "\\$", "") # Remove trailing from team names in nhl_expanded nhl_expanded$Team <- str_replace_all(nhl_expanded$Team, "\\*$", "") library(dplyr) # Ensure nhl_expanded has the same columns as nhl, in the same order nhl_expanded <- nhl_expanded %>% select(names(nhl)) # Now you can use rbind nhl2 <- rbind(nhl, nhl_expanded) # Create Win % Calculated Column nhl2$Win_Percentage <- nhl2$Wins / (nhl2$Wins + nhl2$Losses) # Print results print(head(nhl2)) View(nhl2) # 4. Merge Horizontally install.packages("tidyverse") library(tidyverse) # Read arena data arena <- read.csv("nhl_hockey_arenas.csv", stringsAsFactors = FALSE) # Print unique team names to find spelling errors print(unique(arena$Team)) print(unique(nhl2$Team)) arena$Team <- gsub("Seattle Kracken", "Seattle Kraken", arena$Team) # Merge horizontally using base R's merge function nhl3 <- merge(x = nhl2, y = arena[, c("Team", "Arena.Name", "Arena.Location", "Seating.Capacity")], by = "Team", all.x = TRUE) # Rename columns names(nhl3)[names(nhl3) == "Arena.Name"] <- "Arena" names(nhl3)[names(nhl3) == "Arena.Location"] <- "Location" names(nhl3)[names(nhl3) == "Seating.Capacity"] <- "Capacity" # Print results print(head(nhl3)) View(nhl3) # 5. Write nhl3 to a new .csv file write.csv(nhl3, "hockey_data.csv",row.names = FALSE)

Database System Concepts

7th Edition

ISBN:9780078022159

Author:Abraham Silberschatz Professor, Henry F. Korth, S. Sudarshan

Publisher:Abraham Silberschatz Professor, Henry F. Korth, S. Sudarshan

Chapter1: Introduction

Section: Chapter Questions

Problem 1PE

See similar textbooks

Related questions

Question

1. Create the given problem in R Markdown.
2. You will need to add details to complete the documentation for the rest of the
assignment.
a. Import from Excel and vertical merge the data. b. Import from .CSV and horizontally merge. c. Perform any necessary clean up as required in HW4. d. Your code and HTML output should be attractive, organized, and lead the reader through your process and conclusions.

Problem to be created as R markdown file is attached as an image. The code in R for the given problem(attached as image) is as below:

rm(list = ls())
library(xml2)
library(dplyr)
library(httr)
library(stringr)

# 1. Use the xml2 package to scrape data on NHL hockey teams

user_agent <- "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Mobile Safari/537.36"

# Initialize timer
Start <- Sys.time()

# Initialize variables to store results
names <- c()
years <- c()
wins <- c()
losses <- c()

# Function to scrape a single page
scrape_page <- function(url) {
Sys.sleep(5) # Wait for 5 seconds
page <- read_html(url, user_agent = user_agent)

# Extract data
team_names <- xml_text(xml_find_all(page, "//td[@class='name']"))
team_years <- xml_text(xml_find_all(page, "//td[@class='year']"))
team_wins <- xml_text(xml_find_all(page, "//td[@class='wins']"))
team_losses <- xml_text(xml_find_all(page, "//td[@class='losses']"))

# Append data to global variables
names <<- c(names, team_names)
years <<- c(years, team_years)
wins <<- c(wins, team_wins)
losses <<- c(losses, team_losses)
}

# Define the base URL for the pages
base_url <- "https://www.scrapethissite.com/pages/forms/?page_num="

# Loop through the pages
for (i in 1:24) {
print(paste("Scraping page", i))
scrape_page(paste0(base_url, i))
Sys.sleep(sample(2:20, 1)) # Wait a random amount of time (2 - 20 seconds) between pages
}

# Create the data frame
nhl <- data.frame(
Team = names,
Year = as.integer(years),
Wins = as.integer(wins),
Losses = as.integer(losses),
stringsAsFactors = FALSE
)

# Print results
print(head(nhl))
View(nhl)

# Save scraping time
End <- Sys.time()
Start
End
print(End - Start)

# 2. Clean and transform nhl_data
# Clean up the data (remove \n and whitespace)
nhl$Team <- str_trim(str_replace_all(nhl$Team, "\n", ""))
nhl$Year <- as.integer(str_trim(str_replace_all(nhl$Year, "\n", "")))
nhl$Wins <- as.integer(str_trim(str_replace_all(nhl$Wins, "\n", "")))
nhl$Losses <- as.integer(str_trim(str_replace_all(nhl$Losses, "\n", "")))

# Check for NA values which might have been introduced during conversion
sum(is.na(nhl$Year)) # Should be 0 for a successful conversion
sum(is.na(nhl$Wins)) # Should be 0 for a successful conversion
sum(is.na(nhl$Losses)) # Should be 0 for a successful conversion

# If there are any NA values, you might need to inspect the rows and handle them appropriately
if (any(is.na(nhl))) {
nhl[is.na(nhl)] <- 0 # Example: replacing NA values with 0
}

# Print results
print(head(nhl))

# 3. Merge vertically

#install.packages("readxl")
library(readxl)

# Read Excel Data
nhl_expanded <- read_excel("nhl_2012-2021.xlsx", skip = 1)

# Clean Excel Data
nhl_expanded$Team <- as.character(nhl_expanded$Team)
nhl_expanded$Team <- str_trim(str_replace_all(nhl_expanded$Team, "\\*$", ""))
nhl_expanded$Year <- as.integer(nhl_expanded$Season)
nhl_expanded$Wins <- as.integer(nhl_expanded$W)
nhl_expanded$Losses <- as.integer(nhl_expanded$L)

library(stringr)

# Remove trailing * from team names in nhl
nhl$Team <- str_replace_all(nhl$Team, "\\*$", "")

# Remove trailing * from team names in nhl_expanded
nhl_expanded$Team <- str_replace_all(nhl_expanded$Team, "\\*$", "")

library(dplyr)

# Ensure nhl_expanded has the same columns as nhl, in the same order
nhl_expanded <- nhl_expanded %>%
select(names(nhl))

# Now you can use rbind
nhl2 <- rbind(nhl, nhl_expanded)

# Create Win % Calculated Column
nhl2$Win_Percentage <- nhl2$Wins / (nhl2$Wins + nhl2$Losses)

# Print results
print(head(nhl2))
View(nhl2)

# 4. Merge Horizontally

install.packages("tidyverse")
library(tidyverse)

# Read arena data
arena <- read.csv("nhl_hockey_arenas.csv", stringsAsFactors = FALSE)

# Print unique team names to find spelling errors
print(unique(arena$Team))
print(unique(nhl2$Team))

arena$Team <- gsub("Seattle Kracken", "Seattle Kraken", arena$Team)

# Merge horizontally using base R's merge function
nhl3 <- merge(x = nhl2,
y = arena[, c("Team", "Arena.Name", "Arena.Location", "Seating.Capacity")],
by = "Team",
all.x = TRUE)

# Rename columns
names(nhl3)[names(nhl3) == "Arena.Name"] <- "Arena"
names(nhl3)[names(nhl3) == "Arena.Location"] <- "Location"
names(nhl3)[names(nhl3) == "Seating.Capacity"] <- "Capacity"

# Print results
print(head(nhl3))
View(nhl3)

# 5. Write nhl3 to a new .csv file
write.csv(nhl3, "hockey_data.csv",row.names = FALSE)

2. (5 points) Clean and transform nhl:
o Clean up the data (remove \n and whitespace)
o nhl$ Year: convert Year into an integer data type.
o nhl Wins: convert Wins into an integer data type.
o nhl$ Losses: convert Losses into an integer data type.
3.
5 points) Merge vertically. "nhl_2012-2021.xlsx" contains win/loss data for NHL teams for
additional years (2012-2021). Read the data and merge with nhl. Note: There are no UTF-8 or
special characters in the data.
o nhl_expanded: data frame created from the file "nhl_2012-2021.xlsx". (Note: column E =
Wins, column F = losses). Ensure data types match nhl.
o
Remove the trailing * from team names.
o nhl2: data frame created when nhl and nhl_expanded are vertically merged.
o nh12$Win%: create a new calculated column for observation = wins/(wins + losses)
4.
Merge horizontally. "nhl_hockey_arenas.csv" contains data NHL teams, arenas where
they play, location of the arena, and other details about the arena. Read the data and merge with
nhl2. Note: There are no UTF-8 or special characters in the data.
arena: data frame created from the file "hockey_arenas.csv"
o nh13: data frame created by horizontally merging nh12 with arena columns Arena.Name
(renamed to Arena), Arena.Location (renamed to Location) and Seating.Capacity
(renamed to Capacity). Drop other unnecessary columns.
Hint: there is a spelling error in team names. Use unique() to find it.
Hint: make sure that the team names match. Anaheim Ducks has changed their name since
1992.
There is also an issue with the New York Islanders having two arenas - one before 2012 and one
after 2012. Solving this problem is optional.
There will be 888 observations of 8 variables if you solved the optional problem.
There will be 919 observations of 8 variables if you did not solve the optional problem.
5. (1 points) Write nh13 to a new .csv file name "hockey_data.csv" separated by a comma.

1. (10 points Use the xml2 package to scrape data on NHL hockey teams.
(https://www.scrapethissite.com/pages/forms/). Specifically, collect the team's name, year,
wins, and losses for each team across all pages.
Scraping etiquette:
o After your bot navigates to the page, add a 5 second pause to allow the page to
load.
o
Emulate a common browser by specifying and using a user agent when scraping.
Wait a random amount of time (2-20 seconds) between pages.
o
o Include a time so you know how long the scraping took to run.
o Include a counter so you know each time your bot moves to a new page.
NOTE: The above functionality is demonstrated in 0_metacritic-scraping.R under
the Project Proposal instructions.
Use R code to produce the following output:
o name: character vector storing the name of each team
o year: character vector storing the year of each entry
o wins: character vector storing the number of team wins that year
o losses: character vector storing the number of team losses that year
o
nhl: data frame containing columns called Team, Year, Wins, and Losses
combining the vectors created above. The data frame should store the raw,
scraped data (no additional data cleaning/formatting)
Page 1 of 2

Expert Solution

Step by step

Solved in 3 steps

SEE SOLUTION Check out a sample Q&A here

Knowledge Booster

Learn more about

Need a deep-dive on the concept behind this application? Look no further. Learn more about this topic, computer-science and related others by exploring similar questions and additional content below.