PAC

docx

School

Columbia University *

*We aren’t endorsed by this school

Course

5200

Subject

Computer Science

Date

Feb 20, 2024

Type

docx

Pages

2

Uploaded by DeaconExploration13239

Report
# Example of hyperparameter tuning in R using xgboost and caret.
# Reads training (analysisData) and scoring data, inspects missingness,
# encodes factors numerically, tunes an xgbTree model via cross-validated
# grid search, predicts on the scoring set, and writes a submission file.
library(caret)
library(xgboost)
library(dplyr)  # FIX: required for %>% / mutate_if; was never loaded

# Load the data sets.
analysisData <- read.csv("analysisData.csv")
scoringData <- read.csv("scoringData.csv")

# Concise summary of the data structure.
str(analysisData)
str(scoringData)

# Number of missing values for the first few columns.
sapply(analysisData[1:5], function(x) sum(is.na(x)))
sapply(scoringData[1:5], function(x) sum(is.na(x)))

# Flag columns that are entirely NA.
all_na_columns_analysis <- sapply(analysisData, function(x) all(is.na(x)))
all_na_columns_scoring <- sapply(scoringData, function(x) all(is.na(x)))

# Flag factor columns whose only level is the empty string.
factor_na_analysis <- sapply(
  analysisData,
  function(x) is.factor(x) && all(levels(x) == "")
)
factor_na_scoring <- sapply(
  scoringData,
  function(x) is.factor(x) && all(levels(x) == "")
)

# Report problematic columns.
cat("Columns with all NAs in analysisData:\n")
print(names(analysisData)[all_na_columns_analysis])
cat("Columns with all NAs in scoringData:\n")
print(names(scoringData)[all_na_columns_scoring])
cat("Factor columns with all levels as NA in analysisData:\n")
print(names(analysisData)[factor_na_analysis])
cat("Factor columns with all levels as NA in scoringData:\n")
# FIX: the original indexed analysisData with the scoringData mask,
# which prints the wrong column names (and can error on length mismatch).
print(names(scoringData)[factor_na_scoring])

# Preprocess: convert character columns to factors, then factors to
# numeric codes. Applied to BOTH data sets so the feature matrices agree;
# the original only transformed analysisData, so xgb.DMatrix(scoringData)
# would fail on non-numeric columns at prediction time.
encode_numeric <- function(df) {
  df %>%
    mutate_if(is.character, as.factor) %>%
    mutate_if(is.factor, as.numeric)
}
analysisData <- encode_numeric(analysisData)
scoringData <- encode_numeric(scoringData)

# Matrix inputs for xgboost: all predictors except the target column.
x_train <- as.matrix(analysisData[, -which(colnames(analysisData) == "price")])
y_train <- analysisData$price

# 5-fold cross-validation with a random search over the candidate grid.
trControl <- trainControl(
  method = "cv",
  number = 5,
  search = "random",
  verboseIter = TRUE
)

# Tuning grid; expand as computational resources and requirements allow.
xgbGrid <- expand.grid(
  nrounds = c(100, 500, 1000),
  eta = c(0.01, 0.05, 0.1),
  max_depth = c(3, 6, 9),
  gamma = c(0, 0.1, 0.2),
  colsample_bytree = c(0.5, 0.8, 1),
  min_child_weight = c(1, 3, 5),
  subsample = c(0.5, 0.75, 1)
)

# Train the model over the tuning grid.
xgb_model_tuned <- train(
  x = x_train,
  y = y_train,
  trControl = trControl,
  tuneGrid = xgbGrid,
  method = "xgbTree"
)

# Predict on the scoring set using the best model found by caret.
best_model <- xgb_model_tuned$finalModel
dtest <- xgb.DMatrix(data = as.matrix(scoringData))
scoringData$predicted_price <- predict(best_model, newdata = dtest)

# Save the results.
# FIX: the original referenced the bare (undefined) name `predicted_price`
# and then wrote the full scoringData frame instead of the submission frame.
submissionFile <- data.frame(
  id = scoringData$id,
  price = scoringData$predicted_price
)
write.csv(submissionFile, "submission xgboost(2).csv", row.names = FALSE)
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help