# -*- coding: utf-8 -*-
"""Regression Models Lab.ipynb

Automatically generated by Colaboratory.

Original file is located at

## Imports
"""

# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# %matplotlib inline
import matplotlib.pyplot as plt

"""## Upload the housing_2.csv input file

## Load the input file
"""

data_file = "housing_2.csv"
df = pd.read_csv(data_file)

df.shape

"""## Split the data into training and test datasets"""

def prepare_data(df):
    y = df.SalePrice  # The target variable
    # Remove SalePrice from the feature dataset X
    X = df.drop(['SalePrice'], axis=1)
    # Note: with train_size=0.7 and test_size=0.15, the remaining 15%
    # of the rows are left out of both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7, test_size=0.15, random_state=0)
    # A quick sanity check of the split shapes
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)
    return X_train, X_test, y_train, y_test

"""## Train the LinearRegression estimator from scikit-learn. Take 1 of 2"""

X_train, X_test, y_train, y_test = prepare_data(df)

# This fit raises "ValueError: Input X contains NaN." It is done on
# purpose, to illustrate that this estimator type cannot handle
# missing values.
linreg_model = LinearRegression()
linreg_model.fit(X_train, y_train)

# Read the error message and move on to the next working cell ...
#=================================================================
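"""Aside: before parsing the df.info() text report in the next section,
note that pandas can surface null counts directly. This is just a minimal
sketch built on standard pandas calls (isnull/sum); the section below does
the same job the long way, with a regular expression, for practice.
"""

# Columns that contain at least one null, with their null counts
null_counts = df.isnull().sum()
print(null_counts[null_counts > 0])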
"""## Identify the variables with nulls""" df.info(verbose=True, show_counts=True) # Let's narrow down on columns that have nulls import io buffer = io.StringIO() df.info(buf=buffer, verbose=True, show_counts=True) df_as_string = buffer.getvalue().split('\n') no_nulls = df.shape[0] # all values in a column are non-nulls print (no_nulls) # We will use regular expressions to check every line in the df.info() report import re pattern = r"\s+(\d+)\s+([A-Za-z_]+)\s+(\d+)\s+non-null\.*" vars_with_nulls = [] for line in df_as_string: matches = re.match(pattern, line) if matches: var_name = matches.group(2) non_nulls_count = int(matches.group(3)) if non_nulls_count < no_nulls: # Only interested in variables that have fewer than max non-null values #print (non_nulls_count) vars_with_nulls.append((var_name, non_nulls_count)) print(vars_with_nulls) # We have two columns with nulls: df.LotFrontage # A good candidate for being plugged in with a mean value df.MasVnrArea # Will plug in 0 for nulls """## Plug the nulls""" df['LotFrontage'].fillna(df.LotFrontage.mean(), inplace=True) df['MasVnrArea'].fillna(0.0, inplace=True) df.LotFrontage.info(), df.MasVnrArea.info() # All back to 1099 non-null values .... """## Train the LinearRegression etimator from scikit-learn. Take 2 of 2""" X_train, X_test, y_train, y_test = prepare_data(df) # df has been fixed for nulls now ... # You will get a ValueError: Input X contains NaN.!! This is done on purpose to illustrate this aspect of this estimator type linreg_model = LinearRegression () linreg_model.fit(X_train, y_train) linreg_model.get_params()
linreg_model.score(X_train, y_train), linreg_model.score(X_test, y_test)

"""## XGBRegressor"""

# xgboost.XGBRegressor offers the same API as scikit-learn estimators
# (.fit(), .predict(), etc.)
import xgboost as xgb

xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=777)
xgb_model.fit(X_train, y_train)

xgb_model.get_params()

"""### *Q1: Compare the internal model parameters of scikit-learn's LinearRegression estimator with those of XGBRegressor. What can you tell about the models?*

### *Q2: What do these model parameter names tell you: 'max_depth', 'max_leaves', and 'tree_method'?*
"""

xgb_model.score(X_train, y_train), xgb_model.score(X_test, y_test)
# The XGBRegressor model clearly overfits the training data, but it
# also does a good job on the test data!

"""That's better than scikit-learn's LinearRegression:"""

linreg_model.score(X_train, y_train), linreg_model.score(X_test, y_test)

"""```
One of the success factors is the use of multiple estimators working
together: 'n_estimators': 100 (the default value). The estimators are
boosting (improving) each other's estimations, like a vaccine booster
does ;-)
```
"""

# Let's double the number of estimators to see if we can get a further
# accuracy improvement
xgb_model200 = xgb.XGBRegressor(objective='reg:squarederror',
                                random_state=7777, n_estimators=200)
xgb_model200.fit(X_train, y_train)

xgb_model200.score(X_train, y_train), xgb_model200.score(X_test, y_test)

"""You may see (or may not!) some marginal accuracy improvement ...

Let's calculate the MSE metric using the mean_squared_error() function.
"""

from sklearn.metrics import mean_squared_error

mse_100 = mean_squared_error(y_test, xgb_model.predict(X_test))
mse_200 = mean_squared_error(y_test, xgb_model200.predict(X_test))
mse_100, mse_200
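"""MSE is expressed in squared units of SalePrice, which is hard to read.
As a small sketch, the square root gives the RMSE in the same units as
the target; the percentage comparison follows in the next cell.
"""

print(np.sqrt(mse_100), np.sqrt(mse_200))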
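"""An optional sketch of the boosting idea itself, assuming xgboost >= 1.4
(where predict() accepts an iteration_range argument): score the 200-tree
model with only its first k trees to watch the test MSE fall as more
boosting rounds contribute.
"""

for k in (10, 50, 100, 200):
    preds_k = xgb_model200.predict(X_test, iteration_range=(0, k))
    print(k, mean_squared_error(y_test, preds_k))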
(mse_100 - mse_200) / mse_100 * 100
# Just about a 0.1% better MSE with twice as many estimators

"""## Finding the most informative features (variables)"""

fi = xgb_model.feature_importances_
len(fi)

# Getting the column (predictor) indexes of the top 7 features
sorted_index_desc = np.argsort(fi)[::-1][:7]; sorted_index_desc

"""```
Understanding np.argsort(fi)[::-1][:7]:
NumPy's argsort returns the indices that would sort the fi array in
ascending order; we need to reverse that to descending order so the
most important features come first, and [::-1] is the idiom for
achieving that. The trailing [:7] keeps the top seven.
```
"""

# Listing the column names:
# dir(df.columns[sorted_index_desc])
df.columns[sorted_index_desc].values

fi[sorted_index_desc]

len(fi[sorted_index_desc])

plt.figure(figsize=(10, 5))
plt.grid()
plt.title('Plotting Relative Feature Importance', fontsize=16, color='r')
plt.ylabel('Relative feature importance', fontsize=14, color='b')
plt.xlabel('Features', fontsize=14, color='b')
x = range(0, len(fi[sorted_index_desc]))
plt.xticks(x, df.columns[sorted_index_desc].values)
plt.scatter(x, fi[sorted_index_desc], color='g');
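"""A tiny worked example of the np.argsort(...)[::-1] idiom used above,
on made-up numbers (not real feature importances):
"""

demo = np.array([0.1, 0.7, 0.2])
print(np.argsort(demo))        # [0 2 1]: indices in ascending value order
print(np.argsort(demo)[::-1])  # [1 2 0]: descending, 0.7's index comes first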
"""## sklearn.ensemble.RandomForestRegressor"""

from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(random_state=24)
rfr.fit(X_train, y_train)

rfr.get_params()

rfr.score(X_train, y_train), rfr.score(X_test, y_test)

"""### *Q3: Compare the R2 scores on the training and the test data for the sklearn.linear_model.LinearRegression, XGBRegressor, and sklearn.ensemble.RandomForestRegressor models. Which model's results are better?*

*End of the lab notebook*
"""
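"""A minimal helper sketch for Q3, assuming the three fitted models above
(linreg_model, xgb_model, rfr) are still in scope: print the train and
test R2 scores side by side.
"""

for name, model in [("LinearRegression", linreg_model),
                    ("XGBRegressor", xgb_model),
                    ("RandomForestRegressor", rfr)]:
    print(f"{name:>24}: train R2 = {model.score(X_train, y_train):.3f}, "
          f"test R2 = {model.score(X_test, y_test):.3f}")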