boston csv is located here: https://gist.github.com/nnbphuong/def91b5553736764e8e08f6255390f37cross-validation lab Remember that the best model has the highest accuracy (mean) and the lowest variance among all models to avoid overfitting. ''' # Compare Algorithms from pandas import read_csv from matplotlib import pyplot from sklearn.model_selection import KFold from sklearn.model_selection import cross_val_score from sklearn.ensemble import RandomForestClassifier ############################################################################ # to do: add the linear regression model (import the linear regression model from sklearn) ############################################################################# # load dataset. You should put the dataset in the current directory or provide a path to the where # the data is located. filename = 'Diabetes dataset.csv' names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] dataframe = read_csv(filename, names=names) array = dataframe.values X = array[:,0:8] Y = array[:,8] ########################################################################### # to do: Load Boston dataset instead of Diabetes dataset ########################################################################## # prepare models models = [] ################################################################################### #### # Prepare 6 multiple linear regression model by reviewing # the multiple linear regression lab (RFE model). 
Choose a model with 12, 11, 10, 9, 7, and 5 features ################################################################################### ##### models.append(('RF2', RandomForestClassifier(max_depth=2, random_state=0))) models.append(('RF3', RandomForestClassifier(max_depth=3, random_state=0))) models.append(('RF4', RandomForestClassifier(max_depth=4, random_state=0))) models.append(('RF5', RandomForestClassifier(max_depth=5, random_state=0))) models.append(('RF6', RandomForestClassifier(max_depth=6, random_state=0))) models.append(('RF7', RandomForestClassifier(max_depth=7, random_state=0))) # evaluate each model in turn ############################################################################# # In this section of the code you don't need to change anything. Just gather # the results of the test accuracy, i.e. scores and compare them. # the model with the highest accuracy is the best model in terms of performance # Pay attention that the code not only outputs the mean score but also the variance # the best performing model should have a low variance, i.e.not overfitted. ############################################################################## results = [] names = [] scoring = 'accuracy' for name, model in models: kfold = KFold(n_splits=10, random_state=7) cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) results.append(cv_results) names.append(name) msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()) print(msg) # boxplot algorithm comparison fig = pyplot.figure() fig.suptitle('Algorithm Comparison') ax = fig.add_subplot(111) pyplot.boxplot(results) ax.set_xticklabels(names) pyplot.show() ######################################################## # compare the results and comment on your findings.
boston csv is located here: https://gist.github.com/nnbphuong/def91b5553736764e8e08f6255390f37
cross-validation lab
Remember that the best model has the highest accuracy (mean) and the lowest variance among all models to avoid overfitting.
'''
# Compare Algorithms
from pandas import read_csv
from matplotlib import pyplot
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
############################################################################
# to do: add the linear regression model (import the linear regression
# model from sklearn)
#############################################################################
# Load the dataset. Put the dataset in the current directory or provide a
# path to where the data is located.
filename = 'Diabetes dataset.csv'
# Column names for the Pima Indians Diabetes dataset: 8 numeric features
# followed by the binary 'class' label.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:, 0:8]  # feature matrix: columns 0-7
Y = array[:, 8]    # target vector: the 'class' column
###########################################################################
# to do: Load Boston dataset instead of Diabetes dataset
##########################################################################
# prepare models
models = []
###################################################################################
# to do: Prepare 6 multiple linear regression models by reviewing
# the multiple linear regression lab (RFE model). Choose a model with
# 12, 11, 10, 9, 7, and 5 features.
###################################################################################
# Random-forest baselines: one model per tree depth 2..7, all with the same
# seed so the comparison isolates the effect of max_depth.
for depth in range(2, 8):
    models.append(('RF%d' % depth,
                   RandomForestClassifier(max_depth=depth, random_state=0)))
# evaluate each model in turn
#############################################################################
# In this section of the code you don't need to change anything. Just gather
# the results of the test accuracy, i.e. scores, and compare them.
# The model with the highest accuracy is the best model in terms of performance.
# Pay attention that the code not only outputs the mean score but also the
# variance; the best performing model should have a low variance, i.e. not
# overfitted.
##############################################################################
results = []
names = []
scoring = 'accuracy'
# shuffle=True is required when random_state is set: scikit-learn >= 0.24
# raises a ValueError otherwise (the seed is meaningless without shuffling).
# The fold split is identical for every model, so build it once outside the
# loop rather than per iteration.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Boxplot of the per-fold accuracy distributions, one box per model, so the
# spread (variance) of each model is visible alongside its median accuracy.
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)        # draw on the axes directly (same axes pyplot would use)
ax.set_xticklabels(names)  # label each box with its model name
pyplot.show()
########################################################
# compare the results and comment on your findings.

Step by step
Solved in 2 steps with 8 images
