boston csv is located here: https://gist.github.com/nnbphuong/def91b5553736764e8e08f6255390f37cross-validation lab Remember that the best model has the highest accuracy (mean) and the lowest variance among all models to avoid overfitting. ''' # Compare Algorithms from pandas import read_csv from matplotlib import pyplot from sklearn.model_selection import KFold from sklearn.model_selection import cross_val_score from sklearn.ensemble import RandomForestClassifier ############################################################################ # to do: add the linear regression model (import the linear regression model from sklearn) ############################################################################# # load dataset. You should put the dataset in the current directory or provide a path to the where # the data is located. filename = 'Diabetes dataset.csv' names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'] dataframe = read_csv(filename, names=names) array = dataframe.values X = array[:,0:8] Y = array[:,8] ########################################################################### # to do: Load Boston dataset instead of Diabetes dataset ########################################################################## # prepare models models = [] ################################################################################### #### # Prepare 6 multiple linear regression model by reviewing # the multiple linear regression lab (RFE model). 
Choose a model with 12, 11, 10, 9, 7, and 5 features ################################################################################### ##### models.append(('RF2', RandomForestClassifier(max_depth=2, random_state=0))) models.append(('RF3', RandomForestClassifier(max_depth=3, random_state=0))) models.append(('RF4', RandomForestClassifier(max_depth=4, random_state=0))) models.append(('RF5', RandomForestClassifier(max_depth=5, random_state=0))) models.append(('RF6', RandomForestClassifier(max_depth=6, random_state=0))) models.append(('RF7', RandomForestClassifier(max_depth=7, random_state=0))) # evaluate each model in turn ############################################################################# # In this section of the code you don't need to change anything. Just gather # the results of the test accuracy, i.e. scores and compare them. # the model with the highest accuracy is the best model in terms of performance # Pay attention that the code not only outputs the mean score but also the variance # the best performing model should have a low variance, i.e.not overfitted. ############################################################################## results = [] names = [] scoring = 'accuracy' for name, model in models: kfold = KFold(n_splits=10, random_state=7) cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring) results.append(cv_results) names.append(name) msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()) print(msg) # boxplot algorithm comparison fig = pyplot.figure() fig.suptitle('Algorithm Comparison') ax = fig.add_subplot(111) pyplot.boxplot(results) ax.set_xticklabels(names) pyplot.show() ######################################################## # compare the results and comment on your findings.
boston csv is located here: https://gist.github.com/nnbphuong/def91b5553736764e8e08f6255390f37
cross-validation lab
Remember that the best model has the highest accuracy (mean) and the lowest variance among all models to avoid overfitting.
'''
# Compare Algorithms
from pandas import read_csv
from matplotlib import pyplot
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
############################################################################
# to do: add the linear regression model (import the linear regression
# model from sklearn)
#############################################################################
# Load the dataset. Put the dataset in the current directory or provide a
# path to where the data is located.
filename = 'Diabetes dataset.csv'
# Column names for the Pima Indians Diabetes dataset: 8 numeric features
# followed by the binary 'class' label.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:, 0:8]  # feature matrix: columns 0-7
Y = array[:, 8]    # target vector: the 'class' column
###########################################################################
# to do: Load Boston dataset instead of Diabetes dataset
##########################################################################
# prepare models
models = []
###################################################################################
# to do: Prepare 6 multiple linear regression models by reviewing
# the multiple linear regression lab (RFE model). Choose a model with
# 12, 11, 10, 9, 7, and 5 features.
###################################################################################
# Random-forest baselines: one model per tree depth 2..7, all with the same
# seed so the comparison isolates the effect of max_depth.
for depth in range(2, 8):
    models.append(('RF%d' % depth,
                   RandomForestClassifier(max_depth=depth, random_state=0)))
# evaluate each model in turn
#############################################################################
# In this section of the code you don't need to change anything. Just gather
# the results of the test accuracy, i.e. scores, and compare them.
# The model with the highest accuracy is the best model in terms of performance.
# Pay attention that the code not only outputs the mean score but also the
# variance; the best performing model should have a low variance, i.e. not
# overfitted.
##############################################################################
results = []
names = []
scoring = 'accuracy'
# shuffle=True is required when random_state is set: scikit-learn >= 0.24
# raises a ValueError otherwise (the seed is meaningless without shuffling).
# The fold split is identical for every model, so build it once outside the
# loop rather than per iteration.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Boxplot of the per-fold accuracy distributions, one box per model, so the
# spread (variance) of each model is visible alongside its median accuracy.
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)        # draw on the axes directly (same axes pyplot would use)
ax.set_xticklabels(names)  # label each box with its model name
pyplot.show()
########################################################
# compare the results and comment on your findings.

Step by step
Solved in 2 steps with 8 images
