What should my code be to get the size of my six clusters Here is my code so far # Install necessary libraries import pandas as pd import numpy as np import seaborn as sbn from google.colab import files from sklearn.cluster import KMeans from sklearn.metrics import silhouette_score import matplotlib.pyplot as plt from scipy.stats import skew # suppress scientific notation in later outputs pd.options.display.float_format = '{:.2f}'.format np.set_printoptions(suppress = True) # load cluster data and remove any commas so we can process properly df = pd.read_csv('CC GENERAL.csv') df = df.replace(',','',regex = True) #elbow graph to see how many cluster to do elbow_kwargs = {'init':'random', 'n_init':10, 'max_iter':480, 'random_state':48} sse = [] for x in range(1,15): model_chk = KMeans(n_clusters = x, **elbow_kwargs) model_chk.fit(df) sse.append(model_chk.inertia_) plt.style.use('fivethirtyeight') plt.plot(range(1,15),sse) plt.xticks(range(1,15)) plt.xlabel('Clusters') plt.ylabel('SSE') plt.show() #Drop all null values df =df.dropna() df.dropna(inplace=True) # now we buil out our model for real # using the same hyper parameters as above # only change is solidifying our n_clusters based on the elbow model_kmean = KMeans(init = 'random', n_clusters = 6, n_init = 10, max_iter = 480, random_state = 48) # run the model on our dataset model_kmean.fit(df) pd.options.display.max_columns = False # when looking at this dataframe we want to look for places where the values are vastly different from one another # columns where there are very little variances tell us that field has no distinction between clusters aka are not deterministic display(cluster_df) # here we can pull which fields have high levels of skew # this shows us the fields where 1 or 2 of the clusters vary signficantly from the others # good place to start for identifying attributes that are distinctive and show clear splits in behavior high_skew_fields = cluster_df.skew().loc[lambda x: abs(x) >= 0.5].index 
display(cluster_df[high_skew_fields]) #plotting the results plt.scatter(cluster_df,cluster_df) plt.show()
What should my code be to get the size of my six clusters Here is my code so far # Install necessary libraries import pandas as pd import numpy as np import seaborn as sbn from google.colab import files from sklearn.cluster import KMeans from sklearn.metrics import silhouette_score import matplotlib.pyplot as plt from scipy.stats import skew # suppress scientific notation in later outputs pd.options.display.float_format = '{:.2f}'.format np.set_printoptions(suppress = True) # load cluster data and remove any commas so we can process properly df = pd.read_csv('CC GENERAL.csv') df = df.replace(',','',regex = True) #elbow graph to see how many cluster to do elbow_kwargs = {'init':'random', 'n_init':10, 'max_iter':480, 'random_state':48} sse = [] for x in range(1,15): model_chk = KMeans(n_clusters = x, **elbow_kwargs) model_chk.fit(df) sse.append(model_chk.inertia_) plt.style.use('fivethirtyeight') plt.plot(range(1,15),sse) plt.xticks(range(1,15)) plt.xlabel('Clusters') plt.ylabel('SSE') plt.show() #Drop all null values df =df.dropna() df.dropna(inplace=True) # now we buil out our model for real # using the same hyper parameters as above # only change is solidifying our n_clusters based on the elbow model_kmean = KMeans(init = 'random', n_clusters = 6, n_init = 10, max_iter = 480, random_state = 48) # run the model on our dataset model_kmean.fit(df) pd.options.display.max_columns = False # when looking at this dataframe we want to look for places where the values are vastly different from one another # columns where there are very little variances tell us that field has no distinction between clusters aka are not deterministic display(cluster_df) # here we can pull which fields have high levels of skew # this shows us the fields where 1 or 2 of the clusters vary signficantly from the others # good place to start for identifying attributes that are distinctive and show clear splits in behavior high_skew_fields = cluster_df.skew().loc[lambda x: abs(x) >= 0.5].index 
display(cluster_df[high_skew_fields]) #plotting the results plt.scatter(cluster_df,cluster_df) plt.show()
Database System Concepts
7th Edition
ISBN:9780078022159
Author:Abraham Silberschatz Professor, Henry F. Korth, S. Sudarshan
Publisher:McGraw-Hill Education
Chapter1: Introduction
Section: Chapter Questions
Problem 1PE
Related questions
Question
What should my code be to get the size of my six clusters?
Here is my code so far:
# Install necessary libraries
import pandas as pd
import numpy as np
import seaborn as sbn
from google.colab import files
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from scipy.stats import skew
# Script purpose: load the credit-card dataset, pick a cluster count with an
# elbow plot, fit a 6-cluster KMeans model, and inspect the per-cluster profile.

# suppress scientific notation in later outputs
pd.options.display.float_format = '{:.2f}'.format
np.set_printoptions(suppress=True)

# load cluster data and remove any commas so we can process properly
df = pd.read_csv('CC GENERAL.csv')
df = df.replace(',', '', regex=True)

# Drop all null values BEFORE any model is fitted: KMeans raises a ValueError
# on NaN input, so this has to happen ahead of the elbow loop as well.
df = df.dropna()
# NOTE(review): KMeans only accepts numeric features. If the CSV contains an
# ID/text column it must be dropped (or converted) before fitting -- confirm
# against the actual data.

# elbow graph to see how many clusters to use
elbow_kwargs = {'init': 'random', 'n_init': 10, 'max_iter': 480, 'random_state': 48}
cluster_range = range(1, 15)
sse = []
for x in cluster_range:
    model_chk = KMeans(n_clusters=x, **elbow_kwargs)
    model_chk.fit(df)
    # inertia_ is the within-cluster sum of squared distances (SSE)
    sse.append(model_chk.inertia_)

plt.style.use('fivethirtyeight')
plt.plot(cluster_range, sse)
plt.xticks(cluster_range)
plt.xlabel('Clusters')
plt.ylabel('SSE')
plt.show()

# now we build out our model for real
# using the same hyper parameters as above (reused from elbow_kwargs so the
# two cannot drift apart)
# only change is solidifying our n_clusters based on the elbow
model_kmean = KMeans(n_clusters=6, **elbow_kwargs)
# run the model on our dataset
model_kmean.fit(df)

# None means "no limit": show every column when displaying dataframes
pd.options.display.max_columns = None

# cluster_df was used but never defined in the original script. The comments
# below describe comparing values across clusters, so it is built here as one
# row per cluster centroid and one column per feature.
# NOTE(review): confirm this is the intended definition of cluster_df.
cluster_df = pd.DataFrame(model_kmean.cluster_centers_, columns=df.columns)

# when looking at this dataframe we want to look for places where the values are vastly different from one another
# columns where there is very little variance tell us that field has no distinction between clusters aka are not deterministic
display(cluster_df)

# here we can pull which fields have high levels of skew
# this shows us the fields where 1 or 2 of the clusters vary significantly from the others
# good place to start for identifying attributes that are distinctive and show clear splits in behavior
high_skew_fields = cluster_df.skew().loc[lambda x: abs(x) >= 0.5].index
display(cluster_df[high_skew_fields])

# plotting the results
# NOTE(review): this plots cluster_df against itself, which only yields points
# on the diagonal; pick two specific feature columns for a meaningful scatter.
plt.scatter(cluster_df, cluster_df)
plt.show()
AI-Generated Solution
Unlock instant AI solutions
Tap the button
to generate a solution
Recommended textbooks for you

Database System Concepts
Computer Science
ISBN:
9780078022159
Author:
Abraham Silberschatz Professor, Henry F. Korth, S. Sudarshan
Publisher:
McGraw-Hill Education

Starting Out with Python (4th Edition)
Computer Science
ISBN:
9780134444321
Author:
Tony Gaddis
Publisher:
PEARSON

Digital Fundamentals (11th Edition)
Computer Science
ISBN:
9780132737968
Author:
Thomas L. Floyd
Publisher:
PEARSON

Database System Concepts
Computer Science
ISBN:
9780078022159
Author:
Abraham Silberschatz Professor, Henry F. Korth, S. Sudarshan
Publisher:
McGraw-Hill Education

Starting Out with Python (4th Edition)
Computer Science
ISBN:
9780134444321
Author:
Tony Gaddis
Publisher:
PEARSON

Digital Fundamentals (11th Edition)
Computer Science
ISBN:
9780132737968
Author:
Thomas L. Floyd
Publisher:
PEARSON

C How to Program (8th Edition)
Computer Science
ISBN:
9780133976892
Author:
Paul J. Deitel, Harvey Deitel
Publisher:
PEARSON

Database Systems: Design, Implementation, & Manag…
Computer Science
ISBN:
9781337627900
Author:
Carlos Coronel, Steven Morris
Publisher:
Cengage Learning

Programmable Logic Controllers
Computer Science
ISBN:
9780073373843
Author:
Frank D. Petruzella
Publisher:
McGraw-Hill Education