What should my code be to get the size of my six clusters Here is my code so far # Install necessary libraries import pandas as pd import numpy as np import seaborn as sbn from google.colab import files from sklearn.cluster import KMeans from sklearn.metrics import silhouette_score import matplotlib.pyplot as plt from scipy.stats import skew # suppress scientific notation in later outputs pd.options.display.float_format = '{:.2f}'.format np.set_printoptions(suppress = True) # load cluster data and remove any commas so we can process properly df = pd.read_csv('CC GENERAL.csv') df = df.replace(',','',regex = True) #elbow graph to see how many cluster to do elbow_kwargs = {'init':'random', 'n_init':10, 'max_iter':480, 'random_state':48} sse = [] for x in range(1,15): model_chk = KMeans(n_clusters = x, **elbow_kwargs) model_chk.fit(df) sse.append(model_chk.inertia_) plt.style.use('fivethirtyeight') plt.plot(range(1,15),sse) plt.xticks(range(1,15)) plt.xlabel('Clusters') plt.ylabel('SSE') plt.show() #Drop all null values df =df.dropna() df.dropna(inplace=True) # now we buil out our model for real # using the same hyper parameters as above # only change is solidifying our n_clusters based on the elbow model_kmean = KMeans(init = 'random', n_clusters = 6, n_init = 10, max_iter = 480, random_state = 48) # run the model on our dataset model_kmean.fit(df) pd.options.display.max_columns = False # when looking at this dataframe we want to look for places where the values are vastly different from one another # columns where there are very little variances tell us that field has no distinction between clusters aka are not deterministic display(cluster_df) # here we can pull which fields have high levels of skew # this shows us the fields where 1 or 2 of the clusters vary signficantly from the others # good place to start for identifying attributes that are distinctive and show clear splits in behavior high_skew_fields = cluster_df.skew().loc[lambda x: abs(x) >= 0.5].index 
display(cluster_df[high_skew_fields]) #plotting the results plt.scatter(cluster_df,cluster_df) plt.show()

What should my code be to get the size of my six clusters Here is my code so far # Install necessary libraries import pandas as pd import numpy as np import seaborn as sbn from google.colab import files from sklearn.cluster import KMeans from sklearn.metrics import silhouette_score import matplotlib.pyplot as plt from scipy.stats import skew # suppress scientific notation in later outputs pd.options.display.float_format = '{:.2f}'.format np.set_printoptions(suppress = True) # load cluster data and remove any commas so we can process properly df = pd.read_csv('CC GENERAL.csv') df = df.replace(',','',regex = True) #elbow graph to see how many cluster to do elbow_kwargs = {'init':'random', 'n_init':10, 'max_iter':480, 'random_state':48} sse = [] for x in range(1,15): model_chk = KMeans(n_clusters = x, **elbow_kwargs) model_chk.fit(df) sse.append(model_chk.inertia_) plt.style.use('fivethirtyeight') plt.plot(range(1,15),sse) plt.xticks(range(1,15)) plt.xlabel('Clusters') plt.ylabel('SSE') plt.show() #Drop all null values df =df.dropna() df.dropna(inplace=True) # now we buil out our model for real # using the same hyper parameters as above # only change is solidifying our n_clusters based on the elbow model_kmean = KMeans(init = 'random', n_clusters = 6, n_init = 10, max_iter = 480, random_state = 48) # run the model on our dataset model_kmean.fit(df) pd.options.display.max_columns = False # when looking at this dataframe we want to look for places where the values are vastly different from one another # columns where there are very little variances tell us that field has no distinction between clusters aka are not deterministic display(cluster_df) # here we can pull which fields have high levels of skew # this shows us the fields where 1 or 2 of the clusters vary signficantly from the others # good place to start for identifying attributes that are distinctive and show clear splits in behavior high_skew_fields = cluster_df.skew().loc[lambda x: abs(x) >= 0.5].index 
display(cluster_df[high_skew_fields]) #plotting the results plt.scatter(cluster_df,cluster_df) plt.show()

Database System Concepts

7th Edition

ISBN: 9780078022159

Author: Abraham Silberschatz Professor, Henry F. Korth, S. Sudarshan

Publisher:Abraham Silberschatz Professor, Henry F. Korth, S. Sudarshan

Chapter 1: Introduction

Section: Chapter Questions

Problem 1PE

See similar textbooks

Related questions

Question

What should my code be to get the size of each of my six clusters?

Here is my code so far:

# Import necessary libraries

import pandas as pd

import numpy as np

import seaborn as sbn

from google.colab import files

from sklearn.cluster import KMeans

from sklearn.metrics import silhouette_score

import matplotlib.pyplot as plt

from scipy.stats import skew

# suppress scientific notation in later outputs
pd.options.display.float_format = '{:.2f}'.format
np.set_printoptions(suppress=True)

# show every column when a dataframe is displayed (None = no column limit)
pd.options.display.max_columns = None

# load cluster data and remove any commas so we can process properly
df = pd.read_csv('CC GENERAL.csv')
df = df.replace(',', '', regex=True)

# drop all null values BEFORE any fitting: KMeans rejects input containing NaN
df = df.dropna()

# KMeans needs purely numeric input, so keep only the columns whose every
# value converts to a number (this leaves out text columns such as an id field)
features = df.apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='any')

# elbow graph to see how many clusters to use
elbow_kwargs = {'init': 'random', 'n_init': 10, 'max_iter': 480, 'random_state': 48}
cluster_range = range(1, 15)

sse = []
for x in cluster_range:
    model_chk = KMeans(n_clusters=x, **elbow_kwargs)
    model_chk.fit(features)
    sse.append(model_chk.inertia_)

plt.style.use('fivethirtyeight')
plt.plot(cluster_range, sse)
plt.xticks(cluster_range)
plt.xlabel('Clusters')
plt.ylabel('SSE')
plt.show()

# now we build out our model for real
# using the same hyper parameters as above
# only change is solidifying our n_clusters based on the elbow
model_kmean = KMeans(init='random', n_clusters=6, n_init=10, max_iter=480, random_state=48)

# run the model on our dataset
model_kmean.fit(features)

# cluster id assigned to each row, aligned with the rows that were fitted
cluster_labels = pd.Series(model_kmean.labels_, index=features.index, name='cluster')

# size of each cluster = number of rows assigned to each cluster id
cluster_sizes = cluster_labels.value_counts().sort_index()
print(cluster_sizes)

# one row per cluster, one column per feature: the fitted cluster centers
cluster_df = pd.DataFrame(model_kmean.cluster_centers_, columns=features.columns)
cluster_df.index.name = 'cluster'

# when looking at this dataframe we want to look for places where the values are vastly different from one another
# columns where there is very little variance tell us that field has no distinction between clusters aka is not deterministic
display(cluster_df)

# here we can pull which fields have high levels of skew
# this shows us the fields where 1 or 2 of the clusters vary significantly from the others
# good place to start for identifying attributes that are distinctive and show clear splits in behavior
high_skew_fields = cluster_df.skew().loc[lambda x: abs(x) >= 0.5].index
display(cluster_df[high_skew_fields])

# plotting the results: two features against each other, colored by cluster
# NOTE(review): the first two feature columns are an arbitrary choice - swap in
# whichever pair of fields (e.g. two of high_skew_fields) is most informative
x_col, y_col = features.columns[:2]
plt.scatter(features[x_col], features[y_col], c=cluster_labels, cmap='tab10', s=10)
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.show()

AI-Generated Solution

AI-generated content may present inaccurate or offensive content that does not represent bartleby’s views.

Unlock instant AI solutions

Tap the button
to generate a solution

Click the button to generate
a solution

SEE SOLUTION