understanding_k_means

py

School

University of Central Florida *

*We aren’t endorsed by this school

Course

5610

Subject

Computer Science

Date

Apr 3, 2024

Type

py

Pages

3

Uploaded by madhu_iitm

Report
# -*- coding: utf-8 -*-
"""Understanding k-means.ipynb

Automatically generated by Colaboratory.

## Imports
"""

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

"""## Upload the input file"""

# Space-separated file with two feature columns and no header row.
input_file = "kmeans_clusters_3C_2D.dat"
df = pd.read_csv(input_file, sep=' ', header=None)
df.head()
df.shape

"""## Input data scatterplot"""

import seaborn as sns

sns.set()
sns.scatterplot(x=df[0], y=df[1]).set(title='The raw input data');

"""## Train the model"""

# init='k-means++' spreads the initial centroids apart, which helps the
# algorithm converge in fewer iterations.
n_clusters = 3  # Our best guess (which turns out to be true!)
kmeansModel = KMeans(n_clusters=n_clusters, init='k-means++', n_init='auto',
                     random_state=0, verbose=True).fit(df)

"""### *Q1: Explain the above output.*

### *Q2: What made the algorithm converge so quickly?*
"""

"""## Inspect the resulting model"""

kmeansModel.get_params()

# Confirm how many iterations the fit actually took, versus the allowed maximum.
kmeansModel.n_iter_
kmeansModel.max_iter
kmeansModel.inertia_

"""## The cluster centroids"""

kmeansModel.cluster_centers_
# How to read the above output:
# The 0-based row index is the cluster id.
# For example, checking the 2nd centroid's coordinates (Feature_A, Feature_B):
kmeansModel.cluster_centers_[2]

"""## Use the model for prediction"""

kmeansModel.predict([[1, 3]])  # Predicting one observation
kmeansModel.predict([[6, 7], [11, 8]])  # A bulk prediction

"""## Comparing predictions with the "ground truth" labels"""

kmeansModel.labels_  # The predicted labels mapped to the cluster ids (there are a few errors!)

# The actual ground-truth labels (50 * 3) used to generate the data points in the
# input dataset (we mimic the above output).
gtl_ = '0' * 50 + '2' * 50 + '1' * 50
gtl = np.array([int(ch) for ch in gtl_])
print(gtl)

data_points_in_each_cluster = df.shape[0] // n_clusters
for cluster_idx in range(n_clusters):
    predicted_dp_in_cluster = list(kmeansModel.labels_).count(cluster_idx)
    print(f'The cluster #{cluster_idx} has been predicted to have [{predicted_dp_in_cluster}] data points out of [{data_points_in_each_cluster}]')

# Finding the indexes of the wrongfully predicted data points
idx_wrong = np.where(kmeansModel.labels_ != gtl)
print(idx_wrong)
wrong_labels = kmeansModel.labels_[idx_wrong]
correct_labels = gtl[idx_wrong]
# FIX: the newline escape was broken ("\ n" printed a literal backslash-space);
# restored to "\n" so the summary appears on its own line.
print(f'The wrong labels:\t{wrong_labels} \nThe correct labels:\t{correct_labels}\nA total of {len(wrong_labels)} wrong predictions...')

"""### *Q3: In the above output there are no mixed-up predictions between clusters 1 and 2. How could you explain this fact?*"""

# Converting the results in a pandas DataFrame for convenience
df_wrong_data_points = df.iloc[idx_wrong]
print(df_wrong_data_points)

"""## Visualizing the results"""

# Commented out IPython magic to ensure Python compatibility.
# %matplotlib inline
import matplotlib.pyplot as plt

shift = 0.11  # to slightly shift the wrongfully predicted data points (colored yellow)
plt.title("Visualizing the predicted labels ...")
plt.scatter(df[0], df[1], c=kmeansModel.labels_, cmap='Dark2_r')
# Overlay the mispredicted points (slightly shifted so they stay visible
# on top of their cluster's dots).
plt.scatter(df_wrong_data_points[0] + shift,
            df_wrong_data_points[1] + shift,
            c='yellow')

# Label every centroid with its 0-based cluster id.
for i, centroid_ctr in enumerate(kmeansModel.cluster_centers_):
    plt.text(centroid_ctr[0], centroid_ctr[1], f"{i}", color='cyan', fontsize=25)

plt.show();

# Compare the above scatterplot with the "true" one below
import seaborn as sns

sns.set()
sns.scatterplot(x=df[0], y=df[1], hue=(gtl),
                palette="muted").set(title='The ground truth labels');
Your preview ends here
Eager to read complete document? Join bartleby learn and gain access to the full version
  • Access to all documents
  • Unlimited textbook solutions
  • 24/7 expert homework help