understanding_pca.py
University of Central Florida, Course 5610 (Computer Science), Apr 3, 2024

# -*- coding: utf-8 -*-
"""Understanding PCA.ipynb

Automatically generated by Colaboratory.

Original file is located at

## Imports
"""

# Commented out IPython magic to ensure Python compatibility.
# %matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler

"""## Developing Intuition

### Create a synthetic input dataset
"""

prng = np.random.default_rng(1011)
rows = 2
data_points_cnt = 50

a1 = np.array([[1, 3], [1, 1]])  # the data jitter matrix
a2 = prng.normal(size=(rows, data_points_cnt))
X = np.dot(a1, a2).T

print(f"The input dataset (x,y)\n{X[:10]}\n...")

"""### Create and fit a PCA transformer with as many components as there are features"""

pca = PCA(random_state=1011)
pca.fit(X)

print(pca.components_)  # The loadings matrix of shape (n_components, n_features)

"""```
Loadings are essentially the correlation between the original variables and the
PCA components (axes). Loadings indicate a direction (+/-) and the magnitude of
that correlation.
```

### Q1: Why is there no target variable (y) in the dataset?

### Review the model
"""

pca.get_params()

pca.n_components_, pca.n_features_in_

pca_ex_var = pca.explained_variance_ratio_
print(pca_ex_var)
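# An optional illustrative check (a minimal sketch; the names scores_check, comp_idx,
# feat_idx and loading are introduced here only for this illustration): the loadings can
# be compared with plain Pearson correlations between each original feature and the PCA
# scores. The signs (+/-) should agree; the magnitudes differ by a scale factor unless
# the data are standardized first.
scores_check = pca.transform(X)  # PCA scores, shape (n_samples, n_components)
for comp_idx in range(pca.n_components_):
    for feat_idx in range(pca.n_features_in_):
        r = np.corrcoef(X[:, feat_idx], scores_check[:, comp_idx])[0, 1]
        loading = pca.components_[comp_idx, feat_idx]
        print(f"PC{comp_idx + 1} vs feature {feat_idx}: loading={loading: .3f}, corr={r: .3f}")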
print(f"The first principal component (PC1) explains {round(pca_ex_var[0]*100, 2)}% of the variance in the original dataset.")

"""### Transform the original data"""

X_pca_2 = pca.transform(X)
print(f"The new dataset dimensions: {X_pca_2.shape}")
print(f"The transformed dataset:\n{X_pca_2[:10]}\n...")

"""### Visualize the PCA transformation results"""

# We need to do an inverse transformation to map the data back to the original 2-feature space:
X_new_2 = pca.inverse_transform(X_pca_2)  # X_new_2 is the reconstructed original X dataset

plt.figure(figsize=(5, 5))
plt.grid()
plt.axis('equal')
plt.scatter(X[:, 0], X[:, 1], alpha=0.2, s=100)
plt.scatter(X_new_2[:, 0], X_new_2[:, 1], alpha=0.4, marker="^", c='r');

# Were we able to fully reconstruct the original dataset?
print(f"Original std: {X.std()}\nReconstructed std: {X_new_2.std()}")

"""### Create and fit a PCA transformer with only 1 principal component"""

# Now using only the first principal component to fit and transform the data
pca_n1 = PCA(n_components=1, random_state=1011)
X_pca_n1 = pca_n1.fit_transform(X)
print(f"The new dataset dimensions: {X_pca_n1.shape}")
print(f"The transformed dataset:\n{X_pca_n1[:10]}\n...")

pca_n1.n_components_, pca_n1.n_features_in_

"""### Visualize the one-component transformation results"""

# We need to do an inverse transformation to map the data back to the original 2-feature space:
X_new = pca_n1.inverse_transform(X_pca_n1)

plt.figure(figsize=(6, 6))
plt.grid()
plt.axis('equal')
plt.scatter(X[:, 0], X[:, 1], alpha=0.2)
# X_new is the original X dataset with some information lost
# during the PCA fit_transform() transformation
plt.scatter(X_new[:, 0], X_new[:, 1], alpha=0.3, marker="^", c='r');

# We were able to only partially reconstruct the original dataset (some information was lost)
print(f"Original dataset's std: {X.std()}\nReconstructed dataset's std: {X_new.std()}")
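# An optional illustrative check (a minimal sketch; recon_err is introduced only for this
# illustration): with only PC1 kept, the information that cannot be reconstructed is the
# spread of the data along the dropped PC2. The total squared reconstruction error divided
# by (n - 1) should therefore closely match pca.explained_variance_[1] from the full
# two-component fit above.
recon_err = ((X - X_new) ** 2).sum() / (len(X) - 1)
print(f"Reconstruction error (sum of squares / (n - 1)): {recon_err:.4f}")
print(f"Variance along the dropped PC2:                  {pca.explained_variance_[1]:.4f}")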
X_new.std() / X.std()  # about 3% of the information was lost

"""### Q2: How would you explain the lost information?

## Understanding the need to perform data scaling for PCA with the help of the [Breast Cancer Dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html)

### Fitting a 2-component PCA to unscaled data
"""

bc_bunch = load_breast_cancer()
X = bc_bunch.data
print(X.shape)
# y = bc_bunch.target  # we do not need the target here
feature_names = bc_bunch.feature_names

pca = PCA(n_components=2, random_state=1011)
X_pca = pca.fit_transform(X)  # We fit to the unscaled data

components = pca.components_
print(components.shape)  # The loadings. We map a 30-variable feature space down to a 2D one.

pca.explained_variance_ratio_  # Looks great, or does it?

plt.figure(figsize=(10, 10))
plt.imshow(components.T)  # 30 features x 2 principal components
plt.yticks(range(len(feature_names)), feature_names)
plt.colorbar();
# The figure is created from unscaled data

df_X = pd.DataFrame(X, columns=feature_names)  # pandas is always a great computation helper!
df_X.describe().loc['mean'].sort_values(ascending=False)

# These features dominate the unscaled dataset, making the PCA transformation strongly skewed --
# PCA believes that two components are enough to explain the data:
# worst area    880.583128
# mean area     654.889104

"""### Fitting to the scaled data"""

# The same two components, but this time the data will be scaled!
pca2 = PCA(n_components=2, random_state=1011)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_pca = pca2.fit_transform(X_scaled)
components = pca2.components_

plt.figure(figsize=(10, 10))
plt.imshow(components.T)  # 30 features x 2 principal components
plt.yticks(range(len(feature_names)), feature_names)
plt.colorbar();
# You can now see a fuller feature engagement
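# An optional side-by-side look (a minimal sketch; unscaled_top and scaled_top are
# introduced only for this illustration): comparing the absolute PC1 loadings of the
# unscaled fit (pca) and the scaled fit (pca2) makes the skew visible. The unscaled PC1
# is dominated by the large-valued "area" features, while the scaled PC1 spreads its
# weight across many features.
unscaled_top = pd.Series(np.abs(pca.components_[0]), index=feature_names).sort_values(ascending=False)
scaled_top = pd.Series(np.abs(pca2.components_[0]), index=feature_names).sort_values(ascending=False)
print("Largest |loading| on PC1, unscaled data:\n", unscaled_top.head(5), "\n")
print("Largest |loading| on PC1, scaled data:\n", scaled_top.head(5))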
"""### Checking the explained variance""" # The unscaled data: # worst area 880.583128 # mean area 654.889104 pca.explained_variance_ratio_ # Now, after scaling the data, 2 components are not enough ... pca2.explained_variance_ratio_ # Let's make it 5 components pca5 = PCA(n_components=5, random_state=1011) X_pca = pca5.fit_transform(X_scaled) pca_n5_exp_var = pca5.explained_variance_ratio_ pca_n5_exp_var """### The Scree Plot""" plt.figure(figsize= (5,5)) plt.grid(True) plt.title ("The Scree Plot") plt.plot( pca_n5_exp_var, 'ro-', linewidth=2); # So, with 5 components, how much variance does PCA explain? np.cumsum(pca_n5_exp_var)[-1] * 100 components_n5 = pca5.components_ components_n5 """## Find the number of components for a given level of explained variance""" # The warranted question now is how many components is needed to explain, say, 95% of the variance in the initial data ... pca_ALL = PCA(random_state=1011) pca_ALL.fit(X_scaled) print (pca_ALL.n_components_) pca_n_ALL_exp_var = pca_ALL.explained_variance_ratio_ pca_n_ALL_exp_var pca_ALL.n_components_ """### Exercise 1: How can you programmatically find the number of PCAs that would cumulatively explain 95% of the variance in the data? ``` Hint: Use pca_n_ALL_exp_var and np.argmax with the cumsum >= 0.95 predicate ``` *End of the lab notebook* """