assignment18fda

December 9, 2023

[16]: import pandas as pd
      import numpy as np
      from datetime import datetime, timedelta

      # Set random seed for reproducibility
      np.random.seed(0)

      # Generate dates within the last two years
      def generate_dates(n):
          start_date = datetime.now() - timedelta(days=730)
          return [start_date + timedelta(days=np.random.randint(0, 730)) for _ in range(n)]

      # Generating dataset
      n_customers = 100
      n_transactions = 1000

      customer_ids = np.random.choice(range(1, n_customers + 1), n_transactions)
      dates_of_purchase = generate_dates(n_transactions)
      purchase_amounts = np.random.uniform(20, 500, n_transactions)

      df = pd.DataFrame({
          'Customer_ID': customer_ids,
          'Date_of_Purchase': dates_of_purchase,
          'Purchase_Amount': purchase_amounts
      })

      df

[16]:      Customer_ID            Date_of_Purchase  Purchase_Amount
      0             45  2022-11-30 04:38:58.163613        59.158930
      1             48  2023-07-31 04:38:58.163613       266.328664
      2             65  2023-01-18 04:38:58.163613       126.238281
      3             68  2023-05-03 04:38:58.163613       131.511920
      4             68  2023-05-07 04:38:58.163613       272.439987
      ..           ...                         ...              ...
      995           27  2022-08-03 04:38:58.163613       314.646965
      996           49  2023-09-25 04:38:58.163613       363.876860
      997           72  2022-01-14 04:38:58.163613       158.001619
      998           55  2023-08-07 04:38:58.163613       259.736243
      999           97  2023-05-05 04:38:58.163613       479.868506

      [1000 rows x 3 columns]

Data Preprocessing

[17]: df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Customer_ID       1000 non-null   int64
 1   Date_of_Purchase  1000 non-null   datetime64[ns]
 2   Purchase_Amount   1000 non-null   float64
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 23.6 KB

[18]: df.isna().sum()

[18]: Customer_ID         0
      Date_of_Purchase    0
      Purchase_Amount     0
      dtype: int64

[19]: df.describe()

[19]:        Customer_ID  Purchase_Amount
      count  1000.000000      1000.000000
      mean     50.397000       262.852416
      std      29.136065       138.296137
      min       1.000000        20.089623
      25%      26.000000       140.998710
      50%      50.000000       264.457078
      75%      76.000000       385.929854
      max     100.000000       499.929837

[20]: df['Customer_ID'].value_counts().sort_index(ascending=True)

[20]: 1      18
      2       7
      3       7
      4      15
      5      10
             ..
      96      5
      97      8
      98      8
      99      9
      100    11
      Name: Customer_ID, Length: 100, dtype: int64

[21]: df['Date_of_Purchase'].min()

[21]: Timestamp('2021-12-09 04:38:58.163613')

[22]: df['Date_of_Purchase'].max()

[22]: Timestamp('2023-12-08 04:38:58.163613')

Feature Engineering

[23]: latest_date = datetime.now()

      rfm_table = df.groupby('Customer_ID').agg({
          'Date_of_Purchase': lambda v: (latest_date - v.max()).days,
          'Customer_ID': 'count',
          'Purchase_Amount': 'sum'
      })
      rfm_table

[23]:              Date_of_Purchase  Customer_ID  Purchase_Amount
      Customer_ID
      1                         129           18      4469.707768
      2                           6            7      1615.129002
      3                          20            7      1777.182084
      4                          53           15      3126.994941
      5                          85           10      2853.746454
      ..                        ...          ...              ...
      96                        259            5      1011.771606
      97                         86            8      2317.176873
      98                        208            8      1793.294517
      99                          5            9      2446.570512
      100                        77           11      2642.239188

      [100 rows x 3 columns]
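As a side note, the same RFM table can be built in one step with pandas named aggregation, which avoids aggregating the grouping column and the separate rename in the next cell. This is only a sketch, not part of the original notebook, and it reuses the df and latest_date defined above.

# Sketch (not in the original notebook): RFM features via named aggregation
rfm_alt = df.groupby('Customer_ID').agg(
    Recency=('Date_of_Purchase', lambda v: (latest_date - v.max()).days),
    Frequency=('Date_of_Purchase', 'count'),   # count a non-null column instead of the group key
    Monetary=('Purchase_Amount', 'sum')
)
rfm_alt.head()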
[24]: rfm_table.rename(columns={
          'Date_of_Purchase': 'Recency',
          'Customer_ID': 'Frequency',
          'Purchase_Amount': 'Monetary'
      }, inplace=True)
      rfm_table

[24]:              Recency  Frequency     Monetary
      Customer_ID
      1                129         18  4469.707768
      2                  6          7  1615.129002
      3                 20          7  1777.182084
      4                 53         15  3126.994941
      5                 85         10  2853.746454
      ..               ...        ...          ...
      96               259          5  1011.771606
      97                86          8  2317.176873
      98               208          8  1793.294517
      99                 5          9  2446.570512
      100               77         11  2642.239188

      [100 rows x 3 columns]

[25]: from sklearn.preprocessing import StandardScaler

      scaler = StandardScaler()
      rfm_scaled = scaler.fit_transform(rfm_table[['Recency', 'Frequency', 'Monetary']])
      rfm_scaled = pd.DataFrame(rfm_scaled, columns=['Recency', 'Frequency', 'Monetary'])
      rfm_scaled.head(10)

[25]:     Recency  Frequency  Monetary
      0  0.613496   2.871833  2.284111
      1 -0.982528  -1.076937 -1.257184
      2 -0.800867  -1.076937 -1.056147
      3 -0.372665   1.794895  0.618386
      4  0.042561   0.000000  0.279403
      5 -0.139101   0.717958  0.038605
      6  1.145505   0.358979  0.374454
      7 -1.047407   0.717958  1.098450
      8  0.885988  -0.717958 -1.257697
      9 -0.255883   0.000000  0.170944

Customer Segmentation

[26]: from sklearn.cluster import KMeans
      import matplotlib.pyplot as plt
      import warnings
      warnings.filterwarnings('ignore')

      sse = []
      for k in range(1, 11):
          kmeans = KMeans(n_clusters=k, random_state=42)
          kmeans.fit(rfm_scaled)
          sse.append(kmeans.inertia_)

[27]: plt.plot(range(1, 11), sse, marker='o')
      plt.title('Elbow Method for Optimal K')
      plt.xlabel('Number of Clusters (K)')
      plt.ylabel('Sum of Squared Distances (SSE)')
      plt.show()

[29]: from sklearn.cluster import KMeans
      from sklearn.metrics import silhouette_score

      k = 3
      kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
      y_kmeans = kmeans.fit_predict(rfm_scaled)

      silhouette_avg = silhouette_score(rfm_scaled, y_kmeans)
      print("For", k, "clusters average silhouette_score is:", silhouette_avg)

      y_kmeans
      rfm_scaled['Clusters'] = y_kmeans

For 3 clusters average silhouette_score is: 0.45024274127470876
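The elbow plot can also be cross-checked against silhouette scores for several candidate values of k. The loop below is only a sketch, not part of the original notebook; it reuses KMeans and silhouette_score imported above and deliberately excludes the Clusters column that was just added to rfm_scaled.

# Sketch (not in the original notebook): silhouette score for several k values
features = rfm_scaled[['Recency', 'Frequency', 'Monetary']]  # exclude the 'Clusters' column
for k in range(2, 7):  # silhouette needs at least 2 clusters
    labels = KMeans(n_clusters=k, init='k-means++', random_state=42).fit_predict(features)
    print(k, silhouette_score(features, labels))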
Visualization

[30]: recency = rfm_scaled['Recency']
      frequency = rfm_scaled['Frequency']

      plt.figure(figsize=(10, 6))
      plt.title('Customer Loyalty Clusters')
      plt.scatter(rfm_scaled[y_kmeans == 0]['Recency'], rfm_scaled[y_kmeans == 0]['Frequency'],
                  s=40, c='cornflowerblue', alpha=0.8, label='Cluster 1')
      plt.scatter(rfm_scaled[y_kmeans == 1]['Recency'], rfm_scaled[y_kmeans == 1]['Frequency'],
                  s=40, c='orange', alpha=0.8, label='Cluster 2')
      plt.scatter(rfm_scaled[y_kmeans == 2]['Recency'], rfm_scaled[y_kmeans == 2]['Frequency'],
                  s=40, c='forestgreen', alpha=0.8, label='Cluster 3')
      plt.legend()
      plt.xlabel('Scaled Recency')
      plt.ylabel('Scaled Frequency')
      plt.show()
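As an alternative to plotting each cluster with its own scatter call, the labels can be passed directly to the c argument of plt.scatter. This is just a sketch, not part of the original notebook, assuming rfm_scaled and y_kmeans from the cells above.

# Sketch (not in the original notebook): colour points by cluster label in one scatter call
plt.figure(figsize=(10, 6))
plt.scatter(rfm_scaled['Recency'], rfm_scaled['Frequency'],
            c=y_kmeans, cmap='viridis', s=40, alpha=0.8)
plt.colorbar(label='Cluster')
plt.xlabel('Scaled Recency')
plt.ylabel('Scaled Frequency')
plt.title('Customer Loyalty Clusters (single scatter call)')
plt.show()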
[31]: plt.figure(figsize=(10, 6))
      plt.title('Customer Segments (RFM Clusters)')
      plt.scatter(rfm_scaled[y_kmeans == 0]['Recency'], rfm_scaled[y_kmeans == 0]['Monetary'],
                  s=40, c='cornflowerblue', alpha=0.8, label='Cluster 1')
      plt.scatter(rfm_scaled[y_kmeans == 1]['Recency'], rfm_scaled[y_kmeans == 1]['Monetary'],
                  s=40, c='orange', alpha=0.8, label='Cluster 2')
      plt.scatter(rfm_scaled[y_kmeans == 2]['Recency'], rfm_scaled[y_kmeans == 2]['Monetary'],
                  s=40, c='forestgreen', alpha=0.8, label='Cluster 3')
      plt.legend()
      plt.xlabel('Scaled Recency')
      plt.ylabel('Scaled Monetary')
      plt.show()

[32]: plt.figure(figsize=(10, 6))
      plt.title('Customer Segments (RFM Clusters)')
      plt.scatter(rfm_scaled[y_kmeans == 0]['Frequency'], rfm_scaled[y_kmeans == 0]['Monetary'],
                  s=50, c='orange', alpha=0.8, label='Cluster 1')
      plt.scatter(rfm_scaled[y_kmeans == 1]['Frequency'], rfm_scaled[y_kmeans == 1]['Monetary'],
                  s=50, c='cornflowerblue', alpha=0.8, label='Cluster 2')
      plt.scatter(rfm_scaled[y_kmeans == 2]['Frequency'], rfm_scaled[y_kmeans == 2]['Monetary'],
                  s=50, c='forestgreen', alpha=0.8, label='Cluster 3')
      plt.legend()
      plt.xlabel('Scaled Frequency')
      plt.ylabel('Scaled Monetary')
      plt.show()

[33]: fig = plt.figure(figsize=(16, 14))
      ax = fig.add_subplot(111, projection='3d')

      cluster_colors = {0: 'cornflowerblue', 1: 'orange', 2: 'green'}
      for cluster in rfm_scaled['Clusters'].unique():
          cluster_data = rfm_scaled[rfm_scaled['Clusters'] == cluster]
          ax.scatter(cluster_data['Recency'], cluster_data['Frequency'], cluster_data['Monetary'],
                     label=f'Cluster {cluster + 1}', alpha=0.6, c=cluster_colors[cluster])

      ax.set_xlabel('Scaled Recency')
      ax.set_ylabel('Scaled Frequency')
      ax.set_zlabel('Scaled Monetary')
      ax.set_title('3D Clustering')
      ax.legend()
      plt.show()
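Finally, the segments are easier to interpret on the original, unscaled RFM values. The summary below is only a sketch, not part of the original notebook, assuming rfm_table and y_kmeans from the cells above; cluster sizes and average Recency, Frequency, and Monetary values give each segment a business meaning (for example, recent high-spend customers versus lapsed low-spend ones).

# Sketch (not in the original notebook): per-cluster profile on unscaled RFM values
profile = rfm_table.assign(Cluster=y_kmeans).groupby('Cluster').agg(
    Customers=('Recency', 'size'),
    Avg_Recency=('Recency', 'mean'),
    Avg_Frequency=('Frequency', 'mean'),
    Avg_Monetary=('Monetary', 'mean')
)
print(profile)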