assignment18fda
December 9, 2023
[16]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Set random seed for reproducibility
np.random.seed(0)

# Generate dates within the last two years
def generate_dates(n):
    start_date = datetime.now() - timedelta(days=730)
    return [start_date + timedelta(days=np.random.randint(0, 730)) for _ in range(n)]

# Generating dataset
n_customers = 100
n_transactions = 1000

customer_ids = np.random.choice(range(1, n_customers + 1), n_transactions)
dates_of_purchase = generate_dates(n_transactions)
purchase_amounts = np.random.uniform(20, 500, n_transactions)

df = pd.DataFrame({
    'Customer_ID': customer_ids,
    'Date_of_Purchase': dates_of_purchase,
    'Purchase_Amount': purchase_amounts
})
df
[16]:
     Customer_ID            Date_of_Purchase  Purchase_Amount
0             45  2022-11-30 04:38:58.163613        59.158930
1             48  2023-07-31 04:38:58.163613       266.328664
2             65  2023-01-18 04:38:58.163613       126.238281
3             68  2023-05-03 04:38:58.163613       131.511920
4             68  2023-05-07 04:38:58.163613       272.439987
..           ...                         ...              ...
995           27  2022-08-03 04:38:58.163613       314.646965
996           49  2023-09-25 04:38:58.163613       363.876860
997           72  2022-01-14 04:38:58.163613       158.001619
998           55  2023-08-07 04:38:58.163613       259.736243
999           97  2023-05-05 04:38:58.163613       479.868506

[1000 rows x 3 columns]
Data Preprocessing
[17]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Customer_ID       1000 non-null   int64
 1   Date_of_Purchase  1000 non-null   datetime64[ns]
 2   Purchase_Amount   1000 non-null   float64
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 23.6 KB
[18]:
df.isna().sum()

[18]:
Customer_ID         0
Date_of_Purchase    0
Purchase_Amount     0
dtype: int64
[19]:
df.describe()

[19]:
       Customer_ID  Purchase_Amount
count  1000.000000      1000.000000
mean     50.397000       262.852416
std      29.136065       138.296137
min       1.000000        20.089623
25%      26.000000       140.998710
50%      50.000000       264.457078
75%      76.000000       385.929854
max    100.000000       499.929837
[20]:
df['Customer_ID'].value_counts().sort_index(ascending=True)

[20]:
1      18
2       7
3       7
4      15
5      10
       ..
96      5
97      8
98      8
99      9
100    11
Name: Customer_ID, Length: 100, dtype: int64
[21]:
df['Date_of_Purchase'].min()

[21]: Timestamp('2021-12-09 04:38:58.163613')
[22]:
df['Date_of_Purchase'].max()

[22]: Timestamp('2023-12-08 04:38:58.163613')
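Taken together, the minimum and maximum dates confirm the purchases span roughly the two-year window the generator was built for. A quick check of the exact span, sketched here as an illustrative addition (not an executed cell of the original notebook):

# Sketch: width of the purchase window in days (expected to be close to 730)
span = df['Date_of_Purchase'].max() - df['Date_of_Purchase'].min()
print(span.days)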
Feature Engineering
[23]:
latest_date = datetime.now()

rfm_table = df.groupby('Customer_ID').agg({
    'Date_of_Purchase': lambda v: (latest_date - v.max()).days,
    'Customer_ID': 'count',
    'Purchase_Amount': 'sum'
})
rfm_table
[23]:
             Date_of_Purchase  Customer_ID  Purchase_Amount
Customer_ID
1                         129           18      4469.707768
2                           6            7      1615.129002
3                          20            7      1777.182084
4                          53           15      3126.994941
5                          85           10      2853.746454
...                       ...          ...              ...
96                        259            5      1011.771606
97                         86            8      2317.176873
98                        208            8      1793.294517
99                          5            9      2446.570512
100                        77           11      2642.239188

[100 rows x 3 columns]
[24]:
rfm_table.rename(columns={
    'Date_of_Purchase': 'Recency',
    'Customer_ID': 'Frequency',
    'Purchase_Amount': 'Monetary'
}, inplace=True)
rfm_table
[24]:
             Recency  Frequency     Monetary
Customer_ID
1                129         18  4469.707768
2                  6          7  1615.129002
3                 20          7  1777.182084
4                 53         15  3126.994941
5                 85         10  2853.746454
...              ...        ...          ...
96               259          5  1011.771606
97                86          8  2317.176873
98               208          8  1793.294517
99                 5          9  2446.570512
100               77         11  2642.239188

[100 rows x 3 columns]
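For reference, the same Recency/Frequency/Monetary table can be built in a single step with pandas named aggregation, which avoids reusing the Customer_ID key inside agg and the separate rename call. This is only a sketch, not part of the original notebook: rfm_alt and snapshot are illustrative names, and the reference date is assumed to be datetime.now(), as in cell [23].

# Sketch: equivalent RFM construction via named aggregation, reusing df from cell [16]
snapshot = datetime.now()  # assumed reference date, as in cell [23]
rfm_alt = df.groupby('Customer_ID').agg(
    Recency=('Date_of_Purchase', lambda v: (snapshot - v.max()).days),
    Frequency=('Date_of_Purchase', 'count'),
    Monetary=('Purchase_Amount', 'sum'),
)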
[25]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm_table[['Recency', 'Frequency', 'Monetary']])
rfm_scaled = pd.DataFrame(rfm_scaled, columns=['Recency', 'Frequency', 'Monetary'])
rfm_scaled.head(10)
[25]:
    Recency  Frequency  Monetary
0  0.613496   2.871833  2.284111
1 -0.982528  -1.076937 -1.257184
2 -0.800867  -1.076937 -1.056147
3 -0.372665   1.794895  0.618386
4  0.042561   0.000000  0.279403
5 -0.139101   0.717958  0.038605
6  1.145505   0.358979  0.374454
7 -1.047407   0.717958  1.098450
8  0.885988  -0.717958 -1.257697
9 -0.255883   0.000000  0.170944
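One point worth noting: rebuilding rfm_scaled as a plain DataFrame drops the Customer_ID index (the rows become 0-99), so cluster labels found later can only be matched back to customers by row position. A minimal sketch of an alternative that keeps the index, assuming the same scaler and rfm_table as above:

# Sketch: keep Customer_ID as the index so cluster labels can be joined
# back to customers by ID rather than by row position
rfm_scaled = pd.DataFrame(
    scaler.fit_transform(rfm_table[['Recency', 'Frequency', 'Monetary']]),
    columns=['Recency', 'Frequency', 'Monetary'],
    index=rfm_table.index,
)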
Customer Segmentation
[26]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(rfm_scaled)
    sse.append(kmeans.inertia_)
[27]:
plt.plot(range(1, 11), sse, marker='o')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Sum of Squared Distances (SSE)')
plt.show()
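The elbow plot looks for the value of K where the drop in SSE (the within-cluster sum of squared distances) starts to level off. A complementary check, sketched below as an illustrative addition rather than part of the original notebook, is to compute the average silhouette score over the same range of K and pick the value where it peaks; it assumes rfm_scaled still holds only the three scaled RFM columns.

# Sketch: silhouette score across candidate K values (silhouette needs K >= 2)
from sklearn.metrics import silhouette_score

for k in range(2, 11):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(rfm_scaled)
    print(k, silhouette_score(rfm_scaled, labels))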
[29]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

k = 3
kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
y_kmeans = kmeans.fit_predict(rfm_scaled)

silhouette_avg = silhouette_score(rfm_scaled, y_kmeans)
print("For", k, "clusters average silhouette_score is:", silhouette_avg)

rfm_scaled['Clusters'] = y_kmeans
For 3 clusters average silhouette_score is: 0.45024274127470876
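To interpret what the three clusters mean in business terms, it helps to profile them on the original (unscaled) RFM values. The sketch below is an illustrative addition, not part of the original notebook; it assumes the rows of rfm_table and rfm_scaled are still in the same order, so y_kmeans can be attached to rfm_table directly.

# Sketch: average Recency/Frequency/Monetary and customer count per cluster
rfm_table['Cluster'] = y_kmeans
profile = rfm_table.groupby('Cluster').agg(
    Recency=('Recency', 'mean'),
    Frequency=('Frequency', 'mean'),
    Monetary=('Monetary', 'mean'),
    Customers=('Recency', 'size'),
)
print(profile)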
Visualization
[30]:
recency = rfm_scaled['Recency']
frequency = rfm_scaled['Frequency']

plt.figure(figsize=(10, 6))
plt.title('Customer Loyalty Clusters')
plt.scatter(rfm_scaled[y_kmeans == 0]['Recency'], rfm_scaled[y_kmeans == 0]['Frequency'],
            s=40, c='cornflowerblue', alpha=0.8, label='Cluster 1')
plt.scatter(rfm_scaled[y_kmeans == 1]['Recency'], rfm_scaled[y_kmeans == 1]['Frequency'],
            s=40, c='orange', alpha=0.8, label='Cluster 2')
plt.scatter(rfm_scaled[y_kmeans == 2]['Recency'], rfm_scaled[y_kmeans == 2]['Frequency'],
            s=40, c='forestgreen', alpha=0.8, label='Cluster 3')
plt.legend()
plt.xlabel('Scaled Recency')
plt.ylabel('Scaled Frequency')
plt.show()
[31]:
plt.figure(figsize=(10, 6))
plt.title('Customer Segments (RFM Clusters)')
plt.scatter(rfm_scaled[y_kmeans == 0]['Recency'], rfm_scaled[y_kmeans == 0]['Monetary'],
            s=40, c='cornflowerblue', alpha=0.8, label='Cluster 1')
plt.scatter(rfm_scaled[y_kmeans == 1]['Recency'], rfm_scaled[y_kmeans == 1]['Monetary'],
            s=40, c='orange', alpha=0.8, label='Cluster 2')
plt.scatter(rfm_scaled[y_kmeans == 2]['Recency'], rfm_scaled[y_kmeans == 2]['Monetary'],
            s=40, c='forestgreen', alpha=0.8, label='Cluster 3')
plt.legend()
plt.xlabel('Scaled Recency')
plt.ylabel('Scaled Monetary')
plt.show()
[32]:
plt.figure(figsize=(10, 6))
plt.title('Customer Segments (RFM Clusters)')
plt.scatter(rfm_scaled[y_kmeans == 0]['Frequency'], rfm_scaled[y_kmeans == 0]['Monetary'],
            s=50, c='orange', alpha=0.8, label='Cluster 1')
plt.scatter(rfm_scaled[y_kmeans == 1]['Frequency'], rfm_scaled[y_kmeans == 1]['Monetary'],
            s=50, c='cornflowerblue', alpha=0.8, label='Cluster 2')
plt.scatter(rfm_scaled[y_kmeans == 2]['Frequency'], rfm_scaled[y_kmeans == 2]['Monetary'],
            s=50, c='forestgreen', alpha=0.8, label='Cluster 3')
plt.legend()
plt.xlabel('Scaled Frequency')
plt.ylabel('Scaled Monetary')
plt.show()
[33]:
fig = plt.figure(figsize=(16, 14))
ax = fig.add_subplot(111, projection='3d')

cluster_colors = {0: 'cornflowerblue', 1: 'orange', 2: 'green'}

for cluster in rfm_scaled['Clusters'].unique():
    cluster_data = rfm_scaled[rfm_scaled['Clusters'] == cluster]
    ax.scatter(cluster_data['Recency'], cluster_data['Frequency'], cluster_data['Monetary'],
               label=f'Cluster {cluster + 1}', alpha=0.6, c=cluster_colors[cluster])

ax.set_xlabel('Scaled Recency')
ax.set_ylabel('Scaled Frequency')
ax.set_zlabel('Scaled Monetary')
ax.set_title('3D Clustering')
ax.legend()
plt.show()