IE6400_Quiz18_Day 21
pdf
keyboard_arrow_up
School
Northeastern University *
*We aren’t endorsed by this school
Course
6400
Subject
Industrial Engineering
Date
Feb 20, 2024
Type
Pages
1
Uploaded by ColonelStraw13148
import
pandas as
pd
import
numpy as
np
from
datetime import
datetime
, timedelta
import
matplotlib.pyplot as
plt
# Define the parameters for the synthetic data
start_date =
datetime
(
2012
, 1
, 1
)
end_date =
datetime
(
2022
, 12
, 31
) # Updated end date to December 2022
num_days =
(
end_date -
start_date
)
.
days +
1
base_sales =
10000 # Average daily sales
sales_std_dev =
1000 # Standard deviation for sales variation
# Generate synthetic daily sales data
dates =
[
start_date +
timedelta
(
days
=
x
) for
x in
range
(
num_days
)]
sales =
np
.
random
.
normal
(
base_sales
, sales_std_dev
, num_days
)
.
round
(
2
)
sales =
np
.
where
(
sales <
0
, 0
, sales
) # Ensure sales are non-negative
# Create a DataFrame for the daily sales data
sales_data =
pd
.
DataFrame
({
'Date'
: dates
, 'Total Revenue'
: sales
})
# Plotting the time series
plt
.
figure
(
figsize
=
(
12
, 6
))
plt
.
plot
(
sales_data
[
'Date'
], sales_data
[
'Total Revenue'
], label
=
'Total Revenue'
)
plt
.
title
(
'Time Series Plot of Daily Sales Data'
)
plt
.
xlabel
(
'Date'
)
plt
.
ylabel
(
'Total Revenue'
)
plt
.
grid
(
True
)
plt
.
legend
()
plt
.
show
()
sales_data
.
head
()
Date
Total Revenue
0
2012-01-01
10805.14
1
2012-01-02
9779.27
2
2012-01-03
10554.37
3
2012-01-04
9587.57
4
2012-01-05
11120.40
Data Transformation
# Convert the 'Date' column to datetime format
sales_data
[
'Date'
] =
pd
.
to_datetime
(
sales_data
[
'Date'
])
# Extract year and month from the 'Date' column
sales_data
[
'Year'
] =
sales_data
[
'Date'
]
.
dt
.
year
sales_data
[
'Month'
] =
sales_data
[
'Date'
]
.
dt
.
month
# Aggregate daily sales to monthly sales
monthly_sales =
sales_data
.
groupby
([
'Year'
, 'Month'
], as_index
=
False
)[
'Total Revenue'
]
.
sum
()
# Feature engineering: add a column for the month number (1 to 120)
monthly_sales
[
'MonthNumber'
] =
(
monthly_sales
[
'Year'
] -
2012
) *
12 +
monthly_sales
[
'Month'
]
monthly_sales
Year
Month
Total Revenue
MonthNumber
0
2012
1
317708.47
1
1
2012
2
286907.41
2
2
2012
3
303435.68
3
3
2012
4
305924.18
4
4
2012
5
311490.48
5
...
...
...
...
...
127
2022
8
310212.28
128
128
2022
9
295225.62
129
129
2022
10
309116.39
130
130
2022
11
293850.55
131
131
2022
12
315075.06
132
132 rows ×
4 columns
Model Development and Comparison:
Machine Learning Mode
# import nessary library
from
sklearn.model_selection import
train_test_split
from
sklearn.neural_network import
MLPRegressor
from
sklearn.ensemble import
RandomForestRegressor
from
sklearn.tree import
DecisionTreeRegressor
from
sklearn.metrics import
mean_squared_error
, mean_absolute_error
from
statsmodels.tsa.arima.model import
ARIMA
import
warnings
# Train-test split
train
, test =
train_test_split
(
monthly_sales
, test_size
=
0.2
, shuffle
=
False
)
# Neural Network model
nn_model =
MLPRegressor
(
hidden_layer_sizes
=
(
100
,), max_iter
=
1000
, random_state
=
42
)
nn_model
.
fit
(
train
[[
'MonthNumber'
]], train
[
'Total Revenue'
])
nn_predictions =
nn_model
.
predict
(
test
[[
'MonthNumber'
]])
nn_rmse =
np
.
sqrt
(
mean_squared_error
(
test
[
'Total Revenue'
], nn_predictions
))
nn_mae =
mean_absolute_error
(
test
[
'Total Revenue'
], nn_predictions
)
# Random Forest model
rf_model =
RandomForestRegressor
(
n_estimators
=
100
, random_state
=
42
)
rf_model
.
fit
(
train
[[
'MonthNumber'
]], train
[
'Total Revenue'
])
rf_predictions =
rf_model
.
predict
(
test
[[
'MonthNumber'
]])
rf_rmse =
np
.
sqrt
(
mean_squared_error
(
test
[
'Total Revenue'
], rf_predictions
))
rf_mae =
mean_absolute_error
(
test
[
'Total Revenue'
], rf_predictions
)
# Decision Tree model
dt_model =
DecisionTreeRegressor
(
random_state
=
42
)
dt_model
.
fit
(
train
[[
'MonthNumber'
]], train
[
'Total Revenue'
])
dt_predictions =
dt_model
.
predict
(
test
[[
'MonthNumber'
]])
dt_rmse =
np
.
sqrt
(
mean_squared_error
(
test
[
'Total Revenue'
], dt_predictions
))
dt_mae =
mean_absolute_error
(
test
[
'Total Revenue'
], dt_predictions
)
# ARIMA model
arima_model =
ARIMA
(
monthly_sales
[
'Total Revenue'
], order
=
(
5
, 1
, 0
))
arima_results =
arima_model
.
fit
()
arima_predictions =
arima_results
.
predict
(
start
=
len
(
train
), end
=
len
(
monthly_sales
)
-
1
, typ
=
'levels'
)
arima_rmse =
np
.
sqrt
(
mean_squared_error
(
test
[
'Total Revenue'
], arima_predictions
))
arima_mae =
mean_absolute_error
(
test
[
'Total Revenue'
], arima_predictions
)
/Users/skyleraliya/opt/anaconda3/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:686: ConvergenceWarning: Stochastic Optimizer: Maximum ite
rations (1000) reached and the optimization hasn't converged yet.
warnings.warn(
# Plotting Monthly Total Revenue Prediction for Next 6 Months
plt
.
figure
(
figsize
=
(
12
, 6
))
# Plot historical data
plt
.
plot
(
monthly_sales
[
'MonthNumber'
], monthly_sales
[
'Total Revenue'
], marker
=
'o'
, linestyle
=
'-'
, color
=
'green'
, label
=
'Historical Data'
, alpha
=
0.7
)
# Plot predictions for the next 6 months
future_months =
range
(
max
(
monthly_sales
[
'MonthNumber'
])
+
1
, max
(
monthly_sales
[
'MonthNumber'
])
+
7
)
future_months_data =
pd
.
DataFrame
({
'MonthNumber'
: future_months
})
# Neural Network predictions
nn_future_predictions =
nn_model
.
predict
(
future_months_data
[[
'MonthNumber'
]])
plt
.
plot
(
future_months
, nn_future_predictions
, label
=
'Neural Network Predictions'
)
# Random Forest predictions
rf_future_predictions =
rf_model
.
predict
(
future_months_data
[[
'MonthNumber'
]])
plt
.
plot
(
future_months
, rf_future_predictions
, label
=
'Random Forest Predictions'
)
# Decision Tree predictions
dt_future_predictions =
dt_model
.
predict
(
future_months_data
[[
'MonthNumber'
]])
plt
.
plot
(
future_months
, dt_future_predictions
, label
=
'Decision Tree Predictions'
)
# ARIMA predictions
arima_future_predictions =
arima_results
.
predict
(
start
=
len
(
monthly_sales
), end
=
len
(
monthly_sales
)
+
5
, typ
=
'levels'
)
plt
.
plot
(
future_months
, arima_future_predictions
, label
=
'ARIMA Predictions'
)
plt
.
title
(
'Monthly Total Revenue Prediction for Next 6 Months (Including Historical)'
)
plt
.
xlabel
(
'Month Number'
)
plt
.
ylabel
(
'Total Revenue'
)
plt
.
grid
(
True
)
plt
.
legend
()
plt
.
show
()
/Users/skyleraliya/opt/anaconda3/lib/python3.9/site-packages/statsmodels/tsa/statespace/representation.py:374: FutureWarning: Unknown keyword arguments: dict_keys(['ty
p']).Passing unknown keyword arguments will raise a TypeError beginning in version 0.15.
warnings.warn(msg, FutureWarning)
Comparison with ARIMA
# Print RMSE and MAE for each model
model_performance =
pd
.
DataFrame
({
'Model'
: [
'Neural Network'
, 'Random Forest'
, 'Decision Tree'
, 'ARIMA'
],
'RMSE'
: [
nn_rmse
, rf_rmse
, dt_rmse
, arima_rmse
],
'MAE'
: [
nn_mae
, rf_mae
, dt_mae
, arima_mae
]
})
# Sort the DataFrame based on RMSE in descending order
model_performance =
model_performance
.
sort_values
(
by
=
'RMSE'
, ascending
=
False
)
# Format values to keep two decimal points
model_performance
[
'RMSE'
] =
model_performance
[
'RMSE'
]
.
apply
(
lambda
x
: round
(
x
, 2
))
model_performance
[
'MAE'
] =
model_performance
[
'MAE'
]
.
apply
(
lambda
x
: round
(
x
, 2
))
print
(
'Model Performance Comparison'
)
print
(
model_performance
)
Model Performance Comparison
Model RMSE MAE
0 Neural Network 288687.98 288525.02
3 ARIMA 13564.39 10798.53
2 Decision Tree 12825.98 10978.91
1 Random Forest 10165.63 8094.33
# Determine the best model based on lowest RMSE and MAE
best_rmse_model =
model_performance
.
iloc
[
-
1
][
'Model'
]
best_mae_model =
model_performance
.
iloc
[
-
1
][
'Model'
]
# Print recommendations
print
(
f'Recommended Best Model:'
)
print
(
f'Based on RMSE: {
best_rmse_model
}'
)
print
(
f'Based on MAE: {
best_mae_model
}'
)
Recommended Best Model:
Based on RMSE: Random Forest
Based on MAE: Random Forest
# Monthly Total Revenue Prediction Comparison
recent_months =
monthly_sales
.
tail
(
12
) # Consider the last 12 months for comparison
plt
.
figure
(
figsize
=
(
12
, 6
))
# Plot actual revenue
plt
.
plot
(
recent_months
[
'MonthNumber'
], recent_months
[
'Total Revenue'
], marker
=
'o'
, linestyle
=
'-'
, label
=
'Actual Revenue'
)
# Plot predictions by different models
plt
.
plot
(
recent_months
[
'MonthNumber'
], nn_model
.
predict
(
recent_months
[[
'MonthNumber'
]]), label
=
'Neural Network Predictions'
)
plt
.
plot
(
recent_months
[
'MonthNumber'
], rf_model
.
predict
(
recent_months
[[
'MonthNumber'
]]), label
=
'Random Forest Predictions'
)
plt
.
plot
(
recent_months
[
'MonthNumber'
], dt_model
.
predict
(
recent_months
[[
'MonthNumber'
]]), label
=
'Decision Tree Predictions'
)
plt
.
plot
(
recent_months
[
'MonthNumber'
], arima_results
.
predict
(
start
=
len
(
monthly_sales
)
-
12
, end
=
len
(
monthly_sales
)
-
1
, typ
=
'levels'
), label
=
'ARIMA Predictions'
)
plt
.
title
(
'Monthly Total Revenue Prediction Comparison'
)
plt
.
xlabel
(
'Month Number'
)
plt
.
ylabel
(
'Total Revenue'
)
plt
.
grid
(
True
)
plt
.
legend
()
plt
.
show
()
In [1]:
14000
13000
-
12000
Total
Revenue
Time
Series
Plot
of
Daily
Sales
Data
Total
Revenue
11000
10000
9000
8000
7000
-
2012
2014
2016
2018
Date
2020
2022
Out[1]:
In [2]:
Out[2]:
In [3]:
In [4]:
In [5]:
In [6]:
In [7]:
Monthly
Total
Revenue
Prediction
Comparison
300000
-
250000
-
Actual
Revenue
Neural
Network
Predictions
Random
Forest
Predictions
-
Decision
Tree
Predictions
-
ARIMA
Predictions
150000
100000
-
50000
+
122
124
126
Month
Number
128
130
132
In [ ]:
Discover more documents: Sign up today!
Unlock a world of knowledge! Explore tailored content for a richer learning experience. Here's what you'll get:
- Access to all documents
- Unlimited textbook solutions
- 24/7 expert homework help