#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 14 13:37:58 2021

@author: sadrachpierre
"""

from math import sqrt

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pmdarima.arima import auto_arima
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller

# Load the passenger table and preview both ends of the raw data.
df = pd.read_csv("AirPassengers.csv")
print(df.head())
print(df.tail())

# Parse the 'Month' strings (YYYY-MM) into timestamps and promote that column
# to the index, so the frame can be plotted and sliced as a time series.
df['Month'] = pd.to_datetime(df['Month'], format='%Y-%m')
df = df.set_index('Month')
print(df.head())

# Optional quick look at the raw series:
# sns.lineplot(data=df)
# plt.ylabel("Number of Passengers")
# plt.show()

# Rolling mean and standard deviation over a fixed number of rows; the first
# ROLLING_WINDOW - 1 rows are NaN because their window is incomplete.
ROLLING_WINDOW = 7
rolling_mean = df.rolling(ROLLING_WINDOW).mean()
rolling_std = df.rolling(ROLLING_WINDOW).std()

# Overlay the raw series with both rolling statistics on a single figure.
plt.plot(df, color="blue", label="Original Passenger Data")
plt.plot(rolling_mean, color="red", label="Rolling Mean #Passenger")
plt.plot(rolling_std, color="black", label="Rolling Standard Deviation in #Passenger")
plt.title("Passenger Time Series, Rolling Mean, Standard Deviation")
plt.legend(loc="best")
plt.show()

# Augmented Dickey-Fuller test on the passenger series, with the lag order
# chosen by AIC. The column is passed explicitly so the test always receives
# one-dimensional input.
adft = adfuller(df['#Passengers'], autolag="AIC")

# Tabulate the leading entries of the result tuple; adft[4] is the mapping of
# critical values keyed by significance level.
output_df = pd.DataFrame({
    "Values": [
        adft[0], adft[1], adft[2], adft[3],
        adft[4]['1%'], adft[4]['5%'], adft[4]['10%'],
    ],
    "Metric": [
        "Test Statistics", "p-value", "No. of lags used",
        "Number of observations used",
        "critical value (1%)", "critical value (5%)", "critical value (10%)",
    ],
})
print(output_df)

# Autocorrelation of the passenger series with itself shifted by 1, 3, 6 and
# 9 rows (months, given the monthly index).
autocorrelation_lag1 = df['#Passengers'].autocorr(lag=1)
print("One Month Lag: ", autocorrelation_lag1)

autocorrelation_lag3 = df['#Passengers'].autocorr(lag=3)
print("Three Month Lag: ", autocorrelation_lag3)

autocorrelation_lag6 = df['#Passengers'].autocorr(lag=6)
print("Six Month Lag: ", autocorrelation_lag6)

autocorrelation_lag9 = df['#Passengers'].autocorr(lag=9)
print("Nine Month Lag: ", autocorrelation_lag9)

# Additive decomposition of the passenger series, plotted as one figure.
# NOTE(review): period=7 is kept from the original; the index is monthly, so
# a 12-row period may be what was intended for a yearly cycle - confirm.
decompose = seasonal_decompose(df['#Passengers'], model='additive', period=7)
decompose.plot()
plt.show()
# Expose the index as a regular column so rows can be filtered by date.
df['Date'] = df.index

# Chronological hold-out: rows before August 1960 train the model, rows from
# August 1960 onwards are kept for evaluation.
split_date = pd.to_datetime("1960-08", format='%Y-%m')

# Build each frame in one step with .loc + rename, so columns are never added
# to or deleted from a filtered slice of df (the SettingWithCopyWarning
# pattern). Each result has a single column, 'train' or 'test'.
train = df.loc[df['Date'] < split_date, ['#Passengers']].rename(
    columns={'#Passengers': 'train'}
)
test = df.loc[df['Date'] >= split_date, ['#Passengers']].rename(
    columns={'#Passengers': 'test'}
)

plt.plot(train, color="black")
plt.plot(test, color="red")
plt.title("Train/Test split for Passenger Data")
plt.ylabel("Passenger Number")
plt.xlabel('Year-Month')
# NOTE(review): sns.set() runs after the plotting calls above, as in the
# original; presumably it is meant to style the figures that follow - confirm.
sns.set()
plt.show()

# Search for an ARIMA specification on the training series; trace=True prints
# the candidates tried, and failed fits are ignored instead of raising.
model = auto_arima(train, trace=True, error_action='ignore', suppress_warnings=True)
# Explicit fit kept from the original flow.
# NOTE(review): auto_arima presumably already returns a fitted model - confirm
# whether this refit is needed.
model.fit(train)

# Forecast one value per held-out row.
forecast = model.predict(n_periods=len(test))
# Attach the forecast to the test dates by position. Converting to a plain
# list first means a prediction that carries its own index (some pmdarima
# versions may return a pandas Series - confirm) is not realigned by label,
# which could leave NaNs.
forecast = pd.DataFrame({'Prediction': list(forecast)}, index=test.index)

plt.plot(train, label='Train')
plt.plot(test, label='Test')
plt.plot(forecast, label='Prediction')
plt.title('#Passenger Prediction')
plt.xlabel('Date')
plt.ylabel('Actual #Passenger')
plt.legend(loc='upper left', fontsize=8)
plt.show()

# Root-mean-squared error of the forecast against the held-out observations.
# The original printed `rms` without ever defining it (NameError).
rms = sqrt(mean_squared_error(test, forecast))
print("RMSE: ", rms)