Focus: Time Series

Focus: Time Series#

ARIMA#

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
import statsmodels.tsa.api as smts
import statsmodels.tsa.stattools as smtst

from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.stats.diagnostic import acorr_ljungbox

from itertools import product
import tqdm

df = pd.read_csv('time_series.csv', index_col='Date', parse_dates=True)
df

	y
Date
2012-10-02	1938.0
2012-10-03	3521.0
2012-10-04	3475.0
2012-10-05	3148.0
2012-10-06	2006.0
...	...
2014-05-27	5032.0
2014-05-28	4008.0
2014-05-29	4587.0
2014-05-30	4869.0
2014-05-31	2887.0

607 rows × 1 columns

def tsplot(y, lags=None, figsize=(20, 10), title=None):
    fig = plt.figure(figsize=figsize)

    layout = (2, 2)
    ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
    acf_ax = plt.subplot2grid(layout, (1, 0))
    pacf_ax = plt.subplot2grid(layout, (1, 1))

    y.plot(ax=ts_ax)
    p_value = smts.adfuller(y)[1]
    if title is None:
        title = 'Time Series Analysis Plots'
    ts_ax.set_title('{0}\n Dickey-Fuller: p={1:.5f}'.format(title, p_value))
    
    smts.graphics.plot_acf(y, lags=lags, ax=acf_ax)
    smts.graphics.plot_pacf(y, lags=lags, ax=pacf_ax)
    plt.tight_layout()

tsplot(df, lags=60)

../../_images/d9dbe5eeccea0b01a4af8cb0ad7a7322fe7e05d6ecf74bd1682bb9608f941546.png

df_sdiff = (df - df.shift(7)).dropna()
tsplot(df_sdiff, lags=60)

../../_images/e16a4a38763486549addb2d09c5aa9a46f7d082ea8a98d6d072b2d5155135849.png

df_2diff = df_sdiff.diff().dropna()
tsplot(df_2diff, lags=60)

../../_images/630f5b31c46f7c1091669b42607b99fec876117c44f5a8244e3f6bc5050e165f.png

candidate_model = sm.tsa.statespace.SARIMAX(df, order=(0, 1, 6), seasonal_order=(0, 1, 1, 7)).fit()
candidate_model.summary()

C:\Users\silve\AppData\Roaming\jupyterlab-desktop\jlab_server\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency D will be used.
  self._init_dates(dates, freq)
C:\Users\silve\AppData\Roaming\jupyterlab-desktop\jlab_server\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency D will be used.
  self._init_dates(dates, freq)

SARIMAX Results
Dep. Variable:	y	No. Observations:	607
Model:	SARIMAX(0, 1, 6)x(0, 1, [1], 7)	Log Likelihood	-4671.220
Date:	Sun, 03 Dec 2023	AIC	9358.439
Time:	23:58:25	BIC	9393.601
Sample:	10-02-2012	HQIC	9372.128
	- 05-31-2014
Covariance Type:	opg

	coef	std err	z	P>\|z\|	[0.025	0.975]
ma.L1	-0.3710	0.031	-12.036	0.000	-0.431	-0.311
ma.L2	-0.2547	0.038	-6.753	0.000	-0.329	-0.181
ma.L3	-0.0418	0.034	-1.221	0.222	-0.109	0.025
ma.L4	-0.2354	0.035	-6.637	0.000	-0.305	-0.166
ma.L5	0.0191	0.037	0.514	0.607	-0.054	0.092
ma.L6	0.0523	0.035	1.501	0.133	-0.016	0.120
ma.S.L7	-0.9324	0.015	-61.301	0.000	-0.962	-0.903
sigma2	3.394e+05	1.35e+04	25.215	0.000	3.13e+05	3.66e+05

Ljung-Box (L1) (Q):	0.00	Jarque-Bera (JB):	219.02
Prob(Q):	0.99	Prob(JB):	0.00
Heteroskedasticity (H):	2.29	Skew:	-0.19
Prob(H) (two-sided):	0.00	Kurtosis:	5.94

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

candidate_model.plot_diagnostics(figsize=(20,10))
plt.show()

../../_images/55e075d1121f47ab8e604aa1cf1c0a2acdfd12af2fe269cc73ad397fdec02d2c.png

lags = 60
fig, axes = plt.subplots(2, 1, figsize=(20, 10))
smts.graphics.plot_acf(candidate_model.resid, lags=lags, ax=axes[0])
smts.graphics.plot_pacf(candidate_model.resid, lags=lags, ax=axes[1])
plt.show()

../../_images/62879c34af64bdd0147e5637ff4cdb12972a9ebabef493ac590100106e29ee2a.png

acorr_ljungbox(candidate_model.resid, lags=[2*7], return_df=True, model_df=6)  # model_df = p + q

	lb_stat	lb_pvalue
14	15.647813	0.047708

candidate_model2 = sm.tsa.statespace.SARIMAX(df, order=(0, 1, 4), seasonal_order=(0, 1, 1, 7)).fit()
candidate_model2.summary()

C:\Users\silve\AppData\Roaming\jupyterlab-desktop\jlab_server\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency D will be used.
  self._init_dates(dates, freq)
C:\Users\silve\AppData\Roaming\jupyterlab-desktop\jlab_server\lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency D will be used.
  self._init_dates(dates, freq)

SARIMAX Results
Dep. Variable:	y	No. Observations:	607
Model:	SARIMAX(0, 1, 4)x(0, 1, [1], 7)	Log Likelihood	-4672.557
Date:	Sun, 03 Dec 2023	AIC	9357.114
Time:	23:58:28	BIC	9383.485
Sample:	10-02-2012	HQIC	9367.381
	- 05-31-2014
Covariance Type:	opg

	coef	std err	z	P>\|z\|	[0.025	0.975]
ma.L1	-0.3604	0.030	-12.003	0.000	-0.419	-0.302
ma.L2	-0.2415	0.036	-6.634	0.000	-0.313	-0.170
ma.L3	-0.0366	0.031	-1.196	0.232	-0.097	0.023
ma.L4	-0.2121	0.032	-6.630	0.000	-0.275	-0.149
ma.S.L7	-0.9160	0.015	-59.577	0.000	-0.946	-0.886
sigma2	3.416e+05	1.34e+04	25.578	0.000	3.15e+05	3.68e+05

Ljung-Box (L1) (Q):	0.07	Jarque-Bera (JB):	229.44
Prob(Q):	0.79	Prob(JB):	0.00
Heteroskedasticity (H):	2.30	Skew:	-0.20
Prob(H) (two-sided):	0.00	Kurtosis:	6.00

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

candidate_model2.plot_diagnostics(figsize=(20,10))
plt.show()

../../_images/efceaea7f3f874e5ad96b3972245dfe52657b6e183a64f50621a4c756297cf22.png

lags = 60
fig, axes = plt.subplots(2, 1, figsize=(20, 10))
smts.graphics.plot_acf(candidate_model2.resid, lags=lags, ax=axes[0])
smts.graphics.plot_pacf(candidate_model2.resid, lags=lags, ax=axes[1])
plt.show()

../../_images/566a21bc67288194cf9d81fb86087a5828a48d947803c55140aa6df4eec87558.png

acorr_ljungbox(candidate_model.resid2, lags=[2*7], return_df=True, model_df=4)  # model_df = p + q

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[13], line 1
----> 1 acorr_ljungbox(candidate_model.resid2, lags=[2*7], return_df=True, model_df=4)  # model_df = p + q

File ~\AppData\Roaming\jupyterlab-desktop\jlab_server\lib\site-packages\statsmodels\base\wrapper.py:34, in ResultsWrapper.__getattribute__(self, attr)
     31 except AttributeError:
     32     pass
---> 34 obj = getattr(results, attr)
     35 data = results.model.data
     36 how = self._wrap_attrs.get(attr)

AttributeError: 'SARIMAXResults' object has no attribute 'resid2'

ps = range(0, 2)
qs = range(3, 6)

p = 0
d = 1 
q = 4

Ps = range(0, 2)
Qs = range(0, 3)

P = 0
D = 1 
Q = 1
s = 7

parameters = product(ps, qs, Ps, Qs)
parameters_list = list(parameters)
len(parameters_list)

def gs_arima(series, parameters_list, d, D, s, opt_method='powell'):

    results = []
    models = {}
    best_aic = float("inf")

    for param in parameters_list:
        # we need try-except because on some combinations model might fail to converge
        try:
            model = sm.tsa.statespace.SARIMAX(
                series, 
                order=(param[0], d, param[1]), 
                seasonal_order=(param[2], D, param[3], s)).fit(method=opt_method, disp=False)
        except:
            continue

        aic = model.aic
        if aic < best_aic:
            best_model = model
            best_aic = aic
            best_param = param
        results.append([param, model.aic])

    result_table = pd.DataFrame(results)
    result_table.columns = ['parameters', 'aic']
    result_table = result_table.sort_values(by='aic', ascending=True).reset_index(drop=True)
    
    return result_table, best_model

result_table, best_model = gs_arima(df, parameters_list, d, D, s)

result_table.head(10)

best_model.summary()

def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def plot_predictions(df, model, n_steps):
    """
        Plots model vs predicted values
        
        series - dataset with timeseries
        model - fitted model
        n_steps - number of steps to predict in the future
    """
    data = df.copy()
    data.columns = ['actual']
    data['arima_model'] = model.fittedvalues

    # forecast values
    data['arima_model'][:s+d] = np.NaN
    forecast = data['arima_model'].append(model.forecast(n_steps))

    # calculate error, again having shifted on s+d steps from the beginning
    error = mean_absolute_percentage_error(data['actual'][s+d:-n_steps], data['arima_model'][s+d:-n_steps])

    plt.figure(figsize=(20, 10))
    plt.title("Mean Absolute Percentage Error: {0:.2f}%".format(error))
    plt.plot(forecast, color='r', label="model")
    plt.axvspan(data.index[-1], forecast.index[-1], alpha=0.5, color='lightgrey')
    plt.plot(data.actual, label="actual", alpha=0.8)
    plt.legend()
    plt.grid(True)

plot_predictions(pd.DataFrame(series), best_model, 60)

Focus: Time Series

Contents

Focus: Time Series#

ARIMA#