# Plot the last training year (2014) followed by the 2015 forecast for three
# of the bottom-level store/item series. `df` is the date-indexed training
# frame; `predictions` is the frame returned by `model.predict()`
# (presumably date-indexed as well, since it is sliced by year -- confirm).
# Each original line was a pair of statements fused together; they are
# separated here so the snippet parses.
df.loc['2014', 'store_1_item_2'].plot()
predictions.loc['2015', 'store_1_item_2'].plot()

df.loc['2014', 'store_2_item_1'].plot()
predictions.loc['2015', 'store_2_item_1'].plot()

df.loc['2014', 'store_2_item_2'].plot()
predictions.loc['2015', 'store_2_item_2'].plot()
_____________________________________________________________________
Complete code:
# imports
import pandas as pd
from pmdarima.preprocessing import FourierFeaturizer
import hts
from hts.hierarchy import HierarchyTree
from hts.model import AutoArimaModel
from hts import HTSRegressor


def _fourier_exog(frame, featurizer):
    """Build the Fourier exogenous features for every store/item series.

    For each (store, item) pair in the reduced 2x2 subset, the featurizer is
    fitted on that pair's ``sales`` series and the resulting Fourier columns
    are prefixed with ``store_<i>_item_<j>_``. All pairs are then joined
    column-wise in a single ``pd.concat`` call.

    Args:
        frame: long-format frame with ``store``, ``item`` and ``sales`` columns.
        featurizer: a ``FourierFeaturizer`` instance (re-fitted for each pair).

    Returns:
        A DataFrame holding the prefixed Fourier columns of all pairs.
    """
    parts = []
    for i in range(1, 3):
        for j in range(1, 3):
            _, exog = featurizer.fit_transform(
                frame.query(f'store == {i} and item == {j}').sales
            )
            exog.columns = [f'store_{i}_item_{j}_' + x for x in exog.columns]
            parts.append(exog)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(parts, axis=1)


# Read data from the csv file
data = pd.read_csv('train.csv', index_col='date', parse_dates=True)

# Train/Test split with reduced size (stores 1-2, items 1-2).
# The subset is selected once and then sliced by year.
subset = data.query('store == [1,2] and item == [1, 2]')
train_data = subset.loc['2013':'2014']
test_data = subset.loc['2015']

# Create the stores time series
# For each timestamp group by store and apply sum
stores_ts = train_data.drop(columns=['item']).groupby(['date', 'store']).sum()
stores_ts = stores_ts.unstack('store')
stores_ts.columns = stores_ts.columns.droplevel(0)
stores_ts.columns = [f'store_{i}' for i in stores_ts.columns]

# Create the items time series
# For each timestamp group by item and apply sum
items_ts = train_data.drop(columns=['store']).groupby(['date', 'item']).sum()
items_ts = items_ts.unstack('item')
items_ts.columns = items_ts.columns.droplevel(0)
items_ts.columns = [f'item_{i}' for i in items_ts.columns]

# Create the stores_items time series
# For each timestamp group by store AND by item and apply sum
store_item_ts = train_data.pivot_table(
    index='date', columns=['store', 'item'], aggfunc='sum'
)
# Drop the top column level so each column label is a (store, item) pair.
store_item_ts.columns = store_item_ts.columns.droplevel(0)

# Rename the columns as store_i_item_j
col_names = [f'store_{store}_item_{item}' for store, item in store_item_ts.columns]
store_item_ts.columns = col_names

# The root level of the hierarchy is the sum of all stores (or all items).
# Concatenate it with all created dataframes into one df;
# df is the dataframe that will be used for model training.
# `axis` is passed by keyword: recent pandas no longer accepts it positionally.
total_ts = stores_ts.sum(axis=1).to_frame('total')
df = pd.concat([total_ts, stores_ts, items_ts, store_item_ts], axis=1)

# Build fourier terms for train and test sets
four_terms = FourierFeaturizer(365.2, 1)

# Build the exogenous features dataframe for training data
exog_train_df = _fourier_exog(train_data, four_terms)
exog_train_df['date'] = df.index
exog_train_df.set_index('date', inplace=True)

# add the exogenous features dataframe to df before training
df = pd.concat([df, exog_train_df], axis=1)

# Build the exogenous features dataframe for test set
# It will be used only when using model.predict()
# NOTE(review): the featurizer is re-fitted on the test series here, so the
# Fourier terms presumably restart from the first time step instead of
# continuing from the end of the training period. Fitting on the training
# series and calling transform(..., n_periods=<horizon>) may be more
# appropriate -- confirm against the pmdarima documentation.
exog_test_df = _fourier_exog(test_data, four_terms)

# Build the hierarchy of the Grouped Time Series
stores = list(stores_ts.columns)
items = list(items_ts.columns)
store_items = col_names

# Exogenous features mapping
# The trailing/leading '_' delimiters keep e.g. 'store_1' from also matching
# 'store_10_...' (and 'item_1' from matching 'item_10') when more stores or
# items are added; for the reduced 2x2 subset the result is unchanged.
exog_cols = list(exog_train_df.columns)
exog_store_items = {
    e: [v for v in exog_cols if v.startswith(e + '_')] for e in store_items
}
exog_stores = {e: [v for v in exog_cols if v.startswith(e + '_')] for e in stores}
exog_items = {e: [v for v in exog_cols if f'_{e}_' in v] for e in items}
exog_total = {'total': [v for v in exog_cols if 'FOURIER' in v]}

# Merge all dictionaries
exogenous = {**exog_store_items, **exog_stores, **exog_items, **exog_total}

# Build hierarchy
total = {'total': stores + items}
store_h = {k: [v for v in store_items if v.startswith(k + '_')] for k in stores}
hierarchy = {**total, **store_h}

# Hierarchy tree automatically created by hts
ht = HierarchyTree.from_nodes(nodes=hierarchy, df=df, exogenous=exogenous)

# Instantiate the auto arima model using HTSRegressor
autoarima = HTSRegressor(
    model='auto_arima',
    D=1,
    m=7,
    seasonal=True,
    revision_method='OLS',
    n_jobs=12,
)

# Fit the model to the training df that includes time series and exog_train_df
# Set exogenous param to the previously built dictionary
model = autoarima.fit(df, hierarchy, exogenous=exogenous)

# Make predictions
# Set the exogenous_df param
predictions = model.predict(exogenous_df=exog_test_df, steps_ahead=365)
Other approaches I have thought of, and have already implemented successfully for a single series (store 1 and item 1, for example):
TBATS applied to each series independently inside a loop across all 500 time series
auto_arima (SARIMAX) with exogenous features (Fourier terms to handle the weekly and annual seasonalities), fitted to each series independently in a loop across all 500 time series
What do you think of these approaches? Do you have other suggestions on how to scale ARIMA to multiple time series?
I also want to try an LSTM, but I'm new to data science and deep learning and do not know how to prepare the data. Should I keep the data in its original (long) format and apply one-hot encoding to the train_data['store'] and train_data['item'] columns, or should I start from the df I ended up with here?
I hope this helps you fix the issue with the exogenous regressors. To handle negative forecasts, I would suggest trying a square-root transformation.