The goal of this notebook is to develop and compare different approaches to time-series problems.
Content:
The content here was inspired by the machinelearningmastery.com article How to Get Started with Deep Learning for Time Series Forecasting (7-Day Mini-Course).
Dependencies
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras import optimizers
from keras.utils import plot_model
from keras.models import Sequential, Model
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Flatten
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
%matplotlib inline
warnings.filterwarnings("ignore")
init_notebook_mode(connected=True)
# Set seeds to make the experiment more reproducible.
from tensorflow import set_random_seed
from numpy.random import seed
set_random_seed(1)
seed(1)
Loading data:
train = pd.read_csv('../input/demand-forecasting-kernels-only/train.csv', parse_dates=['date'])
test = pd.read_csv('../input/demand-forecasting-kernels-only/test.csv', parse_dates=['date'])
Train set:
train.describe()

train.head()

Time period of the train dataset:
print('Min date from train set: %s' % train['date'].min().date())
print('Max date from train set: %s' % train['date'].max().date())

Let's find out the time gap between the last day of the training set and the last day of the test set; this will be our lag (the number of days that need to be forecast).
lag_size = (test['date'].max().date() - train['date'].max().date()).days
print('Max date from train set: %s' % train['date'].max().date())
print('Max date from test set: %s' % test['date'].max().date())
print('Forecast lag size', lag_size)

Basic EDA:
To explore the time series data, we first need to aggregate the sales by day.
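The plotting cells below reference daily_sales, store_daily_sales and item_daily_sales, which are not defined above; a minimal sketch of the aggregations they assume (total sales per day, per store-day and per item-day):
# Assumed aggregations used by the plots below
daily_sales = train.groupby('date', as_index=False)['sales'].sum()
store_daily_sales = train.groupby(['store', 'date'], as_index=False)['sales'].sum()
item_daily_sales = train.groupby(['item', 'date'], as_index=False)['sales'].sum()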
Overall daily sales
daily_sales_sc = go.Scatter(x=daily_sales['date'], y=daily_sales['sales'])
layout = go.Layout(title='Daily sales', xaxis=dict(title='Date'), yaxis=dict(title='Sales'))
fig = go.Figure(data=[daily_sales_sc], layout=layout)
iplot(fig)

Daily sales by store:
store_daily_sales_sc = []
for store in store_daily_sales['store'].unique():
    current_store_daily_sales = store_daily_sales[store_daily_sales['store'] == store]
    store_daily_sales_sc.append(go.Scatter(x=current_store_daily_sales['date'], y=current_store_daily_sales['sales'], name=('Store %s' % store)))
layout = go.Layout(title='Store daily sales', xaxis=dict(title='Date'), yaxis=dict(title='Sales'))
fig = go.Figure(data=store_daily_sales_sc, layout=layout)
iplot(fig)

Daily sales by item:
item_daily_sales_sc = []
for item in item_daily_sales['item'].unique():
    current_item_daily_sales = item_daily_sales[item_daily_sales['item'] == item]
    item_daily_sales_sc.append(go.Scatter(x=current_item_daily_sales['date'], y=current_item_daily_sales['sales'], name=('Item %s' % item)))
layout = go.Layout(title='Item daily sales', xaxis=dict(title='Date'), yaxis=dict(title='Sales'))
fig = go.Figure(data=item_daily_sales_sc, layout=layout)
iplot(fig)

Sub-sample train set to get only the last year of data and reduce training time
train = train[(train['date'] >= '2017-01-01')]
Rearrange dataset so we can apply shift methods
train_gp = train.sort_values('date').groupby(['item', 'store', 'date'], as_index=False)
train_gp = train_gp.agg({'sales':['mean']})
train_gp.columns = ['item', 'store', 'date', 'sales']
train_gp.head()

Transform the data into a supervised learning problem using a sliding window
def series_to_supervised(data, window=1, lag=1, dropnan=True):
    cols, names = list(), list()
    # Input sequence (t-n, ... t-1)
    for i in range(window, 0, -1):
        cols.append(data.shift(i))
        names += [('%s(t-%d)' % (col, i)) for col in data.columns]
    # Current timestep (t=0)
    cols.append(data)
    names += [('%s(t)' % (col)) for col in data.columns]
    # Target timestep (t=lag)
    cols.append(data.shift(-lag))
    names += [('%s(t+%d)' % (col, lag)) for col in data.columns]
    # Put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # Drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg
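As a quick illustration of the column naming, a hypothetical single-column toy frame (not part of the dataset) with window=2 and lag=1 yields the columns sales(t-2), sales(t-1), sales(t) and sales(t+1):
# Toy example (hypothetical data) showing the generated column names
toy = pd.DataFrame({'sales': [10, 20, 30, 40, 50]})
series_to_supervised(toy, window=2, lag=1)
# -> two remaining rows with columns sales(t-2), sales(t-1), sales(t), sales(t+1); rows containing NaN are dropped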
We will use the current timestep and the previous 29 (a 30-day window) to forecast 90 days ahead.
window = 29
lag = lag_size
series = series_to_supervised(train_gp.drop('date', axis=1), window=window, lag=lag)
series.head()

Drop rows where the item or store values differ from those in the shifted columns.
last_item = 'item(t-%d)' % window
last_store = 'store(t-%d)' % window
series = series[(series['store(t)'] == series[last_store])]
series = series[(series['item(t)'] == series[last_item])]
Remove unwanted columns
columns_to_drop = [('%s(t+%d)' % (col, lag)) for col in ['item', 'store']]
for i in range(window, 0, -1):
    columns_to_drop += [('%s(t-%d)' % (col, i)) for col in ['item', 'store']]
series.drop(columns_to_drop, axis=1, inplace=True)
series.drop(['item(t)', 'store(t)'], axis=1, inplace=True)
Train/validation split:
labels_col = 'sales(t+%d)' % lag_size
labels = series[labels_col]
series = series.drop(labels_col, axis=1)
X_train, X_valid, Y_train, Y_valid = train_test_split(series, labels.values, test_size=0.4, random_state=0)
print('Train set shape', X_train.shape)
print('Validation set shape', X_valid.shape)
X_train.head()

MLP for Time Series Forecasting
- First we will use a Multilayer Perceptron (MLP) model; here the model has a number of input features equal to the window size.
- The limitation of MLP models is that they don't treat the input as sequenced data: the model simply receives a flat vector of values, so it cannot exploit the sequential patterns present in the data.
- Input shape [samples, timesteps].
epochs = 40
batch = 256
lr = 0.0003
adam = optimizers.Adam(lr)
model_mlp = Sequential()
model_mlp.add(Dense(100, activation='relu', input_dim=X_train.shape[1]))
model_mlp.add(Dense(1))
model_mlp.compile(loss='mse', optimizer=adam)
model_mlp.summary()

mlp_history = model_mlp.fit(X_train.values, Y_train, validation_data=(X_valid.values, Y_valid), epochs=epochs, verbose=2)
WARNING:tensorflow:From C:\Users\ved.prakash\.conda\envs\Developer\lib\site-packages\keras\backend\tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead. Train on 539046 samples, validate on 359364 samples Epoch 1/40 - 23s - loss: 296.7964 - val_loss: 283.0145 Epoch 2/40 - 21s - loss: 285.0671 - val_loss: 281.7296 Epoch 3/40 - 20s - loss: 283.0100 - val_loss: 281.4281 Epoch 4/40 - 23s - loss: 281.6383 - val_loss: 278.6470 Epoch 5/40 - 24s - loss: 280.6746 - val_loss: 279.0183 Epoch 6/40 - 24s - loss: 279.6116 - val_loss: 277.7753 Epoch 7/40 - 23s - loss: 278.7575 - val_loss: 277.3698 Epoch 8/40 - 24s - loss: 277.7416 - val_loss: 276.1339 Epoch 9/40 - 23s - loss: 277.0546 - val_loss: 276.6886 Epoch 10/40 - 20s - loss: 276.3664 - val_loss: 276.8414 Epoch 11/40 - 20s - loss: 275.8846 - val_loss: 277.0113 Epoch 12/40 - 20s - loss: 275.3375 - val_loss: 277.3087 Epoch 13/40 - 20s - loss: 274.8365 - val_loss: 274.4823 Epoch 14/40 - 20s - loss: 274.6113 - val_loss: 281.3596 Epoch 15/40 - 21s - loss: 274.2429 - val_loss: 272.5593 Epoch 16/40 - 21s - loss: 273.9740 - val_loss: 273.3476 Epoch 17/40 - 23s - loss: 273.8451 - val_loss: 274.4691 Epoch 18/40 - 21s - loss: 273.4529 - val_loss: 275.8403 Epoch 19/40 - 20s - loss: 273.2054 - val_loss: 275.9277 Epoch 20/40 - 20s - loss: 272.8323 - val_loss: 276.0930 Epoch 21/40 - 20s - loss: 272.3891 - val_loss: 273.4565 Epoch 22/40 - 25s - loss: 272.0809 - val_loss: 271.7454 Epoch 23/40 - 21s - loss: 271.8350 - val_loss: 273.5631 Epoch 24/40 - 20s - loss: 271.3900 - val_loss: 271.0826 Epoch 25/40 - 20s - loss: 271.2118 - val_loss: 275.9031 Epoch 26/40 - 20s - loss: 270.7912 - val_loss: 271.3738 Epoch 27/40 - 20s - loss: 270.4341 - val_loss: 269.8823 Epoch 28/40 - 20s - loss: 270.1743 - val_loss: 271.0881 Epoch 29/40 - 21s - loss: 270.0708 - val_loss: 277.2779 Epoch 30/40 - 21s - loss: 269.8993 - val_loss: 270.7083 Epoch 31/40 - 20s - loss: 269.5579 - val_loss: 269.4271 Epoch 32/40 - 20s - loss: 269.6041 - val_loss: 276.1689 Epoch 33/40 - 20s - loss: 269.2013 - val_loss: 270.7666 Epoch 34/40 - 19s - loss: 269.0003 - val_loss: 269.2294 Epoch 35/40 - 20s - loss: 268.7911 - val_loss: 281.9628 Epoch 36/40 - 20s - loss: 268.8397 - val_loss: 270.8018 Epoch 37/40 - 20s - loss: 268.7605 - val_loss: 267.7105 Epoch 38/40 - 20s - loss: 268.4983 - val_loss: 269.2251 Epoch 39/40 - 20s - loss: 268.3747 - val_loss: 267.3423 Epoch 40/40 - 19s - loss: 268.2942 - val_loss: 268.7007
CNN for Time Series Forecasting
- For the CNN model we will use one convolutional hidden layer followed by a max pooling layer. The filter maps are then flattened before being interpreted by a Dense layer and outputting a prediction.
- The convolutional layer should be able to identify patterns between the timesteps.
- Input shape [samples, timesteps, features].
Data preprocess
- Reshape from [samples, timesteps] into [samples, timesteps, features].
- This same reshaped data will be used on the CNN and the LSTM model.
X_train_series = X_train.values.reshape((X_train.shape[0], X_train.shape[1], 1))
X_valid_series = X_valid.values.reshape((X_valid.shape[0], X_valid.shape[1], 1))
print('Train set shape', X_train_series.shape)
print('Validation set shape', X_valid_series.shape)

model_cnn = Sequential()
model_cnn.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(X_train_series.shape[1], X_train_series.shape[2])))
model_cnn.add(MaxPooling1D(pool_size=2))
model_cnn.add(Flatten())
model_cnn.add(Dense(50, activation='relu'))
model_cnn.add(Dense(1))
model_cnn.compile(loss='mse', optimizer=adam)
model_cnn.summary()

cnn_history = model_cnn.fit(X_train_series, Y_train, validation_data=(X_valid_series, Y_valid), epochs=epochs, verbose=2)
Train on 539046 samples, validate on 359364 samples Epoch 1/40 - 61s - loss: 302.3749 - val_loss: 284.5457 Epoch 2/40 - 56s - loss: 286.2072 - val_loss: 280.9687 Epoch 3/40 - 57s - loss: 282.9365 - val_loss: 284.0360 Epoch 4/40 - 57s - loss: 281.0557 - val_loss: 277.8678 Epoch 5/40 - 56s - loss: 279.6068 - val_loss: 276.8600 Epoch 6/40 - 57s - loss: 278.6245 - val_loss: 294.1346 Epoch 7/40 - 58s - loss: 277.6902 - val_loss: 276.1326 Epoch 8/40 - 57s - loss: 277.0438 - val_loss: 276.3102 Epoch 9/40 - 55s - loss: 276.3717 - val_loss: 275.9277 Epoch 10/40 - 56s - loss: 276.0398 - val_loss: 274.7697 Epoch 11/40 - 56s - loss: 275.5340 - val_loss: 282.8980 Epoch 12/40 - 56s - loss: 275.2893 - val_loss: 275.1289 Epoch 13/40 - 56s - loss: 274.8561 - val_loss: 273.3157 Epoch 14/40 - 56s - loss: 274.5388 - val_loss: 274.9236 Epoch 15/40 - 55s - loss: 274.2379 - val_loss: 275.7810 Epoch 16/40 - 58s - loss: 274.1078 - val_loss: 275.8257 Epoch 17/40 - 55s - loss: 274.0082 - val_loss: 273.0019 Epoch 18/40 - 56s - loss: 273.7846 - val_loss: 277.5217 Epoch 19/40 - 56s - loss: 273.3980 - val_loss: 272.9790 Epoch 20/40 - 56s - loss: 273.4978 - val_loss: 273.4316 Epoch 21/40 - 55s - loss: 273.2888 - val_loss: 272.9668 Epoch 22/40 - 58s - loss: 273.1055 - val_loss: 273.3566 Epoch 23/40 - 55s - loss: 272.9974 - val_loss: 274.7178 Epoch 24/40 - 56s - loss: 272.9274 - val_loss: 273.1638 Epoch 25/40 - 56s - loss: 272.7718 - val_loss: 272.5034 Epoch 26/40 - 55s - loss: 272.7108 - val_loss: 273.3041 Epoch 27/40 - 60s - loss: 272.5352 - val_loss: 272.8699 Epoch 28/40 - 58s - loss: 272.3428 - val_loss: 273.9018 Epoch 29/40 - 57s - loss: 272.2864 - val_loss: 276.0303 Epoch 30/40 - 56s - loss: 272.2298 - val_loss: 273.3634 Epoch 31/40 - 55s - loss: 272.0477 - val_loss: 275.9128 Epoch 32/40 - 55s - loss: 272.0614 - val_loss: 273.4985 Epoch 33/40 - 55s - loss: 271.9201 - val_loss: 272.7883 Epoch 34/40 - 57s - loss: 271.8000 - val_loss: 272.0070 Epoch 35/40 - 55s - loss: 271.7214 - val_loss: 272.8003 Epoch 36/40 - 58s - loss: 271.6669 - val_loss: 272.3711 Epoch 37/40 - 56s - loss: 271.5468 - val_loss: 274.2162 Epoch 38/40 - 59s - loss: 271.5455 - val_loss: 271.4067 Epoch 39/40 - 55s - loss: 271.4137 - val_loss: 271.7298 Epoch 40/40 - 56s - loss: 271.4621 - val_loss: 274.7311
LSTM for Time Series Forecasting
- Now the LSTM model actually treats the input data as a sequence, so it's able to learn patterns from sequenced data (assuming such patterns exist) better than the previous models, especially patterns in long sequences.
- Input shape [samples, timesteps, features]
model_lstm = Sequential()
model_lstm.add(LSTM(50, activation='relu', input_shape=(X_train_series.shape[1], X_train_series.shape[2])))
model_lstm.add(Dense(1))
model_lstm.compile(loss='mse', optimizer=adam)
model_lstm.summary()

lstm_history = model_lstm.fit(X_train_series, Y_train, validation_data=(X_valid_series, Y_valid), epochs=epochs, verbose=2)
Train on 539046 samples, validate on 359364 samples Epoch 1/40 - 241s - loss: 296.4003 - val_loss: 282.5395 Epoch 2/40 - 243s - loss: 278.4952 - val_loss: 277.4245 Epoch 3/40 - 275s - loss: 274.7992 - val_loss: 276.7243 Epoch 4/40 - 271s - loss: 416.6561 - val_loss: 451.4692 Epoch 5/40 - 237s - loss: 386.2520 - val_loss: 435.2592 Epoch 6/40 - 227s - loss: 363.2775 - val_loss: 326.7852 Epoch 7/40 - 233s - loss: 537.2991 - val_loss: 387.1006 Epoch 8/40 - 232s - loss: 343.1943 - val_loss: 305.9990 Epoch 9/40 - 243s - loss: 412.9937 - val_loss: 1751.8770 Epoch 10/40 - 232s - loss: 598.1155 - val_loss: 469.5936 Epoch 11/40 - 238s - loss: 479.1226 - val_loss: 414.3998 Epoch 12/40 - 233s - loss: 477.7992 - val_loss: 399.0074 Epoch 13/40 - 233s - loss: 450.3913 - val_loss: 1069.6221 Epoch 14/40 - 238s - loss: 449.3031 - val_loss: 347.3966 Epoch 15/40 - 244s - loss: 345.5024 - val_loss: 352.9056 Epoch 16/40 - 222s - loss: 321.6109 - val_loss: 323.2200 Epoch 17/40 - 232s - loss: 610.2419 - val_loss: 643.8430 Epoch 18/40 - 223s - loss: 760.8084 - val_loss: 345.7041 Epoch 19/40 - 225s - loss: 345.3888 - val_loss: 324.8337 Epoch 20/40 - 232s - loss: 328.3242 - val_loss: 304.3258 Epoch 21/40 - 238s - loss: 822.0794 - val_loss: 407.5317 Epoch 22/40 - 249s - loss: 347.5258 - val_loss: 310.1085 Epoch 23/40 - 252s - loss: 583.8443 - val_loss: 458.1804 Epoch 24/40 - 264s - loss: 459.6971 - val_loss: 448.5026 Epoch 25/40 - 277s - loss: 411.1899 - val_loss: 355.0936 Epoch 26/40 - 272s - loss: 322.7635 - val_loss: 304.7395 Epoch 27/40 - 270s - loss: 336.1895 - val_loss: 319.1880 Epoch 28/40 - 296s - loss: 330.8041 - val_loss: 331.2426 Epoch 29/40 - 258s - loss: 389.2990 - val_loss: 376.6205 Epoch 30/40 - 252s - loss: 8299.6533 - val_loss: 2475.4134 Epoch 31/40 - 233s - loss: 939.6349 - val_loss: 355.5661 Epoch 32/40 - 215s - loss: 349.9022 - val_loss: 324.7493 Epoch 33/40 - 224s - loss: 322.6144 - val_loss: 327.9130 Epoch 34/40 - 216s - loss: 350.6495 - val_loss: 392.8342 Epoch 35/40 - 226s - loss: 451.0064 - val_loss: 395.6966 Epoch 36/40 - 299s - loss: 383.2123 - val_loss: 380.7600 Epoch 37/40 - 306s - loss: 1017.7759 - val_loss: 368.7438 Epoch 38/40 - 280s - loss: 385.0194 - val_loss: 337.9106 Epoch 39/40 - 279s - loss: 3379.1455 - val_loss: 432.0354 Epoch 40/40 - 303s - loss: 515.9205 - val_loss: 689.1181
CNN-LSTM for Time Series Forecasting
- Input shape [samples, subsequences, timesteps, features].
Model explanation from the article
“The benefit of this model is that the model can support very long input sequences that can be read as blocks or subsequences by the CNN model, then pieced together by the LSTM model.”
“When using a hybrid CNN-LSTM model, we will further divide each sample into further subsequences. The CNN model will interpret each sub-sequence and the LSTM will piece together the interpretations from the subsequences. As such, we will split each sample into 2 subsequences of 2 times per subsequence.”
“The CNN will be defined to expect 2 timesteps per subsequence with one feature. The entire CNN model is then wrapped in TimeDistributed wrapper layers so that it can be applied to each subsequence in the sample. The results are then interpreted by the LSTM layer before the model outputs a prediction.”
Data preprocess
- Reshape from [samples, timesteps, features] into [samples, subsequences, timesteps, features].
- In this notebook each sample has 30 timesteps (a window of 29 plus the current step), so we split it into 2 subsequences of 15 timesteps each.
subsequences = 2
timesteps = X_train_series.shape[1]//subsequences
X_train_series_sub = X_train_series.reshape((X_train_series.shape[0], subsequences, timesteps, 1))
X_valid_series_sub = X_valid_series.reshape((X_valid_series.shape[0], subsequences, timesteps, 1))
print('Train set shape', X_train_series_sub.shape)
print('Validation set shape', X_valid_series_sub.shape)

model_cnn_lstm = Sequential()
model_cnn_lstm.add(TimeDistributed(Conv1D(filters=64, kernel_size=1, activation='relu'), input_shape=(None, X_train_series_sub.shape[2], X_train_series_sub.shape[3])))
model_cnn_lstm.add(TimeDistributed(MaxPooling1D(pool_size=2)))
model_cnn_lstm.add(TimeDistributed(Flatten()))
model_cnn_lstm.add(LSTM(50, activation='relu'))
model_cnn_lstm.add(Dense(1))
model_cnn_lstm.compile(loss='mse', optimizer=adam)
cnn_lstm_history = model_cnn_lstm.fit(X_train_series_sub, Y_train, validation_data=(X_valid_series_sub, Y_valid), epochs=epochs, verbose=2)
Train on 539046 samples, validate on 359364 samples Epoch 1/40 - 94s - loss: 327.7718 - val_loss: 318.8454 Epoch 2/40 - 93s - loss: 315.8257 - val_loss: 315.3481 Epoch 3/40 - 99s - loss: 312.0692 - val_loss: 307.3203 Epoch 4/40 - 93s - loss: 306.4253 - val_loss: 300.9821 Epoch 5/40 - 94s - loss: 301.8997 - val_loss: 298.2752 Epoch 6/40 - 92s - loss: 298.2369 - val_loss: 294.4639 Epoch 7/40 - 96s - loss: 296.0242 - val_loss: 290.4033 Epoch 8/40 - 94s - loss: 289.8763 - val_loss: 288.0622 Epoch 9/40 - 93s - loss: 287.0455 - val_loss: 284.6490 Epoch 10/40 - 92s - loss: 285.2549 - val_loss: 282.7159 Epoch 11/40 - 92s - loss: 284.1178 - val_loss: 283.3195 Epoch 12/40 - 92s - loss: 283.0223 - val_loss: 283.8179 Epoch 13/40 - 95s - loss: 282.2192 - val_loss: 284.1431 Epoch 14/40 - 93s - loss: 281.5122 - val_loss: 280.4515 Epoch 15/40 - 94s - loss: 280.8412 - val_loss: 280.5496 Epoch 16/40 - 94s - loss: 280.3307 - val_loss: 281.6433 Epoch 17/40 - 101s - loss: 279.8033 - val_loss: 279.4845 Epoch 18/40 - 93s - loss: 279.3819 - val_loss: 279.8729 Epoch 19/40 - 98s - loss: 278.7890 - val_loss: 278.4162 Epoch 20/40 - 92s - loss: 278.3325 - val_loss: 278.4002 Epoch 21/40 - 93s - loss: 277.9252 - val_loss: 277.5195 Epoch 22/40 - 92s - loss: 277.4064 - val_loss: 276.6636 Epoch 23/40 - 92s - loss: 277.1679 - val_loss: 276.7318 Epoch 24/40 - 90s - loss: 276.8452 - val_loss: 276.5995 Epoch 25/40 - 90s - loss: 276.5479 - val_loss: 277.6941 Epoch 26/40 - 94s - loss: 276.3842 - val_loss: 276.8231 Epoch 27/40 - 90s - loss: 276.1422 - val_loss: 275.7310 Epoch 28/40 - 90s - loss: 275.9599 - val_loss: 275.7850 Epoch 29/40 - 90s - loss: 275.8070 - val_loss: 275.1646 Epoch 30/40 - 90s - loss: 275.7007 - val_loss: 275.0607 Epoch 31/40 - 90s - loss: 275.4211 - val_loss: 276.5081 Epoch 32/40 - 89s - loss: 275.3506 - val_loss: 274.6546 Epoch 33/40 - 91s - loss: 275.3072 - val_loss: 276.0577 Epoch 34/40 - 90s - loss: 274.9943 - val_loss: 275.7494 Epoch 35/40 - 90s - loss: 274.9158 - val_loss: 275.0233 Epoch 36/40 - 90s - loss: 274.8942 - val_loss: 278.8588 Epoch 37/40 - 91s - loss: 274.7746 - val_loss: 274.5011 Epoch 38/40 - 94s - loss: 274.6655 - val_loss: 274.3602 Epoch 39/40 - 103s - loss: 274.5718 - val_loss: 274.7006 Epoch 40/40 - 105s - loss: 274.5007 - val_loss: 275.3394
Comparing models
fig, axes = plt.subplots(2, 2, sharex=True, sharey=True,figsize=(22,12))
ax1, ax2 = axes[0]
ax3, ax4 = axes[1]
ax1.plot(mlp_history.history['loss'], label='Train loss')
ax1.plot(mlp_history.history['val_loss'], label='Validation loss')
ax1.legend(loc='best')
ax1.set_title('MLP')
ax1.set_xlabel('Epochs')
ax1.set_ylabel('MSE')
ax2.plot(cnn_history.history['loss'], label='Train loss')
ax2.plot(cnn_history.history['val_loss'], label='Validation loss')
ax2.legend(loc='best')
ax2.set_title('CNN')
ax2.set_xlabel('Epochs')
ax2.set_ylabel('MSE')
ax3.plot(lstm_history.history['loss'], label='Train loss')
ax3.plot(lstm_history.history['val_loss'], label='Validation loss')
ax3.legend(loc='best')
ax3.set_title('LSTM')
ax3.set_xlabel('Epochs')
ax3.set_ylabel('MSE')
ax4.plot(cnn_lstm_history.history['loss'], label='Train loss')
ax4.plot(cnn_lstm_history.history['val_loss'], label='Validation loss')
ax4.legend(loc='best')
ax4.set_title('CNN-LSTM')
ax4.set_xlabel('Epochs')
ax4.set_ylabel('MSE')
plt.show()

MLP on train and validation
mlp_train_pred = model_mlp.predict(X_train.values)
mlp_valid_pred = model_mlp.predict(X_valid.values)
print('Train rmse:', np.sqrt(mean_squared_error(Y_train, mlp_train_pred)))
print('Validation rmse:', np.sqrt(mean_squared_error(Y_valid, mlp_valid_pred)))

CNN on train and validation
cnn_train_pred = model_cnn.predict(X_train_series)
cnn_valid_pred = model_cnn.predict(X_valid_series)
print('Train rmse:', np.sqrt(mean_squared_error(Y_train, cnn_train_pred)))
print('Validation rmse:', np.sqrt(mean_squared_error(Y_valid, cnn_valid_pred)))

LSTM on train and validation
lstm_train_pred = model_lstm.predict(X_train_series)
lstm_valid_pred = model_lstm.predict(X_valid_series)
print('Train rmse:', np.sqrt(mean_squared_error(Y_train, lstm_train_pred)))
print('Validation rmse:', np.sqrt(mean_squared_error(Y_valid, lstm_valid_pred)))
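
CNN-LSTM on train and validation
For completeness, the same evaluation applied to the CNN-LSTM model; a sketch following the pattern above (not part of the original run):
cnn_lstm_train_pred = model_cnn_lstm.predict(X_train_series_sub)
cnn_lstm_valid_pred = model_cnn_lstm.predict(X_valid_series_sub)
print('Train rmse:', np.sqrt(mean_squared_error(Y_train, cnn_lstm_train_pred)))
print('Validation rmse:', np.sqrt(mean_squared_error(Y_valid, cnn_lstm_valid_pred)))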

Conclusion
Here you could see several approaches to a time series problem, how to develop them, and the differences between them. This notebook is not meant to achieve great performance, so if you want better results you are more than welcome to try different hyper-parameters, especially the window size and the network topologies; if you do, please let me know the results.
I hope you learned a few things here. Leave feedback, and if you liked what you saw make sure to check the article I used as a source.
If you want to check out how you can use LSTMs as autoencoders to create new features that represent a time series, take a look at my other kernel, Time-series forecasting with deep learning & LSTM autoencoders.