TFT model¶
In [1]:
Copied!
#!pip install deepts_forecasting
#!pip install deepts_forecasting
Import libraries¶
In [2]:
Copied!
import numpy as np
import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from deepts_forecasting.utils.data import TimeSeriesDataSet
from deepts_forecasting.utils.data.encoders import TorchNormalizer
from deepts_forecasting.datasets import AirPassengersDataset
from deepts_forecasting.models.tft.tft import TemporalFusionTransformer
import numpy as np
import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from deepts_forecasting.utils.data import TimeSeriesDataSet
from deepts_forecasting.utils.data.encoders import TorchNormalizer
from deepts_forecasting.datasets import AirPassengersDataset
from deepts_forecasting.models.tft.tft import TemporalFusionTransformer
Dataset¶
In [3]:
Copied!
data = AirPassengersDataset().load()
data['year'] = data['Month'].dt.year
data['month'] = data['Month'].dt.month
data['group'] = '0'
data['time_idx'] = np.arange(len(data))
data['Passengers'] = data['Passengers'].astype(float)
data['month'] = data['month'].astype('str')
data.head()
data = AirPassengersDataset().load()
data['year'] = data['Month'].dt.year
data['month'] = data['Month'].dt.month
data['group'] = '0'
data['time_idx'] = np.arange(len(data))
data['Passengers'] = data['Passengers'].astype(float)
data['month'] = data['month'].astype('str')
data.head()
Out[3]:
| Month | Passengers | year | month | group | time_idx | |
|---|---|---|---|---|---|---|
| 0 | 1949-01-01 | 112.0 | 1949 | 1 | 0 | 0 |
| 1 | 1949-02-01 | 118.0 | 1949 | 2 | 0 | 1 |
| 2 | 1949-03-01 | 132.0 | 1949 | 3 | 0 | 2 |
| 3 | 1949-04-01 | 129.0 | 1949 | 4 | 0 | 3 |
| 4 | 1949-05-01 | 121.0 | 1949 | 5 | 0 | 4 |
Split train/test sets¶
In [4]:
Copied!
max_encoder_length = 18
max_prediction_length = 12
training_cutoff = data["time_idx"].max() - max_encoder_length - max_prediction_length
training = TimeSeriesDataSet(
data[lambda x: x.time_idx <= training_cutoff],
max_encoder_length= max_encoder_length,
min_encoder_length=max_encoder_length,
max_prediction_length=max_prediction_length,
min_prediction_length=max_prediction_length,
time_idx="time_idx",
target="Passengers",
group_ids=["group"],
static_categoricals=[],
static_reals=[],
time_varying_known_categoricals=['month'],
time_varying_known_reals=[],
time_varying_unknown_reals=["Passengers"],
time_varying_unknown_categoricals=[],
target_normalizer=TorchNormalizer(method="standard",
transformation=None),
)
training.get_parameters()
validation = TimeSeriesDataSet.from_dataset(training,
data[lambda x: x.time_idx > training_cutoff])
batch_size = 16
train_dataloader = DataLoader(training, batch_size=batch_size, shuffle=False, drop_last=False)
val_dataloader = DataLoader(validation, batch_size=batch_size, shuffle=False, drop_last=False)
max_encoder_length = 18
max_prediction_length = 12
training_cutoff = data["time_idx"].max() - max_encoder_length - max_prediction_length
training = TimeSeriesDataSet(
data[lambda x: x.time_idx <= training_cutoff],
max_encoder_length= max_encoder_length,
min_encoder_length=max_encoder_length,
max_prediction_length=max_prediction_length,
min_prediction_length=max_prediction_length,
time_idx="time_idx",
target="Passengers",
group_ids=["group"],
static_categoricals=[],
static_reals=[],
time_varying_known_categoricals=['month'],
time_varying_known_reals=[],
time_varying_unknown_reals=["Passengers"],
time_varying_unknown_categoricals=[],
target_normalizer=TorchNormalizer(method="standard",
transformation=None),
)
training.get_parameters()
validation = TimeSeriesDataSet.from_dataset(training,
data[lambda x: x.time_idx > training_cutoff])
batch_size = 16
train_dataloader = DataLoader(training, batch_size=batch_size, shuffle=False, drop_last=False)
val_dataloader = DataLoader(validation, batch_size=batch_size, shuffle=False, drop_last=False)
Define model¶
In [15]:
Copied!
pl.seed_everything(1234)
# create PyTorch Lighning Trainer with early stopping
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4,
patience=60, verbose=False, mode="min")
lr_logger = LearningRateMonitor()
trainer = pl.Trainer(
max_epochs=300,
gpus=0, # run on CPU, if on multiple GPUs, use accelerator="ddp"
gradient_clip_val=0.1,
limit_train_batches=30, # 30 batches per epoch
callbacks=[lr_logger, early_stop_callback],
logger=TensorBoardLogger("lightning_logs")
)
model = TemporalFusionTransformer.from_dataset(
training,
hidden_size=32,
lstm_layers=2,
hidden_continuous_size=4,
attention_head_size=1
)
model.summarize
pl.seed_everything(1234)
# create PyTorch Lighning Trainer with early stopping
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4,
patience=60, verbose=False, mode="min")
lr_logger = LearningRateMonitor()
trainer = pl.Trainer(
max_epochs=300,
gpus=0, # run on CPU, if on multiple GPUs, use accelerator="ddp"
gradient_clip_val=0.1,
limit_train_batches=30, # 30 batches per epoch
callbacks=[lr_logger, early_stop_callback],
logger=TensorBoardLogger("lightning_logs")
)
model = TemporalFusionTransformer.from_dataset(
training,
hidden_size=32,
lstm_layers=2,
hidden_continuous_size=4,
attention_head_size=1
)
model.summarize
Global seed set to 1234 GPU available: False, used: False TPU available: False, using: 0 TPU cores IPU available: False, using: 0 IPUs
Out[15]:
<bound method LightningModule.summarize of TemporalFusionTransformer(
(loss): L1Loss()
(logging_metrics): ModuleList()
(input_embeddings): MultiEmbedding(
(embeddings): ModuleDict(
(month): Embedding(12, 6)
)
)
(prescalers): ModuleDict(
(Passengers): Linear(in_features=1, out_features=4, bias=True)
)
(static_variable_selection): VariableSelectionNetwork(
(single_variable_grns): ModuleDict()
(prescalers): ModuleDict()
(softmax): Softmax(dim=-1)
)
(encoder_variable_selection): VariableSelectionNetwork(
(flattened_grn): GatedResidualNetwork(
(resample_norm): ResampleNorm(
(resample): TimeDistributedInterpolation()
(gate): Sigmoid()
(norm): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
)
(fc1): Linear(in_features=10, out_features=2, bias=True)
(elu): ELU(alpha=1.0)
(context): Linear(in_features=32, out_features=2, bias=False)
(fc2): Linear(in_features=2, out_features=2, bias=True)
(gate_norm): GateAddNorm(
(glu): GatedLinearUnit(
(dropout): Dropout(p=0.1, inplace=False)
(fc): Linear(in_features=2, out_features=4, bias=True)
)
(add_norm): AddNorm(
(norm): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
)
)
)
(single_variable_grns): ModuleDict(
(month): ResampleNorm(
(resample): TimeDistributedInterpolation()
(gate): Sigmoid()
(norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
)
(Passengers): GatedResidualNetwork(
(resample_norm): ResampleNorm(
(resample): TimeDistributedInterpolation()
(gate): Sigmoid()
(norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
)
(fc1): Linear(in_features=4, out_features=4, bias=True)
(elu): ELU(alpha=1.0)
(fc2): Linear(in_features=4, out_features=4, bias=True)
(gate_norm): GateAddNorm(
(glu): GatedLinearUnit(
(dropout): Dropout(p=0.1, inplace=False)
(fc): Linear(in_features=4, out_features=64, bias=True)
)
(add_norm): AddNorm(
(norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
)
)
)
)
(prescalers): ModuleDict(
(Passengers): Linear(in_features=1, out_features=4, bias=True)
)
(softmax): Softmax(dim=-1)
)
(decoder_variable_selection): VariableSelectionNetwork(
(single_variable_grns): ModuleDict(
(month): ResampleNorm(
(resample): TimeDistributedInterpolation()
(gate): Sigmoid()
(norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
)
)
(prescalers): ModuleDict()
(softmax): Softmax(dim=-1)
)
(static_context_variable_selection): GatedResidualNetwork(
(fc1): Linear(in_features=32, out_features=32, bias=True)
(elu): ELU(alpha=1.0)
(fc2): Linear(in_features=32, out_features=32, bias=True)
(gate_norm): GateAddNorm(
(glu): GatedLinearUnit(
(dropout): Dropout(p=0.1, inplace=False)
(fc): Linear(in_features=32, out_features=64, bias=True)
)
(add_norm): AddNorm(
(norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
)
)
)
(static_context_initial_hidden_lstm): GatedResidualNetwork(
(fc1): Linear(in_features=32, out_features=32, bias=True)
(elu): ELU(alpha=1.0)
(fc2): Linear(in_features=32, out_features=32, bias=True)
(gate_norm): GateAddNorm(
(glu): GatedLinearUnit(
(dropout): Dropout(p=0.1, inplace=False)
(fc): Linear(in_features=32, out_features=64, bias=True)
)
(add_norm): AddNorm(
(norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
)
)
)
(static_context_initial_cell_lstm): GatedResidualNetwork(
(fc1): Linear(in_features=32, out_features=32, bias=True)
(elu): ELU(alpha=1.0)
(fc2): Linear(in_features=32, out_features=32, bias=True)
(gate_norm): GateAddNorm(
(glu): GatedLinearUnit(
(dropout): Dropout(p=0.1, inplace=False)
(fc): Linear(in_features=32, out_features=64, bias=True)
)
(add_norm): AddNorm(
(norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
)
)
)
(static_context_enrichment): GatedResidualNetwork(
(fc1): Linear(in_features=32, out_features=32, bias=True)
(elu): ELU(alpha=1.0)
(fc2): Linear(in_features=32, out_features=32, bias=True)
(gate_norm): GateAddNorm(
(glu): GatedLinearUnit(
(dropout): Dropout(p=0.1, inplace=False)
(fc): Linear(in_features=32, out_features=64, bias=True)
)
(add_norm): AddNorm(
(norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
)
)
)
(lstm_encoder): LSTM(32, 32, num_layers=2, batch_first=True, dropout=0.1)
(lstm_decoder): LSTM(32, 32, num_layers=2, batch_first=True, dropout=0.1)
(post_lstm_gate_encoder): GatedLinearUnit(
(dropout): Dropout(p=0.1, inplace=False)
(fc): Linear(in_features=32, out_features=64, bias=True)
)
(post_lstm_gate_decoder): GatedLinearUnit(
(dropout): Dropout(p=0.1, inplace=False)
(fc): Linear(in_features=32, out_features=64, bias=True)
)
(post_lstm_add_norm_encoder): AddNorm(
(norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
)
(post_lstm_add_norm_decoder): AddNorm(
(norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
)
(static_enrichment): GatedResidualNetwork(
(fc1): Linear(in_features=32, out_features=32, bias=True)
(elu): ELU(alpha=1.0)
(context): Linear(in_features=32, out_features=32, bias=False)
(fc2): Linear(in_features=32, out_features=32, bias=True)
(gate_norm): GateAddNorm(
(glu): GatedLinearUnit(
(dropout): Dropout(p=0.1, inplace=False)
(fc): Linear(in_features=32, out_features=64, bias=True)
)
(add_norm): AddNorm(
(norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
)
)
)
(multihead_attn): InterpretableMultiHeadAttention(
(dropout): Dropout(p=0.1, inplace=False)
(v_layer): Linear(in_features=32, out_features=32, bias=True)
(q_layers): ModuleList(
(0): Linear(in_features=32, out_features=32, bias=True)
)
(k_layers): ModuleList(
(0): Linear(in_features=32, out_features=32, bias=True)
)
(attention): ScaledDotProductAttention(
(softmax): Softmax(dim=2)
)
(w_h): Linear(in_features=32, out_features=32, bias=False)
)
(post_attn_gate_norm): GateAddNorm(
(glu): GatedLinearUnit(
(dropout): Dropout(p=0.1, inplace=False)
(fc): Linear(in_features=32, out_features=64, bias=True)
)
(add_norm): AddNorm(
(norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
)
)
(pos_wise_ff): GatedResidualNetwork(
(fc1): Linear(in_features=32, out_features=32, bias=True)
(elu): ELU(alpha=1.0)
(fc2): Linear(in_features=32, out_features=32, bias=True)
(gate_norm): GateAddNorm(
(glu): GatedLinearUnit(
(dropout): Dropout(p=0.1, inplace=False)
(fc): Linear(in_features=32, out_features=64, bias=True)
)
(add_norm): AddNorm(
(norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
)
)
)
(pre_output_gate_norm): GateAddNorm(
(glu): GatedLinearUnit(
(fc): Linear(in_features=32, out_features=64, bias=True)
)
(add_norm): AddNorm(
(norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
)
)
(output_layer): Linear(in_features=32, out_features=1, bias=True)
)>
In [16]:
Copied!
model.hparams
model.hparams
Out[16]:
"attention_head_size": 1
"categorical_groups": {}
"dropout": 0.1
"embedding_labels": {'month': array(['1', '10', '11', '12', '2', '3', '4', '5', '6', '7', '8', '9'],
dtype=object)}
"embedding_paddings": []
"embedding_sizes": {'month': [12, 6]}
"hidden_continuous_size": 4
"hidden_continuous_sizes": {}
"hidden_size": 32
"learning_rate": 0.001
"log_interval": -1
"log_val_interval": None
"logging_metrics": ModuleList()
"loss": L1Loss()
"lstm_layers": 2
"max_encoder_length": 18
"max_prediction_length": 12
"monotone_constaints": {}
"output_size": 1
"output_transformer": TorchNormalizer()
"share_single_variable_networks": False
"static_categoricals": []
"static_reals": []
"time_varying_categoricals_decoder": ['month']
"time_varying_categoricals_encoder": ['month']
"time_varying_reals_decoder": []
"time_varying_reals_encoder": ['Passengers']
"x_categoricals": ['month']
"x_reals": ['Passengers']
Train model with early stopping¶
In [17]:
Copied!
trainer.fit(
model, train_dataloader=train_dataloader, val_dataloaders=val_dataloader,
)
# (given that we use early stopping, this is not necessarily the last epoch)
best_model_path = trainer.checkpoint_callback.best_model_path
best_model = TemporalFusionTransformer.load_from_checkpoint(best_model_path)
# calcualte mean absolute error on validation set
actuals = torch.cat([model.transform_output(prediction=y, target_scale=x['target_scale'])
for x, y in iter(val_dataloader)])
predictions,x_index = best_model.predict(val_dataloader)
mae = (actuals - predictions).abs().mean()
# print('predictions shape is:', predictions.shape)
# print('actuals shape is:', actuals.shape)
print(torch.cat([actuals, predictions]))
print('MAE is:', mae)
trainer.fit(
model, train_dataloader=train_dataloader, val_dataloaders=val_dataloader,
)
# (given that we use early stopping, this is not necessarily the last epoch)
best_model_path = trainer.checkpoint_callback.best_model_path
best_model = TemporalFusionTransformer.load_from_checkpoint(best_model_path)
# calcualte mean absolute error on validation set
actuals = torch.cat([model.transform_output(prediction=y, target_scale=x['target_scale'])
for x, y in iter(val_dataloader)])
predictions,x_index = best_model.predict(val_dataloader)
mae = (actuals - predictions).abs().mean()
# print('predictions shape is:', predictions.shape)
# print('actuals shape is:', actuals.shape)
print(torch.cat([actuals, predictions]))
print('MAE is:', mae)
| Name | Type | Params ---------------------------------------------------------------------------------------- 0 | loss | L1Loss | 0 1 | logging_metrics | ModuleList | 0 2 | input_embeddings | MultiEmbedding | 72 3 | prescalers | ModuleDict | 8 4 | static_variable_selection | VariableSelectionNetwork | 0 5 | encoder_variable_selection | VariableSelectionNetwork | 738 6 | decoder_variable_selection | VariableSelectionNetwork | 96 7 | static_context_variable_selection | GatedResidualNetwork | 4.3 K 8 | static_context_initial_hidden_lstm | GatedResidualNetwork | 4.3 K 9 | static_context_initial_cell_lstm | GatedResidualNetwork | 4.3 K 10 | static_context_enrichment | GatedResidualNetwork | 4.3 K 11 | lstm_encoder | LSTM | 16.9 K 12 | lstm_decoder | LSTM | 16.9 K 13 | post_lstm_gate_encoder | GatedLinearUnit | 2.1 K 14 | post_lstm_add_norm_encoder | AddNorm | 64 15 | static_enrichment | GatedResidualNetwork | 5.3 K 16 | multihead_attn | InterpretableMultiHeadAttention | 4.2 K 17 | post_attn_gate_norm | GateAddNorm | 2.2 K 18 | pos_wise_ff | GatedResidualNetwork | 4.3 K 19 | pre_output_gate_norm | GateAddNorm | 2.2 K 20 | output_layer | Linear | 33 ---------------------------------------------------------------------------------------- 72.2 K Trainable params 0 Non-trainable params 72.2 K Total params 0.289 Total estimated model params size (MB)
Global seed set to 1234
Epoch 89: 100%|██████████| 7/7 [00:00<00:00, 21.05it/s, loss=0.114, v_num=12, val_loss=1.170, train_loss=0.107]
tensor([[[417.0000],
[391.0000],
[419.0000],
[461.0000],
[472.0000],
[535.0000],
[622.0000],
[606.0000],
[508.0000],
[461.0000],
[390.0000],
[432.0000]],
[[331.3324],
[314.6927],
[363.6489],
[350.5824],
[362.6381],
[428.3661],
[471.2575],
[470.3118],
[404.3524],
[343.9659],
[301.0706],
[330.8917]]], dtype=torch.float64)
MAE is: tensor(103.4075, dtype=torch.float64)