TFT model¶

In [1]:

            
                Copied!
                
#!pip install deepts_forecasting
#!pip install deepts_forecasting

Import libraries¶

In [2]:

            
                Copied!
                
                    
                    
                
                

        
import numpy as np
import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger

from deepts_forecasting.utils.data import TimeSeriesDataSet
from deepts_forecasting.utils.data.encoders import TorchNormalizer
from deepts_forecasting.datasets import AirPassengersDataset
from deepts_forecasting.models.tft.tft import TemporalFusionTransformer
import numpy as np
import torch
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger

from deepts_forecasting.utils.data import TimeSeriesDataSet
from deepts_forecasting.utils.data.encoders import TorchNormalizer
from deepts_forecasting.datasets import AirPassengersDataset
from deepts_forecasting.models.tft.tft import TemporalFusionTransformer

Dataset¶

In [3]:

            
                Copied!
                
                    
                    
                
                

        
data = AirPassengersDataset().load()
data['year'] = data['Month'].dt.year
data['month'] = data['Month'].dt.month
data['group'] = '0'
data['time_idx'] = np.arange(len(data))
data['Passengers'] = data['Passengers'].astype(float)
data['month'] = data['month'].astype('str')
data.head()
data = AirPassengersDataset().load()
data['year'] = data['Month'].dt.year
data['month'] = data['Month'].dt.month
data['group'] = '0'
data['time_idx'] = np.arange(len(data))
data['Passengers'] = data['Passengers'].astype(float)
data['month'] = data['month'].astype('str')
data.head()

Out[3]:

	Month	Passengers	year	month	time_idx
0	1949-01-01	112.0	1949	1	0
1	1949-02-01	118.0	1949	2	1
2	1949-03-01	132.0	1949	3	2
3	1949-04-01	129.0	1949	4	3
4	1949-05-01	121.0	1949	5	4

Split train/test sets¶

In [4]:

            
                Copied!
                
                    
                    
                
                

        
max_encoder_length = 18
max_prediction_length = 12

training_cutoff = data["time_idx"].max() - max_encoder_length - max_prediction_length

training = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= training_cutoff],
    max_encoder_length= max_encoder_length,
    min_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
    min_prediction_length=max_prediction_length,
    time_idx="time_idx",
    target="Passengers",
    group_ids=["group"],
    static_categoricals=[],
    static_reals=[],
    time_varying_known_categoricals=['month'],
    time_varying_known_reals=[],
    time_varying_unknown_reals=["Passengers"],
    time_varying_unknown_categoricals=[],
    target_normalizer=TorchNormalizer(method="standard",
                                      transformation=None),
    )

training.get_parameters()
validation = TimeSeriesDataSet.from_dataset(training,
                                            data[lambda x: x.time_idx > training_cutoff])

batch_size = 16
train_dataloader = DataLoader(training, batch_size=batch_size, shuffle=False, drop_last=False)
val_dataloader = DataLoader(validation, batch_size=batch_size, shuffle=False, drop_last=False)
max_encoder_length = 18
max_prediction_length = 12

training_cutoff = data["time_idx"].max() - max_encoder_length - max_prediction_length

training = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= training_cutoff],
    max_encoder_length= max_encoder_length,
    min_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
    min_prediction_length=max_prediction_length,
    time_idx="time_idx",
    target="Passengers",
    group_ids=["group"],
    static_categoricals=[],
    static_reals=[],
    time_varying_known_categoricals=['month'],
    time_varying_known_reals=[],
    time_varying_unknown_reals=["Passengers"],
    time_varying_unknown_categoricals=[],
    target_normalizer=TorchNormalizer(method="standard",
                                      transformation=None),
    )

training.get_parameters()
validation = TimeSeriesDataSet.from_dataset(training,
                                            data[lambda x: x.time_idx > training_cutoff])

batch_size = 16
train_dataloader = DataLoader(training, batch_size=batch_size, shuffle=False, drop_last=False)
val_dataloader = DataLoader(validation, batch_size=batch_size, shuffle=False, drop_last=False)

Define model¶

In [15]:

            
                Copied!
                
                    
                    
                
                

        
pl.seed_everything(1234)
# create PyTorch Lighning Trainer with early stopping
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4,
                                    patience=60, verbose=False, mode="min")
lr_logger = LearningRateMonitor()

trainer = pl.Trainer(
    max_epochs=300,
    gpus=0,  # run on CPU, if on multiple GPUs, use accelerator="ddp"
    gradient_clip_val=0.1,
    limit_train_batches=30,  # 30 batches per epoch
    callbacks=[lr_logger, early_stop_callback],
    logger=TensorBoardLogger("lightning_logs")
)

model = TemporalFusionTransformer.from_dataset(
    training,
     hidden_size=32, 
     lstm_layers=2, 
     hidden_continuous_size=4,
     attention_head_size=1
)
model.summarize
pl.seed_everything(1234)
# create PyTorch Lighning Trainer with early stopping
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4,
                                    patience=60, verbose=False, mode="min")
lr_logger = LearningRateMonitor()

trainer = pl.Trainer(
    max_epochs=300,
    gpus=0,  # run on CPU, if on multiple GPUs, use accelerator="ddp"
    gradient_clip_val=0.1,
    limit_train_batches=30,  # 30 batches per epoch
    callbacks=[lr_logger, early_stop_callback],
    logger=TensorBoardLogger("lightning_logs")
)

model = TemporalFusionTransformer.from_dataset(
    training,
     hidden_size=32, 
     lstm_layers=2, 
     hidden_continuous_size=4,
     attention_head_size=1
)
model.summarize

Global seed set to 1234
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

Out[15]:

<bound method LightningModule.summarize of TemporalFusionTransformer(
  (loss): L1Loss()
  (logging_metrics): ModuleList()
  (input_embeddings): MultiEmbedding(
    (embeddings): ModuleDict(
      (month): Embedding(12, 6)
    )
  )
  (prescalers): ModuleDict(
    (Passengers): Linear(in_features=1, out_features=4, bias=True)
  )
  (static_variable_selection): VariableSelectionNetwork(
    (single_variable_grns): ModuleDict()
    (prescalers): ModuleDict()
    (softmax): Softmax(dim=-1)
  )
  (encoder_variable_selection): VariableSelectionNetwork(
    (flattened_grn): GatedResidualNetwork(
      (resample_norm): ResampleNorm(
        (resample): TimeDistributedInterpolation()
        (gate): Sigmoid()
        (norm): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
      )
      (fc1): Linear(in_features=10, out_features=2, bias=True)
      (elu): ELU(alpha=1.0)
      (context): Linear(in_features=32, out_features=2, bias=False)
      (fc2): Linear(in_features=2, out_features=2, bias=True)
      (gate_norm): GateAddNorm(
        (glu): GatedLinearUnit(
          (dropout): Dropout(p=0.1, inplace=False)
          (fc): Linear(in_features=2, out_features=4, bias=True)
        )
        (add_norm): AddNorm(
          (norm): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (single_variable_grns): ModuleDict(
      (month): ResampleNorm(
        (resample): TimeDistributedInterpolation()
        (gate): Sigmoid()
        (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      )
      (Passengers): GatedResidualNetwork(
        (resample_norm): ResampleNorm(
          (resample): TimeDistributedInterpolation()
          (gate): Sigmoid()
          (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        )
        (fc1): Linear(in_features=4, out_features=4, bias=True)
        (elu): ELU(alpha=1.0)
        (fc2): Linear(in_features=4, out_features=4, bias=True)
        (gate_norm): GateAddNorm(
          (glu): GatedLinearUnit(
            (dropout): Dropout(p=0.1, inplace=False)
            (fc): Linear(in_features=4, out_features=64, bias=True)
          )
          (add_norm): AddNorm(
            (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
    )
    (prescalers): ModuleDict(
      (Passengers): Linear(in_features=1, out_features=4, bias=True)
    )
    (softmax): Softmax(dim=-1)
  )
  (decoder_variable_selection): VariableSelectionNetwork(
    (single_variable_grns): ModuleDict(
      (month): ResampleNorm(
        (resample): TimeDistributedInterpolation()
        (gate): Sigmoid()
        (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      )
    )
    (prescalers): ModuleDict()
    (softmax): Softmax(dim=-1)
  )
  (static_context_variable_selection): GatedResidualNetwork(
    (fc1): Linear(in_features=32, out_features=32, bias=True)
    (elu): ELU(alpha=1.0)
    (fc2): Linear(in_features=32, out_features=32, bias=True)
    (gate_norm): GateAddNorm(
      (glu): GatedLinearUnit(
        (dropout): Dropout(p=0.1, inplace=False)
        (fc): Linear(in_features=32, out_features=64, bias=True)
      )
      (add_norm): AddNorm(
        (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (static_context_initial_hidden_lstm): GatedResidualNetwork(
    (fc1): Linear(in_features=32, out_features=32, bias=True)
    (elu): ELU(alpha=1.0)
    (fc2): Linear(in_features=32, out_features=32, bias=True)
    (gate_norm): GateAddNorm(
      (glu): GatedLinearUnit(
        (dropout): Dropout(p=0.1, inplace=False)
        (fc): Linear(in_features=32, out_features=64, bias=True)
      )
      (add_norm): AddNorm(
        (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (static_context_initial_cell_lstm): GatedResidualNetwork(
    (fc1): Linear(in_features=32, out_features=32, bias=True)
    (elu): ELU(alpha=1.0)
    (fc2): Linear(in_features=32, out_features=32, bias=True)
    (gate_norm): GateAddNorm(
      (glu): GatedLinearUnit(
        (dropout): Dropout(p=0.1, inplace=False)
        (fc): Linear(in_features=32, out_features=64, bias=True)
      )
      (add_norm): AddNorm(
        (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (static_context_enrichment): GatedResidualNetwork(
    (fc1): Linear(in_features=32, out_features=32, bias=True)
    (elu): ELU(alpha=1.0)
    (fc2): Linear(in_features=32, out_features=32, bias=True)
    (gate_norm): GateAddNorm(
      (glu): GatedLinearUnit(
        (dropout): Dropout(p=0.1, inplace=False)
        (fc): Linear(in_features=32, out_features=64, bias=True)
      )
      (add_norm): AddNorm(
        (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (lstm_encoder): LSTM(32, 32, num_layers=2, batch_first=True, dropout=0.1)
  (lstm_decoder): LSTM(32, 32, num_layers=2, batch_first=True, dropout=0.1)
  (post_lstm_gate_encoder): GatedLinearUnit(
    (dropout): Dropout(p=0.1, inplace=False)
    (fc): Linear(in_features=32, out_features=64, bias=True)
  )
  (post_lstm_gate_decoder): GatedLinearUnit(
    (dropout): Dropout(p=0.1, inplace=False)
    (fc): Linear(in_features=32, out_features=64, bias=True)
  )
  (post_lstm_add_norm_encoder): AddNorm(
    (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
  )
  (post_lstm_add_norm_decoder): AddNorm(
    (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
  )
  (static_enrichment): GatedResidualNetwork(
    (fc1): Linear(in_features=32, out_features=32, bias=True)
    (elu): ELU(alpha=1.0)
    (context): Linear(in_features=32, out_features=32, bias=False)
    (fc2): Linear(in_features=32, out_features=32, bias=True)
    (gate_norm): GateAddNorm(
      (glu): GatedLinearUnit(
        (dropout): Dropout(p=0.1, inplace=False)
        (fc): Linear(in_features=32, out_features=64, bias=True)
      )
      (add_norm): AddNorm(
        (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (multihead_attn): InterpretableMultiHeadAttention(
    (dropout): Dropout(p=0.1, inplace=False)
    (v_layer): Linear(in_features=32, out_features=32, bias=True)
    (q_layers): ModuleList(
      (0): Linear(in_features=32, out_features=32, bias=True)
    )
    (k_layers): ModuleList(
      (0): Linear(in_features=32, out_features=32, bias=True)
    )
    (attention): ScaledDotProductAttention(
      (softmax): Softmax(dim=2)
    )
    (w_h): Linear(in_features=32, out_features=32, bias=False)
  )
  (post_attn_gate_norm): GateAddNorm(
    (glu): GatedLinearUnit(
      (dropout): Dropout(p=0.1, inplace=False)
      (fc): Linear(in_features=32, out_features=64, bias=True)
    )
    (add_norm): AddNorm(
      (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    )
  )
  (pos_wise_ff): GatedResidualNetwork(
    (fc1): Linear(in_features=32, out_features=32, bias=True)
    (elu): ELU(alpha=1.0)
    (fc2): Linear(in_features=32, out_features=32, bias=True)
    (gate_norm): GateAddNorm(
      (glu): GatedLinearUnit(
        (dropout): Dropout(p=0.1, inplace=False)
        (fc): Linear(in_features=32, out_features=64, bias=True)
      )
      (add_norm): AddNorm(
        (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (pre_output_gate_norm): GateAddNorm(
    (glu): GatedLinearUnit(
      (fc): Linear(in_features=32, out_features=64, bias=True)
    )
    (add_norm): AddNorm(
      (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    )
  )
  (output_layer): Linear(in_features=32, out_features=1, bias=True)
)>

In [16]:

            
                Copied!
                
model.hparams
model.hparams

Out[16]:

"attention_head_size":               1
"categorical_groups":                {}
"dropout":                           0.1
"embedding_labels":                  {'month': array(['1', '10', '11', '12', '2', '3', '4', '5', '6', '7', '8', '9'],
      dtype=object)}
"embedding_paddings":                []
"embedding_sizes":                   {'month': [12, 6]}
"hidden_continuous_size":            4
"hidden_continuous_sizes":           {}
"hidden_size":                       32
"learning_rate":                     0.001
"log_interval":                      -1
"log_val_interval":                  None
"logging_metrics":                   ModuleList()
"loss":                              L1Loss()
"lstm_layers":                       2
"max_encoder_length":                18
"max_prediction_length":             12
"monotone_constaints":               {}
"output_size":                       1
"output_transformer":                TorchNormalizer()
"share_single_variable_networks":    False
"static_categoricals":               []
"static_reals":                      []
"time_varying_categoricals_decoder": ['month']
"time_varying_categoricals_encoder": ['month']
"time_varying_reals_decoder":        []
"time_varying_reals_encoder":        ['Passengers']
"x_categoricals":                    ['month']
"x_reals":                           ['Passengers']

Train model with early stopping¶

In [17]:

            
                Copied!
                
                    
                    
                
                

        
trainer.fit(
    model, train_dataloader=train_dataloader, val_dataloaders=val_dataloader,
)

# (given that we use early stopping, this is not necessarily the last epoch)
best_model_path = trainer.checkpoint_callback.best_model_path
best_model = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

# calcualte mean absolute error on validation set
actuals = torch.cat([model.transform_output(prediction=y, target_scale=x['target_scale'])
                     for x, y in iter(val_dataloader)])
predictions,x_index = best_model.predict(val_dataloader)
mae = (actuals - predictions).abs().mean()
# print('predictions shape is:', predictions.shape)
# print('actuals shape is:', actuals.shape)
print(torch.cat([actuals, predictions]))
print('MAE is:', mae)
trainer.fit(
    model, train_dataloader=train_dataloader, val_dataloaders=val_dataloader,
)

# (given that we use early stopping, this is not necessarily the last epoch)
best_model_path = trainer.checkpoint_callback.best_model_path
best_model = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

# calcualte mean absolute error on validation set
actuals = torch.cat([model.transform_output(prediction=y, target_scale=x['target_scale'])
                     for x, y in iter(val_dataloader)])
predictions,x_index = best_model.predict(val_dataloader)
mae = (actuals - predictions).abs().mean()
# print('predictions shape is:', predictions.shape)
# print('actuals shape is:', actuals.shape)
print(torch.cat([actuals, predictions]))
print('MAE is:', mae)

   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | L1Loss                          | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 72    
3  | prescalers                         | ModuleDict                      | 8     
4  | static_variable_selection          | VariableSelectionNetwork        | 0     
5  | encoder_variable_selection         | VariableSelectionNetwork        | 738   
6  | decoder_variable_selection         | VariableSelectionNetwork        | 96    
7  | static_context_variable_selection  | GatedResidualNetwork            | 4.3 K 
8  | static_context_initial_hidden_lstm | GatedResidualNetwork            | 4.3 K 
9  | static_context_initial_cell_lstm   | GatedResidualNetwork            | 4.3 K 
10 | static_context_enrichment          | GatedResidualNetwork            | 4.3 K 
11 | lstm_encoder                       | LSTM                            | 16.9 K
12 | lstm_decoder                       | LSTM                            | 16.9 K
13 | post_lstm_gate_encoder             | GatedLinearUnit                 | 2.1 K 
14 | post_lstm_add_norm_encoder         | AddNorm                         | 64    
15 | static_enrichment                  | GatedResidualNetwork            | 5.3 K 
16 | multihead_attn                     | InterpretableMultiHeadAttention | 4.2 K 
17 | post_attn_gate_norm                | GateAddNorm                     | 2.2 K 
18 | pos_wise_ff                        | GatedResidualNetwork            | 4.3 K 
19 | pre_output_gate_norm               | GateAddNorm                     | 2.2 K 
20 | output_layer                       | Linear                          | 33    
----------------------------------------------------------------------------------------
72.2 K    Trainable params
0         Non-trainable params
72.2 K    Total params
0.289     Total estimated model params size (MB)

Global seed set to 1234

Epoch 89: 100%|██████████| 7/7 [00:00<00:00, 21.05it/s, loss=0.114, v_num=12, val_loss=1.170, train_loss=0.107]  
tensor([[[417.0000],
         [391.0000],
         [419.0000],
         [461.0000],
         [472.0000],
         [535.0000],
         [622.0000],
         [606.0000],
         [508.0000],
         [461.0000],
         [390.0000],
         [432.0000]],

        [[331.3324],
         [314.6927],
         [363.6489],
         [350.5824],
         [362.6381],
         [428.3661],
         [471.2575],
         [470.3118],
         [404.3524],
         [343.9659],
         [301.0706],
         [330.8917]]], dtype=torch.float64)
MAE is: tensor(103.4075, dtype=torch.float64)