Commit bbd5921 (1 parent: 8912225)

Update models and add DecoderOnly part

6 files changed (+159, -121 lines)

exp/exp_forecast.py (+3, -3)

@@ -70,7 +70,7 @@ def vali(self, vali_data, vali_loader, criterion, is_test=False):
                 batch_x_mark = batch_x_mark.float().to(self.device)
                 batch_y_mark = batch_y_mark.float().to(self.device)

-                outputs = self.model(batch_x, batch_x_mark, None, batch_y_mark)
+                outputs = self.model(batch_x, batch_x_mark, batch_y_mark)
                 if is_test or self.args.nonautoregressive:
                     outputs = outputs[:, -self.args.output_token_len:, :]
                     batch_y = batch_y[:, -self.args.output_token_len:, :].to(self.device)
@@ -138,7 +138,7 @@ def train(self, setting):
                 batch_x_mark = batch_x_mark.float().to(self.device)
                 batch_y_mark = batch_y_mark.float().to(self.device)

-                outputs = self.model(batch_x, batch_x_mark, None, batch_y_mark)
+                outputs = self.model(batch_x, batch_x_mark, batch_y_mark)
                 if self.args.dp:
                     torch.cuda.synchronize()
                 if self.args.nonautoregressive:
@@ -228,7 +228,7 @@ def test(self, setting, test=0):
                 for j in range(inference_steps):
                     if len(pred_y) != 0:
                         batch_x = torch.cat([batch_x[:, self.args.input_token_len:, :], pred_y[-1]], dim=1)
-                    outputs = self.model(batch_x, batch_x_mark, None, batch_y_mark)
+                    outputs = self.model(batch_x, batch_x_mark, batch_y_mark)
                     pred_y.append(outputs[:, -self.args.output_token_len:, :])
                 pred_y = torch.cat(pred_y, dim=1)
                 if dis != 0:
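
Note on the change above: the experiment loops now call the model as model(x, x_mark, y_mark); the placeholder decoder input that used to be passed as None is gone. Below is a minimal sketch of the autoregressive rollout that test() performs with the new three-argument call; the default lengths are illustrative assumptions, not values taken from this repository's configs.

import torch

def rollout(model, batch_x, batch_x_mark, batch_y_mark,
            pred_len=192, input_token_len=96, output_token_len=96):
    # Roll the model forward token by token with the new forward(x, x_mark, y_mark) call.
    # pred_len and the token lengths are assumed example values.
    pred_y = []
    for _ in range(pred_len // output_token_len):
        if len(pred_y) != 0:
            # Slide the context window: drop the oldest input token, append the latest prediction.
            batch_x = torch.cat(
                [batch_x[:, input_token_len:, :], pred_y[-1]], dim=1)
        outputs = model(batch_x, batch_x_mark, batch_y_mark)
        pred_y.append(outputs[:, -output_token_len:, :])
    return torch.cat(pred_y, dim=1)  # [B, pred_len, C]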

layers/Transformer_EncDec.py (+99, -59)

@@ -2,27 +2,6 @@
 import torch.nn.functional as F


-class ConvLayer(nn.Module):
-    def __init__(self, c_in):
-        super(ConvLayer, self).__init__()
-        self.downConv = nn.Conv1d(in_channels=c_in,
-                                  out_channels=c_in,
-                                  kernel_size=3,
-                                  padding=2,
-                                  padding_mode='circular')
-        self.norm = nn.BatchNorm1d(c_in)
-        self.activation = nn.ELU()
-        self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)
-
-    def forward(self, x):
-        x = self.downConv(x.permute(0, 2, 1))
-        x = self.norm(x)
-        x = self.activation(x)
-        x = self.maxPool(x)
-        x = x.transpose(1, 2)
-        return x
-
-
 class EncoderLayer(nn.Module):
     def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
         super(EncoderLayer, self).__init__()
@@ -52,6 +31,73 @@ def forward(self, x, attn_mask=None, tau=None, delta=None):
         return self.norm2(x + y), attn


+class DecoderLayer(nn.Module):
+    def __init__(self, self_attention, cross_attention, d_model, d_ff=None,
+                 dropout=0.1, activation="relu"):
+        super(DecoderLayer, self).__init__()
+        d_ff = d_ff or 4 * d_model
+        self.self_attention = self_attention
+        self.cross_attention = cross_attention
+        self.conv1 = nn.Conv1d(in_channels=d_model,
+                               out_channels=d_ff, kernel_size=1)
+        self.conv2 = nn.Conv1d(
+            in_channels=d_ff, out_channels=d_model, kernel_size=1)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(dropout)
+        self.activation = F.relu if activation == "relu" else F.gelu
+
+    def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None):
+        x = x + self.dropout(self.self_attention(
+            x, x, x,
+            attn_mask=x_mask,
+            tau=tau, delta=None
+        )[0])
+        x = self.norm1(x)
+
+        x = x + self.dropout(self.cross_attention(
+            x, cross, cross,
+            attn_mask=cross_mask,
+            tau=tau, delta=delta
+        )[0])
+
+        y = x = self.norm2(x)
+        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
+        y = self.dropout(self.conv2(y).transpose(-1, 1))
+
+        return self.norm3(x + y)
+
+
+class DecoderOnlyLayer(nn.Module):
+    def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
+        super(DecoderOnlyLayer, self).__init__()
+        d_ff = d_ff or 4 * d_model
+        self.attention = attention
+        self.conv1 = nn.Conv1d(in_channels=d_model,
+                               out_channels=d_ff, kernel_size=1)
+        self.conv2 = nn.Conv1d(
+            in_channels=d_ff, out_channels=d_model, kernel_size=1)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(dropout)
+        self.activation = F.relu if activation == "relu" else F.gelu
+
+    def forward(self, x, attn_mask=None, tau=None, delta=None):
+        new_x, attn = self.attention(
+            x, x, x,
+            attn_mask=attn_mask,
+            tau=tau, delta=delta
+        )
+        x = x + self.dropout(new_x)
+
+        y = x = self.norm1(x)
+        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
+        y = self.dropout(self.conv2(y).transpose(-1, 1))
+
+        return self.norm2(x + y), attn
+
+
 class TimerLayer(nn.Module):
     def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
         super(TimerLayer, self).__init__()
@@ -115,44 +161,6 @@ def forward(self, x, attn_mask=None, tau=None, delta=None):
         return x, attns


-class DecoderLayer(nn.Module):
-    def __init__(self, self_attention, cross_attention, d_model, d_ff=None,
-                 dropout=0.1, activation="relu"):
-        super(DecoderLayer, self).__init__()
-        d_ff = d_ff or 4 * d_model
-        self.self_attention = self_attention
-        self.cross_attention = cross_attention
-        self.conv1 = nn.Conv1d(in_channels=d_model,
-                               out_channels=d_ff, kernel_size=1)
-        self.conv2 = nn.Conv1d(
-            in_channels=d_ff, out_channels=d_model, kernel_size=1)
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.norm3 = nn.LayerNorm(d_model)
-        self.dropout = nn.Dropout(dropout)
-        self.activation = F.relu if activation == "relu" else F.gelu
-
-    def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None):
-        x = x + self.dropout(self.self_attention(
-            x, x, x,
-            attn_mask=x_mask,
-            tau=tau, delta=None
-        )[0])
-        x = self.norm1(x)
-
-        x = x + self.dropout(self.cross_attention(
-            x, cross, cross,
-            attn_mask=cross_mask,
-            tau=tau, delta=delta
-        )[0])
-
-        y = x = self.norm2(x)
-        y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1))))
-        y = self.dropout(self.conv2(y).transpose(-1, 1))
-
-        return self.norm3(x + y)
-
-
 class Decoder(nn.Module):
     def __init__(self, layers, norm_layer=None, projection=None):
         super(Decoder, self).__init__()
@@ -173,6 +181,38 @@ def forward(self, x, cross, x_mask=None, cross_mask=None, tau=None, delta=None):
         return x


+class DecoderOnly(nn.Module):
+    def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
+        super(DecoderOnly, self).__init__()
+        self.attn_layers = nn.ModuleList(attn_layers)
+        self.conv_layers = nn.ModuleList(
+            conv_layers) if conv_layers is not None else None
+        self.norm = norm_layer
+
+    def forward(self, x, attn_mask=None, tau=None, delta=None):
+        # x [B, L, D]
+        attns = []
+        if self.conv_layers is not None:
+            for i, (attn_layer, conv_layer) in enumerate(zip(self.attn_layers, self.conv_layers)):
+                delta = delta if i == 0 else None
+                x, attn = attn_layer(
+                    x, attn_mask=attn_mask, tau=tau, delta=delta)
+                x = conv_layer(x)
+                attns.append(attn)
+            x, attn = self.attn_layers[-1](x, tau=tau, delta=None)
+            attns.append(attn)
+        else:
+            for attn_layer in self.attn_layers:
+                x, attn = attn_layer(
+                    x, attn_mask=attn_mask, tau=tau, delta=delta)
+                attns.append(attn)
+
+        if self.norm is not None:
+            x = self.norm(x)
+
+        return x, attns
+
+
 class TimerBlock(nn.Module):
     def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
         super(TimerBlock, self).__init__()
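
The new DecoderOnlyLayer is a single-stream, self-attention-only layer that returns its attention map, and DecoderOnly stacks such layers with an optional final norm. A minimal assembly sketch, mirroring how models/timer.py wires these classes further down; d_model, n_heads, d_ff, the layer count, and the input shape are assumed example values.

import torch
from torch import nn
from layers.Transformer_EncDec import DecoderOnly, DecoderOnlyLayer
from layers.SelfAttention_Family import FullAttention, AttentionLayer

d_model, n_heads, d_ff, n_layers, dropout = 512, 8, 2048, 2, 0.1  # assumed values

blocks = DecoderOnly(
    [
        DecoderOnlyLayer(
            AttentionLayer(
                # FullAttention(True, ...) is the masked (causal) self-attention setup used by timer.py
                FullAttention(True, attention_dropout=dropout, output_attention=False),
                d_model, n_heads),
            d_model,
            d_ff,
            dropout=dropout,
            activation="gelu",
        )
        for _ in range(n_layers)
    ],
    norm_layer=nn.LayerNorm(d_model),
)

tokens = torch.randn(4 * 7, 10, d_model)  # [B * C, N, D] token embeddings (assumed shape)
out, attns = blocks(tokens)               # out: [B * C, N, D], attns: one entry per layer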

models/moirai.py (+14, -14)

@@ -28,24 +28,24 @@ def __init__(self, configs):
         )
         self.head = nn.Linear(configs.d_model, configs.input_token_len)

-    def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
+    def forecast(self, x, x_mark, y_mark):
         if self.use_norm:
-            means = x_enc.mean(1, keepdim=True).detach()
-            x_enc = x_enc - means
+            means = x.mean(1, keepdim=True).detach()
+            x = x - means
             stdev = torch.sqrt(
-                torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5)
-            x_enc /= stdev
+                torch.var(x, dim=1, keepdim=True, unbiased=False) + 1e-5)
+            x /= stdev

-        B, _, C = x_enc.shape
-        padding = torch.zeros(B, self.input_token_len, C).to(x_enc.device)
-        x_enc = torch.cat([x_enc, padding], dim=1)
+        B, _, C = x.shape
+        padding = torch.zeros(B, self.input_token_len, C).to(x.device)
+        x = torch.cat([x, padding], dim=1)
         # [B, C, L]
-        x_enc = x_enc.permute(0, 2, 1)
+        x = x.permute(0, 2, 1)
         # [B, C, N, P]
-        x_enc = x_enc.unfold(dimension=-1, size=self.input_token_len, step=self.input_token_len)
-        N = x_enc.shape[2]
+        x = x.unfold(dimension=-1, size=self.input_token_len, step=self.input_token_len)
+        N = x.shape[2]
         # [B, C, N, D]
-        enc_out = self.embedding(x_enc)
+        enc_out = self.embedding(x)
         # [B, C * N, D]
         enc_out = enc_out.reshape(B, C * N, -1)
         enc_out, attns = self.encoder(enc_out, n_vars=C, n_tokens=N)
@@ -60,5 +60,5 @@ def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
         dec_out = dec_out * stdev + means
         return dec_out

-    def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
-        return self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
+    def forward(self, x, x_mark, y_mark):
+        return self.forecast(x, x_mark, y_mark)
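
The renamed forecast(x, x_mark, y_mark) keeps Moirai's padding-plus-unfold tokenization; a standalone shape check of those steps, with made-up sizes, is below.

import torch

B, L, C, P = 2, 192, 7, 96                       # assumed: batch, length, variates, input_token_len
x = torch.randn(B, L, C)
x = torch.cat([x, torch.zeros(B, P, C)], dim=1)  # append an all-zero token to forecast into
x = x.permute(0, 2, 1)                           # [B, C, L + P]
x = x.unfold(dimension=-1, size=P, step=P)       # [B, C, N, P] non-overlapping tokens
N = x.shape[2]
print(x.shape, N)                                # torch.Size([2, 7, 3, 96]) 3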

models/patchtst.py (+9, -9)

@@ -64,19 +64,19 @@ def __init__(self, configs):
         self.head_nf = configs.d_model * int((configs.seq_len - patch_len) / stride + 2)
         self.head = FlattenHead(self.head_nf, configs.test_pred_len, head_dropout=configs.dropout)

-    def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
+    def forecast(self, x, x_mark, y_mark):
         if self.use_norm:
             # Normalization from Non-stationary Transformer
-            means = x_enc.mean(1, keepdim=True).detach()
-            x_enc = x_enc - means
+            means = x.mean(1, keepdim=True).detach()
+            x = x - means
             stdev = torch.sqrt(
-                torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5)
-            x_enc /= stdev
+                torch.var(x, dim=1, keepdim=True, unbiased=False) + 1e-5)
+            x /= stdev

         # do patching and embedding
-        x_enc = x_enc.permute(0, 2, 1)
+        x = x.permute(0, 2, 1)
         # u: [bs * nvars x patch_num x d_model]
-        enc_out, n_vars = self.patch_embedding(x_enc)
+        enc_out, n_vars = self.patch_embedding(x)

         # Encoder
         # z: [bs * nvars x patch_num x d_model]
@@ -99,6 +99,6 @@ def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
             (means[:, 0, :].unsqueeze(1).repeat(1, self.pred_len, 1))
         return dec_out

-    def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
-        dec_out = self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
+    def forward(self, x, x_mark, y_mark):
+        dec_out = self.forecast(x, x_mark, y_mark)
         return dec_out[:, -self.pred_len:, :]  # [B, L, D]
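
For orientation while reading this file: head_nf above is d_model times the patch count implied by seq_len, patch_len, and stride. A quick check of that arithmetic under assumed hyperparameters:

seq_len, patch_len, stride, d_model = 96, 16, 8, 128  # assumed example hyperparameters
patch_num = int((seq_len - patch_len) / stride + 2)   # same formula as head_nf above
head_nf = d_model * patch_num
print(patch_num, head_nf)                             # 12 1536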

models/timer.py (+19, -21)

@@ -1,6 +1,6 @@
 import torch
 from torch import nn
-from layers.Transformer_EncDec import Encoder, EncoderLayer
+from layers.Transformer_EncDec import DecoderOnly, DecoderOnlyLayer
 from layers.SelfAttention_Family import FullAttention, AttentionLayer
 from layers.Embed import PositionalEmbedding

@@ -15,11 +15,9 @@ def __init__(self, configs):
         self.embedding = nn.Linear(self.input_token_len, configs.d_model, bias=False)
         self.position_embedding = PositionalEmbedding(configs.d_model)
         self.dropout = nn.Dropout(configs.dropout)
-
-        # Timer is a Decoder-only Transformer. Please refer to issue: https://github.com/thuml/Large-Time-Series-Model/issues/23
-        self.blocks = Encoder(
+        self.blocks = DecoderOnly(
             [
-                EncoderLayer(
+                DecoderOnlyLayer(
                     AttentionLayer(
                         FullAttention(True, attention_dropout=configs.dropout,
                                       output_attention=False), configs.d_model, configs.n_heads),
@@ -34,29 +32,29 @@ def __init__(self, configs):
         self.head = nn.Linear(configs.d_model, configs.output_token_len)
         self.use_norm = configs.use_norm

-    def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
+    def forecast(self, x, x_mark, y_mark):
         if self.use_norm:
-            means = x_enc.mean(1, keepdim=True).detach()
-            x_enc = x_enc - means
+            means = x.mean(1, keepdim=True).detach()
+            x = x - means
             stdev = torch.sqrt(
-                torch.var(x_enc, dim=1, keepdim=True, unbiased=False) + 1e-5)
-            x_enc /= stdev
+                torch.var(x, dim=1, keepdim=True, unbiased=False) + 1e-5)
+            x /= stdev
         # [B, L, C]
-        B, _, C = x_enc.shape
+        B, _, C = x.shape
         # [B, C, L]
-        x_enc = x_enc.permute(0, 2, 1)
+        x = x.permute(0, 2, 1)
         # [B, C, N, P]
-        x_enc = x_enc.unfold(
+        x = x.unfold(
             dimension=-1, size=self.input_token_len, step=self.input_token_len)
-        N = x_enc.shape[2]
+        N = x.shape[2]
         # [B * C, N, P]
-        x_enc = x_enc.reshape(B * C, N, -1)
+        x = x.reshape(B * C, N, -1)
         # [B * C, N, D]
-        enc_out = self.embedding(x_enc) + self.position_embedding(x_enc)
-        enc_out = self.dropout(enc_out)
-        enc_out, attns = self.blocks(enc_out)
+        embed_out = self.embedding(x) + self.position_embedding(x)
+        embed_out = self.dropout(embed_out)
+        embed_out, attns = self.blocks(embed_out)
         # [B * C, N, P]
-        dec_out = self.head(enc_out)
+        dec_out = self.head(embed_out)
         # [B, C, L]
         dec_out = dec_out.reshape(B, C, -1)
         # [B, L, C]
@@ -65,5 +63,5 @@ def forecast(self, x_enc, x_mark_enc, x_dec, x_mark_dec):
         dec_out = dec_out * stdev + means
         return dec_out

-    def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, mask=None):
-        return self.forecast(x_enc, x_mark_enc, x_dec, x_mark_dec)
+    def forward(self, x, x_mark, y_mark):
+        return self.forecast(x, x_mark, y_mark)
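
To summarize the decoder-only path timer.py now follows, here is a standalone walk-through of the tensor shapes; the sizes, the bare nn.Linear stand-ins, and the omitted attention blocks are illustrative, and only the reshapes mirror the code above.

import torch
from torch import nn

B, L, C, P, D = 2, 192, 7, 96, 64                     # assumed: batch, length, variates, token length, d_model
x = torch.randn(B, L, C)

x = x.permute(0, 2, 1)                                # [B, C, L]
x = x.unfold(dimension=-1, size=P, step=P)            # [B, C, N, P]
N = x.shape[2]                                        # N = L // P tokens per variate
x = x.reshape(B * C, N, -1)                           # [B * C, N, P]: variates folded into the batch

embedding = nn.Linear(P, D, bias=False)               # stands in for self.embedding
head = nn.Linear(D, P)                                # stands in for self.head (output_token_len == P assumed)

tokens = embedding(x)                                 # [B * C, N, D]
# ... the DecoderOnly blocks would run on `tokens` here ...
dec_out = head(tokens)                                # [B * C, N, P]
dec_out = dec_out.reshape(B, C, -1).permute(0, 2, 1)  # back to [B, L, C]
print(dec_out.shape)                                  # torch.Size([2, 192, 7])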
