Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.

Transformers — Basic Architecture¶

The Transformer architecture, introduced in the groundbreaking paper "Attention Is All You Need" in 2017, marked a fundamental shift in the design of deep learning models for sequence data. Unlike traditional Recurrent Neural Networks (RNNs) that process input sequentially, Transformers use a fully attention-based mechanism that enables parallel processing of entire sequences. This architectural innovation eliminated the inherent limitations of RNNs, such as difficulty handling long-range dependencies and slower training times due to their sequential nature.

One of the key characteristics that sets Transformers apart is the attention mechanism, which allows the model to dynamically weigh the relevance of different words in a sequence when generating representations. This enables the Transformer to capture global context effectively, regardless of the position of words in a sentence. In contrast, RNNs often struggle with maintaining context over long sequences due to vanishing gradients, and they typically require mechanisms like LSTM or GRU units to cope. Moreover, because Transformers do not rely on recurrence, they benefit greatly from GPU acceleration through parallelism, leading to more efficient training on large datasets.

Transformers have quickly become the foundation for many state-of-the-art models in natural language processing (NLP), computer vision, and even protein structure prediction. Models like BERT, GPT, T5, and Vision Transformers (ViTs) are all built on the Transformer architecture. Their success has revolutionized applications such as machine translation, text summarization, image classification, and code generation. The flexibility, scalability, and performance of Transformers have made them the go-to model for a wide range of tasks across AI and ML.

Given their dominance and versatility, understanding Transformers is essential for anyone looking to work in modern AI and ML. As the field continues to evolve, more applications are adapting the Transformer framework, not only because of its superior accuracy but also due to its adaptability across different data modalities. Whether you are interested in building chatbots, recommendation systems, or autonomous agents, a solid grasp of Transformer principles will provide a crucial foundation.

Setting up the Notebook¶

Make Required Imports¶

This notebook requires the import of different Python packages but also additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been successfully imported.

In [28]:
import torch
import torch.nn as nn
from src.models.neural.transformer import MultiHeadAttention

Preliminaries¶

Before introducing you to the Transformer architecture, there are a few preliminary comments to outline the scope of this notebook:

  • Transformers rely on important concepts such as attention, masking, and positional encodings. While these topics will be briefly covered in this notebook, we recommend checking out the separate notebooks providing a deep dive into each of these topics.

  • To make all visualizations, examples, and descriptions easier to understand, we assume that any input text is tokenized into proper words. Note that practical Transformer-based models typically rely on subword-based tokenizers (e.g., Byte-Pair Encoding, WordPiece).

With these clarifications out of the way, let's get started...

Generate Example Data¶

Throughout this notebook, we will actually implement a basic Transformer architecture step-by-step from scratch to better understand the individual components and how they work together. To this end, the code cell below creates two random tensors to mimic a machine translation task: SOURCE is the batch of sequences with the embedding vectors for all words in the source language; similarly, TARGET is the batch of sequences with the embedding vectors for all words in the target language. The size of both batches is $32$ and the length of each embedding vector is $d_{model} = 512$. In the Transformer architecture, $d_{model}$ refers to the dimensionality of the input and output embeddings as well as the hidden representations throughout the model. It defines the size of the vectors used to represent each word (more precisely: each token) in the sequence, and it remains constant across all layers of the encoder and decoder. The relevance of $d_{model}$ will be properly explained when explaining the overall architecture.

In [2]:
batch_size, d_model = 32, 512
seq_len_en, seq_len_de = 50, 60

torch.manual_seed(0)
SOURCE = torch.rand((batch_size, seq_len_en, d_model))
TARGET = torch.rand((batch_size, seq_len_de, d_model))

print(f"Shape of SOURCE tensor: {SOURCE.shape}")
print(f"Shape of TARGET tensor: {TARGET.shape}")
Shape of SOURCE tensor: torch.Size([32, 50, 512])
Shape of TARGET tensor: torch.Size([32, 60, 512])

Notice that both tensors differ with respect to the length of their sequences. This is realistic as there is no reason to assume that the same sentences but in different languages will contain the same number of words or tokens.


The Transformer Architecture¶

The figure below is taken from the original Transformer paper "Attention Is All You Need" and shows the overall Transformer architecture. The (full) Transformer is an encoder-decoder architecture — although different models may use only the encoder or only the decoder, as we will discuss at the end. If we assume an English-to-German machine translation task, the encoder and decoder serve the following main purposes:

  • Encoder: The encoder's purpose is to read and understand the input sentence in the source language and convert it into a rich, contextualized representation. It does this by processing the input tokens through multiple layers of self-attention and feedforward networks, allowing the model to capture relationships between words regardless of their positions in the sentence. This means each word's representation is informed not just by its local context but by the entire sentence, enabling the model to grasp the nuances of meaning and grammar. By the end of the encoder, the input sentence is transformed into a sequence of embeddings that encapsulate both the individual word meanings and their contextual relationships. These embeddings are then passed to the decoder.

  • Decoder: The Transformer decoder generates the output — here: the translated sentence in the target language — one word at a time, using the contextual information encoded by the encoder. The decoder takes the encoder's output and combines it with the previously generated target words to predict the next word in the translation. Each decoder layer first uses self-attention to consider the previously generated words, then cross-attention to incorporate relevant information from the encoder's output, followed again by a feedforward network. This process enables the decoder to build a grammatically correct and semantically accurate translation, ensuring that the generated sentence not only reflects the original meaning but also conforms to the structure and vocabulary of the target language.

This encoder-decoder architecture makes the Transformer a powerful and versatile model. Although Transformers were originally designed for sequential data such as language, their core mechanism — attention — is not limited to text and can model relationships between any set of elements, regardless of modality. In the case of images, an image can be divided into smaller, fixed-size patches (e.g., 16x16 pixels), and each patch is flattened and treated like a "word" in a sequence.

Both the encoder and decoder are composed of the same core building blocks which we will cover throughout this notebook.

Multi-Head Attention¶

Attention is at the heart of the Transformer architecture because it enables models to dynamically focus on the most relevant parts of the input data when making predictions. Unlike traditional sequence models that process data step by step, the attention mechanism allows Transformers to weigh the importance of all input tokens simultaneously, regardless of their position. This ability to capture long-range dependencies and contextual relationships between tokens is crucial for understanding complex structures in language and other data types.

The core idea behind attention is that not all parts of an input are equally important for a given task. By computing attention scores, the Transformer determines which tokens contribute most to a given output, enabling more flexible and efficient learning. This mechanism not only boosts performance in tasks like machine translation and text summarization but also allows for massive parallelization, making training faster and more scalable compared to recurrent architectures.

In the following, we motivate and introduce the concept of attention on a fairly high level, focusing on its intended purpose and output. To fully understand the inner workings of the attention algorithm, we recommend going through the notebook dedicated to attention in Transformers. This notebook covers all involved computations in great detail using illustrative examples.

Motivation & Basic Idea¶

Neural networks can't work directly with nominal or symbolic data like raw text because they are designed to perform mathematical operations on numerical data. Textual data — such as words or tokens — are inherently symbolic and have no inherent numerical meaning or structure that a neural network can interpret or compute on. For example, the words "cat" and "dog" are just labels; without numerical encoding, a neural network has no way to compare them or recognize patterns.

Furthermore, symbolic data lacks the continuous, differentiable structure that neural networks require for training. Neural networks learn by adjusting weights through gradient descent, which relies on calculating gradients — a process that only makes sense for numerical inputs. By converting text into numerical vectors (like embeddings), we give the model inputs it can compute with, while also preserving useful information about word meaning and relationships. This transformation is essential for enabling learning and generalization from textual data.

As a consequence, basically all neural network architectures that work with nominal or symbolic data have an embedding layer as their first layer. An embedding layer is a type of neural network layer that maps discrete input tokens, such as word indices, into continuous vectors of fixed size. It is essentially a learnable lookup table where each row corresponds to the embedding of a specific word or token in the vocabulary. When a token index is passed into the embedding layer, it returns the corresponding vector — a process similar to looking up a value in a dictionary based on a key.
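
To make this concrete, here is a minimal sketch of an embedding layer as a lookup table using PyTorch's nn.Embedding and the imports from the setup cell above. The vocabulary size, embedding dimension, and token indices are toy values chosen purely for illustration.

vocab_size_demo, d_model_demo = 10, 4                     # toy values for illustration only
embedding = nn.Embedding(vocab_size_demo, d_model_demo)   # learnable lookup table: one row per token

token_ids = torch.tensor([[2, 5, 2]])                     # a sequence containing token 2 twice
vectors = embedding(token_ids)                            # shape: (1, 3, 4)

# Both occurrences of token 2 are mapped to the identical (context-independent) vector
print(torch.equal(vectors[0, 0], vectors[0, 2]))          # True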

An embedding layer associates the same word or token with the same (learnable) embedding vector. In other words, independent of the context — such as the surrounding words or sentence(s) — the same word always maps to the same vector. To see why this may cause problems when it comes to understanding natural language, consider the following example sentence:

A light wind will make the traffic light collapse and light up in flames

Just by looking at the three occurrences of the word "light", we can see the issue that arises when encoding the same word using the same embedding vector. For one, all three occurrences of "light" serve a different syntactic function: first an adjective, then a noun, and lastly a verb. But even with respect to the same syntactic function, the same word may have (very) different meanings. For example, a traffic light is arguably a very different thing compared to a torch light — or just the noun "light" in a sentence like "I saw the light at the end of the tunnel". In short, using the same embedding vector for "light" fails to capture the word's syntactic function and proper meaning. This is where attention comes in.

Attention Head¶

On an abstract level, an attention head in the Transformer architecture is a component that takes in a sequence of embedding vectors, "somehow" modifies them based on the context (i.e., all other words in the sequence), and returns the sequence of contextualized embedding vectors as output. The figure below illustrates this idea using the example sentence from above. The color-coding is supposed to convey that the input embedding vectors for the word "light" are the same, while its output embedding vectors will be different (i.e., contextualized).

The obvious question is of course now how exactly the attention head processes the input embedding vectors such that the output vectors are more likely to capture the syntax and semantics of a word. In very simple terms, the attention head computes the alignment between each word and all other words in the same or a different sequence (incl. the word itself). This alignment between embedding vectors is calculated based on the dot product. These alignments are then used to recompute the embedding vector of a word as the weighted sum of all other word embedding vectors — where the weights derive from the strength of the alignments. Regarding the context that is used to calculate the alignment between words, there are two main variants (a small code sketch follows the list below):

  • Self-attention: In self-attention, the alignment is calculated between words in the same sequence. For example, given our sentence "A light wind will make the traffic light collapse and light up in flames", self-attention computes the alignments between all pairs of embedding vectors. Both the encoder and decoder rely on self-attention to contextualize their input embedding vectors.

  • Cross-attention: In cross-attention, the input embedding vectors are aligned with words from a different sequence. Only the decoder has a cross-attention layer to align the decoder input (e.g., the sentence in the target language) with the encoder output (e.g., the sentence in the source language).
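
To make the idea of alignment-based reweighting more tangible, the following stripped-down sketch computes pairwise dot-product alignments and the resulting weighted sums directly on our SOURCE tensor. It deliberately omits the learned query/key/value projections and other details of real attention heads, which are covered in the dedicated attention notebook; the variable names are purely illustrative.

import math

x = SOURCE[:1]                                            # a single sequence: (1, 50, 512)
scores = x @ x.transpose(-2, -1) / math.sqrt(d_model)     # pairwise alignments: (1, 50, 50)
weights = torch.softmax(scores, dim=-1)                   # normalize the alignments per word
contextualized = weights @ x                              # weighted sums of all embeddings: (1, 50, 512)

print(contextualized.shape)                               # torch.Size([1, 50, 512])

# Cross-attention only changes where the context comes from, e.g.:
# scores = TARGET[:1] @ SOURCE[:1].transpose(-2, -1) / math.sqrt(d_model)   # (1, 60, 50)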

Multi-Head Attention¶

In practice, the Transformer architecture employs multiple attention heads — where $n_{heads}$ typically denotes the number of attention heads — instead of a single one to allow the model to capture diverse relationships and information from the input sequence simultaneously. By using multiple attention heads, the Transformer essentially gains multiple "perspectives" on the input. Each head operates independently, allowing different heads to focus on different aspects of the input. For example, one head might learn to identify syntactic dependencies (e.g., subject-verb relationships), another might focus on semantic relationships (e.g., recognizing synonyms or antonyms), and yet another might identify co-reference.

The outputs from these individual attention heads are then concatenated and linearly transformed, effectively combining these diverse perspectives into a single, more comprehensive representation. The figure below illustrates the combination of multiple attention heads (here: $n_{heads} = 3$) into a so-called multi-head attention (MHA) layer. Notice how the same input sequence is given to three independent attention heads and how their output is combined to the final sequence of contextualized embedding vectors.

This ensemble-like approach not only allows the model to capture a wider range of linguistic phenomena but also makes the learning process more robust and efficient. It prevents any single attention pattern from dominating the learning, leading to a more nuanced and powerful understanding of the input sequence, which is critical for complex tasks like machine translation where intricate linguistic relationships need to be accurately captured.
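
The following sketch only illustrates the dimensionality bookkeeping behind this combination; the actual attention computations live inside the provided MultiHeadAttention class. The per-head outputs below are random stand-ins, and n_heads_demo is just an illustrative value.

n_heads_demo = 8                                  # illustrative value; the real n_heads is set below
d_head = d_model // n_heads_demo                  # 512 // 8 = 64 dimensions per head

# Random stand-ins for the per-head attention outputs
head_outputs = [torch.rand(batch_size, seq_len_en, d_head) for _ in range(n_heads_demo)]

concatenated = torch.cat(head_outputs, dim=-1)    # concatenate the heads: (32, 50, 512)
w_o = nn.Linear(d_model, d_model)                 # final linear projection
combined = w_o(concatenated)

print(combined.shape)                             # torch.Size([32, 50, 512])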

We provide the class MultiHeadAttention that implements a multi-head attention layer for different values of $d_{model}$ and $n_{heads}$. In the code cell below, we set $n_{heads} = 8$; the dimensionality of the input and output embeddings has already been set to $d_{model} = 512$ above.

In [3]:
n_heads = 8

mha = MultiHeadAttention(d_model, n_heads)

This layer accepts three tensors as input arguments. Without going into further detail here — again, you can check out the dedicated notebook about attention — the first tensor contains the input embeddings we want to contextualize; the other two tensors represent the context. This means that for self-attention, we pass the same tensor to all three arguments. The code cell below illustrates this for the self-attention layer in the encoder.

In [4]:
mha_out_self_attn = mha(SOURCE, SOURCE, SOURCE)

print(f"Shape of self-attention output: {mha_out_self_attn.shape}")
Shape of self-attention output: torch.Size([32, 50, 512])

Tensor mha_out_self_attn now contains the contextualized embedding vectors of SOURCE. Note that we do not care about the exact values but you should appreciate that the shape of the output tensor mha_out_self_attn is the same as for the input tensor SOURCE. We will see later why this is useful.

In contrast, the code cell below shows an example of using the class to compute cross-attention between different sequences. Here, we mimic the cross-attention layer in the decoder, where we want to contextualize the embedding vectors of the sequence in the target language based on the embedding vectors of the sequence in the source language. Important: In the Transformer architecture, TARGET would be the output of the self-attention layer in the decoder, and SOURCE would be the output of the encoder. We only use our input tensors SOURCE and TARGET to illustrate the difference between self-attention and cross-attention.

In [5]:
mha_out_cross_attn = mha(TARGET, SOURCE, SOURCE)

print(f"Shape of cross-attention output: {mha_out_cross_attn.shape}")
Shape of cross-attention output: torch.Size([32, 60, 512])

Again, the shape of the output tensor is the same as the shape of the input tensor.

Take-away message: In a nutshell, multi-head attention aims to capture the relationship between words through calculating the pairwise alignment — typically using the dot product of embedding vectors — between all words in either the same sequence (self-attention) or different sequences (cross-attention). These alignment values are used to recalculate the input embedding vector of each word to better reflect its context (i.e., all other words this word was aligned with). The desired goal is that these contextualized embedding vectors better capture the semantic meaning of the words; for example:

  • "A light wind": here the embedding vector for "light" should be similar to the embeddings of other adjectives such as "soft", "mild", or "weak".

  • "the traffic light": ideally, the embedding for "light" should be similar to other objects (i.e., nouns) that give of light; this may include "lamp", "street light", "lamp pole", and similar concepts.

  • "light up in flames": in this context, the embedding for "light" should arguably similar to verbs such as "ignore", "burn", or "kindle".

Again, we provide a separate notebook that covers and goes through the attention mechanism in great detail, and we strongly recommend having a look at this notebook if you really want to understand how Transformers work.

Feed-Forward Network Layer¶

The feed-forward network (FFN) layer in the Transformer architecture serves the crucial role of transforming and enriching the representations produced by the attention mechanism. While attention allows the model to gather and integrate contextual information across a sequence, it operates primarily in the space of token interactions. The feed-forward layer, applied independently to each position in the sequence, acts as a position-wise fully connected neural network that further processes these contextualized representations. It introduces nonlinearity through activation functions like ReLU or GELU and enables the model to learn more complex transformations beyond what attention alone can achieve.

The FFN typically consists of two linear transformations with the nonlinear activation function in between. The input to the FFN is the output of the preceding multi-head attention sublayer (in the decoder, this is the second, cross-attention sublayer). The first linear layer expands the dimensionality of the token representation, effectively projecting it into a higher-dimensional space where more intricate features can be extracted. The activation function then introduces non-linearity, enabling the network to learn complex patterns that cannot be captured by linear transformations alone. Finally, the second linear layer projects the representation back to its original dimensionality. Mathematically, we can therefore define the FFN as:

$$\large FFN(\mathbf{x}) = \mathbf{W}_2\,\mathrm{ReLU}(\mathbf{W}_1\mathbf{x} + \mathbf{b}_1) + \mathbf{b}_2 $$

where $\mathbf{x}$ is the input tensor, $\mathbf{W}_1$ and $\mathbf{W}_2$ are learnable weight matrices, and $\mathbf{b}_1$ and $\mathbf{b}_2$ are learnable bias vectors. Here we assume ReLU as the nonlinear activation function, as proposed in the original Transformer paper, although other activation functions are of course possible.

While the same FFN parameters (weights and biases) are applied to every token position, the transformation for each token is independent of the others. This "position-wise" nature allows for highly parallelizable computation, a key factor in the Transformer's efficiency. Essentially, the FFN acts as a local processing unit for each token, refining its representation based on the contextual information gathered by the self-attention mechanism. It adds depth and capacity to the model, allowing it to learn richer and more expressive representations of the input sequence.

The simple nature of the FFN layer also makes it very easy to implement. The code cell below implements the FFN layer as proposed in the original Transformer paper. This includes the use of the ReLU activation function between the two linear layers, as well as a default value of $d_{\mathit{ffn}} = 2,048$ for the output dimensionality of the first linear layer.

In [6]:
class FeedForwardNetwork(nn.Module):
    
    def __init__(self, d_model, d_ffn=2048):
        super().__init__()
        
        # Define basic Feed Forward Network as proposed in the original Transformer paper
        self.net = nn.Sequential(
            nn.Linear(d_model, d_ffn),
            nn.ReLU(),
            nn.Linear(d_ffn, d_model),
        )

    def forward(self, x):
        return self.net(x)

The choice of $d_{\mathit{ffn}}$ for the output dimension of the first linear transformation is related to the value of $d_{model}$ (here: $d_{model} = 512$). Typically, $d_{\mathit{ffn}} = 4\times d_{model}$ to allow the model to project input representations into a higher-dimensional space where more complex and expressive transformations can be learned. This design choice serves two main purposes:

  • Increased capacity and expressiveness: By expanding the input dimensionality (e.g., from $512$ to $2048$), the model creates a richer feature space that enables it to learn more nuanced patterns and interactions between elements of the representation. The subsequent non-linear activation function (like ReLU or GELU) operates more effectively in this expanded space, allowing the model to capture more abstract features.

  • Bottleneck structure for efficiency: After the non-linear transformation, the second linear layer projects the data back down to the original input size. This bottleneck structure is similar in spirit to architectures like autoencoders or residual networks, balancing model capacity with computational efficiency. Expanding and contracting the dimensionality allows the model to achieve high expressiveness without significantly increasing the overall size or runtime of the network.

In practice, this $4\times$ expansion has been found empirically to offer a good trade-off between performance and computational cost, and it has become a standard in many Transformer variants such as the original Vaswani et al. model and later versions like BERT and GPT.

Let's create an instance of the FeedForwardNetwork class to show its application; we go with the default argument d_ffn=2048:

In [7]:
d_ffn = 2048

ffn = FeedForwardNetwork(d_model, d_ffn=d_ffn)

print(ffn)
FeedForwardNetwork(
  (net): Sequential(
    (0): Linear(in_features=512, out_features=2048, bias=True)
    (1): ReLU()
    (2): Linear(in_features=2048, out_features=512, bias=True)
  )
)

As mentioned before — and as shown in the overall architecture (see above) — the FFN layer always receives as input the output of a multi-head attention layer; note that there are some additional steps involved, which we will cover later. Thus, as an illustration, we give one of the example outputs from our multi-head attention implementation as input to the FFN; see the code cell below.

In [8]:
ffn_out = ffn(mha_out_self_attn)

print(ffn_out.shape)
torch.Size([32, 50, 512])

Unsurprisingly, since the FFN layer maps each embedding vector first from $d_{model}$ to $d_{\mathit{ffn}}$ and then back to $d_{model}$, the shapes of the input tensor and the output tensor are again the same. The values of the embedding vectors will naturally have changed, to hopefully represent richer and more nuanced word representations.
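
As a quick sanity check of the cost of this expansion, we can count the parameters of the ffn instance created above: the two weight matrices plus their bias vectors amount to $d_{model} \times d_{\mathit{ffn}} + d_{\mathit{ffn}} + d_{\mathit{ffn}} \times d_{model} + d_{model}$ parameters.

n_params = sum(p.numel() for p in ffn.parameters())
expected = (d_model * d_ffn + d_ffn) + (d_ffn * d_model + d_model)

print(f"FFN parameters: {n_params:,} (expected: {expected:,})")
# FFN parameters: 2,099,712 (expected: 2,099,712)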

Encoder¶

With the multi-head attention layer and the feed-forward network layer, we now have the core components of both the Transformer encoder and decoder. In fact, the encoder and decoder are structurally quite similar, as we will see in the following. So let's start with the encoder.

Encoder Layer¶

A single encoder layer in the Transformer architecture is a fundamental building block that processes input sequences to capture contextual relationships between tokens. It primarily consists of two main sublayers: the multi-head self-attention mechanism and the feed-forward neural network. The figure below shows the part of the overall architecture that represents a single encoder layer.

Apart from the multi-head attention and feed-forward network layer, an encoder layer implements a series of additional concepts. We briefly outline these additional concepts below but omit a more detailed discussion here since they are not specific to the Transformer architecture; these concepts are common in many neural network architectures.

  • Layer normalization: Layer normalization is a technique used to stabilize and accelerate the training of neural networks by normalizing the inputs across the features of a single training example. It operates independently for each sample by computing the mean and variance of all features in a layer and using them to normalize the feature values. After normalization, the values are scaled and shifted using learned parameters, allowing the model to recover the original distribution if needed. The main purpose of layer normalization is to reduce internal covariate shift — the change in the distribution of inputs to a layer during training — which helps make the optimization process more stable and efficient. It is particularly useful in architectures like Transformers and recurrent neural networks, where batch sizes may vary or where dependencies exist within a single sequence rather than across batches. By ensuring consistent activation distributions, layer normalization improves convergence and performance in deep models.

  • Residual connections: Residual connections, also known as skip connections, are a fundamental architectural component in deep neural networks, notably popularized by Residual Networks (ResNets), and have since become a foundational component in models like Transformers. They work by creating a "shortcut" path that bypasses one or more layers, directly adding the input of a layer (or a block of layers) to its output. Notice in the figure above the two connections that bypass the multi-head attention layer and the feed-forward network layer. The main purpose of residual connections is to address the vanishing gradient problem and help train deep networks more effectively. By providing a shortcut for gradients to flow backward through the network, residual connections make it easier to optimize very deep architectures. They also allow layers to focus on learning small refinements (residuals) rather than complete transformations, which often leads to faster convergence and better performance.

  • Dropout: (not explicitly visualized in the figure above) Dropout is a regularization technique used in neural networks to prevent overfitting during training. It works by randomly "dropping out" (i.e., setting to zero) a fraction of the neurons in a layer during each forward pass. This means that during training, the network effectively samples from a larger ensemble of smaller networks, each with different architectures depending on which neurons are dropped at that moment. At test time, dropout is turned off, and all neurons are used, typically with their outputs scaled appropriately to maintain consistency. The main purpose of dropout is to improve the model’s generalization by reducing reliance on specific neurons or features. By forcing the network to learn redundant representations and preventing co-adaptation of neurons, dropout helps ensure that the model performs well not just on the training data, but also on unseen data. This makes it particularly effective in reducing overfitting, especially in deep or complex networks such as Transformers.

The class TransformerEncoderLayer in the code cell below implements a single encoder layer. Notice the multi-head attention block containing the multi-head attention layer itself together with its dropout and layer normalization, as well as the feed-forward network block with its own dropout and layer normalization. The residual connections are represented by the addition of the output of a layer to its input (before applying layer normalization).

In [9]:
class TransformerEncoderLayer(nn.Module):
    
    def __init__(self, d_model, n_heads, d_ffn=2048, dropout=0.1):
        super().__init__()
        
        # MultiHeadAttention block
        self.mha1 = MultiHeadAttention(d_model, n_heads)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)
        
        # FeedForwardNetwork block
        self.ffn = FeedForwardNetwork(d_model, d_ffn)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)
        
    def forward(self, source):
        # MultiHeadAttentionBlock
        out1 = self.mha1(source, source, source) # self-attention
        out1 = self.dropout1(out1)               # dropout
        out1 = self.norm1(out1 + source)         # residual connection + layer normalization
        # FeedForward block
        out2 = self.ffn(out1)                    # feed-forward network (incl. nonlinearity)
        out2 = self.dropout2(out2)               # dropout
        out2 = self.norm2(out2 + out1)           # residual connection + layer normalization
        # Return final output
        return out2

We can now create a single encoder layer based on the values for $d_{model}$ and $n_{heads}$. We use the default values for $d_{\mathit{ffn}}$ and the dropout probability.

In [10]:
encoder_layer = TransformerEncoderLayer(d_model, n_heads)

#print(encoder_layer)  # quite verbose and not very important, but feel free to uncomment to show the network architecture

Assuming our machine translation task, the encoder layer will receive the batch containing all sequences representing the sentences in the source language as input. We can implement this by simply passing the tensor SOURCE to the encoder layer. Since we already know how the multi-head attention and feed-forward network layers behave, it is not surprising that the shape of the encoder output is the same as that of the input.

In [11]:
encoder_layer_output = encoder_layer(SOURCE)

print(f"Shape of encoder layer output: {encoder_layer_output.shape}")
Shape of encoder layer output: torch.Size([32, 50, 512])

The fact that the tensor shapes remain the same has the obvious advantage that this makes it very easy to stack multiple encoder layers on top of each other.

Complete Encoder¶

The complete encoder is now simply a list of multiple encoder layers that are stacked upon each other, with $n_{layers}$ denoting the number of layers. Stacking multiple encoder layers in the Transformer architecture is crucial for its ability to learn complex representations of sequential data, such as natural language. Recall that each encoder layer outputs a sequence of contextualized word embeddings. Additional encoder layers therefore allow for a more and more refined contextualization. The main benefits are:

  • Hierarchical feature extraction: Similar to how convolutional neural networks (CNNs) learn increasingly abstract features (e.g., edges in early layers, objects in deeper layers), each successive Transformer encoder layer can process the input at a higher level of abstraction. Earlier layers might capture more superficial syntactic relationships or local dependencies between words, while deeper layers can learn more intricate semantic meanings, long-range dependencies across the entire sequence, and even relationships between phrases or clauses. This layered processing allows the model to build a richer, more nuanced understanding of the input.

  • Increased model capacity and depth: Adding more encoder layers increases the total number of parameters in the model, giving it more capacity to learn and store complex patterns. This depth is essential for tasks like machine translation or text summarization, where understanding subtle linguistic nuances and intricate relationships within long sequences is critical for high performance. The use of residual connections and layer normalization within each Transformer block helps mitigate the vanishing gradient problem, enabling the effective training of these very deep networks.

Since the encoder is really just a list of individual encoder layers, using the TransformerEncoderLayer class to implement the TransformerEncoder class is very straightforward; see the code cell below. The ModuleList class in PyTorch is a container that holds submodules (layers or blocks) in a list-like structure. The class is especially useful when you want to create a variable number of layers or blocks dynamically, such as stacking a configurable number of encoder layers in a Transformer. By using ModuleList, all layers are registered correctly, ensuring that their parameters are included during training and when saving/loading the model.

In [12]:
class TransformerEncoder(nn.Module):
    
    def __init__(self, n_layers, d_model, n_heads, d_ffn=2048, dropout=0.1):
        super().__init__()
        
        # Define n_layers (N) encoder layers
        self.layers = nn.ModuleList(
            [ TransformerEncoderLayer(d_model, n_heads, d_ffn, dropout) for _ in range(n_layers) ]
        )

    def forward(self, source):
        # Push through each encoder layer (each layer consumes the previous layer's output)
        out = source
        for l in self.layers:
            out = l(out)
        return out

For an example, let's assume $n_{layers} = 6$ and create an instance of the TransformerEncoder class.

In [13]:
n_layers = 6

encoder = TransformerEncoder(n_layers, d_model, n_heads)

#print(encoder)  # quite verbose and not very important, but feel free to uncomment to show the network architecture

Again, with respect to our machine translation task, the encoder would get the batch with the sentences in the source language as input. We can illustrate this in the code cell below by giving the encoder the SOURCE tensor as input batch.

In [14]:
encoder_output = encoder(SOURCE)

print(f"Shape of encoder output: {encoder_output.shape}")
Shape of encoder output: torch.Size([32, 50, 512])

Decoder¶

The internals and the organization of the decoder are very similar to the encoder — that is, the complete decoder is a series of stacked decoder layers. In contrast to an encoder layer, a decoder layer has added complexity to facilitate the connection with the encoder output via cross-attention. Let's look at the details.

Decoder Layer¶

The figure below shows the part of the Transformer architecture representing a single decoder layer. Like the encoder layer, the decoder layer contains a multi-head attention layer (self-attention) and a feed-forward network layer, both followed by layer normalization and featuring residual connections. The additional component is a 2nd multi-head attention layer implementing cross-attention. This layer further contextualizes the embedding vectors after the self-attention layer, but now using the encoder output as the context — recall that this means calculating the alignment between words from different sequences.

The added multi-head attention layer for the cross-attention also includes layer normalization, residual connections, as well as a dropout layer (again, not explicitly shown in the figure). In short, the encoder and decoder layers have a very similar and systematic structure. This also means that the implementation of the decoder layer is very similar to the one for the encoder layer. The code cell below implements the decoder layer as class TransformerDecoderLayer; appreciate the similarities to the class TransformerEncoderLayer, with the main difference being the 2nd multi-head attention layer.

In [15]:
class TransformerDecoderLayer(nn.Module):
    
    def __init__(self, d_model, n_heads, d_ffn=2048, dropout=0.1):
        super().__init__()
        
        # 1st MultiHeadAttention block (decoder input only)
        self.mha1 = MultiHeadAttention(d_model, n_heads)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)
        
        # 2nd MultiHeadAttention block (encoder & decoder)
        self.mha2 = MultiHeadAttention(d_model, n_heads)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

        # Feed-forward network block
        self.ff = FeedForwardNetwork(d_model, d_ffn)
        self.dropout3 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(d_model)
        
    def forward(self, target, memory):
        # 1st MultiHeadAttention block
        out1 = self.mha1(target, target, target)
        out1 = self.dropout1(out1)
        out1 = self.norm1(out1 + target)
        # 2nd MultiHeadAttention block
        out2 = self.mha2(out1, memory, memory)
        out2 = self.dropout2(out2)
        out2 = self.norm2(out2 + out1)
        # FeedForward block
        out3 = self.ff(out2)
        out3 = self.dropout3(out3)
        out3 = self.norm3(out3 + out2)
        # Return final output
        return out3

The computation of multi-head attention output using cross-attention is indicated by the line:

out2 = self.mha2(out1, memory, memory)

where out1 represents the contextualized word embeddings after the multi-head attention layer using self-attention (as the first component of the decoder layer). The memory argument represents the output of the encoder — the term "memory" is commonly used to reference the encoder output, so we simply stick with this best practice. All of the other lines are completely analogous to the implementation of the encoder layer.

To test our implementation, we first create an instance of class TransformerDecoderLayer. We use the same arguments as for the encoder layer. In principle, the number of heads $n_{heads}$ can differ between the encoder and the decoder layer, but it is very common to be consistent.

In [16]:
decoder_layer = TransformerDecoderLayer(d_model, n_heads)

#print(decoder_layer)  # quite verbose and not very important, but feel free to uncomment to show the network architecture

In contrast to the encoder layer, the decoder layer receives two inputs: (a) the target batch (e.g., the batch containing the sequences in the target language for our machine translation task), and (b) the output of the encoder. The latter we have already calculated and stored in the variable encoder_output.

In [17]:
decoder_output = decoder_layer(TARGET, encoder_output)

print(decoder_output.shape)
torch.Size([32, 60, 512])

As we have seen many times now, the shape of the output is the same as that of the input. It is only important to note that the input here refers to the input batch of the decoder layer (i.e., TARGET). Recall that the encoder output represents the context for the computation of the cross-attention.

Complete Decoder¶

Using the same arguments we saw for the encoder, the complete decoder is also implemented as a list of stacked decoder layers to enable richer and more nuanced contextualized word embedding vectors. As such, the TransformerDecoder class in the code cell below closely matches the TransformerEncoder class, the only difference again being that the forward() method now takes both the target batch and the encoder output (i.e., the argument memory) as input.

In [18]:
class TransformerDecoder(nn.Module):
    
    def __init__(self, n_layers, d_model, n_heads, d_ffn=2048, dropout=0.1):
        super().__init__()
        
        # Define num_layers (N) decoder layers
        self.layers = nn.ModuleList(
            [ TransformerDecoderLayer(d_model, n_heads, d_ffn, dropout) for _ in range(n_layers) ]
        )

    def forward(self, target, memory):
        # Push through each decoder layer
        for l in self.layers:
            target = l(target, memory)
        return target

Let's create an instance of class TransformerDecoder using the same arguments as before.

In [19]:
decoder = TransformerDecoder(n_layers, d_model, n_heads)

#print(decoder)  # quite verbose and not very important, but feel free to uncomment to show the network architecture

Like for a single layer, we can now give the complete decoder our TARGET batch and encoder_output as input...

In [20]:
decoder_output = decoder(TARGET, encoder_output)

print(decoder_output.shape)
torch.Size([32, 60, 512])

...getting as output a tensor with the same shape as TARGET. The main difference, compared to the encoder, is that the final word embedding vectors of the input batch are now contextualized based on the input batch itself (during self-attention) as well as the output of the encoder (during cross-attention).

Full Encoder-Decoder Architecture¶

With the implementation of the complete encoder and decoder, the class Transformer, implementing the full encoder-decoder architecture in the code cell below, is merely a wrapper combining the class TransformerEncoder and TransformerDecoder. In the implementation below, we assume that both the encoder and decoder always have the same number of layers $n_{layers}$, and each multi-head attention layer uses the same number of attention heads $n_{heads}$. Again, while not fundamentally required, it is a common design decision.

In [21]:
class Transformer(nn.Module):
    
    def __init__(self, n_layers, d_model, n_heads, d_ffn=2048, dropout=0.1):
        super().__init__()
        
        # Define encoder
        self.encoder = TransformerEncoder(n_layers, d_model, n_heads, d_ffn, dropout)
        
        # Define decoder
        self.decoder = TransformerDecoder(n_layers, d_model, n_heads, d_ffn, dropout)

    def forward(self, source, target):
        memory = self.encoder(source)
        return self.decoder(target, memory)

Lastly, we can test our Transformer implementation by first creating an instance of this class. You can uncomment the print() statement to better appreciate the complexity of the model after combining all core components into layers and stacking layers for the encoder and decoder.

In [22]:
transformer = Transformer(n_layers, d_model, n_heads)

#print(transformer)  # quite verbose and not very important, but feel free to uncomment to show the network architecture

Naturally, the transformer now receives only the SOURCE and TARGET batch as input since the encoder output is computed and passed to the decoder within the forward() method of the Transformer class.

In [23]:
transformer_output = transformer(SOURCE, TARGET)

print(f"Shape of transformer output: {transformer_output.shape}")
Shape of transformer output: torch.Size([32, 60, 512])

And once again, the shape of the final output of the Transformer matches the shape of the TARGET tensor, since the output now contains the final contextualized embedding vectors of the words across all sequences in the target batch.


Additional Important Components¶

In this notebook, we focused on the core components of the Transformer architecture. The figure below shows the part of the overall architecture (see above) we have covered. In fact, we covered multi-head attention on a rather high level, as the importance of attention warrants its own dedicated notebook — which you should definitely check out to get a comprehensive understanding of the Transformer architecture.

Beyond the core components, the Transformer relies on additional concepts to properly work in practice. Like with multi-head attention, these concepts will also be covered in their own notebooks; in the following we only provide a brief overview for completeness.

Positional Encodings¶

Unlike Recurrent Neural Networks (RNNs) or Convolutional Neural Networks (CNNs), the Transformer lacks any built-in notion of order. Its core component — multi-head attention — treats input words as a set rather than a sequence, meaning it processes them in parallel and without inherent sequence information. As a result, without an explicit method to encode word positions, the model would be unable to distinguish between different orderings of the same words, which is critical for understanding meaning in language and structure in other data types. Positional encodings solve this by injecting position-specific information into the input embeddings. This allows the Transformer to capture word order, dependencies, and temporal patterns — all of which are essential for tasks like translation, text generation, and sequence modeling in general.
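
As an illustration, the following is a minimal sketch of the sinusoidal positional encodings used in the original Transformer paper, added on top of our SOURCE embeddings; the dedicated notebook covers the motivation and alternatives in detail, and the function name below is our own choice.

import math

def sinusoidal_positional_encoding(seq_len, d_model):
    position = torch.arange(seq_len).unsqueeze(1)                     # (seq_len, 1)
    div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
    pe = torch.zeros(seq_len, d_model)
    pe[:, 0::2] = torch.sin(position * div_term)                      # even dimensions
    pe[:, 1::2] = torch.cos(position * div_term)                      # odd dimensions
    return pe

pe = sinusoidal_positional_encoding(seq_len_en, d_model)              # (50, 512)
source_with_positions = SOURCE + pe.unsqueeze(0)                      # broadcast over the batch

print(source_with_positions.shape)                                    # torch.Size([32, 50, 512])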

Masking¶

The general purpose of masking in a neural network is to control which parts of the input or intermediate representations are visible or relevant during computation. By selectively "masking out" certain values, such as padding tokens or future tokens in sequence generation (see below), masking ensures that the model focuses only on meaningful or valid data. This helps maintain the integrity of learning, prevents the model from making incorrect associations, and supports tasks that require structured attention, such as handling variable-length sequences or enforcing causality in language models.

Padding Masking¶

For simplicity, we assumed that all sequences in our example batches SOURCE and TARGET have the same length — for parallel processing and efficient use of GPU acceleration, it is essential to organize input data into fixed-size tensors! However, particularly in the case of text data, it is often very unlikely that all sequences have the same length. To handle sequences of different lengths efficiently in a single batch, shorter sequences are padded with a special word/token (e.g., "[PAD]") to match the length of the longest sequence. Of course, "[PAD]" does not contain any meaning itself, and it should therefore not influence the training of a Transformer model.

Padding masking is a technique used in the Transformer architecture to ensure that padding tokens do not interfere with the learning process. Since Transformers process sequences in parallel using attention mechanisms, they treat all tokens in the sequence equally unless explicitly told otherwise. Without padding masking, the model might attend to padding tokens (which contain no useful information), leading to misleading attention scores and degraded performance. Padding masking helps the Transformer ignore these tokens during attention calculations by assigning them a very low (negative infinity) attention score. This effectively prevents the model from attending to padded positions when computing contextual representations of meaningful tokens.
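
The following is a minimal sketch of this idea, assuming a hypothetical padding token id and using random stand-ins for the raw attention scores; the exact interface for passing masks into an attention implementation differs between libraries and is covered in the attention notebook.

PAD_ID = 0                                                  # hypothetical padding token id
token_ids = torch.tensor([[5, 7, 9, PAD_ID, PAD_ID]])       # one padded sequence of length 5

padding_mask = (token_ids == PAD_ID)                        # (1, 5), True at padded positions

scores = torch.rand(1, 5, 5)                                # stand-in for raw attention scores
scores = scores.masked_fill(padding_mask.unsqueeze(1), float("-inf"))
weights = torch.softmax(scores, dim=-1)                     # padded positions receive weight 0

print(weights[0, 0])                                        # the last two entries are 0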

Causal Masking¶

The common purpose of the decoder in the Transformer architecture is to generate sequences, typically in a step-by-step, autoregressive manner. In the context of text generation, this auto-regressive nature means that each generated token is conditioned on all previously generated tokens, ensuring coherence and grammatical correctness in the output. However, during training, the whole target sequences are given to the decoder as input. Without any additional consideration, the decoder would learn to predict the next word not only based on previous words but also based on subsequent (i.e., future) words. This would be misaligned with inference, since at inference time only previously predicted words are available. This misalignment between training and inference would significantly degrade the model's prediction performance.

Causal masking, also known as look-ahead masking, is a technique used in the Transformer architecture to ensure that a model generating a sequence (like in language modeling or machine translation) does not "see" or use future tokens when predicting the current token. In simpler terms, it restricts the self-attention mechanism so that each position in the sequence can only attend to itself and the positions before it—not the ones ahead. This enforces a left-to-right (causal) structure, which is essential when the model is generating output token by token. In short, causal masking ensures the training process mimics the conditions of inference time, when the model generates outputs one step at a time.
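
A minimal sketch of such a mask, again applied to random stand-in attention scores: position i may only attend to itself and earlier positions.

seq_len = 5
causal_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()

print(causal_mask)
# tensor([[False,  True,  True,  True,  True],
#         [False, False,  True,  True,  True],
#         [False, False, False,  True,  True],
#         [False, False, False, False,  True],
#         [False, False, False, False, False]])

scores = torch.rand(1, seq_len, seq_len)                    # stand-in for raw attention scores
scores = scores.masked_fill(causal_mask, float("-inf"))
weights = torch.softmax(scores, dim=-1)                     # row i only attends up to position i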

Output Handling¶

As we have seen, the final output of the decoder is the sequence of contextualized embedding vectors of the target sequence. However, depending on the exact task, some additional layers are required to convert the output into a form that can be used for training and inference. For tasks such as predicting the next word for text generation, this typically means converting the output into probabilities. For training, these probabilities are then used to compute the loss to facilitate backpropagation and parameter updates.

The figure below shows the part of the overall Transformer architecture that converts the decoder output, i.e., the contextualized word embedding vectors, into output probabilities. Here, it only shows a single linear layer, but arbitrary subnetworks containing multiple layers are conceivable. For example, nothing is stopping you from feeding the decoder output into a Recurrent Neural Network (RNN) or a Convolutional Neural Network (CNN) — whether this is indeed meaningful may depend on the task and is not really the point here. The last layer is commonly the softmax to convert the vector values into probabilities.

In the code cell below, the class Model includes the core Transformer architecture (i.e., an instance of our Transformer class) and adds a linear and softmax layer matching the figure above. For this example, we assume that we use this model for generating text, meaning we use the model to predict the next word given a current sequence — or given a predefined start symbol for predicting the very first word.

The class constructor takes as input the vocabulary size, i.e., the size of the set of possible words the model may predict. As such, the linear layer needs to project the contextualized embedding vectors from the embedding space of size $d_{model}$ to the vocabulary space. Lastly, the softmax layer converts all vector values to probabilities such that all values within the same vector sum up to $1$. The resulting tensor containing all output probabilities can then be used to compute, e.g., the Cross-Entropy Loss.

In [24]:
class Model(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        
        # Define Transformer (with some fixed parameters)
        n_layers, d_model, n_heads = 6, 512, 8
        self.transformer = Transformer(n_layers, d_model, n_heads)

        # Define final linear layer + softmax layer
        self.linear  = nn.Linear(d_model, vocab_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, source, target):
        out = self.transformer(source, target)
        return self.softmax(self.linear(out))

For a quick test, let's assume a vocabulary size of $10,000$ to create an instance of our Model class.

In [25]:
vocab_size = 10_000

model = Model(vocab_size)

As usual, during training, the model receives the source and target batch as input.

In [26]:
model_output = model(SOURCE, TARGET)

print(f"Shape of model output: {model_output.shape}")
Shape of model output: torch.Size([32, 60, 10000])

Notice how the size of the last dimension is no longer $512$ (assuming $d_{model}=512$) but $10,000$, reflecting the size of the vocabulary. We can also check for an example vector if all values now sum up to $1$, by adding up all values of the vector reflecting the first input word in the first sequence of the batch; see the code cell below.

In [27]:
print(f"Sum of all vector values: {torch.sum(model_output[0,0]).item()}")
Sum of all vector values: 1.0000001192092896

Ignoring precision issues, the sum of all values in the vector is indeed $1$ allowing us to interpret each vector value as a valid probability. As a side note, in practice, we would typically use the log softmax instead of the softmax to improve numerical stability.
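
For example, with PyTorch one would typically drop the explicit softmax layer altogether and feed the raw logits into nn.CrossEntropyLoss, which applies the log-softmax internally. The sketch below uses random stand-ins for the logits and the target token ids, purely to show the expected tensor shapes.

criterion = nn.CrossEntropyLoss()                                # applies log-softmax internally

logits = torch.rand(batch_size, seq_len_de, vocab_size)          # stand-in for the linear layer output
labels = torch.randint(0, vocab_size, (batch_size, seq_len_de))  # stand-in for the target token ids

# CrossEntropyLoss expects (N, C) logits and (N,) class indices
loss = criterion(logits.view(-1, vocab_size), labels.view(-1))
print(loss.item())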


Discussion & What's Next?¶

The Transformer architecture has become a cornerstone of modern machine learning and AI due to its remarkable ability to model complex dependencies in data through its self-attention mechanism. Originally designed for natural language processing, its success in powering models like BERT, GPT, and T5 demonstrated unparalleled performance across a wide range of NLP tasks, including translation, summarization, and question answering. Its parallelizable design, which replaces recurrence with attention, enabled efficient training on large datasets, unlocking the potential of large-scale pretraining and fine-tuning.

What truly sparked widespread interest is the transformer's versatility and scalability. Its architecture has proven to be highly adaptable across domains—from computer vision and speech to biology and robotics—by simply reinterpreting domain-specific inputs as sequences. Combined with the ability to scale to billions or even trillions of parameters, Transformers have set new benchmarks in AI capabilities, enabling breakthroughs in generative models, code completion, and multi-modal reasoning. This generality and empirical success have made Transformers a central focus of research and development, driving both academic innovation and industrial applications at an unprecedented pace.

In this section, we will go through some of the innovations and trends surrounding Transformers.

Encoder vs. Decoder vs. Both¶

The original Transformer was introduced as an encoder-decoder architecture for sequence-to-sequence tasks such as machine translation, summarization, or question answering. However, other tasks utilize only the encoder or only the decoder. These are generally referred to as encoder-only and decoder-only architectures.

Encoder-Only Architectures¶

Encoder-only Transformer architectures are models that utilize only the encoder component of the original Transformer design, without the decoder. These architectures are designed to take in an input sequence, process it using multiple self-attention layers, and output a contextualized representation of each token in the sequence. Since they do not generate new sequences like a decoder would, they are typically used for tasks that involve understanding or classifying input data, rather than producing text.

Common use cases for encoder-only Transformers include text classification, sentiment analysis, named entity recognition (NER), and semantic similarity tasks. A well-known example of an encoder-only Transformer is BERT (Bidirectional Encoder Representations from Transformers), which uses a bidirectional attention mechanism to capture context from both left and right of a given token. This bidirectional understanding makes encoder-only models especially powerful for tasks that require a deep understanding of the entire input, such as question answering or extracting information from documents. Overall, encoder-only architectures are ideal for non-generative tasks where the goal is to interpret or analyze an input rather than generate a new sequence. Their ability to model complex relationships in input data with high accuracy has made them foundational in many natural language understanding (NLU) applications.

Decoder-Only Architectures¶

Decoder-only Transformer architectures use only the decoder of the original Transformer design, adapted to function without an encoder. These models generate sequences in an autoregressive manner, meaning they predict one token at a time based on all previously generated tokens. A key feature of decoder-only models is causal masking, which prevents the model from "seeing" future tokens during training or inference, ensuring that each token is predicted using only past context. Unlike encoder-decoder models that require separate input and output sequences, decoder-only models operate on a single sequence, typically beginning with a prompt.

Common use cases for decoder-only Transformers include language modeling, text generation, chatbots, story and code generation, and question answering. Well-known examples are OpenAI's GPT models (e.g., GPT-2, GPT-3, GPT-4), which are pre-trained on large corpora of text and fine-tuned for tasks ranging from summarization to translation and dialogue. These models excel at generating coherent, contextually appropriate text and are the foundation for many large language models (LLMs) used in real-world applications today. Because they model the probability of text sequences from left to right, decoder-only Transformers are particularly effective in tasks where generation and creativity are central.
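The following cell is a minimal sketch of the causal masking at the heart of decoder-only models (the configuration is illustrative; a real GPT-style model would also add token and positional embeddings and an output projection onto the vocabulary):

In [ ]:
import torch
import torch.nn as nn

seq_len = 5

# Causal ("look-ahead") mask: position i may only attend to positions <= i.
# generate_square_subsequent_mask returns -inf above the diagonal and 0 elsewhere.
causal_mask = nn.Transformer.generate_square_subsequent_mask(seq_len)
print(causal_mask)

# A decoder-only block is essentially a self-attention layer that receives this mask.
layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
x = torch.randn(2, seq_len, 512)        # already-embedded tokens
out = layer(x, src_mask=causal_mask)    # each position only sees itself and its past
print(out.shape)                        # torch.Size([2, 5, 512])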

Encoder-Decoder Architectures¶

Encoder-decoder Transformer architectures, also known as sequence-to-sequence (seq2seq) Transformers, include both the encoder and decoder components of the original Transformer design. In this setup, the encoder processes the input sequence and generates a set of contextualized representations, which the decoder then uses — along with previously generated tokens—to produce the output sequence one token at a time. The decoder uses self-attention with causal masking to attend to its past outputs, and cross-attention to focus on relevant parts of the input sequence. This architecture is ideal for tasks that involve translating or transforming one sequence into another.

Common use cases for encoder-decoder Transformers include machine translation, text summarization, question answering, and image captioning—essentially, tasks where the model must fully understand an input sequence and generate a different, corresponding output. Models like T5 (Text-To-Text Transfer Transformer) and BART (Bidirectional and Autoregressive Transformers) are popular encoder-decoder architectures. Compared to encoder-only models (which are best for understanding tasks like classification) and decoder-only models (which are best for generative tasks based on a prompt), encoder-decoder models are optimized for scenarios where input and output sequences are both essential and often differ in form or language.

Optimized Implementations¶

Apart from missing components such as masking and positional encoding, our example implementation of the Transformer architecture is very basic, focusing on simplicity and understanding. In practice, a more optimized implementation will show much better training and inference performance. Thus, it is generally recommended to use optimized and tested implementations provided by libraries such as PyTorch or TensorFlow. For example, the PyTorch library offers the classes TransformerEncoder, TransformerDecoder, and Transformer — in fact, even the classes for the individual encoder and decoder layers are made available.
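As a minimal sketch of how these classes fit together (the configuration values mirror the small model discussed in this notebook and are otherwise illustrative), the following cell builds a full encoder-decoder Transformer with PyTorch and runs a forward pass with a causal target mask:

In [ ]:
import torch
import torch.nn as nn

# Full encoder-decoder Transformer; cross-attention between decoder and encoder
# outputs is handled internally by the module.
model = nn.Transformer(d_model=512, nhead=8,
                       num_encoder_layers=6, num_decoder_layers=6,
                       batch_first=True)

src = torch.randn(2, 12, 512)   # embedded source sequence (e.g., sentence to translate)
tgt = torch.randn(2, 7, 512)    # embedded target sequence generated so far

# Causal mask so the decoder cannot peek at future target tokens
tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1))

out = model(src, tgt, tgt_mask=tgt_mask)
print(out.shape)  # torch.Size([2, 7, 512])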

An efficient implementation of a Transformer model becomes particularly important when scaling up the architecture in terms of the number of encoder/decoder layers, the number of attention heads, and the size of the input and output embeddings. With $n_{layers} = 6$, $n_{heads} = 8$, and $d_{model} = 512$, our examples above were arguably on the smaller side. The tables below show, for different popular Transformer architectures — organized into encoder-only, decoder-only, and encoder-decoder architectures — their respective values for $n_{layers}$, $n_{heads}$, $d_{model}$, as well as the resulting total number of trainable parameters $n_{params}$ ("?" indicates that the architecture details have not been publicly disclosed, e.g., GPT-4).

Encoder-only Architectures (e.g., for classification, retrieval, embeddings)

| Model | $n_{layers}$ | $n_{heads}$ | $d_{model}$ | $n_{params}$ |
| --- | --- | --- | --- | --- |
| BERT Base | 12 | 12 | 768 | ~110M |
| BERT Large | 24 | 16 | 1024 | ~340M |
| RoBERTa Base | 12 | 12 | 768 | ~125M |
| RoBERTa Large | 24 | 16 | 1024 | ~355M |
| DistilBERT | 6 | 12 | 768 | ~66M |
| DeBERTa V3 Base | 12 | 12 | 768 | ~184M |

Decoder-only Architectures (e.g., for language modeling, text generation)

| Model | $n_{layers}$ | $n_{heads}$ | $d_{model}$ | $n_{params}$ |
| --- | --- | --- | --- | --- |
| GPT-2 Small | 12 | 12 | 768 | ~117M |
| GPT-2 Medium | 24 | 16 | 1024 | ~345M |
| GPT-2 Large | 36 | 20 | 1280 | ~774M |
| GPT-3 (175B) | 96 | 96 | 12288 | ~175B |
| GPT-3.5 (Turbo) | ? | ? | ? | ~154B (estimated) |
| GPT-4 (OpenAI) | ? | ? | ? | >1T? (speculative) |
| LLaMA 2 7B | 32 | 32 | 4096 | ~7B |
| LLaMA 2 13B | 40 | 40 | 5120 | ~13B |
| LLaMA 2 70B | 80 | 64 | 8192 | ~70B |
| Mistral 7B | 32 | 32 | 4096 | ~7.3B |

Encoder-Decoder Architectures (e.g., for translation, summarization)

| Model | $n_{layers}$ | $n_{heads}$ | $d_{model}$ | $n_{params}$ |
| --- | --- | --- | --- | --- |
| T5 Small | 6 / 6 | 8 | 512 | ~60M |
| T5 Base | 12 / 12 | 12 | 768 | ~220M |
| T5 Large | 24 / 24 | 16 | 1024 | ~770M |
| T5-11B | 24 / 24 | 128 | 1024 | ~11B |
| BART Base | 6 / 6 | 12 | 768 | ~140M |
| BART Large | 12 / 12 | 16 | 1024 | ~400M |
| mBART50 | 12 / 12 | 16 | 1024 | ~610M |
| Pegasus | 16 / 16 | 16 | 1024 | ~568M |

Notice that all encoder-decoder architectures listed here use the same number of layers for the encoder and the decoder (indicated as encoder / decoder in the table), as well as the same number of attention heads in both components.
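To relate our small configuration to the parameter counts in the tables above, the following cell instantiates PyTorch's nn.Transformer with $n_{layers} = 6$, $n_{heads} = 8$, and $d_{model} = 512$ and counts its trainable parameters. Note that this excludes the token embeddings and the output projection, so it only serves as a rough point of comparison:

In [ ]:
import torch.nn as nn

# Small configuration used in this notebook (feed-forward dimension left at its default)
model = nn.Transformer(d_model=512, nhead=8,
                       num_encoder_layers=6, num_decoder_layers=6)

# Sum up the number of elements of all trainable weight tensors
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {n_params / 1e6:.1f}M")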

Variants & Extensions¶

In this notebook, we focused on the architecture as presented in the original Transformer paper "Attention is all you Need". However, over the years, the Transformer architecture has become more of a family of architectures, with different variants incorporating minor or major tweaks to the originally proposed design. Below, we outline some important modifications of the original Transformer that have been proposed. However, given the relevance of this architecture, this is an active field of research, and new tweaks continue to be introduced.

Different Layers & Architectures¶

Since the original Transformer architecture was introduced, researchers have made numerous advances to its core components to improve efficiency, scalability, and performance across tasks and domains. One key area of innovation has been in the attention mechanism itself; see more details below. In the feed-forward network sublayers, researchers have experimented with alternative activation functions beyond the original ReLU or GELU. For example, Swish, SiLU, and Gated Linear Units (GLU) have shown improved expressiveness and gradient flow. Some architectures, like the SwiGLU activation used in PaLM, combine activation and gating for better learning dynamics. These changes often lead to modest but consistent gains in model quality, especially in large-scale pretraining scenarios.
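As an illustration of such a gated feed-forward sublayer, here is a minimal SwiGLU-style module (the class name and dimensions are our own choices for this sketch and are not taken from any particular paper):

In [ ]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SwiGLUFeedForward(nn.Module):
    """Illustrative SwiGLU-style gated feed-forward sublayer."""
    def __init__(self, d_model=512, d_ff=2048):
        super().__init__()
        self.w_gate = nn.Linear(d_model, d_ff, bias=False)
        self.w_up = nn.Linear(d_model, d_ff, bias=False)
        self.w_down = nn.Linear(d_ff, d_model, bias=False)

    def forward(self, x):
        # SiLU (a.k.a. Swish) activation gates the second projection element-wise
        return self.w_down(F.silu(self.w_gate(x)) * self.w_up(x))

ffn = SwiGLUFeedForward()
print(ffn(torch.randn(2, 10, 512)).shape)  # torch.Size([2, 10, 512])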

Perhaps one of the most impactful architectural advances is the introduction of **Mixture-of-Experts (MoE)** layers, where only a subset of expert subnetworks (or feed-forward layers) are activated per input. Used in models like GShard, Switch Transformer, and GPT-4, this approach allows models to scale to trillions of parameters while keeping per-example computation relatively constant. MoE allows Transformers to balance capacity and efficiency by selectively routing input tokens to specialized expert layers, effectively increasing model expressiveness without a proportional increase in compute cost. Together, these innovations reflect an ongoing effort to make Transformers more powerful, efficient, and adaptable to a wider range of tasks.

Multi-Head Attention Variants¶

The multi-head attention mechanism has revolutionized sequence modeling by allowing the model to jointly attend to information from different representation subspaces, generating rich and nuanced contextual word embedding vectors. However, from a computational perspective, multi-head attention also poses several challenges, particularly for large to extremely large Transformer-based models — most importantly:

  • Quadratic complexity: The attention mechanism computes the alignment between all pairs of words in a sequence (assuming self-attention). Thus, given a sequence of length $L$, the number of required operations as well as the required memory grows as $O(L^2)$. This is a non-trivial problem for applications that have to deal with (very) long sequences (e.g., language modeling, summarization, machine translation).

  • Redundant computations: The autoregressive decoder takes the current input sequence to predict the next word. The decoder then appends this newly predicted word to the input sequence and uses this new sequence as input to predict the following word. Since attention computes the alignment between all pairs of words — including the "old" words from previous iterations — the multi-head attention layer will perform many of the same computations in each iteration.

Therefore, different variants of the multi-head attention mechanism have been introduced to address some of these issues; here is a brief overview of some of them:

KV Caching¶

KV caching (short for Key-Value caching) is an optimization technique used in the Transformer decoder, especially during inference or generation. It reuses the key and value tensors that have already been computed for each attention layer, instead of recalculating them at every time step. This is particularly relevant in autoregressive generation — as in GPT models — where words are generated one at a time and each new word depends on the previously generated ones. Using KV caching, the model stores the keys and values computed at earlier steps and simply appends the key and value of the newly generated token in each iteration of the output generation. This allows the attention mechanism to reuse the old context efficiently.

KV caching is crucial for scaling large language models in production settings. It reduces memory bandwidth usage, speeds up generation (especially for long sequences), and makes real-time or low-latency applications — like chatbots, code completion, and text summarization — feasible. Without KV caching, autoregressive decoding would be prohibitively slow for large models and long outputs, which would significantly hamper the user experience when prompting these models.
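The following cell sketches the idea for a single attention head with illustrative weight matrices and shapes; real implementations cache keys and values per layer and per head, and batch these operations:

In [ ]:
import torch
import torch.nn.functional as F

# Minimal single-head sketch of KV caching during autoregressive decoding
d_model = 64
W_q = torch.randn(d_model, d_model)
W_k = torch.randn(d_model, d_model)
W_v = torch.randn(d_model, d_model)

k_cache, v_cache = [], []

def decode_step(x_t):
    """x_t: embedding of the newly generated token, shape (1, d_model)."""
    q = x_t @ W_q                  # query for the new token only
    k_cache.append(x_t @ W_k)      # append the new key/value instead of recomputing all of them
    v_cache.append(x_t @ W_v)
    K = torch.cat(k_cache, dim=0)  # keys of all tokens generated so far
    V = torch.cat(v_cache, dim=0)
    scores = (q @ K.T) / d_model ** 0.5
    return F.softmax(scores, dim=-1) @ V   # attention output for the new token

for t in range(5):                 # simulate generating five tokens
    out = decode_step(torch.randn(1, d_model))
print(out.shape, "cache length:", len(k_cache))  # torch.Size([1, 64]) cache length: 5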

Avoiding quadratic complexity¶

Several strategies have been proposed that approximate or redesign the attention mechanism to reduce the computations involved in multi-head attention to (typically) linear or sub-quadratic complexity. One common strategy is sparse attention, where each word only attends to a subset of the input sequence instead of all words. Examples include Longformer, BigBird, and the Sparse Transformer, which use patterns like local windows, strided attention, and global tokens. These reduce the number of attention computations per word, often bringing the overall complexity down to $O(L)$ or $O(L\log{L})$. This works well for tasks where locality or structured sparsity aligns with the input (e.g., long documents or structured data); a minimal sketch of a local-window mask is shown below.
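In the additive-mask convention used by many attention implementations, disallowed positions are set to negative infinity so that they receive zero weight after the softmax. The sequence length and window size below are arbitrary:

In [ ]:
import torch

# Illustrative local-window ("sliding window") attention mask:
# each position may only attend to positions within +/- window of itself.
seq_len, window = 8, 2
idx = torch.arange(seq_len)
allowed = (idx[None, :] - idx[:, None]).abs() <= window  # True where attention is allowed

# Additive mask: 0 where attention is allowed, -inf elsewhere
mask = torch.zeros(seq_len, seq_len).masked_fill(~allowed, float("-inf"))
print(allowed.int())
# Only O(seq_len * window) entries are attended to instead of O(seq_len^2)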

Another popular approach is low-rank or projected attention, exemplified by Linformer. Instead of computing the full attention matrix, the idea is to project the key and value matrices to a lower-dimensional representation along the sequence-length dimension. This assumes that attention maps lie in a low-rank subspace, reducing the complexity from $O(L^2)$ to $O(L)$ while maintaining reasonable accuracy. This is especially useful when attention patterns are redundant or compressible.

A third major category includes kernelized or linearized attention mechanisms, like the Performer and Linear Transformers. These reformulate attention using kernel approximations that allow computation in linear time with respect to the sequence length. They bypass the explicit construction of the matrix containing all alignment scores by exploiting mathematical properties such as the associativity of matrix multiplication or the kernel trick. While these methods often work well for long-range dependencies, they sometimes suffer from reduced accuracy compared to full attention, especially on shorter or complex sequences.

Multi-head latent attention (MLA), introduced in the DeepSeek-V2 paper (and further refined in DeepSeek-V3), addresses the memory bottleneck associated with the key-value (KV) cache in traditional multi-head attention. MLA tackles this by introducing a low-rank factorization, or compression, of the inputs to the attention operation. To this end, it projects the input into a much lower-dimensional "latent" space, and this compressed representation is what gets stored in the KV cache. MLA thereby reduces the size of the KV cache, enables longer contexts, and allows for faster and more efficient inference.

Together, these strategies provide a toolbox for scaling attention to longer contexts, enabling applications in document modeling, genomics, real-time processing, and more. Each approach trades off accuracy, hardware efficiency, and general applicability, so the right choice often depends on the use case.

Positional Encoding Strategies¶

The original Transformer paper "Attention is all you Need" proposed fixed and absolute positional encodings using sine and cosine functions to instill positional information into the learning process. Since then, many alternative and improved strategies have been proposed to enhance flexibility, scalability, and task performance. Here are some popular categories and strategies that go beyond the basic sinusoidal approach:

  • Learned Positional Embeddings: Instead of using fixed sine and cosine values, an embedding vector is learned for each position in the sequence during model training. Learned positional encodings can be more expressive and better adapt to specific domains. However, these embeddings are limited to the maximum position seen during training and generalize poorly to longer sequences.

  • Relative Positional Encodings: Instead of encoding the absolute position of words, the goal here is to encode the distance between words. This is especially helpful in tasks like language modeling where relative position (e.g., nearby context) matters more than absolute position. Relative encodings also support longer sequences and better generalization. Since they consider the relationship between words, they are typically integrated directly into the attention mechanism.

  • Rotary Positional Embedding (RoPE): RoPE encodes positions by rotating the query and key vectors of the attention mechanism by an angle that depends on the position. This keeps the distance-preserving properties of sinusoidal encodings but also allows for extrapolation to unseen sequence lengths. As such, RoPE is well suited for very long or streaming inputs.

  • Attention with Linear Biases (ALiBi): This approach adds a linear bias to the alignment scores based on the distance between words, encouraging nearby words to have a stronger influence. ALiBi is simple and memory-efficient, and it supports extrapolation to longer sequences without increasing the parameter count. Another advantage is that ALiBi can be easily integrated into the standard attention mechanism; a small sketch of such a bias is shown after this list.

Please note that this list is not comprehensive; other encoding strategies have also been proposed.
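Here is a minimal sketch of an ALiBi-style bias for a single attention head in the causal setting (the slope value is arbitrary; ALiBi assigns each head a slope from a fixed geometric series):

In [ ]:
import torch

# Illustrative ALiBi-style bias for a single attention head (causal setting)
seq_len, slope = 6, 0.5
i = torch.arange(seq_len)[:, None]   # query positions
j = torch.arange(seq_len)[None, :]   # key positions

bias = -slope * (i - j).clamp(min=0).float()   # penalize attention to distant past tokens
bias = bias.masked_fill(j > i, float("-inf"))  # causal mask: no access to future tokens
print(bias)
# This bias is simply added to the raw alignment scores before the softmax,
# so no positional embeddings need to be added to the token embeddings.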

Beyond Text¶

While the Transformer was initially designed for text, its core mechanism — learning dependencies between elements in a sequence—proved to be remarkably general. Researchers soon began adapting the architecture to other domains with sequential or structured data. In computer vision, the introduction of the Vision Transformer (ViT) demonstrated that Transformers could rival and even outperform convolutional neural networks (CNNs) when applied to image patches as sequences. This shift led to a wave of Transformer-based models in vision tasks such as image classification, object detection, and segmentation.

In the audio and speech processing domain, Transformers have been used for tasks like speech recognition, music generation, and audio event classification. Models such as wav2vec and Speech-Transformer exploit the attention mechanism’s ability to model temporal dependencies in audio sequences. Likewise, in time-series forecasting—including applications in finance, weather prediction, and healthcare—Transformers provide a powerful alternative to traditional statistical models by learning complex patterns over time.

Transformers have also made significant inroads into scientific domains, such as bioinformatics, chemistry, and physics. For example, in protein structure prediction, models like AlphaFold use attention mechanisms to model interactions between amino acids. In drug discovery and molecular modeling, Transformers can process chemical sequences or graphs to predict molecular properties and interactions. Additionally, in reinforcement learning and robotics, Transformer-based architectures have been used to model trajectories, plan actions, and encode memory over long episodes.

The adaptability of the Transformer lies in its ability to model sets, sequences, and structured relationships across many types of data—not just language. This versatility, combined with scalable performance and increasing computational resources, has made the Transformer a foundational architecture across machine learning disciplines, far beyond its original NLP roots.


Summary¶

The Transformer architecture revolutionized the field of machine learning and natural language processing (NLP). Unlike previous sequence models like RNNs or LSTMs, Transformers rely entirely on a mechanism called attention to process input sequences in parallel, rather than sequentially. This innovation not only improved training efficiency but also enabled models to capture long-range dependencies more effectively. The Transformer consists of an encoder-decoder structure, where the encoder processes the input and the decoder generates the output. Both components are built from layers that include multi-head self-attention, feed-forward networks, residual connections, and layer normalization.

One of the key innovations of the Transformer is its multi-head attention, which allows the model to focus on different parts of a sequence simultaneously, learning a richer representation of the input. By stacking multiple encoder (or decoder) layers, the model can extract hierarchical features, building deep contextual understanding. This has made Transformers especially powerful for tasks like translation, summarization, question answering, and many others. The architecture also incorporates positional encoding to retain information about the order of tokens, since it does not process sequences in order by default.

The impact of Transformers has extended far beyond NLP. They have become the foundation for large language models (LLMs) like BERT, GPT, T5, and many others, enabling breakthroughs in zero-shot learning, code generation, and even image and protein modeling. Transformer variants like Vision Transformers (ViTs) have also been successfully applied to computer vision tasks, challenging the dominance of convolutional neural networks (CNNs). Their scalability and parallelism make them ideal for training on large datasets and deploying in real-world AI applications.

Because of their widespread use and transformative impact, having a strong understanding of the Transformer architecture is essential for modern AI practitioners and researchers. Understanding how Transformers work — especially components like attention mechanisms, encoder-decoder layers, and residual connections — helps in interpreting model behavior, debugging issues, customizing architectures, and innovating new applications. As AI continues to evolve, knowledge of Transformers serves as a foundational skill for engaging with cutting-edge technologies and shaping the future of intelligent systems.

In [ ]: