Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.
Machine Translation with Transformers¶
In this notebook, we will build and train a machine translation model using the Transformer architecture entirely from scratch, using only PyTorch and Python's standard libraries. Rather than relying on high-level frameworks or pre-built models, we will implement each component step by step — from data preprocessing and tokenization, to positional encodings, multi-head attention, encoder–decoder blocks, and finally the training loop itself. This hands-on approach allows us to deeply understand how modern sequence-to-sequence neural models operate internally.
The goal of this notebook is not to develop a state-of-the-art machine translation system. Instead, the focus is on maximizing the learning experience by exposing every detail of the modeling and training process. For this reason, we intentionally work with a relatively simple dataset: pairs of image captions drawn from a multilingual captioning corpus. Although these captions are short and limited in vocabulary, they provide a clean and practical setting in which we can explore the mechanics of training a translation model without the heavy computational and engineering overhead of industrial-scale datasets.
By training the model on this small dataset, we will walk through all stages of the workflow: preparing batches, constructing masks, defining the loss function, computing gradients, and monitoring training progress. While the final model will not match the accuracy of large pretrained systems, the experience of constructing it piece by piece lays the foundation for understanding more advanced techniques and applying them confidently in real-world scenarios.
Setting up the Notebook¶
Make Required Imports¶
This notebook requires the import of different Python packages as well as additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been successfully imported.
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import Tensor
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from src.utils.compute.gpu import *
from src.utils.data.files import *
from src.text.vectorizing.vocab import *
from src.utils.plotting.nn import *
We use spaCy for tokenizing captions. Note that we need $2$ tokenizers since we are dealing with $2$ different languages.
import spacy
SRC_LANGUAGE = 'DE'
TGT_LANGUAGE = 'EN'
nlp = {
SRC_LANGUAGE: spacy.blank(SRC_LANGUAGE),
TGT_LANGUAGE: spacy.blank(TGT_LANGUAGE)
}
multi30k_zip, target_path = download_dataset("text/corpora/translation/multi30k-de-en.zip")
File 'data/datasets/text/corpora/translation/multi30k-de-en.zip' already exists (use 'overwrite=True' to overwrite it).
We also need to decompress the archive file.
multi30k = decompress_file(multi30k_zip, target_path=target_path)
print(multi30k)
['data/datasets/text/corpora/translation/multi30k-de-en.csv']
Checking & Setting Computing Device¶
PyTorch allows training neural networks on a supported GPU to significantly speed up the training process. If you have a supported GPU, feel free to utilize it. However, for this notebook it's certainly not needed, as our dataset is small and our network model is very simple. We provide an auxiliary method to automatically select the best device. It checks if a supported GPU is available and, if so, uses it as the preferred device.
# Select preferred device (GPU, if available; CPU otherwise); you can enforce the use of the CPU
DEVICE = select_device(force_cpu=False)
print("Available device: {}".format(DEVICE))
Available device: cuda:0
Preliminaries¶
Before checking out this notebook, please consider the following:
This notebook uses a rather simple and small dataset containing German and English image captions for the training, as the purpose is not to train a state-of-the-art model. All the data fits into memory, so no more sophisticated data handling strategies are needed.
Apart from PyTorch, spaCy (for tokenization), and pandas (for loading the data), we purposefully avoid the use of any specialized libraries and rely only on classes and methods provided by Python and its standard library.
Dataset Preparation¶
In this notebook, we use a simplified version of the Multi30k Dataset. This dataset is a multilingual extension of the Flickr30k image-caption corpus, designed primarily for research in multimodal and multilingual machine translation. It contains tens of thousands of images paired with captions in multiple languages, most notably English, German, and French. The original Flickr30k dataset provides 31,014 images, each with five descriptive English captions. Multi30k builds on this by adding professionally translated captions, independently written descriptions, and other language variations, making it a rich resource for studying how visual information and multilingual text interact.
Although Multi30k is a multimodal dataset, it can be used very effectively for text-only machine translation because it provides high-quality parallel caption pairs — most commonly English $\leftrightarrow$ German and, in extended versions, English $\leftrightarrow$ French. Each image in the dataset comes with professionally translated captions as well as independently written descriptions in other languages. For text-only translation models, we can simply ignore the images and treat the caption pairs as a standard parallel corpus: each English caption becomes a source sentence, and its German (or French) counterpart becomes the target, or vice versa.
Image captions are generally short, concrete, and descriptive, which makes them easier to translate than broad, open-domain text. Captions typically focus on visible, literal content — objects, people, actions, colors, and settings — rather than abstract concepts or nuanced reasoning. Because of this, they contain fewer idioms, cultural references, or domain-specific terminology that might require specialized knowledge. Their sentence structures also tend to be simple and predictable, often following straightforward patterns like "A man riding a bike" or "Two dogs playing in the park", which reduces grammatical ambiguity. This makes this dataset very suitable for our purpose here.
Loading Data from File¶
For ease of use, we provide a simplified version of the Multi30k dataset which has all images removed and contains only the English and German captions, all in a single .csv file. Let's load this file into a pandas DataFrame and look at the first few rows.
df = pd.read_csv(multi30k[0])
df.head()
| | PART | DE | EN |
|---|---|---|---|
| 0 | TRAIN | Zwei junge weiße Männer sind im Freien in der ... | Two young, White males are outside near many b... |
| 1 | TRAIN | Mehrere Männer mit Schutzhelmen bedienen ein A... | Several men in hard hats are operating a giant... |
| 2 | TRAIN | Ein kleines Mädchen klettert in ein Spielhaus ... | A little girl climbing into a wooden playhouse. |
| 3 | TRAIN | Ein Mann in einem blauen Hemd steht auf einer ... | A man in a blue shirt is standing on a ladder ... |
| 4 | TRAIN | Zwei Männer stehen am Herd und bereiten Essen zu. | Two men are at the stove preparing food. |
Apart from the German captions (Column DE) and the corresponding English captions (Column EN), the DataFrame also has a Column PART which indicates if the caption pairs are part of the training, validation, or test set. We can use this information to create separate DataFrames for the training, validation, and test captions.
df_train = df[(df["PART"] == "TRAIN")]
df_test = df[(df["PART"] == "TEST")]
df_val = df[(df["PART"] == "VAL")]
print(f"Size of training set: {len(df_train)}")
print(f"Size of validation set: {len(df_val)}")
print(f"Size of test set: {len(df_test)}")
Size of training set: 29000
Size of validation set: 1014
Size of test set: 1000
Create Vocabularies¶
We first need to create the two vocabularies for our source and target language. Neural networks operate on numerical representations, not raw text, so a vocabulary is needed to map words or subword units to integer IDs that the model can process. The vocabulary defines the set of tokens (i.e., words, punctuation marks) and ensures that each symbol is consistently associated with a numerical index — which is then later used to look up the word embeddings that eventually form the input to the neural network. Without a vocabulary, the model would have no systematic way to convert text into a numeric format suitable for computation.
A well-designed vocabulary is also crucial for controlling the model's capacity, generalization, and efficiency. If the vocabulary is too large, the embedding matrix and output layer become huge, slowing down training and increasing memory requirements. If it's too small, the model must represent many rare words with generic or overly fragmented subword units, which can harm translation quality or language understanding. Vocabularies are therefore often created based on subword-based tokenization methods (e.g., Byte-Pair Encoding, WordPiece) to strike a balance between expressiveness and efficiency.
However, since captions are typically short, simple, and tend to follow straightforward patterns, using "traditional" word-based tokenization works perfectly fine here. This has the advantage that we do not need to load pretrained subword tokenization models in this notebook. Furthermore, we can ignore any additional considerations such as, for example, removing very rare words to somehow restrict the overall size of both vocabularies.
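To get a quick feel for what the tokenizer produces, we can run it on a single made-up caption (a hypothetical example sentence, not taken from the dataset); the blank spaCy pipeline simply splits the string into words and punctuation marks.
# Minimal sketch: tokenize one hypothetical German caption with the blank spaCy pipeline
example_caption = "Zwei Hunde spielen im Park ."
print([t.text for t in nlp[SRC_LANGUAGE](example_caption)])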
In short, to create our vocabularies, we simply need to go through all captions in the source and target language, tokenize each caption (here using the spaCy tokenizer), and add the tokens to a set to get the final sets of unique tokens for both languages. The code cell below implements this step.
tokens = {SRC_LANGUAGE: set(), TGT_LANGUAGE: set()}
for index, row in tqdm(df_train.iterrows(), total=len(df_train), desc=f"Processing rows"):
for lang in [SRC_LANGUAGE, TGT_LANGUAGE]:
tokens[lang].update([ t.text for t in nlp[lang](row[lang]) ])
Processing rows: 100%|████████████████████████████████████████████████████████████████████████| 29000/29000 [00:09<00:00, 2922.76it/s]
Based on the two sets of unique tokens, we can now create both vocabularies, i.e., the mapping between all unique words and their respective indices. To this end, we use the Vocabulary class we provide in the auxiliary file src/text/vectorizing/vocab.py. Under the hood, this class implements two mappings to convert tokens into their unique index, and vice versa. The class takes as input the list of unique words, as well as optionally a list of special tokens. These special tokens play essential roles in preparing text for a machine translation (MT) model and ensuring the model can learn and operate correctly. Each token has a specific purpose:
<UNK> (unknown token): Represents words that are not in the vocabulary. This serves as a fallback mechanism that lets the model handle the case where a word is seen during inference but was not part of the training data.
<PAD> (padding token): Neural models typically process sentences in batches, where sequences must have equal length. <PAD> is used to fill shorter sequences up to the length of the longest sequence in the batch. Importantly, the model is trained to ignore padding positions using attention masks, ensuring that padding does not affect predictions (see later).
<SOS> (start of sentence): Machine translation models (especially encoder-decoder architectures) need a clear signal to start generating a translated sentence. <SOS> marks the beginning of decoding, allowing the model to initiate the generation process from a known state.
<EOS> (end of sentence): This token tells the model when to stop generating tokens, preventing infinite or excessively long outputs. During training, <EOS> helps the model learn the expected sequence boundaries, and during inference it ensures clean and complete translations.
To explicitly tell the vocabulary how to handle unseen words, i.e., which index to give the unseen words, we set the default index to the index of the <UNK> token. Since we later need the index of these special tokens, we also retrieve them from the vocabulary and define them as constants. Note that we have to consider only one language since the special tokens will have the same indices in both vocabularies.
# Define list of special tokens
TOKEN_UNK, TOKEN_PAD, TOKEN_SOS, TOKEN_EOS = "<UNK>", "<PAD>", "<SOS>", "<EOS>"
SPECIAL_TOKENS = [TOKEN_UNK, TOKEN_PAD, TOKEN_SOS, TOKEN_EOS]
vocabulary = {}
for lang in [SRC_LANGUAGE, TGT_LANGUAGE]:
# Create vocabulary
vocabulary[lang] = Vocabulary(list(tokens[lang]), special_tokens=SPECIAL_TOKENS)
# Set the default index to handle unknown words
vocabulary[lang].set_default_index(vocabulary[lang][TOKEN_UNK])
IDX_UNK = vocabulary[SRC_LANGUAGE].token2index[TOKEN_UNK]
IDX_PAD = vocabulary[SRC_LANGUAGE].token2index[TOKEN_PAD]
IDX_SOS = vocabulary[SRC_LANGUAGE].token2index[TOKEN_SOS]
IDX_EOS = vocabulary[SRC_LANGUAGE].token2index[TOKEN_EOS]
print(f"Size of {SRC_LANGUAGE} vocabulary: {len(vocabulary[SRC_LANGUAGE])}")
print(f"Size of {TGT_LANGUAGE} vocabulary: {len(vocabulary[TGT_LANGUAGE])}")
Size of DE vocabulary: 19214
Size of EN vocabulary: 10837
Side note: Notice that the German vocabulary is noticeably larger than the English vocabulary. This is because German forms words more productively and flexibly, especially through compound word formation. In German, it is common to join multiple nouns together to create new, highly specific terms (e.g., Krankenhausverwaltungssystem). These compounds may appear frequently enough in a dataset to be treated as separate vocabulary entries, causing the raw vocabulary size to grow. English does not form compounds as aggressively; instead, it tends to use multi-word expressions (hospital management system) rather than a single long token. Additionally, German has richer morphology than English. Nouns inflect for gender and case (e.g., der Hund, des Hundes, dem Hund), verbs conjugate with more surface variation, and adjectives can change form depending on case, number, and gender. Each of these variants may appear as a distinct token in a word-level vocabulary. English morphology is comparatively simpler, meaning fewer distinct word forms are needed to express the same meanings. Together, these linguistic properties naturally lead to a larger vocabulary in German corpora.
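As a quick sanity check of the default-index mechanism, we can encode a short token list that contains a made-up word (a hypothetical example); the unseen word should be mapped to IDX_UNK, while known words receive their regular indices.
# Sanity check: an unseen (made-up) word should be mapped to the <UNK> index
print(vocabulary[SRC_LANGUAGE].encode(["Ein", "Quietscheentchenwettrennen"]))
print(f"Index of <UNK> token: {IDX_UNK}")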
Batch Collation¶
We already talked about how we have to bring our captions into a format that can serve as valid input for the neural network model. This particularly includes converting the captions (i.e., sequences of tokens) into their corresponding sequences of token indices using the vocabularies we have just created. There are two common strategies to implement this conversion from tokens to token indices. Firstly, we can treat the conversion as a dedicated preprocessing step, convert the whole dataset once, and then work with a dataset that contains only token indices. Or secondly — and this is what we do in the following — we convert batches of captions "on the fly" during training.
During training, we will iterate over all training sample pairs (i.e., caption pairs) in their basic string representation and form batches for more efficient training. To convert all strings into sequences of token indices and generate valid batches, we need to define a collate method collate_fn() that implements this task.
In the context of machine learning, data collation refers to the process of combining and organizing individual data samples or instances into a structured format that can be used for training, validation, or testing of machine learning models. Data collation typically involves gathering data samples from various sources, such as databases, files, or APIs, and transforming them into a suitable representation for model training. This process may include tasks such as data cleaning, preprocessing, feature extraction, and formatting. Key steps involved in data collation include:
Data collection: Gathering the raw data from diverse sources, which could be in different formats, such as text, images, or numerical data.
Data cleaning: Removing any inconsistencies, errors, or missing values from the collected data. This step ensures the quality and reliability of the data.
Data preprocessing: Applying various transformations to the data, such as normalization, scaling, or one-hot encoding, to make it suitable for the machine learning model.
Feature extraction: Extracting relevant features from the data to capture important patterns or characteristics that are informative for the learning task.
Data formatting: Organizing the data into a structured format, such as matrices or tensors, that can be fed into the machine learning algorithm.
Splitting into training, validation, and test sets: Dividing the collated data into separate subsets for model training, model evaluation during development (validation), and final model evaluation (testing).
Data collation is a crucial step in the machine learning pipeline, as the quality and structure of the training data can significantly impact the performance and generalization of the learned models. It requires careful consideration of data sources, appropriate preprocessing techniques, and maintaining data integrity throughout the process. Since we take a ready-made dataset and assume that all sentences are well-formed w.r.t. the source or target language, we only have to consider the following 3 main steps to prepare our data batches:
Tokenization: convert raw strings into lists of tokens
Vectorization: transform lists of tokens into lists of their corresponding indices (given the built vocabularies)
Data formatting: add the special tokens <SOS> and <EOS> at the beginning and end of the token lists
The code cell below defines the method collate_fn() that performs these 3 steps for a given batch of sentence pairs. Since we have to perform these steps for both the source and target language using the respective tokenizers and vocabularies, we define an auxiliary method text_transform() that converts a string into token indices using the language-specific tokenizer and vocabulary. In the last step, we use the PyTorch method pad_sequence to pad all sequences within the same batch and the same language to the length of the longest sequence in the batch.
Padding is necessary because neural networks, especially those using batch processing and matrix operations, require all sequences in a batch to have the same shape in order to be processed efficiently in parallel. Without padding, each sentence would have a different length, forcing the model to process them one by one, which would be slow and prevent effective use of GPUs or TPUs. By padding shorter sequences with a special <PAD> token, we can form a uniform tensor that enables fast vectorized computation. At the same time, attention masks ensure the model ignores padded positions so they do not influence learning or predictions. The figure below shows the general training setup for a single pair of captions; we show the tokens instead of the token ids for a better understanding.
Note that the expected output of the decoder is the same as its input, only shifted by $1$ token to the left, which implements the autoregressive training objective of predicting the next word when generating the translation in the target language. We will implement this shifting of the target sequence as part of the training.
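Before looking at the collation code, here is a minimal sketch of this shift for a single target caption (a hypothetical example, shown with tokens instead of token indices):
# Minimal sketch of the autoregressive shift (tokens instead of indices, hypothetical caption)
tgt_tokens = ["<SOS>", "Two", "men", "are", "preparing", "food", ".", "<EOS>"]
decoder_input  = tgt_tokens[:-1]   # ["<SOS>", "Two", ..., "."]
decoder_output = tgt_tokens[1:]    # ["Two", ..., ".", "<EOS>"]
# At position i, the decoder sees decoder_input[:i+1] and must predict decoder_output[i]
for inp, out in zip(decoder_input, decoder_output):
    print(f"{inp:>12} -> {out}")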
def text_transform(s, lang):
# Tokenize input string
tokens = [ t.text for t in nlp[lang](s) ]
# Encode tokens to token indices using vocabulary
indices = vocabulary[lang].encode(tokens)
# Prepend SOS token and append EOS token
return torch.cat((torch.tensor([IDX_SOS]), torch.tensor(indices), torch.tensor([IDX_EOS])))
# Method to collate data samples into batch tensors
def collate_fn(batch):
pad_value = vocabulary[TGT_LANGUAGE].token2index[TOKEN_PAD]
src_batch, tgt_batch = [], []
for src_sample, tgt_sample in batch:
src_batch.append(text_transform(src_sample.rstrip("\n"), SRC_LANGUAGE))
tgt_batch.append(text_transform(tgt_sample.rstrip("\n"), TGT_LANGUAGE))
    # pad_sequence is a built-in method provided by the torch package
src_batch = pad_sequence(src_batch, padding_value=pad_value)
tgt_batch = pad_sequence(tgt_batch, padding_value=pad_value)
return src_batch, tgt_batch
To illustrate a bit better what's going on, we can define a simple example batch containing two pairs of captions and apply the collate_fn() method. If you check the output — here just for the source language, but the one for the target language is similar — both captions in the source language have now been converted to sequences of the corresponding token indices. Moreover, notice that the first caption is shorter than the second one and is therefore padded with 1 (the index of the padding token).
example_batch = [
["Das ist ein Auto.", "This is a car."],
["Das Kind spielt mit dem gelben Ball.", "The kid is playing with the yelly ball."]
]
src_batch, tgt_batch = collate_fn(example_batch)
print(f"Tensor for source language (shape: {src_batch.shape}):\n{src_batch}")
Tensor for source language (shape: torch.Size([10, 2])):
tensor([[ 2, 2],
[10354, 10354],
[18847, 4516],
[ 4458, 6417],
[ 7400, 13141],
[15806, 9726],
[ 3, 380],
[ 1, 5523],
[ 1, 15806],
[ 1, 3]])
Also, notice that the shape of the tensor is (10, 2), reflecting (seq_len, batch_size). Later we want to have this tensor in shape (batch_size, seq_len), which we can easily do by transposing the tensor during training (see below).
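As a quick check, transposing the example tensor indeed swaps the two dimensions:
# Transpose the collated tensor from (seq_len, batch_size) to (batch_size, seq_len)
print(src_batch.T.shape)   # torch.Size([2, 10])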
Create Dataset & DataLoader Instances¶
To simplify the training, we make use of standard classes provided by PyTorch. Firstly, The Dataset class is an abstract blueprint that defines how data should be accessed and returned during training. To use it, we create a custom subclass and implement two key methods: __len__(), which returns the size of the dataset, and __getitem__(idx), which returns a single data sample (e.g., a tokenized sentence pair, an image-label pair, etc.). This structure standardizes how models retrieve data, making it easy to integrate with PyTorch's DataLoader for batching, shuffling, and parallel processing.
The code cell below implements our custom subclass Multi30kDataset together with the two methods __len__() and __getitem__(). Since we do all the collation "on the fly" during training, this implementation is very simple: we only need to pass in all captions in the source and target language.
class Multi30kDataset(Dataset):
def __init__(self, sources, targets):
self.sources = sources
self.targets = targets
def __len__(self):
return len(self.sources)
def __getitem__(self, idx):
return self.sources[idx], self.targets[idx]
We can now create a dataset instance for the training and validation data; we skip the test data since we perform only a simple qualitative evaluation in this notebook. Again, appreciate that we directly pass all captions in the original string representation to the dataset instance.
dataset_train = Multi30kDataset(df_train[SRC_LANGUAGE].tolist(), df_train[TGT_LANGUAGE].tolist())
dataset_val = Multi30kDataset(df_val[SRC_LANGUAGE].tolist(), df_val[TGT_LANGUAGE].tolist())
Lastly, the DataLoader class is a utility that wraps a Dataset and provides an efficient way to iterate over the data in mini-batches, with support for shuffling and parallel loading. While the Dataset defines how to access a single sample, the DataLoader handles how samples are grouped, ordered, and delivered to the model during training or evaluation. It automates creating batches, applying optional collate functions — this is where our method collate_fn() comes into play! — and leveraging multiple CPU workers to load data concurrently.
As with the two dataset instances, we create two data loader instances, one for the training and one for the validation dataset. In both cases we use the same batch size; batching, together with the optional shuffling, is handled under the hood by the DataLoader class, so we do not have to worry about these implementation details.
BATCH_SIZE = 64
loader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
loader_val = DataLoader(dataset_val, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
During training and validation, we can use the data loader instances to iterate over all batches. The code cell below shows the basic usage; the cell only prints the shapes of the first batch for the source and target language as a quick example. If you run the code cell multiple times, notice how the output will generally be different. This is because the data loader instance shuffles the data samples into new batches, and therefore the length of the longest sequence in a batch is likely to be different every time.
for src, tgt in loader_train:
print(src.shape, tgt.shape)
break
torch.Size([29, 64]) torch.Size([37, 64])
Definition of Model Architecture¶
Machine translation involves converting a sentence in a source language into a semantically and grammatically correct sentence in a target language. This task naturally requires two distinct but interrelated capabilities: understanding the source text and generating the target text. Encoder-decoder architectures are designed to handle this separation effectively. The encoder reads the entire source sentence and transforms it into a rich, context-aware representation, capturing both local and long-range dependencies between words. The decoder then generates the target sentence step by step, using this representation to guide the translation while considering the words it has already produced.
This architecture is particularly advantageous for machine translation because it allows the model to dynamically align source and target tokens through mechanisms like attention or cross-attention in the Transformer architecture. It can handle sentences of varying lengths, capture complex syntactic structures, and produce fluent output in the target language. By explicitly separating the comprehension of the source from the generation of the target, encoder-decoder models provide a flexible and powerful framework that forms the backbone of most state-of-the-art neural machine translation systems, including Transformer-based models.
Let's build the complete model up step by step.
Positional Encoding¶
In the notebook covering the basic Transformer architecture, we also discussed the importance of a Positional Encoding layer to preserve the information about word order. The class PositionalEncoding in the code cell below implements the basic approach proposed in the original Transformer paper, using absolute positional encodings based on sinusoidal functions. Again, we cover different positional encoding methods in other notebooks in much more detail.
The implementation below also uses the TokenEmbedding layer. If you check below, this layer merely wraps an nn.Embedding layer we already used multiple times. The only difference/addition is that the resulting embeddings are scaled with respect to the embedding size. Scaling word embeddings with respect to the embedding size is a technique commonly used in natural language processing (NLP) tasks. The purpose of scaling word embeddings is to ensure that the embeddings have a consistent magnitude across different embedding dimensions. This scaling helps in normalizing the embeddings and can lead to improved model performance. Here's why scaling is beneficial:
Avoiding Bias Towards Certain Dimensions: Word embeddings are typically represented as vectors in a high-dimensional space, where each dimension captures a different aspect of the word's meaning. However, the magnitudes of these dimensions can vary widely. By scaling the embeddings, you ensure that no dimension dominates over others, thereby avoiding bias towards specific dimensions. This prevents the model from assigning disproportionate importance to certain aspects of word meanings.
Gradient Stability: Scaling word embeddings can help with gradient stability during training. When the magnitudes of different embedding dimensions differ significantly, it can lead to gradients with varying scales. Large gradients in some dimensions and small gradients in others can make the optimization process challenging and slow down training. Scaling the embeddings helps balance the gradient scales, making the optimization more stable and efficient.
Similarity Measures: In various NLP tasks, such as measuring semantic similarity or calculating distances between word embeddings, scaling helps ensure consistent and meaningful comparisons across dimensions. Without scaling, the magnitude differences can distort the similarity measures, leading to inaccurate results. Scaling ensures that the distance or similarity between word embeddings is based on meaningful comparisons across all dimensions.
Regularization: Scaling word embeddings can act as a form of regularization. By enforcing a consistent magnitude across embedding dimensions, it can prevent individual dimensions from having excessive influence on the overall embedding representation. This regularization can improve the generalization capability of the model, reducing overfitting and improving its ability to handle unseen data.
It's worth noting that scaling word embeddings with respect to the embedding size is just one scaling technique among several possibilities. Other scaling methods, such as unit norm scaling or custom scaling factors, may also be used depending on the specific requirements of the task or the characteristics of the embeddings. The goal is to ensure that the word embeddings are appropriately scaled to promote better training, more meaningful comparisons, and improved model performance.
class PositionalEncoding(nn.Module):
def __init__(self,
emb_size: int,
dropout: float,
maxlen: int = 5000):
super(PositionalEncoding, self).__init__()
den = torch.exp(- torch.arange(0, emb_size, 2)* np.log(10000) / emb_size)
pos = torch.arange(0, maxlen).reshape(maxlen, 1)
pos_embedding = torch.zeros((maxlen, emb_size))
pos_embedding[:, 0::2] = torch.sin(pos * den)
pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(0)  # shape (1, maxlen, emb_size) to broadcast over batch-first inputs
self.dropout = nn.Dropout(dropout)
self.register_buffer('pos_embedding', pos_embedding)
def forward(self, token_embedding: Tensor):
        # Inputs are batch-first (batch_size, seq_len, emb_size), so add the encodings along the sequence dimension
        return self.dropout(token_embedding + self.pos_embedding[:, :token_embedding.size(1), :])
class TokenEmbedding(nn.Module):
def __init__(self, vocab_size: int, emb_size):
super(TokenEmbedding, self).__init__()
self.embedding = nn.Embedding(vocab_size, emb_size)
self.emb_size = emb_size
def forward(self, tokens: Tensor):
return self.embedding(tokens.long()) * np.sqrt(self.emb_size)
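As a small sanity check (with made-up toy sizes), we can verify that combining both layers preserves the (batch_size, seq_len, emb_size) shape of the token embeddings:
# Toy shape check for TokenEmbedding + PositionalEncoding (hypothetical sizes)
toy_emb = TokenEmbedding(vocab_size=100, emb_size=16)
toy_pe = PositionalEncoding(emb_size=16, dropout=0.1)
toy_tokens = torch.randint(0, 100, (2, 7))   # (batch_size=2, seq_len=7)
print(toy_pe(toy_emb(toy_tokens)).shape)     # torch.Size([2, 7, 16])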
Masking¶
In the context of Transformers, masking refers to a technique used to control the flow of information during the self-attention mechanism within the model. Transformers rely on self-attention to capture the relationships between different words or tokens in a sequence. Masking is particularly important in tasks where the model processes sequences, such as machine translation, language modeling, or text classification.
Required Masks¶
There are two commonly used types of masks in Transformers: padding masks and lookahead masks.
Padding masks: When processing a batch of sequences, they may have different lengths. To handle variable-length sequences efficiently, padding is often used to make all sequences in a batch the same length by adding special tokens like <PAD>. Padding masks are used to mask out the padded positions during self-attention calculations. This ensures that the model does not attend to the padded positions, which do not contain meaningful information.
Lookahead masks: In language modeling or autoregressive tasks (incl. machine translation), where the model predicts the next token given the previous tokens, a lookahead mask is applied to ensure that each token can only attend to the previous tokens and not to the tokens that follow it. This prevents the model from cheating by looking ahead at tokens it should not have access to during training or generation.
On an implementation level, masks are matrices/tensors whose values indicate which tokens to attend to and which to ignore. These values are commonly Booleans (True or False) or numbers (0 or 1, sometimes also 0 or -inf). The choice of values depends on the exact implementation of the Transformer. For example, nn.Transformer supports 0/1 as well as True/False masks. Let's first look at an example for a padding mask using Boolean values. Suppose we have a batch of four sequences with varying lengths, padded with 0 to enforce the same length:
Sequence 1: [6, 4, 2, 1, 9]
Sequence 2: [5, 3, 4, 0, 0]
Sequence 3: [7, 8, 2, 3, 0]
Sequence 4: [1, 2, 0, 0, 0]
The corresponding padding mask will then look like:
[
[False, False, False, False, False],
[False, False, False, True, True],
[False, False, False, False, True],
[False, False, True, True, True]
]
In this example, the first sequence has a length of 5, so we have 5 False values in the first row. The second sequence has a length of 3, so we have 3 False values in the second row, and the last two positions are padded, represented by True; and so on for the other 2 sequences. This boolean padding mask can be used in the Transformer network to identify the padded positions and exclude them from attention calculations, ensuring that the model attends only to the relevant tokens in each sequence.
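Constructing such a padding mask is a simple comparison against the padding value; here is a minimal sketch for the toy batch above (assuming 0 is the padding value, as in this illustration):
# Build the boolean padding mask for the toy batch above (padding value 0 in this illustration)
toy_batch = torch.tensor([
    [6, 4, 2, 1, 9],
    [5, 3, 4, 0, 0],
    [7, 8, 2, 3, 0],
    [1, 2, 0, 0, 0],
])
print(toy_batch == 0)   # True marks padded positions that should be ignored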
Important: In the example above, this will be the padding mask regardless of whether the batch is the input of the encoder (i.e., source language) or of the decoder (i.e., target language).
Assuming that the batch is the input for the decoder and we are modeling an autoregressive task such as machine translation, we also need a look-ahead mask. For the batch above, the corresponding lookahead mask looks like:
[
[False, True, True, True, True],
[False, False, True, True, True],
[False, False, False, True, True],
[False, False, False, False, True],
[False, False, False, False, False]
]
In this example, the first row represents the first token, which can attend only to itself, so we have 4 True values after the first False. The second row represents the second token, which can attend to the first token and itself, so we have 3 True values after the first two False values. The third row represents the third token, which can attend to the first and second token and to itself, and so on. The last row represents the fifth token, which can attend to all previous tokens and itself, so it contains only False values. This boolean lookahead mask ensures that each token can only attend to the previous tokens in the sequence and not to the tokens that follow it. It helps maintain the causality constraint in autoregressive tasks, such as language modeling or text generation, where the model predicts the next token based on the previous tokens.
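Such a lookahead (causal) mask can be built with a single call to torch.triu; the sketch below reproduces the $5 \times 5$ mask shown above. The auxiliary method generate_square_subsequent_mask() defined in the next code cell constructs an equivalent boolean mask, just via a slightly different route.
# Build a boolean lookahead mask for sequences of length 5 (True = do not attend)
seq_len = 5
print(torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1))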
Auxiliary Methods¶
In the code cell below, the method create_mask() — utilizing the additional method generate_square_subsequent_mask() — generates 4 masks required for the nn.Transformer.
def generate_square_subsequent_mask(sz):
mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
return mask.bool()
def create_mask(src, tgt):
src_seq_len = src.shape[1]
tgt_seq_len = tgt.shape[1]
tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)
src_padding_mask = (src == IDX_PAD)
tgt_padding_mask = (tgt == IDX_PAD)
return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask
Masking: Example¶
To illustrate the purpose of the methods create_mask() and generate_square_subsequent_mask(), let's go through an example. For this, we assume that the list of 4 example sequences batch_src represent the input of the encoder (i.e., source language), and batch_tgt represents the input of the decoder (i.e., target language). Note that the sequence length of batch_src can differ from the sequence length of batch_tgt.
Important: The implementation of the methods create_mask() and generate_square_subsequent_mask() assumes that the input tensors have a shape of (sequence_length, batch_size). We therefore need to transpose our 2 tensors using .T.
batch_src = torch.LongTensor([
[6, 4, 2, 1, 9],
[5, 3, 4, 0, 0],
[7, 8, 2, 3, 0],
[1, 2, 0, 0, 0]
]).T # <-- Transpose!
batch_tgt = torch.LongTensor([
[3, 6, 4, 5, 0, 0],
[2, 3, 1, 4, 7, 6],
[5, 4, 1, 2, 5, 3],
[6, 7, 4, 5, 1, 0]
]).T # <-- Transpose!
src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(batch_src, batch_tgt)
Let's first look at src_mask.
print(src_mask)
tensor([[False, False, False, False],
[False, False, False, False],
[False, False, False, False],
[False, False, False, False]], device='cuda:0')
As you can see, src_mask contains only False values. In some sense, this mask is not needed here, as it represents a lookahead mask with no restrictions. However, there are use cases beyond machine translation where this mask may contain True values. For consistency, nn.Transformer expects this mask, so we simply pass one filled with False.
The padding matrix is a bit more interesting...
print(src_padding_mask)
tensor([[False, False, False, True],
[False, False, False, False],
[False, False, False, False],
[ True, False, False, False],
[False, False, False, False]])
As already discussed above, the True entries mark the positions that the encoder of nn.Transformer should not attend to. Note that create_mask() flags every position whose token index equals IDX_PAD (which is 1 in our vocabularies); in this toy batch, the entries that happen to contain the value 1 are therefore marked True, while with real collated batches the flagged positions are exactly the padded positions at the end of each sequence.
For the decoder, tgt_mask represents the lookahead mask as shown above.
print(tgt_mask)
tensor([[False, True, True, True],
[False, False, True, True],
[False, False, False, True],
[False, False, False, False]], device='cuda:0')
Again, the purpose is to tell the decoder that (a) the first token can only attend to itself, (b) the second token can only attend to the first token and to itself, (c) the third token can only attend to the first/second token and to itself, (d) the fourth token can only attend to the first/second/third token and to itself, (e) ...and so on.
Of course, if the sequences for the decoder include padding tokens, we also need the padding matrix.
print(tgt_padding_mask)
tensor([[False, False, False, False],
[False, False, False, False],
[False, True, True, False],
[False, False, False, False],
[False, False, False, True],
[False, False, False, False]])
The interpretation of tgt_padding_mask is the same as of src_padding_mask.
Side note: The padding mask might also have True values within a sequence (not only at the end) to indicate which tokens not to attend to. While this is not the case here for the task of machine translation, masking selected tokens in a sequence is a common technique for Transformer-based language models such as BERT.
Encoder-Decoder Architecture¶
We implemented the Transformer architecture from scratch in another notebook. However, for optimal performance, we will use an available Transformer implementation as the core component of our model. In PyTorch, the nn.Transformer module is an implementation of the Transformer model. The module provides a high-level interface for creating and training Transformer models in PyTorch. It encapsulates the core components of the Transformer architecture, including the encoder, decoder, and attention mechanisms. Here are the key components of the nn.Transformer module:
Encoder and Decoder: The Transformer model consists of an encoder and a decoder. The encoder takes an input sequence and processes it to capture the contextual representations of the input tokens. The decoder takes the encoder's outputs and generates the output sequence step by step.
Attention Mechanism: The Transformer's attention mechanism is a fundamental component that allows the model to capture dependencies between tokens efficiently. The nn.Transformer module incorporates multi-head self-attention, enabling the model to attend to different parts of the input sequence simultaneously.
Feed-Forward Networks: The Transformer model includes feed-forward neural networks within the encoder and decoder. These networks provide non-linear transformations to the attention outputs, allowing the model to capture complex patterns and relationships between tokens.
Masking: The nn.Transformer module supports masking capabilities, particularly for the decoder. Masking is crucial to prevent the model from attending to future tokens during training, ensuring the model's autoregressive property.
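Before wiring everything together, the following standalone sketch (with made-up toy dimensions, unrelated to our vocabularies) shows the basic contract of nn.Transformer with batch_first=True: given source and target embeddings, it returns decoder outputs with the same shape as the target input.
# Standalone sketch of nn.Transformer with batch_first=True (toy dimensions)
toy_transformer = nn.Transformer(d_model=16, nhead=4,
                                 num_encoder_layers=1, num_decoder_layers=1,
                                 dim_feedforward=32, batch_first=True)
toy_src = torch.randn(2, 5, 16)   # (batch_size, src_seq_len, d_model)
toy_tgt = torch.randn(2, 7, 16)   # (batch_size, tgt_seq_len, d_model)
print(toy_transformer(toy_src, toy_tgt).shape)   # torch.Size([2, 7, 16])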
The code cell below implements the class Seq2SeqTransformer as our final model. Apart from the nn.Transformer layer, it naturally also includes the PositionalEncoding layer and TokenEmbedding layer from above.
class Seq2SeqTransformer(nn.Module):
def __init__(self,
num_encoder_layers: int,
num_decoder_layers: int,
emb_size: int,
nhead: int,
src_vocab_size: int,
tgt_vocab_size: int,
dim_feedforward: int = 512,
dropout: float = 0.1):
super(Seq2SeqTransformer, self).__init__()
self.transformer = nn.Transformer(d_model=emb_size,
nhead=nhead,
num_encoder_layers=num_encoder_layers,
num_decoder_layers=num_decoder_layers,
dim_feedforward=dim_feedforward,
dropout=dropout,
batch_first=True)
self.generator = nn.Linear(emb_size, tgt_vocab_size)
self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)
def forward(self,
src: Tensor,
trg: Tensor,
src_mask: Tensor,
tgt_mask: Tensor,
src_padding_mask: Tensor,
tgt_padding_mask: Tensor,
memory_key_padding_mask: Tensor):
src_emb = self.positional_encoding(self.src_tok_emb(src))
tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None, src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
return self.generator(outs)
def encode(self, src: Tensor, src_mask: Tensor):
return self.transformer.encoder(self.positional_encoding(self.src_tok_emb(src)), src_mask)
def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
return self.transformer.decoder(self.positional_encoding(self.tgt_tok_emb(tgt)), memory, tgt_mask)
Now that we have both prepared the training and validation data and defined our Transformer-based encoder-decoder model, we can actually start the training.
Model Training & Validation¶
The training and validation of the model involve all the standard steps required to train any kind of model with PyTorch.
Auxiliary Methods¶
For the training and validation of the model, we first define two auxiliary methods. First, the train_epoch() method trains the model for a single epoch by iterating over all batches provided by the given data loader instance. Each pair of source batch (the batch containing the sequences of the source language) and target batch (the batch containing the sequences of the target language) is first transposed. This is done to change their shapes from (seq_len, batch_size) to (batch_size, seq_len), the shape the model expects as input.
To get the correct training setup, the expected output of the decoder needs to be shifted $1$ token to the left. This shifting is done with the line tgt_output = tgt[:, 1:] by removing the <SOS> token index from the start of all sequences in the target batch. Since the input and output of the decoder have to have the same sequence lengths, the line tgt_input = tgt[:, :-1] removes the last item in all sequences, which is either the index of the <EOS> token or of the <PAD> token. With the final source and target batch, we can use the method create_mask() to create all masks required as input for the Transformer model.
The remainder of the loop contains the "normal" training steps: (a) passing all inputs (i.e., the batches and masks) to the model, (b) computing the loss based on the model output and the ground truth, and (c) performing backpropagation to compute the gradients and update the model parameters accordingly.
def train_epoch(loader, model, optimizer, criterion, description):
# Set model in the training mode
model.train()
epoch_loss = 0
# Iterate over all batches in data loader
for idx, (src, tgt) in enumerate(tqdm(loader, desc=description, leave=False)):
src = src.T.to(DEVICE)
tgt = tgt.T.to(DEVICE)
        # Shift target sequences one token to the left
tgt_output = tgt[:, 1:]
tgt_input = tgt[:, :-1]
        # Create all masks required for the nn.Transformer class
src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)
# Pass batch and all masks to model
logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)
# PyTorch magic (backpropagation and parameter updates)
optimizer.zero_grad()
loss = criterion(logits.reshape(-1, logits.shape[-1]), tgt_output.reshape(-1))
loss.backward()
optimizer.step()
# Keep track of total epoch loss
epoch_loss += loss.item()
    # Return the average loss per batch for monitoring
    return epoch_loss / len(loader)
The evaluate() method in the code cell below is almost identical to the train_epoch() method, with the main difference that only the loss is computed and no backpropagation or parameter updates are performed. We will use this method to compute the validation loss after each training epoch. Notice that, in both methods, we normalize the loss by the number of batches in the data loader. This makes the losses comparable independent of the number of training samples in different data loaders.
def evaluate(loader, model, description):
# Set model in the evaluation mode
model.eval()
epoch_loss = 0
# Iterate over all batches in data loader
for idx, (src, tgt) in enumerate(tqdm(loader, desc=description, leave=False)):
src = src.T.to(DEVICE)
tgt = tgt.T.to(DEVICE)
        # Shift target sequences one token to the left
tgt_output = tgt[:, 1:]
tgt_input = tgt[:, :-1]
        # Create all masks required for the nn.Transformer class
src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)
# Pass batch and all masks to model
logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)
# Compute loss
loss = criterion(logits.reshape(-1, logits.shape[-1]), tgt_output.reshape(-1))
# Keep track of total epoch loss
epoch_loss += loss.item()
    # Return the average loss per batch for monitoring
    return epoch_loss / len(loader)
Create Model, Criterion & Optimizer¶
Let's now define the parameters of our model and instantiate it. Below, we also define our loss function, which is the cross-entropy loss, and the optimizer used for training. Of course, we do not want to compute the loss w.r.t. any padding tokens — for example, see the last 4 <PAD> tokens of the decoder output in the previous figure above. While we could manually consider only the non-padding tokens of the decoder output, PyTorch allows us to specify which token index to ignore when defining the loss function. This means we can tell the loss function to ignore the losses for each padding token using ignore_index=IDX_PAD.
torch.manual_seed(0)
SRC_VOCAB_SIZE = len(vocabulary[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocabulary[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 4*EMB_SIZE
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
# Create model
model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE, NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
# Initialize weights
for p in model.parameters():
if p.dim() > 1:
nn.init.xavier_uniform_(p)
# Move model to device (ideally GPU, otherwise CPU)
model = model.to(DEVICE)
# Define loss function
criterion = torch.nn.CrossEntropyLoss(ignore_index=IDX_PAD)
# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)
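To see the effect of ignore_index, the following small sketch (with random logits and hypothetical target indices) computes the loss for four target positions of which the last two are padding; only the non-padding positions contribute to the average.
# Illustration of ignore_index: padded target positions are excluded from the loss
toy_logits = torch.randn(4, TGT_VOCAB_SIZE)            # logits for 4 target positions
toy_targets = torch.tensor([5, 7, IDX_PAD, IDX_PAD])   # last two positions are padding
print(criterion(toy_logits, toy_targets))              # averaged over the 2 real positions only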
Perform Training¶
In the code cell below, we perform the actual training by calling the method train_epoch() num_epochs times. After each epoch, we also call the evaluate() method to evaluate the current model using the validation data. This gives us 2 losses after each epoch, the training loss and the validation loss, which we both store in the list losses for a later visualization. If you want to train the model further after the code cell has completed, you can simply run the same code cell again — with or without changing the value of num_epochs. However, 20 epochs should be enough for the validation loss to slowly start going up again, indicating overfitting.
num_epochs = 20
losses = []
for epoch in range(num_epochs):
description = f"Epoch {epoch+1}/{num_epochs}"
train_loss = train_epoch(loader_train, model, optimizer, criterion, description)
val_loss = evaluate(loader_val, model, description)
losses.append((train_loss, val_loss))
Since we keep track of both the training and validation losses for each epoch, we can also plot the results using the auxiliary method plot_training_results() we provide.
plot_training_results(losses, legend=['Loss (train)', 'Loss (val)'])
Model Inference¶
Auxiliary Methods¶
The code cell below implements two auxiliary methods for using our trained model for inference. The greedy_decode() function implements a simple greedy search strategy for generating a translation with a Transformer model. It begins by encoding the source sentence using the model's encoder, producing a hidden representation called memory. It then initializes the target sequence with a single token: the start-of-sentence symbol. At each decoding step, the method constructs a causal mask so the decoder can only attend to previously generated tokens, passes the current target sequence and the encoder's memory into the decoder, and computes a probability distribution over the next possible output token. Using greedy search, it selects the token with the highest probability, appends it to the sequence, and continues until either the maximum allowed length is reached or the model predicts an end-of-sentence token.
Secondly, the translate() function provides a user-facing interface that takes an input sentence in the source language and returns its translation in the target language. It first switches the model to evaluation mode, then transforms the input text into a tensor of token indices. It also builds the necessary source mask. The function then calls greedy_decode() to generate the output token sequence, using the start-of-sentence token as the initial decoder input. Finally, it maps the predicted token ids back into readable words using the target vocabulary and removes special tokens such as <SOS> and <EOS> to produce the final translated sentence. Together, these two methods implement the full inference pipeline for translation: preprocessing the input sentence, running it through the encoder–decoder architecture, generating the translated output token by token, and converting the result back into human-readable text.
# Method to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
# Move everything to the correct device
src = src.to(DEVICE)
src_mask = src_mask.to(DEVICE)
# Pass source batch to the encoder (memory = encoder output)
memory = model.encode(src, src_mask)
# Initialize the output sequence with the SOS token index
ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
for i in range(max_len-1):
memory = memory.to(DEVICE)
# Generate causal mask for decoder
tgt_mask = (generate_square_subsequent_mask(ys.size(1)).type(torch.bool)).to(DEVICE)
# Pass current target sequence and memory to the decoder
out = model.decode(ys, memory, tgt_mask)
# Get the output for the last item in sequence (= next predicted word)
prob = model.generator(out[:, -1])
# Find the next most likely word based on highest probability
_, next_word = torch.max(prob, dim=1)
next_word = next_word.item()
# Add next word to current target sequence
ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
# If the predicted word is <EOS>, we can exit the loop
if next_word == IDX_EOS:
break
return ys
# Actual method to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
model.eval()
# Convert the source text into the corresponding sequence of token indices
src = text_transform(src_sentence.strip(), SRC_LANGUAGE).view(1, -1)
num_tokens = src.shape[1]
# Compute the required mask for the source sequence
src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
# Generate the target sequence using greedy decoding
tgt_tokens = greedy_decode(model, src, src_mask, max_len=num_tokens+5, start_symbol=IDX_SOS).flatten()
# Replace the token ids with the actual tokens/words and remove all special tokens
return " ".join(vocabulary[TGT_LANGUAGE].decode(list(tgt_tokens.cpu().numpy()))).replace("<SOS>", "").replace("<EOS>", "")
Basic Tests¶
In principle, we could test the final model by evaluating it over the test data, using the evaluate() method as for the validation data. However, this would just give us a single number (i.e., the loss) without any intuition of how well the model is performing. We therefore look at $5$ randomly picked German captions from the test data and the translations produced by the model. For each example caption we also add the correct translation taken from the test dataset. Keep in mind that, since we shuffle the batches, the training is non-deterministic, so the outputs may differ between training sessions.
print(translate(model, "Eine Gruppe von Menschen steht vor einem Iglu ."))
A group of people standing in front of an igloo .
Correct translation: "A group of people standing in front of an igloo ."
print(translate(model, "Ein Koch posiert beim Kochen für die Kamera ."))
A chef is posing for the camera while cooking .
Correct translation: "A cook is posing for a camera while cooking ."
print(translate(model, "Ein Hund springt im Freien über ein Hindernis ."))
A dog jumps over an outdoor obstacle .
"A dog jumps over an obstacle outside ."
print(translate(model, "Zwei Männer unterhalten sich auf dem Gehsteig während ein Auto vorbeifährt ."))
Two men are talking on the sidewalk while a car passes by .
Correct translation: "Two men stop to chat on the sidewalk as a car passes by ."
print(translate(model, "Ein kleiner Junge steht neben einer Pyramide aus Sand ."))
A young boy stands next to a pyramid .
Correct translation: "A young boy is standing next to a sand sculpture of a pyramid"
Chances are that the translations produced by the model are not perfectly correct, although they should be overall pretty decent in most cases. We might be able to get much better results by tuning different hyperparameters, including the model size (e.g., number of attention heads, number of encoder/decoder layers), the batch size, the learning rate, and other parameters. However, our goal is not to train the best possible model but to cover and explain all core steps to train a Transformer-based machine translation model. Of course, you are very welcome to change different hyperparameters in the notebook and re-train the model to see the effects on the results.
Summary¶
In this notebook, we built a complete machine translation model based on the Transformer architecture using only PyTorch and fundamental Python tools. Although our experiments focused specifically on translating short image captions, the methods demonstrated here generalize naturally to a wide range of sequence-to-sequence tasks. Any task that maps an input sequence to an output sequence (e.g., summarization, question answering, dialogue generation, code generation, and more) can leverage the same encoder and decoder structure. With only minor modifications to preprocessing and dataset design, the architecture and training process explored in this notebook can be adapted to many different applications.
It is important to recognize, however, that real-world machine translation systems rely on massive, highly diverse datasets containing millions of sentence pairs. Training such models introduces additional challenges: efficient data loading and preprocessing, distributed training across multiple GPUs, careful regularization, and extensive hyperparameter tuning. Practical systems also incorporate techniques like beam search, subword tokenization (e.g., BPE or SentencePiece), sophisticated scheduling strategies, and mixed-precision training for speed and memory efficiency.
In contrast, the goal of this notebook was not to replicate production-level performance, but to maximize clarity and learning. By working with a small dataset and simplified pipeline, we were able to trace every detail of the model's forward pass, observe how gradients flow during training, and understand how the encoder and decoder collaborate to generate meaningful translations. These foundational insights provide a strong basis for exploring more advanced architectures, scaling techniques, and real-world applications in the future.