Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.
Building a GPT-Style LLM from Scratch¶
This notebook demonstrates how to train a GPT-style language model from scratch using the Transformer implementation provided by PyTorch. The model is trained on a toy dataset of 100,000 movie reviews, which makes it computationally manageable while still providing meaningful insights into how Transformer-based language models learn. The primary goal of this notebook is educational: to help you understand the core components and workflow of training a language model, rather than to build a high-performance, state-of-the-art LLM.
The notebook begins by preparing the movie review dataset for a next-word prediction task, which is the standard training objective used in autoregressive models like GPT. This process includes tokenizing the text, constructing a vocabulary, and splitting each review into overlapping input-target pairs so the model can learn to predict the next word in a sequence. Although the dataset is small, it allows the training process to be visualized and understood step by step, without the need for massive compute resources.
Next, the notebook defines a decoder-only Transformer model by leveraging PyTorch's built-in nn.Transformer module. This section illustrates how to configure the Transformer to function in a GPT-like manner, including the use of causal masking, sinusoidal positional encodings, and a simple output projection layer for next-token prediction. Using the native PyTorch API simplifies the implementation, allowing the focus to remain on understanding how the model architecture, positional encodings, and attention mechanism work together.
Finally, the notebook walks through the training loop, showing how to compute the cross-entropy loss, perform optimization steps, and periodically evaluate the model's ability to generate coherent text continuations. While the resulting model is not designed for real-world deployment, it demonstrates the fundamental process behind GPT-style LLM training in a transparent and reproducible way. Overall, this notebook serves as a hands-on educational guide to gain practical experience with Transformer-based language models. By using PyTorch's Transformer implementation, it balances conceptual clarity with practical simplicity, offering an accessible yet technically grounded introduction to autoregressive text modeling.
Setting up the Notebook¶
Make Required Imports¶
This notebook requires the import of different Python packages as well as additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been successfully imported.
import sys
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from src.utils.compute.gpu import *
from src.utils.data.files import *
Download Required Data¶
Some code examples in this notebook use data that first needs to be downloaded by running the code cell below. If this code cell throws any error, please check in the configuration file config.yaml whether the URL for downloading datasets is up to date and matches the one on Github. If not, simply download or pull the latest version from Github.
movie_reviews_zip, target_folder = download_dataset("text/corpora/reviews/movie-reviews-imdb.zip")
File 'data/datasets/text/corpora/reviews/movie-reviews-imdb.zip' already exists (use 'overwrite=True' to overwrite it).
We also need to decompress the archive file.
movie_reviews = decompress_file(movie_reviews_zip, target_path=target_folder)
print(movie_reviews)
['data/datasets/text/corpora/reviews/movie-reviews-imdb.txt']
Checking & Setting Computing Device¶
PyTorch allows training neural networks on a supported GPU to significantly speed up the training process. If you have a supported GPU, feel free to utilize it. However, for this notebook it's certainly not needed, as our dataset is small and our network model is very simple. We provide an auxiliary method to automatically select the best device. It checks if a supported GPU is available and, if so, uses it as the preferred device.
# Select preferred device (GPU, if available; CPU otherwise); you can enforce the use of the CPU
device = select_device(force_cpu=False)
print("Available device: {}".format(device))
Available device: cuda:0
Preliminaries¶
Before checking out this notebook, please consider the following:
This notebook is for education and not for building a state-of-the-art LLM. Not only is the dataset very small, it also stems from a single domain: movie reviews. We also use a small GPT-2-style model to keep things simple and reasonably fast. In any case, do not expect human-like responses from the trained model, particularly in demo mode (see below).
We assume that you have a good understanding of the Transformer architecture, including the attention mechanisms, causal masking, and positional encodings. If not, we have various other notebooks that cover these topics in great detail.
While not strictly required, we recommend using a GPU to speed up the training. Any modern consumer GPU supported by the PyTorch library should be fine. Even for the full training mode, the default parameters are chosen such that the training will not require more than 10 GB of VRAM, with 16 GB slowly becoming the standard even for consumer GPUs.
You can run the notebook in demo mode by setting mode = "demo" in the code cell below. In demo mode, we only use 10,000 of the 100,000 movie reviews as well as a smaller model configuration. We recommend first running the notebook in demo mode to see how long training the model for a single epoch takes.
mode = "demo"
#mode = "full"
Dataset Preparation¶
The ACL IMDB (Large Movie Review) dataset is a popular benchmark dataset for sentiment analysis, introduced by Andrew Maas et al. (2011). It contains a total of 100,000 movie reviews collected from IMDb, with 50,000 reviews being labeled as either positive or negative and evenly split into 25,000 reviews for training and 25,000 for testing. For training our language model, we do not need the sentiment labels. We therefore already preprocessed the original dataset such that all 100,000 reviews are in a single file, with 1 line = 1 review. This preprocessing included the removal of any HTML tags and line breaks.
For the training of our model, we make the common assumption that the training data is structured as continuous streams of documents, with each movie review representing a document. Document streams refer to the way text data is fed into the model during training: not as isolated documents, but as a continuous stream of text. Instead of treating each document as an independent unit, the training corpus is viewed as a long sequence of tokens coming from many documents concatenated together. This approach allows the model to learn language patterns and long-range dependencies across boundaries that would otherwise be artificially imposed by document splits. The training process still ensures that context resets appropriately between documents (e.g., by inserting special end-of-document tokens [EOS]), but from the model's perspective, the data flows in a seamless stream. Thus, assuming $\text{doc}_{i}$ is the list of tokens representing the $i$-th document, our document stream has the following format:

$$\text{stream} = \text{doc}_{1} \;\; [\text{EOS}] \;\; \text{doc}_{2} \;\; [\text{EOS}] \;\; \dots \;\; \text{doc}_{N} \;\; [\text{EOS}]$$
In practice, our training data may consist of many such document streams, each sized so that it fits into memory. However, here we work with a small example dataset for educational purposes. This means that the stream of tokens can be represented by an array that fits completely into memory, avoiding more sophisticated strategies that require splitting the dataset, etc.
Document streaming is essential for efficient batching and scaling when training on massive corpora. LLMs are typically trained on trillions of tokens spread across billions of documents, and it’s computationally impractical to load entire documents or reinitialize contexts for each. Instead, data pipelines tokenize all documents in advance, concatenate them, and then dynamically sample fixed-length chunks (e.g., 1024 tokens) to feed into the model. This "streaming" setup keeps GPUs fully utilized and allows distributed training systems to continuously stream data without needing to restart or reshuffle individual documents frequently.
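To make the notion of a document stream concrete, the following minimal sketch builds such a stream from two made-up word-level "documents" (the actual notebook uses BPE token IDs instead of words, so this is purely illustrative):

# Two hypothetical documents, each a list of word-level tokens (for illustration only)
docs = [["great", "movie"], ["boring", "plot", "overall"]]
# Concatenate all documents into a single stream, appending an [EOS] separator after each one
stream = [token for doc in docs for token in doc + ["[EOS]"]]
print(stream)  # ['great', 'movie', '[EOS]', 'boring', 'plot', 'overall', '[EOS]']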
Load Reviews from File¶
In the setup section of the notebook, we already downloaded the file containing all 100,000 movie reviews. The following code cell simply counts the number of reviews by counting the number of lines in the file, just to check that the dataset is complete. Note that we have to write movie_reviews[0] since movie_reviews is a list of files — it just so happens that the list contains only one file.
total_reviews = sum(1 for _ in open(movie_reviews[0]))
print(f"Total number of reviews (1 review per line): {total_reviews}")
Total number of reviews (1 review per line): 100000
Although we have a total of 100,000 reviews (each containing multiple sentences), we consider only 10,000 reviews in demo mode to speed up the training. However, you can edit the code cell below to increase or decrease the number of considered reviews. For a first run, we recommend sticking to 10,000 reviews to execute and understand the code.
if mode == "demo":
num_considered_reviews = 10_000
else:
num_considered_reviews = 100_000
num_reviews = min(total_reviews, num_considered_reviews)
print(f"Number of reviews used for training dataset: {num_reviews}")
Number of reviews used for training dataset: 10000
Tokenize & Generate Token Stream¶
Each document (i.e., each movie review) is represented as a string. We therefore have to tokenize each review and represent each token by a unique index. In principle, we could implement both steps on our own (e.g., tokenizing using spaCy and creating a vocabulary to assign the token indices). However, to keep the focus on the LLM training, we use the pretrained GPT-2 tokenizer to handle both the tokenization and indexing for us. The GPT-2 tokenizer is a subword tokenizer based on the Byte-Pair Encoding (BPE) algorithm.
In a nutshell, BPE (and similar approaches) learns how to tokenize words from data. Instead of splitting text strictly into full words or individual characters, BPE breaks words into frequently occurring subword units. For example, "unhappiness" might become "un", "happi", and "ness". This enables the model to represent common words as single tokens while decomposing rare words into smaller, reusable pieces, preventing the out-of-vocabulary (OOV) problem that word-level tokenizers suffer from. Because subword tokens capture meaningful morphological patterns (prefixes, suffixes, roots), models can infer the meaning of new or compound words based on familiar subparts. This reduces the total number of parameters needed for embeddings and improves the model's ability to generalize to unseen text. You can learn all about BPE in a separate notebook.
To load the pretrained GPT-2 tokenizer, we can use the AutoTokenizer class from the Hugging Face Transformers library. This class is a high-level interface that automatically selects the appropriate tokenizer for a given pretrained model, abstracting away the need to know the specific tokenizer class. The from_pretrained method is used to load a tokenizer that has already been trained on a specific model's vocabulary and tokenization rules, either from the Hugging Face model hub or a local path. This method ensures that the tokenizer is configured consistently with the pretrained model, including token-to-ID mappings, special tokens, and any subword tokenization schemes (e.g., BPE), making it ready for encoding text into token IDs suitable for model training or inference. The code cell below uses this class and method to load the GPT-2 tokenizer.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
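As a quick sanity check, the code cell below tokenizes a short example phrase. The exact subword splits and token IDs depend on GPT-2's learned BPE merges, so treat the printed output as illustrative:

example = "unhappiness about the cinematography"
print(tokenizer.tokenize(example))   # subword tokens produced by the BPE merges
print(tokenizer.encode(example))     # corresponding indices in the GPT-2 vocabulary
print(f"Vocabulary size: {tokenizer.vocab_size}")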
Recall that for our document stream, we need to indicate when one movie review ends and another review starts using some [EOS] (end-of-sequence) token. However, we cannot simply define our own unique token but must use a token that is known to the tokenizer, i.e., the token is part of the existing vocabulary. Most tokenizers include a small set of special tokens to indicate the end of a sequence, the beginning of a sequence, padding tokens, masked tokens, etc. — all depending on the data and learning task.
We can check the special_tokens_map of the GPT-2 tokenizer to see which special tokens it supports:
tokenizer.special_tokens_map
{'bos_token': '<|endoftext|>',
'eos_token': '<|endoftext|>',
'unk_token': '<|endoftext|>'}
We can see that the GPT-2 tokenizer recognizes only one special token: <|endoftext|>. Since GPT-2 was also trained on a document stream, we only need a single token acting as a separator between documents, which could be either the [EOS] or [BOS] token. The GPT-2 tokenizer also does not require a dedicated [UNK] (unknown) token, since BPE just tokenizes unknown words into known subwords or even just characters, if needed.
Let's define this <|endoftext|> token as a constant for creating our document stream.
EOS_TOKEN_GPT2 = "<|endoftext|>"
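We can quickly verify that this string is indeed a single known token by looking up its index in the vocabulary; it should match the tokenizer's eos_token_id attribute:

print(tokenizer.convert_tokens_to_ids(EOS_TOKEN_GPT2))
print(tokenizer.eos_token_id)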
With the tokenizer, we can now go through all movie reviews (or the maximum number of reviews specified) and tokenize them; see the code cell below. Notice that we preprocess each review before tokenizing by removing any newline characters, converting all words to lowercase, and adding the special [EOS] token at the end.
Lowercasing all words when training a language model on a small dataset helps reduce the vocabulary size and simplify the learning problem, which is especially important when data is limited. In small datasets, treating "Apple" and "apple" as different tokens would split their occurrences across two separate representations, weakening the model's ability to learn consistent word meanings. By lowercasing, all variants are merged into a single token, allowing the model to see more examples of each word and thus learn better statistical patterns. This normalization step reduces data sparsity, speeds up convergence, and makes the model's limited capacity and training data more effectively focused on learning core linguistic relationships rather than case variations. And again, our goal in this notebook is very far from building a state-of-the-art LLM.
tokens = []
with open(movie_reviews[0]) as file:
for idx, review in enumerate(tqdm(file, total=num_reviews, leave=False)):
if idx >= num_reviews:
break
tokens.extend(tokenizer.encode(f"{review.strip().lower()} {EOS_TOKEN_GPT2}", truncation=True, max_length=sys.maxsize))
print(f"Total number of tokens: {len(tokens)}")
Total number of tokens: 2922984
Create Dataset and DataLoader¶
In PyTorch, the Dataset class is an abstraction that defines how data is accessed and preprocessed for training. It provides a consistent interface to load individual samples and their labels through the methods __len__() and __getitem__(). This allows you to wrap any type of data (text, images, tabular data, etc.) into a standardized format that PyTorch models can easily consume. The DataLoader class then builds on top of this by handling the efficient batching, shuffling, and parallel loading of data samples from a Dataset. It automatically groups multiple samples into mini-batches and can use multiple worker processes to load data in parallel, ensuring that the GPU remains fully utilized during training.
For creating our Dataset instance, recall that GPT-style LLMs are trained on the next-word prediction task: given a sequence of words, which word is most likely to follow next? The figure below shows the required training setup for the Transformer decoder. The target sequence is (almost) the same as the input sequence, only shifted by 1 token to the left. Note that the dashed line represents the causal masking where the prediction of a token only depends on preceding tokens but not "future" tokens — recall that the decoder processes all tokens in parallel during training, so we need to mask the attention between a token and all tokens following it.
Of course, we cannot give the whole sequence of tokens to the model at once. When training LLMs, the context size, i.e., the number of tokens the model can attend to at once, is typically fixed to a maximum value due to both computational and architectural constraints. Transformer models compute attention across all token pairs within a sequence, which scales quadratically with sequence length in both memory and computation cost. This means that doubling the context size roughly quadruples the resources required per training step. To make training feasible on available hardware, a practical upper bound (e.g., 512, 1024, or 4096 tokens) is chosen so that the model can learn meaningful dependencies without exhausting GPU memory or dramatically slowing training.
We therefore have to feed the model all tokens in chunks. In this notebook, we use a common sliding window approach that forms a chunk of a fixed size. More specifically, we use a sliding window with a 50% overlap — see the example in the figure below. In this simple example, the context size is 6 tokens, meaning that an overlap of 50% means that the last 3 tokens of the current chunk will be the first 3 tokens of the next chunk.
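Before looking at the actual Dataset class, here is a minimal sketch of this chunking with made-up token IDs, using a context size of 6 and a stride of 3 (i.e., 50% overlap), including the one-token shift between input and target sequences:

# Hypothetical token IDs, purely for illustration
toy_tokens = list(range(20))
max_len, stride = 6, 3   # context size 6, stride 3 => 50% overlap
for i in range(0, len(toy_tokens) - max_len, stride):
    inputs = toy_tokens[i:(i + max_len)]             # current chunk
    targets = toy_tokens[(i + 1):(i + max_len + 1)]  # same chunk shifted by one token
    print(inputs, "->", targets)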
Both the sliding window approach and the generation of the target sequences by shifting the input sequences 1 token to the left are very straightforward to implement. The class CausalLMDataset in the code cell below implements both steps as a custom Dataset class. The max_len parameter specifies the maximum context size, and the optional stride parameter specifies by how many tokens the sliding window is moved each time. If stride=None, the window is moved by the whole context size, resulting in chunks without overlap.
class CausalLMDataset(Dataset):
def __init__(self, tokens, max_len=128, stride=None):
self.input_ids = []
self.target_ids = []
if stride is None:
stride = max_len
for i in range(0, len(tokens)-max_len, stride):
self.input_ids.append(torch.LongTensor(tokens[i:(i+max_len)]))
self.target_ids.append(torch.LongTensor(tokens[(i+1):(i+max_len+1)]))
def __len__(self):
return len(self.input_ids)
def __getitem__(self, idx):
return self.input_ids[idx], self.target_ids[idx]
Let's create an instance of this CausalLMDataset by passing our list of tokens as input. Throughout the notebook, we use a context size of 128 tokens by default. Since we aim for a 50% overlap between chunks, we set stride to half the context size.
context_size = 128
dataset = CausalLMDataset(tokens, max_len=context_size, stride=context_size//2)
Lastly, we can create a DataLoader instance that handles all the batching and shuffling of the samples for us.
loader = DataLoader(dataset, batch_size=64, shuffle=True, drop_last=True)
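As a quick check, we can draw a single batch from the loader. With the settings above, both the input and target tensors should have the shape (batch_size, context_size), i.e., (64, 128):

inputs, targets = next(iter(loader))
print(inputs.shape, targets.shape)  # expected: torch.Size([64, 128]) torch.Size([64, 128])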
With the data loader, we are now ready to train our model. Again, keep in mind that the data preparation was rather simple since we only work with a very small dataset — at least very small in the context of training an LLM — so that the complete dataset fits into the memory of a single machine, which is sufficient for our purpose here.
Auxiliary Methods¶
For the model training and the very crude qualitative evaluation of the model (discussed later), we next define a few auxiliary methods. They keep the code cleaner but also support strategies such as checkpointing, which are needed when training large models in practice.
Training a Single Epoch¶
The method train_epoch() takes a data loader, a model, and an optimizer to train the model for a single epoch by iterating over all batches of the data loader. Each batch — composed of input and target sequences — is passed to the model to compute the model output (i.e., the logits) and the loss (based on the loss function defined as part of the model). For this method, we only need the loss to perform backpropagation and update the model weights based on the strategy of the optimizer. During the epoch, the method accumulates the losses of all batches into a final epoch loss, which is then returned.
def train_epoch(loader, model, optimizer, description):
epoch_loss = 0.0
model_device = next(model.parameters()).device
for idx, (inputs, targets) in enumerate(tqdm(loader, desc=description, leave=False)):
# Move current batch to GPU, if available
inputs, targets = inputs.to(model_device), targets.to(model_device)
# Calculate loss
logits, loss = model(inputs, targets)
# Reset the gradients from previous iteration
model.zero_grad()
# Calculate new Gradients using backpropagation
loss.backward()
# Update all trainable parameters (i.e., the theta values of the model)
optimizer.step()
# Update epoch loss
epoch_loss += loss.item()
return epoch_loss
When later training a model over several epochs, we can simply repeatedly call this method to handle training of a single epoch.
Saving & Loading Checkpoints¶
Training a Transformer-based LLM requires a lot of time, even for a rather small model trained on a rather small dataset. A checkpoint in model training is a saved snapshot of the model's state at a specific point during training, typically after a certain number of steps or epochs. It usually includes the model's parameters (weights and biases), the optimizer state (to resume learning with the same momentum and learning rate adjustments), and sometimes metadata like the current epoch or training loss. This allows training to be paused and later resumed from that point without starting over, which is especially important for large models that take days or weeks to train.
While many libraries have built-in support for periodically saving checkpoints, in this notebook we purposefully use only PyTorch and avoid libraries with a higher level of abstraction. However, saving a checkpoint is very straightforward. The method save_checkpoint() defined in the code cell below takes a model and optimizer instance, as well as information about the current epoch and epoch loss. The method then combines all information required to resume training into a single object and uses PyTorch's save() method to write this object to a file.
In PyTorch, the state_dict() object is a Python dictionary that contains all the learnable parameters and persistent states of a model or optimizer. For models, it stores mappings from each layer's name to its corresponding tensor values (like weights and biases). For optimizers, it includes the current state of optimization variables such as momentum buffers and learning rate schedules. This dictionary enables easy saving, loading, and transferring of model and optimizer states, making it central to checkpointing and model deployment. By calling torch.save(model.state_dict()), you can preserve a model's learned parameters, and later restore them with model.load_state_dict(), ensuring the model continues exactly where it left off.
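To make this concrete, the small sketch below inspects the state_dict() of a throwaway linear layer; our actual model's state dictionary simply contains many more such name-to-tensor entries:

# Illustrative only: a state_dict maps parameter names to their tensors
tiny = nn.Linear(4, 2)
print(list(tiny.state_dict().keys()))     # ['weight', 'bias']
print(tiny.state_dict()["weight"].shape)  # torch.Size([2, 4])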
def save_checkpoint(model, optimizer, epoch, loss, path="checkpoint.pt"):
checkpoint = {
"epoch": epoch,
"model_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
"loss": loss,
}
torch.save(checkpoint, path)
print(f"Checkpoint saved at {path}")
Naturally, the counterpart to saving a checkpoint is loading a saved checkpoint, as implemented by the method load_checkpoint() below. Notice that this method also takes a model and optimizer instance. In other words, the method does not create a model or optimizer but sets the state of both instances to the states read from the checkpoint file. This of course only works if the model and the optimizer have the same "structure" as the model and optimizer used for training. For example, we cannot train a Transformer model with 4 layers and then load its state into a model with more or fewer layers.
Also, notice the map_location parameter of PyTorch's load() method. This parameter controls how tensors are remapped to devices (like CPU or GPU) when loading a saved model or checkpoint. This is useful when the model was trained on one device but needs to be loaded on another; for example, loading GPU-trained weights onto a CPU-only machine. By specifying map_location='cpu', all tensors are loaded to the CPU regardless of where they were originally saved, while map_location='cuda:0' loads them to the first GPU. It can also take a custom function or dictionary to flexibly remap devices, ensuring model compatibility across different hardware setups and preventing errors caused by unavailable devices.
def load_checkpoint(model, optimizer, path="checkpoint.pt", device="cuda"):
checkpoint = torch.load(path, map_location=device)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
epoch = checkpoint["epoch"]
loss = checkpoint["loss"]
print(f"Checkpoint loaded (epoch {epoch}, loss {loss:.4f})")
return epoch, loss
Generate & Save Example Responses¶
When training any kind of model, we typically track the progress by measuring some form of validation loss over time. However, here we keep it very simple and omit a separate validation dataset for computing a validation loss after each epoch. After all, the loss itself does not really tell us how well the model is performing. Instead, we first define the method generate_response() that returns the response generated by a model for a given prompt. Notice that this method also needs the tokenizer to encode the prompt and decode the generated response.
def generate_response(prompt, tokenizer, model, max_new_tokens=50):
model_device = next(model.parameters()).device
prompt_encoded = tokenizer.encode(prompt, return_tensors="pt").to(model_device)
generated = model.generate(prompt_encoded, max_new_tokens=max_new_tokens)
output_text = tokenizer.decode(generated[0], skip_special_tokens=True)
return output_text
Similar to saving checkpoints when training the model for a long time, we might also want to save the generated responses for some prompts, e.g., after each epoch. This allows us to later inspect the generated files and see how the quality of the responses has changed over time. The method generate_example_responses() implements this idea using three simple example prompts related to movie reviews; of course, you can edit these prompts or add additional ones. All generated responses are stored in a file with the specified file name.
def generate_example_responses(tokenizer, model, path="example-responses.txt"):
prompts = ["the best part of the movie was", "my favorite scene of the in the movie", "the script and the direction"]
with open(path, "w") as file:
for prompt in prompts:
response = generate_response(prompt, tokenizer, model)
file.write(f"{response}\n\n")
In the full training mode, we save checkpoints and example responses to disk. Using the code cell below, you can create a folder into which the checkpoint and output files are stored. You can customize the path to fit your local setup.
folder = create_folder("data/generated/models/gpt2/")
print(folder)
data/generated/models/gpt2/
Creating the Model¶
The GPT-2 model is a decoder-only Transformer architecture: it is designed specifically for autoregressive language modeling, where the goal is to predict the next token in a sequence based solely on previous tokens. Unlike the original Transformer, which uses an encoder–decoder structure for sequence-to-sequence tasks such as translation, GPT-2 discards the encoder and retains only the decoder blocks, each composed of self-attention and feedforward layers. Importantly, it applies causal masking, ensuring that each token can only attend to earlier positions, thereby preserving the left-to-right generation order required for text prediction. This simplified, decoder-only design makes GPT-2 efficient for generative tasks while maintaining the full expressive power of the Transformer architecture.
Positional Encoding¶
During training and inference, the Transformer processes an input sequence in parallel by computing the alignment scores between all pairs of input tokens as part of the attention mechanism. This means that the Transformer has no built-in mechanism to capture the position of tokens. However, for properly understanding language and text, word/token order matters. This is where positional encodings come into play. Similar to word embedding vectors, positional encodings are vectors that capture the positional information of tokens. We have several dedicated notebooks that discuss the importance of positional encodings and common strategies.
In this notebook, we adopt the positional encodings proposed in the original Transformer paper "Attention Is All You Need". In a nutshell, these absolute positional encodings are added to the input embeddings at the bottom of the encoder and decoder stacks, allowing the model to leverage both token identity and position. Rather than learning these positions as parameters, the paper proposed a fixed sinusoidal encoding scheme based on sine and cosine functions of varying frequencies. The sinusoidal formulation ensures that each position in the sequence is mapped to a unique vector, while also enabling the model to generalize to sequence lengths longer than those seen during training. Specifically, for each position $pos$ and embedding dimension index $i$, the encoding is defined as:

$$PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right), \qquad PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{\text{model}}}}\right)$$

where $d_{\text{model}}$ denotes the embedding size.
This design produces smooth, continuous patterns that encode both absolute position and relative distance information through the phase difference between positions, providing the Transformer with a simple yet effective way to model sequential structure without relying on recurrence or convolution. Again, we cover this approach in more detail in its own notebook.
The class PositionalEncoding in the code cell below implements this sinusoidal encoding strategy. The advantage of using absolute positional encodings is that all vectors contain fixed values and can therefore be computed a priori, and then simply added to the word embedding vectors during training and inference. Note that the encodings are stored with a leading dimension of size 1 so that they broadcast over the batch-first tensors used throughout this notebook. By using the register_buffer() method, we specify that the positional encodings are part of the model but should not be treated as learnable parameters, and will therefore not be updated by the optimizer. However, we have to make the encodings part of the model's state so that they are included when saving checkpoints.
class PositionalEncoding(nn.Module):
def __init__(self, emb_size: int, dropout: float, maxlen: int = 1024):
super(PositionalEncoding, self).__init__()
den = torch.exp(- torch.arange(0, emb_size, 2)* np.log(10000) / emb_size)
pos = torch.arange(0, maxlen).reshape(maxlen, 1)
pos_embedding = torch.zeros((maxlen, emb_size))
pos_embedding[:, 0::2] = torch.sin(pos * den)
pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(0)  # shape (1, maxlen, emb_size) to broadcast over batch-first inputs
self.dropout = nn.Dropout(dropout)
self.register_buffer('pos_embedding', pos_embedding)
    def forward(self, token_embedding):
        # token_embedding has shape (batch_size, seq_len, emb_size); add the encodings for the first seq_len positions
        return self.dropout(token_embedding + self.pos_embedding[:, :token_embedding.size(1), :])
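A quick shape check with illustrative values confirms that the precomputed encodings broadcast over the batch dimension when added to a batch-first tensor of token embeddings:

pe = PositionalEncoding(emb_size=16, dropout=0.0, maxlen=32)
dummy = torch.zeros(2, 8, 16)  # (batch_size, seq_len, emb_size) in batch-first layout
print(pe(dummy).shape)         # expected: torch.Size([2, 8, 16])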
Transformer Model¶
GPT-2, like many other popular LLMs, is a so-called decoder-only model. This not only means that the model uses only the decoder of the overall architecture, but also that the decoder is missing the cross-attention layer that aligns the encoder output with the decoder input. The figure below illustrates this idea by shading out all the parts of the complete Transformer architecture that are not used for training a decoder-only LLM such as GPT-2.
To implement our model, we can directly use the classes implementing the Transformer architecture provided by PyTorch. This includes the nn.TransformerDecoder class implementing the decoder part of the Transformer. However, the problem is that this class expects as additional input the output of an encoder, which we obviously do not have in our decoder-only setup. If you look at the figure above, notice that the decoder without the cross-attention block is the same as the encoder. Thus, we can simply use the nn.TransformerEncoder class. We only have to keep in mind to still use a causal mask during training and inference to stop the model from "looking into the future".
The class GPT2LanguageModel below implements the complete model. Apart from the vocabulary size, the input parameters of the constructor specify the different hyperparameters of the encoder layers, including the number of attention heads and the number of layers. The internal method _generate_square_subsequent_mask() precomputes the causal mask for the maximum context size (i.e., the maximum length of the input sequence). Again, since the mask contains fixed values and not trainable parameters, we use register_buffer() to add the mask to the model's state. Overall, this class is essentially a 1:1 implementation of the figure above.
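To make the causal mask concrete, the snippet below prints the boolean mask for a hypothetical sequence of 5 tokens, using the same torch.triu construction as in the class below. True marks a position that a token is not allowed to attend to, i.e., a future token:

# Each row corresponds to a query token; True entries are masked-out (future) positions
print(torch.triu(torch.ones(5, 5), diagonal=1).bool())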
class GPT2LanguageModel(nn.Module):
def __init__(self, vocab_size, d_model=512, n_heads=8, num_layers=6, context_size=128, dropout=0.1):
super().__init__()
self.vocab_size = vocab_size
self.d_model = d_model
self.context_size = context_size
# Token and position embedding layers
self.token_embed = nn.Embedding(vocab_size, d_model)
self.pos_embed = PositionalEncoding(d_model, dropout=dropout)
# Transformer encoder = decoder without cross-attention block
encoder_layer = nn.TransformerEncoderLayer(
d_model=d_model,
nhead=n_heads,
dim_feedforward=4*d_model,
dropout=dropout,
activation="gelu",
batch_first=True
)
self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        # Output projection from the embedding size to the vocabulary size
self.lm_head = nn.Linear(d_model, vocab_size, bias=False)
# Create causal mask and initialize weights
self.register_buffer("mask", self._generate_square_subsequent_mask(context_size))
self._init_weights()
def _init_weights(self):
nn.init.normal_(self.token_embed.weight, mean=0.0, std=0.02)
nn.init.normal_(self.lm_head.weight, mean=0.0, std=0.02)
def _generate_square_subsequent_mask(self, sz):
mask = torch.triu(torch.ones(sz, sz), diagonal=1).bool()
return mask
def forward(self, input_ids, targets=None):
# Get batch size B and sequence length T from inputs
B, T = input_ids.shape
device = input_ids.device
# Generate combined token and position embedding vectors
x = self.pos_embed(self.token_embed(input_ids))
# Create required causal mask by trimming precomputed mask to current input length
causal_mask = self.mask[:T, :T].to(device)
# Pass inputs and mask to transformer
x = self.transformer(x, mask=causal_mask)
# Compute output logits
logits = self.lm_head(x) # (B, T, vocab_size)
loss = None
# If targets are available (during training), compute loss
if targets is not None:
loss = F.cross_entropy(
logits.view(-1, self.vocab_size),
targets.view(-1),
ignore_index=-1,
)
# Return logits (for inference) and the loss (for training)
return logits, loss
@torch.no_grad()
def generate(self, input_ids, max_new_tokens=50, temperature=1.0, top_k=10):
for _ in range(max_new_tokens):
if input_ids.size(1) > self.context_size:
input_ids = input_ids[:, -self.context_size:]
logits, _ = self(input_ids)
logits = logits[:, -1, :] / temperature # focus on last token
# Top-k filtering
if top_k is not None and top_k < logits.size(-1):
topk_vals, topk_idx = torch.topk(logits, top_k)
probs = F.softmax(topk_vals, dim=-1)
next_token = topk_idx.gather(
1, torch.multinomial(probs, num_samples=1)
)
else:
# fallback to full softmax sampling
probs = F.softmax(logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)
input_ids = torch.cat([input_ids, next_token], dim=1)
return input_ids
The class GPT2LanguageModel also implements the generate() method which, given a prompt in terms of a list input_ids of token indices, generates a response by repeatedly predicting the next most likely token. By default, the next token is sampled from the top_k=10 tokens with the highest scores (logits). The temperature parameter is another way to control how deterministically or how randomly the next token is sampled from the top-k tokens.
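The short sketch below illustrates both knobs on made-up logits for a hypothetical 5-token vocabulary: lower temperatures sharpen the softmax distribution (more deterministic sampling), higher temperatures flatten it, and top-k filtering keeps only the k highest-scoring tokens before renormalizing:

# Made-up logits for a hypothetical 5-token vocabulary
logits = torch.tensor([[2.0, 1.0, 0.5, 0.1, -1.0]])
for temperature in (0.5, 1.0, 2.0):
    probs = F.softmax(logits / temperature, dim=-1)
    print(f"temperature={temperature}: {probs}")
# Top-k filtering with k=3: keep the three largest logits and renormalize
topk_vals, topk_idx = torch.topk(logits, k=3)
print(topk_idx, F.softmax(topk_vals, dim=-1))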
We can now create the instances of the model and the optimizer to start the training. Again, we go with a smaller model for the demo mode, where smaller refers to a smaller embedding size, fewer attention heads, and fewer encoder layers. Notice that you are unlikely to see great results in terms of human-like responses, particularly when training only for 5 epochs by default; see further below. However, you should be able to see responses that are often piece-wise coherent.
if mode == "demo":
model = GPT2LanguageModel(tokenizer.vocab_size, d_model=128, n_heads=4, num_layers=3).to(device)
else:
model = GPT2LanguageModel(tokenizer.vocab_size, d_model=256, n_heads=8, num_layers=6).to(device)
optimizer = optim.AdamW(
model.parameters(),
lr=3e-4, # initial learning rate
betas=(0.9, 0.95), # GPT-2 and many LM use this instead of (0.9, 0.999)
weight_decay=0.1 # encourages generalization
)
print(model)
GPT2LanguageModel(
(token_embed): Embedding(50257, 128)
(pos_embed): PositionalEncoding(
(dropout): Dropout(p=0.1, inplace=False)
)
(transformer): TransformerEncoder(
(layers): ModuleList(
(0-2): 3 x TransformerEncoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
)
(linear1): Linear(in_features=128, out_features=512, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(linear2): Linear(in_features=512, out_features=128, bias=True)
(norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
)
)
)
(lm_head): Linear(in_features=128, out_features=50257, bias=False)
)
Before we start training, let's quickly check what the model would generate for a given prompt right now. In demo mode, we simply print the response as the output of the code cell below. Otherwise, we use our auxiliary generate_example_responses() method to save the responses of our example prompts to a file for later inspection.
if mode == "demo":
prompt = "the best part of the movie was"
print(generate_response(prompt, tokenizer, model))
else:
generate_example_responses(tokenizer, model, path=f"{folder}example-responses-{0}.txt")
the best part of the movie was MEN pipeiopCong YOU thruiopiopaks YOUiop Points childcare YOU foreground Vanilla childcareESPN textures textures Points barrier textures Herman bathroomsiop erasediopESPNaks childcare MEN MEN beginner MEN thru motivate bandampire boiler barrieraksaks MEN Points cannonaks thru Hockey childcare
Unsurprisingly, without training, the response is complete gibberish beyond the tokens of the given prompt. However, you can use this initial result as a baseline to compare against the responses after even just the first epoch.
Model Training¶
Using our auxiliary method train_epoch(), the code for training our model for several epochs becomes very simple; see the code cell below. In demo mode, we simply print the response of the model for our example prompt (see above) after each epoch. In the full training mode, we save a checkpoint as well as example responses to disk after each epoch. For an initial run, even in demo mode, you can also change the number of epochs to $1$, just to see how long training an epoch takes.
Important: While the code below saves a checkpoint after each epoch in full training mode, it does not automatically resume training after a problem by loading the last checkpoint; notice that the method load_checkpoint() is not actually used anywhere in the notebook. This is to keep the code as simple and clean as possible. Also, even in full training mode — and assuming a decent consumer GPU — the training time is measured in a few hours rather than days.
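For completeness, resuming from a checkpoint could look like the hedged sketch below. It assumes a checkpoint file from an earlier full-mode run exists in folder (the file name checkpoint-3.pt is hypothetical) and that model and optimizer were constructed with exactly the same configuration as during that run:

import os

# Hypothetical resume sketch: restore model/optimizer states if a matching checkpoint exists
resume_path = f"{folder}checkpoint-3.pt"  # assumed to exist from a previous full-mode run
if os.path.exists(resume_path):
    start_epoch, last_loss = load_checkpoint(model, optimizer, path=resume_path, device=device)
    print(f"Resuming training from epoch {start_epoch + 1}")
else:
    print(f"No checkpoint found at {resume_path}; training starts from scratch below.")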
num_epochs = 5
for epoch in range(num_epochs):
description = f"Epoch {epoch+1}/{num_epochs}"
epoch_loss = train_epoch(loader, model, optimizer, description)
#
if mode == "demo":
print(generate_response(prompt, tokenizer, model))
else:
save_checkpoint(model, optimizer, epoch+1, epoch_loss, path=f"{folder}checkpoint-{epoch+1}.pt")
generate_example_responses(tokenizer, model, path=f"{folder}example-responses-{epoch+1}.txt")
print(f"Done training {num_epochs} epochs.")
the best part of the movie was a few, and it. there's been a movie. i can't have been an lot of the film that and i can be the movie was a very very very bad. the movie to get the first movie. it was a very film.
the best part of the movie was in a film. this movie. there are so bad movie is just the movie. i can make up to be a few minutes. it's no sense of the story, but this. the film is a film, and one of the end,
the best part of the movie was a film was to me of the movie. it's really just a film. but i've been funny. it. i was a movie are a bad, but this movie is a great movie that the "i am not even
the best part of the movie was the movie is bad. the movie was a good. this movie was not to the characters and you have the movie i was the movie was a bad, it was the movie was so stupid. this is that the worst. i had a
the best part of the movie was the plot, but they could have been a few other than it was not to do not as it is a lot better. it just to the worst film was made. it was a film. it is no sense. and the worst is the acting Done training 5 epochs.
In demo mode and after only a handful of epochs, you should be able to see the responses slowly become better, with short phrases becoming more and more coherent. However, getting the model to generate fluent and coherent responses across whole sequences requires a substantial amount of training and a sufficiently large model.
Summary¶
Training a GPT-style language model from scratch provides valuable insight into the core components and workflow that underlie modern large language models. Even when using a small, toy dataset, the process mirrors the same fundamental principles used to train state-of-the-art systems. The training begins with preparing a text corpus for next-word prediction, the central objective of autoregressive language models. This involves tokenizing the text, building a vocabulary, and generating input-target pairs where the model learns to predict the next token in a sequence based on the preceding ones.
At the heart of the model lies the Transformer architecture, which uses stacked layers of self-attention and feedforward networks to model long-range dependencies in text. In a GPT-style (decoder-only) setup, the Transformer employs masked self-attention, ensuring that predictions for a given position depend only on earlier tokens in the sequence. Positional encodings—often fixed sinusoidal functions—are added to the token embeddings to give the model a sense of order, since attention itself has no inherent notion of sequence. These design elements together enable the model to capture complex syntactic and semantic relationships between words.
The training process typically involves minimizing a cross-entropy loss that measures how well the model predicts the next token. Through repeated optimization steps, the model gradually improves its internal language representations. While a small model trained on limited data will not produce fluent or generalizable text, it still exhibits the same learning dynamics observed in much larger systems, offering a practical and interpretable context for understanding how Transformers learn.
In practice, training competitive LLMs requires vastly larger datasets, model sizes, and computational resources, often spanning hundreds of billions of tokens and thousands of GPUs. However, the overall workflow remains fundamentally the same — from data preparation and tokenization to model design, loss computation, and iterative optimization. By experimenting with small-scale GPT models, one can grasp the essential mechanics that scale up directly to real-world LLMs, providing a solid conceptual foundation for understanding the architecture and training pipeline behind today’s leading generative models.