Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.
Logit Distillation¶
Logit distillation is a form of transfer learning in which a smaller or simpler student model learns to imitate a larger and more capable teacher model by training directly on the teacher's logits, i.e., the raw, pre-softmax outputs. Although logit distillation can be applied to virtually all kinds of machine learning models, it has become especially popular for efficiently and effectively training small large-language models (LLMs). Modern LLMs are extremely large and expensive to train from scratch, but with logit distillation, a small student can inherit much of the linguistic competence, reasoning ability, and output structure of a large teacher at a fraction of the compute cost. This has made distillation a core technique behind many "compact" LLMs that provide strong performance while running on modest hardware.
The basic idea of logit distillation is to train the student not only to predict correct labels but also to replicate the soft output distribution of the teacher. The teacher's logits reveal which alternative outputs are plausible and by how much. This kind of information is completely lost when using one-hot labels. During training, the student minimizes a combination of standard cross-entropy with ground-truth labels and a distillation loss (often KL divergence) comparing its logits to the teacher's logits. A temperature parameter is commonly applied to soften the distributions, making the teacher's knowledge easier to mimic. By learning from these richer signals, the student model can approximate the teacher’s function more faithfully than it could by relying on hard labels alone. This often results in better generalization, smoother decision boundaries, and significantly improved performance for small models, especially in natural language tasks.
This notebook begins by introducing logit distillation on a conceptual level, explaining how a smaller student model can learn from the raw output logits of a larger teacher model. We will briefly discuss why logits provide a richer learning signal than hard labels, how this approach fits into the broader family of transfer-learning techniques, and why it has become especially important for training efficient yet capable small LLMs.
After building this conceptual foundation, the notebook will walk through a complete hands-on example using PyTorch. We will load a pretrained large language model as the teacher, set up a smaller model as the student, and implement the full distillation workflow—including generating teacher logits, computing the distillation loss, and optimizing the student model. By the end, you’ll have a clear understanding of both the theory and the practical code involved in performing logit distillation for LLMs.
Setting up the Notebook¶
Make Required Imports¶
This notebook requires the import of different Python packages as well as additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been successfully imported.
import sys
import pandas as pd
from tqdm import tqdm
import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from src.utils.compute.gpu import *
from src.utils.data.files import *
Download Required Data¶
Some code examples in this notebook use data that first needs to be downloaded by running the code cell below. If this code cell throws an error, please check in the configuration file config.yaml whether the URL for downloading datasets is up to date and matches the one on Github. If not, simply download or pull the latest version from Github.
movie_reviews_zip, target_folder = download_dataset("text/corpora/reviews/movie-reviews-imdb.zip")
File 'data/datasets/text/corpora/reviews/movie-reviews-imdb.zip' already exists (use 'overwrite=True' to overwrite it).
We also need to decompress the archive file.
movie_reviews = decompress_file(movie_reviews_zip, target_path=target_folder)
print(movie_reviews)
['data/datasets/text/corpora/reviews/movie-reviews-imdb.txt']
Checking & Setting Computing Device¶
PyTorch allows training neural networks on a supported GPU to significantly speed up the training process. If you have a supported GPU, feel free to utilize it; while not strictly required for this notebook, a GPU is recommended to speed up the training (see the preliminaries below). We provide an auxiliary method to automatically select the best device. It checks if a supported GPU is available and, if so, uses it as the preferred device.
# Select preferred device (GPU, if available; CPU otherwise); you can enforce the use of the CPU
DEVICE = select_device(force_cpu=False)
print("Available device: {}".format(DEVICE))
Available device: cuda:0
Preliminaries¶
Before checking out this notebook, please consider the following:
Logit distillation (or more generally: knowledge distillation) is a general approach to train machine learning models. However, in this notebook we focus on training LLMs since knowledge distillation has become particularly popular in this area, and we use this context for our practical example. Still, keep in mind that knowledge distillation is not limited to LLMs!
This notebook is for education and not for building a state-of-the-art LLM. Not only is the dataset very small, it also stems from a single domain: movie reviews. Also, we use the GPT-2 "Small" model as the teacher, which is far from a state-of-the-art LLM. But again, our focus is on understanding and clarity, not on model quality.
While not strictly required, we recommend using a GPU to speed up the training. Any modern consumer GPU supported by the PyTorch library should be fine. Even for the full training mode, the default parameters are chosen such that the training will not require more than 10 GB of VRAM, with 16 GB slowly becoming the standard even for consumer GPUs. However, to stay below 10 GB of memory, the batch size is rather small.
You can run the notebook in different modes, where the choice of the mode affects the number of movie reviews used for training. We recommend first using only 1,000 reviews (mode = "tiny") to see how long training the model for a single epoch takes. If everything is working, you can increase the dataset size by changing the mode.
#mode = "tiny" # 1,000 reviews
mode = "small" # 10,000 reviews
#mode = "full" # 100,000 reviews
Logit Distillation Explained¶
Many modern machine learning models — especially large neural networks and LLMs — are often over-parameterized. This over-parameterization is useful during training because it helps models fit complex patterns, explore large hypothesis spaces, and achieve strong generalization. However, once trained, these large models are often far more complex than necessary for the specific task or dataset they need to perform on (e.g., a customer chatbot for a banking application). In many practical applications, a smaller model could produce comparable outputs at a fraction of the computational cost, memory footprint, and latency. Despite this, training even a compact model from scratch still requires substantial compute, large datasets, and long training times.
On the other hand, we already have much larger foundation models that "know" language. So why not utilize this knowledge? This is where knowledge distillation becomes valuable. Instead of training a small model independently, distillation allows it to learn directly from an already-trained larger model. By imitating the behavior of the large model, the smaller model can inherit much of the teacher's competence without needing the full training pipeline or dataset scale. This dramatically reduces the resources required to produce a high-performing small model, while preserving much of the accuracy and generalization power of the original. As a result, knowledge distillation has become a key strategy for deploying efficient and capable models, particularly in environments where compute, memory, or latency are constrained.
Overview & Basic Idea¶
The figure below illustrates on a very high level the basic idea of knowledge distillation in the context of LLMs: take a large pretrained model which performs very well, and transfer or "distill" its knowledge into a (much smaller) model. The "knowledge" that gets transferred is not literal facts or memorized data, but the teacher model's behavior — that is, its soft probability distributions, reasoning tendencies, and implicit structure of language. These soft targets or soft labels reveal which alternatives are plausible, how confident the teacher is, and how it organizes semantic and syntactic relationships. By training on the teacher's outputs, the student picks up the teacher's inductive biases, heuristics, and task-specific patterns, allowing it to imitate the teacher's style of reasoning and decision-making even with far fewer parameters.
Knowledge distillation is not a single method but a general concept, and different ways to implement this concept exist. While practical implementations are often hybrid and can differ in many important or subtle details, knowledge distillation can broadly be categorized into three approaches:
Response distillation uses a teacher model to automatically generate annotations for an unlabeled dataset, allowing the student model to be trained without manual labeling, for example, by having the teacher LLM answer questions that form the training data. It is easy to implement, works in a black-box fashion through APIs, and generally yields high-quality labels if the teacher is strong. However, generating large numbers of annotations can be costly, teacher outputs lack the nuanced information found in logits (making the student less creative or flexible), and the method cannot produce accurate labels for highly domain-specific data the teacher was never trained on. In short, we simply use the teacher model to annotate an unlabeled dataset and then use this dataset to train the student model.
Logit distillation — the focus of this notebook — trains the student model to mimic the teacher model by comparing their logits rather than final text responses, which makes the method both more informative and easier to optimize. In principle, this can be done without labeled data by minimizing a distillation loss (e.g., KL divergence) between teacher and student logits, though this makes evaluation difficult and heavily depends on teacher quality. When ground-truth labels are available, logit distillation can be combined with the standard student loss, offering more stable training and easier evaluation but adding complexity due to the need to balance both losses and manage potential conflicts when the teacher's predictions diverge from the labeled data, especially for examples outside the teacher's training distribution. As long as it is possible to obtain the logits as output from the teacher, logit distillation is still a black-box approach like response distillation.
Feature distillation extends beyond matching the teacher's outputs by training the student to replicate the teacher's internal representations, providing a deeper and richer supervision signal often used alongside logit distillation. However, this approach adds significant complexity because the student is typically much smaller, making it nontrivial to align layers or activations between the two models; for example, a student with fewer or narrower Transformer layers must be mapped to corresponding teacher layers, often requiring strategies like layer selection, learnable projections, or feature pooling to compute a meaningful feature-level loss. Of course, since we now need access to outputs of internal layers, feature distillation inherently assumes white-box access to the teacher model's architecture and parameters.
With our focus on logit distillation here, the figure below illustrates the basic idea of training a student model using this approach. More specifically, for this variant of logit distillation, we do not need labeled training data since we are only using the logit distillation loss to train the student such that its output mimics the output of the teacher.
However, in practice, knowledge distillation is often implemented using labeled training data. This means that the training still involves minimizing the student loss, i.e., the loss computed on the student's output compared to the ground-truth labels provided by the dataset. Again, the figure below illustrates this setup of training the student model using both the distillation and the student loss.
Since logit distillation allows training the student in a black-box fashion without access to the internals of the teacher model, its implementation is rather straightforward. In fact, compared to the standard training setup of an LLM, the only addition is the integration of the distillation loss. So let's see how this can be done and why this is such a promising idea.
Soft Labels vs Hard Labels¶
When training an LLM on the next-word prediction task using a labeled dataset (and without knowledge distillation), the ground-truth labels (i.e., the target tokens) specify a single correct next token. For example, if a training sample contains the sequence "Last night, I watched a great movie", with respect to the sequence "Last night, I watched a great", the word "movie" is considered the only correct prediction. These so-called hard labels carry no information about alternative possibilities: the model is pushed to place all probability mass on the correct token and zero on everything else. For our example sequence, this includes words such as "show", "episode", or "film", which are arguably also good predictions.
The standard loss function when working with hard labels is the Cross Entropy loss $\mathcal{L}_{\text{CE}}$, defined as follows:

$$\mathcal{L}_{\text{CE}}(\hat{\mathbf{y}}, \mathbf{y}) = -\sum_{i=1}^{C} y_i \log \hat{y}_i$$
where $\hat{\mathbf{y}}$ is the predicted output vector in terms of the Softmax probabilities across the set of all classes of size $C$, $\mathbf{y}$ is the hard label in the form of a one-hot vector with a $1$ at the index of the ground-truth class, and $\hat{y}_i$ and $y_i$ are the individual Softmax probability and hard label for class $i$, respectively. In the context of training an LLM on the next-word prediction task:
- the set of classes of size $C$ represents the vocabulary of unique tokens,
- $\hat{y}_i$ represents the output probability of the $i$-th token, and
- $y_i$ is either $0$ or $1$ — hard label! — indicating whether the $i$-th token is the ground-truth label ($1$) or not ($0$).
Since $y_i \in \{0, 1\}$ and $y_i = 1$ for only a single token across the whole vocabulary, $\mathcal{L}_{\text{CE}}(\hat{\mathbf{y}}, \mathbf{y})$ only depends on the output probability $\hat{y}_i$ of the ground-truth token. This means that during training, the goal is to maximize the probability of only this single target word — making no distinction between bad and possibly good alternative tokens.
In contrast, soft labels are probability distributions over all possible next tokens, typically produced by a larger teacher model in knowledge distillation. Instead of saying "the next token must be movie" for our example, a soft label might say: $movie: 0.025$, $show: 0.020$, ..., $nice: 0.004$, etc. for all words in the vocabulary. This distribution encodes richer information about uncertainty, similarity between tokens, and linguistic structure. Soft labels therefore give the student model a more nuanced training signal, helping it learn smoother decision boundaries, generalize better, and sometimes require less data compared to training only with hard labels. The figure below illustrates the difference between hard and soft labels for our example where our training sample is "Last night, I watched a great movie" and we want to predict the word "movie" given the previous sequence of words.
Again, the size of both the hard and soft label vectors reflects the size of the vocabulary, and in both cases all vector elements sum up to one — which is obvious in the case of the one-hot encoded hard labels. Notice also that the soft labels we get from the teacher model may not give "movie" the highest probability. In other words, there is no reason to assume that the teacher model — even if it is highly accurate — will predict the word given by the training sample as the most likely next word.
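To make this difference concrete, the small toy example below contrasts a one-hot hard label with a soft label over a tiny five-word vocabulary; all numbers are invented purely for illustration and have nothing to do with the actual GPT-2 vocabulary.

# Toy example (all numbers invented for illustration only)
vocab = ["movie", "show", "episode", "film", "nice"]
hard_label = torch.tensor([1.0, 0.0, 0.0, 0.0, 0.0])          # one-hot: only "movie" counts
soft_label = torch.tensor([0.45, 0.25, 0.15, 0.12, 0.03])     # teacher-style soft distribution
student_probs = torch.tensor([0.30, 0.30, 0.20, 0.15, 0.05])  # some student prediction
# Cross entropy with the hard label depends only on the probability assigned to "movie" ...
print(-(hard_label * student_probs.log()).sum())
# ... while with the soft label, every entry of the student's distribution contributes
print(-(soft_label * student_probs.log()).sum())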
As the name suggests, logit distillation uses logits, i.e., the pre-Softmax outputs of the student and teacher model, which again are just vectors of the size of the vocabulary. In the following, let $\mathbf{z}^s$ denote the logit output vector of the student model and $\mathbf{z}^t$ the logit output vector of the teacher model. The overall goal of logit distillation is now to update the parameters of the student model such that its output $\mathbf{z}^s$ becomes more similar to the output of the teacher model $\mathbf{z}^t$ for the same input sequence. This means that we need to define a loss function that quantifies the difference between $\mathbf{z}^s$ and $\mathbf{z}^t$. There are two basic approaches to implement such a loss function: using the Kullback–Leibler (KL) divergence or the Mean Squared Error (MSE) loss. Let's look at both of them in detail.
KL Divergence Loss¶
The seminal paper on knowledge distillation, "Distilling the Knowledge in a Neural Network", uses the KL divergence for the knowledge distillation loss. For discrete probability distributions, the KL divergence $KLD$ between two distributions $\mathbf{p}$ and $\mathbf{q}$ is defined as:

$$KLD(\mathbf{p} \,\|\, \mathbf{q}) = \sum_{i=1}^{C} p_i \log \frac{p_i}{q_i}$$
Intuitively, the KL divergence quantifies the information lost when approximating the true distribution $\mathbf{p}$ with an assumed distribution $\mathbf{q}$. We can also say that it quantifies the extra amount of information (e.g., in terms of bits when using $\log_2$) you need because you are pretending the world looks like $\mathbf{q}$ when the true pattern is $\mathbf{p}$. If $\mathbf{q}$ assigns low probabilities to outcomes that happen often under $\mathbf{p}$, the surprise and therefore the KL divergence will be large. If $\mathbf{q}$ matches $\mathbf{p}$ closely, there is very little extra surprise, and the KL divergence approaches zero. In knowledge distillation, $\mathbf{p}$ is the teacher's output distribution, and $\mathbf{q}$ is the student's output distribution: we want the student to imitate the teacher, so we measure how much extra "surprise" we would get if we used the student's predictions ($\mathbf{q}$) to model the teacher's predictions ($\mathbf{p}$).
Since $\mathbf{p}$ and $\mathbf{q}$ need to be proper probability distributions, we first need to convert our logit outputs $\mathbf{z}^s$ and $\mathbf{z}^t$ accordingly. As usual, we can use the Softmax function to convert logits into probabilities. The only common extension is to include a temperature $\tau$ that controls how sharp or smooth the probability distribution becomes when converting logits to probabilities with Softmax:
High temperature ($\tau > 1$): The logits are divided by a larger number, making them closer together. The Softmax output becomes softer, the differences between classes shrink, and the resulting probability distribution is more spread out or "flatter".
Low temperature ($\tau < 1$): The logits are divided by a smaller number, exaggerating their differences. The Softmax output becomes sharper, pushing the distribution closer to a one-hot vector.
Temperature $\tau = 1$ matches the standard Softmax function.
In simple terms, the temperature $\tau$ controls how much the model's confidence is smoothed or amplified; in knowledge distillation, higher temperatures typically help the student learn more nuanced information. More formally, we can define the temperature-scaled Softmax outputs $p_i$ and $q_i$ as follows:

$$p_i = \frac{\exp(z_i^t / \tau)}{\sum_{j=1}^{C} \exp(z_j^t / \tau)}, \qquad q_i = \frac{\exp(z_i^s / \tau)}{\sum_{j=1}^{C} \exp(z_j^s / \tau)}$$
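To get a feeling for the effect of $\tau$, the short snippet below (with made-up logits) applies the temperature-scaled Softmax for a few different temperatures:

# Effect of the temperature on the Softmax output (logit values are made up)
logits = torch.tensor([3.0, 1.0, 0.2])
for tau in [0.5, 1.0, 2.0, 5.0]:
    print(f"tau = {tau}: {F.softmax(logits / tau, dim=-1)}")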
For the complete distributions, we can therefore write $\mathbf{p} = softmax(\mathbf{z}^t, \tau)$ and $\mathbf{q} = softmax(\mathbf{z}^s, \tau)$. Again, notice that we use the teacher logits $\mathbf{z}^t$ to compute $\mathbf{p}$ and the student logits to compute $\mathbf{q}$. Thus, we can now define the distillation loss using our soft labels with respect to all $N$ training samples based on the KL divergence as:

$$\mathcal{L}_{\text{soft}}^{KLD} = \frac{1}{N} \sum_{k=1}^{N} KLD\left(\mathbf{p}^{(k)} \,\|\, \mathbf{q}^{(k)}\right)$$
where $\mathbf{p}^{(k)}$ and $\mathbf{q}^{(k)}$ are the teacher and student output distributions for the $k$-th training sample.
MSE Loss¶
A common alternative to using the KL divergence in knowledge distillation is to directly apply the Mean Squared Error (MSE) loss to the raw logits produced by the teacher and student models. This approach bypasses the softmax operation entirely, avoiding the additional complexity introduced by probabilities, temperature scaling, or normalization. Instead, the student is encouraged to regress toward the teacher's pre-softmax output values, which often contain richer information about class relationships, relative confidence levels, and the teacher's internal decision boundaries. Since logits can encode subtle structures that softmax probabilities may wash out, aligning the student to the teacher at the logit level can sometimes lead to more stable or effective training.
Given our two logit outputs $\mathbf{z}^t$ and $\mathbf{z}^s$ from the teacher and student model, the MSE is defined as follows:

$$MSE(\mathbf{z}^t, \mathbf{z}^s) = \frac{1}{C} \sum_{i=1}^{C} \left( z_i^t - z_i^s \right)^2$$
Again, $C$ is the number of classes, which is also the size of both output vectors $\mathbf{z}^t$ and $\mathbf{z}^s$. This gives us an alternative way to define the loss function using soft labels as the average MSE across all $N$ training samples:

$$\mathcal{L}_{\text{soft}}^{MSE} = \frac{1}{N} \sum_{k=1}^{N} MSE\left( (\mathbf{z}^t)^{(k)}, (\mathbf{z}^s)^{(k)} \right)$$
where $(\mathbf{z}^t)^{(k)}$ and $(\mathbf{z}^s)^{(k)}$ are the teacher and student logit outputs for the $k$-th training sample.
KLD vs MSE Loss: Pros & Cons¶
Having these two alternatives for computing the soft loss between the teacher and student logits, the obvious question is which one to choose in practice. The key difference between using KL divergence and MSE lies in what each loss function actually compares. KL divergence operates on probability distributions, meaning both teacher and student logits must first be passed through a softmax (typically with temperature scaling). As a result, KL-based distillation encourages the student to match the relative probability structure of the teacher, i.e., how the teacher distributes confidence across all tokens. In contrast, MSE operates directly on the raw logits, without converting them into probabilities. This means MSE encourages the student to regress toward the teacher's raw score values, preserving fine-grained differences that softmax may obscure.
KL divergence has the advantage of being theoretically grounded in information theory: it measures how much information is lost when the student approximates the teacher's probability distribution. Because it focuses on relative logit differences rather than absolute values, KL distillation is often more aligned with the ultimate behavior of autoregressive language models, which make decisions through softmax outputs. However, KL divergence can be sensitive to the temperature hyperparameter, and without proper tuning, gradients can vanish when the teacher's distribution becomes very peaked. It can also be computationally more expensive, since softmax operations must be applied at each training step.
MSE on logits, on the other hand, is simpler and often more stable in practice. Since it avoids the softmax, it reduces numerical complexity and eliminates the need for temperature tuning. This can make training faster and easier to debug. Additionally, because logits carry richer information than probabilities, especially about low-probability classes, MSE can transfer subtle teacher knowledge that the softmax would otherwise compress. However, MSE treats all logit dimensions equally, even though in a language model only the relative differences between logits matter for prediction; absolute logit scales may therefore mislead the student unless the teacher's logits are well-behaved.
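The following toy snippet (again with made-up logits) illustrates this last point: shifting all logits by a constant leaves the Softmax distribution, and therefore a KL-based loss, unchanged, while the MSE on the raw logits is clearly affected.

# Softmax only depends on relative logit differences; MSE on raw logits does not
z_teacher = torch.tensor([[2.0, 0.5, -1.0]])
z_student = z_teacher + 3.0   # same relative differences, shifted by a constant
print(F.softmax(z_teacher, dim=-1))       # same distribution ...
print(F.softmax(z_student, dim=-1))       # ... as this one
print(F.mse_loss(z_student, z_teacher))   # but the MSE is clearly non-zero (here: 9.0)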
Ultimately, the choice between KL divergence and MSE depends on the goal and the characteristics of the models involved. If you want the student to closely match the teacher's decision distribution (and are willing to tune temperatures), KL divergence is often the more principled choice. If you prefer a simpler, more direct, and sometimes more stable regression signal, especially early in training or with limited compute, MSE on logits can be an excellent alternative. Both approaches can yield strong results, and many distillation setups combine them with additional losses to balance their strengths. For example, the paper "Comparing Kullback-Leibler Divergence and Mean Squared Error Loss in Knowledge Distillation" compares the two loss functions and shows that MSE often performs better. However, the evaluation was done in the context of an image classification task, and it is not obvious how these results translate to LLMs.
Total Loss¶
If we want to train the student using both the hard and soft loss, we need to combine both losses to form the total loss to be minimized during training. While there is no single best way to do this, a very common approach is to compute a balanced loss using a weight term $\alpha \in [0, 1]$ that specifies how much the hard loss and the soft loss contribute to the total loss, which we can define as:

$$\mathcal{L}_{\text{total}} = \alpha \cdot \mathcal{L}_{\text{CE}} + (1 - \alpha) \cdot \mathcal{L}_{\text{soft}}$$

where $\mathcal{L}_{\text{soft}}$ is either $\mathcal{L}_{\text{soft}}^{KLD}$ or $\mathcal{L}_{\text{soft}}^{MSE}$ from above.
If $\alpha = 1$, the total loss depends only on the hard loss, thus considering only the standard Cross Entropy loss between the student's predictions and the ground-truth labels. In contrast, if $\alpha = 0$, we completely ignore the ground-truth labels and consider only the distillation loss for training the student model. Any value of $\alpha$ between $0$ and $1$ computes a weighted sum of both the hard and soft loss. Later, we will define a function that computes the balanced loss and allows us to flexibly set the value of $\alpha$.
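As a preview of how this formula maps to code, the sketch below shows one possible way to compute the total loss from the student logits, the teacher logits, and the ground-truth target token IDs. It is only an illustration under the assumptions stated in the comments and not the exact function defined later in this notebook.

# Illustrative sketch only (the notebook defines its own loss functions further below);
# assumes logits of shape (batch, sequence, vocab) and already aligned target token IDs
def total_loss_sketch(student_logits, teacher_logits, target_ids, alpha=0.5, temperature=2.0):
    # Hard loss: standard cross entropy between student predictions and ground-truth tokens
    hard_loss = F.cross_entropy(student_logits.view(-1, student_logits.size(-1)), target_ids.view(-1))
    # Soft loss: KL divergence between temperature-scaled teacher (p) and student (q) distributions
    log_q = F.log_softmax(student_logits / temperature, dim=-1)
    log_p = F.log_softmax(teacher_logits / temperature, dim=-1)
    soft_loss = (temperature ** 2) * F.kl_div(log_q, log_p, log_target=True, reduction="batchmean")
    # Weighted combination: alpha=1 -> only the hard loss, alpha=0 -> only the soft loss
    return alpha * hard_loss + (1.0 - alpha) * soft_loss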
Logit Distillation — A Complete Practical Example¶
Now that we know how logit distillation works, let's go through a concrete example of implementing logit distillation to train an LLM. More specifically, we use the pretrained GPT-2 model as the teacher and guide a compact student model to approximate its behavior on a given dataset. Throughout the rest of this notebook, we walk step-by-step through the full distillation workflow: preparing data, generating teacher logits, defining the distillation loss, and training the student model. The emphasis is on clarity and practical implementation, showing how logit distillation can significantly reduce training cost while still preserving much of the teacher's predictive power. By the end, you will have a clear understanding of how to implement logit distillation in practice and how it can serve as an effective strategy for compressing large language models.
Dataset Preparation¶
The ACL IMDB (Large Movie Review) dataset is a widely used benchmark dataset for sentiment analysis, introduced by Andrew Maas and colleagues at Stanford in 2011. It contains 50,000 labeled movie reviews from IMDb, evenly split into 25,000 for training and 25,000 for testing, with an equal number of positive and negative reviews in each split. In addition to the labeled reviews, the dataset includes 50,000 unlabeled reviews, intended to support semi-supervised learning experiments. For training the student model, we do not need the sentiment labels but only the review texts. We therefore already preprocessed the original dataset such that all 100,000 reviews are in a single file, with one review per line. This preprocessing included the removal of any HTML tags and line breaks.
For the training, we are following the common idea of treating the entire corpus as a continuous stream of documents, i.e., as one long, uninterrupted sequence of tokens rather than as separate, independent documents. Because document boundaries still matter semantically, many implementations insert special boundary tokens (e.g., [EOS]) to signal the transitions between documents. During training, chunks are drawn such that the model learns from natural text continuity while still being able to infer when one document ends and the next begins. This method helps maintain statistical consistency, supports scalable data pipelines, and aligns with how modern language models (e.g., GPT-style models) are typically trained on web-scale corpora. Thus, assuming $\text{doc}_{i}$ is the list of tokens representing the $i$-th document, our document stream has the following format:

$$\text{doc}_{1}\ \text{[EOS]}\ \text{doc}_{2}\ \text{[EOS]}\ \text{doc}_{3}\ \text{[EOS]}\ \dots$$
In practice, training on web-scale corpora is challenging because these datasets are far too large to fit into memory, making it impossible to load, shuffle, or repeatedly traverse them in the traditional way. Instead, the model must read the data as a continuous stream from disk or distributed storage, which introduces issues such as maintaining efficient throughput, handling document boundaries, and ensuring sufficient randomness without full in-memory shuffling. These constraints force the design of specialized data pipelines that can deliver tokens sequentially while still providing the statistical diversity required for effective language-model training. However, in this notebook, we can ignore these considerations since our dataset is small and fits into memory, making its handling much simpler.
Load Reviews from File¶
In the setup section of the notebook, we already downloaded the file containing all 100,000 movie reviews. The following code cell simply counts the number of reviews by counting the lines in the file, just to check if the dataset is complete. Note that we have to write movie_reviews[0] since movie_reviews is a list of files — it just so happens that the list contains only one file.
total_reviews = sum(1 for _ in open(movie_reviews[0]))
print(f"Total number of reviews (1 review per line): {total_reviews}")
Total number of reviews (1 review per line): 100000
Although we have a total of 100,000 reviews (each containing multiple sentences), we consider only 10,000 reviews in the default "small" mode to speed up the training. However, you can edit the code cell below to increase or decrease the number of considered reviews. For a first run, we recommend sticking to at most 10,000 reviews to execute and understand the code.
if mode == "tiny":
num_considered_reviews = 1_000
elif mode == "small":
num_considered_reviews = 10_000
else:
num_considered_reviews = 100_000
num_reviews = min(total_reviews, num_considered_reviews)
print(f"Number of reviews used for training dataset: {num_reviews}")
Number of reviews used for training dataset: 10000
Tokenize & Generate Token Stream¶
Modern LLMs are trained on documents tokenized with subword tokenizers because these tokenization methods provide a flexible balance between character-level and word-level representations. Natural language contains an enormous vocabulary with many rare words, inflections, and spelling variations that are difficult to model using pure word-level tokenization, which would require an impractically large vocabulary and produce many out-of-vocabulary tokens. Subword tokenizers (e.g. BPE, WordPiece, and SentencePiece) break words into smaller, reusable units, allowing the model to represent any text as a sequence of known symbols while still capturing meaningful linguistic structure.
The benefits of subword tokenization are substantial: it dramatically reduces vocabulary size, ensures coverage of all possible text inputs, and allows the model to share statistical strength across related words through common subword components. This leads to more efficient training, improved generalization to unseen words, and better handling of multilingual or noisy data. By limiting the vocabulary while preserving expressiveness, subword tokenizers enable LLMs to scale to massive corpora without exploding memory requirements or losing linguistic nuance.
Since our teacher model is GPT-2, we also need to use the corresponding pretrained GPT-2 tokenizer because the model was trained on text that was tokenized in a very specific way. The tokenizer defines the mapping from raw text to token IDs, including how words are split into subwords, special tokens used, and the exact vocabulary. If a different tokenizer were used, the input IDs would not match the representations the model expects, leading to incorrect embeddings and poor or nonsensical outputs. Essentially, the tokenizer and model must be aligned to ensure that the model interprets text in the same way it saw during pretraining. The GPT-2 tokenizer uses Byte-Pair Encoding (BPE) and has a vocabulary size of $50,257$.
To load the pretrained GPT-2 tokenizer, we can use the AutoTokenizer class from the Hugging Face Transformers library. This class is a high-level interface that automatically selects the appropriate tokenizer for a given pretrained model, abstracting away the need to know the specific tokenizer class. The from_pretrained method is used to load a tokenizer that has already been trained on a specific model's vocabulary and tokenization rules, either from the Hugging Face model hub or a local path. The code cell below uses this class and method to load the GPT-2 tokenizer.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
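As a quick sanity check, we can tokenize a short example sentence to see how the GPT-2 BPE tokenizer splits text into subword tokens and maps them to token IDs (the exact IDs depend on the GPT-2 vocabulary):

# Quick look at the GPT-2 tokenizer: text -> token IDs -> subword tokens -> text
sample = "Last night, I watched a great movie"
token_ids = tokenizer.encode(sample)
print(token_ids)
print(tokenizer.convert_ids_to_tokens(token_ids))
print(tokenizer.decode(token_ids))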
Recall that for our document stream, we need to indicate when one movie review ends and another review starts using some [EOS] (end-of-sequence) token. However, we cannot simply define our own unique token but must use a token that is known to the tokenizer, i.e., the token is part of the existing vocabulary. Most tokenizers include a small set of special tokens to indicate the end of a sequence, the beginning of a sequence, padding tokens, masked tokens, etc. — all depending on the data and learning task.
We can check the special_tokens_map of the GPT-2 tokenizer to see which special tokens it supports:
tokenizer.special_tokens_map
{'bos_token': '<|endoftext|>',
'eos_token': '<|endoftext|>',
'unk_token': '<|endoftext|>'}
We can see that the GPT-2 tokenizer recognizes only one special token: <|endoftext|>. Since GPT-2 was also trained on a document stream, we only need a single token acting as a separator between documents, which could be either the [EOS] or [BOS] token. The GPT-2 tokenizer also does not require a dedicated [UNK] (unknown) token, since BPE just tokenizes unknown words into known subwords or even just characters, if needed. Let's define this <|endoftext|> token as a constant for creating our document stream.
EOS_TOKEN_GPT2 = "<|endoftext|>"
With the tokenizer, we can now go through all movie reviews (or the maximum number of reviews specified) and tokenize them; see the code cell below. Notice that we preprocess each review before tokenizing by removing any newline characters, converting all words to lowercase, and adding the special [EOS] token at the end.
Lowercasing all words can be very useful when working with smaller datasets because it significantly reduces the size of the vocabulary the model needs to learn. In a small corpus, many words appear only a handful of times, and treating uppercase and lowercase forms as separate tokens (e.g., "Movie" vs. "movie") further fragments the data. By converting everything to lowercase, the model encounters each word more frequently, allowing it to learn better embeddings and stronger statistical patterns from limited examples. This makes training more stable and reduces the risk of overfitting to infrequent capitalized variants.
Additionally, the goal in many educational or exploratory projects is to understand the mechanics of training sequence models and not to capture subtle linguistic nuances such as proper noun capitalization. Lowercasing simplifies preprocessing, reduces noise, and helps the model focus on learning the core structure of the language rather than expending capacity on orthographic variations. For small-scale experiments, this trade-off is highly beneficial: you get cleaner data, faster training, and more reliable results without sacrificing the insights the project aims to teach.
tokens = []
with open(movie_reviews[0]) as file:
    for idx, review in enumerate(tqdm(file, total=num_reviews, leave=False)):
        if idx >= num_reviews:
            break
        # Strip newlines, lowercase the review, and append the [EOS] token before tokenizing
        tokens.extend(tokenizer.encode(f"{review.strip().lower()} {EOS_TOKEN_GPT2}", truncation=True, max_length=sys.maxsize))
print(f"Total number of tokens: {len(tokens)}")
Total number of tokens: 2880982
Create Dataset and DataLoader¶
In PyTorch, the Dataset class is an abstraction that defines how data is accessed and preprocessed for training. It provides a consistent interface to load individual samples and their labels through the methods __len__() and __getitem__(). This allows you to wrap any type of data (text, images, tabular data, etc.) into a standardized format that PyTorch models can easily consume. The DataLoader class then builds on top of this by handling the efficient batching, shuffling, and parallel loading of data samples from a Dataset. It automatically groups multiple samples into mini-batches and can use multiple worker processes to load data in parallel, ensuring that the GPU remains fully utilized during training.
For creating our Dataset instance, recall that GPT-style LLMs are trained based on the next-word prediction task — given a sequence of words, which is the next likely word to follow? The figure below shows the required training setup for the Transformer decoder. The target sequence is (almost) the same as the input sequence, only shifted by 1 token to the left. Note that the dashed line represents the causal masking where the prediction of a token only depends on preceding tokens but not "future" tokens — recall that the decoder processes all tokens in parallel during training, so we need to mask the attention between a token and all tokens that follow it.
Of course, we cannot give the whole sequence of tokens to the model at once. When training LLMs, the context size, i.e., the number of tokens the model can attend to at once, is typically fixed to a maximum value due to both computational and architectural constraints. Transformer models compute attention across all token pairs within a sequence, which scales quadratically with sequence length in both memory and computation cost. This means that doubling the context size roughly quadruples the resources required per training step. To make training feasible on available hardware, a practical upper bound (e.g., 512, 1024, or 4096 tokens) is chosen so that the model can learn meaningful dependencies without exhausting GPU memory or dramatically slowing training.
We therefore have to feed the model all tokens in chunks. In this notebook, we use a common sliding window approach that forms chunks of a fixed size. More specifically, we use a sliding window with a 50% overlap — see the example in the figure below. In this simple example, the context size is 6 tokens, so an overlap of 50% means that the last 3 tokens of the current chunk will be the first 3 tokens of the next chunk.
The class GPT2TextDataset in the code cell below implements the sliding window approach as a custom Dataset class. The max_length parameter specifies the maximum context size, and the optional stride parameter specifies by how many tokens the sliding window should be moved each time. If stride=None, the window is moved by the whole context size, thus resulting in chunks without overlap. Notice that we return only the input sequences but not the target sequences. This is because we will later use the GPT2LMHeadModel class to create the student model. This class shifts the input sequences by $1$ token to the left to get the target sequences under the hood, so we do not have to worry about that.
class GPT2TextDataset(Dataset):

    def __init__(self, tokens, max_length=128, stride=None):
        self.input_ids = []
        # By default, move the window by the full context size (i.e., chunks without overlap)
        if stride is None:
            stride = max_length
        # Slide the window over the token stream and store each chunk as a tensor
        for i in range(0, len(tokens)-max_length, stride):
            self.input_ids.append(torch.LongTensor(tokens[i:(i+max_length)]))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx]
Let's use our list of all tokens to create an instance of the GPT2TextDataset class. For this, we need to specify the context size, i.e., the maximum length of the input sequences. When training an LLM, the context size (or context window) refers to the maximum number of tokens the model can attend to at once through its self-attention mechanism. This determines how far back in the text the model can directly "look" when predicting the next token. For example, if the context size is $1,024$ tokens, the model can condition its predictions on at most those $1,024$ previous tokens, regardless of how long the full document is.
In principle, the context size is not a direct hyperparameter for knowledge distillation tasks, since the context size does not determine the size or complexity of the model — which depends on the number of layers, the width of each layer (hidden dimension), and the number of attention heads. These architectural choices determine the total number of parameters, compute cost, and representational capacity. The context size only affects the shape of the training data and the compute per forward pass (since attention scales with sequence length), but it does not change the learned weights or the overall architecture.
However, there are two considerations when setting the context_size parameter in the code cell below:

- The context size of the GPT-2 model is $1,024$ tokens, and it cannot use a larger context during inference because it relies on learned absolute positional embeddings. Unlike modern models that often use relative encodings (like RoPE or ALiBi) to generalize to longer lengths, GPT-2 was trained with a fixed lookup table (a specific weight matrix) containing exactly 1,024 unique vectors, one for each possible position in the sequence. Consequently, if you attempt to feed the model more than $1,024$ tokens, it physically lacks the trained parameters to represent the position of the $1,025$th token, making inference impossible without truncating the input or fundamentally modifying and retraining the architecture. In short, we cannot set context_size to a value larger than $1,024$.

- While the context size does not affect the model size, it does directly affect the memory requirements for training and inference because the attention mechanism scales quadratically with sequence length: for a context window of $N$ tokens, each layer computes an $N\!\times\!N$ attention matrix. During training, as $N$ grows, the model must store larger intermediate representations, gradients, and attention scores, which directly increases GPU memory usage. Longer sequences also increase the amount of activation memory that must be kept for backpropagation, making training significantly more expensive even when the model architecture (number of layers, hidden size) stays the same. Inference requires less memory, but since the model still computes an $N\!\times\!N$ attention matrix, the memory requirements still scale quadratically.
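As a rough back-of-the-envelope illustration of this quadratic scaling (ignoring optimizations such as fused attention kernels), the snippet below estimates the size of the raw fp32 attention matrices of a single layer of GPT-2 Small (12 heads), assuming the batch size of 32 used later in this notebook:

# Rough, illustrative estimate only: batch * heads * N * N values at 4 bytes each (fp32)
def attention_matrix_mb(batch_size, num_heads, context_size, bytes_per_value=4):
    return batch_size * num_heads * context_size**2 * bytes_per_value / 1024**2

print(attention_matrix_mb(32, 12, 128))    # ~24 MB per layer
print(attention_matrix_mb(32, 12, 1024))   # ~1536 MB per layer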
By default, we set the context size to $128$ (see below), which is often considered on the lower end for training small LLMs on small datasets. That being said, if you run out of memory when training the student model using logit distillation, you can try further reducing the context size to lower the overall memory footprint.
context_size = 128
dataset = GPT2TextDataset(tokens, max_length=context_size, stride=context_size//2)
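As a small sanity check, we can verify that consecutive chunks indeed overlap by half the context size, i.e., the second half of one chunk equals the first half of the next chunk:

# With stride = context_size // 2, consecutive chunks overlap by half the context size
half = context_size // 2
print(f"Number of chunks: {len(dataset)}")
print(torch.equal(dataset[0][half:], dataset[1][:half]))   # expected: True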
Using the dataset instance, we can now also create the data loader instance that handles all the batching and shuffling for us.
loader = DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True)
During training, we can simply use the data loader to iterate over all batches. The code cell below shows the general idea, but only prints the shape of the first batch for illustration purposes. Appreciate that we did nothing to create the batches; this is all managed by the data loader. While using larger batch sizes generally improves efficiency, larger batches also mean larger memory requirements. So again, if needed, you can reduce the batch size to save memory.
for batch in loader:
    print(batch.shape)
    break
torch.Size([32, 128])
As you should have expected, the shape of a batch is (batch_size, context_size), reflecting the values we have chosen for both parameters.
Auxiliary Methods¶
For the model training and the very crude qualitative evaluation of the model (discussed later), we next define a few auxiliary methods that keep the code cleaner but also support strategies such as checkpointing, which is used when training large models in practice.
Compute the Model Size¶
The overall goal of knowledge distillation is to use a large(r) pretrained teacher model to help train a (much) smaller student model. To allow for some meaningful comparison, we need to somehow quantify the size of a model. Most commonly, the size of machine learning models (incl. LLMs) is measured by the number of parameters because parameters directly represent a model's capacity to learn and store information. Each parameter corresponds to a learned weight that shapes how the model processes inputs and makes predictions, so more parameters generally mean a larger and more expressive function class.
In the code cell below, the method count_parameters() returns two values: the total number of model parameters as well as the number of trainable parameters (i.e., the number of parameters that get updated during training). To this end, count_parameters() simply utilizes methods provided by PyTorch.
def count_parameters(model):
    n_params = sum(p.numel() for p in model.parameters())
    n_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return n_params, n_trainable_params
Loss Functions¶
The connection between the student and the teacher model during training happens through the loss function — more specifically, through the loss function computing the soft loss (see above). Since these loss functions are not part of the standard toolbox of PyTorch, we have to implement them ourselves. However, the functions provided by PyTorch allow for a direct implementation of the given formulas of both alternatives.
First, the method kld_loss() below computes the loss using the Kullback-Leibler (KL) divergence after applying the temperature-scaled softmax to both the teacher and student logits. Notice that we compute the log softmax since the PyTorch function kl_div(input, target, ...) expects input to be log probabilities. By default, the function expects target to be probabilities, but by setting log_target=True we can pass log probabilities here as well. The loss is additionally scaled by $\tau^2$, the common convention from the original knowledge distillation paper that keeps the gradient magnitude of the soft loss comparable across different temperatures. Lastly, the parameter reduction="batchmean" means that the function will sum the KL divergence over all elements in each sample, then average that total over the batch size. This keeps the loss scale consistent regardless of sequence length or number of classes, making the magnitude of the loss more stable when training across different batch sizes.
def kld_loss(student_logits, teacher_logits, temperature=2.0):
    # q = student distribution, p = teacher distribution (both as log probabilities)
    log_q = F.log_softmax(student_logits / temperature, dim=-1)
    log_p = F.log_softmax(teacher_logits / temperature, dim=-1)
    # kl_div(input, target) computes KLD(target || input), i.e., KLD(p || q) here; the tau^2
    # factor keeps the gradient magnitude of the soft loss comparable across temperatures
    return (temperature * temperature) * F.kl_div(log_q, log_p, log_target=True, reduction="batchmean")
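A quick sanity check with some made-up logits: if the student and teacher logits are identical, the loss should be (practically) zero, while diverging logits should yield a positive loss.

# Sanity check with made-up logits (a "batch" of one sample with three classes)
z_teacher = torch.tensor([[2.0, 0.5, -1.0]])
print(kld_loss(z_teacher.clone(), z_teacher))               # ~0.0
print(kld_loss(torch.tensor([[0.1, 1.8, 0.3]]), z_teacher)) # > 0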
We saw that a second way to compute the soft loss is to use the raw logits and compute the Mean Squared Error (MSE) between them. Since PyTorch provides the mse_loss() function out of the box, we can simply define a wrapper method mse_logit_loss() that uses this built-in function to compute the MSE loss between the student and teacher logits.
def mse_logit_loss(student_logits, teacher_logits):
    return F.mse_loss(student_logits, teacher_logits)
Training a Single Epoch¶
The train_epoch() function performs one full training pass over a dataset while applying knowledge distillation from a larger teacher model to a smaller student model. For each batch of tokenized input sequences, the method runs the student model in training mode and computes the standard causal language-modeling loss, which measures how well the student predicts the next token in the sequence. At the same time, it also runs the teacher model on the same inputs to obtain its logits, which provide a softer, more informative probability distribution over next-token predictions.
To combine these two learning signals, the function computes a weighted sum of two losses: the hard loss (cross-entropy with the true labels) and the soft loss (KL divergence between student and teacher logits). The weighting factor alpha controls how much the student should rely on ground-truth labels versus mimicking the teacher. After calculating the blended loss, the method performs standard PyTorch backpropagation and optimizer steps to update the student model's parameters.
Some additional noteworthy details about the internals of the pretrained model instance:

- GPT-2 is a decoder-only Transformer and already applies a causal mask internally (tokens can't attend to future tokens). However, the model instance still needs an attention_mask to handle padding or masked-out tokens. Since all of our sequences always have the same length, we do not need to mask out any tokens and can simply instantiate the attention mask with all $1$s.

- Notice that with labels=input_ids we pass the input sequences as the labels/targets without shifting the sequences $1$ token to the left. This shifting is done internally by the model returned by AutoModelForCausalLM. Since the shifting causes the target sequences to be $1$ token shorter, the model simply ignores the prediction for the last position. This does not matter for any practical context size.
def train_epoch(loader, teacher, student, criterion, optimizer, description, alpha=0.5):
    student.train()
    epoch_loss = 0.0
    device = next(student.parameters()).device
    for idx, input_ids in enumerate(tqdm(loader, desc=description, leave=False)):
        # Move current batch to GPU, if available
        input_ids = input_ids.to(device)
        # Compute attention mask since it's mandatory (all ones: our fixed-length chunks contain no padding)
        attention_mask = torch.ones_like(input_ids)
        # Forward pass + compute (hard) loss
        outputs = student(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=input_ids  # causal LM: predict next token
        )
        # Get logit outputs from student and (frozen) teacher; no_grad() avoids storing
        # activations for the teacher and thus saves memory
        student_logits = outputs.logits
        with torch.no_grad():
            teacher_logits = teacher(input_ids=input_ids, attention_mask=attention_mask).logits
        # The hard loss (cross entropy w.r.t. the ground-truth next tokens) is computed by the model
        hard_loss = outputs.loss
        # Compute total loss as balanced loss between hard and soft loss
        loss = criterion(hard_loss, teacher_logits, student_logits)
        # Perform PyTorch magic (backpropagation + parameter updates)
        student.zero_grad()
        loss.backward()
        optimizer.step()
        # Update epoch loss
        epoch_loss += loss.item()
    return epoch_loss
Saving & Loading Checkpoints¶
Although the goal of knowledge distillation is to train smaller models, it typically still requires a lot of computing resources and training time. To avoid any loss of progress (e.g., in case of a crash), we can use checkpoints. A checkpoint in model training is a saved snapshot of the model's state at a specific point during training, typically after a certain number of steps or epochs. It usually includes the model's parameters (weights and biases), the optimizer state (to resume learning with the same momentum and learning rate adjustments), and sometimes metadata like the current epoch or training loss. This allows training to be paused and later resumed from that point without starting over, which is especially important for large models that take days or weeks to train.
While many libraries have built-in support for periodically saving checkpoints, in this notebook, we purposefully use only PyTorch and avoid libraries with a higher level of abstraction. However, saving a checkpoint is very straightforward. The method save_checkpoint() defined in the code cell below takes a model and optimizer instance, as well as the information about the current epoch and epoch loss. The method then combines all required information to resume training into a single object and uses the save() method of PyTorch to save this object to a file.
In PyTorch, the state_dict() object is a Python dictionary that contains all the learnable parameters and persistent states of a model or optimizer. For models, it stores mappings from each layer's name to its corresponding tensor values (like weights and biases). For optimizers, it includes the current state of optimization variables such as momentum buffers and learning rate schedules. This dictionary enables easy saving, loading, and transferring of model and optimizer states, making it central to checkpointing and model deployment. By calling torch.save(model.state_dict()), you can preserve a model's learned parameters, and later restore them with model.load_state_dict(), ensuring the model continues exactly where it left off.
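To see what such a state_dict looks like, we can inspect a tiny throwaway module, used here purely for illustration:

# Inspect the state_dict of a tiny throwaway module (illustration only)
tiny = torch.nn.Linear(4, 2)
print(tiny.state_dict().keys())            # odict_keys(['weight', 'bias'])
print(tiny.state_dict()["weight"].shape)   # torch.Size([2, 4])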
def save_checkpoint(model, optimizer, epoch, loss, path="checkpoint.pt"):
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "loss": loss,
    }
    torch.save(checkpoint, path)
    print(f"Checkpoint saved at {path}")
Naturally, the counterpart to saving a checkpoint is loading a saved checkpoint, as implemented by the method load_checkpoint() below. Notice that this method also takes in a model and optimizer instance. In other words, the method does not create a model or optimizer but sets the state of both instances to the states read from the checkpoint file. This of course only works if the model and the optimizer have the same "structure" as the model and optimizer used for training. For example, we cannot train a Transformer model with 4 layers and then load its state into a model with more or fewer layers.
Also, notice the map_location parameter of PyTorch's load() method. This parameter controls how tensors are remapped to devices (like CPU or GPU) when loading a saved model or checkpoint. This is useful when the model was trained on one device but needs to be loaded on another; for example, loading GPU-trained weights onto a CPU-only machine. By specifying map_location='cpu', all tensors are loaded to the CPU regardless of where they were originally saved, while map_location='cuda:0' loads them to the first GPU. It can also take a custom function or dictionary to flexibly remap devices, ensuring model compatibility across different hardware setups and preventing errors caused by unavailable devices.
def load_checkpoint(model, optimizer, path="checkpoint.pt", device="cuda"):
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    epoch = checkpoint["epoch"]
    loss = checkpoint["loss"]
    print(f"Checkpoint loaded (epoch {epoch}, loss {loss:.4f})")
    return epoch, loss
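To illustrate how the two methods play together, the snippet below performs a save-and-load round trip with a tiny throwaway model and optimizer. This is an illustration only; the actual student model and its optimizer are created further below, and the file name is arbitrary.

# Save/load round trip with a tiny throwaway model (illustration only; arbitrary file name)
tiny_model = torch.nn.Linear(4, 2)
tiny_optimizer = optim.Adam(tiny_model.parameters(), lr=1e-3)
save_checkpoint(tiny_model, tiny_optimizer, epoch=1, loss=0.1234, path="tiny-checkpoint.pt")
_ = load_checkpoint(tiny_model, tiny_optimizer, path="tiny-checkpoint.pt", device="cpu")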
Generate & Save Example Responses¶
To see how the student model's performance improves over time, we only perform a naive qualitative evaluation by monitoring how the model completes an initial prompt after each epoch — instead of a proper evaluation, which is very challenging in the case of LLMs and beyond the scope of this notebook. To this end, generate_response() encapsulates all the required steps to generate a response for a given prompt, mainly (1) tokenizing the seed prompt and converting the tokens to their unique IDs, (2) passing these token IDs to the model to get the generated output token IDs in return, and (3) using the tokenizer again to decode the output token IDs into a human-readable string.
def generate_response(prompt, tokenizer, model, max_new_tokens=50, do_sample=True, temperature=1.0, top_p=0.5):
    # Get the device where the model is located
    model_device = next(model.parameters()).device
    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt", padding=False, truncation=True).to(model_device)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    # Generate continuation
    output_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=do_sample,  # enable/disable stochastic sampling
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode to string and return generated response
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
Since we later save some results to file and potentially also save checkpoints, we define a target folder where all files generated as part of this notebook are stored. Of course, you can change the default target folder to match your preference.
folder = create_folder("data/generated/models/knowledge-distiallation-gpt2/")
print(folder)
data/generated/models/knowledge-distiallation-gpt2/
Creating the Models¶
Teacher Model¶
As mentioned at the beginning, we are using the GPT-2 model as the teacher. The GPT-2 model is publicly available through Hugging Face's transformers library, making it extremely easy for anyone to load and experiment with. With just a few lines of code — using the from_pretrained method of the AutoModelForCausalLM class — you can download the pretrained weights and model architecture directly from the hub.
GPT-2 is released in several model variants, each differing in size and capacity. These variants share the same overall architecture but scale in depth (number of layers), width (embedding size), and number of attention heads. This allows users to choose a model that fits their compute budget—smaller models run faster and require less memory, while larger models provide stronger performance. The table below provides an overview of the main variants.
| Variant | Parameters | Layers | Embedding Size | Attention Heads | Notes |
|---|---|---|---|---|---|
| GPT-2 Small | ~117M | 12 | 768 | 12 | Default “base” model used in many tutorials |
| GPT-2 Medium | ~345M | 24 | 1024 | 16 | More expressive, but heavier to train and run |
| GPT-2 Large | ~774M | 36 | 1280 | 20 | Significant performance improvement |
| GPT-2 XL | ~1.5B | 48 | 1600 | 25 | Largest publicly released GPT-2 model |
In the table above, the GPT-2 Small variant with ~117M parameters is also often called the GPT-2 base model. This is the default and most commonly referenced version of GPT-2. It has 12 layers, a 768-dimensional embedding size, and 12 attention heads. Hugging Face refers to it simply as "gpt2", and it is typically what people mean when they say "GPT-2 base". And this variant is what we will be using as well; see the code cell below. You are welcome to go with a larger variant but keep in mind that this will come with higher memory requirements.
teacher = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE)
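If you want to experiment with a larger teacher, the other variants from the table above are available on the Hugging Face hub under the model names "gpt2-medium", "gpt2-large", and "gpt2-xl"; the commented line below sketches how one of them could be loaded instead (keep the higher memory requirements in mind).
# Optional: load a larger GPT-2 variant as the teacher instead (higher memory requirements!)
# teacher = AutoModelForCausalLM.from_pretrained("gpt2-medium").to(DEVICE)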
Although the soft loss depends on the logits of both the student and the teacher model, we do not actually want to train the teacher model. To avoid any parameter updates for the teacher model we have to "freeze" it. In PyTorch, we can achieve this by setting param.requires_grad = False for all the model parameters. This tells the autograd engine not to compute gradients for these parameters during backpropagation. We also switch the teacher model into evaluation mode with teacher.eval(), which alters the behavior of certain layers like dropout and batch normalization. In this mode, dropout layers stop randomly zeroing activations, and batch normalization uses running statistics instead of batch statistics, ensuring deterministic and consistent outputs. This is essential when evaluating or generating predictions, so the model's behavior matches how it was intended to operate after training.
for param in teacher.parameters():
param.requires_grad = False
teacher = teacher.eval()
Let's check the number of (trainable) parameters for the teacher model using our auxiliary method count_parameters().
n_params_teacher, n_trainable_params_teacher = count_parameters(teacher)
print(f"Total number of parameters: {n_params_teacher:,}")
print(f"Number of trainable parameters: {n_trainable_params_teacher:,}")
Total number of parameters: 124,439,808
Number of trainable parameters: 0
The number of trainable parameters is of course $0$ since we just froze the complete model to avoid updating its weights when training the student model.
Notice that we also see ~124M parameters in total, more than the ~117M parameters in the table above. The discrepancy arises from different ways of counting parameters. The often-cited 117M refers to the original GPT-2 Small parameters in the OpenAI paper, which usually counts only the core model weights (transformer layers). When you load the model via Hugging Face in PyTorch, the 124M count includes additional parameters such as the token embeddings and final layer normalization weights, which are part of the full implementation but not always included in the paper's parameter tally.
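To get a rough feel for where these extra parameters live, the short sketch below (assuming the attribute names transformer.wte and transformer.wpe of the Hugging Face GPT-2 implementation) prints the sizes of the token and position embedding matrices, which alone account for roughly 39M of the ~124M total.
# Rough breakdown sketch: embedding matrices of the Hugging Face GPT-2 model
n_token_emb = teacher.transformer.wte.weight.numel()   # token embeddings: 50257 x 768
n_pos_emb = teacher.transformer.wpe.weight.numel()      # position embeddings: 1024 x 768
print(f"Token embedding parameters:    {n_token_emb:,}")
print(f"Position embedding parameters: {n_pos_emb:,}")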
Models loaded using the transformers library also come with a configuration object that contains all the core information about the model. For example, in the code cell below, we use this configuration object to check the embedding size, the number of layers, and the number of heads — all those match, of course, the values reported in the previous table.
teacher_config = teacher.config
print(f"Embedding size: {teacher_config.n_embd}")
print(f"Number of layers: {teacher_config.n_layer}")
print(f"Number of heads: {teacher_config.n_head}")
Embedding size: 768
Number of layers: 12
Number of heads: 12
We can also quickly see how the model performs on an example prompt by using our generate_response() auxiliary method.
prompt = "The best part of the movie was"
print(generate_response(prompt, tokenizer, teacher))
The best part of the movie was the music, the sound, the way the music was played. It was just so good. How did you make the film? I had to make it from scratch. I was working with the director, and I had to get
Keep in mind that the GPT-2 Small model is far from a state-of-the-art language model by today's standards. Modern LLMs are significantly larger, more capable, and trained on far more diverse datasets. However, GPT-2 Small remains a useful and lightweight reference point for educational and experimental purposes. Its modest size means it can run comfortably on typical consumer hardware, making it an accessible option for illustrating core concepts without the complexity or overhead of cutting-edge models.
In this notebook, we intentionally use GPT-2 Small as the teacher to keep the memory footprint, compute requirements, and training time manageable for our logit distillation example. The goal here is not to push performance boundaries, but to provide a clear and practical demonstration of how logit distillation works in principle. By choosing a smaller teacher, we ensure that the entire workflow — from loading the model to generating logits and training the student — remains efficient and reproducible on common setups.
Student Model¶
When it comes to the student model, we only have two main restrictions. Firstly and most obviously, the student must also be an autoregressive model to match the next-word prediction task. And secondly, the model must work with the same vocabulary. Not only must the logit outputs $\mathbf{z}^t$ and $\mathbf{z}^s$ match in size, they must also be aligned with respect to the vocabulary. This means that the $i$-th value in both $\mathbf{z}^t$ and $\mathbf{z}^s$ must refer to the same token. The obvious way to ensure this is to simply use the same tokenizer instance for both the teacher and the student model.
Apart from that, since logit distillation relies only on both models' outputs, the teacher and the student can "look" quite different. For example, in principle the student can be an RNN-based model even when the teacher is a Transformer. Distillation does not require teacher and student architectures to match; it only requires that the student can produce outputs in the same format as the teacher so that the distillation loss can be computed. As long as the student RNN can generate logits over the same vocabulary as the Transformer teacher, it can be trained to approximate the teacher's behavior through MSE or KL divergence-based soft losses.
However, several practical challenges arise. Transformer teachers typically produce richer contextual representations due to their self-attention mechanisms, while RNNs rely solely on sequential recurrence, which may limit the student's ability to fully mimic the teacher's long-range dependencies. This mismatch in capacity and inductive bias can make distillation less effective unless the student is adequately sized or supplemented with architectural enhancements such as attention mechanisms or gating. Despite these limitations, cross-architecture distillation such as Transformer-to-RNN remains feasible and has been successfully explored in research when efficiency or deployment constraints prioritize smaller, simpler models.
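To make this concrete, the following purely illustrative sketch (not used anywhere else in this notebook; the class name LSTMStudent and its hyperparameters are made up for this example) shows a minimal LSTM-based student that produces logits over the same vocabulary as the GPT-2 teacher, which is all the distillation loss requires.
import torch.nn as nn

class LSTMStudent(nn.Module):
    def __init__(self, vocab_size=50257, embed_dim=256, hidden_dim=512, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.lm_head = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_ids):
        # (batch, seq_len) -> (batch, seq_len, vocab_size) logits over the shared vocabulary
        hidden, _ = self.lstm(self.embedding(input_ids))
        return self.lm_head(hidden)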
However, to keep it simple, we go with another GPT-2 model for the student. Note that GPT-2 only refers to the vocabulary size, the pretrained tokenizer, and the general Transformer-based architecture — GPT-2 does not refer to any specific size of the model in terms of the token embedding size, the number of attention heads, the number of layers, and so on. By choosing a GPT-2 model we can not only use existing classes from the transformers library to create the model, but also our auxiliary method generate_response() for both the teacher and the student model.
To define our student model, we use the GPT2Config class of the transformers library. This class defines all the hyperparameters and architectural settings used to construct a GPT-2 model. Instead of hard-coding model dimensions, number of layers, attention heads, vocabulary size, and other internal parameters directly in the code, these values are encapsulated inside a configuration object. This configuration is what the model uses to determine its structure — everything from hidden size and number of transformer blocks to dropout rates, activation functions, and positional embedding settings. In the code cell below, we create a GPT2Config instance using only $1/3$ of the embedding size ($256$ instead of $768$), $1/3$ of the number of attention heads ($4$ instead of $12$), and $1/3$ of the number of layers ($4$ instead of $12$) compared to the teacher model (i.e., GPT-2 Small, by default).
student_config = GPT2Config(
vocab_size=50257,
n_positions=128,
n_ctx=128,
n_embd=256,
n_layer=4,
n_head=4
)
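As a quick sanity check (a minimal sketch), we can verify that the student configuration uses the same vocabulary size as the teacher, so that the $i$-th logit of both models refers to the same token.
# Sanity check: teacher and student must share the same vocabulary
assert student_config.vocab_size == teacher.config.vocab_size
print(f"Shared vocabulary size: {student_config.vocab_size:,}")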
To create the actual model based on this configuration, we can use the GPT2LMHeadModel class. It consists of the standard GPT-2 transformer architecture plus an additional linear output layer (i.e., the "language modeling head") that maps the hidden states to logits over the vocabulary. This design allows the model to predict the next token in a sequence, making it suitable for tasks such as text generation, completion, and autoregressive training. The class handles the forward pass, loss computation (when labels are provided), and all components needed for next-token prediction. Importantly, GPT2LMHeadModel also includes utilities that simplify training and inference. For example, it automatically performs the left-shift of labels required for autoregressive loss computation.
student = GPT2LMHeadModel(student_config).to(DEVICE)
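As a small illustration of this built-in behavior (a sketch using an arbitrary text snippet), passing the input ids as labels makes the model shift them internally and return the cross-entropy loss of the next-token predictions.
# Sketch: GPT2LMHeadModel returns the cross-entropy loss when labels are provided
example = tokenizer("The movie was surprisingly good.", return_tensors="pt").to(DEVICE)
output = student(input_ids=example["input_ids"], labels=example["input_ids"])
print(output.loss)   # high loss, since the student is still untrained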
Like for the teacher model, let's check the size of the student model by looking at the number of parameters using our count_parameters() auxiliary method.
n_params_student, n_trainable_params_student = count_parameters(student)
print(f"Total number of parameters: {n_params_student:,}")
print(f"Number of trainable parameters: {n_trainable_params_student:,}")
Total number of parameters: 16,058,112
Number of trainable parameters: 16,058,112
Of course, since we actually want to train the student model, the number of trainable parameters is the same as the total number of model parameters. And also unsurprisingly, the number of model parameters is significantly smaller compared to the teacher model. To better see the difference in sizes, we can compute the size of the student compared to the teacher as a percentage of the total number of model parameters.
print(f"Size of student model compared to teacher model: {n_params_student/n_params_teacher*100:.2f}%")
Size of student model compared to teacher model: 12.90%
With the default parameters in the GPT2Config instance, the student model is 12.9% the size of the teacher. Of course, if you change the configuration to create a smaller or larger student model — or you choose a larger teacher model (e.g., GPT-2 Medium, GPT-2 Large, or GPT-2 XL) — the difference in size between the student and teacher model can vary greatly.
Before we start the training, let's see what the student model would generate without any training when given the same prompt as the teacher; we can use generate_response() for this, simply passing the student instead of the teacher model. In all likelihood, beyond the initial seed tokens of the prompt, the generated response will just be gibberish. But at least this output gives us a starting point to see how much the model improves during training, even after just the first epoch.
print(generate_response(prompt, tokenizer, student))
The best part of the movie was nevertheless Herazard Likely cous angrilyDimviouslyB Duchess audiences audiences spe spe spe cookedOffic veteran veteran veteran militants dwarves StraEnableduilding measuringeland Totem negotiate Called Called consistently Batman consistentlyantine puppet mechanic settle testimony 1992 mirrored ingenatmealconnectionconnection MormonismY gravitationalenses Sanders
With the student and teacher model up and running, we can start the training of the student model using logit distillation.
Training the Student Model¶
When implementing the auxiliary method train_epoch(), we specified the criterion (i.e., the loss function) as an argument of the method. Although we implemented the two methods computing the soft loss using the KL Divergence (method kld_loss()) or using the MSE loss (method mse_logit_loss()), we still need to combine them to compute the balanced loss as defined above. Moreover, we want to define the method for the balanced loss in such a way that we can specify both the choice of the soft loss (KL Divergence or MSE) as well as the value for $\alpha$ determining the contributions of the hard and soft loss to the total loss. The method balanced_loss() below implements this idea.
Notice that this method has the hard loss as an argument, as we get it (i.e., the Cross Entropy loss) for free within the train_epoch() method since we are using the GPT2LMHeadModel which computes the Cross Entropy loss "under the hood".
def balanced_loss(hard_loss, teacher_logits, student_logits, soft_loss_func="kld", alpha=0.5):
if soft_loss_func.lower() == "kld":
soft_loss = kld_loss(student_logits, teacher_logits)
else:
soft_loss = mse_logit_loss(student_logits, teacher_logits)
return alpha*hard_loss + (1-alpha)*soft_loss
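As a quick sanity check (a sketch using random dummy logits of the same shape the models produce, and assuming kld_loss() accepts raw logit tensors as in the call above), $\alpha = 1.0$ reduces the balanced loss to the hard loss alone, while $\alpha = 0.0$ uses only the distillation loss.
# Sketch with dummy values: alpha controls the mix of hard and soft loss
dummy_hard_loss = torch.tensor(2.0)
dummy_teacher_logits = torch.randn(2, 8, 50257)   # (batch, seq_len, vocab_size)
dummy_student_logits = torch.randn(2, 8, 50257)

print(balanced_loss(dummy_hard_loss, dummy_teacher_logits, dummy_student_logits, alpha=1.0))  # equals the hard loss
print(balanced_loss(dummy_hard_loss, dummy_teacher_logits, dummy_student_logits, alpha=0.0))  # pure soft (KLD) loss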
We can now define the criterion using a lambda wrapper, which allows us to assign the balanced_loss() method to a variable criterion and set the values for soft_loss_func and alpha to specify which soft loss and which $\alpha$ we want to use when computing the total loss during training. By default, we use the KL Divergence as the distillation loss and set $\alpha = 0.5$ so that both hard and soft loss contribute equally to the total loss. We can then pass the criterion as an argument to the train_epoch() method.
criterion = lambda x, y, z: balanced_loss(x, y, z, soft_loss_func="kld", alpha=0.5)
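As a side note, the same effect can be achieved with functools.partial (a sketch, equivalent to the lambda above), which keeps the wrapped function's name visible and fixes the keyword arguments without an anonymous function.
from functools import partial

# Equivalent alternative to the lambda wrapper above
criterion = partial(balanced_loss, soft_loss_func="kld", alpha=0.5)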
Last, we need to specify the optimizer that updates the student's weights after computing all the gradients during backpropagation. In the code cell below we create an AdamW optimizer with common parameter values to train our student model. Again, keep in mind that we are not looking for the best optimizer and parameter values, but simply want to define a basic but complete training pipeline to understand the workflow required to train a student model using logit distillation.
optimizer = optim.AdamW(
student.parameters(),
lr=3e-4, # initial learning rate
betas=(0.9, 0.95), # GPT-2 and many LMs use this instead of (0.9, 0.999)
weight_decay=0.1 # encourages generalization
)
We are now ready to train the student model. In the code cell below, we train the student model for num_epochs epochs using our train_epoch() method. After each epoch, we generate a response for our example prompt based on the current state of the student model. While this is not a proper evaluation, it provides a quick qualitative way to observe how the student improves during training. In the full training mode, we also save a checkpoint of the student model after each epoch. However, we do not load existing checkpoints to resume training after any interruption; note that we nowhere use the method load_checkpoint() in this notebook. This is merely to keep the code as simple as possible to improve clarity and understanding.
Note: You might see a warning mentioning loss_type=None was set ... — you can ignore this warning.
num_epochs = 5
for epoch in range(num_epochs):
description = f"Epoch {epoch+1}/{num_epochs}"
epoch_loss = train_epoch(loader, teacher, student, criterion, optimizer, description, alpha=0.5)
# Generate and print student's response for an example prompt
student_response = generate_response(prompt, tokenizer, student)
student_response = re.sub(r"\s+", " ", student_response)
print(student_response)
# Save a checkpoint in full training mode
if mode == "full":
save_checkpoint(student, optimizer, epoch+1, epoch_loss, path=f"{folder}checkpoint-{epoch+1}.pt")
Epoch 1/5: 0%| | 0/1406 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
The best part of the movie was not in the film. I'm just the film, I've seen a good film. I was a little more than a good film. I was a great movie. I'm just just just a bit of the characters.
The best part of the movie was the film of the film and I was never really going to be the film. I'm not going to see it. I didn't even see it. I just watched the movie, I've seen a few years ago. I was
The best part of the movie was the most part of the movie, and that's a really bad, but I think I have a little to say about how much the movie is. it is, and it's a lot better. The story is good, and the plot is
The best part of the movie was that I didn't want to get the film to do with the characters. I was really disappointed. I really didn't see the movie for the movie. I can say I had a lot of the way to get into the movie. I didn
The best part of the movie was the first movie, and then I'm a huge fan of the movie. I've seen the first movie, but I have been pretty much a lot of a lot of time and I don't think that was really going to be in the movie
The following code cell implements a slightly more advanced training setup where we train a student model several times, each time using a different value for $\alpha$ (by default, $\alpha \in \{0.0, 0.5, 1.0\}$). For each value of $\alpha$ we train the student for num_epochs epochs, and after each epoch we generate the response of the student as well as the teacher model for several example prompts and store all responses in a list. The final list containing the results is then converted into a Pandas DataFrame to conveniently save them to file.
Important: Particularly in the full training mode using all $100k$ movie reviews, we strongly recommend using a GPU.
# Define a list of example prompts
prompts = ["The best part of the movie was", "When I saw the movie", "The movie has a great cast and"]
# Define list that records all generated responses during the multiple training steps
results = []
num_epochs = 5
for alpha in [0.0, 0.5, 1.0]:
# Create and initialize student model
student = GPT2LMHeadModel(student_config).to(DEVICE)
# Specify loss function (here: set the value for alpha)
criterion = lambda x, y, z: balanced_loss(x, y, z, soft_loss_func="kld", alpha=alpha)
# Create optimizer instance
optimizer = optim.AdamW(
student.parameters(),
lr=3e-4, # initial learning rate
betas=(0.9, 0.95), # GPT-2 and many LMs use this instead of (0.9, 0.999)
weight_decay=0.1 # encourages generalization
)
# Train for num_epochs
for epoch in range(num_epochs):
description = f"[alpha={alpha}] Epoch {epoch+1}/{num_epochs}"
epoch_loss = train_epoch(loader, teacher, student, criterion, optimizer, description, alpha=alpha)
# For example prompt, generate response by teacher and student
for prompt in prompts:
teacher_response = generate_response(prompt, tokenizer, teacher)
student_response = generate_response(prompt, tokenizer, student)
teacher_response = re.sub(r"\s+", " ", teacher_response)
student_response = re.sub(r"\s+", " ", student_response)
results.append((alpha, epoch+1, teacher_response, student_response))
df = pd.DataFrame(results, columns=["ALPHA", "EPOCH", "TEACHER", "STUDENT"])
df.to_csv(f"{folder}results.csv", index=False)
df.head(n=len(df))
| | ALPHA | EPOCH | TEACHER | STUDENT |
|---|---|---|---|---|
| 0 | 0.0 | 1 | The best part of the movie was the actors, who... | The best part of the movie was a bit of the mo... |
| 1 | 0.0 | 1 | When I saw the movie, I was like, 'Oh, I see t... | When I saw the movie. I got a bit of the way t... |
| 2 | 0.0 | 1 | The movie has a great cast and some great acto... | The movie has a great cast and it is an intere... |
| 3 | 0.0 | 2 | The best part of the movie was the music. I'm ... | The best part of the movie was the first place... |
| 4 | 0.0 | 2 | When I saw the movie, I was like, 'Oh my God, ... | When I saw the movie and I was a bit of a lot ... |
| 5 | 0.0 | 2 | The movie has a great cast and the characters ... | The movie has a great cast and is a lot of fun... |
| 6 | 0.0 | 3 | The best part of the movie was that it was a b... | The best part of the movie was that they were ... |
| 7 | 0.0 | 3 | When I saw the movie, I was like, 'Wow, this i... | When I saw the movie, but I was very surprised... |
| 8 | 0.0 | 3 | The movie has a great cast and is set in a wor... | The movie has a great cast and there are a few... |
| 9 | 0.0 | 4 | The best part of the movie was the way that it... | The best part of the movie was that he had to ... |
| 10 | 0.0 | 4 | When I saw the movie, I thought it was going t... | When I saw the movie on a few times. I think i... |
| 11 | 0.0 | 4 | The movie has a great cast and a great story. ... | The movie has a great cast and they have a lot... |
| 12 | 0.0 | 5 | The best part of the movie was the fact that t... | The best part of the movie was that the movie ... |
| 13 | 0.0 | 5 | When I saw the movie, I thought, 'This is what... | When I saw the movie and the movie was a littl... |
| 14 | 0.0 | 5 | The movie has a great cast and it's a very fun... | The movie has a great cast and performances, b... |
| 15 | 0.5 | 1 | The best part of the movie was the way that th... | The best part of the movie was done by the way... |
| 16 | 0.5 | 1 | When I saw the movie, I thought, 'Oh my God, t... | When I saw the movie was just going to go for ... |
| 17 | 0.5 | 1 | The movie has a great cast and the movie is ab... | The movie has a great cast and I can't have be... |
| 18 | 0.5 | 2 | The best part of the movie was the fact that t... | The best part of the movie was, the first time... |
| 19 | 0.5 | 2 | When I saw the movie, I was like, 'Wow. This i... | When I saw the movie, and I saw it in the firs... |
| 20 | 0.5 | 2 | The movie has a great cast and it has a lot of... | The movie has a great cast and, and the main c... |
| 21 | 0.5 | 3 | The best part of the movie was the first time ... | The best part of the movie was that the film w... |
| 22 | 0.5 | 3 | When I saw the movie, I knew I had to do somet... | When I saw the movie I was watching it. It was... |
| 23 | 0.5 | 3 | The movie has a great cast and a great script.... | The movie has a great cast and a few more than... |
| 24 | 0.5 | 4 | The best part of the movie was the fact that i... | The best part of the movie was that they had b... |
| 25 | 0.5 | 4 | When I saw the movie, I was like, 'Oh my God. ... | When I saw the movie. I think I would have to ... |
| 26 | 0.5 | 4 | The movie has a great cast and some great acti... | The movie has a great cast and there's nothing... |
| 27 | 0.5 | 5 | The best part of the movie was the fact that i... | The best part of the movie was that he was on ... |
| 28 | 0.5 | 5 | When I saw the movie, I was so excited. I had ... | When I saw the movie I was trying to tell me t... |
| 29 | 0.5 | 5 | The movie has a great cast and some of the bes... | The movie has a great cast and a very good cas... |
| 30 | 1.0 | 1 | The best part of the movie was the ending. It ... | The best part of the movie was that the movie,... |
| 31 | 1.0 | 1 | When I saw the movie, I was like, 'What's wron... | When I saw the movie was a few hours of this m... |
| 32 | 1.0 | 1 | The movie has a great cast and is a fun ride. ... | The movie has a great cast and the characters ... |
| 33 | 1.0 | 2 | The best part of the movie was the way that th... | The best part of the movie was so much better. |
| 34 | 1.0 | 2 | When I saw the movie, I thought it was a reall... | When I saw the movie was a very little, I was ... |
| 35 | 1.0 | 2 | The movie has a great cast and is a must-see f... | The movie has a great cast and the film that's... |
| 36 | 1.0 | 3 | The best part of the movie was the story, and ... | The best part of the movie was the worst of th... |
| 37 | 1.0 | 3 | When I saw the movie, I was like, 'Oh my God, ... | When I saw the movie was the first one, I had ... |
| 38 | 1.0 | 3 | The movie has a great cast and the actors are ... | The movie has a great cast and directed by Rob... |
| 39 | 1.0 | 4 | The best part of the movie was that it's about... | The best part of the movie was a great film wi... |
| 40 | 1.0 | 4 | When I saw the movie, I didn't think I was goi... | When I saw the movie I saw it on DVD. I was ve... |
| 41 | 1.0 | 4 | The movie has a great cast and a great cast an... | The movie has a great cast and a few good acto... |
| 42 | 1.0 | 5 | The best part of the movie was that I got to m... | The best part of the movie was the best. The a... |
| 43 | 1.0 | 5 | When I saw the movie, I thought, 'Wow, that's ... | When I saw the movie. I didn't know how to do ... |
| 44 | 1.0 | 5 | The movie has a great cast and some great acti... | The movie has a great cast and the acting is n... |
After training, you can inspect the saved .csv file to compare the results for different $\alpha$ values. Keep in mind that when training a small student LLM with logit distillation on a small dataset, the results can be noisy and may not clearly reveal the impact of different loss-weighting strategies (e.g., how you mix hard-label cross-entropy with soft-label distillation loss). With limited data, the student model has only a narrow view of the task distribution, so variations in the balance factor $\alpha$, the temperature $\tau$, or the loss formulation may get overshadowed by sampling noise, underfitting, or simple capacity limits of the student. As a result, the trends you would normally expect to observe in larger, more realistic distillation setups may not appear or may fluctuate unpredictably.
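For example, the small sketch below (assuming the results.csv file written above) loads the saved responses and prints the student's final-epoch completions for each value of $\alpha$ so they can be compared side by side.
# Sketch: compare the student's final-epoch responses across alpha values
df_results = pd.read_csv(f"{folder}results.csv")
final_epoch = df_results["EPOCH"].max()
for alpha_value, group in df_results[df_results["EPOCH"] == final_epoch].groupby("ALPHA"):
    print(f"alpha = {alpha_value}")
    for response in group["STUDENT"]:
        print("   ", response)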
Summary¶
This notebook provided a comprehensive introduction to logit distillation, covering both the conceptual foundations and a full, practical implementation using PyTorch and the Hugging Face transformers library. We began by explaining the motivation behind knowledge distillation: large models often contain far more parameters than necessary for a specific task, yet training smaller models from scratch can be costly. Logit distillation offers a streamlined alternative by teaching a compact student model to mimic the raw logits of a larger, pretrained teacher model to capture subtle information about the teacher's decision process without requiring full probability distributions.
On the conceptual side, we explored why aligning logits can be beneficial. While the originally proposed formulation uses a KL divergence-based distillation loss, which requires softmax conversions and temperature tuning, logit distillation can also be implemented using a simple regression-style objective like MSE that directly matches the teacher's pre-softmax outputs. This approach preserves fine-grained relational structure in the logits and can lead to easier optimization and reduced computational overhead. We also highlighted that logit distillation is not specific to language models; it is a general and widely applicable method for compressing machine learning models across domains such as vision, speech, and tabular prediction. In this notebook, however, we used the context of LLMs to provide a concrete and accessible example.
The practical part of the notebook walked step-by-step through implementing logit distillation with GPT-2 Small as the teacher model. We showed how to prepare data, extract teacher logits efficiently, define a student model via GPT2Config, compute the logit-based distillation loss, and train the student using PyTorch. Each step was designed to illuminate how the different components (model architecture, forward passes, loss functions, and optimization loops) fit together in a full distillation pipeline.
Finally, we emphasized that the goal of this notebook was not to train a state-of-the-art model or achieve cutting-edge performance. Instead, the emphasis was on clarity, transparency, and building a deeper understanding of how logit distillation works in practice. By walking through every stage of the process, the notebook equips you with the knowledge needed to adapt and extend these techniques to your own models, whether in research, experimentation, or practical deployment.