Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.

Data Batching for Training LLMs¶

Training Large Language Models (LLMs) involves processing vast amounts of textual data, often structured as continuous streams of documents. The way this data is segmented and organized into batches directly affects both the model's efficiency and the quality of the representations it learns. In this context, the development of robust and scalable batching strategies is essential — not only to maximize hardware utilization but also to preserve the linguistic and contextual integrity of the input. Document streams, which provide a sequential flow of text samples (e.g., books, articles, or web pages), pose unique challenges and opportunities for batch construction, particularly when aiming to maintain coherence across sequences while optimizing for parallelism.

Traditional batching methods in machine learning often rely on fixed-size token sequences or sentences sampled independently from a large corpus. While effective in smaller settings, these methods can fall short when applied to LLMs trained on document streams, where context continuity and data ordering matter. For example, random sampling might break semantic flow, leading the model to learn representations that ignore broader discourse structure. This motivates the exploration of strategies that more carefully align the structure of batches with the document stream's natural boundaries, such as contiguous chunking, document-aware sampling, or packing strategies that reduce padding and maximize GPU efficiency.

One widely used technique is token packing, which combines multiple short sequences into a single input example of fixed length, minimizing padding and improving computational efficiency. This approach is particularly useful when training on datasets with variable-length documents or sentences. Another strategy is streaming batching, where documents are consumed in a sequential pipeline, and batches are constructed dynamically to preserve order and reduce fragmentation. More advanced strategies might involve curriculum learning-inspired batching, where documents are grouped by difficulty or topic progression to better align with the model's learning phase. Each of these strategies carries trade-offs in terms of complexity, memory usage, and the kind of contextual learning the model can achieve.
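To make the idea of token packing more concrete, here is a minimal, purely illustrative sketch (the function name pack_sequences and the use of 0 as both the end-of-sequence and padding index are our own assumptions, not part of any library): short tokenized sequences are greedily concatenated into fixed-length examples so that little or no padding is required.

In [ ]:
# Minimal sketch of greedy token packing (illustrative only): short tokenized
# sequences are concatenated into fixed-length examples, separated by an
# end-of-sequence id, so that little or no padding is needed.
def pack_sequences(sequences, max_len, eos_id=0, pad_id=0):
    packed, current = [], []
    for seq in sequences:
        # Assume each sequence fits into one example; truncate it if it does not
        seq = seq[:max_len - 1]
        # Flush the current example if the next sequence (plus separator) does not fit
        if len(current) + len(seq) + 1 > max_len:
            packed.append(current + [pad_id] * (max_len - len(current)))
            current = []
        current.extend(seq + [eos_id])
    if current:
        packed.append(current + [pad_id] * (max_len - len(current)))
    return packed

# Example: three short "documents" packed into examples of length 10
print(pack_sequences([[5, 6, 7], [8, 9], [10, 11, 12, 13]], max_len=10))
# [[5, 6, 7, 0, 8, 9, 0, 0, 0, 0], [10, 11, 12, 13, 0, 0, 0, 0, 0, 0]]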

Ultimately, the choice of batch generation strategy is tightly coupled with training goals, model architecture, and computational constraints. As LLMs scale to trillions of parameters and datasets grow accordingly, efficient and context-sensitive batching from document streams becomes increasingly critical. It allows models to leverage long-range dependencies in natural language and achieve better generalization, while also ensuring that the infrastructure used for training is not a bottleneck. Research in this area continues to evolve, with hybrid and adaptive batching strategies emerging as promising directions for optimizing both performance and learning quality in large-scale language modeling.

Setting up the Notebook¶

Make Required Imports¶

This notebook requires the import of different Python packages but also additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been successfully imported.

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

Preliminaries¶

There are a few preliminary comments to outline the scope of this notebook:

  • This notebook targets Transformers as the underlying neural network architecture for training a language model. This information is important as data preparation steps often depend on the used architecture. For example, the strategies explored in this notebook are not suitable for training language models using Recurrent Neural Networks (RNNs).

  • Transformers rely on important concepts such as attention, masking, and positional encodings. While these topics will be briefly covered in this notebook, we recommend checking out the separate notebooks that provide a deep dive into each of these topics.

  • To make all visualizations, examples, and descriptions easier to understand, we assume that any input text is tokenized into proper words. Note that practical Transformer-based models typically rely on subword-based tokenizers (e.g., Byte-Pair Encoding, WordPiece); see the short illustration after this list. At the end, we include a practical implementation that incorporates a pretrained subword-based tokenizer.
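As a quick illustration of the difference between word-level and subword-level tokenization, the snippet below feeds a short phrase from the upcoming example into the pretrained "gpt2" tokenizer (the same tokenizer used in the practical implementation at the end of this notebook); this is purely illustrative and not required for the rest of the notebook.

In [ ]:
from transformers import AutoTokenizer

# Purely illustrative: a pretrained subword tokenizer may split a single word
# (e.g., "Hailstorms") into multiple subword tokens
tokenizer = AutoTokenizer.from_pretrained("gpt2")
print(tokenizer.tokenize("Hailstorms form when strong updrafts"))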

With these clarifications out of the way, let's get started...


Quick Recap: Transformer-Based Language Models¶

Most language models for text generation learn by doing a simple task: guessing the next word in a sentence. During training, the model is shown lots of text and, one word at a time, it tries to predict what comes next. For example, if the sentence is "The cat sat on the," the model might learn that "mat" is a likely next word. Every time it makes a guess, it compares its answer to the real word and adjusts itself to get better over time. Consider the following beginning of a document:

Hailstorms form when strong updrafts in thunderstorms carry raindrops into cold areas of the atmosphere, where they freeze and ...

Converting this document into input-target pairs for training a next-word prediction model, we get the following list of samples — in each sample, the phrase before the arrow is the input and the word after the arrow is the prediction target:

  • Hailstorms $\Rightarrow$ form
  • Hailstorms form $\Rightarrow$ when
  • Hailstorms form when $\Rightarrow$ strong
  • Hailstorms form when strong $\Rightarrow$ updrafts
  • Hailstorms form when strong updrafts $\Rightarrow$ in
  • Hailstorms form when strong updrafts in $\Rightarrow$ thunderstorms
  • Hailstorms form when strong updrafts in thunderstorms $\Rightarrow$ carry
  • Hailstorms form when strong updrafts in thunderstorms carry $\Rightarrow$ raindrops
  • Hailstorms form when strong updrafts in thunderstorms carry raindrops $\Rightarrow$ into
  • Hailstorms form when strong updrafts in thunderstorms carry raindrops into $\Rightarrow$ cold
  • ...

By repeating this process millions or even billions of times with different texts, the model learns grammar, facts about the world, and how ideas are usually expressed in language. This is what allows it to generate realistic and coherent text later on — because it has learned, word by word, how language works.

Using the Transformer decoder and causal masking, we input sequences of a fixed maximum length — the so-called context size — to train a language model. The context size of Transformers is limited because they process input sequences using attention mechanisms that compare every token to every other token, which requires memory and computation that grow quadratically with sequence length. To keep training and inference efficient and manageable on current hardware, a fixed maximum length is set. Assuming a context size of $6$, the training samples for our example document look as follows:


The small context size of $6$ was chosen only to simplify the visualizations. In practice, the context sizes of Transformer-based LLMs typically range from a few thousand to several hundred thousand tokens, and in some cases beyond a million. The table below lists some of the more popular foundational LLMs together with their context sizes.

Model Provider | Model Name | Context Size (Tokens)
OpenAI | GPT-3.5 | 4,096
OpenAI | GPT-4 | 8,192 (standard), 32,768 (GPT-4-32k)
OpenAI | GPT-4o (2024) | 128,000
Anthropic | Claude 1 & 2 | 9,000 to 100,000
Anthropic | Claude 3 (Opus, etc.) | Up to 200,000
Google | Gemini 1.5 (Pro, Flash) | Up to 1,000,000
Mistral | Mistral-7B | 32,000
Mistral | Mixtral (MoE) | 32,000+
Meta | LLaMA 1 & 2 | 2,048 to 4,096
Meta | LLaMA 3 (8B, 70B) | 8,192
Cohere | Command R+ | 128,000
MosaicML | MPT-7B | 65,536
xAI | Grok-1 | ~8,000
xAI | Grok-1.5 | 128,000

Although practical context sizes seem quite large, the size of the training corpus in terms of the number of words/tokens is several orders of magnitude larger still. In other words, we cannot feed the whole training corpus to the Transformer but need to split it into suitably sized pieces. The default approach uses a sliding window to split the dataset into chunks and batches, as we will discuss in more detail in the following.


Batching with a Sliding Window¶

Recall that the training dataset — at least conceptually — is a continuous stream of documents, with documents being separated by one or more unique tokens. While different approaches exist, we assume that each document is followed by an $\text{[EOS]}$ (end of sequence) token. A document itself might contain hundreds, thousands, or more words or tokens. Thus, assuming $\text{doc}_{i}$ is the list of tokens representing the $i$-th document, our document stream has the following format:

$$\large \text{doc}_{1} + \text{[EOS]} + \text{doc}_{2} + \text{[EOS]} + \text{doc}_{3} + \text{[EOS]} + \dots $$
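To make this format concrete, here is a tiny sketch that builds such a stream from per-document token-index lists, assuming (purely for illustration) that the documents have already been tokenized and that $0$ is the index of the $\text{[EOS]}$ token:

In [ ]:
# Tiny sketch: concatenate per-document token-index lists into one stream,
# separating documents with a (hypothetical) [EOS] index of 0
EOS_ID = 0
docs = [[12, 7, 42], [99, 3], [15, 15, 8, 61]]

stream = []
for doc in docs:
    stream.extend(doc + [EOS_ID])

print(stream)  # [12, 7, 42, 0, 99, 3, 0, 15, 15, 8, 61, 0]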

In practice, our training data may be split into many such document streams so that each stream fits into memory. However, this does not affect the underlying idea of the sliding window approach; performance considerations and optimization strategies (including parallel training across large compute clusters) are beyond the scope of this notebook.

Create Toy Document Stream¶

Right now, we also assume that our training data has already been tokenized and each token has been converted to its unique token index. This means that our document stream is a list of token indices, including the unique index for the special $\text{[EOS]}$ token. As we do not have to care about the exact index values here, let's create a random list (or 1-dimensional tensor) of token indices as our example document stream. The randint() method in PyTorch generates a tensor filled with random integers from a specified range [low, high) with a given size and optional data type and device. The code cell below uses this method to create a small 1-dimensional tensor containing $50$ random integers from $0$ (inclusive) to $100$ (exclusive).

In [2]:
# Set seed to ensure consistent results
torch.manual_seed(0)

# Create 1-dimensional tensor as example document stream
tokens = torch.randint(0, 100, size=(50,))

Of course, we can also print the created tensor tokens:

In [3]:
print(tokens)
tensor([44, 39, 33, 60, 63, 79, 27,  3, 97, 83,  1, 66, 56, 99, 78, 76, 56, 68,
        94, 33, 26, 19, 91, 54, 24, 41, 69, 69, 49, 80, 81, 12, 63, 60, 95, 85,
        22, 99, 11, 88, 78, 43, 96, 89, 71, 57, 83, 95, 82, 71])

Again, each integer entry in that tensor represents some word or token in our document stream.

Sliding Window Chunking¶

Like before, let's assume that we want to train a language model using a Transformer decoder with causal masking and a context size of $6$. Obviously even our small example document stream is too large as its length of $50$ exceeds the context size of $6$. The default approach to generate valid training samples — that is, where all training samples contain sequences of length $6$ — is to split the document stream into chunks of size $6$ using a sliding window.

One basic approach is to generate non-overlapping chunks; this means that the sliding window is moved by the context size. The figure below illustrates this idea using our initial example sentence and a chunk size of $6$. Each sequence of words/tokens highlighted in green represents a chunk of size $6$; notice that punctuation marks such as commas are also independent tokens (i.e., the chunk "of the atmosphere, where they" contains $6$ words/tokens).


The alternative is to move the sliding window by a smaller value than the context size, resulting in overlapping chunks. The figure below moves the sliding window by $3$ words/tokens (i.e., 50% of the context size of $6$) over our example sentence to generate the chunks for our training samples.


When training a Transformer-based large language model using document streams, whether to use overlapping chunks depends on the goals of training and the constraints of computational resources. Generally, overlapping chunks can be beneficial because they help preserve continuity in long documents that are broken into fixed-length sequences, a common necessity due to memory limitations. Transformers typically have a maximum context window (e.g., 512, 1024, or 4096 tokens), so long documents must be split into smaller parts. Overlapping chunks ensure that tokens near the boundary of one chunk are not isolated from their neighboring context in the original document, allowing the model to learn smoother and more coherent representations across chunk boundaries.

The main advantage of using overlapping chunks in document streams is that they improve the model's ability to handle long-range dependencies and maintain a consistent narrative flow across chunks. This is particularly useful when training on text types where coherence and sequential understanding are crucial—such as stories, articles, or long-form conversations. With overlap, every token (especially those near the edges of chunks) is given the chance to be seen in a more complete context at least once during training, leading to better language modeling performance.

However, there are trade-offs. Overlapping increases the amount of data the model sees—but not in a unique way. It introduces redundancy, since the overlapping tokens are processed multiple times, which increases computational cost and training time without adding new information. This can also lead to inefficiencies in memory usage. Therefore, if compute resources are limited or if training speed is a priority, non-overlapping chunks may be preferred. In practice, a hybrid approach is often used—moderate overlap (e.g., 50%) is introduced to balance context preservation with efficiency, especially in early training stages when capturing document-level coherence is important.
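To get a feel for the redundancy that overlap introduces, the quick calculation below counts how many complete input chunks the sliding window produces for our toy stream of $50$ tokens and a context size of $6$; it mirrors the loop bound range(0, len(tokens) - max_len, stride) used in the chunk_tokens() implementation further below.

In [ ]:
# Number of complete input chunks produced by the sliding window, mirroring
# the loop range(0, n - max_len, stride) used in chunk_tokens() below
def num_chunks(n, max_len, stride):
    return len(range(0, n - max_len, stride))

print(num_chunks(50, max_len=6, stride=6))  # non-overlapping -> 8 chunks
print(num_chunks(50, max_len=6, stride=3))  # 50% overlap     -> 15 chunks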

Generating Targets¶

By default, we treat each chunk as the input of our Transformer model. For the training, we still need the targets (i.e., the target output sequence for a given input sequence). However, we already know that for the next word prediction task the target sequence is simply the input sequence shifted one word/token to the left. In line with the figure above, an example input-target pair for the Transformer decoder is:

Hailstorms form when strong updrafts in (input) $\Rightarrow$ form when strong updrafts in thunderstorms (target)

Generating the target sequences by shifting the input sequences one word/token to the left is trivial to implement, as we will show below.

Basic implementation¶

The method chunk_tokens() in the code cell below implements the sliding window approach and the generation of the targets in one go — again, assuming that the initial chunks represent the inputs. The argument max_len specifies the maximum length of the chunks to match the context size of our Transformer model. The argument stride specifies by how many tokens we move the sliding window to get the next chunk. If stride=None (default), we set stride = max_len to yield non-overlapping chunks.

In the loop implementing the movement of the sliding window over the list of input tokens, the method extracts both the chunks representing the inputs and the chunks representing the targets — the start and end indices for the targets are the same as for the inputs, increased by $1$, which shifts the inputs one word/token to the left. Note that the method then simply prints each pair of input and target chunks.

In [4]:
def chunk_tokens(tokens, max_len=6, stride=None):
    # If stride=None, create non-overlapping chunks 
    if stride is None:
        stride = max_len

    # Move sliding window over all tokens with the given stride
    for i in range(0, len(tokens)-max_len, stride):
        # Extract chunk representing the model input
        input_chunk  = tokens[i:(i+max_len)]
        # Extract chunk representing the model target
        target_chunk = tokens[(i+1):(i+max_len+1)]
        # Print input-target pair
        print(input_chunk, "->", target_chunk)

To look at an example, we can call the method chunk_tokens() over our toy document stream containing 50 token indices; see the code cell below. Since we assume a context size of $6$ for our Transformer model, we naturally have to set max_len=6. Let's first try with the default argument stride=None (we could have also specified stride=6) to generate non-overlapping chunks.

In [5]:
chunk_tokens(tokens, max_len=6)
tensor([44, 39, 33, 60, 63, 79]) -> tensor([39, 33, 60, 63, 79, 27])
tensor([27,  3, 97, 83,  1, 66]) -> tensor([ 3, 97, 83,  1, 66, 56])
tensor([56, 99, 78, 76, 56, 68]) -> tensor([99, 78, 76, 56, 68, 94])
tensor([94, 33, 26, 19, 91, 54]) -> tensor([33, 26, 19, 91, 54, 24])
tensor([24, 41, 69, 69, 49, 80]) -> tensor([41, 69, 69, 49, 80, 81])
tensor([81, 12, 63, 60, 95, 85]) -> tensor([12, 63, 60, 95, 85, 22])
tensor([22, 99, 11, 88, 78, 43]) -> tensor([99, 11, 88, 78, 43, 96])
tensor([96, 89, 71, 57, 83, 95]) -> tensor([89, 71, 57, 83, 95, 82])

As expected, (a) the input chunks (and therefore the target chunks) do not overlap, and (b) the target chunks are the input chunks shifted one word/token to the left. However, you may have also noticed that some of the last token indices in our toy document stream are missing. Before we discuss this, let's first look at an example where we use chunk_tokens() to create overlapping chunks by setting stride to a smaller value than max_len.

In [6]:
chunk_tokens(tokens, max_len=6, stride=3)
tensor([44, 39, 33, 60, 63, 79]) -> tensor([39, 33, 60, 63, 79, 27])
tensor([60, 63, 79, 27,  3, 97]) -> tensor([63, 79, 27,  3, 97, 83])
tensor([27,  3, 97, 83,  1, 66]) -> tensor([ 3, 97, 83,  1, 66, 56])
tensor([83,  1, 66, 56, 99, 78]) -> tensor([ 1, 66, 56, 99, 78, 76])
tensor([56, 99, 78, 76, 56, 68]) -> tensor([99, 78, 76, 56, 68, 94])
tensor([76, 56, 68, 94, 33, 26]) -> tensor([56, 68, 94, 33, 26, 19])
tensor([94, 33, 26, 19, 91, 54]) -> tensor([33, 26, 19, 91, 54, 24])
tensor([19, 91, 54, 24, 41, 69]) -> tensor([91, 54, 24, 41, 69, 69])
tensor([24, 41, 69, 69, 49, 80]) -> tensor([41, 69, 69, 49, 80, 81])
tensor([69, 49, 80, 81, 12, 63]) -> tensor([49, 80, 81, 12, 63, 60])
tensor([81, 12, 63, 60, 95, 85]) -> tensor([12, 63, 60, 95, 85, 22])
tensor([60, 95, 85, 22, 99, 11]) -> tensor([95, 85, 22, 99, 11, 88])
tensor([22, 99, 11, 88, 78, 43]) -> tensor([99, 11, 88, 78, 43, 96])
tensor([88, 78, 43, 96, 89, 71]) -> tensor([78, 43, 96, 89, 71, 57])
tensor([96, 89, 71, 57, 83, 95]) -> tensor([89, 71, 57, 83, 95, 82])

Now the input chunks and target chunks overlap by several tokens. But again, not all of the token indices are present in the generated input-target pairs. The reason is simply that our toy document stream cannot be split into input and target chunks that are all of length $6$: the few tokens left over at the end of the stream are not enough to fill another complete window — and even if they were, the final target chunk would require one additional token beyond the end of the stream. As such, the last potential input-target pair is incomplete, and no valid training sample can be formed from it.

In principle, we could "fill" the last input and target chunk with special tokens, e.g., $\text{[EOS]}$. However, the most straightforward approach is to ignore the last input-target pair — as done by our method chunk_tokens(). In practice, the training data is so large in terms of the total number of tokens that ignoring some of the last tokens simply does not matter at all. Just to give you some idea, the table below shows the total number of tokens that have been used to train different popular large language models.

Model Provider | Model Name | Training Tokens (Approx.)
OpenAI | GPT-3 (175B) | ~300 billion
OpenAI | GPT-4 (speculative) | Estimated >1 trillion
OpenAI | GPT-4o | Estimated >2 trillion (likely multilingual and multimodal)
Anthropic | Claude 1 & 2 | Unknown, but likely 1–2T+
Anthropic | Claude 3 (Opus, etc.) | Estimated 2–4 trillion
Google DeepMind | Gemini 1.0 | Unknown
Google DeepMind | Gemini 1.5 | Likely >5 trillion (based on long context + mixture of modalities)
Meta | LLaMA 1 | 1 trillion
Meta | LLaMA 2 (7B/13B/70B) | 2 trillion
Meta | LLaMA 3 (8B, 70B) | 15 trillion
Mistral | Mistral-7B | ~1.5 trillion
Mistral | Mixtral (MoE) | Estimated 2–3 trillion
Cohere | Command R+ | ~1.4 trillion
MosaicML | MPT-7B | ~1 trillion
xAI | Grok-1 | Unknown
xAI | Grok-1.5 | Likely >1–2 trillion

So even if we talk about context sizes of several tens or hundreds of thousands of tokens, missing a few input-target pairs runs no risk of harming the overall performance of the model.
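For completeness, here is a minimal sketch of the padding alternative mentioned above — filling an incomplete chunk up to the context size with a (hypothetical) $\text{[EOS]}$ index of $0$ instead of dropping it. This is purely illustrative; the implementations in this notebook simply ignore the incomplete pair.

In [ ]:
import torch

EOS_ID = 0  # hypothetical index of the [EOS] token, used here as padding

def pad_chunk(chunk, max_len, pad_id=EOS_ID):
    # Append pad_id until the chunk has exactly max_len entries
    padding = torch.full((max_len - len(chunk),), pad_id, dtype=chunk.dtype)
    return torch.cat([chunk, padding])

# Example: an incomplete chunk of 4 token indices padded to a context size of 6
print(pad_chunk(torch.tensor([57, 83, 95, 82]), max_len=6))  # tensor([57, 83, 95, 82, 0, 0])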

Overall, splitting a document stream into fixed-size chunks using a sliding window is arguably very straightforward — although the choice of whether and by how much the chunks should overlap is not always obvious and might require some trial and error in practice to get a sense of what works best. Our small chunk_tokens() method was only meant to illustrate the core steps of the sliding window approach. In the following, we therefore look into some more practical implementations that we could actually use for training.


Practical Implementation¶

Splitting our document stream into chunks that serve as input for our Transformer language model is a core requirement for training. However, there are also other practical considerations, such as the batching of training samples.

Neural networks such as Transformers are typically trained using batches — small subsets of the full training dataset — rather than the entire dataset (full-batch) or one example at a time (stochastic) because this strikes a balance between computational efficiency and learning stability. Processing the full dataset at each training step would be computationally expensive and slow, especially for large datasets, while using just one example per step introduces too much randomness in the learning process.

Batching allows the model to see enough examples at once to estimate a reliable gradient for updating weights, but keeps the computations manageable and parallelizable on modern hardware like GPUs. Moreover, training with batches introduces stochasticity into the learning process, which helps the model escape local minima or saddle points in the loss landscape. This randomness often improves generalization compared to deterministic full-batch training.

Given the importance and prevalence of batching, most deep learning libraries such as PyTorch offer built-in mechanisms to simplify this process and support more advanced concepts such as parallel processing and training. In the following, we go through some examples of how we can use a text corpus to train a Transformer-based language model in practice.

Pretokenized Input¶

For the moment, let's again assume that our text corpus has already been tokenized and each word/token has been converted to a unique token index. In other words, the dataset is again a 1-dimensional tensor containing token indices, like the tokens variable we have created as our toy document stream.

For convenience and later use, we utilize the Dataset class in PyTorch, an abstract base class that provides a standard interface for accessing and managing data. To use it, you typically create a custom subclass that overrides two essential methods: __len__() to return the size of the dataset, and __getitem__() to retrieve a single data sample given an index. This design allows PyTorch to efficiently load and process data on-the-fly, which is particularly useful for large datasets that can't be fully loaded into memory.

One major benefit of the Dataset class is its flexibility. You can use it to wrap any type of data — images, text, audio, or even data from multiple sources — and apply custom transformations during retrieval. In our case, we implement our sliding window approach to split the token indices of our document streams into the input and corresponding target chunks with respect to the specified maximum length and stride. Notice how the __init__() method implements the same steps we saw in the method chunk_tokens(). However, instead of just printing the chunks, we now store all chunks in two lists.

In [7]:
class TokenDataset(Dataset):

    def __init__(self, tokens, max_len=6, stride=None):
        self.input_ids  = []
        self.target_ids = []
        
        if stride is None:
            stride = max_len

        for i in range(0, len(tokens)-max_len, stride):
            self.input_ids.append(torch.LongTensor(tokens[i:(i+max_len)]))
            self.target_ids.append(torch.LongTensor(tokens[(i+1):(i+max_len+1)]))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

So let's create an instance of our class TokenDataset using our toy document stream tokens as input. We use the arguments max_len=6 and stride=3 to generate overlapping chunks of size $6$ as seen before, but you are encouraged to try different values and inspect the results.

In [8]:
token_dataset = TokenDataset(tokens, max_len=6, stride=3)

Next, the DataLoader class in PyTorch is a utility that wraps around a Dataset object to efficiently manage the loading of data during training or evaluation. It handles batching of data, optional shuffling, and parallel loading using multiple worker processes. You can specify parameters like batch_size, shuffle, and num_workers to control how data is loaded and fed to your model. This makes it easy to iterate through large datasets in manageable batches, improving memory usage and training speed.

One of the key benefits of DataLoader is its ability to streamline and optimize the data pipeline, especially when working with large datasets or real-time data augmentation. By using multiple workers, it can prefetch and prepare the next batch while the model is training on the current one, effectively overlapping computation and data loading. This leads to more efficient GPU utilization and faster training times. Combined with a custom Dataset, the DataLoader makes it easy to build flexible, scalable, and high-performance training loops.

In the code cell below, we create an instance of the DataLoader class with the following arguments:

  • token_dataset: the instance of our Dataset subclass containing all the generated input and target chunks forming the individual pairs of training samples

  • batch_size=4: the number of training samples in each batch. In practice the batch size may be much larger, but since our toy document stream is very small, we have to choose a small value to see any effect.

  • shuffle=True: create new random batches for each iteration over the data loader. Shuffling the dataset into new batches after each training iteration (epoch) is common because it prevents the model from learning the order or patterns in the data sequence, which could lead to overfitting or poor generalization. Without shuffling, the model might repeatedly see the same examples in the same order, which can cause it to memorize specific batch structures or get stuck in suboptimal training dynamics. Shuffling promotes better mixing of examples across batches, leading to more robust and stable gradient estimates, and ultimately helps the model generalize better to unseen data.

  • drop_last=True: ignores the last batch if it contains an insufficient number of training samples (i.e., fewer than batch_size). The last batch of a training dataset is often ignored if it contains fewer samples than the other batches to maintain consistent batch sizes, which simplifies the training process and avoids potential issues in layers that expect fixed input shapes (like batch normalization). Uneven batch sizes can also affect gradient calculations and performance optimizations, especially when using parallel hardware like GPUs. Ignoring the smaller final batch helps ensure uniformity and stability during training, even if it means slightly fewer training samples are used per epoch.

  • num_workers=0: number of subprocesses used to load data in parallel. Increasing num_workers allows data loading (including tasks like reading from disk and applying transformations) to happen concurrently across multiple CPU cores, which can significantly speed up training, especially with large datasets or complex preprocessing. When num_workers=0, it means that data loading will happen in the main process, i.e., no parallelism is used. This is simpler and useful for debugging, but it can become a bottleneck in training since the model must wait for data to be loaded and prepared before proceeding to the next batch.

In [9]:
token_dataloader = DataLoader(token_dataset, batch_size=4, shuffle=True, drop_last=True, num_workers=0)

During training of a model, the data loader is used to generate and iterate over all batches. Since we do not train any model here, the code cell below simply iterates through the data loader and prints the input-target pairs, each representing a single training sample.

In [10]:
for nr, batch in enumerate(token_dataloader):
    print(f"==================== [Batch {nr}] ====================")
    input_ids, target_ids = batch[0], batch[1]
    for i in range(len(input_ids)):
        print(input_ids[i], "==>", target_ids[i])
==================== [Batch 0] ====================
tensor([88, 78, 43, 96, 89, 71]) ==> tensor([78, 43, 96, 89, 71, 57])
tensor([27,  3, 97, 83,  1, 66]) ==> tensor([ 3, 97, 83,  1, 66, 56])
tensor([81, 12, 63, 60, 95, 85]) ==> tensor([12, 63, 60, 95, 85, 22])
tensor([69, 49, 80, 81, 12, 63]) ==> tensor([49, 80, 81, 12, 63, 60])
==================== [Batch 1] ====================
tensor([24, 41, 69, 69, 49, 80]) ==> tensor([41, 69, 69, 49, 80, 81])
tensor([44, 39, 33, 60, 63, 79]) ==> tensor([39, 33, 60, 63, 79, 27])
tensor([76, 56, 68, 94, 33, 26]) ==> tensor([56, 68, 94, 33, 26, 19])
tensor([96, 89, 71, 57, 83, 95]) ==> tensor([89, 71, 57, 83, 95, 82])
==================== [Batch 2] ====================
tensor([83,  1, 66, 56, 99, 78]) ==> tensor([ 1, 66, 56, 99, 78, 76])
tensor([22, 99, 11, 88, 78, 43]) ==> tensor([99, 11, 88, 78, 43, 96])
tensor([19, 91, 54, 24, 41, 69]) ==> tensor([91, 54, 24, 41, 69, 69])
tensor([60, 95, 85, 22, 99, 11]) ==> tensor([95, 85, 22, 99, 11, 88])

Two things you will notice: Firstly, if you run the previous code cell multiple times, the output will change in terms of an input-target pair being placed in different batches and at different positions within a batch. This is because we used shuffle=True to generate new random batches for each iteration over the data loader.

And secondly, the output is always missing three input-target pairs (assuming batch_size=4). This is because we have $15$ input-target pairs, which is not divisible by $4$. This means that the last batch would contain only $3$ pairs. However, since we used drop_last=True, we ignore this last batch. But again, due to the large sizes of practical training datasets, ignoring a few training samples does not matter at all — and because of the shuffling, the small set of ignored training samples differs in each epoch.
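We can verify these numbers directly (using the token_dataset and token_dataloader instances created above): the dataset holds $15$ input-target pairs, and the data loader yields $3$ full batches of $4$ pairs, dropping the remaining $3$ pairs.

In [ ]:
# Sanity check: 15 input-target pairs grouped into 3 full batches of 4;
# the remaining 3 pairs are dropped because of drop_last=True
print(len(token_dataset))     # 15
print(len(token_dataloader))  # 3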

Raw Text Input¶

We can now make our implementation slightly more advanced by dropping the assumption that the input is already a list of token indices. Instead, we want the input to be a document stream given as a list of text documents. To this end, the code cell below defines a simple list containing three short documents (in practice, a document may be a complete book).

In [11]:
documents = [
    "This is the first document of our dataset to train a Transformer-based LLM. We assume that we tokenize the input text using the pretrained tokenizer",
    "Good LLMs require huge datasets to train. The documents used for training should come from a wider variety of sources and domains.",
    "Hailstorms form when strong updrafts in thunderstorms carry raindrops into cold areas of the atmosphere, where they freeze and **accumulate additional layers of ice as they are repeatedly lifted and dropped within the storm, eventually falling to the ground when they become too heavy for the updrafts to support."
]

Still, to keep this notebook simple, we make two simplifying assumptions. For one, we assume that the documents have been properly prepared and cleaned. Common data cleaning steps to prepare a corpus for training a language model include removing unwanted characters such as HTML tags, special symbols, or non-textual content, and normalizing text by converting it to lowercase, standardizing punctuation, and handling contractions or misspellings. It is also important to strip extra whitespace, remove duplicates, and optionally filter out irrelevant or low-quality content, like very short sentences or poorly structured text. Data preparation and cleaning is its own very important step and beyond the scope of this notebook.

And secondly, we use an existing implementation of a tokenizer. More specifically, we use a pretrained and publicly available subword-based tokenizer. The AutoTokenizer class in the Hugging Face transformers library used in the code cell below is a convenient wrapper that automatically loads the correct tokenizer class for a given pretrained model. Instead of manually selecting and initializing a specific tokenizer, you can simply use AutoTokenizer.from_pretrained("model_name"), and it will handle the underlying details based on the model's configuration. This makes it much easier to work with a wide variety of transformer architectures without needing to know the specifics of each one.

The main benefits of AutoTokenizer are ease of use, flexibility, and consistency. It simplifies the workflow when switching between models or integrating multiple models into a project. It also ensures compatibility between the tokenizer and the model, reducing the chance of mismatches in vocabulary or tokenization logic. Additionally, it supports loading tokenizers from both local directories and the Hugging Face Hub, making it ideal for both research and production environments.

For our example, we use the tokenizer of the "gpt2" model. It refers to the pretrained GPT-2 (Generative Pretrained Transformer 2) model developed by OpenAI, which can be used with Hugging Face’s transformers library to load both the model and its corresponding tokenizer. With AutoTokenizer.from_pretrained("gpt2"), we can load the tokenizer that matches GPT-2’s vocabulary and byte-pair encoding (BPE) scheme, ensuring compatibility with the pretrained weights and tokenization strategy used during training. Again, this is just one example, and we could also consider other pretrained models.

In [12]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")

While the tokenizer will do all the heavy lifting of tokenizing the text and converting each token to its unique index under the hood, we have to make sure that we treat the special token separating two documents correctly. In other words, we have to use the same token as used by the pretrained tokenizer. To check which special tokens the tokenizer uses, we can inspect the special_tokens_map member variable of the AutoTokenizer class.

In [13]:
tokenizer.special_tokens_map
Out[13]:
{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>'}

We can see that the "gpt2" tokenizer only uses the single special token <|endoftext|>, which is perfectly fine for our case. So let's define this token as a constant for later use; see the code cell below.

In [14]:
EOS_TOKEN_GPT2 = "<|endoftext|>"
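Instead of hard-coding the string, we could equivalently read it from the tokenizer itself via its eos_token attribute; the small snippet below is just an alternative way to define the same constant.

In [ ]:
# Equivalent alternative: read the special token directly from the tokenizer
EOS_TOKEN_GPT2 = tokenizer.eos_token
print(EOS_TOKEN_GPT2)  # <|endoftext|>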

We can now convert our list of documents to a document stream — let's just call it text — by concatenating all documents into a single string and including the special token we just defined as the separator between documents. For this, we can use the built-in join() method of Python.

In [15]:
text = f" {EOS_TOKEN_GPT2} ".join(documents)

print(text)
This is the first document of our dataset to train a Transformer-based LLM. We assume that we tokenize the input text using the pretrained tokenizer <|endoftext|> Good LLMs require huge datasets to train. The documents used for training should come from a wider variety of sources and domains. <|endoftext|> Hailstorms form when strong updrafts in thunderstorms carry raindrops into cold areas of the atmosphere, where they freeze and **accumulate additional layers of ice as they are repeatedly lifted and dropped within the storm, eventually falling to the ground when they become too heavy for the updrafts to support.

We can now implement another Dataset subclass. The implementation of the class TextDataset in the code cell below is almost identical to that of the class TokenDataset from before. The main difference is that this class now expects a string as input and uses the provided tokenizer to tokenize the string and convert each token to its index. Once we again have a list of token indices, we can use the same code to generate the input-target pairs using the sliding window approach.

In [16]:
class TextDataset(Dataset):

    def __init__(self, text, tokenizer, max_len=6, stride=None):
        self.input_ids  = []
        self.target_ids = []
        
        if stride is None:
            stride = max_len

        # Tokenize text and convert tokens to token indices
        tokens = tokenizer.encode(text)

        for i in range(0, len(tokens)-max_len, stride):
            self.input_ids.append(torch.LongTensor(tokens[i:(i+max_len)]))
            self.target_ids.append(torch.LongTensor(tokens[(i+1):(i+max_len+1)]))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.target_ids[idx])

This time, we rely on the default arguments max_len=6 and stride=None (i.e., non-overlapping chunks) and create an instance of class TextDataset by providing our string text and the tokenizer instance as input arguments; of course, you can again pass a smaller stride such as stride=3 to generate overlapping chunks.

In [17]:
text_dataset = TextDataset(text, tokenizer)

The DataLoader class is agnostic to the implementation of the Dataset subclass. This means, we can create another instance of the DataLoader class the same way as before, only using our new TextDataset instance; for simplicity, we also use the same input arguments.

In [18]:
text_dataloader = DataLoader(text_dataset, batch_size=4, shuffle=True, drop_last=True, num_workers=0)

Like before, we can now iterate through the data loader and print all training samples for each batch. Of course, the token indices can now be much larger values, since the vocabulary of the pretrained "gpt2" tokenizer contains several tens of thousands of tokens — compared to our toy document stream with all values less than $100$.

In [19]:
for nr, batch in enumerate(text_dataloader):
    print(f"==================== [Batch {nr}] ====================")
    input_ids, target_ids = batch[0], batch[1]
    for i in range(len(input_ids)):
        print(input_ids[i], "==>", target_ids[i])
==================== [Batch 0] ====================
tensor([27140, 10128,  2421,  3236, 40522,   284]) ==> tensor([10128,  2421,  3236, 40522,   284,  4512])
tensor([ 1913,  2325,  1617,    82,   287, 18355]) ==> tensor([ 2325,  1617,    82,   287, 18355, 38563])
tensor([13363, 11241,  7509,   220, 50256,  4599]) ==> tensor([11241,  7509,   220, 50256,  4599, 27140])
tensor([38563,  3283,  6290, 49253,   656,  4692]) ==> tensor([ 3283,  6290, 49253,   656,  4692,  3006])
==================== [Batch 1] ====================
tensor([ 284,  262, 2323,  618,  484, 1716]) ==> tensor([ 262, 2323,  618,  484, 1716, 1165])
tensor([4512,   13,  383, 4963,  973,  329]) ==> tensor([  13,  383, 4963,  973,  329, 3047])
tensor([  674, 27039,   284,  4512,   257,  3602]) ==> tensor([27039,   284,  4512,   257,  3602, 16354])
tensor([16354,    12,  3106, 27140,    44,    13]) ==> tensor([   12,  3106, 27140,    44,    13,   775])
==================== [Batch 2] ====================
tensor([  220, 50256, 42913, 38563,  1296,   618]) ==> tensor([50256, 42913, 38563,  1296,   618,  1913])
tensor([1626,  262, 6388,   11, 4191, 7463]) ==> tensor([ 262, 6388,   11, 4191, 7463,  284])
tensor([ 5039,  3224, 11685,   286,  4771,   355]) ==> tensor([ 3224, 11685,   286,  4771,   355,   484])
tensor([  484,   389,  7830, 13663,   290,  5710]) ==> tensor([  389,  7830, 13663,   290,  5710,  1626])
==================== [Batch 3] ====================
tensor([ 262, 5128, 2420, 1262,  262, 2181]) ==> tensor([ 5128,  2420,  1262,   262,  2181, 13363])
tensor([  484, 16611,   290, 12429,  4134,   388]) ==> tensor([16611,   290, 12429,  4134,   388,  5039])
tensor([1212,  318,  262,  717, 3188,  286]) ==> tensor([ 318,  262,  717, 3188,  286,  674])
tensor([ 4996,   286,  4237,   290, 18209,    13]) ==> tensor([  286,  4237,   290, 18209,    13,   220])
==================== [Batch 4] ====================
tensor([1165, 4334,  329,  262, 2325, 1617]) ==> tensor([4334,  329,  262, 2325, 1617,   82])
tensor([ 3047,   815,  1282,   422,   257, 10595]) ==> tensor([  815,  1282,   422,   257, 10595,  4996])
tensor([3006,  286,  262, 8137,   11,  810]) ==> tensor([ 286,  262, 8137,   11,  810,  484])
tensor([  775,  7048,   326,   356, 11241,  1096]) ==> tensor([ 7048,   326,   356, 11241,  1096,   262])

We now have an implementation that works with any list of text documents as input — but again, assuming that these documents have all been properly prepared and cleaned, which in practice is typically its own dedicated step.
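If you want to double-check what a training sample actually looks like as text, you can decode it with the tokenizer; the short snippet below (using the text_dataset and tokenizer instances from above) decodes the first input-target pair and prints the size of the tokenizer's vocabulary.

In [ ]:
# Decode the first input-target pair back to text and check the vocabulary size
input_ids, target_ids = text_dataset[0]
print(tokenizer.decode(input_ids), "==>", tokenizer.decode(target_ids))
print(len(tokenizer))  # vocabulary size of the pretrained GPT-2 tokenizer (50,257)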


Summary¶

When training a Transformer-based language model on continuous document streams, a sliding window approach is a common strategy used to generate training batches. This method addresses the fixed-length input constraint of Transformer models, which can’t process arbitrarily long texts at once. Instead of splitting documents randomly or truncating them, the sliding window technique processes long texts in overlapping segments, preserving the contextual flow across input chunks.

The sliding window works by taking a fixed-size window (e.g., 512 tokens) and sliding it across the text with a specified stride (e.g., 128 tokens). This generates overlapping sequences where each new window retains some tokens from the previous one. The overlap ensures that the model learns from context continuity, capturing dependencies that span across window boundaries—especially important for understanding long-term relationships in language. By adjusting the stride, you can control the trade-off between computational efficiency and contextual coverage.

This approach is particularly useful for streaming or unsegmented documents like books, logs, or transcripts, where there are no natural sentence or paragraph breaks to guide chunking. Unlike sentence-based batching, which might lose context at boundaries, the sliding window maintains a smoother transition between batches. This leads to better language modeling performance, especially for downstream tasks that rely on understanding long-range dependencies.

However, the sliding window method increases data redundancy, since overlapping tokens appear in multiple training samples. While this redundancy can improve learning of certain tokens, it also raises computational costs. To balance performance and efficiency, practitioners often tune the stride size and incorporate masking techniques during training to ensure the model doesn’t "peek" at future tokens when predicting. Overall, the sliding window approach is a powerful and practical solution for adapting long-form text to fixed-size Transformer inputs.

In [ ]: