Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.
Byte-Pair Encoding Tokenization¶
Byte-Pair Encoding (BPE) is a widely used method for subword tokenization in natural language processing (NLP). Subword tokenization refers to breaking text into smaller units, such as parts of words, instead of splitting only by full words or single characters. This technique is particularly useful in tasks like machine translation, text generation, and speech recognition, where handling large and diverse vocabularies efficiently is crucial. BPE achieves this by striking a balance between using full words and individual characters, creating tokens that are subword units.
The motivation for using BPE lies in the challenges posed by natural language. Words in any language vary widely in their frequency of usage — some appear often (like "the" or "and"), while others are rare (like "antidisestablishmentarianism"). Storing and processing all these words as separate tokens would make the vocabulary size enormous and computationally expensive. On the other hand, splitting text into individual characters would lose the meaningful structure of words, making it harder for models to learn semantic relationships. BPE provides an elegant solution by building a vocabulary of common subword units that can effectively represent both frequent and rare words.
BPE works by initially treating every character in a text as a token and then iteratively merging the most frequent pairs of tokens into new subword units. For instance, if "th" and "e" frequently appear together, they are merged into "the". This process continues until a predefined vocabulary size is reached. The resulting subword tokens capture common patterns in the language, such as prefixes, suffixes, or word stems, which helps models understand linguistic structures better while keeping the vocabulary manageable.
One of the key advantages of BPE is its ability to handle out-of-vocabulary words effectively. In traditional tokenization methods, a completely new word might be treated as unknown or ignored. With BPE, even unseen words can be broken into subword units that the model has already learned, allowing it to make better predictions. This capability makes BPE particularly important for working with morphologically rich languages or domains with specialized vocabularies, such as medical or legal texts.
In this notebook, we will take a closer look at BPE and implement a basic BPE tokenizer from scratch in a step-by-step and illustrative manner.
Setting up the Notebook¶
Make Required Imports¶
This notebook requires the import of different Python packages as well as additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been successfully imported.
import re, regex, collections, json
from src.text.preprocessing.tokenizing import MyBpeTokenizer
from src.utils.data.files import *
Download Required Data¶
Some code examples in this notebook use data that first needs to be downloaded by running the code cell below. If this code cell throws an error, please check in the configuration file config.yaml whether the URL for downloading datasets is up to date and matches the one on Github. If not, simply download or pull the latest version from Github.
treasure_island_book, _ = download_dataset("text/corpora/books/treasure-island.txt")
File 'data/datasets/text/corpora/books/treasure-island.txt' already exists (use 'overwrite=True' to overwrite it).
BPE from Scratch¶
The fundamental idea behind BPE is quite straightforward to understand and easy to implement, and we will go through the algorithm step by step in the following. Practical implementations of BPE are more sophisticated, as they include additional refinements or aim for greater efficiency. Here, the focus is on understanding BPE for tokenization. To this end, we use a very artificial example document to better illustrate the inner workings of the algorithm. This example document is defined in the code cell below.
doc = 'low low low low low lower lower newest newest newest newest newest newest widest widest widest longer'
The algorithm also needs a special character to mark the end of a word; its purpose will become clear once we go through the algorithm. Again, to keep things simple, we use the underscore character _ for this. Note that this means our input document is not allowed to contain underscore characters. This works perfectly fine for our example document here.
TOKEN_EOS = '_'
Core Steps¶
We first go through the core steps of BPE, before combining them to the final learning algorithm.
Pretokenize Text¶
BPE assumes that the corpus has been pretokenized into an initial list of tokens. The most basic approach is to pretokenize a text based on whitespace characters — in practice, slightly more sophisticated methods are used (discussed later). The code cell below contains the method pretokenize() that uses the built-in Python method split() to convert a text string into a list of tokens by splitting the string on whitespace characters.
def pretokenize(doc):
    return doc.split()
To show an example, we can apply this method to our example document.
print(pretokenize(doc))
['low', 'low', 'low', 'low', 'low', 'lower', 'lower', 'newest', 'newest', 'newest', 'newest', 'newest', 'newest', 'widest', 'widest', 'widest', 'longer']
Initialize Corpus State and Vocabulary¶
The corpus state is a data structure, here a dictionary, that maps each unique token after pretokenization to its number of occurrences in the training corpus. Since BPE is a bottom-up approach, we also need to split each token (typically a word) into its characters, and the special TOKEN_EOS is appended. Let's first create a utility method that performs this step for an initial word/token.
def generate_sequence(word, eos=TOKEN_EOS):
    return ' '.join((list(word) + [eos]))
generate_sequence('fastest')
'f a s t e s t _'
Now we can go through all our documents to initialize the corpus state, i.e., the set of unique words/tokens across the whole corpus converted to their sequences. For this, we perform the following steps for each document:
- Perform simple pretokenization using the pretokenize() method.
- Split each token into its characters using the utility method generate_sequence().
- Add the generated sequence to the corpus state; to avoid repeated entries of the same sequence, we also keep track of the number of times a token appears in the corpus — we therefore implement the corpus state as a dictionary with the sequences as the keys and the numbers of occurrences as the values.
During this process, we also initialize the vocabulary as the set of all unique characters. Later, the learning algorithm will add strings longer than a single character to the vocabulary. Note, however, that the vocabulary itself is not crucial for the BPE algorithm.
The method initialize_corpus_state() in the code cell below implements both steps and returns the initial corpus state and vocabulary.
def initialize_corpus_state(docs: list):
    # Initial vocabulary as an empty set
    vocabulary = set()
    # Initialize dictionary representing the corpus state
    corpus_state = collections.defaultdict(int)
    # Loop over all documents
    for doc in docs:
        # Add all characters in the current document to the vocabulary
        vocabulary.update(set(doc))
        # For each word in the document, generate the sequence and add it to the corpus state
        for word in pretokenize(doc):
            corpus_state[generate_sequence(word)] += 1
    # Remove whitespace character from final vocabulary
    vocabulary.discard(" ")
    # Add EOS token
    vocabulary.add(TOKEN_EOS)
    return dict(corpus_state), vocabulary
We can now run the method over our toy corpus, which consists of just a single document.
corpus_state, vocabulary = initialize_corpus_state([doc])
Let's first have a look at the corpus state; we use the dumps() method of the json library for a more user-friendly output.
print(json.dumps(corpus_state, indent=2))
{
"l o w _": 5,
"l o w e r _": 2,
"n e w e s t _": 6,
"w i d e s t _": 3,
"l o n g e r _": 1
}
When looking at the result, you should recognize all five unique words/tokens that appear in our toy document, only converted to their initial sequences. The number reflects how often that word/token appeared in the document. For example, the key-value pair "w i d e s t _": 3 indicates that the word "widest" appeared three times in the document.
We can also look at the initial vocabulary which is simply the set of all unique characters in the document, together with the special EOS token.
print(vocabulary)
{'t', 'e', '_', 'r', 'o', 'l', 's', 'i', 'g', 'w', 'n', 'd'}
If we did nothing else — that is, if we did not actually perform any training — the BPE tokenizer would behave like a character tokenizer. We will actually try this later.
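As a quick preview (an illustrative sketch added here for clarity), splitting the initial character sequence of a word without applying any merges already gives exactly this character-level tokenization.

# With no merges learned, the initial sequence is already the final tokenization,
# i.e., individual characters plus the EOS token
print(generate_sequence('hello').split())   # -> ['h', 'e', 'l', 'l', 'o', '_']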
Token Merging Step¶
The key idea of BPE is to iteratively merge the most frequent token pairs in the corpus state to build a vocabulary that efficiently represents the structure of the input text. The goal is to identify and represent commonly occurring patterns — such as subwords, roots, prefixes, or suffixes — with single tokens. This reduces the number of tokens needed to encode the text while maintaining flexibility for representing rare or unseen words through combinations of smaller subword units.
By iteratively merging the most frequent adjacent token pairs, BPE strikes a balance between character-level and word-level tokenization. Character-level tokenization ensures coverage for any text but can result in extremely long sequences that are computationally expensive to process. On the other hand, word-level tokenization creates inefficiencies when encountering rare words, as each unique word would need a separate token. BPE bridges this gap by creating a subword-level tokenization scheme: frequent word components are merged into single tokens, while rare or out-of-vocabulary words can still be represented as sequences of smaller subword units.
Calculate Number of Token Pairs¶
To know which token pair to merge, we first need to find the most frequent pair (or pairs) of tokens in the current corpus state. For example, assuming the initial corpus state from above, we need to calculate the number of occurrences of "l o", "o w", "w _", "w e", "e r", and so on, to know which pair to merge next. The method find_most_frequent_token_pair() in the code cell below implements this calculation by iterating over all words in the corpus state, and for each word, iterating over all token pairs to sum up each pair's total number of occurrences. The method then returns the most frequent token pair as well as a dictionary with all token pairs and their respective counts. The latter is only there to have a look at the counts; this dictionary is not needed for the algorithm itself.
def find_most_frequent_token_pair(corpus_state):
    token_pair_counts = collections.defaultdict(int)
    for word, freq in corpus_state.items():
        sequence = word.split()
        for i in range(len(sequence)-1):
            token_pair_counts[f"{sequence[i]} {sequence[i+1]}"] += freq
    # Return the most frequent pair (ties are broken arbitrarily: max() returns the first such pair it encounters) + all token pair counts
    return max(token_pair_counts.keys(), key=(lambda key: token_pair_counts[key])), token_pair_counts
Using the method find_most_frequent_token_pair(), we can identify the first token pair to merge for our initial corpus state. In general, there might be multiple token pairs with the same highest number of occurrences. In this situation, the implementation above simply picks one of these pairs (whichever max() returns first).
# We re-initialize the corpus state and the vocabulary to ensure a consistent output of this code cell
corpus_state, vocabulary = initialize_corpus_state([doc])
top_token_pair, token_pair_counts = find_most_frequent_token_pair(corpus_state)
Let's first look at the counts for all token pairs. Again, this is just for illustrative purposes and not required for the BPE algorithm itself.
print(json.dumps(token_pair_counts, indent=2))
{
"l o": 8,
"o w": 7,
"w _": 5,
"w e": 8,
"e r": 3,
"r _": 3,
"n e": 6,
"e w": 6,
"e s": 9,
"s t": 9,
"t _": 9,
"w i": 3,
"i d": 3,
"d e": 3,
"o n": 1,
"n g": 1,
"g e": 1
}
We can see that in fact three token pairs have the highest number of occurrences of 9: "e s", "s t", and "t _". Any of those three is a suitable candidate to get merged next. So let's see which pair the method find_most_frequent_token_pair() has returned.
print(top_token_pair)
e s
In the following, let's assume the most frequent token pair that was returned is "e s".
Perform Merge¶
With the next most frequent token pair found, we can actually perform the merging step. This step includes three core substeps:
- Merge the token pair into a new token (e.g., "e s" becomes "es")
- Add this new token (here, "es") to the vocabulary
- Update the corpus state by replacing all occurrences of the token pair with the new token — for our example, replace all occurrences of "e s" in the corpus state with "es".
The method perform_merge() implements these three substeps. Merging the token pair into a new token and adding this new token to the vocabulary is straightforward. Updating the corpus state is only a bit more complex. Since the corpus state is implemented as a dictionary, and replacing all occurrences of the token pair with the merged token means changing the keys of this dictionary, we cannot simply iterate over the dictionary and directly change its keys. We therefore break this update step up into two loops:
- In the first loop, we find all keys (i.e., the words in the corpus state) that contain the token pair (at least once) and therefore need to be updated. We identify these keys using a Regular Expression that matches the token pair in a key. For example, given the key "n e w e s t _", the Regular Expression looking for "e s" would match; it would not match for "l o n g e r _". We keep track of all matching keys using the dictionary matches, whose keys are the old keys in the corpus state and whose values are the new keys. We use the same Regular Expression to generate the new keys; for example, "n e w e s t _" becomes "n e w es t _".
- In the second loop, we iterate over all matches to add the new keys to the corpus state and give them the values of their respective old keys (those values, representing the counts, do not change). We do this using the built-in pop() method, which automatically removes the entry for the old key from the corpus state.
Finally we return the merge (i.e., the tuple of the token pair and the new token — we need this later), the updated corpus state, and the updated vocabulary.
def perform_merge(token_pair, corpus_state, vocabulary):
    # Create new token by merging token pair
    new_token = re.sub(' ', '', token_pair)
    # Create merge as tuple of token pair and new token
    merge = (token_pair, new_token)
    # Add new token to vocabulary
    vocabulary.add(new_token)
    # Define search pattern
    pattern = re.compile(r"(?<!\S)" + re.escape(token_pair) + r"(?!\S)")
    # Loop through corpus state and record which keys/sequences need to be updated
    matches = {}
    for sequence in corpus_state:
        if pattern.search(sequence):
            matches[sequence] = pattern.sub(new_token, sequence)
    # Perform the update of keys/sequences
    for old, new in matches.items():
        corpus_state[new] = corpus_state.pop(old)
    # Return the merge, the updated corpus state, and the updated vocabulary
    return merge, corpus_state, vocabulary
For testing, we can run the method perform_merge() over our initial corpus state and vocabulary. We only re-initialize both the corpus state and the vocabulary to ensure the output of the code cell is always the same.
# We re-initialize the corpus state and the vocabulary to ensure a consistent output of this code cell
corpus_state, vocabulary = initialize_corpus_state([doc])
merge, corpus_state, vocabulary = perform_merge('e s', corpus_state, vocabulary)
Let's have a look at all three return values of this method.
print(f"Merge: {merge}")
print()
print("Updated corpus state:")
print(json.dumps(corpus_state, indent=2))
print()
print("Updated vocabulary")
print(vocabulary)
Merge: ('e s', 'es')
Updated corpus state:
{
"l o w _": 5,
"l o w e r _": 2,
"l o n g e r _": 1,
"n e w es t _": 6,
"w i d es t _": 3
}
Updated vocabulary
{'t', 'e', '_', 'r', 'es', 'o', 'l', 's', 'i', 'g', 'w', 'n', 'd'}
In particular, notice the changes in the corpus state: "n e w e s t _" became "n e w es t _", and "w i d e s t _" became "w i d es t _". Also, "es" has been added to the vocabulary.
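To get a feeling for the iterative nature of the algorithm before we wrap it in a loop, we can simply repeat the two core steps on the updated corpus state. The code cell below is an illustrative sketch added for clarity; consistent with the final list of merges shown further below, the next pair to be merged here should be "es t", yielding "est".

# Repeat the two core steps once more on the updated corpus state (illustrative sketch)
next_token_pair, _ = find_most_frequent_token_pair(corpus_state)
print(f"Next pair to merge: {next_token_pair}")
merge, corpus_state, vocabulary = perform_merge(next_token_pair, corpus_state, vocabulary)
print(json.dumps(corpus_state, indent=2))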
BPE Learning Algorithm¶
With these three core steps, that is:
- Initialization of the corpus state and vocabulary
- Finding the most frequent token pair (for the current corpus state), and
- Performing the merging step and updating the corpus state
We now have everything in place to implement the BPE learning algorithm by plugging together the methods implementing those three steps; see the method bpe_learn() in the code cell below. Although the implementation of this method is rather straightforward, three small details are worth mentioning:
- One of the aforementioned advantages of BPE is that we can specify the maximum size of the resulting vocabulary. Since each merging step adds a new token to the vocabulary, we can restrict the size of the vocabulary by limiting the number of merging steps. However, since our initial vocabulary is not empty but the set of all unique characters, the number of merging steps — that is, the number of iterations num_iter in the code — derives from the difference between the specified maximum vocabulary size max_vocab_size and the size of the initial vocabulary.
- In principle, particularly if the corpus is not very large, the algorithm might be asked to perform more merges than are possible. This happens when all words in the corpus state have been merged back to their original form (plus the EOS token). For example, after a certain number of iterations, "n e w e s t _" will have been merged to "newest_". If this is true for all words in the corpus state, the corpus state no longer contains any pair of tokens. In this case, the method find_most_frequent_token_pair() will throw an error and we exit the loop, since the learning algorithm has finished.
- In each iteration, we keep track of the most recent merge by adding it to a list of all previous merges. This list of merges is in fact the most important return value of the learning algorithm, as it is used for tokenizing text. Note that the order matters, as we want to merge tokens in a new text in the same order as we merged them during the learning phase.
def bpe_learn(corpus, max_vocab_size=10000):
    # Initialize corpus state and vocabulary
    corpus_state, vocabulary = initialize_corpus_state(corpus)
    # Initialize the list of merges
    merges = []
    # Calculate the number of merging steps to ensure the maximum size of the vocabulary
    num_iter = max_vocab_size - len(vocabulary)
    for _ in range(num_iter):
        # Find the most frequent pair; if this fails, no more merging was possible and we can stop
        try:
            top_token_pair, _ = find_most_frequent_token_pair(corpus_state)
        except ValueError:
            break
        # Update corpus state and the vocabulary
        merge, corpus_state, vocabulary = perform_merge(top_token_pair, corpus_state, vocabulary)
        # Record the merge in the list of merges (the order matters)
        merges.append(merge)
    # Return list of merges, the corpus state, and the vocabulary
    return merges, corpus_state, vocabulary
Well, let's train a BPE tokenizer over our toy document.
Your turn: Try different values for max_vocab_size and inspect the results. Of course, the larger this value, the larger the final vocabulary and the list of merges, but the corpus state also shows larger tokens. For example, with a large value, say max_vocab_size=1000, the learning algorithm tries to make more merging steps than are actually possible. You can tell by looking at the corpus state, where all words have been merged into a single token.
merges, corpus_state, vocabulary = bpe_learn([doc], max_vocab_size=21)
print(f"Final corpus state:\n{json.dumps(corpus_state, indent=2)}\n")
print(f"Final vocabulary (size: {len(vocabulary)}):\n{vocabulary}\n ")
print(f"Final list of merges:\n{merges}")
Final corpus state:
{
"w i d est_": 3,
"lo n g e r _": 1,
"low e r _": 2,
"newest_": 6,
"low_": 5
}
Final vocabulary (size: 21):
{'t', 'ne', 'new', 'w', 'newest_', 'e', 'r', 'es', 'low', 'est', 'low_', 'i', 'est_', 'lo', '_', 'o', 'l', 'n', 'd', 's', 'g'}
Final list of merges:
[('e s', 'es'), ('es t', 'est'), ('est _', 'est_'), ('l o', 'lo'), ('lo w', 'low'), ('n e', 'ne'), ('ne w', 'new'), ('new est_', 'newest_'), ('low _', 'low_')]
BPE Tokenization Algorithm¶
Once we have applied the BPE learning algorithm over a corpus, using this learned model for actually tokenizing an arbitrary text is very straightforward. In fact, strictly speaking, we only need the list of merges that was returned from the learning algorithm (see above). In some sense, the tokenization algorithm mimics the learning algorithm: we first perform whitespace pretokenization and treat each initial token separately. The method tokenize_word() tokenizes a single (initial) token based on the learned list of merges as follows:
- First, the method splits the word into character tokens and adds the EOS token using the generate_sequence() method. For example, the word "newer" becomes "n e w e r _".
- Then, the method iterates over the list of merges, checks if a merge can be applied, and applies it if a match is found. For example, the merge ("e w", "ew") will find a match in "n e w e r _" and convert it to "n ew e r _". To find the matches and perform the merges, we use the same Regular Expression we have already seen in the learning algorithm.
The rest of the implementation is merely for printing the intermediate results for illustrative purposes.
def tokenize_word(word, merges, verbose=False):
    # Start with the character-level sequence of the word (incl. the EOS token)
    sequence = generate_sequence(word)
    if verbose:
        print(sequence)
    # Apply all learned merges in the order in which they were learned
    for token_pair, new_token in merges:
        if token_pair not in sequence:
            continue
        pattern = re.compile(r'(?<!\S)' + re.escape(token_pair) + r'(?!\S)')
        sequence = pattern.sub(new_token, sequence)
        if verbose:
            print(sequence)
    return sequence.split(' ')
Let's run tokenize_word() over a word that was not in the training document. Of course, the exact output will depend on the value of max_vocab_size you chose to train the BPE tokenizer. For example, with max_vocab_size=0, the word will be split into its individual characters since no merges were learned. In other words, with max_vocab_size=0, the BPE tokenizer becomes a character tokenizer. In contrast, if the value for max_vocab_size is very large, the BPE tokenizer is more likely to behave like a word tokenizer.
Your turn: Run the method bpe_learn() implementing the BPE learning algorithm with different values for max_vocab_size and see how the output of the code cell below changes.
tokens = tokenize_word('newer', merges, verbose=True)
print(tokens)
n e w e r _
ne w e r _
new e r _
['new', 'e', 'r', '_']
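To confirm the character-tokenizer behavior mentioned above, here is a small additional sketch (not part of the original walkthrough): with max_vocab_size=0, no merges are learned, so tokenize_word() simply returns the individual characters.

# With max_vocab_size=0, no merging iterations are performed and the list of merges stays empty
merges_empty, _, _ = bpe_learn([doc], max_vocab_size=0)
print(merges_empty)                           # -> []
print(tokenize_word('newer', merges_empty))   # -> ['n', 'e', 'w', 'e', 'r', '_']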
To tokenize a complete document — again, mimicking the learning algorithm — we first need to pretokenize the document and then run the method tokenize_word() over each initial token. The method tokenize() implements these basic steps.
def tokenize(doc, merges, verbose=False):
    pretokens = pretokenize(doc)
    tokens = []
    for pt in pretokens:
        tokens.extend(tokenize_word(pt, merges, verbose=verbose))
    return tokens
The code cell below defines another example document to test the behavior of the method tokenize(). As before, the exact output will depend on the value of max_vocab_size used when training the tokenizer and on the example document doc2 itself. Feel free to modify the document by adding new words or tweaking existing ones. You are also encouraged to run the code cell with different versions of the tokenizer (i.e., trained using different values for max_vocab_size).
doc2 = "newer longest knew ingest belong newest"
print(tokenize(doc2, merges, verbose=True))
n e w e r _
ne w e r _
new e r _
l o n g e s t _
l o n g es t _
l o n g est _
l o n g est_
lo n g est_
k n e w _
k ne w _
k new _
i n g e s t _
i n g es t _
i n g est _
i n g est_
b e l o n g _
b e lo n g _
n e w e s t _
n e w es t _
n e w est _
n e w est_
ne w est_
new est_
newest_
['new', 'e', 'r', '_', 'lo', 'n', 'g', 'est_', 'k', 'new', '_', 'i', 'n', 'g', 'est_', 'b', 'e', 'lo', 'n', 'g', '_', 'newest_']
Detokenize¶
If we were to use the BPE tokenizer only to tokenize a text to serve as input for a machine learning model, we could stop here. However, text generation tasks such as machine translation, question answering, chatbots, etc. not only take tokenized text as input but also generate text in the form of tokens from the learned vocabulary. This means we need a way to convert a list of tokens back into a proper text. This, at least in its basic form, can be done by performing two simple steps:
- Concatenate all tokens into a single string, and
- Replace the special EOS character with a whitespace character.
The method detokenize() implements these two trivial steps; let's test it on an example token list.
def detokenize(tokens: list):
    doc = ''.join(tokens)
    return re.sub(TOKEN_EOS, " ", doc).strip()
print(detokenize(['new', 'er_', 'long', 'est_', 'k', 'new', '_', 'i', 'n', 'g', 'est_', 'b', 'e', 'long', '_', 'newest_']))
newer longest knew ingest belong newest
If you have followed the notebook closely so far, you will notice that this is the actual reason why we need the special EOS character. Without it, it would be impossible in the detokenization step to say which tokens need to be merged, or more specifically, which tokens should not be merged.
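To make this concrete, here is a tiny illustration (added for clarity, not part of the algorithm itself): without the marker, the concatenation of tokens gives no clue where one word ends and the next begins, while the EOS token lets detokenize() restore the whitespace.

# Without the EOS marker, word boundaries are lost after concatenation
print(''.join(['new', 'er', 'long', 'est']))        # -> newerlongest (one word or two?)
# With the EOS marker, detokenize() can restore the whitespace
print(detokenize(['new', 'er_', 'long', 'est_']))   # -> newer longest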
Discussion & Limitations¶
Representation of tokens: In our implementation, the tokens in the corpus state are represented as their actual strings of characters. The advantage is that it is much easier to understand the inner workings of the algorithm. Practical applications, however, commonly represent the tokens as unique ids, and the vocabulary maintains a mapping between the ids and their respective tokens. For example, instead of representing an entry in the corpus state like
{
...
"w id est_": 3,
...
}
the alternative representation using ids could look like
{
...
"23 324 108": 3,
...
}
where $23$ maps to "w", $324$ to "id", and $108$ to "est_". Thus, every time a token pair gets merged into a new token, a new id gets created for that token. This approach has a couple of advantages. Firstly, memory management becomes easier, since integer values have a fixed size in bytes, whereas string tokens grow in size during learning as tokens get merged. Secondly, token ids are the "natural" input for most machine learning algorithms, including neural networks. It is therefore more efficient if the tokenizer directly outputs a list of ids. If needed, ids can always be decoded using the mapping between ids and string tokens maintained in the vocabulary.
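The sketch below illustrates this idea; the mapping and the concrete ids are made up for illustration and are not part of our implementation.

# Hypothetical id-based vocabulary (ids chosen arbitrarily for illustration)
token_to_id = {'w': 23, 'id': 324, 'est_': 108}
id_to_token = {i: t for t, i in token_to_id.items()}
# Encode a sequence of string tokens into ids and decode it back
tokens_example = ['w', 'id', 'est_']
ids_example = [token_to_id[t] for t in tokens_example]
print(ids_example)                             # -> [23, 324, 108]
print([id_to_token[i] for i in ids_example])   # -> ['w', 'id', 'est_']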
Special characters: For our example implementation we used the underscore _ as a special character to mark the end of a word. We saw that this was needed to reconstruct a list of tokens into a proper text. We already mentioned that the choice of _ was simply to ease the presentation, but we had to make the assumption that the training data does not contain underscores. In real-world text corpora, of course, underscores might very well occur. Therefore, practical BPE tokenizers favor characters that are very unlikely to appear in a text document. For example, GPT uses Ġ as a special character for its tokenizer. The character Ġ (uppercase "G" with a dot above; Unicode U+0120) is a letter in the Latin alphabet used in specific languages and contexts such as Maltese and in some transcription systems. The special character is also not required to mark the end of a word. Using it to mark the start of a word (as, again, GPT does) has the same effect. Lastly, a special placeholder does not have to be a single character but can consist of multiple characters. For example, the BERT tokenizer uses the prefix "##" to mark subword tokens that continue a word rather than start one.
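As a small, purely illustrative aside, we can verify in Python that Ġ is just an ordinary Unicode character at code point U+0120.

# Ġ is the Unicode code point U+0120 (decimal 288)
print('\u0120', hex(ord('\u0120')))   # -> Ġ 0x120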
Smart(er) pretokenization: BPE requires a pretokenization into an initial list of tokens to initialize the corpus state. For English and many other languages, breaking up a text on whitespace characters is a quick and simple approach — and it works, as we have seen throughout the notebook. However, it is very common in English that there is no whitespace between tokens of different categories. For example, there is no whitespace before punctuation marks, and no whitespace before/after a closing/opening parenthesis or quote character. Simple whitespace pretokenization therefore yields initial tokens that combine parts that do not "belong together". For example, an initial corpus state might look like this:
{
...
"w i d e s t . _": 30,
"w i d e s t ? _": 16,
"w i d e s t ! _": 9,
"w i d e s t , _": 3,
"w i d e s t : _": 5,
"w i d e s t ; _": 10,
...
}
While, in principle, BPE still works, this might lead to a suboptimal allocation of the limited vocabulary slots. For example, with this corpus state, both "st?" and "st!" (and maybe others) might make it into the vocabulary, although from the perspective of the word both tokens are the same. To avoid this, practical BPE implementations use smarter pretokenization to prevent the learning algorithm from merging across character categories. For example, with a pretokenizer that splits words from punctuation marks, our corpus state from above might look as follows:
{
...
"w i d e s t _": 73,
". _": 10085,
"? _": 3120,
"! _": 5467,
", _": 8985,
": _": 1403,
"; _": 2050,
...
}
To give a concrete example, GPT2 used the following Regular Expression to pretokenize input texts (note: the expression has been slightly adapted to fit its use in this notebook, and later GPT versions use a more refined expression; here, it is only used to show an alternative to naive whitespace pretokenization).
gpt2pattern = regex.compile(r"""'s|'t|'re|'ve|'m|'ll|'d|\p{L}+|\p{N}+|[^\s\p{L}\p{N}]+""")
print(regex.findall(gpt2pattern, "Hello've world123 how's are you!!!? "))
['Hello', "'ve", 'world', '123', 'how', "'s", 'are', 'you', '!!!?']
From the output of the previous code cell, you can already see how the Regular Expression works; in simple terms, it splits an input text into tokens that are:
- from a predefined set of clitics ("'s", "'t", "'re", "'ve", "'m", "'ll", "'d")
- a sequence of letters of arbitrary length
- a sequence of digits of arbitrary length
- a sequence of anything but letters, digits, and whitespace characters, of arbitrary length
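For comparison (purely illustrative), naive whitespace pretokenization of the same input keeps punctuation and clitics glued to the words.

# Naive whitespace pretokenization leaves punctuation and clitics attached to the words
print("Hello've world123 how's are you!!!? ".split())
# -> ["Hello've", 'world123', "how's", 'are', 'you!!!?']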
As you can see, there are many tweaks and improvements that practical BPE implementations include to yield better results. However, none of these additions change the fundamental BPE learning and tokenization algorithm covered in this notebook. In fact, you could likely add many of these improvements yourself to the basic algorithm covered here.
Example Application¶
So far, we have run our BPE tokenizer implementation only over a very simple and "artificial" example document to better understand all the steps of the algorithm. Now let's use a larger document to see how our tokenizer performs. While in practice huge corpora are used to train a subword tokenizer such as BPE, here we limit ourselves to a single book to keep the training time in check. The result will still give us some very interesting insights.
Revised BPE Tokenizer Implementation¶
For this application use case, we provide the class MyBpeTokenizer in the file src/text/preprocessing/tokenizing/tokenizer.py. This class contains exactly the methods we used so far to train and use our tokenizer. However, incorporating all methods into this class allows for cleaner code and much easier usage — now that we understand how BPE works. This means we can train our BPE tokenizer with a single line of code. The code for the class contains only two minor changes:
- By default, like GPT2, it uses the character Ġ (instead of the underscore) as the special character. We also move this special character from the end of a word to the start of the word. This has no effect on the algorithm; we only do this so our output mimics the one from pretrained BPE tokenizer models.
- For pretokenization, it supports naive whitespace tokenization as well as the approach used by GPT2 (see above).
Let's first do this for our example document before using the real-world document.
my_tokenizer_example = MyBpeTokenizer(pretokenize=MyBpeTokenizer.PRE_TOKENIZE__SPLIT).fit([doc], max_vocab_size=100, verbose=True)
Initilize corpus and vocabulary... Perform 88 iterations...
20%|██████████████████████▎ | 18/88 [00:00<00:00, 5307.38it/s]
The progress bar will stop before 100% if the value for max_vocab_size is large enough so that the loop will stop before the expected number of iterations has been reached. Recall, this happens if all possible token pairs have been merged, and therefore no further merge is possible. And this will happen very quickly with very small documents like our toy document.
As this is our small toy document, we can still look at the final corpus state, vocabulary and the list of merges.
print(f"Final corpus state:\n{json.dumps(my_tokenizer_example._corpus_state, indent=2)}\n")
print(f"Final vocabulary (size: {len(my_tokenizer_example._vocabulary)}):\n{my_tokenizer_example._vocabulary}\n ")
print(f"Final list of merges:\n{my_tokenizer_example._merges}")
Final corpus state:
{
"\u0120low": 5,
"\u0120newest": 6,
"\u0120widest": 3,
"\u0120lower": 2,
"\u0120longer": 1
}
Final vocabulary (size: 30):
{'t', 'Ġlo', 'Ġlow', 'Ġne', 'Ġwid', 'Ġ', 'w', 'Ġlonger', 'Ġl', 'Ġw', 'Ġlong', 'e', 'r', 'es', 'est', 'i', 'Ġn', 'Ġlon', 'o', 'l', 'Ġnewest', 'Ġnew', 'n', 'd', 'er', 'Ġlower', 's', 'Ġwi', 'g', 'Ġwidest'}
Final list of merges:
[('e s', 'es'), ('es t', 'est'), ('Ġ l', 'Ġl'), ('Ġl o', 'Ġlo'), ('Ġlo w', 'Ġlow'), ('Ġ n', 'Ġn'), ('Ġn e', 'Ġne'), ('Ġne w', 'Ġnew'), ('Ġnew est', 'Ġnewest'), ('Ġ w', 'Ġw'), ('e r', 'er'), ('Ġw i', 'Ġwi'), ('Ġwi d', 'Ġwid'), ('Ġwid est', 'Ġwidest'), ('Ġlow er', 'Ġlower'), ('Ġlo n', 'Ġlon'), ('Ġlon g', 'Ġlong'), ('Ġlong er', 'Ġlonger')]
Of course, when using the same value for max_vocab_size, the result should be exactly the same as seen before — apart from the different special characters.
Training using Real-World Data¶
For the training we will use content from Project Gutenberg. Project Gutenberg is a digital library that offers free access to thousands of public domain eBooks, including classic literature, historical texts, and reference works. Established in 1971 by Michael S. Hart, it is one of the oldest digital libraries in existence, aiming to democratize access to knowledge by making literature widely available in electronic formats. The collection primarily focuses on works for which copyright has expired, ensuring they are freely and legally accessible to readers worldwide. Books are available in various formats, including plain text, ePub, and Kindle, and the project relies on volunteers to digitize and proofread content.
The book of choice is Treasure Island by Robert Louis Stevenson. It is an adventurous tale of a young boy named Jim Hawkins who discovers a pirate's treasure map and embarks on a perilous journey to find the hidden fortune. He joins a ship's crew led by Captain Smollett, but the voyage is fraught with danger as the ship’s cook, Long John Silver, turns out to be a cunning pirate with plans to seize the treasure for himself. The story unfolds with thrilling mutinies, battles, and betrayals, as Jim outsmarts the pirates and helps secure the treasure. Ultimately, Jim and the loyal crew members return home safely, leaving much of the treasure behind, having learned valuable lessons about bravery, trust, and greed.
Let's first read the file into the variable book.
Your turn: You can download other/more materials from the Project Gutenberg website to expand the overall training corpus. Further down below, when you look at some example sentences that have been tokenized using our trained BPE tokenizer, you will notice some limitations when the training dataset is not large and diverse enough.
with open(treasure_island_book, "r") as file:
    book = file.read().replace('\n', '').strip()
print(f"Number of characters: {len(book)}")
Number of characters: 375911
Using our own BPE tokenizer implementation, we can now run the learning algorithm using Treasure Island. Feel free to modify the pretokenization approach and maximum vocabulary size. Keep in mind that our implementation is not optimized for performance and this is not a tiny toy document. As such, the learning will take a couple of minutes.
my_tokenizer_book = MyBpeTokenizer(pretokenize=MyBpeTokenizer.PRE_TOKENIZE__GPT2).fit([book], max_vocab_size=10000, verbose=True)
Initilize corpus and vocabulary... Perform 9911 iterations...
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 9911/9911 [04:16<00:00, 38.63it/s]
Let's tokenize a couple of example sentences.
print(my_tokenizer_book.tokenize("There is still a lot of treasure buried on the island."))
print(my_tokenizer_book.tokenize("I've checked, my last shipment was delayed."))
print(my_tokenizer_book.tokenize("The captain and the lieutenant had a discussion."))
print(my_tokenizer_book.tokenize("The team members are Alice, John, Jim, and Bob."))
print(my_tokenizer_book.tokenize("I've checked, but I will check again."))
['ĠThere', 'Ġis', 'Ġstill', 'Ġa', 'Ġlot', 'Ġof', 'Ġtreasure', 'Ġburied', 'Ġon', 'Ġthe', 'Ġisland', 'Ġ.']
['ĠI', 'Ġ', "'", 've', 'Ġche', 'cked', 'Ġ,', 'Ġmy', 'Ġlast', 'Ġship', 'ment', 'Ġwas', 'Ġdelayed', 'Ġ.']
['ĠThe', 'Ġcaptain', 'Ġand', 'Ġthe', 'Ġlieuten', 'ant', 'Ġhad', 'Ġa', 'Ġdiscu', 'ss', 'ion', 'Ġ.']
['ĠThe', 'Ġte', 'am', 'Ġme', 'mb', 'ers', 'Ġare', 'ĠA', 'li', 'ce', 'Ġ,', 'ĠJohn', 'Ġ,', 'ĠJim', 'Ġ,', 'Ġand', 'ĠB', 'o', 'b', 'Ġ.']
['ĠI', 'Ġ', "'", 've', 'Ġche', 'cked', 'Ġ,', 'Ġbut', 'ĠI', 'Ġwill', 'Ġcheck', 'Ġagain', 'Ġ.']
Unsurprisingly — assuming max_vocab_size=10000 for the following discussion — the tokenization results reflect the nature of our training document. Words that appear frequently in Treasure Island, such as "treasure" and "island", but also other common words like prepositions, are unlikely to be broken up. We can also see this in the names: "John" and "Jim" are names of characters in the book and therefore appear frequently. In contrast, "Alice" and "Bob" never appear in the book and are therefore Out-of-Vocabulary (OOV) tokens, which get split into known subword tokens. The same is true for the two ranks "captain" (which appears often in the book) and "lieutenant" (which never appears in the book). Another interesting observation is that "check" and "checked" get tokenized very differently. While this might seem a bit unintuitive at first, it shows that the order of merges matters. For example, if we assume that the merge ("k ed", "ked") is checked very early and there is no merge like ("ec ked", "ecked"), the word "checked" has no chance to get merged back to its full string.
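We can check this directly; the following sketch relies on the internal _vocabulary attribute used earlier and assumes the tokenizer was trained with the settings above.

# Frequent words from the book should appear as single vocabulary entries, rare/unseen ones should not
for word in ['Ġtreasure', 'Ġcaptain', 'Ġlieutenant', 'ĠAlice']:
    print(word, '->', word in my_tokenizer_book._vocabulary)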
Summary¶
Byte Pair Encoding (BPE) is an algorithm widely used for subword tokenization in natural language processing tasks. It starts with a base vocabulary consisting of individual characters and iteratively merges the most frequent adjacent pairs of tokens to form new, larger tokens. This process continues until the vocabulary reaches a predefined size or no more frequent pairs can be found. BPE generates a hierarchical vocabulary of subwords that represent commonly occurring patterns, making it efficient for tasks that involve processing text data with varying word frequencies.
One of the major advantages of BPE is its ability to balance coverage and efficiency. Unlike word-level tokenization, which struggles with rare or unseen words, BPE can decompose any word into a sequence of subword units, ensuring no out-of-vocabulary (OOV) issues. It also avoids the inefficiencies of character-level tokenization, which would require longer sequences to represent text. By focusing on frequent subword patterns, BPE creates compact token sequences that preserve meaning while being computationally manageable. Its main advantages are:
Compact vocabulary: BPE creates a smaller, more manageable vocabulary by focusing on the most frequent subword patterns. This reduces memory requirements and speeds up training compared to models with word-level vocabularies.
No OOV problems: BPE ensures that any word, including rare or novel ones, can be represented using subword units. This is particularly important in languages with rich morphology or compound word formation.
Efficiency in training and inference: The use of common subword units reduces sequence length compared to character-level tokenization, improving computational efficiency.
On the flip side, BPE also has some characteristics that can be considered disadvantages, mainly:
Hard-coded vocabulary size: BPE requires a predefined vocabulary size, which might not always align with the complexity or diversity of the input text. Too small a vocabulary may lead to inefficient tokenization, while too large a vocabulary could introduce redundancy.
Insensitive to context: BPE merges subword pairs based solely on their frequency without considering contextual meaning, which may lead to suboptimal token splits in cases where the same subword pair has different meanings in different contexts.
Static rules: Once trained, the BPE vocabulary is fixed and cannot adapt to new data or unseen linguistic patterns without retraining. This can limit its flexibility compared to dynamic tokenization methods.
Compared to other methods, such as SentencePiece or WordPiece, BPE is simpler and faster to implement. However, techniques like WordPiece can incorporate probabilistic models that better handle context, making them more effective in certain scenarios. SentencePiece, on the other hand, can operate on raw text directly without requiring explicit tokenization into characters, offering additional preprocessing flexibility. In summary, BPE is a powerful and efficient method for subword tokenization that excels in handling OOV issues and maintaining a manageable vocabulary. However, its static and frequency-based approach can be a limitation in scenarios where context or dynamic adaptability is crucial.