Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.
Subword Tokenization (WordPiece)¶
The WordPiece algorithm is a widely used method for subword tokenization, particularly in natural language processing (NLP) tasks. Originally introduced for speech recognition, WordPiece gained prominence with its use in pre-trained language models such as BERT. The algorithm addresses key challenges in text representation, such as handling out-of-vocabulary (OOV) words and efficiently representing rare words, by breaking text into subword units instead of relying solely on full words or individual characters.
At its core, WordPiece seeks to build a vocabulary of subword units by balancing frequency and efficiency. The algorithm begins with an initial vocabulary consisting of individual characters and iteratively merges pairs of tokens that maximize the likelihood of the training data. This likelihood is computed based on the frequency of token pairs and their impact on the overall representation. The result is a compact vocabulary that includes common words as single units and decomposes rare or complex words into smaller, meaningful subword components. For example, a word like "unbelievable" might be tokenized into ["un", "##believable"], where the "##" prefix indicates that the subword is part of a larger word.
One of the key strengths of WordPiece is its ability to balance generalization and specificity. By using subwords, the algorithm can effectively handle new words that were not seen during training by combining known subword units. This property reduces the OOV problem and enables models to better understand rare words or morphologically rich languages. Additionally, subword tokenization allows for smaller vocabulary sizes compared to full-word tokenization, which reduces memory requirements and computational costs.
WordPiece has become a cornerstone in modern NLP pipelines due to its efficiency and adaptability. It enables pre-trained language models to achieve state-of-the-art performance across various tasks, including text classification, machine translation, and question answering. Its design, which merges data-driven insights with linguistic intuition, is closely related to other subword tokenization algorithms such as Byte Pair Encoding (BPE) and SentencePiece. As a result, WordPiece continues to be an essential component in advancing NLP technologies.
In this notebook, we will take a closer look at the WordPiece algorithm and implement a basic WordPiece tokenizer from scratch in a step-by-step and illustrative manner.
Note: The two subword tokenization algorithms WordPiece and Byte-Pair Encoding are very similar. Since BPE is arguably slightly simpler, and we occasionally refer to BPE in this notebook — to highlight the fundamental difference between WordPiece and BPE — we recommend first reading up on BPE. However, this is not required to understand how WordPiece works, only to appreciate how it compares to BPE.
Setting up the Notebook¶
Make Required Imports¶
This notebook requires the import of different Python packages but also additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been successfully imported.
import re, regex, collections, json
from src.text.preprocessing.tokenizing import MyWordPieceTokenizer
from src.utils.data.files import *
Download Required Data¶
Some code examples in this notebook use data that first needs to be downloaded by running the code cell below. If this code cell throws an error, please check in the configuration file config.yaml whether the URL for downloading datasets is up to date and matches the one on Github. If not, simply download or pull the latest version from Github.
treasure_island_book, _ = download_dataset("text/corpora/books/treasure-island.txt")
File 'data/datasets/text/corpora/books/treasure-island.txt' already exists (use 'overwrite=True' to overwrite it).
WordPiece from Scratch¶
The fundamental idea behind WordPiece is quite straightforward to understand and easy to implement, and we will go through the algorithm step by step in the following. Practical implementations of WordPiece are more sophisticated as they consider additional refinements or aim for greater efficiency. Here, the focus is on understanding how WordPiece works for tokenization. To this end, we use a very artificial example document to better illustrate the inner workings of the algorithm. This example document is defined in the code cell below.
doc = 'low low low low low lower lower newest newest newest newest newest newest widest widest widest longer'
The algorithm also needs some special character to mark the continuation of a word; its purpose will become clear once we go through the algorithm. Again, to keep it simple, we simply use the underscore character _ for this. Note that this means that our input document is not allowed to contain underscore characters. This works perfectly fine for our example document here.
CTOKEN = '_'
Core Steps¶
We first go through the core steps of WordPiece before combining them into the final learning algorithm.
Pretokenize Text¶
WordPiece assumes that the corpus has been pretokenized into an initial list of tokens. The most basic approach is to pretokenize a text based on whitespace characters — in practice, slightly more sophisticated methods are used (discussed later). The code cell below contains the method pretokenize() that uses the built-in Python method split() to convert a text string into a list of tokens by splitting the string with respect to whitespace characters.
def pretokenize(doc):
return doc.split()
To show an example, we can apply this method to our example document.
print(pretokenize(doc))
['low', 'low', 'low', 'low', 'low', 'lower', 'lower', 'newest', 'newest', 'newest', 'newest', 'newest', 'newest', 'widest', 'widest', 'widest', 'longer']
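As a side note (a small illustrative addition), naive whitespace splitting keeps punctuation attached to the preceding word — a limitation we will come back to in the discussion at the end of this notebook.
# Punctuation sticks to the preceding word when splitting only on whitespace
print(pretokenize("Hello, world! This is a test."))
['Hello,', 'world!', 'This', 'is', 'a', 'test.']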
Initialize Corpus State and Vocabulary¶
The corpus state represents a data structure, here a dictionary, that maps each unique token after pretokenization to its number of occurrences in the training corpus. Since WordPiece is a bottom-up approach, we also need to split each token (typically a word) into its characters, and the CTOKEN is added to each character except the first one. Let's first create a utility method that performs this step for an initial word/token.
def generate_sequence(word, ctoken=CTOKEN):
return ' '.join([c if i == 0 else f"{ctoken}{c}" for i, c in enumerate(word)])
generate_sequence('fastest')
'f _a _s _t _e _s _t'
Appreciate the use of the special token to mark the continuation of a word (here the underscore character). Any token starting with that special character does not represent the beginning of a word.
Now we can go through all our documents to initialize the corpus state, i.e., the set of unique words/tokens across the whole corpus converted to their sequences. To this end, for each document we perform the following steps:
- Perform simple pretokenization using the pretokenize() method.
- Split each token into its characters using the utility method generate_sequence().
- Add the generated sequence to the corpus state; to avoid repeated entries of the same sequence, we also keep track of the number of times a token appears in the corpus — we therefore implement the corpus state as a dictionary with the sequences as the keys and the number of occurrences as the values.
During this process, we also initialize the vocabulary as the set of all unique characters. Later, the learning algorithm will add strings longer than a single character to the vocabulary. Note, however, that the vocabulary is not crucial for the WordPiece algorithm itself.
The method initialize_corpus_state() in the code cell below implements both steps and returns the initial corpus state and vocabulary.
def initialize_corpus_state(docs: list):
# Initial vocabulary as an empty set
vocabulary = set()
# Initialize dictionary representing the corpus state
corpus_state = collections.defaultdict(int)
# Loop over all documents
for doc in docs:
# For each word in the document, generate the sequence and update add it to the corpus state
for word in pretokenize(doc):
# Update vocabulary
for idx, char in enumerate(word):
if idx == 0:
vocabulary.add(char)
else:
vocabulary.add(f"{CTOKEN}{char}")
# Update sequence count
corpus_state[generate_sequence(word)] += 1
return dict(corpus_state), vocabulary
We can now run the method over our toy corpus which is just a single document.
corpus_state, vocabulary = initialize_corpus_state([doc])
Let's first have a look at the corpus state; we use the dumps() method of the json library for a more user-friendly output.
print(json.dumps(corpus_state, indent=2))
{
"l _o _w": 5,
"l _o _w _e _r": 2,
"n _e _w _e _s _t": 6,
"w _i _d _e _s _t": 3,
"l _o _n _g _e _r": 1
}
When looking at the result, you should recognize all five unique words/tokens that appear in our toy document, only converted to their initial sequences. The number reflects how often that word/token appeared in the document. For example, the key-value pair "w _i _d _e _s _t": 3 indicates that the word "widest" appeared three times in the document.
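As a quick sanity check (an illustrative addition, not part of the algorithm), the counts in the corpus state should add up to the total number of pretokenized tokens in the document:
# The sequence counts sum up to the number of pretokenized tokens (17 for our toy document)
assert sum(corpus_state.values()) == len(pretokenize(doc))
print(sum(corpus_state.values()))
17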
We can also look at the initial vocabulary, which is simply the set of all unique characters in the document, where each character that does not start a word carries the special CTOKEN prefix.
print(vocabulary)
{'_o', '_e', '_r', '_w', '_i', '_g', 'l', 'w', '_t', '_s', '_d', 'n', '_n'}
If we did nothing else — that is, did not actually perform any training — the WordPiece tokenizer would behave like a character tokenizer. We will actually try this later.
Token Merging Step¶
The key idea of WordPiece is to iteratively merge the most frequent token pairs in the corpus state to build a vocabulary that efficiently represents the structure of the input text. The goal is to identify and represent commonly occurring patterns — such as subwords, roots, prefixes, or suffixes — with single tokens. This reduces the number of tokens needed to encode the text while maintaining flexibility for representing rare or unseen words through combinations of smaller subword units.
By iteratively merging the most frequent adjacent token pairs, WordPiece strikes a balance between character-level and word-level tokenization. Character-level tokenization ensures coverage for any text but can result in extremely long sequences that are computationally expensive to process. On the other hand, word-level tokenization creates inefficiencies when encountering rare words, as each unique word would need a separate token. WordPiece bridges this gap by creating a subword-level tokenization scheme: frequent word components are merged into single tokens, while rare or out-of-vocabulary words can still be represented as sequences of smaller subword units.
Calculate Number of Token Pairs¶
WordPiece and BPE follow a very similar bottom-up approach by first splitting words into character tokens and then iteratively finding the next best pair of tokens to be merged into a new, larger token. While there are minor differences in the general setup, both algorithms fundamentally differ only in the criterion they use to find the next token pair to merge. BPE simply picks the token pair with the most occurrences in the corpus state. For example, assuming the initial corpus state from above, BPE calculates the number of occurrences of "l _o", "_o _w", "_w _e", "_e _r", "n _e", and so on, and then merges the token pair with the highest number of occurrences (randomly breaking ties).
In contrast to BPE, WordPiece does not choose the most frequent token pair, but the one that maximizes the likelihood of the training data once added to the vocabulary. The likelihood of a token pair $(t_1, t_2)$ is calculated as follows:
$$\large \frac{P(t_1, t_2)}{P(t_1) P(t_2)} $$where $P(t_1, t_2)$ is the probability of seeing the token pair $(t_1, t_2)$ given the corpus, $P(t_1)$ is the probability of seeing token $t_1$ in the corpus, and $P(t_2)$ is the probability of seeing token $t_2$ in the corpus. In other words, we need to calculate the likelihoods for all token pairs in our current corpus state and pick the one with the highest likelihood. While we could calculate all three probabilities to get the likelihood, let's see how we can simplify this formula. First, we can apply the formula for conditional probabilities $P(B|A) = \frac{P(A,B)}{P(A)}$, i.e., $P(A,B) = P(B|A)P(A)$, for two random events $A$ and $B$. With $A$ and $B$ being our two tokens $t_1$ and $t_2$, we can write:
$$\large \frac{P(t_1, t_2)}{P(t_1) P(t_2)} = \frac{P(t_2|t_1)P(t_1)}{P(t_1) P(t_2)} = \frac{P(t_2|t_1)}{P(t_2)} $$In terms of n-gram language models, $P(t_2)$ is the unigram probability of token $t_2$, and $P(t_2|t_1)$ is the bigram probability of the token sequence/pair $(t_1,t_2)$. We can therefore calculate both probabilities as follows:
$$\large P(t_2) = \frac{count(t_2)}{N}\ , \quad P(t_2|t_1) = \frac{count(t_1,t_2)}{count(t_1)} $$where $N$ is the total number of tokens in the current corpus state, $count(t_1,t_2)$ is the total number of occurrences of the token pair $(t_1,t_2)$, $count(t_1)$ is the total number of occurrences of token $t_1$, and $count(t_2)$ is the total number of occurrences of token $t_2$. Plugging these terms into our initial formula, we get:
$$\large \frac{P(t_1, t_2)}{P(t_1) P(t_2)} = \frac{P(t_2|t_1)}{P(t_2)} = \frac{\frac{count(t_1,t_2)}{count(t_1)}}{\frac{count(t_2)}{N}} = \frac{N\cdot count(t_1,t_2)}{count(t_1)\cdot count(t_2)} $$Thus, compared to BPE, we not only need to calculate the number of occurrences of each token pair $(t_1, t_2)$, but also the number of occurrences of the two component tokens $t_1$ and $t_2$. As a last little improvement, consider that we are not really interested in the exact likelihood values, but only want to find the token pair with the largest likelihood. Notice that the number of tokens $N$ does not depend on the current token pair, and therefore does not affect the final ranking of token pairs. This allows us to simply ignore $N$ in the calculation. Of course, this means that we no longer compute the exact likelihood; it is only proportional to the final formula:
$$\large \frac{P(t_1, t_2)}{P(t_1) P(t_2)} \propto \frac{count(t_1,t_2)}{count(t_1)\cdot count(t_2)} $$The method find_best_token_pair() in the code cell below implements this calculation by iterating over all token sequences in the corpus state and, for each sequence, iterating over all adjacent token pairs to sum up the total number of occurrences of each token pair and each individual token. The method then calculates the score for each token pair, where the score is proportional to the pair's likelihood and calculated by the formula given above. The method finally returns the token pair with the highest score (any ties are broken randomly). The method also returns a dictionary with all token pairs and their respective scores; this is only to have a look at the scores and is not needed for the algorithm itself.
def find_best_token_pair(corpus_state):
# Initialize dictionaries to keep track of all counts
token_counts = collections.defaultdict(int)
token_pair_counts = collections.defaultdict(int)
# Iterate over all token sequences in the current corpus state
for word, freq in corpus_state.items():
sequence = word.split()
# Special case: the sequence already a single token
if len(sequence) == 1:
token_counts[sequence[0]] += freq
continue
# Iterate over all token pair and update the count
for i in range(len(sequence)-1):
pair = (sequence[i], sequence[i+1])
token_counts[f"{sequence[i]}"] += freq
token_pair_counts[pair] += freq
# Don't forget the last token that was not captured by the previous loop
token_counts[sequence[-1]] += freq
# Calculate the score of all token pairs using the formula approximating the likelihood
token_pair_scores = {
' '.join(pair): count / (token_counts[pair[0]] * token_counts[pair[1]])
for pair, count in token_pair_counts.items()
}
# Return the most frequent pair (if their are ties, we just randomly break them) + all token pair counts
return max(token_pair_scores.keys(), key=(lambda key: token_pair_scores[key])), token_pair_scores
Using the method find_best_token_pair(), we can identify the first token pair to merge for our initial corpus state. In general, there might be multiple token pairs that share the same highest score. In this situation, we pick any token pair from this subset randomly.
# We re-initialize the corpus state and the vocabulary to ensure a consistent output of this code cell
corpus_state, vocabulary = initialize_corpus_state([doc])
top_token_pair, token_pair_scores = find_best_token_pair(corpus_state)
Let's first look at the scores for all token pairs. Again, this is just for illustrative purposes and not required for the WordPiece algorithm itself.
print(json.dumps(token_pair_scores, indent=2))
{
"l _o": 0.125,
"_o _w": 0.0673076923076923,
"_w _e": 0.03418803418803419,
"_e _r": 0.05555555555555555,
"n _e": 0.05555555555555555,
"_e _w": 0.02564102564102564,
"_e _s": 0.05555555555555555,
"_s _t": 0.1111111111111111,
"w _i": 0.3333333333333333,
"_i _d": 0.3333333333333333,
"_d _e": 0.05555555555555555,
"_o _n": 0.125,
"_n _g": 1.0,
"_g _e": 0.05555555555555555
}
We can see that the token pair with the highest score is "_n _g", so this is the pair that will be merged next. Let's verify that find_best_token_pair() has indeed returned this token pair.
print(top_token_pair)
_n _g
In the following, let's assume the best-scoring token pair that was returned is "_n _g".
Perform Merge¶
With the next best token pair found, we can actually perform the merging step. This step includes three core substeps:
- Merge the token pair into a new token (e.g., "_n _g" becomes "_ng")
- Add this new token (here, "_ng") to the vocabulary
- Update the corpus state by replacing all occurrences of the token pair with the new token — for our example, replace all occurrences of "_n _g" in the corpus state with "_ng".
The first step is very straightforward. To merge a token pair, we remove the CTOKEN character from the second token — the second token will always start with this special character — and concatenate it with the first token. The first token may or may not have the CTOKEN at the beginning. For convenience, the method create_new_token() wraps up this first step.
def create_new_token(token_pair):
t1, t2 = token_pair.split()
return ''.join([t1, re.sub(CTOKEN, "", t2)])
print(create_new_token("n _g"))
print(create_new_token("_n _g"))
ng
_ng
The method perform_merge() implements these three substeps, using the method create_new_token() for Step 1. Updating the corpus state is only a bit more complex. Since the corpus state is implemented as a dictionary, and replacing all occurrences of the token pair with the merged token means changing the keys of this dictionary, we cannot simply iterate over this dictionary and directly change the keys. We therefore break this update step up into two loops:
- In the first loop, we find all keys (i.e., the sequences in the corpus state) that contain the token pair (at least once) and therefore need to be updated. We identify these keys using a Regular Expression that matches the token pair in a key. For example, given the key "l _o _n _g _e _r", the Regular Expression looking for "_n _g" would match; it would not match for "n _e _w _e _s _t". We keep track of all matching keys using the dictionary matches, whose keys are the old keys in the corpus state and whose values are the new keys for the corpus state. We use the same Regular Expression to generate the new keys; for example, "l _o _n _g _e _r" becomes "l _o _ng _e _r".
- In the second loop, we iterate over all matches to add the new keys to the corpus state and give them the values of their respective old keys (those values, representing the counts, do not change). We do this using the built-in pop() method, which automatically removes the entry for the old key from the corpus state.
Finally, we return the merge (i.e., the tuple of the token pair and the new token — we will need this later), the updated corpus state, and the updated vocabulary.
def perform_merge(token_pair, corpus_state, vocabulary):
# Create new token by merging token pair
new_token = create_new_token(token_pair)
# Create merge as tuple of token pair and new token
merge = (token_pair, new_token)
# Add new token to vocabulary
vocabulary.add(new_token)
# Define search pattern
pattern = re.compile(r"(?<!\S)" + re.escape(token_pair) + r"(?!\S)")
# Loop through corpus state and record which keys/sequences need to be updated
matches = {}
for sequence, count in corpus_state.items():
for match in pattern.finditer(sequence):
matches[sequence] = pattern.sub(new_token, sequence)
# Perform the update of keys/sequences
for old, new in matches.items():
corpus_state[new] = corpus_state.pop(old)
# Return the updated corpus state and vocabulary
return merge, corpus_state, vocabulary
For testing, we can run the method perform_merge() over our initial corpus state and vocabulary. We re-initialize both the corpus state and vocabulary to ensure the output of the code cell is always the same.
# We re-initialize the corpus state and the vocabulary to ensure a consistent output of this code cell
corpus_state, vocabulary = initialize_corpus_state([doc])
merge, corpus_state, vocabulary = perform_merge('_n _g', corpus_state, vocabulary)
Let's have a look at all three return values of this method.
print(f"Merge: {merge}")
print()
print("Updated corpus state:")
print(json.dumps(corpus_state, indent=2))
print()
print("Updated vocabulary")
print(vocabulary)
Merge: ('_n _g', '_ng')
Updated corpus state:
{
"l _o _w": 5,
"l _o _w _e _r": 2,
"n _e _w _e _s _t": 6,
"w _i _d _e _s _t": 3,
"l _o _ng _e _r": 1
}
Updated vocabulary
{'_o', '_e', '_r', '_w', '_i', '_g', 'l', 'w', '_t', '_ng', '_s', '_d', 'n', '_n'}
In particular, notice the change in the corpus state with "l _o _n _g _e _r" becoming "l _o _ng _e _r". Also, "_ng" has been added to the vocabulary.
WordPiece vs. BPE — Intuition¶
Since WordPiece and BPE fundamentally differ only with respect to how the next token pair for merging is determined, it is a good time to try to get some intuition behind that difference. Recall that the score for a token pair $(t_1, t_2)$ in WordPiece is computed as:
$$\large \frac{count(t_1,t_2)}{count(t_1)\cdot count(t_2)} $$In contrast, BPE only uses $count(t_1,t_2)$! This means that WordPiece may favor a token pair even if it is not the most frequent one. More specifically, WordPiece favors token pairs where the two tokens, if they appear in a word, very often appear together. For example, given our toy dataset, the token pair ("_n", "_g") appears only once in the corpus state, while, e.g., ("_e", "_s") appears nine times (and would therefore be chosen by BPE). However, both tokens "_n" and "_g" happen to appear only once overall. In simple terms, this means that when we see the token "_n", we are very likely to see it together with the token "_g", and vice versa. In other words, the higher the score of a token pair, the more frequently the two tokens co-occur compared to what would be expected if both tokens were independent.
Or, just by dissecting the formula above, the score is high if the numerator (the count of the token pair) is large and the denominator (the product of the two token counts) is small. Of course, the two inequalities $count(t_1,t_2) \leq count(t_1)$ and $count(t_1,t_2) \leq count(t_2)$ always hold. Thus, the maximum score for a token pair is $1$. Therefore, for the denominator to be as small as possible, the two tokens of a pair should appear in as few other token pairs as possible (or on their own). The more often one or both tokens appear "outside" that pair, the lower the score.
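To make this concrete with a small hand-computed check (an illustrative addition; the counts are read off the initial corpus state above): "_e" occurs 18 times, "_s" occurs 9 times, and the pair ("_e", "_s") occurs 9 times, while "_n", "_g", and the pair ("_n", "_g") each occur exactly once.
# Hand-computed scores for two token pairs from the initial corpus state
count_e, count_s, count_es = 18, 9, 9   # counts of "_e", "_s", and the pair ("_e", "_s")
count_n, count_g, count_ng = 1, 1, 1    # counts of "_n", "_g", and the pair ("_n", "_g")
print(count_es / (count_e * count_s))   # matches the score of "_e _s" above
print(count_ng / (count_n * count_g))   # matches the score of "_n _g" above
0.05555555555555555
1.0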
WordPiece Learning Algorithm¶
With these three core steps, that is:
- Initialization of the corpus state and vocabulary
- Finding the best token pair (for the current corpus state), and
- Performing the merging step and updating the corpus state
We now have everything in place to implement the WordPiece learning algorithm by plugging together the methods implementing those three steps; see the method wordpiece_learn() in the code cell below. Although the implementation of this method is rather straightforward, three small details are worth mentioning:
- One of the aforementioned advantages of WordPiece is that we can specify the maximum size of the resulting vocabulary. Since each merging step adds a new token to the vocabulary, we can restrict the size of the vocabulary by limiting the number of merging steps. However, since our initial vocabulary is not empty but the set of all unique characters, the number of merging steps — that is, the number of iterations num_iter in the code — derives from the difference between the specified maximum vocabulary size max_vocab_size and the size of the initial vocabulary.
- In principle, particularly if the corpus is not very large, the algorithm might attempt more merges than are possible. This happens when all words in the corpus state have been merged back into their original form. For example, after a certain number of iterations, say, "n _e _w _e _s _t" will have been merged into "newest". If this is true for all words in the corpus state, the corpus state no longer contains any pair of tokens. In this case, the method find_best_token_pair() will throw an error and we exit the loop, since the learning algorithm has finished.
- In each iteration, we keep track of the most recent merge by adding it to a list of all previous merges. This list of merges is in fact the most important return value of the learning algorithm, as it is used for tokenizing text. Note that the order matters, as we want to merge tokens in a new text in the same order as we merged them during the learning phase.
def wordpiece_learn(corpus, max_vocab_size=10000):
# Initialize corpus state and vocabulary
corpus_state, vocabulary = initialize_corpus_state(corpus)
# Initialize the list of merges
merges = []
# Calculate the number of merging steps to ensure the maximum size of the vocabulary
num_iter = max_vocab_size - len(vocabulary)
for _ in range(num_iter):
# Find the most frequent pair; if this fails, no more merging was possible and we can stop
try:
top_token_pair, _ = find_best_token_pair(corpus_state)
except:
break
# Update corpus state and the vocabulary
merge, corpus_state, vocabulary = perform_merge(top_token_pair, corpus_state, vocabulary)
# Add newly merged token to vocabulary
merges.append(merge)
# Return list of merges, the corpus state, and the vocabulary
return merges, corpus_state, vocabulary
Let's train a WordPiece tokenizer over our toy document.
Your turn: Try different values for max_vocab_size and inspect the results. Of course, the larger this value, the larger the final vocabulary and the list of merges, but the corpus state also shows larger tokens. For example, with a large value, say max_vocab_size=1000, the learning algorithm tries to make more merging steps than are actually possible. You can tell by looking at the corpus state, where all words have been merged into a single token.
merges, corpus_state, vocabulary = wordpiece_learn([doc], max_vocab_size=100)
print(f"Final corpus state:\n{json.dumps(corpus_state, indent=2)}\n")
print(f"Final vocabulary (size: {len(vocabulary)}):\n{vocabulary}\n ")
print(f"Final list of merges:\n{merges}")
Final corpus state:
{
"low": 5,
"longer": 1,
"lower": 2,
"widest": 3,
"newest": 6
}
Final vocabulary (size: 30):
{'_r', '_g', 'newest', 'ne', 'widest', '_s', 'wide', 'new', '_ng', '_i', '_w', 'wi', 'long', 'w', '_er', 'lo', 'longe', 'n', '_o', '_e', 'low', 'longer', '_t', 'newe', 'wid', '_n', 'l', '_d', 'lower', '_st'}
Final list of merges:
[('_n _g', '_ng'), ('w _i', 'wi'), ('wi _d', 'wid'), ('l _o', 'lo'), ('lo _ng', 'long'), ('_s _t', '_st'), ('lo _w', 'low'), ('long _e', 'longe'), ('longe _r', 'longer'), ('n _e', 'ne'), ('ne _w', 'new'), ('wid _e', 'wide'), ('_e _r', '_er'), ('new _e', 'newe'), ('low _er', 'lower'), ('wide _st', 'widest'), ('newe _st', 'newest')]
WordPiece Tokenization Algorithm¶
Once we have applied the WordPiece learning algorithm over a corpus, using this learned model to actually tokenize an arbitrary text is very straightforward. In fact, strictly speaking, we only need the list of merges that was returned from the learning algorithm (see above). In some sense, the tokenization algorithm mimics the learning algorithm. In particular, we first perform pretokenization and treat each initial token separately. The method tokenize_word() tokenizes a single (initial) token based on the learned list of merges as follows:
- First, the method splits the word into character tokens and adds the CTOKEN using the generate_sequence() method. For example, the word "newer" becomes "n _e _w _e _r".
- Then, the method iterates over the list of merges and checks if a merge can be applied, and does so if a match is found. For example, the merge ("_e _r", "_er") will find a match in "n _e _w _e _r" and convert it to "n _e _w _er". To find the matches and perform the merges, we use the same Regular Expression we have already seen in the learning algorithm.
The rest of the implementation is merely for printing the intermediate results for illustrative purposes.
def tokenize_word(word, merges, verbose=False):
sequence = generate_sequence(word)
if verbose == True:
print(sequence)
for p, m in merges:
if p not in sequence:
continue
p = re.compile(r'(?<!\S)' + re.escape(p) + r'(?!\S)')
sequence = p.sub(m, sequence)
if verbose == True:
print(sequence)
return sequence.split(' ')
We can now run tokenize_word() over a word that was not in the training document. Of course, the exact output will depend on the value for max_vocab_size you chose to train the WordPiece tokenizer. For example, with max_vocab_size=0, the word will be split into its individual characters since no merges are performed at all. In other words, with max_vocab_size=0, the WordPiece tokenizer becomes a character tokenizer. In contrast, if the value for max_vocab_size is very large, the WordPiece tokenizer is more likely to behave like a word tokenizer.
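This claim is easy to verify (a small illustrative addition; the variable name char_merges is chosen here so that the merges learned above remain untouched): with max_vocab_size=0, no merging steps are performed, and the tokenizer falls back to character tokens.
# With max_vocab_size=0, the list of merges stays empty and WordPiece acts as a character tokenizer
char_merges, _, _ = wordpiece_learn([doc], max_vocab_size=0)
print(char_merges)
print(tokenize_word('newer', char_merges))
[]
['n', '_e', '_w', '_e', '_r']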
Your turn: Run the method wordpiece_learn() implementing the WordPiece learning algorithm with different values for max_vocab_size and see how the output of the code cell below changes.
tokens = tokenize_word('newer', merges, verbose=True)
print(tokens)
n _e _w _e _r
ne _w _e _r
new _e _r
new _er
new _er
['new', '_er']
To tokenize a complete document — again, mimicking the learning algorithm — we first need to pretokenize the document, and then run the method tokenize_word() over each initial token. The method tokenize() implements these basic steps.
def tokenize(doc, merges, verbose=False):
pretokens = pretokenize(doc)
tokens = []
for pt in pretokens:
tokens.extend(tokenize_word(pt, merges, verbose=verbose))
return tokens
The code cell below defines another example document to test the behavior of method tokenize(). As before, the exact output will depend on the value of max_vocab_size when training the tokenizer and the document doc itself. Feel free to modify the document by adding new words or tweaking existing ones. You are also encouraged to run the code cell with different versions of the tokenizer (i.e., trained using different values for max_vocab_size).
doc2 = "newer longest knew ingest belong newest"
example_token_list = tokenize(doc2, merges, verbose=True)
n _e _w _e _r
ne _w _e _r
new _e _r
new _er
new _er
l _o _n _g _e _s _t
l _o _ng _e _s _t
lo _ng _e _s _t
long _e _s _t
long _e _st
longe _st
k _n _e _w
k _n _e _w
i _n _g _e _s _t
i _ng _e _s _t
i _ng _e _st
b _e _l _o _n _g
b _e _l _o _ng
b _e _l _o _ng
n _e _w _e _s _t
n _e _w _e _st
ne _w _e _st
new _e _st
newe _st
newest
print(example_token_list)
['new', '_er', 'longe', '_st', 'k', '_n', '_e', '_w', 'i', '_ng', '_e', '_st', 'b', '_e', '_l', '_o', '_ng', 'newest']
Detokenize¶
If we would use the WordPiece tokenizer only to tokenize a text to serve as input for a machine learning model, we could stop here. However, text generation tasks such as machine translation, question answering, chatbots, etc. not only take tokenized text as input but also generate text in the form of tokens from the learned vocabulary. This means we need a way to convert a list of tokens back into a proper text. At least in its basic form, this can be done by performing two simple steps:
- Concatenate all tokens into a single string, and
- Remove every whitespace character together with the CTOKEN character(s) that immediately follow it.
The method detokenize() implements these two trivial steps; let's test it on the example token list from above.
def detokenize(tokens: list):
doc = ' '.join(tokens)
return re.sub(f" {CTOKEN}", "", doc).strip()
print(detokenize(example_token_list))
newer longest knew ingest belong newest
This step of detokenizing a list of tokens to a string actually shows why we need the special CTOKEN character. Without it, we could not tell — at least not easily and reliably — which tokens should be merged to form a word in the output string.
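As a final small check (an illustrative addition), tokenizing a document and then detokenizing the result should reproduce the original text — at least as long as the original text separates words by single spaces, as our example documents do:
# Round trip: tokenizing and detokenizing recovers the original documents
assert detokenize(tokenize(doc, merges)) == doc
assert detokenize(tokenize(doc2, merges)) == doc2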
Discussion & Limitations¶
Representation of tokens: In our implementation, the tokens in the corpus state are represented by their actual character strings. The advantage is that it is much easier to follow how the algorithm works. Practical implementations, however, commonly represent the tokens as unique ids, and the vocabulary maintains a mapping between the ids and their respective tokens. For example, instead of representing an entry in the corpus state like
{
...
"wid _e _st": 3,
...
}
the alternative representation using ids could look like
{
...
"230 4 108": 3,
...
}
where $230$ maps to "wid", $4$ to "_e", and $108$ to "_st". Thus, every time a token pair gets merged into a new token, a new id gets created for that token. This approach has a couple of advantages. Firstly, memory management gets easier since integer values have a fixed size in bytes, whereas string tokens vary in size during learning as tokens get merged. Secondly, token ids are the "natural" input for most machine learning algorithms, incl. neural networks. It is therefore more efficient if the tokenizer directly outputs a list of ids. If needed, ids can always be decoded using the mapping between ids and string tokens maintained in the vocabulary.
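As a hypothetical sketch of this idea (the names token2id and id2token are assumptions for illustration, not part of this notebook's code), such a mapping can be built on top of the vocabulary and merges we trained above:
# Hypothetical sketch: map string tokens to integer ids and back (the ids depend on the trained vocabulary)
token2id = {token: idx for idx, token in enumerate(sorted(vocabulary))}
id2token = {idx: token for token, idx in token2id.items()}

# Encode a small document into ids and decode it back into text
ids = [token2id[t] for t in tokenize('newer lower', merges)]
print(ids)
print(detokenize([id2token[i] for i in ids]))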
Special characters: For our example implementation, we used the underscore _ as a special character to mark the continuation of a word. We saw that this is needed to reconstruct a list of tokens into a proper text. We already mentioned that the choice of _ was simply to ease the presentation, but we had to make the assumption that the training data does not contain underscores. In real-world text corpora, of course, underscores might very well occur. Therefore, practical WordPiece tokenizers favor characters or character sequences that are extremely unlikely to appear in a text document. For example, the common BERT tokenizer uses ## to mark the continuation of a word; whether a single character or a sequence of characters is used for this purpose does not affect the algorithm.
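For reference — and purely optional, since it is not used anywhere else in this notebook — the pre-trained BERT tokenizer from the Hugging Face transformers package (not among this notebook's requirements) uses exactly this ## convention; the exact splits depend on BERT's learned vocabulary, so the output may differ from what our from-scratch tokenizer would produce.
# Optional comparison (assumes the `transformers` package is installed; not required elsewhere in this notebook)
from transformers import BertTokenizer

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Continuation pieces are prefixed with "##"; the splits depend on BERT's learned vocabulary
print(bert_tokenizer.tokenize("unbelievable"))
print(bert_tokenizer.tokenize("tokenization"))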
Smart(er) pretokenization: WordPiece — as well as most other subword tokenization algorithms — requires a pretokenization into an initial list of tokens to initialize the corpus state. For English and many other languages, doing this by breaking up a text with respect to whitespace characters is a quick and simple approach — and it works, as we have seen throughout the notebook. However, it is very common in English that there is no whitespace between tokens of different categories. For example, there is no whitespace before punctuation marks, and no whitespace before/after a closing/opening parenthesis or quote character. Simple whitespace pretokenization therefore yields initial tokens that do not "belong together". For example, an initial corpus state might look like this:
{
...
"w _i _d _e _s _t _.": 30,
"w _i _d _e _s _t _?": 16,
"w _i _d _e _s _t _!": 9,
"w _i _d _e _s _t _,": 3,
"w _i _d _e _s _t _:": 5,
"w _i _d _e _s _t _;": 10,
...
}
While, in principle, WordPiece still works, it might lead to a suboptimal allocation of limited vocabulary slots. For example, with this corpus state, both "_st?" and "_st!" (and maybe others) might make it into the vocabulary although, from the perspective of the underlying word, both tokens represent the same suffix — wasting vocabulary slots and model capacity. To avoid this, practical WordPiece implementations use some smarter pretokenization to prevent the learning algorithm from merging across character categories. For example, with a pretokenizer that splits words from punctuation marks, our corpus state from above might look as follows:
{
...
"w _i _d _e _s _t": 73,
".": 10085,
"?": 3120,
"!": 5467,
",": 8985,
":": 1403,
";": 2050,
...
}
To give a concrete example, GPT-2 used the following Regular Expression to pretokenize input texts (note: the expression has been slightly adapted to fit its use in this notebook, and later GPT models use a more refined expression; here, it is only used to show an alternative to naive whitespace pretokenization).
gpt2pattern = regex.compile(r"""'s|'t|'re|'ve|'m|'ll|'d|\p{L}+|\p{N}+|[^\s\p{L}\p{N}]+""")
print(regex.findall(gpt2pattern, "Hello've world123 how's are you!!!? "))
['Hello', "'ve", 'world', '123', 'how', "'s", 'are', 'you', '!!!?']
From the output of the previous code cell, you can already see how the Regular Expression works; in simple terms, it splits an input text into tokens that are one of the following (a short example follows the list):
- from a predefined set of clitics ("'s", "'t", "'re", "'ve", "'m", "'ll", "'d"),
- a sequence of letters of arbitrary length,
- a sequence of digits of arbitrary length, or
- a sequence of anything but letters, digits, and whitespace characters of arbitrary length.
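For instance, applied to a word followed by punctuation — the situation shown in the hypothetical corpus state above — the pattern cleanly separates the word from the punctuation marks:
# Punctuation is split off into its own tokens instead of sticking to the preceding word
print(regex.findall(gpt2pattern, "the widest road, the widest river!"))
['the', 'widest', 'road', ',', 'the', 'widest', 'river', '!']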
It is obvious that this Regular Expression can be modified to potentially improve the pretokenization step further. However, none of these additions change the fundamental WordPiece learning and tokenization algorithms covered in this notebook. In fact, you can easily add many of these improvements yourself to the basic algorithm covered here.
Example Application¶
So far, we have run our WordPiece tokenizer implementation only over a very simple and "artificial" example document to better understand all the steps of the algorithm. Now let's use a larger document to see how our tokenizer performs. While in practice huge corpora are used to train a subword tokenizer such as WordPiece, here we limit ourselves to a single book to keep the training time in check. The result will still give us very interesting insights.
Revised WordPiece Tokenizer Implementation¶
For this application use case, we provide the class MyWordPieceTokenizer in the module src.text.preprocessing.tokenizing (imported at the top of this notebook). This class contains exactly the methods we used so far to train and use our tokenizer. However, incorporating all methods into this class allows for cleaner code and much easier usage — now that we understand how WordPiece works. This means we can train our WordPiece tokenizer with a single line of code. The code for the class contains only two minor changes:
- Instead of the underscore character to mark the continuation of a word — which kept the illustrations of how the algorithm works cleaner — we now use ## for this purpose (like the BERT tokenizer based on WordPiece).
- For pretokenization, it supports naive whitespace tokenization as well as the approach used by GPT-2 (see above).
Let's first do this for our example document before using the real-world document.
my_tokenizer_example = MyWordPieceTokenizer(pretokenize=MyWordPieceTokenizer.PRE_TOKENIZE__SPLIT).fit([doc], max_vocab_size=100, verbose=True)
Initilize corpus and vocabulary... Perform 87 iterations...
20%|█████████████████████▎ | 17/87 [00:00<00:00, 4181.76it/s]
The progress bar will stop before 100% if the value for max_vocab_size is large enough so that the loop will stop before the expected number of iterations has been reached. Recall, this happens if all possible token pairs have been merged, and therefore no further merge is possible. And this will happen very quickly with very small documents like our toy document.
As this is our small toy document, we can still look at the final corpus state, vocabulary and the list of merges.
print(f"Final corpus state:\n{json.dumps(my_tokenizer_example._corpus_state, indent=2)}\n")
print(f"Final vocabulary (size: {len(my_tokenizer_example._vocabulary)}):\n{my_tokenizer_example._vocabulary}\n ")
print(f"Final list of merges:\n{my_tokenizer_example._merges}")
Final corpus state:
{
"low": 5,
"longer": 1,
"lower": 2,
"widest": 3,
"newest": 6
}
Final vocabulary (size: 30):
{'##g', 'newest', '##st', 'ne', 'widest', 'wide', 'new', 'wi', 'long', 'w', 'lo', '##ng', 'longe', '##o', 'n', '##w', '##n', 'low', 'longer', '##r', '##t', 'newe', '##er', '##i', '##e', 'wid', '##d', 'l', '##s', 'lower'}
Final list of merges:
[('##n ##g', '##ng'), ('w ##i', 'wi'), ('wi ##d', 'wid'), ('l ##o', 'lo'), ('lo ##ng', 'long'), ('##s ##t', '##st'), ('lo ##w', 'low'), ('long ##e', 'longe'), ('longe ##r', 'longer'), ('n ##e', 'ne'), ('ne ##w', 'new'), ('wid ##e', 'wide'), ('##e ##r', '##er'), ('new ##e', 'newe'), ('low ##er', 'lower'), ('wide ##st', 'widest'), ('newe ##st', 'newest')]
Of course, when using the same value for max_vocab_size, the result should be exactly the same as seen before — apart from the different special characters.
Training using Real-World Data¶
For the training we will use content from Project Gutenberg. Project Gutenberg is a digital library that provides free access to a vast collection of public domain books and literary works. Founded by Michael S. Hart in 1971, it is one of the oldest digital libraries in existence, aiming to democratize access to literature and knowledge. The project offers over 60,000 eBooks, including classic novels, historical documents, and reference works, in a variety of formats such as plain text, HTML, and ePub, making them accessible across different devices. The initiative relies heavily on volunteers to digitize, proofread, and maintain its collection, ensuring that these works are preserved and made universally available. Since it focuses on texts that are no longer under copyright protection, Project Gutenberg plays a key role in keeping timeless literary and cultural works accessible to the public for free, fostering education and literacy worldwide.
The book of choice is Treasure Island by Robert Louis Stevenson. It is a classic adventure novel that tells the story of young Jim Hawkins and his journey to uncover buried pirate treasure. The tale begins when Jim discovers a mysterious map among the belongings of a deceased sailor at his family’s inn. The map leads to a hidden treasure on a distant island, and Jim joins an expedition to retrieve it, led by the noble Dr. Livesey and the eccentric Squire Trelawney. As the voyage unfolds, Jim realizes that not all the crew members can be trusted, particularly the cunning and charismatic Long John Silver, a one-legged cook with his own secret agenda. The story is packed with thrilling battles, daring escapes, and moments of betrayal and bravery, as Jim and his allies face off against mutinous pirates. Treasure Island is a timeless tale of adventure, exploration, and the moral complexities of greed and loyalty.
Let's first read the file into the variable book.
Your turn: You can download other/more materials from the Project Gutenberg website to expand the overall training corpus. Further down below, when you look at some example sentences that have been tokenized using our trained WordPiece tokenizer, you will notice some limitations when the training dataset is not large and diverse enough.
with open(treasure_island_book, "r") as file:
book = file.read().replace('\n', '').strip()
print(f"Number of characters: {len(book)}")
Number of characters: 375911
Using our own WordPiece tokenizer implementation, we can now run the learning algorithm using Treasure Island. Feel free to modify the pretokenization approach and maximum vocabulary size. Keep in mind that our implementation is not optimized for performance and this is not a tiny toy document. As such, the learning will take a couple of minutes.
my_tokenizer_book = MyWordPieceTokenizer(pretokenize=MyWordPieceTokenizer.PRE_TOKENIZE__GPT2).fit([book], max_vocab_size=20000, verbose=True)
Initilize corpus and vocabulary... Perform 19839 iterations...
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 19839/19839 [15:31<00:00, 21.29it/s]
Let's tokenize a couple of example sentences.
print(my_tokenizer_book.tokenize("There is still a lot of treasure buried on the island."))
print(my_tokenizer_book.tokenize("I've checked, my last shipment was delayed."))
print(my_tokenizer_book.tokenize("The captain and the lieutenant had a discussion."))
print(my_tokenizer_book.tokenize("The team members are Alice, John, Jim, and Bob."))
print(my_tokenizer_book.tokenize("I've checked, but I will check again."))
['Th', '##e', '##r', '##e', 'is', 'still', 'a', 'lot', 'of', 'treasur', '##e', 'buried', 'on', 'th', '##e', 'island', '.']
['I', "'", '##ve', 'check', '##e', '##d', ',', 'my', 'last', 'shipm', '##e', '##nt', 'was', 'd', '##e', '##lay', '##e', '##d', '.']
['Th', '##e', 'captain', 'and', 'th', '##e', 'li', '##e', '##ut', '##e', '##nant', 'had', 'a', 'discussion', '.']
['Th', '##e', 't', '##e', '##am', 'm', '##emb', '##e', '##rs', 'a', '##r', '##e', 'Al', '##ic', '##e', ',', 'John', ',', 'Jim', ',', 'and', 'B', '##ob', '.']
['I', "'", '##ve', 'check', '##e', '##d', ',', 'but', 'I', 'will', 'check', 'again', '.']
Assuming max_vocab_size=20000, we see that the token lists contain "full" words we would expect given that our training corpus is the book Treasure Island. For example, "John" and "Jim" are the names of characters in the book and therefore appear frequently. In contrast, the names "Alice" and "Bob" never appear in the book and are as such Out-of-Vocabulary (OOV) tokens, which are split into known subword tokens. The same is true for the two ranks "captain" (appears often in the book) and "lieutenant" (never appears in the book).
Summary¶
WordPiece is a subword tokenization algorithm widely used in natural language processing (NLP) to handle the challenges of out-of-vocabulary (OOV) words and create efficient representations of text. Unlike word-level tokenization, which struggles with rare or novel words, and character-level tokenization, which loses semantic context, WordPiece breaks words into smaller, meaningful subword units. It begins with an initial vocabulary of characters and iteratively merges the token pairs that maximize the likelihood of the training corpus until a specified vocabulary size is reached. This approach ensures that both frequent and rare words are effectively represented, making WordPiece a cornerstone of modern NLP systems like BERT and its variants.
WordPiece is crucial for enabling language models to generalize across diverse text inputs. By tokenizing words into known subunits, it allows models to process rare or unseen words without losing their semantic meaning. For example, a word like "unhappiness" might be split into "un", "happy", and "ness", capturing its structure and meaning. WordPiece is especially effective in multilingual and morphologically rich languages, where it can create shared subword representations across languages. Its applications include machine translation, sentiment analysis, question answering, and more, where robust tokenization is essential for high model performance. Its main advantages are:
- OOV handling: WordPiece effectively tokenizes rare or novel words into smaller, meaningful components, preventing issues with OOV words.
- Compact vocabulary: The algorithm creates a vocabulary that balances granularity and efficiency, reducing memory and computational requirements.
- Multilingual support: Shared subwords across languages make WordPiece well-suited for multilingual models like mBERT.
- Improved Generalization: By focusing on statistically significant subword units, WordPiece supports better model performance across domains.
However, WordPiece also has some limitations or potential problems, mainly:
- Computational overhead: The iterative merging process during training can be computationally expensive for large corpora.
- Over-fragmentation: Small vocabulary sizes can lead to overly fragmented tokenization, potentially obscuring word-level semantic meaning.
- Dependence on corpus: WordPiece relies heavily on the training corpus, making the vocabulary less adaptable to new domains or specialized datasets.
- Interpretability: Subword tokenization can result in less human-interpretable token sequences, particularly in text generation tasks.
WordPiece strikes a balance between character- and word-level tokenization, offering an efficient and effective solution for modern NLP challenges. Despite its limitations, its ability to handle OOV words, create compact vocabularies, and support multilingual processing has made it indispensable in the development of state-of-the-art NLP models. However, careful tuning of vocabulary size and corpus selection is essential to maximize its benefits while mitigating its drawbacks.