Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from GitHub.

Text Tokenization¶

Tokenization is the process of breaking down text into smaller parts called tokens. These tokens can be words, sentences, or even smaller units like characters, depending on the application. For example, the sentence "I love ice cream" can be split into tokens: "I", "love", "ice", and "cream". By breaking the text into these manageable pieces, computers can better understand and process it. The purpose of tokenization in natural language processing (NLP) is to prepare text for further analysis, such as identifying patterns, translating languages, or generating summaries. Since computers do not understand text the way humans do, tokenization helps by converting unstructured text into a structured format that algorithms can work with. This is a critical first step for tasks like text classification, sentiment analysis, chatbot development, and more.

Note: The task of tokenization and the challenges involved are language-dependent. In the following, to keep it simple, we introduce the ideas behind tokenization and common approaches in the context of the English language. However, the notebook concludes with a discussion highlighting the differences to other languages.

Setting up the Notebook¶

Make Required Imports¶

This notebook requires the import of different Python packages but also additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been imported successfully.

In [1]:
from src.utils.libimports.tokenizing import *

Motivation: From Strings to Tokens¶

Text data is one of the most common forms of data. On the internet, a significant portion of content exists as text, including web pages, social media posts, emails, product reviews, blogs, and news articles. This prevalence is due to the ease of generating, sharing, and storing text-based information compared to other formats like video or audio. In companies, text data is equally pervasive, taking the form of emails, customer support chats, reports, documentation, surveys, and user-generated content such as feedback or reviews. Many business operations, like marketing, customer service, and decision-making, rely heavily on analyzing and extracting insights from this text data. As a result, text data is a valuable resource for both understanding user behavior and driving strategic decisions. When working with text data, there are two main perspectives: seeing text as a string or seeing text as written language.

Text as String¶

The most basic way to treat a text is as a string. A string is a sequence of characters, where a character is a single unit of text, such as a letter, number, symbol, or whitespace. Strings are a basic data type in most programming languages and database systems for storing and manipulating text data. For example, the sentence "Alice gave the book to Bob." can be considered as the following sequence of characters:

A|l|i|c|e| |g|a|v|e| |t|h|e| |b|o|o|k| |t|o| |B|o|b|.|

Strings, as a sequence of characters, have no intrinsic notion of words, phrases, sentences, etc. This greatly limits a direct analysis of the data for any higher-level tasks or applications such as text classification, sentiment analysis, machine translation, question answering, etc. Any meaningful steps when working with basic strings are generally limited to basic pattern matching, e.g., to see if a text contains a certain substring or substring pattern. A common method for substring pattern matching is the use of Regular Expressions. For illustration, the code cell below shows an example of using a simple Regular Expression to find email addresses in a text (i.e., a string). The findall() method in Python's re module searches a string for all occurrences of a pattern specified by a regular expression and returns them as a list. It is commonly used for extracting multiple occurrences of patterns, such as phone numbers, email addresses, or words, from a string.

In [2]:
text = "After I got too many spam email to my robert@example.org address, I'm now using bobby@example.org for my correspondence."

# Simple(!) RegEx to match email addresses
email_regex = r"[\w.-]+@[\w.-]+\.[\w]{2,}"

# Find all email addresses
email_addresses = re.findall(email_regex, text)

print(email_addresses)
['robert@example.org', 'bobby@example.org']

Side note: The Regular Expression used in the code cell above to match email addresses is very simple. On the one hand, this expression will not match email addresses containing special characters beyond . and - that are valid according to the official email standard. On the other hand, the expression will match substrings that are not valid email addresses, e.g., ---@---.aaa. You can edit the example text in the code cell above to see which substrings the Regular Expression will or will not match.

While substring (pattern) matching is very useful for many applications (in fact, Regular Expressions are commonly used for tokenization), strings do not capture the meaning or context of words or sentences required for most natural language processing tasks.
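Because of this, Regular Expressions are nevertheless a quick way to build a very rough word-level tokenizer. The snippet below is a minimal sketch (using nothing beyond Python's standard re module): it matches either runs of word characters or single non-whitespace punctuation marks.

In [ ]:
import re

text = "Alice gave the book to Bob."

# A minimal regex-based tokenizer: runs of word characters, or a single
# non-whitespace, non-word character (punctuation).
tokens = re.findall(r"\w+|[^\w\s]", text)

print(tokens)
# Expected: ['Alice', 'gave', 'the', 'book', 'to', 'Bob', '.']

Note that even this simple pattern already hints at the pitfalls discussed later: a contraction like "isn't" would be split into the three tokens "isn", "'", and "t".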

Text as Written Language¶

Natural language refers to the way humans communicate with each other to express and share our thoughts, feelings, opinions, ideas, etc. It is called "natural" because it develops organically over time, shaped by culture, society, and human interaction, rather than being deliberately designed like programming or formal languages. While natural language does have many rules, these rules have emerged over time instead of being defined a priori. Text or writing is simply a visual representation of natural language, and a writing system is an agreed-upon meaning behind the sets of characters that make up a text (most importantly: letters, digits, punctuation, and white space characters such as space, tab, and new line).

One of the most fundamental building blocks of natural language are words. A word is a basic unit of language that has meaning and can be spoken, written, or signed. Words are made up of one or more letters and are used to express ideas, describe things, or communicate actions. For example, "cat," "run," and "happy" are all words, each representing something specific: an animal, an action, and a feeling. Words can stand alone or combine with other words to form phrases and sentences, allowing us to convey more complex thoughts. They follow certain rules of grammar depending on the language being used. While words are arguably the most important building blocks, in natural language processing we generally use the term token to also include punctuation, numbers, white space, or any other special characters.

The Task of Tokenization¶

The purpose of tokenization is to convert a text from a sequence of characters (i.e., a string) into a sequence of tokens for further processing. In other words, we want to split a string into meaningful blocks or substrings we call tokens. Which types of substrings are considered tokens depends on the exact tokenization approach. For example, tokens might be individual characters, words, or anything in between (i.e., so-called subwords). A more detailed overview of these fundamental approaches is given below. For basically all natural language processing tasks, tokenization is one of the most common and fundamental steps. It is also often the first step to be performed, which makes it very important to get it right.

The result of tokenization is not only the sequence of tokens but also, at least implicitly, the vocabulary. The vocabulary is simply the set of unique tokens across a whole tokenized text corpus. The information about the vocabulary is important since many models used in Natural Language Processing assume a fixed vocabulary size. For example, a text classifier trained on some text corpus only "knows" the tokens the model has seen during training. However, a new unseen document for which the model is expected to predict a class label might contain one or more unknown tokens. Such unknown tokens that were not present in the vocabulary during training are referred to as OOV tokens (short for "Out-of-Vocabulary tokens"). A deeper discussion of the effect of OOV tokens and common strategies to handle them is beyond the scope of this notebook. However, the vocabulary and its size are important for many NLP tasks, and different tokenization strategies typically yield very different types of vocabularies, as we will see next.
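To make the notions of vocabulary and OOV tokens more concrete, here is a minimal sketch. It uses plain whitespace splitting as a stand-in for proper tokenization and a tiny made-up corpus, so the numbers are purely illustrative.

In [ ]:
# Build a vocabulary from a tiny "training" corpus and check a new
# document for out-of-vocabulary (OOV) tokens.
train_docs = ["the cat sat on the mat", "the dog sat on the rug"]
vocabulary = set(token for doc in train_docs for token in doc.split())

new_doc = "the cat sat on the sofa"
oov_tokens = [token for token in new_doc.split() if token not in vocabulary]

print(f"Vocabulary size: {len(vocabulary)}")  # 7 unique tokens
print(f"OOV tokens: {oov_tokens}")            # ['sofa']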


Overview of Basic Approaches¶

Character Tokenization¶

Character tokenization considers individual characters as tokens. In other words, this technique breaks down a sequence of characters simply into its individual characters. For example, the word "tokenization" would be tokenized into ["t", "o", "k", "e", "n", "i", "z", "a", "t", "i", "o", "n"]. As there are no other considerations, character tokenization is very simple to implement. For example, in Python, the easiest way to accomplish this is to use the built-in list() function, which directly converts each character in the string to a list element.

In [3]:
text = "This is great"

tokens = list(text)

print(tokens)
['T', 'h', 'i', 's', ' ', 'i', 's', ' ', 'g', 'r', 'e', 'a', 't']

An alternative method is to use a Regular Expression with the dot (.), a special character that acts as a wildcard and matches any single character except a newline (\n) by default. As shown before, we can use the findall() method of the re module to match all characters, which are returned as a list of matches.

In [4]:
tokens = re.findall(r".", text)

print(tokens)
['T', 'h', 'i', 's', ' ', 'i', 's', ' ', 'g', 'r', 'e', 'a', 't']

Apart from its straightforward implementation, character tokenization has several advantages that make it useful for certain tasks in Natural Language Processing (NLP). One major benefit is its simplicity and flexibility: instead of breaking text into words or subwords, it splits text into individual characters, which makes it language-agnostic and especially helpful for languages without clear word boundaries, such as Chinese or Japanese. It can also handle uncommon words, typos, and unconventional spellings effectively, since every possible word or symbol can be represented as a sequence of characters. Another advantage is that character tokenization avoids the out-of-vocabulary (OOV) problem, because it works with a small, fixed set of characters (letters, numbers, punctuation, and so on). This makes it particularly useful in tasks where new or rare words are common, such as when processing names, URLs, or code. Additionally, the smaller vocabulary size can simplify model training and storage requirements. However, while character tokenization is powerful, it typically requires more sophisticated models to learn meaningful patterns over longer sequences of characters.

Character tokenization, while flexible, has some significant disadvantages that can make it challenging to use effectively in Natural Language Processing (NLP). One major drawback is that it produces much longer sequences compared to word or subword tokenization. For example, a single word like "understanding" would become 13 separate tokens (one for each character). This increased sequence length means the model has to process more steps, which requires more memory and computational power. As a result, training and inference become slower and more resource-intensive. Another disadvantage is that characters alone carry very little semantic meaning. Unlike words or subwords, which often have clear meanings or associations, individual characters do not provide much information on their own. This makes it harder for the model to understand context or relationships between tokens. For example, understanding the word "run" from its characters ("r", "u", "n") requires the model to learn complex patterns across sequences of characters, which can make training more difficult and time-consuming. Additionally, character tokenization may struggle with languages that rely heavily on word boundaries or where context is crucial for meaning.
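The blow-up in sequence length is easy to quantify. The short sketch below simply compares the number of character tokens with the number of whitespace-separated words for the same sentence.

In [ ]:
sentence = "Character tokenization produces much longer sequences."

# Character tokens vs. (naive) word tokens for the same sentence
char_tokens = list(sentence)
word_tokens = sentence.split()

print(f"Characters: {len(char_tokens)} tokens")  # 54 tokens
print(f"Words:      {len(word_tokens)} tokens")  # 6 tokens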

Word Tokenization¶

In simple terms, word tokenization treats each individual word as its own token, which was and still is one of the most common approaches to tokenize text. Even subword tokenization methods such as Byte-Pair Encoding involve some pretokenization step which is often done using basic word tokenization. Let's first create a list of sentences that will form our example document; some of the examples mimic social media content which is often not well-formed common English.

In [5]:
sentences = ["Text processing with Python is great.", 
             "It isn't (very) complicated to get started.",
             "However,careful to...you know....avoid mistakes.",
             "Contact me at alice@example.org; see http://example.org.",
             "This is so cooool #nlprocks :))) :-P <3."]

To form the document, we can use the built-in join() method to concatenate all sentences using a white space as a separator.

In [6]:
document = ' '.join(sentences)

# Print the document to see if everything looks alright
print (document)
Text processing with Python is great. It isn't (very) complicated to get started. However,careful to...you know....avoid mistakes. Contact me at alice@example.org; see http://example.org. This is so cooool #nlprocks :))) :-P <3.

Word Tokenization with NLTK¶

NLTK (Natural Language Toolkit) provides multiple tokenizer implementations because tokenization is a fundamental yet complex task in natural language processing (NLP), and different tokenization methods are better suited to different types of text and use cases. Each tokenizer in NLTK is designed to handle specific challenges in text processing, such as punctuation, contractions, sentence boundaries, or domain-specific requirements. By offering various tokenizers, NLTK ensures flexibility and adaptability for users working with diverse languages, genres, and text formats.

For example, the TreebankWordTokenizer is ideal for tokenizing English text according to the Penn Treebank's guidelines, handling punctuation and contractions accurately. On the other hand, the PunktSentenceTokenizer is specifically designed to split text into sentences using a statistical model, making it highly effective for languages with complex sentence structures. Other tokenizers, like the RegexpTokenizer, allow users to define custom tokenization rules using regular expressions, giving them full control over the tokenization process. The existence of these different implementations allows users to select the most appropriate tokenizer based on the characteristics of the input text and the specific requirements of their NLP tasks. Furthermore, language and context can influence the choice of tokenizer. For instance, tokenizers like WordPunctTokenizer can split text into words and punctuation marks, making it suitable for applications where punctuation plays a significant role. NLTK's wide range of tokenizers caters to various needs, from simple word tokenization to more sophisticated sentence and subword segmentation, ensuring that users can perform tokenization effectively, regardless of the complexity or domain of the text.

PunktSentenceTokenizer¶

The PunktSentenceTokenizer is a pre-trained tokenizer used to split a text into sentences. It is based on the Punkt sentence segmentation algorithm, which is a statistical model for sentence boundary detection. The model works by analyzing patterns in the text, such as punctuation marks and capitalization, to determine where sentences start and end.

In [7]:
sentence_tokenizer = PunktSentenceTokenizer()

# The tokenize() method returns a list containing the sentences
sentences_alt = sentence_tokenizer.tokenize(document)

# Loop over all sentences and print each sentence
for s in sentences_alt:
    print (s)
Text processing with Python is great.
It isn't (very) complicated to get started.
However,careful to...you know....avoid mistakes.
Contact me at alice@example.org; see http://example.org.
This is so cooool #nlprocks :))) :-P <3.

While this seems like a trivial task, note that the period character (.) in particular has many uses beyond marking the end of a sentence. For example, the period may be used in URLs or email addresses, as part of ellipses (...), or as a decimal point in numbers. Furthermore, simple rules such as "a period marks the end of a sentence if it is followed by a white space character and the next letter is a capital letter" may often fail in practice, as there are many exceptions and users might have typed sentences that do not adhere to proper orthographic rules.
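As a small experiment, you can feed the sentence tokenizer from above a text in which periods also appear in abbreviations and decimal numbers; depending on the tokenizer's parameters, such cases may or may not be split correctly.

In [ ]:
# Periods in abbreviations and decimals make sentence splitting ambiguous
tricky = "Dr. Smith paid $3.50 for a coffee. Then she left."

for s in sentence_tokenizer.tokenize(tricky):
    print(s)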

NLTKWordTokenizer¶

The NLTKWordTokenizer is specifically designed to handle common tokenization challenges, such as punctuation, contractions, and special characters. For example, it can correctly separate punctuation marks (e.g., splitting "Hello, world!" into ["Hello", ",", "world", "!"]) and handle contractions (e.g., splitting "don't" into ["do", "n't"]). Internally, the NLTKWordTokenizer uses regular expressions to define rules for splitting text. It is also the default tokenizer used by the method word_tokenize().

In [8]:
nltk_word_tokenizer = NLTKWordTokenizer()

print ("Output of NLTKWordTokenizer:")
for s in sentences:
    print (nltk_word_tokenizer.tokenize(s))
print()
print ("Output of the word_tokenize() method:")
for s in sentences:
    print (word_tokenize(s)) 
Output of NLTKWordTokenizer:
['Text', 'processing', 'with', 'Python', 'is', 'great', '.']
['It', 'is', "n't", '(', 'very', ')', 'complicated', 'to', 'get', 'started', '.']
['However', ',', 'careful', 'to', '...', 'you', 'know', '....', 'avoid', 'mistakes', '.']
['Contact', 'me', 'at', 'alice', '@', 'example.org', ';', 'see', 'http', ':', '//example.org', '.']
['This', 'is', 'so', 'cooool', '#', 'nlprocks', ':', ')', ')', ')', ':', '-P', '<', '3', '.']

Output of the word_tokenize() method:
['Text', 'processing', 'with', 'Python', 'is', 'great', '.']
['It', 'is', "n't", '(', 'very', ')', 'complicated', 'to', 'get', 'started', '.']
['However', ',', 'careful', 'to', '...', 'you', 'know', '....', 'avoid', 'mistakes', '.']
['Contact', 'me', 'at', 'alice', '@', 'example.org', ';', 'see', 'http', ':', '//example.org', '.']
['This', 'is', 'so', 'cooool', '#', 'nlprocks', ':', ')', ')', ')', ':', '-P', '<', '3', '.']

Both outputs are the same, since the word_tokenize() method is just a wrapper around the NLTKWordTokenizer to simplify the coding. In general, this tokenizer performs well on well-formed English text. However, it performs rather poorly for non-standard tokens such as URLs, email addresses, hashtags, emoticons, etc. This means that this tokenizer might not be the best choice for user-generated text data such as social media or forum posts.

TreebankWordTokenizer¶

The TreebankWordTokenizer is designed to tokenize text according to the conventions of the Penn Treebank. The Penn Treebank is a widely used corpus of annotated English text that has been extensively used in natural language processing research. The TreebankWordTokenizer tokenizes text by following the rules and conventions defined in the Penn Treebank. It splits text into words and punctuation marks while considering specific cases such as contractions, hyphenated words, and punctuation attached to words. For a long time, it was the default word tokenizer of NLTK; the NLTKWordTokenizer shown above is an improved variant of it. This tokenizer is commonly used for tasks that rely on the Penn Treebank tokenization conventions, such as training and evaluating language models, part-of-speech tagging, syntactic parsing, and other NLP tasks that benefit from consistent tokenization based on the Penn Treebank guidelines.

In [9]:
treebank_tokenizer = TreebankWordTokenizer()

print ("Output of TreebankWordTokenizer:")
for s in sentences:
    print (treebank_tokenizer.tokenize(s))
Output of TreebankWordTokenizer:
['Text', 'processing', 'with', 'Python', 'is', 'great', '.']
['It', 'is', "n't", '(', 'very', ')', 'complicated', 'to', 'get', 'started', '.']
['However', ',', 'careful', 'to', '...', 'you', 'know', '...', '.avoid', 'mistakes', '.']
['Contact', 'me', 'at', 'alice', '@', 'example.org', ';', 'see', 'http', ':', '//example.org', '.']
['This', 'is', 'so', 'cooool', '#', 'nlprocks', ':', ')', ')', ')', ':', '-P', '<', '3', '.']

Notice how the tokenizer handles the ellipsis (...) correctly in the first case but fails in the second case, since it assumes that an ellipsis is composed of exactly 3 dots; the run of 4 dots is therefore not handled properly. Like the NLTKWordTokenizer, the TreebankWordTokenizer also has trouble with non-standard tokens (URLs, email addresses, hashtags, emoticons, etc.).

TweetTokenizer¶

The TweetTokenizer is a specific tokenizer designed for tokenizing tweets or other social media text. It is tailored to handle the unique characteristics and conventions often found in tweets, such as hashtags, user mentions, emoticons, and URLs. It offers additional functionality compared to general-purpose tokenizers. It takes into account the specific structures and symbols commonly used in tweets, allowing for more accurate and context-aware tokenization of social media text. It recognizes and tokenizes hashtags, user mentions (starting with "@"), URLs, emoticons, and other patterns commonly found in tweets, providing a more fine-grained tokenization approach for analyzing social media text.

In [10]:
tweet_tokenizer = TweetTokenizer()

print ("Output of TweetTokenizer:")
for s in sentences:
    print (tweet_tokenizer.tokenize(s))
Output of TweetTokenizer:
['Text', 'processing', 'with', 'Python', 'is', 'great', '.']
['It', "isn't", '(', 'very', ')', 'complicated', 'to', 'get', 'started', '.']
['However', ',', 'careful', 'to', '...', 'you', 'know', '...', 'avoid', 'mistakes', '.']
['Contact', 'me', 'at', 'alice@example.org', ';', 'see', 'http://example.org', '.']
['This', 'is', 'so', 'cooool', '#nlprocks', ':)', ')', ')', ':-P', '<3', '.']

The TweetTokenizer recognizes (most) URLs, email addresses, hashtags, and common emoticons as their own tokens. It does, however, fail for less common emoticons such as :))). The problem is that this is not the "official version" of the emoticon (which is :) or :-)) but uses multiple "mouths" to emphasize the expressed sentiment or feeling. This indicates that the TweetTokenizer uses some form of predefined dictionary for identifying emoticons. However, if a subsequent analysis does not really depend on it, some extra ) tokens are no big deal in many cases. For example, in the case of sentiment analysis, if it only matters whether an emoticon is positive or negative, the tokenizer result above would be sufficient.
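Note that the TweetTokenizer also accepts a few options that can help normalize social media text: for example, reduce_len shortens runs of more than three repeated characters (so different spellings of "cooool" collapse onto fewer variants), and strip_handles removes user mentions such as "@alice". The snippet below shows these options on a made-up tweet.

In [ ]:
# TweetTokenizer with normalization options enabled
tweet_tokenizer_norm = TweetTokenizer(reduce_len=True, strip_handles=True)

print(tweet_tokenizer_norm.tokenize("@alice this is sooooo cooool #nlprocks :)))"))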

RegexpTokenizer¶

The RegexpTokenizer is a customizable tokenizer that uses regular expressions to split text into tokens based on specified patterns. It allows you to define a regular expression pattern that matches the desired token boundaries. It tokenizes text by identifying substrings that match the specified pattern and separating them into individual tokens. You can customize the regular expression pattern according to your specific tokenization requirements. For example, if you want to tokenize based on specific characters, you can modify the pattern accordingly. Additionally, you can use character classes, quantifiers, and other regular expression constructs to define more complex tokenization patterns. The RegexpTokenizer provides flexibility and fine-grained control over the tokenization process. It allows you to adapt the tokenizer to the specific needs of your text data and the requirements of your NLP task. By defining appropriate regular expression patterns, you can tokenize text in a way that suits your specific use case, such as handling specialized domains, custom abbreviations, or other text patterns.

Your turn: In the code below, try different patterns to see how it affects the tokenization results. Apart from trying the different patterns given, you can also write your own pattern to get even better results.

In [11]:
#pattern = r'\w+' # all alphanumeric words (word characters, incl. digits and underscore)
#pattern = '[a-zA-Z]+' # alphabetic words only (no digits)
pattern = '[a-zA-Z\']+' # alphabetic words only, but keep contractions

regexp_tokenizer = RegexpTokenizer(pattern)

print (f"Output of RegexpTokenizer for pattern {pattern}:")
for s in sentences:
    print (regexp_tokenizer.tokenize(s))
Output of RegexpTokenizer for pattern [a-zA-Z']+:
['Text', 'processing', 'with', 'Python', 'is', 'great']
['It', "isn't", 'very', 'complicated', 'to', 'get', 'started']
['However', 'careful', 'to', 'you', 'know', 'avoid', 'mistakes']
['Contact', 'me', 'at', 'alice', 'example', 'org', 'see', 'http', 'example', 'org']
['This', 'is', 'so', 'cooool', 'nlprocks', 'P']

Word Tokenization with spaCy¶

The spaCy tokenizer is a highly efficient and accurate tokenizer used in the spaCy NLP library, designed to split text into words, punctuation, and other meaningful units. What makes spaCy's tokenizer special is its combination of speed, flexibility, and linguistic accuracy, providing an advanced tokenization system that works well for multiple languages and complex texts. In contrast to the statistical components of spaCy's pipeline, the tokenizer itself is rule-based: it combines language-specific rules for handling punctuation, contractions, and special cases like email addresses, URLs, or numbers with exception lists that allow it to accurately handle edge cases such as abbreviations, hyphenated words, and emoticons. Here's a general overview of how the tokenizer in spaCy works:

  • Whitespace Splitting: The tokenizer initially splits the text on whitespace characters (e.g., spaces, tabs, newlines) to create token candidates.

  • Prefix Rules: The tokenizer applies a set of prefix rules to identify and split off leading punctuation marks, such as opening quotation marks or brackets. For example, the tokenizer would split "(Hello" into two tokens, "(" and "Hello".

  • Suffix Rules: Similarly, the tokenizer applies suffix rules to identify and split off trailing punctuation marks, such as periods or closing quotation marks. For example, it would split "example." into two tokens, "example" and ".".

  • Infixes: The tokenizer then looks for infixes, which are sequences of characters that appear within a word. It uses rules to determine where to split these infixes, typically when they indicate word boundaries. For instance, hyphens or dashes inside a word can be split off as separate tokens.

  • Special Cases: spaCy's tokenizer handles special cases, such as contractions, abbreviations, emoticons, or currency symbols, where the standard rules might not apply. It uses language-specific knowledge and a customizable list of special case rules to tokenize these instances correctly.

  • Tokenization Exceptions: spaCy provides a mechanism to define exceptions to the tokenization rules using custom tokenization patterns. This allows users to override the default behavior and handle specific cases according to their needs.

  • Post-processing: After applying the tokenization rules, the tokenizer performs additional post-processing steps. This may involve removing leading or trailing white spaces, normalizing Unicode characters, or applying language-specific transformations.

The figure below illustrates the tokenization process of applying the different rules and heuristics. This image is taken directly from the spaCy website:

spaCy's tokenizer is designed to be highly customizable and can be adjusted to accommodate specific domain requirements or languages. It forms the foundation for many subsequent natural language processing tasks, such as part-of-speech tagging, named entity recognition, and dependency parsing. In contrast to NLTK, the common usage of spaCy is to process a string with a complete pipeline, which not only performs tokenization but also other steps (see a later tutorial). Here, we only look at the tokens. Again, we process each sentence individually to simplify the output.

In [12]:
print ("Output of spaCy tokenizer:")
for s in sentences:
    token_list = [ token.text for token in nlp(s) ] 
    print (token_list)
Output of spaCy tokenizer:
['Text', 'processing', 'with', 'Python', 'is', 'great', '.']
['It', 'is', "n't", '(', 'very', ')', 'complicated', 'to', 'get', 'started', '.']
['However', ',', 'careful', 'to', '...', 'you', 'know', '....', 'avoid', 'mistakes', '.']
['Contact', 'me', 'at', 'alice@example.org', ';', 'see', 'http://example.org', '.']
['This', 'is', 'so', 'cooool', '#', 'nlprocks', ':)))', ':-P', '<3', '.']

spaCy does a bit better with the uncommon emoticon, but it splits the hashtag. However, spaCy allows you to customize the rules and heuristics to better suit your specific requirements, typically given by your task or application, as illustrated below.
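For example, spaCy's add_special_case() method registers a tokenizer exception. The sketch below keeps the hashtag #nlprocks as a single token; note that a special case only matches the exact string given, so handling hashtags in general would instead require adjusting the tokenizer's prefix rules.

In [ ]:
from spacy.symbols import ORTH

# Register a tokenizer exception so this exact hashtag stays one token
nlp.tokenizer.add_special_case("#nlprocks", [{ORTH: "#nlprocks"}])

print([token.text for token in nlp("This is so cooool #nlprocks :))) :-P <3.")])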

Discussion¶

Word tokenization is a well-established task, particularly for popular languages such as English, German, French, Spanish, Chinese, and so on. One of the primary benefits of word tokenization is that it aligns closely with the natural linguistic structure of most languages. Since words are the fundamental units of meaning in human communication, tokenizing by words preserves the semantic integrity of the text. Another advantage of word tokenization is its simplicity and efficiency. It results in fewer tokens compared to character tokenization, which splits words into their individual characters, or subword tokenization, which might break words into smaller subword units. Since fewer tokens are generated, word tokenization leads to shorter input sequences, which translates into less computational overhead for the model during both training and inference. Compared to subword tokenization, which may break words into smaller units (like prefixes, suffixes, or parts of a word), word tokenization avoids the potential loss of meaning when breaking down a word into pieces. While subword tokenization is useful for handling out-of-vocabulary (OOV) words, word tokenization ensures that the model can directly work with the entire word as a meaningful token. This can lead to faster training times since the model does not need to learn the combinations of subword units but rather the full word itself. It also reduces the complexity of the model, as it does not need to learn the intricacies of word formation through smaller components.

While word tokenization is effective in many cases, it does have several disadvantages. One of the main drawbacks is that the resulting vocabulary after tokenizing a text corpus can be very large. For example, the number of unique words/tokens that appear in at least 3 Wikipedia articles is estimated to be around 2.75 million. Even when considering only the most frequent tokens, vocabulary sizes of several tens or hundreds of thousands of tokens are very common. Word tokenization is also less flexible in handling morphologically rich languages, where a single word may appear in multiple variations (e.g., "run", "running", "ran" or "houses", "house's", "house"). Word tokenization struggles to capture these variations, leading to an inflated vocabulary and potential inefficiency, especially if the model needs to account for every possible form of a word. This also increases the risk of OOV tokens when using models trained on word-tokenized text. Lastly, word tokenization also has limitations in languages that lack clear word boundaries or have ambiguous segmentation, such as Chinese or Japanese, where words are not separated by spaces.
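The effect of morphological variants on a word-level vocabulary is easy to demonstrate with NLTK's word_tokenize() from above: in the small example below, "run", "running", and "ran" all end up as separate vocabulary entries, even though they are forms of the same verb.

In [ ]:
corpus = "I run daily. She is running now. Yesterday they ran together."

# Each inflected form becomes its own vocabulary entry
word_vocab = set(word_tokenize(corpus.lower()))

print(sorted(word_vocab))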

Subword Tokenization¶

The basic idea behind subword tokenization is to break text into smaller, meaningful units that are larger than individual characters but smaller than entire words. It aims to find a balance between word tokenization (which splits text into complete words) and character tokenization (which splits text into individual characters). Subword tokenization focuses on splitting words into smaller pieces, such as prefixes, suffixes, or commonly occurring subword units, which can be combined to form a complete word. This technique allows models to handle a wider variety of words while keeping the vocabulary size manageable.

The main characteristic of subword tokenization, compared to character and word tokenization, is that the vocabulary and where to split the string are learned from a large training corpus, instead of relying on simple pattern matching or predefined rules. In a nutshell, the more frequently a substring appears in the corpus, the more likely it is to be considered a token in the final vocabulary. There exists a wide range of subword tokenization techniques; here is a brief overview:

  • Byte Pair Encoding (BPE): BPE starts with a vocabulary of all individual characters that occur in the training corpus. It then iteratively merges the most frequent pairs of tokens (starting with characters) into new subword units until a desired vocabulary size is reached. This means that BPE allows you to specify the size of the final vocabulary, and it can handle rare and unseen words (i.e., OOV tokens) by splitting them into subwords (a toy sketch of the merge loop is shown after the summary table below).

  • WordPiece: WordPiece works similarly to BPE, but instead of merging pairs based on frequency, it selects merges that maximize the likelihood of the training corpus. It is therefore more suited for likelihood-based language modeling and widely used in pretrained models like BERT. For example, WordPiece might split "playing" into "play" and "##ing" (where "##" indicates a continuation subword).

  • Unigram Language Model (ULM): In contrast to BPE or WordPiece, ULM initializes its base vocabulary with a large number of symbols and progressively trims it down by removing symbols until the desired vocabulary size is reached. The base vocabulary could, for instance, correspond to all pre-tokenized words and the most common substrings. Unigram is not used directly for any of the models in the transformers library, but it is used in conjunction with SentencePiece.

  • SentencePiece: SentencePiece is a text tokenizer and detokenizer that implements subword tokenization without requiring prior word-based segmentation. SentencePiece works by applying one of two main algorithms: Byte Pair Encoding (BPE) or the Unigram Language Model. Both methods iteratively build a subword vocabulary optimized for the dataset. The Unigram Language Model, for example, assigns probabilities to subwords and prunes less probable ones to maximize the likelihood of the data. This makes it highly versatile for multilingual tasks and robust in handling rare or out-of-vocabulary words.

  • Morpheme-Based Tokenization: Morpheme-Based Tokenization splits words into their smallest meaningful linguistic units called morphemes. This approach is particularly beneficial for morphologically rich languages like Turkish, Finnish, or Arabic, where words often contain multiple morphemes that express grammatical and semantic information. Morpheme-based tokenization typically involves linguistic analysis and relies on morphological parsers or rule-based systems. For example, the word "unhappiness" might be split into un- (prefix), happy (root), and -ness (suffix). This decomposition enables NLP models to capture the underlying semantics and grammar more effectively. While accurate, morpheme-based tokenization can be computationally intensive and may require specialized resources or language-specific rules, making it less common in general-purpose NLP systems compared to subword tokenization methods like BPE.

Technique       | Key Feature                | Common Usage
BPE             | Frequency-based merging    | GPT, Transformers
WordPiece       | Likelihood-based merging   | BERT, XLNet
Unigram         | Probabilistic model        | T5, SentencePiece
SentencePiece   | No pre-tokenization needed | Multilingual models
Morpheme-Based  | Linguistic insights        | Morphologically rich languages
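To make the idea behind BPE more concrete, here is a toy sketch of its merge loop (not a production implementation; the word frequencies are made up). Words are represented as tuples of symbols, and in each iteration the most frequent adjacent pair of symbols is merged into a new subword unit.

In [ ]:
from collections import Counter

# Toy corpus: each word is a tuple of symbols with an (invented) frequency
word_freqs = {("l", "o", "w"): 5, ("l", "o", "w", "e", "r"): 2,
              ("n", "e", "w", "e", "s", "t"): 6, ("w", "i", "d", "e", "s", "t"): 3}

def most_frequent_pair(word_freqs):
    # Count how often each pair of adjacent symbols occurs in the corpus
    pair_counts = Counter()
    for word, freq in word_freqs.items():
        for pair in zip(word, word[1:]):
            pair_counts[pair] += freq
    return pair_counts.most_common(1)[0][0]

def merge_pair(pair, word_freqs):
    # Replace every occurrence of the pair by a single merged symbol
    merged = {}
    for word, freq in word_freqs.items():
        new_word, i = [], 0
        while i < len(word):
            if i < len(word) - 1 and (word[i], word[i + 1]) == pair:
                new_word.append(word[i] + word[i + 1])
                i += 2
            else:
                new_word.append(word[i])
                i += 1
        merged[tuple(new_word)] = freq
    return merged

for step in range(5):
    pair = most_frequent_pair(word_freqs)
    word_freqs = merge_pair(pair, word_freqs)
    print(f"Merge {step + 1}: {pair[0]} + {pair[1]}")

Every merged pair becomes an entry of the subword vocabulary; a real BPE tokenizer simply keeps merging until the vocabulary reaches the desired size.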

To be effective, subword tokenizers require training on a large text corpus. Many such pretrained tokenizer models are publicly available and can be used with very little code. The AutoTokenizer class in the Hugging Face transformers library is a versatile and user-friendly utility that automatically selects the appropriate tokenizer for a given pre-trained model.

In [13]:
# Choose pretrained tokenizer model
tokenizer_model_name = "gpt2" # BPE
#tokenizer_model_name = "bert-base-uncased" # WordPiece
#tokenizer_model_name = "t5-small" # SentencePiece

# Load a pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name)

Once the tokenizer model is loaded, it can be used to tokenize any text according to its learned vocabulary. The default output of the tokenizer is not the list of tokens but their respective ids according to the vocabulary. This is more practical, as these sequences of ids can directly serve as input for machine learning models that expect numerical inputs (which is the case for most models, including neural networks). To convert those ids to actual readable tokens, we can use the convert_ids_to_tokens() method. The code cell below shows a simple example:

In [14]:
# Encode text into token IDs
encoded = tokenizer("Most fossils of Tyrannosaurus rex fall into the Cretaceous period.")

print(f"Token IDs: {encoded['input_ids']}")

print(f"Tokens: {tokenizer.convert_ids_to_tokens(encoded['input_ids'])}")
Token IDs: [6943, 34066, 286, 38401, 47650, 302, 87, 2121, 656, 262, 327, 1186, 37797, 2278, 13]
Tokens: ['Most', 'Ġfossils', 'Ġof', 'ĠTyrann', 'osaurus', 'Ġre', 'x', 'Ġfall', 'Ġinto', 'Ġthe', 'ĠC', 'ret', 'aceous', 'Ġperiod', '.']

Depending on the chosen tokenizer model, the list of resulting tokens is likely to differ. But generally, common words will form their own token, while rarer words are likely to be split into multiple tokens. You will also notice that different models use different special characters at the beginning or end of tokens. Those special characters are required so the model can tell whether two adjacent tokens belong together or represent separate words.
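A simple sanity check is to decode the token ids back into text with the decode() method (reusing the tokenizer and encoded variables from the cells above). Despite the subword splits and special characters, the original string is recovered, at least approximately, depending on the model.

In [ ]:
# Decode the token IDs back into a string
print(tokenizer.decode(encoded['input_ids']))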

Summing up, subword tokenization bridges the gap between word-level and character-level tokenization, providing an effective balance for handling diverse languages, rare words, and efficient model training. One of the main strengths is its ability to break rare or unseen words into smaller, meaningful subword units. This ensures that even if the entire word is not in the vocabulary, the model can still understand its meaning by interpreting the subwords. Unlike word tokenization, which requires a huge vocabulary to cover all possible words, subword tokenization significantly reduces the vocabulary size by reusing subword units across different words. A smaller vocabulary leads to faster computations, less memory usage, and better generalization across languages and datasets. In languages like Turkish, Finnish, or Arabic, words are often formed by combining roots, prefixes, and suffixes, resulting in a vast number of unique words. Subword tokenization handles these languages effectively by splitting words into smaller units that still retain meaning, such as the root and affixes. It also can make NLP systems more resilient to typos, abbreviations, or informal text. This robustness is particularly valuable in real-world applications like social media analysis and customer support systems. Lastly, subword tokenization techniques, such as Byte Pair Encoding (BPE) or Unigram Language Model, work across different languages without relying on language-specific rules.

However, subword tokenization has some disadvantages that can affect its performance in certain NLP tasks. For one, it can make it harder to interpret the results of a model. Since words are split into smaller units, it may not be immediately clear how the model understands a word or phrase. Breaking words into subwords can also increase the total number of tokens in a sentence. This results in longer input sequences, which can slow down processing and increase computational costs. Models with fixed input limits, like BERT (typically 512 tokens), may struggle with longer texts when subword tokenization is applied. Most obviously, subword tokenization relies on the vocabulary learned from the training data. If the training data is not representative of the real-world usage, the resulting subword vocabulary may miss important patterns or create inefficiencies. Subword tokenization may also add ambiguity; for example, the same subword can appear in different contexts with different meanings. Lastly, in languages like Chinese, where words are already short or consist of single characters, subword tokenization offers limited benefits. Additionally, for English, where most words are relatively short and frequent, the need to break them into subwords can introduce unnecessary complexity without significant gains.


Summary¶

Tokenization is the process of dividing text into smaller units, known as tokens, which could be words, phrases, or subwords. It serves as the first step in preparing text data for natural language processing tasks, converting unstructured text into a structured format that algorithms can analyze. Tokens are the building blocks for a wide range of NLP applications, such as text classification, sentiment analysis, and machine translation, making tokenization a critical step in understanding and processing human language.

There are several tokenization techniques, each suited to different languages and use cases. Word tokenization, one of the most common methods, involves splitting text based on spaces and punctuation to produce individual words. However, this approach may struggle with contractions (e.g., "don't") or compound words. Subword tokenization, such as Byte Pair Encoding (BPE) or WordPiece, is designed to handle rare or unknown words by breaking them into smaller, meaningful units like prefixes, suffixes, or character sequences. This technique is particularly effective for languages with rich morphology or large vocabularies. Character-level tokenization, which treats each character as a token, offers high flexibility but may result in longer sequences, increasing computational complexity.

The choice of tokenization method significantly impacts NLP models. For example, traditional models like Bag of Words or TF-IDF require word-level tokenization to create vectorized representations of text. Meanwhile, modern transformer-based models, such as BERT and GPT, often rely on subword tokenization to balance vocabulary size and generalizability. Additionally, multilingual NLP models benefit from tokenization techniques that account for cross-linguistic variations, ensuring consistent handling of text in multiple languages.

Tokenization is crucial because it directly influences the accuracy, efficiency, and contextual understanding of NLP models. Poorly tokenized data can lead to loss of meaning, reduced model performance, and higher computational costs. Conversely, well-executed tokenization enhances model comprehension of context, improves handling of rare or unseen words, and facilitates robust training on diverse datasets. By adapting tokenization strategies to specific languages and tasks, NLP practitioners can maximize the effectiveness of their models and achieve better outcomes in real-world applications.

In [ ]: