Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from GitHub.

Building a Word Tokenizer from Scratch¶

Motivation¶

Word tokenization is a foundational process in natural language processing (NLP) that involves breaking down a text into smaller units, typically words or subwords. This step is critical as it prepares the text for further linguistic or statistical analysis, such as part-of-speech tagging, sentiment analysis, or machine translation. By converting text into manageable pieces, tokenization serves as the gateway for machines to understand and process human language. Despite its apparent simplicity, tokenization can be surprisingly complex due to the intricate nature of human languages.

One of the primary challenges in word tokenization is handling the diversity of linguistic rules and writing systems across different languages. For instance, in English, spaces generally delineate words, making tokenization relatively straightforward. However, even in English, ambiguities arise with contractions (e.g., "don't"), possessives, and hyphenated words (e.g., "state-of-the-art"). Languages like Chinese, Japanese, and Thai, which lack explicit word delimiters, require sophisticated algorithms or statistical models to accurately identify word boundaries. Furthermore, agglutinative languages like Finnish or Turkish, where words are formed by appending multiple morphemes, add another layer of complexity.

Another significant challenge lies in dealing with informal text, such as social media posts, where grammatical rules are often disregarded. Slang, emojis, hashtags, and abbreviations (e.g., "LOL" or "u r gr8") can disrupt conventional tokenization methods. For example, "New York-based" could be treated as one or multiple tokens depending on the tokenizer, leading to inconsistencies. Additionally, tokenizers must handle punctuation carefully, deciding whether to attach it to a word or treat it as a separate token, which can impact downstream NLP tasks.

Advancements in tokenization methods, such as byte-pair encoding (BPE) and subword tokenization (used in models like BERT and GPT), have addressed some of these challenges. These techniques allow models to tokenize rare or out-of-vocabulary words into smaller meaningful units, enabling better representation and understanding. Despite these innovations, tokenization remains a critical topic in NLP, as its effectiveness directly impacts the performance of downstream tasks. Understanding its challenges ensures robust and accurate language models, essential for applications in translation, sentiment analysis, and conversational AI.

In this notebook, we will build a basic word tokenizer for English text from scratch. This tokenizer is not intended to compete with existing solutions from popular libraries and tools; it serves educational purposes and lets us take a look "under the hood" at how existing tokenizers work.

Setting up the Notebook¶

Make Required Imports¶

This notebook requires the import of different Python packages as well as additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been successfully imported.

In [1]:
from src.utils.libimports.tokenizing import *
from src.text.preprocessing.tokenizing import MyWordTokenizer

Preliminaries¶

Purpose & Scope¶

Natural language, including English, is very expressive and allows for a large degree of variation. Building a state-of-the-art tokenizer is therefore a challenging task. As such, our goal is simply to build a decent tokenizer and not a perfect tokenizer (if such a thing even exists). We focus on the most common orthographic, morphological, and punctuation rules for the tokenization process. Thus, it will be very easy to create sentences where our tokenizer performs poorly. This also means that there are no (serious) considerations regarding runtime performance. The purpose of this implementation is educational: to see how the tokenizers of popular libraries work. This can be very useful in practice when you have to work with a very domain-specific language where a general-purpose tokenizer might not be suitable.

Basic Assumptions¶

We assume the input text to be (reasonably) well-formed English. This means that the text adheres to common orthographic rules, particularly when it comes to word boundaries, punctuation, and hyphenation. For example, punctuation marks are always correctly followed by a whitespace character (or the punctuation mark is the last character). This assumption is important as our tokenizer starts by splitting a text by whitespace characters; see below. For editorial and (semi-)professional content, this assumption is very likely to hold, but user-generated content on social media and similar platforms may often not be well-formed text.

Alignment¶

When it comes to tokenization and what constitutes an individual token, there is no single well-defined set of rules. While word tokenization basically implies that each word forms a token, things are less obvious in the case of, for example, hyphenated words (e.g., "well-written", "mother-in-law") or contractions (e.g., "I'm", "should've", "can't"). Other non-obvious decisions relate to non-standard tokens such as URLs, email addresses, hashtags, emoticons, etc. For consistency, we therefore roughly align the output of our tokenizer with that of spaCy. The spaCy tokenizer is a highly efficient and accurate tokenizer used in the spaCy NLP library, designed to split text into words, punctuation, and other meaningful units. The tokenizer initially splits the text on whitespace characters (e.g., spaces, tabs, newlines) to create token candidates. It then recursively applies various rules to each token to check if a token needs to be split.

Our tokenizer will follow the same principle. Notice that this assumes a language like English that relies heavily on whitespace to indicate word/token boundaries. Still, our tokenizer and the spaCy tokenizer are likely to yield different results, even for similar inputs, since the rules of the spaCy tokenizer are more refined and handle more special cases.

With these clarifications out of the way, let's get started building our tokenizer.


Implementation¶

Throughout the notebook, we will use a series of example sentences to observe how our tokenizer performs. The code cell below contains five example sentences, as well as their respective results after applying the spaCy tokenizer. This is to give you a first idea of how common off-the-shelf tokenizers behave (e.g., in the case of contractions, hyphenated words, parentheses, etc.).

In [2]:
text1 = "Let's go to N.Y. to visit my mother-in-law!!! It'll be great, I'm sure."
text2 = "I've got an appointment with Dr. Smith...I hope it won't be bad."
text3 = "Please select your answer: A (luggage) or B (no luggage)."
text4 = "My website is www.example.com (email: user@example.com) #contact."
text5 = "The ambience was :o))), but the food was )-:"

print([token.text for token in nlp(text1)])
print([token.text for token in nlp(text2)])
print([token.text for token in nlp(text3)])
print([token.text for token in nlp(text4)])
print([token.text for token in nlp(text5)])
['Let', "'s", 'go', 'to', 'N.Y.', 'to', 'visit', 'my', 'mother', '-', 'in', '-', 'law', '!', '!', '!', 'It', "'ll", 'be', 'great', ',', 'I', "'m", 'sure', '.']
['I', "'ve", 'got', 'an', 'appointment', 'with', 'Dr.', 'Smith', '...', 'I', 'hope', 'it', 'wo', "n't", 'be', 'bad', '.']
['Please', 'select', 'your', 'answer', ':', 'A', '(', 'luggage', ')', 'or', 'B', '(', 'no', 'luggage', ')', '.']
['My', 'website', 'is', 'www.example.com', '(', 'email', ':', 'user@example.com', ')', '#', 'contact', '.']
['The', 'ambience', 'was', ':o)', ')', ')', ',', 'but', 'the', 'food', 'was', ')-:']

While the goal of our implementation is not to perfectly match the output of the spaCy tokenizer, it will serve as a general guideline for our design decisions.

Your turn: Throughout the notebook, you are welcome to add your own text examples, as well as modify the implementation towards an improved version of the tokenizer. The provided token splitting rules and exceptions are kept reasonably simple on purpose to ease the understanding. You can modify those rules and exceptions (or add your own new rules and exceptions) and see how it affects the behavior of your tokenizer. This is a very good practice to better appreciate the core ideas and challenges when it comes to tokenizing text documents.

Basic Algorithm¶

The core algorithm of our tokenizer is implemented by the class MyWordTokenizer in the module src.text.preprocessing.tokenizing. This class has only two methods:

  • tokenize(): This method takes a string as input and returns the final list of tokens by performing two main steps.
    • The method splits the input string by white space characters to yield the initial list of tokens.
    • The method then checks for each token whether a further split is needed using the method _apply_splitting_rules(). If a token was split into new smaller tokens, those tokens are again checked to see if they should be split. This process continues until no token in the current list of tokens requires any more splitting.
  • _apply_splitting_rules(): This method checks if a token needs to be split by applying a given set of rules (e.g., splitting off punctuation marks at the end of a token). Apart from the set of rules, this method also takes in a set of exceptions to prevent certain splits. We will see during the implementation why this separation between rules and exceptions is convenient.

Appreciate that the class MyWordTokenizer itself contains very little code. All the heavy lifting is done by the implementation of the rules and the exceptions, which will be the focus throughout the rest of the notebook.
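
To make this more concrete, below is a minimal sketch of what the core loop of tokenize() might look like. This is only an illustration and not the actual implementation of MyWordTokenizer; it assumes that rules and exceptions are plain lists of functions and that empty subtokens are simply discarded.

def tokenize(text, rules=None, exceptions=None):
    rules = rules or []
    exceptions = exceptions or []
    # Step 1: initial token candidates by splitting on whitespace
    tokens = text.split()
    # Step 2: keep applying the rules until no token is split anymore
    changed = True
    while changed:
        changed = False
        new_tokens = []
        for token in tokens:
            # Exceptions: preserve the token as is (no rule is applied)
            if any(is_exception(token) for is_exception in exceptions):
                new_tokens.append(token)
                continue
            # Rules: apply the first rule that splits the token
            for rule in rules:
                was_split, subtokens = rule(token)
                if was_split:
                    new_tokens.extend(t for t in subtokens if t)  # drop empty subtokens
                    changed = True
                    break
            else:
                new_tokens.append(token)
        tokens = new_tokens
    return tokens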

In [3]:
my_tokenizer = MyWordTokenizer()

print(my_tokenizer.tokenize(text1))
print(my_tokenizer.tokenize(text2))
print(my_tokenizer.tokenize(text3))
print(my_tokenizer.tokenize(text4))
print(my_tokenizer.tokenize(text5))
["Let's", 'go', 'to', 'N.Y.', 'to', 'visit', 'my', 'mother-in-law!!!', "It'll", 'be', 'great,', "I'm", 'sure.']
["I've", 'got', 'an', 'appointment', 'with', 'Dr.', 'Smith...I', 'hope', 'it', "won't", 'be', 'bad.']
['Please', 'select', 'your', 'answer:', 'A', '(luggage)', 'or', 'B', '(no', 'luggage).']
['My', 'website', 'is', 'www.example.com', '(email:', 'user@example.com)', '#contact.']
['The', 'ambience', 'was', ':o))),', 'but', 'the', 'food', 'was', ')-:']

Token Splitting Rules¶

In general, a token splitting rule is a condition that checks if a given token needs to be split. If so, the rule splits the token into a list of subtokens. Each rule is implemented as a method which takes a token as input and returns a tuple containing two values as output. The first value is either True (split performed) or False (no split performed); the second value is the list of subtokens in case the token has been split. The pseudo code below shows the basic structure of a method implementing a splitting rule.

def split(token):
    if <split_condition> == True:
    	return (True, [subtoken1, subtoken2, ...])
    else:
    	return (False, [])

A rule may split a token into two or more subtokens depending on the condition. Recall that after a split, each new subtoken will again be checked with respect to all implemented rules until no rule will result in any further split of any token.

Basic Punctuation Marks¶

By the convention of most writing styles, punctuation marks (e.g., a period, comma, or question mark) are followed by a whitespace character — assuming the punctuation mark is not the last character in a string — but not preceded by one. This convention is rooted in the history and purpose of punctuation in written language. Here are some commonly cited reasons:

  • Visual clarity: Punctuation marks indicate the structure and rhythm of sentences. Placing whitespace after punctuation creates a clear separation between sentences or clauses, improving readability. Adding whitespace before punctuation would disrupt the flow of text and make it visually inconsistent.

  • Linguistic rules: In most writing systems, punctuation marks are tied closely to the words they modify (for example, correct: "I love books.", incorrect: "I love books .") By keeping punctuation marks adjacent to the relevant word, the meaning is preserved and unambiguous.

  • Historical typesetting: In traditional typesetting and printing, punctuation marks were designed to be visually attached to the preceding word. A whitespace character was placed after punctuation to create a pause and facilitate easier reading. Early mechanical typesetting (e.g., movable type) reinforced this convention because it aligned well with how text was laid out efficiently.

  • Digital text formatting: Computers and software rely on established rules of spacing for proper word wrapping and alignment. Adding whitespace characters after punctuation but not before ensures consistent behavior across platforms and formats.

Over centuries, these conventions became deeply ingrained in grammar and style guides across languages that use alphabets. Writers and readers are now accustomed to these rules as a standard. That being said, some languages, such as French, have specific rules for punctuation spacing. For example, in French typography, a space is added before certain punctuation marks like colons, question marks, and exclamation points. By adhering to this convention, texts maintain a balance of aesthetic and functional readability.

Of course, not having a whitespace character before a punctuation mark means that simply splitting a text on whitespace fails to separate punctuation marks from preceding words (or tokens, in general). We therefore need a rule to split punctuation marks off the end of a token, if present. For most punctuation marks (e.g., question mark, exclamation mark, comma, colon, semicolon), this is a rather straightforward rule. However, it is more challenging for periods as this character has many other uses. We therefore exclude the period character from this basic rule and handle it separately.

To check if the last character of a token is a punctuation mark (excluding the period), we use the following simple Regular Expression: (.+)([,:;?!]{1})$. It matches any token that consists of a non-empty sequence of characters followed by a single punctuation mark at the end. The expression is organized into two groups, one capturing everything before the punctuation mark, the other the punctuation mark itself. This makes extracting the two subtokens resulting from the split very easy. The method split_basic_punctuation() implementing this rule looks as follows:

In [4]:
def split_basic_punctuation(token):
    subtokens = []
    # Common Punctuation marks (word followed by a single punctuation mark -- except ".")
    for match in re.finditer(r"(.+)([,:;?!]{1})$", token, flags=re.IGNORECASE):
        subtokens.append(match[1])
        subtokens.append(match[2])
        return True, subtokens   
    return False, []    

Let's apply this first rule when tokenizing our example sentences:

In [5]:
rules = [split_basic_punctuation]

print(my_tokenizer.tokenize(text1, rules=rules))
print(my_tokenizer.tokenize(text2, rules=rules))
print(my_tokenizer.tokenize(text3, rules=rules))
print(my_tokenizer.tokenize(text4, rules=rules))
print(my_tokenizer.tokenize(text5, rules=rules))
["Let's", 'go', 'to', 'N.Y.', 'to', 'visit', 'my', 'mother-in-law', '!', '!', '!', "It'll", 'be', 'great', ',', "I'm", 'sure.']
["I've", 'got', 'an', 'appointment', 'with', 'Dr.', 'Smith...I', 'hope', 'it', "won't", 'be', 'bad.']
['Please', 'select', 'your', 'answer', ':', 'A', '(luggage)', 'or', 'B', '(no', 'luggage).']
['My', 'website', 'is', 'www.example.com', '(email', ':', 'user@example.com)', '#contact.']
['The', 'ambience', 'was', ':o)))', ',', 'but', 'the', 'food', 'was', ')-', ':']

We see some first improvements. Now all punctuation marks (except the period) are separate tokens. Notice that the rule could also handle the case of repeated punctuation (here: !!!) due to the recursive application of all rules to all new subtokens after a split. Keep in mind, however, that this rule relies on the assumption that the whitespace after a punctuation mark is always correctly present. For example, this rule would not be able to split a token such as "one,two,three".

Another mistake this rule makes is to potentially "destroy" emoticons, which are often composed of punctuation marks. For example, the emoticon )-: ends with a colon, so the rule will split the emoticon. One possible extension to the rule would be to only split the punctuation mark if it is preceded by a letter. However, in the emoticon )o: the colon is preceded by a letter, so this extension would not help there. While this could be solved by making the Regular Expression more complex to handle more special cases, we propose a more general approach using exceptions later on.
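
To see these effects in isolation, we can call the rule directly on a few hand-picked tokens; the expected return values are given as comments (this is just an illustration of the behavior discussed above).

print(split_basic_punctuation("great,"))         # (True, ['great', ','])
print(split_basic_punctuation("one,two,three"))  # (False, []) -- no trailing punctuation mark
print(split_basic_punctuation(")-:"))            # (True, [')-', ':']) -- the emoticon gets broken up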

Handling Periods¶

Periods are tricky for tokenization because they serve multiple functions in written text, which makes it challenging to determine their role in a given context. Depending on how they are used, periods can represent the end of a sentence, abbreviations, decimal points, ellipses, or even part of domain names and email addresses. Misinterpreting their purpose during tokenization can lead to errors in understanding the structure and meaning of the text. Here are the main reasons periods pose challenges with examples:

  • Multiple Functions:
    • Periods are most commonly used to mark the end of a sentence. However, not all periods signify sentence boundaries, which complicates splitting text into sentences. For example, in the sentence "Dr. Smith arrived.", the period in "Dr." is not the end of a sentence.
    • Periods appear in abbreviations like Dr., e.g., or U.S., where they don't signify the end of a sentence. Treating them as sentence boundaries can disrupt parsing. To give an example: "E.g., we should consider other factors."
    • A series of three periods ("...") indicates an omission or trailing thought, which tokenizers may mistakenly split into separate tokens.
  • Embedded in Words or Numbers:
    • Decimal Points: Periods are used in numbers (e.g., 3.14, $10.99) where splitting them into separate tokens would corrupt the numerical value. For example, given the sentence "The value is 3.14.", tokenizing it as ["The", "value", "is", "3", ".", "14"] would be incorrect.
    • Web Addresses and Email: Periods in URLs or email addresses (e.g., "www.example.com", "user@example.com") should not be split.
  • Context Sensitivity: Determining the role of a period requires analyzing the surrounding text. For example, after a lowercase word or a number, a period may signify an abbreviation or decimal; in contrast, after an uppercase word or sentence structure, it is more likely to indicate the end of a sentence.

In short, if a token ends with a period, we only want to split the period off if the token is not an abbreviation. We check this by requiring that the character directly before the final period is a letter, digit, or closing parenthesis/bracket, and that this character is not itself preceded by another period. This way, we do not split the final period of "N.Y." (the "Y" is preceded by a period), and we leave "done..." untouched (only periods precede the final period). The test that no period directly precedes this character is done by the negative lookbehind assertion (?<![.]). The method split_period_punctuation() with the complete Regular Expression checking this condition is:

In [6]:
def split_period_punctuation(token):
    subtokens = []
    # Abbreviations vs. sentence period (only split if not an abbreviation)
    for match in re.finditer(r"(?<![.])([a-zA-Z0-9)\]}])([.]{1})$", token, flags=re.IGNORECASE):
        subtokens.append(token[:match.span(2)[0]])
        subtokens.append(token[match.span(2)[0]:])
        return True, subtokens 
    return False, []

We can again run our tokenizer with this new rule added:

In [7]:
rules = [split_basic_punctuation,
         split_period_punctuation
        ]

print(my_tokenizer.tokenize(text1, rules=rules))
print(my_tokenizer.tokenize(text2, rules=rules))
print(my_tokenizer.tokenize(text3, rules=rules))
print(my_tokenizer.tokenize(text4, rules=rules))
print(my_tokenizer.tokenize(text5, rules=rules))
["Let's", 'go', 'to', 'N.Y.', 'to', 'visit', 'my', 'mother-in-law', '!', '!', '!', "It'll", 'be', 'great', ',', "I'm", 'sure', '.']
["I've", 'got', 'an', 'appointment', 'with', 'Dr', '.', 'Smith...I', 'hope', 'it', "won't", 'be', 'bad', '.']
['Please', 'select', 'your', 'answer', ':', 'A', '(luggage)', 'or', 'B', '(no', 'luggage)', '.']
['My', 'website', 'is', 'www.example.com', '(email', ':', 'user@example.com)', '#contact', '.']
['The', 'ambience', 'was', ':o)))', ',', 'but', 'the', 'food', 'was', ')-', ':']

Most cases look good. All periods that mark the end of a sentence have been correctly tokenized, and the abbreviation N.Y. has been correctly preserved. However, our tokenizer is now too eager and has split the token Dr. The problem is that this is an example of an abbreviation that has only a single period at the end. This means that, at least syntactically, there is no foolproof way to tell whether this period marks the end of a sentence or an abbreviation. One approach to solve this would be to use probabilistic methods to decide which use is more likely given the context (i.e., the whole sentence). To avoid training models to predict the most likely use, we favor a simpler approach using exceptions; see further down below.
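
To make this behavior explicit, here is the rule applied directly to a few individual tokens (expected return values as comments; for illustration only):

print(split_period_punctuation("bad."))     # (True, ['bad', '.'])  -- sentence-final period is split off
print(split_period_punctuation("N.Y."))     # (False, [])           -- preceding period blocks the split
print(split_period_punctuation("done..."))  # (False, [])           -- no letter/digit before the final period
print(split_period_punctuation("Dr."))      # (True, ['Dr', '.'])   -- the over-eager split discussed above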

Clitics¶

Clitics are linguistic elements that function like words but lack independent phonological stress and must attach to a host word for proper pronunciation and meaning. They typically serve grammatical purposes, such as marking possession, negation, or contractions. Clitics are often pronounced as part of the host word, making them distinct from fully independent words.

Clitics play an essential role in simplifying speech and enhancing fluency. They allow for more efficient and natural communication by reducing the effort needed to articulate frequently used phrases or expressions. For instance, contractions like "I'm" for "I am" or "can't" for "cannot" make spoken language smoother and less formal. Clitics also help express grammatical relationships, such as possession in "John's book" or tense and agreement in contractions like "he's" ("he is" or "he has"). Common Clitics in English are:

  • Contractions:
    • 'm: "I'm" ("I am")
    • 's: "he's", "she's" ("he is", "she is"); "it's" ("it is"); "John's" ("John has"/ possessive)
    • 're: "you're" ("you are"), "they're" ("they are")
    • 've: "I've" ("I have"), "we've" ("we have")
    • 'll: "I'll" ("I will"), "she'll" ("she will")
    • n't: "can't" ("cannot"), "won't" ("will not"), "doesn't" ("does not")
  • Possessive Marker:
    • 's: "John's book" (showing possession)
    • ' (after plural nouns ending in "s"): "The dogs' toys" (showing possession for plural nouns).
  • Reduced Forms: Common in informal speech, like "gonna" ("going to"), "wanna" ("want to"), or "lemme" ("let me"). Though not strictly clitics — and we can ignore those for our tokenizer — they function similarly in reducing effort in speech.

At least in English, the number of (common) clitics is very limited, making it easy to enumerate them all and hard-code them as part of the rule. We can then use a Regular Expression again to check if a token ends with any of the specified clitics. The method split_clitics() implements this new rule. Note that it considers only the most common clitics in English, but you can see how the code below could be tweaked to include more clitics.

In [8]:
def split_clitics(token):
    # Define set of clitics
    clitics = set(["n't", "'s", "'m", "'re", "'ve", "'d", "'ll"])
    # Contains all the subtokens that might result from any splitting
    subtokens = []
    for match in re.finditer(rf"({'|'.join(clitics)})$", token, flags=re.IGNORECASE):
        subtokens.append(token[:match.span()[0]])
        subtokens.append(token[match.span()[0]:])
        return True, subtokens
    return False, []

Let's add the rule to the tokenizer to see if the results improve.

In [9]:
rules = [split_basic_punctuation,
         split_period_punctuation,
         split_clitics
        ]

print(my_tokenizer.tokenize(text1, rules=rules))
print(my_tokenizer.tokenize(text2, rules=rules))
print(my_tokenizer.tokenize(text3, rules=rules))
print(my_tokenizer.tokenize(text4, rules=rules))
print(my_tokenizer.tokenize(text5, rules=rules))
['Let', "'s", 'go', 'to', 'N.Y.', 'to', 'visit', 'my', 'mother-in-law', '!', '!', '!', 'It', "'ll", 'be', 'great', ',', 'I', "'m", 'sure', '.']
['I', "'ve", 'got', 'an', 'appointment', 'with', 'Dr', '.', 'Smith...I', 'hope', 'it', 'wo', "n't", 'be', 'bad', '.']
['Please', 'select', 'your', 'answer', ':', 'A', '(luggage)', 'or', 'B', '(no', 'luggage)', '.']
['My', 'website', 'is', 'www.example.com', '(email', ':', 'user@example.com)', '#contact', '.']
['The', 'ambience', 'was', ':o)))', ',', 'but', 'the', 'food', 'was', ')-', ':']

All clitics are now their own separate tokens. In general, this rule is unlikely to result in incorrect splits as this rule is very specific.
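
As a quick illustration, calling the rule directly on a few tokens shows the expected behavior (return values as comments):

print(split_clitics("won't"))   # (True, ['wo', "n't"])
print(split_clitics("John's"))  # (True, ['John', "'s"])
print(split_clitics("I'm"))     # (True, ['I', "'m"])
print(split_clitics("its"))     # (False, []) -- no apostrophe, so nothing to split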

Hyphenated Words¶

Hyphenated words are compound words or phrases connected by a hyphen (e.g., "well-known", "mother-in-law", "self-esteem"). They are used to improve clarity, connect related ideas, or create new terms. Common rules for using hyphens in English are:

  • Compound adjectives before a noun: Compound adjectives are hyphenated when they come before the noun they modify (e.g., "a well-written article", "a high-quality product"). However, the compound adjective is not hyphenated when it comes after the noun (e.g., "The article is well written.").

  • Numbers and fractions: Numbers from "twenty-one" to "ninety-nine" are hyphenated; so are fractions used as adjectives (e.g., "a two-thirds majority"). However, fractions used as nouns are not hyphenated (e.g., "Two thirds of the team left.").

  • Prefixes and suffixes: Hyphens may avoid confusion and improve readability when a prefix is added to a root word, including before proper nouns (e.g., "anti-American"), to avoid repeated vowels (e.g., "re-enter", "co-operate"), or to avoid repeated consonants (e.g., "shell-like"). Some common prefixes may or may not be hyphenated depending on usage (e.g., "nonessential" vs. "non-existent").

  • Compound nouns: Some compound nouns are hyphenated (e.g., "mother-in-law", "editor-in-chief"). Over time, many compound nouns evolve from hyphenated to single words (e.g., "email" was once written as "e-mail").

  • Clarity: Hyphens can be necessary to clarify meaning or prevent misreading (e.g., "small-business owners" vs. "small business owners"; or "re-cover the sofa" vs. "recover from illness").

  • Word breaks: When a word is too long to fit at the end of a line, a hyphen can be used to divide it, e.g., "Elephants are large land mam- mals found in Africa and Asia." (assuming there is a line break between "mam-" and "mals").

As a very general rule, the overuse of hyphens should be avoided. Only use hyphens when necessary; modern English increasingly moves toward closed compounds (e.g., "email" instead of "e-mail") or open compounds ("ice cream" instead of "ice-cream"). Still, hyphenated words are very common.

Hyphenated words are commonly tokenized for NLP tasks because they occupy a middle ground between single words and multi-word phrases, requiring careful handling to preserve meaning. Whether hyphenated words should be split during tokenization depends on the specific task and the desired level of granularity. In many cases, hyphenated words represent a single semantic unit, such as "mother-in-law" or "well-being", where splitting them could lead to a loss of meaning. For tasks like sentiment analysis, machine translation, or named entity recognition, preserving the entire hyphenated term as one token is often preferable, as the hyphen serves to unify the components into a concept that may not be accurately reconstructed from its parts. However, in tasks like linguistic analysis or when using subword-level models, splitting hyphenated words can be beneficial. Subword tokenization techniques often break words into smaller units to handle out-of-vocabulary (OOV) terms or morphological variations effectively. For instance, "state-of-the-art" might be split into subwords like "state", "-", "of", "-", and "art". This approach ensures that the model can process unseen or rare hyphenated words by understanding their constituent parts, while still preserving the structural information provided by the hyphen.

Ultimately, the decision to split or preserve hyphenated words depends on the balance between capturing semantic meaning and enabling flexibility for OOV words. For applications where the meaning of the hyphenated term is critical, keeping it intact is ideal. Conversely, different existing tokenizers may or may not split hyphenated words. For our tokenizer to align with the basic spaCy tokenizer, we split hyphenated words. However, we only split at a hyphen if it is preceded by a letter. This allows us to avoid splitting phone numbers (e.g., "555-1234") or dates ("10-10-2020"). We could also check if a hyphen is followed by a letter, but this might cause problems with word breaks (e.g., see the "mam-" example above).

In [10]:
def split_hyphenated_words(token):
    subtokens = []
    for match in re.finditer(r"([a-zA-Z]+)(-)", token):
        subtokens.append(token[:match.span(2)[0]])
        subtokens.append(token[match.span(2)[0]:match.span(2)[1]])
        subtokens.append(token[match.span(2)[1]:])
        return True, subtokens
    return False, []

Let's run the tokenizer over our example sentences with this rule added to the rule set.

In [11]:
rules = [split_basic_punctuation,
         split_period_punctuation,
         split_clitics,
         split_hyphenated_words]

print(my_tokenizer.tokenize(text1, rules=rules))
print(my_tokenizer.tokenize(text2, rules=rules))
print(my_tokenizer.tokenize(text3, rules=rules))
print(my_tokenizer.tokenize(text4, rules=rules))
print(my_tokenizer.tokenize(text5, rules=rules))
['Let', "'s", 'go', 'to', 'N.Y.', 'to', 'visit', 'my', 'mother', '-', 'in', '-', 'law', '!', '!', '!', 'It', "'ll", 'be', 'great', ',', 'I', "'m", 'sure', '.']
['I', "'ve", 'got', 'an', 'appointment', 'with', 'Dr', '.', 'Smith...I', 'hope', 'it', 'wo', "n't", 'be', 'bad', '.']
['Please', 'select', 'your', 'answer', ':', 'A', '(luggage)', 'or', 'B', '(no', 'luggage)', '.']
['My', 'website', 'is', 'www.example.com', '(email', ':', 'user@example.com)', '#contact', '.']
['The', 'ambience', 'was', ':o)))', ',', 'but', 'the', 'food', 'was', ')-', ':']

Now "mother-in-law" has been split into all its components. However, appreciate that this is done by applying the rule twice, first splitting "mother-in-law" into "mother", "-", and "in-law", and later splitting "in-law" into "in", "-", and "law" through the recursive application rules to any new subtoken.

Again, splitting hyphenated words is not an obvious decision, and not all tokenizers will perform this split. However, the default spaCy tokenizer does split hyphenated words, and so we add this rule to our tokenizer. Note that we only split if the hyphen is preceded by a letter. For example, our rule does not split "5-3", but the spaCy tokenizer would split such a token. Which approach is preferable is arguably not obvious.
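
Calling the rule directly on a few tokens illustrates this behavior (expected return values as comments):

print(split_hyphenated_words("mother-in-law"))  # (True, ['mother', '-', 'in-law']) -- "in-law" is split in the next pass
print(split_hyphenated_words("555-1234"))       # (False, []) -- no letter before the hyphen
print(split_hyphenated_words("10-10-2020"))     # (False, [])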

Ellipsis¶

An ellipsis ("...") consists of three evenly spaced periods is a versatile bit of punctuation, typically used in fiction to show hesitation, omission or a pause, and in non-fiction to show a direct quote has been altered. Depending on which style guide, an ellipsis can be written with or without spaces before, after or between the dots. An ellipsis commonly used to indicate:

  • Omission: An ellipsis is used to indicate that something has been left out, often in a quotation or excerpt. For example, if the original sentence is "I think that this is the most important decision we will ever make.", using an ellipsis, it can be shortened to "I think that this is...important."

  • Pause or hesitation: An ellipsis can show a pause in speech or thought, often to create suspense or to reflect uncertainty, for example, "I was thinking...maybe we should try something else."

  • Trailing off: An ellipsis can suggest that a sentence is incomplete or that the speaker is leaving something unsaid, for example, "Well, I guess that's just how things are..."

In general, an ellipsis should be treated as a three-letter word, with a space, three periods, and a space. However, since the surrounding whitespace is often omitted (particularly in social media content), we need a special rule for ellipses. We also allow an ellipsis to consist of more than three dots. While we could incorporate the check for an ellipsis into the rules that handle periods, it would make those rules unnecessarily complex; it is cleaner to handle this using a separate rule.

Checking if a token contains a sequence of three or more periods is very easy, and we can again do this using a Regular Expression; the method split_ellipses() implements the new rule. Notice that the method will only split with respect to the first occurrence of an ellipsis in a token, even if there are multiple occurrences. However, this is not a problem due to the recursive checking of all new subtokens.

In [12]:
def split_ellipses(token):
    components = []
    for match in re.finditer(r"[.]{3,}", token):
        components.append(token[:match.span()[0]])
        components.append(token[match.span()[0]:match.span()[1]])
        components.append(token[match.span()[1]:])
        return True, components
    return False, []

Adding this new rule, our tokenizer should now correctly tokenize each ellipsis.

In [13]:
rules = [split_basic_punctuation, 
         split_period_punctuation, 
         split_clitics, 
         split_hyphenated_words, 
         split_ellipses]

print(my_tokenizer.tokenize(text1, rules=rules))
print(my_tokenizer.tokenize(text2, rules=rules))
print(my_tokenizer.tokenize(text3, rules=rules))
print(my_tokenizer.tokenize(text4, rules=rules))
print(my_tokenizer.tokenize(text5, rules=rules))
['Let', "'s", 'go', 'to', 'N.Y.', 'to', 'visit', 'my', 'mother', '-', 'in', '-', 'law', '!', '!', '!', 'It', "'ll", 'be', 'great', ',', 'I', "'m", 'sure', '.']
['I', "'ve", 'got', 'an', 'appointment', 'with', 'Dr', '.', 'Smith', '...', 'I', 'hope', 'it', 'wo', "n't", 'be', 'bad', '.']
['Please', 'select', 'your', 'answer', ':', 'A', '(luggage)', 'or', 'B', '(no', 'luggage)', '.']
['My', 'website', 'is', 'www.example.com', '(email', ':', 'user@example.com)', '#contact', '.']
['The', 'ambience', 'was', ':o)))', ',', 'but', 'the', 'food', 'was', ')-', ':']

Again, this is a rather "safe" rule that is unlikely to result in any "overzealous" tokenization as the rule is also very specific.
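
For illustration, here is the rule applied directly to a few tokens (expected return values as comments):

print(split_ellipses("Smith...I"))     # (True, ['Smith', '...', 'I'])
print(split_ellipses("wait....what"))  # (True, ['wait', '....', 'what']) -- more than three dots stay together
print(split_ellipses("N.Y."))          # (False, []) -- fewer than three consecutive periods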

Parentheses & Quotes¶

Content within parentheses in a sentence or text serves various purposes, primarily to provide additional information, clarify meaning, or introduce asides. Parenthetical content is often non-essential to the main point but adds value to the reader's understanding. The most common use cases are:

  • Additional information or explanation: Parentheses can include extra details that enhance understanding without interrupting the flow of the main sentence, e.g.: "She loves Italian food (especially pasta and pizza)." (adds specific examples without making the sentence cumbersome).

  • Clarifications: Parentheses can clarify ambiguous terms, phrases, or references, e.g., "He moved to Washington (the state, not D.C.)." (avoids confusion by distinguishing between similar terms).

  • Citations or references: Parentheses are commonly used to provide references, citations, or source information in academic or formal writing, e.g., "The study was published in 2020 (Smith et al., 2020)." (attributes information without interrupting the narrative flow).

  • Abbreviations or acronyms: Parentheses are used to introduce acronyms or abbreviations after the full term, e.g., "The World Health Organization (WHO) monitors global health trends." (defines an acronym for later use in the text).

  • Asides or side comments: Parentheses can enclose asides, comments, or additional thoughts that are less formal or tangential to the main point, e.g., "We decided to stay at the beach longer (it was such a beautiful day)." (adds a personal or conversational tone).

  • Indicating choices or alternatives: Parentheses can present optional words or phrases, e.g., "The applicant must bring a valid ID (passport or driver’s license)." (specifies available options).

  • Editorial or authorial notes: In edited or quoted text, parentheses are used to insert editorial comments or explanations, e.g., "She stated, 'He left without explanation (which was unusual for him).'" (provides the editor's or author's clarification).

Content within quotation marks in a sentence or text serves several distinct purposes, primarily to indicate that the enclosed material is a specific kind of language or thought. Here are the primary purposes of using content in quotes:

  • Direct speech or dialogue: Quotation marks are used to enclose words spoken or written by someone, e.g., "She said, "I will meet you at 5 PM."" (accurately conveys someone's spoken or written words, distinguishing them from the narrator's text).

  • Quoting or citing Sources: Quotation marks indicate text taken directly from another source, ensuring proper attribution, e.g., "The article stated, "Climate change is accelerating at an unprecedented rate."" (allows the writer to incorporate someone else's words exactly as they were written or spoken, providing credibility or context).

  • Emphasis or irony: Quotation marks can highlight a word or phrase to show that it's being used in a special, ironic, or non-standard sense, e.g., "The so-called "expert" couldn't answer a simple question." (signals that the term is being used ironically, skeptically, or unconventionally).

  • Titles of short works: In English, quotation marks are used to denote titles of short works like articles, poems, short stories, songs, or chapters in books, e.g., "I loved the poem "The Road Not Taken" by Robert Frost." (differentiates shorter works from longer ones, which are italicized or underlined).

  • Words as words (metalinguistic use): When a word or phrase is mentioned as the word itself (rather than its meaning), quotation marks are often used, e.g., "The word "affect" is often confused with "effect."" (highlights the word as an object of discussion).

  • Signaling slang, jargon, or unfamiliar terms: Quotation marks can indicate unfamiliar, foreign, or technical terms, especially upon first use, e.g., "The term "schadenfreude" means taking pleasure in someone else's misfortune." (signals to the reader that the term may be new or require explanation).

  • Highlighting ambiguity or disputed terms: Quotation marks can indicate terms that are contentious or uncertain. e.g., "The policy was described as "progressive" by its proponents." (signals that the term may not be universally accepted or has a specific connotation).

The convention for using parentheses/quotes in writing is that there is no whitespace between parentheses/quotes and the inner content, but there is whitespace between parentheses/quotes and the surrounding text (except when directly adjacent to punctuation). For our rule, we therefore identify any parenthesis or quote character in a token and split it off if present; in well-formed text, such characters appear at the beginning or end of a token. To make the rule a bit more flexible, apart from parentheses (( and )), the rule also looks for square brackets ([ and ]) and curly brackets ({ and }). In case of quotes, we limit ourselves to double quotes (") to keep things simple. The method split_parentheses_quotes() implements this combined rule.

In [14]:
def split_parentheses_quotes(token):
    components = []
    for match in re.finditer(r"[(){}\[\]\"]{1}", token):
        components.append(token[:match.span()[0]])
        components.append(token[match.span()[0]:match.span()[1]])
        components.append(token[match.span()[1]:])
        return True, components
    return False, []

As usual, we can test the rule by adding it to our rule set and run our tokenizer over the example sentences.

In [15]:
rules = [split_basic_punctuation, 
         split_period_punctuation, 
         split_clitics, 
         split_hyphenated_words, 
         split_ellipses,
         split_parentheses_quotes
        ]

print(my_tokenizer.tokenize(text1, rules=rules))
print(my_tokenizer.tokenize(text2, rules=rules))
print(my_tokenizer.tokenize(text3, rules=rules))
print(my_tokenizer.tokenize(text4, rules=rules))
print(my_tokenizer.tokenize(text5, rules=rules))
['Let', "'s", 'go', 'to', 'N.Y.', 'to', 'visit', 'my', 'mother', '-', 'in', '-', 'law', '!', '!', '!', 'It', "'ll", 'be', 'great', ',', 'I', "'m", 'sure', '.']
['I', "'ve", 'got', 'an', 'appointment', 'with', 'Dr', '.', 'Smith', '...', 'I', 'hope', 'it', 'wo', "n't", 'be', 'bad', '.']
['Please', 'select', 'your', 'answer', ':', 'A', '(', 'luggage', ')', 'or', 'B', '(', 'no', 'luggage', ')', '.']
['My', 'website', 'is', 'www.example.com', '(', 'email', ':', 'user@example.com', ')', '#contact', '.']
['The', 'ambience', 'was', ':o', ')', ')', ')', ',', 'but', 'the', 'food', 'was', ')', '-', ':']

If a token has a parenthesis or bracket character both at the beginning and at the end — that is, the content in the parentheses/brackets is a single word — the rule first splits off the first occurrence and then the second occurrence due to the recursive application of all rules to all new subtokens. The example sentences also include emoticons that use a parenthesis character as a "mouth", either at the beginning or the end of the emoticon. As with the use of punctuation marks as part of emoticons, we will try to improve on that using exceptions.
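
Calling the rule directly makes this recursive behavior visible. Note that a parenthesis at the very first or very last position produces an empty subtoken, which we assume the tokenizer discards (expected return values as comments; for illustration only):

print(split_parentheses_quotes("(luggage)"))          # (True, ['', '(', 'luggage)']) -- "luggage)" is split in the next pass
print(split_parentheses_quotes("user@example.com)"))  # (True, ['user@example.com', ')', ''])
print(split_parentheses_quotes("word"))               # (False, [])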

Exceptions¶

After implementing and testing a series of splitting rules, we have already noticed some cases where a token was split when it should not have been. The first case relates to abbreviations that contain only a single period at the end, making them (syntactically) indistinguishable from periods marking the end of a sentence (which we want to split off). Secondly, the rules looking for punctuation marks and parentheses/brackets are likely to incorrectly split emoticons, as they are often composed of such characters. While emoticons are not common in more formal content such as news articles, they are very common in user-generated, and thus more informal, content. For example, correctly tokenizing emoticons can be very useful when building a sentiment classifier, as emoticons are often used to convey or emphasize a sentiment.

To address such issues, one approach is to revise and improve the existing rules. However, this may add significant complexity to the conditions of a rule, which in turn might cause new issues during tokenization. Particularly when multiple rules cause the same problem — for example, splitting punctuation marks and parentheses/brackets can both break up emoticons — fixing the same effect requires revising multiple rules.

Instead, we introduce the idea of exceptions. In general, an exception is a condition to check if a token should be preserved as is (i.e., not split). If such a condition evaluates to True for a token, no splitting rules are applied and the token is not split. Each exception is implemented as a method which takes a token as input and returns True (exception; preserve the token) or False (no exception; go ahead and apply the splitting rules). The pseudo code below shows the basic structure of a method implementing an exception.

def is_exception(token):
    if <exception_condition> == True:
    	return True
    else:
    	return False

Let's create two exceptions to handle the two cases where our tokenizer currently fails.

(Special) Abbreviations¶

Our splitting rule split_period_punctuation() can already handle abbreviations that contain more than one period (e.g., "N.Y.", "U.S.A."). However, an abbreviation with a single period at the end is indistinguishable from an end-of-sentence marker — at least when considering only the token itself without any context like the surrounding words. Advanced tokenizers might apply probabilistic methods such as training a classifier to predict if a period at the end of a token marks an abbreviation or the end of a sentence. However, this requires an annotated dataset for training and generally adds significant complexity to the tokenizer.

Instead, we utilize the fact that there are not that many common abbreviations with a single period at the end. This allows us to simply enumerate all abbreviations we want to consider and check if a token is in this predefined set. In fact, the spaCy tokenizer implements a very similar look-up to handle such exceptions; you can check out the relevant source code here. Implementing this exception is very straightforward. The method is_abbreviation() simply defines the fixed set of abbreviations and then checks if the input token matches any of those abbreviations. The code cell below only lists a handful of abbreviations as examples, but it is obvious how this code could be extended to cover more abbreviations.

In [16]:
def is_abbreviation(token):
    known_abbreviations = set(["mr.", "ms.", "mrs.", "prof.", "dr.", "mt.", "st."])
    if token.lower() in known_abbreviations:
        return True
    return False

Now let's tokenize our five example sentences with that exception. Note that rules is expected to be the list of all the splitting rules we have implemented before.

In [17]:
exceptions = [is_abbreviation]

print(my_tokenizer.tokenize(text1, rules=rules, exceptions=exceptions))
print(my_tokenizer.tokenize(text2, rules=rules, exceptions=exceptions))
print(my_tokenizer.tokenize(text3, rules=rules, exceptions=exceptions))
print(my_tokenizer.tokenize(text4, rules=rules, exceptions=exceptions))
print(my_tokenizer.tokenize(text5, rules=rules, exceptions=exceptions))
['Let', "'s", 'go', 'to', 'N.Y.', 'to', 'visit', 'my', 'mother', '-', 'in', '-', 'law', '!', '!', '!', 'It', "'ll", 'be', 'great', ',', 'I', "'m", 'sure', '.']
['I', "'ve", 'got', 'an', 'appointment', 'with', 'Dr.', 'Smith', '...', 'I', 'hope', 'it', 'wo', "n't", 'be', 'bad', '.']
['Please', 'select', 'your', 'answer', ':', 'A', '(', 'luggage', ')', 'or', 'B', '(', 'no', 'luggage', ')', '.']
['My', 'website', 'is', 'www.example.com', '(', 'email', ':', 'user@example.com', ')', '#contact', '.']
['The', 'ambience', 'was', ':o', ')', ')', ')', ',', 'but', 'the', 'food', 'was', ')', '-', ':']

As expected, "Dr." is no longer split and represents its own token.

Emoticons¶

Emoticons are symbols created using keyboard characters, like letters, numbers, and punctuation marks, to represent emotions, facial expressions, or ideas. Unlike emoji (colorful pictures), text emoticons are simple combinations like ":)" for a smile, ":(" for sadness, or ":P" for playfulness. They were popular before emojis became common and are still used today, especially in places where emojis might not be available. The purpose of text emoticons is to make written communication more expressive. They help show emotions or tone in a way that plain text alone cannot. For example, if someone writes "I'm okay," it might seem neutral, but adding ":)" shows positivity, while ":(" indicates sadness. This helps readers understand the writer's mood and makes the message clearer and more personal. Emoticons also add a friendly and fun element to messages. They make conversations feel less formal and more engaging, especially in casual chats. By using a wink ";)" or a laughing face "XD", people can add humor or playfulness, making digital communication feel warmer and more relatable. There are two main types of emoticons

  • Western-style emoticons are text-based symbols that represent facial expressions and emotions. They are typically created using keyboard characters and are designed to be read horizontally, with the eyes, for example, usually represented by colons (:), semicolons (;), or other symbols. Western-style emoticons are widely used in informal written communication like chats, emails, and social media. They are simple and easy to create with a standard keyboard, making them accessible across devices and platforms. Western-style emoticons focus on basic facial expressions and rely on the horizontal orientation for interpretation.

  • Japanese-style emoticons, also known as kaomoji, are text-based emoticons that express emotions and actions. They are designed to be read upright, without tilting your head. Kaomoji often use a variety of characters, including letters, punctuation marks, and special symbols, to create detailed and expressive faces. Examples include (^‿^) or (⌒‿⌒) for happiness, (︶︹︺) or (T_T) for sadness, (*°▽°*) or (O_O) for surprise, (♥‿♥) or (。♥‿♥。) for love, (¬_¬) or (ಠ_ಠ) for anger, and (⁄⁄>⁄ω⁄<⁄⁄) or (〃ω〃) for shyness. Kaomoji often go beyond basic emotions to depict complex feelings. They are especially popular in Japan but have gained international appeal due to their creativity and ability to convey nuanced emotions.

In principle, we could preserve emoticons using a predefined set of emoticons (similar to the set of predefined abbreviations used in the previous exception). And in fact, for kaomoji we have to adopt this approach. The only small addition is to reflect the observation that the enclosing parentheses of kaomoji are often omitted.

Western-style emoticons are often simpler but typically exhibit a common structure. A Western-style emoticon can typically be broken up into four components:

  • Top (optional): a character representing a hat or furrowed brows.
  • Eyes: a character representing the eyes of a face
  • Nose (optional): a character representing the nose of a face
  • Mouth: a character representing the mouth of a face

For example, the emoticon >:-| may express some doubt or skepticism using furrowed brows and a non-smiling mouth. Many emoticons are much simpler, having only eyes and mouths, e.g., ;) or :(. Each part of the face is commonly represented by a small set of characters. For example, eyes are mostly represented by :, ;, 8, =, and some other characters. Utilizing this structure of Western-style emoticons, we can actually write a Regular Expression to capture the most popular emoticons. This allows us to even capture slight variations. For example, it is very common that the mouth character is repeated (e.g., :o)))); the spaCy tokenizer will split off the two additional closing parentheses characters and only preserve the standard emoticon. The only additional consideration is that a Western-style emoticon can be written in both "directions", that is: top-eyes-nose-mouth or mouth-nose-eyes-top. We therefore need two Regular Expressions to account for both reading directions.

The method is_emoticon() implements this exception by putting all those considerations together. Compared to is_abbreviation(), this method has a certain complexity which might not be needed in practice. However, it provides a good example of a more application-specific implementation; again, reliably preserving emoticons can be very useful for sentiment analysis.

In [18]:
def is_emoticon(token):
    # Western emoticons (both orientations)
    TOPS = '[]}{()<>'
    EYES = '.:;8BX='
    NOSES = '-=~\'^o'
    MOUTHS = ')(/\|DPp[]{}<>oO*'

    # Generic patterns
    p = re.compile("^([%s]?)([%s])([%s]?)([%s]+)$" % tuple(map(re.escape, [TOPS, EYES, NOSES, MOUTHS])))
    for m in p.finditer(token):
        return True
    # Generic patterns (mirrored orientation)
    p = re.compile("^([%s]+)([%s]?)([%s])([%s]?)$" % tuple(map(re.escape, [MOUTHS, NOSES, EYES, TOPS])))
    for m in p.finditer(token):
        return True

    # Kaomoji (https://en.wikipedia.org/wiki/Kaomoji)
    t = token
    for match in re.finditer(r"(?<=[({\[])(.*)(?=[})\]])", t):
        t = t[match.span()[0]:match.span()[1]]

    known_emoticons = set(["^_^", "^^", "^-^"])
    if t.lower() in known_emoticons:
        return True
    return False

Let's run our tokenizer with this additional exception.

In [19]:
exceptions = [is_abbreviation, 
              is_emoticon]

print(my_tokenizer.tokenize(text1, rules=rules, exceptions=exceptions))
print(my_tokenizer.tokenize(text2, rules=rules, exceptions=exceptions))
print(my_tokenizer.tokenize(text3, rules=rules, exceptions=exceptions))
print(my_tokenizer.tokenize(text4, rules=rules, exceptions=exceptions))
print(my_tokenizer.tokenize(text5, rules=rules, exceptions=exceptions))
['Let', "'s", 'go', 'to', 'N.Y.', 'to', 'visit', 'my', 'mother', '-', 'in', '-', 'law', '!', '!', '!', 'It', "'ll", 'be', 'great', ',', 'I', "'m", 'sure', '.']
['I', "'ve", 'got', 'an', 'appointment', 'with', 'Dr.', 'Smith', '...', 'I', 'hope', 'it', 'wo', "n't", 'be', 'bad', '.']
['Please', 'select', 'your', 'answer', ':', 'A', '(', 'luggage', ')', 'or', 'B', '(', 'no', 'luggage', ')', '.']
['My', 'website', 'is', 'www.example.com', '(', 'email', ':', 'user@example.com', ')', '#contact', '.']
['The', 'ambience', 'was', ':o)))', ',', 'but', 'the', 'food', 'was', ')-:']

While not perfect, the Regular Expression matching Western-style emoticons should capture a wide range of the most popular variations.
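
We can also probe the exception directly on a few tokens (expected return values as comments; for illustration only):

print(is_emoticon(":o)))"))   # True  -- matches the generic Western-style pattern
print(is_emoticon(")-:"))     # True  -- matches the mirrored pattern
print(is_emoticon("(^_^)"))   # True  -- matches the kaomoji look-up
print(is_emoticon("great,"))  # False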


Discussion & Limitations¶

Looking at the results for all the example sentences, our tokenizer seems to do a decent job with the rules and exceptions we have implemented so far. However, the example sentences have been chosen to cover exactly the aspects addressed by these rules and exceptions. It is rather easy to come up with sentences where the tokenizer will not behave as one would probably expect. Here are some simple examples:

In [20]:
text6 = "Tell her that she got an A- in her exam;her phone number is 555 1234-5"
text7 = "I've visited my mother—in—law yesterday."

print(my_tokenizer.tokenize(text4, rules=rules, exceptions=exceptions))
print(my_tokenizer.tokenize(text6, rules=rules, exceptions=exceptions))
print(my_tokenizer.tokenize(text7, rules=rules, exceptions=exceptions))
['My', 'website', 'is', 'www.example.com', '(', 'email', ':', 'user@example.com', ')', '#contact', '.']
['Tell', 'her', 'that', 'she', 'got', 'an', 'A', '-', 'in', 'her', 'exam;her', 'phone', 'number', 'is', '555', '1234-5']
['I', "'ve", 'visited', 'my', 'mother—in—law', 'yesterday', '.']

The limitations of our current tokenizer implementation include:

  • Right now, our tokenizer does not split hashtags such as "#contact", but the spaCy tokenizer would break it up into "#" and "contact". Which one is the preferred solution might depend on the exact application use case. However, from the rules implemented so far, it should be straightforward to implement another rule that splits such hashtags or user handles (e.g., "@TomHanks") if needed.

  • The token "exam;her" was not split correctly since the writer omitted the required whitespace character after the semicolon. While we assumed that the input text adheres to common orthographic rules and conventions, real-world data might not always be that well-formed. In principle, this can be remedied by improving the rule to check if a punctuation mark (excluding a period) is directly preceded and followed by at least one letter.

  • The grade "A-" has been split as it was treated as an hyphenated word; whether this is the preferred tokenization is up for arguments. Recall that we explicitly did not check if a hyphen is followed by at least one letter to match word breaks. Again, one can easily come up with more complex rules and/or exceptions — require hyphens to be preceded and followed by a letter and handle word breaks separately (as word breaks are not some common anymore in online content)

  • Since the tokenizer breaks up strings based on whitespace characters, it will also split substrings that "belong together" but include a whitespace (e.g., the phone number in the example above). This can include compound nouns ("post office", "train station") and named entities (e.g., "New York City", "Tom Hanks"). While these words belong together semantically, most tokenizers work purely on a syntactic level and therefore cannot reliably identify if two or more words should form a single token. Thus, basically all word tokenizers do not support multi-word tokens.

  • An assumption we did not explicitly mention is that all input text contains standard ASCII characters. For example, for the rules handling hyphenated words and clitics, we only considered the hyphen - and single quote ' that are in the ASCII standard. However, there are now many text encoding standards that go way beyond the limited character set of ASCII. For example, instead of the normal hyphen, an em dash might be used (see example text7 above).
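
As an illustration of the second point above, a hypothetical additional rule could split a non-period punctuation mark that sits directly between two letters. This is only a sketch under that assumption and not part of the tokenizer used in this notebook; it relies on the re module already imported above.

def split_infix_punctuation(token):
    subtokens = []
    # Hypothetical rule: split a punctuation mark (excluding the period) that is
    # directly preceded and followed by a letter (e.g., "exam;her" -> "exam", ";", "her")
    for match in re.finditer(r"(?<=[a-zA-Z])([,:;?!])(?=[a-zA-Z])", token):
        subtokens.append(token[:match.span()[0]])
        subtokens.append(token[match.span()[0]:match.span()[1]])
        subtokens.append(token[match.span()[1]:])
        return True, subtokens
    return False, []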

In short, there are many cases that prevent the use of our current tokenizer in a real-world use case. While in many cases improving the tokenizer by adding new rules/exceptions or improving existing rules/exceptions is rather straightforward, this complexity would go beyond the educational purposes of our implementation here.


Summary¶

A word tokenizer is a fundamental component in natural language processing (NLP) that breaks down text into smaller units, typically words or tokens. These tokens are the basic building blocks used for further processing, such as text analysis, machine learning, or linguistic research. For instance, a sentence like "This is a test." might be tokenized into ["This", "is", "a", "test", "."]. The tokenizer must account for various language intricacies, such as punctuation, contractions, and different writing systems, making it an essential tool in NLP pipelines.

Writing your own simple word tokenizer can be an excellent learning exercise for several reasons. Firstly, it provides insight into the complexities of text processing, such as handling whitespace, punctuation, and edge cases like hyphenated words or abbreviations. This hands-on approach helps you appreciate the challenges involved in accurately segmenting text, especially in languages with no spaces between words (e.g., Chinese or Thai) or languages with rich morphology. Moreover, building your own tokenizer helps demystify black-box tools and algorithms. While modern libraries like spaCy or NLTK provide highly optimized tokenizers, creating one from scratch reveals the underlying logic and decision-making processes involved. You’ll learn to balance simplicity and accuracy, making trade-offs that reflect real-world considerations in software development. Finally, this exercise fosters a deeper understanding of language itself. As you account for linguistic nuances like contractions ("don't"), compound words ("state-of-the-art"), or domain-specific terms, you’ll gain practical experience with the quirks of natural language. This foundational knowledge is invaluable for tackling more advanced NLP tasks like parsing, named entity recognition, or building language models.

In [ ]: