Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.

Text Normalization¶

Motivation¶

Natural language is inherently expressive and exhibits significant variation due to its role as a primary medium for human communication. It allows individuals to convey not just factual information but also emotions, intentions, cultural nuances, and creativity. This expressiveness enables a rich diversity of expression, ranging from poetic language to technical jargon, informal slang, and structured academic writing. The same message can often be communicated in countless ways, influenced by context, culture, and personal style.

One main dimension of variation — the one that text normalization aims to address — arises from morphology and syntax. Morphological variation in text documents increases complexity for natural language processing (NLP) tasks because it introduces a high degree of word form variation that can make it difficult for machines to accurately understand and process language. Morphology refers to the structure and form of words, including how words are constructed from roots and affixes (such as prefixes, suffixes, and infixes). In languages with rich morphology, a single root word can take many different forms depending on tense, aspect, number, gender, case, and other grammatical features. The table below lists some of the most common linguistic phenomena that result in morphological and syntactic variations.

Linguistic Phenomenon | Description | Example Before Normalization
Spelling Variations | The same word may have alternate spellings, often due to regional differences. | "color / colour"
Case Sensitivity | The same word may appear with different capitalization. | "HELLO / hello"
Inflected Forms | Verbs may have different forms depending on the tense; nouns may have different singular and plural forms. | "run / runs / ran / running", "foot / feet"
Contractions | Words or phrases may appear in contracted or expanded form. | "I am / I'm", "will not / won't"
Stop Words | Some words may not contribute useful information to the task at hand. | "a / an", "the", "and", "or", "from", "with"
Punctuation | Punctuation that might interfere with tokenization or processing is removed or standardized. | "Hello!!!" / "Hello"
Non-Standard Words | A text may contain words that are not common dictionary words, particularly in user-generated content on social media. | "u (you)", "gr8 (great)", "lol (laughing out loud)"
Numerical Expressions | The same number can be represented differently using digits and words. | "1000 / 1000.00 / 1,000 / 1k / one thousand"
Compound Words | Compounds may be spelled differently depending on conventions. | "ice-cream / ice cream"
Unicode Variants | Characters that are visually similar may in fact be different Unicode characters. | "fi, fi" (ligature and standard)
Diacritics and Accents | The same word may be spelled with or without diacritical marks. | "café / cafe"
Special Characters | Characters like emojis, symbols, and hashtags are removed or replaced. | "@Selene Hello 😊 #AI"
Whitespace | Unnecessary spaces or line breaks are removed. | "Hello   world!"
Ambiguity in Abbreviations | Abbreviations are expanded or standardized for better comprehension. | "e.g.", "i.e."

These variations in text documents can complicate tasks like text classification, information retrieval, machine translation, and sentiment analysis since they introduce data sparsity issues, especially in languages with complex inflectional systems. A large corpus of text may contain many different forms of a single root word, making it difficult for machine learning models to capture the full range of forms. For example, a machine learning model trained on a corpus might encounter only a small subset of possible word forms, leading to problems when it needs to process unseen forms. If a model only encounters the word "run" during training, it may fail to understand the various forms such as "running," "ran," or "runner" in future documents.

The goal of text normalization is to reduce morphological and syntactic variations and convert documents into a canonical form — that is, to transform raw, unstructured text into a standard, consistent representation that makes it easier for NLP systems to analyze and process. The goal is to reduce the complexity and variability inherent in natural language, simplifying the text to its most basic, meaningful components. This often means removing different word forms, typographical errors, informal language, or inconsistencies that could confuse machine learning models or complicate analysis.

Preliminaries¶

Scope¶

Text normalization is a crucial preprocessing step in NLP, but its implementation varies significantly depending on the specific task or application. For instance, in sentiment analysis, normalization might involve converting text to lowercase, removing punctuation, or expanding contractions to standardize inputs without altering the sentiment-bearing elements. In contrast, machine translation systems might require more nuanced normalization, such as handling diacritics or resolving locale-specific variations, to preserve linguistic accuracy and ensure correct translations.

The dependency on the application stems from the need to balance retaining meaningful information with simplifying the text for the model. In tasks like named entity recognition (NER), it may be essential to preserve case sensitivity and special characters that contribute to identifying entities. On the other hand, applications such as search engine optimization might focus on normalizing text for keyword matching by removing stop words or stemming words. This task-specific adaptability underscores the importance of tailoring normalization approaches to the objectives and constraints of the given NLP application.

As such, there is no single fixed series of normalization steps that need to be performed every time. In this notebook, we cover a wide range of the most common normalization steps and include some discussion for which use cases they are more suitable than others. Keep in mind, however, that the covered steps are not comprehensive. Many NLP tasks or applications might involve text that requires custom normalization (e.g., handling LaTeX math formulas, domain-specific vocabulary such as chemical compounds, or mixed-language content).

Strings vs Token Lists¶

Different normalization steps may be more easily applied before or after tokenization. For example, changing a text to all lowercase characters is equally straightforward when considering a text as a single string or as a list of tokens. Removing stopwords, numbers, or punctuation marks is generally easier when a text has already been tokenized — although doing so with a string is still very much possible. For the normalization steps covered in this notebook, we either look at both approaches or the more convenient one. Apart from showing the "manual" implementation of different normalization steps, we also show, if applicable, how they can be performed using common text processing libraries. In this notebook, we consider spaCy for those examples.
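
As a small illustration, the sketch below applies the same two steps (lowercasing and punctuation removal) once to a raw string and once to a hypothetical, already tokenized token list. It is only meant to show that both routes are viable; the string variant leans on a regular expression, the token variant on a simple filter. The example text is made up for illustration.

import re
import string

text   = "Hello, World! This is GREAT."
tokens = ["Hello", ",", "World", "!", "This", "is", "GREAT", "."]

# String-based: strip punctuation with a regular expression, then lowercase
print(re.sub(r"[^\w\s]", "", text).lower())

# Token-based: filter out punctuation tokens and lowercase the rest
print([ token.lower() for token in tokens if token not in string.punctuation ])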

Setting up the Notebook¶

Make Required Imports¶

This notebook requires the import of different Python packages but also additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have successfully been imported.

In [1]:
from src.utils.libimports.textnorm import *
from src.text.preprocessing.normalizing import EmoticonNormalizer
from src.utils.data.files import *

Download Required Data¶

Some code examples in this notebook use data that first needs to be downloaded by running the code cell below. If this code cell throws any error, please check in the configuration file config.yaml whether the URL for downloading datasets is up to date and matches the one on Github. If not, simply download or pull the latest version from Github.

In [2]:
slang_dictionary, _    = download_dataset("text/lexicons/normalization/slang-dictionary.csv")
english_vocabulary, _  = download_dataset("text/lexicons/normalization/vocabulary-american-english.csv")
british_to_american, _ = download_dataset("text/lexicons/normalization/british-to-american.csv")
emoji_mapping, _       = download_dataset("text/lexicons/normalization/emoji-mapping-subset.csv")
File 'data/datasets/text/lexicons/normalization/slang-dictionary.csv' already exists (use 'overwrite=True' to overwrite it).
File 'data/datasets/text/lexicons/normalization/vocabulary-american-english.csv' already exists (use 'overwrite=True' to overwrite it).
File 'data/datasets/text/lexicons/normalization/british-to-american.csv' already exists (use 'overwrite=True' to overwrite it).
File 'data/datasets/text/lexicons/normalization/emoji-mapping-subset.csv' already exists (use 'overwrite=True' to overwrite it).

Case Folding¶

Case folding is a fundamental step in text preprocessing used to standardize text data by converting all characters to the same case, typically lowercase. This standardization ensures that words with the same meaning but different capitalization are treated as identical. For example, "Data," "DATA," and "data" would all be converted to "data." This process reduces redundancy in text data, simplifying analysis and improving the performance of natural language processing (NLP) models.

One of the primary purposes of case folding is to enhance the efficiency and accuracy of text-matching algorithms, such as search engines or sentiment analysis tools. By eliminating case distinctions, case folding ensures that queries and documents align regardless of capitalization. For instance, a search for "Python" would return results for "python," "PYTHON," and "PyThOn" equally. This uniformity makes case-insensitive matching possible, which is crucial for tasks like keyword extraction and information retrieval.

In addition, case folding reduces the complexity of text processing by minimizing the number of unique tokens in a dataset. This is particularly important for machine learning models, as it reduces the dimensionality of input data and ensures that models focus on semantic meaning rather than superficial differences. For example, without case folding, a model might interpret "AI" and "ai" as separate entities, leading to inefficiency and potential inaccuracies.

Application¶

Converting a text to all lowercase or all uppercase is one of the easiest text normalization steps to perform. All modern programming languages or tools support strings and provide built-in methods for this task. In the case of Python, these methods are lower() and upper(). Here is a simple example:

In [3]:
text = "That movie was AMAZING! I went to see it twice."

print(text)
print(text.lower())
print(text.upper())
That movie was AMAZING! I went to see it twice.
that movie was amazing! i went to see it twice.
THAT MOVIE WAS AMAZING! I WENT TO SEE IT TWICE.

Discussion¶

While case folding is a valuable tool for text normalization, it does come with certain risks that can impact the quality and accuracy of downstream tasks. These risks primarily arise from the loss of information carried by capitalization, which can be critical in some contexts.

  • Loss of semantic information: Capitalization often conveys meaning, and removing it through case folding can result in ambiguity. For instance, "Apple" (the company) and "apple" (the fruit) have distinct meanings, but case folding would treat them as identical. This loss of differentiation can negatively affect tasks like named entity recognition (NER) or sentiment analysis, where proper nouns and specific terms are significant.

  • Challenges in domain-specific texts: In domains like legal or medical texts, capitalization may follow specific conventions to denote importance, abbreviations, or categories. For example, "HIV" (the virus) and "hiv" (a casual or misspelled term; e.g., a misspelling of "hive") could convey very different contexts. Case folding could lead to misinterpretation or reduced precision in such cases.

  • Issues with acronyms and initialisms: Acronyms and initialisms like "NASA" and "nasa" might lose their emphasis or context when converted to lowercase. In some applications, especially those involving technical or formal documents, this can result in decreased readability or unintended changes in meaning.

  • Impact on style and formatting: Case folding might strip text of stylistic or formatting nuances important for specific tasks, such as analyzing social media posts, where capitalization might indicate emphasis (e.g., "THIS is important" vs. "this is important").

To mitigate these risks, it is crucial to consider the context and goals of the task before applying case folding. In scenarios where capitalization carries important meaning, additional preprocessing steps or hybrid approaches that selectively apply case folding may be necessary to retain essential information while standardizing the text.
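
To illustrate one such hybrid approach, the code below is a minimal sketch of selective case folding that lowercases tokens unless they look like acronyms or initialisms (two or more characters, all uppercase). The heuristic and the helper name selective_lower() are assumptions made for illustration, not a fixed recipe.

def selective_lower(token):
    # Keep likely acronyms/initialisms (e.g., "NASA", "HIV", "AI") unchanged
    if len(token) >= 2 and token.isupper():
        return token
    return token.lower()

tokens = ["The", "NASA", "Mission", "Was", "Delayed"]

print([ selective_lower(token) for token in tokens ])
# ['the', 'NASA', 'mission', 'was', 'delayed']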


Punctuation & Non-Word Removal¶

For fundamental NLP tasks such as text classification, it is typically the "normal" words — in crude terms, every token that can be found in a standard dictionary — that matter most. In contrast, punctuation marks, URLs, email addresses, and specific numbers often carry hardly any meaning and could even be considered noise depending on the task. For example, consider the task of classifying to which news category an article belongs (e.g., politics, economy, sports, technology, weather, etc.). Words such as "match", "player", "ball", "goal", "halftime" and similar are much more indicative that an article talks about sports than a specific date or number/digit (representing the result of a game). Thus, to allow a classifier to focus on the most relevant parts of a document, punctuation marks and other non-words are often simply removed.

Application¶

In general, the decision whether and which types of tokens should be removed depends on the application use case. In the code cell below, we make the simple assumption that we want to remove all tokens that are not completely composed of letters. Or simply speaking, we want to remove each token that is not a word. The isalpha() method in Python is a string method that checks if all the characters in a string are alphabetic (letters of the alphabet) and returns True if they are, and False otherwise. It does not accept numbers, spaces, or special characters, and the string must contain at least one character for it to return True. For example, "hello".isalpha() would return True, whereas "hello123".isalpha() or "".isalpha() would return False. This method is commonly used to validate input or filter strings for purely alphabetic content, and makes our task very simple.

In [4]:
tokens = ["Yesterday", "@", "9.30", "pm", ",", "the", "match", "ended", "0:0", "--", "all", "players", "left", "dissappointed", "."]

def is_invalid_token(token):
    if token.isalpha() is False:
        return True
    return False

print([ token for token in tokens if is_invalid_token(token) is False ])
#print([ token for token in tokens if token.isalpha() is True ]) # Simpler but less flexible
['Yesterday', 'pm', 'the', 'match', 'ended', 'all', 'players', 'left', 'dissappointed']

For our simple example, the method is_invalid_token() is not really needed as it only wraps the isalpha() method. However, it should be easy to see how is_invalid_token() can be extended to include other, and potentially more intricate, conditions to test if a token is invalid and should therefore be removed. Since basic token categories such as words, numbers, and punctuation marks are common filter criteria, spaCy automatically derives the following attributes for each token in a text when using the default analysis pipeline:

  • is_alpha: Token text consists of alphabetic characters
  • is_digit: Token text consists of digits
  • is_punct: Token is a punctuation character

Using these three attributes, we can easily implement the removal of punctuation, numbers, and everything else that is not a "normal" word:

In [5]:
text = "Yesterday @ 9.30 pm, the match ended 0:0 -- all players left dissappointed."

print([ token.text for token in nlp(text) if token.is_alpha == True and token.is_punct == False and token.is_digit == False ])
['Yesterday', 'pm', 'the', 'match', 'ended', 'all', 'players', 'left', 'dissappointed']
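
Beyond these three attributes, spaCy also provides flags such as like_url and like_email, which can be used to also drop the URLs and email addresses mentioned above as typical non-words. The following is a minimal sketch, assuming the same nlp pipeline object as before; the example sentence is made up for illustration.

text = "Contact test@example.com or visit https://example.com for the 2024 schedule."

def keep_token(token):
    # Drop punctuation, digits, URLs, and email addresses
    return not (token.is_punct or token.is_digit or token.like_url or token.like_email)

print([ token.text for token in nlp(text) if keep_token(token) ])
# Roughly: ['Contact', 'or', 'visit', 'for', 'the', 'schedule']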

Discussion¶

The removal of punctuation and non-words is typically meaningful for Natural Language Processing (NLP) tasks where the focus is on analyzing the semantic or syntactic meaning of text without the noise introduced by punctuation or irrelevant symbols. The two most common scenarios are, for one, text classification tasks, where removing non-words helps models focus on meaningful features such as word frequencies. The second common application use case is search and information retrieval, where non-words and punctuation may interfere with query matching or ranking algorithms, so preprocessing often involves their removal.


Stopword Removal¶

Stopwords are common words in a language that are often filtered out in text preprocessing because they carry little to no significant meaning for text analysis or modeling tasks. Examples of stopwords in English include "a", "an", "the", "and", "in", "on", "of", and "to". These words are ubiquitous and usually do not contribute much to the context or semantics of the text being analyzed. There are various reasons why stopwords are removed:

  • Reduce noise in text: Stopwords often clutter text data without adding meaningful information. For example, in the sentence "The cat is on the mat", the significant words for analysis are likely "cat" and "mat," while "the", "is", and "on" provide grammatical structure but little semantic value. Removing stop words helps focus on the core content.

  • Improve computational efficiency: Text data can be large, and processing irrelevant words increases computational overhead. Removing stopwords reduces the size of the text corpus and simplifies subsequent steps, like tokenization and vectorization, thereby speeding up the analysis or training process.

  • Enhance model performance: Machine learning models can become distracted by irrelevant features. For instance, including stopwords in text classification or sentiment analysis may dilute the importance of meaningful words, leading to less accurate predictions. Removing stopwords ensures the model focuses on content that carries actual weight.

  • Reduce dimensionality: Stopwords are often among the most frequent tokens in a dataset. By eliminating them, the dimensionality of text representations, such as term frequency-inverse document frequency (TF-IDF) matrices or word embeddings, is reduced. This makes the data less sparse and easier to analyze.

  • Improve clarity in information retrieval: In search engines or recommendation systems, stopwords can lead to irrelevant matches if retained. For instance, searching for "history of computers" might yield more relevant results when stopwords like "of" are removed, leaving "history" and "computers" as the primary keywords.

Application¶

Once we have decided which words to consider stopwords, their removal is rather straightforward to implement, particularly if we assume that an input text has already been tokenized. In this case, we only need to check whether each token is in the predefined set of stopwords. However, using regular expressions, removing stopwords directly from a text string is equally straightforward. For the following example, we define a very small list of stopwords and also assume a simple sentence from which we want to remove the stopwords.

In [6]:
stopwords = ["a", "an", "the", "not", "and", "or", "but", "to"]

text = "Alice and Bob went to KFC but did not eat anything"

We can now define a Regular Expression that matches any of the predefined stopwords. For this we can make use of the \b anchor. This anchor in Regular Expressions represents a word boundary. It matches the position between a word character (letters, digits, or underscore: [a-zA-Z0-9_]) and a non-word character (anything else, such as spaces, punctuation, or the start/end of a string). It does not consume characters itself but ensures that a match occurs only at the boundary of a word. For example, the pattern \bcat\b matches the word "cat" when it stands alone (e.g., in "cat" or "the cat is here"), but not in "catch" or "scatter". This makes the \b anchor particularly useful for tasks such as searching for whole words, enforcing word-level constraints, or tokenizing text. So let's remove the stopwords from the example sentence; see the code cell below. Notice that we also require some minor additional cleaning steps to remove unnecessary whitespace that may occur after removing stopwords. Depending on the next preprocessing steps, such a cleaning of the string might not be needed.

In [7]:
pattern = r"|".join([ r"\b({})\b".format(w) for w in stopwords ])

text = re.sub(pattern, r"", text, flags=re.I)  # Remove stopwords
text = re.sub(r"\s+", r" ", text)              # Remove duplicate whitespace (introduced by removing stopwords)
text = text.strip()                            # Remove trailing whitespace (needed if the string started or ended with a stopword)

print(text)
Alice Bob went KFC did eat anything

Of course, we can also show the removal of stopwords assuming the text was already tokenized, making it even simpler.

In [8]:
tokens = [ token for token in text.split() if token not in stopwords ]

print(tokens)
['Alice', 'Bob', 'went', 'KFC', 'did', 'eat', 'anything']

The consideration whether a token is a stopword or not is common for many NLP tasks. As such, libraries such as spaCy often include this annotation of tokens as part of the analysis. For example, spaCy derives for each token the Boolean attribute is_stop that specifies if a token is a stopword or not. Keep in mind, however, that under the hood, spaCy also assumes a predefined list of stopwords for this analysis step. Thus, using spaCy, we can remove the stopwords from our example sentence as follows:

In [9]:
doc = nlp(text)

tokens = [ token.text for token in doc if token.is_stop == False ]

print(tokens)
['Alice', 'Bob', 'went', 'KFC', 'eat']

Discussion¶

Removing stop words is a common normalizing step, but it comes with important caveats that can impact the quality of the analysis or model performance. These caveats highlight situations where stopwords may carry significant meaning or where their removal could lead to unintended consequences.

  • Loss of semantic meaning: Stopwords, though seemingly insignificant, can be crucial for conveying relationships or context in text. For instance, in idiomatic expressions like "out of the blue" or "in the dark", the stopwords are essential to the meaning. Similarly, in sentiment analysis, words like "not" are sometimes treated as stopwords, but their removal could invert the sentiment of a phrase (e.g., "not happy" vs. "happy").

  • Domain-specific importance: In certain domains, stopwords might carry valuable information. For example, in legal and financial texts, words like "and" or "or" can denote critical logical relationships (e.g., in a contract, "A and B" implies something different from "A or B"). In biomedical or scientific texts, articles like "the" might specify a unique entity or subject, and their removal could change the context or make the text ambiguous.

  • Phrase-level context: Removing stop words may disrupt the structure of multi-word terms or phrases, potentially reducing the ability to understand the text. For example, the bigram "New York" might lose its meaning if "New" is removed as a stopword, leaving just "York". Similarly, in "The University of Singapore", removing "of" and "the" might reduce clarity or mislead downstream tasks.

  • Impact on grammatical integrity: Stop words contribute to the grammatical structure of sentences. Removing them can make sentences harder to parse and interpret, especially in tasks like text summarization or machine translation where sentence fluency is important.

The decision whether to remove or retain stopwords comes down to whether stopwords are likely to play a critical role for the specific NLP task or application, whether by assumption, expectation, or established fact. For example, in the case of basic text classification, the presence or absence of discriminative words typically matters more than grammatical integrity. Here, stopwords can typically be safely removed, and doing so may even improve the performance of the task. However, particularly for tasks where "all words matter" (e.g., text summarization, machine translation, question answering), stopwords matter too and should therefore not be removed. In summary, while removing stopwords can streamline NLP tasks, it is essential to carefully evaluate their importance based on the context, domain, and goals of the analysis. An overly aggressive approach to stopword removal can lead to loss of information or misinterpretation.

To give a concrete example, look again at our example sentence "Alice and Bob went to KFC but did not eat anything". After stopword removal, we were left with "Alice Bob went KFC eat". Depending on the exact use case, the meaning of the sentence may have been significantly altered, as it might have been important to capture that Alice and Bob did not actually eat there; see "Loss of semantic meaning" above. By removing words/tokens such as "not", "n't", "never", "neither", etc., we run the risk of losing the linguistic phenomenon of negation. Again, whether this is critical or not depends on the task or application. It should be easy to see that applications such as sentiment analysis will be negatively affected if negation is not properly captured. A common solution is therefore to use a customized stopword list: for example, start with a common stopword list but remove all words that might indicate negation; a minimal sketch using spaCy is shown below.
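
The following is a minimal sketch, assuming the same nlp pipeline object as in the examples above, of how negation words can be excluded from spaCy's stopword handling so that they survive the filtering step. The set of negation words below is an illustrative choice, not an exhaustive list.

negations = {"not", "no", "never", "neither", "nor"}

# Tell spaCy that these lexemes should not be flagged as stopwords
for word in negations:
    nlp.vocab[word].is_stop = False

doc = nlp("Alice and Bob went to KFC but did not eat anything")

print([ token.text for token in doc if token.is_stop == False ])
# The negation is now retained, e.g.: ['Alice', 'Bob', 'went', 'KFC', 'not', 'eat']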


Stemming and Lemmatization¶

Consider the following two sentences:

  • "Dogs make the best friends."
  • "A dog makes a good friend."

Semantically, both sentences are essentially conveying the same message, but syntactically they are very different since the vocabulary is different: "Dogs" vs. "dog", "make" vs. "makes", "friends" vs. "friend". This is a big problem when comparing documents or when searching for documents in a database. For example, when one uses "dog" as a search term, both sentences should be returned and not just the second one. Stemming and lemmatization are two common techniques used in natural language processing (NLP) for text normalization. Both methods aim to reduce words to their base or root forms, but they differ in their approaches and outcomes.

  • Stemming: Stemming is a process of reducing words to their "stems" by removing prefixes and suffixes, typically through simple heuristic rules. The resulting stems may not always be actual words. The goal of stemming is to normalize words that have the same base meaning but may have different inflections or variations. For example, stemming the words "running", "runs", and "runner" would result in the common stem "run". A popular stemming algorithm is the Porter stemming algorithm.

  • Lemmatization: Lemmatization, on the other hand, is a more advanced technique that aims to transform words to their "lemmas," which are the base or dictionary forms of words. Lemmatization takes into account the morphological analysis of words and considers factors such as part-of-speech (POS) tags to determine the correct lemma. The output of lemmatization is usually a real word that exists in the language. For example, lemmatizing the words "running" and "runs" would yield the lemma "run", assuming that both words are used as verbs in a given sentence; "runner" is a noun and would not be lemmatized to "run". Lemmatization requires more linguistic knowledge and often relies on dictionaries or language-specific resources.

Both stemming and lemmatization are methods to normalize documents on a syntactic level. Often the same words are used in different forms depending on their grammatical use in a sentence. The choice between stemming and lemmatization depends on the specific NLP task and its requirements. Stemming is a simpler and faster technique, often used when the exact word form is not critical, such as in information retrieval or indexing tasks. Lemmatization, being more linguistically sophisticated, is preferred in tasks where the base form and the semantic meaning of words are important, such as in machine translation, sentiment analysis, or question-answering systems. It's also important to note that stemming and lemmatization may not always produce the same results, and the choice between them should consider the trade-offs between accuracy and computational complexity.
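
As a point of reference before we turn to lemmatization with spaCy below, the following is a minimal sketch of stemming using NLTK's Porter stemmer. It assumes the nltk package is installed; nltk is not part of this notebook's required imports.

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

for word in ["running", "runs", "runner", "universal", "universe"]:
    print(word, "->", stemmer.stem(word))
# "running" and "runs" are both reduced to the stem "run"; note that the
# resulting stems are not guaranteed to be real dictionary words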

Application¶

Compared to, say, stopword and punctuation removal, stemming and lemmatization are less trivial text normalization tasks, and we cover them in a separate notebook. However, due to their importance, many, if not most, existing NLP libraries support either stemming, lemmatization, or both out of the box. For example, spaCy performs lemmatization as part of its default analysis pipeline to map each word/token to its respective lemma. The lemma_ attribute of a spaCy token provides the lemmatized form of the token, which is its base or dictionary form. In spaCy, lemma_ is a string, while lemma gives the corresponding hash integer value. This makes lemmatizing a text very simple:

In [10]:
print([ token.lemma_ for token in nlp("Dogs make the best friends.") ])
print([ token.lemma_ for token in nlp("A dog makes a good friend.") ])
['dog', 'make', 'the', 'good', 'friend', '.']
['a', 'dog', 'make', 'a', 'good', 'friend', '.']

To further improve the example, we can also combine lemmatization with stopword and punctuation removal as seen before:

In [11]:
print([ token.lemma_ for token in nlp("Dogs make the best friends.") if token.is_stop is False and token.is_punct is False ])
print([ token.lemma_ for token in nlp("A dog makes a good friend.") if token.is_stop is False and token.is_punct is False ])
['dog', 'good', 'friend']
['dog', 'make', 'good', 'friend']

Notice how our two example sentences, which were syntactically very different in the beginning, are now much more similar and appropriately reflect the similar (and almost identical) semantics.

Discussion¶

Stemming and lemmatization can greatly reduce variation in documents, as variations caused by different tenses of verbs or singular/plural forms of nouns are very common. Still, like most common text normalization steps, stemming and lemmatization only affect variations with respect to the morphology and syntax of words and not variation due to synonymy. For example, the sentence "Canines are inclined to be excellent companions." arguably conveys the same meaning as "Dogs make the best friends." However, their canonical forms after lemmatization (and stopword/punctuation removal) would still be very different. When deciding whether to use stemming or lemmatization, the application context is a critical consideration. Stemming is computationally faster and might suffice for tasks where precision in word meaning is less critical, such as search engine indexing or topic modeling. However, the crudeness of stemming can lead to errors, such as treating "universal" and "universe" as equivalent. Lemmatization, being more sophisticated, is suited for applications requiring linguistic accuracy, like sentiment analysis or machine translation, but it is computationally more intensive and requires access to a lexicon or Part-of-Speech (POS) tagger.

There are scenarios where stemming or lemmatization might not be necessary or beneficial. For example, in applications like named entity recognition or when analyzing specific domains like legal or medical texts, reducing terms to their root forms could remove critical nuances. Similarly, if the corpus relies on inflections or derivations to convey meaning, such as poetry or language with complex morphological rules, stemming or lemmatization could distort the interpretation. In such cases, preserving the original word forms might be more appropriate. Another consideration is language diversity. Stemming and lemmatization tools are typically designed for specific languages and may perform poorly with multilingual corpora. Additionally, for embeddings or transformer-based models, these techniques are often unnecessary because such models process tokens in their original forms and derive contextual meaning directly. Thus, the decision to apply stemming or lemmatization should be guided by the specific requirements and constraints of the NLP task at hand.


Unicode Variants¶

Unicode is a universal character encoding standard designed to ensure that text and symbols from all the world's writing systems can be consistently represented, processed, and displayed across different platforms and devices. Developed by the Unicode Consortium, it assigns a unique code point (a numeric identifier) to every character, symbol, or glyph, regardless of the platform, program, or language. This eliminates ambiguities and inconsistencies caused by earlier encoding systems like ASCII, which were limited to specific languages or character sets.

A Unicode character refers to any textual or symbolic element encoded in the Unicode standard, ranging from letters, numbers, and punctuation marks to emojis, mathematical symbols, and characters from ancient scripts. For instance, the English letter "A" has the Unicode code point U+0041, while the emoji 😊 has U+1F60A. These code points are typically represented in hexadecimal notation, and Unicode supports over 149,000 characters as of its latest version, covering more than 150 scripts. Unicode's broad scope and standardization have made it indispensable for modern computing, enabling seamless text exchange and display across diverse languages and applications. Whether you're reading an email in Japanese, viewing a webpage with Arabic text, or sending emojis in a message, Unicode ensures consistent interpretation and rendering of characters.

However, Unicode also poses challenges when working with text data. For example, some Unicode characters look very similar because the Unicode standard aims to encode every character from all the world's writing systems, including those with overlapping visual designs. This can lead to the inclusion of characters that are nearly indistinguishable in appearance but are distinct in their linguistic, cultural, or technical usage. To give a simple example, have a look at the following three Unicode characters:

In [12]:
print("\U00000027") # Apostrophe (equivalent to ASCII character)
print("\U00002019") # Right Single Quotation Mark
print("\U000002BC") # Modifier Letter Apostrophe
'
’
ʼ

All three characters can be used to represent an apostrophe. The Unicode Standard 16.0, Section 6.2.7 Apostrophes, actually states:

U+0027 APOSTROPHE is the most commonly used character for apostrophe. For historical reasons, U+0027 is a particularly overloaded character. In ASCII, it is used to represent a punctuation mark (such as right single quotation mark, left single quotation mark, apostrophe punctuation, vertical line, or prime) or a modifier letter (such as apostrophe modifier or acute accent). [...] When text is set, U+2019 RIGHT SINGLE QUOTATION MARK is preferred as apostrophe. [...] U+02BC MODIFIER LETTER APOSTROPHE is preferred where the apostrophe is to represent a modifier letter (for example, in transliterations to indicate a glottal stop). [...] An implementation cannot assume that users' text always adheres to the distinction between these characters. [...]

And this is just for a single character, the apostrophe; the situation is very similar for many other characters such as hyphens/dashes, punctuation marks, letters, and more.

Another issue is that the same — that is, same-looking — character may be represented by different Unicode code points. For example, the German umlaut "ä" has its own single code point U+00E4. However, Unicode also supports multi-code point characters. These are sequences of multiple code points that together represent a single visual or semantic unit. While many characters are represented by a single code point, others require multiple code points to fully encode. A common example of using multiple code points are diacritics. Diacritics are small marks or symbols added to letters to modify their pronunciation, tone, or meaning. They are commonly used in many languages to indicate aspects like vowel quality, stress, intonation, or nasalization. For example, the acute accent ("é"), grave accent ("è"), and umlaut ("ä") are diacritics that alter how the base letter is pronounced. Diacritics can also serve grammatical purposes, such as distinguishing homonyms (e.g., "resume" vs. "résumé") or marking emphasis. In Unicode, diacritics are often represented as combining characters that attach to a base letter, allowing for a flexible representation of diverse scripts. For example, the umlaut "ä" can be encoded by combining the code point for "a" and the code point for the diacritic representing the two dots above a character. Thus, we can print the umlaut "ä" in two different ways:

In [13]:
print("\U000000E4")
print("\U00000061\U00000308")
ä
ä

Application¶

Converting different Unicode characters that essentially have the same appearance to a canonical form typically requires some hand-crafted mapping, since there is no single agreed-upon convention. Such a mapping can be done by creating and curating a dictionary that maps Unicode code points to their canonical form. For example, the code cell below shows a simple solution that maps the two code points for the umlaut "ä" to the ASCII character "a".

In [14]:
unicode_map1 = {"\U000000E4": "a", "\U00000061\U00000308": "a"}

text = u"Der Bär lässt sich ärgern" # German for "The bear let itself get taunted" (silly; just to have multiple umlauts)

print(''.join(idx if idx not in unicode_map1 else unicode_map1[idx] for idx in text))
Der Bar lasst sich argern

Given the vast size of the Unicode standard, compiling such a dictionary for all (relevant) code points takes a lot of effort. However, there are libraries available to implement such a mapping. For example, the unidecode library in Python is a utility that transliterates Unicode text into ASCII characters. It is particularly useful for converting non-Latin scripts, accented characters, and special symbols into their closest Latin equivalents. For example, "ä" becomes "a", and "你好" becomes "Ni Hao". This is helpful in scenarios like data preprocessing, where ASCII-only text is required, such as in URL generation, search indexing, or working with systems that do not support Unicode. By preserving the readability of the original text as much as possible, unidecode makes it easier to handle multilingual or accented input in ASCII-restricted environments. Let's use this library to convert our German example sentence from above.

In [15]:
print(unidecode(text))
Der Bar lasst sich argern

Discussion¶

While libraries such as unidecode can make life quite simple, they assume that their output is indeed what a given application use case expects or requires. For one, unidecode converts Unicode characters to ASCII characters, but the desired canonical representation of a text may still contain Unicode characters. For example, we could use our mapping simply to map both representations of the same umlaut to the same code point:

In [16]:
unicode_map2 = {"\U000000E4": "\U000000E4", "\U00000061\U00000308": "\U000000E4"}

print(''.join(idx if idx not in unicode_map2 else unicode_map2[idx] for idx in text))
Der Bär lässt sich ärgern

But we might also want to map Unicode characters to different characters or sets of characters. For example, in German, the umlaut "ä" is commonly replaced with the digraph "ae" if only the Latin alphabet is available. This means that a German using an English keyboard would write the example sentence from before as "Der Baer laesst sich aergern". Of course, with a custom, manually curated dictionary for the mapping, this is simple to implement, e.g.:

In [17]:
unicode_map3 = {"\U000000E4": "ae", "\U00000061\U00000308": "ae"}

print(''.join(idx if idx not in unicode_map3 else unicode_map3[idx] for idx in text))
Der Baer laesst sich aergern

To sum up, working with Unicode for text normalization is challenging due to the vast diversity of scripts, languages, and character representations it supports. Unicode is designed to be a universal encoding system, but this universality introduces complexities. A single character can often be represented in multiple ways which look identical but have different underlying code points. Ensuring that these representations are treated equivalently requires normalization, a process that can be computationally intensive and error-prone when dealing with large datasets or multiple languages. Mistakes in choosing the appropriate normalization form can lead to inconsistencies or data loss. The cultural and linguistic diversity supported by Unicode introduces edge cases that make normalization even harder. Scripts like Arabic, Indic scripts, or Chinese may involve context-dependent rendering, bidirectional text, or complex glyph shaping, all of which can complicate the process. In practice, text normalization requires not only technical expertise in Unicode standards but also awareness of the linguistic nuances involved, making it a multifaceted challenge in software development.
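
To make the notion of normalization forms mentioned above concrete, the following is a minimal sketch using Python's standard unicodedata module: applying normalization form NFC composes the two-code-point spelling of "ä" from earlier into the single precomposed code point, so both spellings compare as equal afterwards.

import unicodedata

single   = "\U000000E4"            # precomposed "ä"
combined = "\U00000061\U00000308"  # "a" + combining diaeresis

print(single == combined)          # False: different code point sequences
print(unicodedata.normalize("NFC", single) == unicodedata.normalize("NFC", combined))  # True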

Lastly — beyond letters, punctuation marks, diacritics, and similar — the Unicode standard also includes emojis. Emojis are pictorial symbols used in digital communication to express emotions, ideas, objects, or actions. Each emoji is defined in the Unicode standard, with many represented as single code points, while others are formed using multiple code points, often combined with modifiers for skin tone, gender, or family compositions. Emojis add context, tone, and personality to messages, making them more engaging and expressive. While emojis are part of the Unicode standard, due to their special meanings, we discuss their potential normalization separately later.


Handling Non-Standard Words¶

Internet Slang¶

Internet slang refers to informal, non-standardized language used in online communication, such as instant messaging, online forums, social media, and other digital platforms. It evolves rapidly and reflects the casual, fast-paced nature of digital communication. Internet slang often includes abbreviations, acronyms, and creative spellings that help users express themselves quickly and efficiently. Common examples are "gr8 (great)", "yolo (you only live once)", "brb (be right back)", "omg (oh my god)", and many more. Given the popularity of Internet slang in online conversations, it is perfectly valid to treat such words "as is" without any form of normalization, even if they are not (yet!) in a standard English dictionary. This is particularly true if the text is used to train a new model based on this larger vocabulary. After all, to any algorithm "gr8" is just as much a word as "great"; the vocabulary is just slightly inflated.

However, the size of the vocabulary might be an issue, or we might have to rely on a model that has been trained with a vocabulary that does not contain Internet slang, in which case the slang terms would need to be treated as out-of-vocabulary (OOV) terms. In this situation, it can be meaningful to normalize slang words by converting them into their corresponding standard words — that is, to convert the OOV terms to terms that are (more likely to be) in the vocabulary.

Application¶

The most common approach to handle slang words is the use of dictionaries or lexicons. Such precompiled slang dictionaries map slang terms to their standard equivalents (e.g., "brb" $\rightarrow$ "be right back", or "idk" $\rightarrow$ "I don't know"). Given the popularity of Internet slang, various such dictionaries or lexicons are available online. In the following, we will be using a simple dictionary in the form of a csv file with 5,300 entries. The code cell below loads this file into a pandas DataFrame.

In [18]:
df_slang = pd.read_csv(slang_dictionary)

df_slang.head(8)
Out[18]:
slang translation
0 *4u kiss for you
1 *67 unknown
2 *eg* evil grin
3 07734 hello
4 0day software illegally obtained before it was rele...
5 0noe oh no
6 0vr over
7 10q thank you

In most cases, the "translation" of a slang term is the word or phrase it stands for. However, there are examples such as "0day", where the translation is a definition or explanation of the slang term. Normalizing a slang term with such a definition is arguably not perfectly suitable. However, these examples are rather uncommon, so we simply ignore them here. In short, normalizing Internet slang simply means replacing the slang term with the translation given by the dictionary file. To simplify the implementation, we first convert the DataFrame into a Python dictionary with the keys being the slang terms and the values their corresponding translations.

In [19]:
slang_dict = df_slang.set_index('slang')['translation'].to_dict()

print(slang_dict["brb"])
be right back

Next we define a simple auxiliary method translate_slang() which captures two considerations to help us with the normalization. Firstly, we do not care whether the slang term is uppercase, lowercase, or anything in between, particularly since all keys in the dictionary are lowercase. The method therefore checks if the lowercase version of the input term is in the dictionary or not. And secondly, since not all (or even most) words/terms in a text are slang terms, we need to handle the case where we try to translate a term that is not in the dictionary. In the method, we achieve this by simply using a try ... except ... block when accessing the dictionary. If the access fails due to the key not being found, we return the original input word.

In [20]:
def translate_slang(word, slang_dict):
    try:
        # Look up the lowercased term (all keys in the dictionary are lowercase)
        return slang_dict[word.lower()]
    except KeyError:
        # Not a known slang term; return the input word unchanged
        return word
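
As a side note, the same behavior can be expressed more compactly with the dictionary's get() method, which returns a default value (here, the original word) when the key is missing; this is just an equivalent alternative to the try/except variant above.

def translate_slang_get(word, slang_dict):
    # Equivalent to translate_slang(), using dict.get() with a default value
    return slang_dict.get(word.lower(), word)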

Let's apply this method to a couple of example words/terms.

In [21]:
print(translate_slang("brb", slang_dict))
print(translate_slang("omg", slang_dict))
print(translate_slang("gn8", slang_dict))
print(translate_slang("car", slang_dict))
print(translate_slang("bus", slang_dict))
be right back
oh my god
good night
car
bus

In the examples above, "brb", "omg", and "gn8" are Internet slang terms and part of the dictionary. They therefore get translated into their respective proper word or phrase. In contrast, "car" and "bus" are not valid keys in the slang dictionary, and as such do not get normalized. We can now use this method and apply it to a complete input text. It is easy to see that this is much more convenient if the text is already tokenized.

In [22]:
tokens = ["Good", "joke", "lol", "!", "The",  "delivery",  "was", " crazy", "OMG"]

print([ translate_slang(token, slang_dict) for token in tokens ])
['Good', 'joke', 'laughing out loud', '!', 'The', 'delivery', 'was', ' crazy', 'oh my god']

The only additional consideration is that an individual slang term might be translated into multiple words. This means that the resulting list is no longer a proper token list. However, this can easily be handled using the flatten() method provided by the pandas library. This method takes in an arbitrarily nested list and returns a flat list of all the items. The only change to the initial call of translate_slang() is to split the output to get a list of individual words. The resulting list of lists can then be given to the flatten() method. The code cell below shows these two steps for our tokenized example sentence.

In [23]:
tokens_normalized = [ translate_slang(token, slang_dict).split() for token in tokens ]
print(tokens_normalized)

tokens_normalized = list(pd.core.common.flatten(tokens_normalized))
print(tokens_normalized)
[['Good'], ['joke'], ['laughing', 'out', 'loud'], ['!'], ['The'], ['delivery'], ['was'], ['crazy'], ['oh', 'my', 'god']]
['Good', 'joke', 'laughing', 'out', 'loud', '!', 'The', 'delivery', 'was', 'crazy', 'oh', 'my', 'god']

Discussion¶

One main issue with normalizing Internet slang terms by translating them into "normal" words is that a term/word in the slang dictionary might in fact be a valid token in itself. For example, "4" is often used as a short form for the word "for". However, a "4" in a sentence might very well refer to the actual digit, e.g., "I ate 4 pieces of cake.". Converting this sentence to "I ate for pieces of cake." would alter the meaning of the sentence. Here are some more examples, where "F8" may refer to the function key on a keyboard, and "GM" to the company General Motors.

In [24]:
print(translate_slang("4", slang_dict))
print(translate_slang("F8", slang_dict))
print(translate_slang("GM", slang_dict))
for
fate
good morning

Although not many slang terms can also be considered proper words, the use of numbers/digits or greetings such as "GM" for "good morning" is very common in online conversations. The most straightforward solution is to manually curate the slang dictionary to remove such ambiguous instances. More general approaches might use simple heuristics that always ignore numbers/digits when normalizing tokens; a minimal sketch of such a heuristic is shown below.
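
The following is a minimal sketch of such a heuristic: purely numeric tokens are never translated, so a literal "4" stays a digit. The helper name translate_slang_safe() is illustrative and not part of the repository code.

def translate_slang_safe(word, slang_dict):
    # Heuristic: never translate purely numeric tokens
    if word.isdigit():
        return word
    return slang_dict.get(word.lower(), word)

print(translate_slang_safe("4", slang_dict))    # '4' is kept as a number
print(translate_slang_safe("brb", slang_dict))  # 'be right back'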

Emojis¶

Emojis are small digital images or icons used to express emotions, ideas, or concepts in text-based communication. Originating in Japan in the late 1990s, they have become a universal feature of online interaction, ranging from social media posts to messaging apps. Emojis add a layer of non-verbal communication, allowing users to convey tone, mood, or feelings that might otherwise be ambiguous in written text. For instance, a 😊 (smiling face) can indicate happiness, while a 😢 (crying face) can convey sadness, enhancing the emotional context of a message.

In sentiment analysis, emojis play a crucial role because they act as direct indicators of a user's emotional state or intent. Sentiment analysis involves analyzing text to determine the sentiment it expresses, such as positive, negative, or neutral. Emojis help enrich this analysis by providing explicit emotional cues. For example, a tweet containing a thumbs-up emoji (👍) likely suggests approval or positivity, whereas one with an angry face emoji (😠) may indicate dissatisfaction. By considering emojis, sentiment analysis models can achieve greater accuracy in interpreting the nuances of human communication.

Emojis can also bridge gaps in sentiment analysis where text alone might be ambiguous. For instance, the phrase "great" could be sarcastic or genuine, depending on the accompanying emoji. "Great 🙄" suggests annoyance or sarcasm, while "Great 😊" indicates enthusiasm. Thus, incorporating emoji analysis into sentiment analysis allows for a more precise understanding of the emotional context and intent behind a message.

Analyzing emojis poses its challenges, however, as their meanings can vary depending on cultural context, personal usage, or the platform displaying them. For example, the 🙏 emoji might be interpreted as a prayer gesture in some cultures but as a thank-you or high-five in others. Despite these complexities, advancements in NLP and machine learning have enabled models to account for such variability, improving the reliability of sentiment analysis.

Application¶

By now, the official list of Unicode code points representing emojis contains 3,790 entries (Unicode 16.0). The vast majority of them are typically not used to express a certain mood or sentiment (e.g., animals, cars, flags, everyday objects, etc.). However, some of the most popular emojis are face emojis that people use to express happiness, sadness, anger, joy, and so on. When building a sentiment analysis system, such emojis can therefore be extremely useful. On the other hand, it is typically not needed to distinguish between, say, 😀 and 😊. It is mainly important that both these faces indicate a positive mood or emotion. We can therefore treat the normalization of emojis as converting them to predefined placeholder terms ("[EMOJI+]" for positive emojis, "[EMOJI0]" for neutral emojis, and "[EMOJI-]" for negative emojis). For this, we provide a csv file that contains this label for each covered emoji.

In [25]:
df_emojis = pd.read_csv(emoji_mapping)

df_emojis.head()
Out[25]:
CODE_POINT EMOJI LABEL
0 1F600 😀 [EMOJI+]
1 1F603 😃 [EMOJI+]
2 1F604 😄 [EMOJI+]
3 1F601 😁 [EMOJI+]
4 1F606 😆 [EMOJI+]

Note that this file only includes the most popular face emojis. In principle, one can maintain such a file for all existing emojis, but it has already been argued that most emojis are not really used to express any sentiment. While we could always map such cases to "[EMOJI0]", it is typically perfectly fine to simply remove them.

Like for the slang terms, let's first create a Python dictionary to easily find the label for a given emoji — that is, the keys are the emojis and the values are the respective labels.

In [26]:
emoji_dict = df_emojis.set_index("EMOJI")["LABEL"].to_dict()

print(emoji_dict["😆"])
[EMOJI+]

And again, more for convenience, we create a small auxiliary method translate_emoji() that returns the label if the emoji is in the dictionary, and returns the emoji "as is" otherwise.

In [27]:
def translate_emoji(emoji, emoji_dict):
    try:
        # Look up the sentiment label for the emoji
        return emoji_dict[emoji]
    except KeyError:
        # Emoji (or any other token) not in the mapping; return it unchanged
        return emoji

Let's see how it works when applied to individual emojis.

In [28]:
print(translate_emoji("😆", emoji_dict))
print(translate_emoji("🤔", emoji_dict))
print(translate_emoji("🤢", emoji_dict))
print(translate_emoji("🙈", emoji_dict))
[EMOJI+]
[EMOJI0]
[EMOJI-]
🙈

Assuming a tokenized input text, we can now use this method to normalize each token; here is an example:

In [29]:
tokens = ["The", "movie", "was", "😆", ",", "only", "the", "ending", "was", "😩", "🙈", "."]

print([ translate_emoji(token, emoji_dict) for token in tokens ])
['The', 'movie', 'was', '[EMOJI+]', ',', 'only', 'the', 'ending', 'was', '[EMOJI-]', '🙈', '.']

Since the method translate_emoji() returns the same emoji if it was not found in the dictionary, the normalized token list may still contain emojis, as in the example above. If this is a problem and removing such emojis is the preferred output, we can easily accomplish this by first converting all Unicode characters to ASCII — which will fail for the emojis and return an empty string — and then removing all empty strings from the token list. The code cell below implements all steps to get the final normalized token list.

In [30]:
tokens_normalized = [ translate_emoji(token, emoji_dict) for token in tokens ]
print(tokens_normalized)

tokens_normalized = [ token.encode("ascii", "ignore").decode("utf-8") for token in tokens_normalized ]
print(tokens_normalized)

tokens_normalized = [ token for token in tokens_normalized if token.strip() != "" ]
print(tokens_normalized)
['The', 'movie', 'was', '[EMOJI+]', ',', 'only', 'the', 'ending', 'was', '[EMOJI-]', '🙈', '.']
['The', 'movie', 'was', '[EMOJI+]', ',', 'only', 'the', 'ending', 'was', '[EMOJI-]', '', '.']
['The', 'movie', 'was', '[EMOJI+]', ',', 'only', 'the', 'ending', 'was', '[EMOJI-]', '.']

Discussion¶

Normalizing emojis can significantly enhance the accuracy and depth of sentiment analysis because emojis often convey strong emotional, contextual, or relational cues. Emojis are widely used in informal communication, especially on social media platforms, to express emotions succinctly. For example, a smiling face 😊 typically conveys positivity, while a crying face 😢 indicates sadness. Without normalization, sentiment analysis models may fail to recognize these semantic meanings, leading to reduced performance in tasks requiring an understanding of the user's emotional state. Incorporating emojis into sentiment analysis by normalizing them into text labels or embeddings ensures that their meanings are captured effectively. The example shown above is very simple and various extensions are conceivable. For one, we could map emojis to more fine-grained labels, e.g., mapping 😍 to "love" or 😭 to "crying"; a minimal sketch of such a mapping is shown below. Also, the dictionary could include other emojis beyond face emojis that are often used to express some sentiment, mood, or feeling (e.g., ❤️, 👍🏼, 🌞, 🍀).
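
The following is a minimal sketch of such a fine-grained mapping; the emoji-to-label pairs below are illustrative assumptions and are not part of the provided csv file.

fine_grained = {"😍": "love", "😭": "crying", "👍": "approval", "🙄": "annoyance"}

tokens = ["I", "😍", "this", "movie", ",", "but", "the", "ending", "😭"]

print([ fine_grained.get(token, token) for token in tokens ])
# ['I', 'love', 'this', 'movie', ',', 'but', 'the', 'ending', 'crying']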

Emoticons¶

Text emoticons are symbolic representations of facial expressions or emotions created using standard keyboard characters. Originating before the widespread use of graphical emojis, they offer a way to convey emotions, tone, or intent in text-based communication. Examples include ":)" for a smile, ":(" for sadness, and ":D" for excitement or happiness. These simple combinations of punctuation marks and letters add emotional context to messages, making them particularly useful in digital communication, where tone can often be misunderstood. In sentiment analysis, text emoticons are valuable indicators of a user's emotional state and can enhance the interpretation of a text's sentiment. Sentiment analysis involves identifying the emotional tone of a text, such as whether it is positive, negative, or neutral. Emoticons provide explicit cues; for instance, a message with "I got the job! :D" clearly expresses happiness and positivity. By incorporating emoticon detection into sentiment analysis algorithms, analysts can improve the accuracy of their assessments.

One advantage of emoticons is their universality and simplicity, as they rely on a limited set of keyboard characters. Unlike emojis, which can vary in appearance across platforms, emoticons retain a consistent form. This makes them particularly useful in contexts where graphical emojis may not be supported, such as older devices, programming environments, or text-only systems. Moreover, their straightforward nature makes them easier to categorize for sentiment analysis tasks. However, the interpretation of text emoticons is not without challenges. Some emoticons, such as ";)" (winking face), can carry nuanced or context-dependent meanings, indicating sarcasm, humor, or flirtation depending on the context. Additionally, variations like ":-P" (playful or teasing) may not always align perfectly with a binary sentiment classification of positive or negative. To address this, sentiment analysis systems must incorporate contextual understanding and possibly combine emoticon analysis with linguistic cues in the surrounding text.

There are two fundamental types of emoticons, Western-style emoticons and Japanese-style emoticons, which have distinct ways of expressing emotions and facial expressions using text characters. Each style reflects cultural and linguistic preferences, resulting in unique approaches to conveying emotion in digital communication.

  • Western-style emoticons are typically horizontal and require the viewer to tilt their head to interpret them. Examples include ":-)" (smile), ":-(" (frown), and ":-D" (big smile). These emoticons often use basic punctuation marks like colons, semicolons, dashes, and parentheses to represent facial features. They are minimalist and rely on a small set of characters, making them straightforward and universally recognizable. Western emoticons focus primarily on the mouth to convey emotion, such as ":P" for a playful tongue-out expression or ;) for a wink.

  • Japanese-style emoticons, known as kaomoji, are designed to be read upright, and they tend to include a wider range of characters to create more detailed and expressive faces. Examples include "(^^)", "(T_T)", and "(¬¬)". Kaomoji often emphasize the eyes rather than the mouth, as eyes play a significant role in conveying emotion in Japanese culture. These emoticons also use non-standard characters, such as underscores, carets, and Japanese text symbols, to achieve greater variety and expressiveness.

The primary difference lies in their orientation and level of detail. Western emoticons are simpler and rely on a horizontal layout, while Japanese kaomoji are more complex, upright, and visually detailed. This distinction reflects cultural differences in emotional expression and communication style. Western emoticons often focus on efficiency and brevity, whereas kaomoji emphasize creativity and nuance, providing users with a broader emotional palette to communicate their feelings.

Application¶

In principle, we can handle emoticons similarly to emojis (and Internet slang) by collecting a dictionary that maps popular emoticons to some meaningful value or label. For example, the lexicon of the VADER sentiment analysis system, which is also part of the NLTK library, contains a selection of emoticons together with a sentiment score reflecting the average of human-annotated scores between $-4$ and $+4$ (a small lookup sketch is shown after the list below). Such lexicons are essentially required for Japanese-style emoticons. However, many if not most Western-style emoticons have a well-defined structure. At a minimum, these emoticons use two characters to represent the eyes and the mouth, respectively (e.g., ":(" or ";)"). Two optional characters may be used to represent a nose or a top (e.g., hat, hair, furrowed brows), respectively (e.g., ">:o(" or "{:p"). Since the choice of characters for each part of the emoticon (mouth, nose, eyes, top) is typically limited, we can

  • Identify (almost) arbitrary emoticons based on pattern matching (e.g., using Regular Expressions)
  • Automatically derive the general sentiment without the need for human annotation or curation
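As a brief aside on the lookup-based option, the code cell below sketches how the VADER lexicon mentioned above could be queried for emoticon scores. It assumes the vader_lexicon resource has been downloaded via NLTK and that the listed emoticons are covered by the lexicon, so treat it as an illustration rather than part of this notebook's pipeline.

In [ ]:
# Sketch: looking up emoticon sentiment scores in the VADER lexicon (assumes the
# "vader_lexicon" resource is available and that these emoticons are covered by it)
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download("vader_lexicon", quiet=True)
sia = SentimentIntensityAnalyzer()

for emoticon in [":)", ":(", ":D"]:
    # The lexicon maps tokens to human-annotated valence scores between -4 and +4
    print(emoticon, sia.lexicon.get(emoticon, "not in lexicon"))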

To showcase the pattern-matching idea, we provide you with the implementation of an EmoticonNormalizer class; see the src/normalizer.py file. Let's first see how it works:

In [31]:
emoticon_normalizer = EmoticonNormalizer()

for emoticon in [":-)]]", ";o))))", ":-(((", "((:", ":o|"]:
    _, _, emoticon_normalized, sentiment_label = emoticon_normalizer.normalize(emoticon)
    print(f"{emoticon} ==> {emoticon_normalized} / {sentiment_label}")
:-)]] ==> :-) / [EMOTICON+]
;o)))) ==> ;o) / [EMOTICON+]
:-((( ==> :-( / [EMOTICON-]
((: ==> (: / [EMOTICON+]
:o| ==> :o| / [EMOTICON-]

Since this implementation is an example of a highly customized normalization step, we only give a high-level explanation of how this class works; the source code is not difficult to follow, so check it out if you are interested. The EmoticonNormalizer uses a Regular Expression to check if a string is an emoticon by trying to match the mouth, nose, eyes, and top against predefined sets of characters. For example, the eyes can be represented by any character in the set .:;8BX= (the nose and the top characters are optional). In fact, the normalizer uses two regular expressions to handle both orientations: mouth-nose-eyes-top and top-eyes-nose-mouth.

Once a string has been identified as an emoticon, the normalizer tries to automatically derive the general sentiment (positive, neutral, or negative). To this end, the normalizer makes the assumption that the sentiment is captured by the character for the mouth alone. In fact, the mouth character is often duplicated to make the sentiment or mood more explicit. For example, the emoticons ":o)" and ":)))" arguably express a positive sentiment since the parentheses for the mouth mimic a smile. The normalizer therefore checks whether the mouth characters belong to a predefined set of characters indicating a positive or negative sentiment, again taking both orientations into account.
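To make this more concrete, the code cell below gives a minimal sketch of this pattern-matching idea. It is not the actual implementation in src/normalizer.py: the character sets are simplified assumptions, only the top-eyes-nose-mouth orientation is handled, and no corner cases are covered.

In [ ]:
# Minimal sketch of regex-based emoticon detection (simplified character sets; one orientation only)
import re

TOP   = r"[>}{]?"      # optional hat/hair/furrowed brows
EYES  = r"[.:;8BX=]"   # eye characters
NOSE  = r"[-o^]?"      # optional nose
SMILE = r"[)\]D]+"     # mouth characters suggesting a positive sentiment
FROWN = r"[(\[]+"      # mouth characters suggesting a negative sentiment

def sketch_normalize_emoticon(token):
    # Orientation: top-eyes-nose-mouth, e.g., ">:o(" or ":-)))"
    if re.fullmatch(TOP + EYES + NOSE + SMILE, token):
        return "[EMOTICON+]"
    if re.fullmatch(TOP + EYES + NOSE + FROWN, token):
        return "[EMOTICON-]"
    return token

print(sketch_normalize_emoticon(";o))))"))   # expected: [EMOTICON+]
print(sketch_normalize_emoticon(">:-((("))   # expected: [EMOTICON-]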

While this is the basic approach of the EmoticonNormalizer class, its implementation contains some refinements to handle additional corner cases. But again, these details are beyond the scope here. Instead, let's use the class to normalize a tokenized example sentence.

In [32]:
tokens = ["The", "movie", "was", ":p", ",", "only", "the", "ending", "was", ":(((", ">:o|", "."]

print([ emoticon_normalizer.normalize(token)[-1] for token in tokens ])
['The', 'movie', 'was', '[EMOTICON+]', ',', 'only', 'the', 'ending', 'was', '[EMOTICON-]', '[EMOTICON-]', '.']

Discussion¶

As mentioned before, the EmoticonNormalizer relies on the observation that Western-style emoticons commonly feature a well-defined structure using well-defined sets of characters. These characteristics allow us to treat the identification of an emoticon and its general sentiment as a pattern-matching task. Japanese-style emoticons generally lack this simple structure, so a basic lookup-based approach is likely to perform much better for them. The EmoticonNormalizer is also not capable of properly normalizing all conceivable Western-style emoticons, but it should cover the most popular ones. In the end, whether it is worth putting such additional effort into handling emoticons (as well as emojis) depends on the exact task or application. For sentiment analysis, these efforts are arguably justified, as many people make ample use of emoticons and emojis to express their mood or feelings in online conversations.


Misspelled Words¶

Misspelled words are words that are not spelled correctly according to the standard rules of a particular language. These errors can arise from various reasons, including typographical mistakes, lack of familiarity with correct spelling, phonetic spelling, or even intentional alterations like slang. For example, writing "definately" instead of "definitely" is a common misspelling in English. Unfortunately, the automatic correction of misspelled words poses several challenges:

  • Ambiguity in intent: One of the primary difficulties in correcting misspelled words is determining the writer's intent. A misspelled word might resemble multiple valid words, and without context, it's challenging to discern which word was intended. For instance, "baet" could be a misspelling of "bait" or "beat," and the correct choice depends on the surrounding text.

  • Context dependence: Automatic correction tools often rely on context to decide the correct replacement for a misspelled word. For example, in the sentence "He reed the book", the word "reed" could be corrected to "read" (past tense) or kept as "reed" (a type of plant), depending on the sentence's broader meaning. Ensuring tools can understand and analyze context effectively is a complex task, especially in longer or more nuanced text.

  • Nonstandard language use: Variations in dialect, slang, and creative writing often intentionally deviate from standard spelling rules. For example, phrases like "gonna" (informal for "going to") or "luv" (informal for "love") may not technically be misspellings but are often flagged by automated systems. Correcting these words incorrectly could disrupt the intended tone or style of the text.

  • Typographical variations: Typos are another layer of complexity, as they often result in words that are not close to the intended word in spelling but might resemble other valid words. For example, "wokr" could be a typo for "work" or "woke" and distinguishing between these options requires advanced pattern recognition and linguistic understanding.

  • Dynamic vocabulary: Language evolves constantly, with new words, names, and acronyms being introduced regularly. Spell-checking systems must be updated to accommodate these changes, which is challenging and resource-intensive. Without regular updates, these systems may misidentify newly coined or culturally specific terms as misspelled words.

While algorithms and machine learning models have made significant strides in correcting misspellings, the task remains inherently complex due to the intricacies of human language, the need for contextual understanding, and the dynamic nature of vocabulary and language use. For this reason, spell checking is generally considered its own dedicated task that goes beyond "classic" text normalization. However, some spelling mistakes have straightforward and "systematic" causes that can be fixed in a rather simple manner as part of normalization. In the following, we look at two concrete examples.

Americanize Text¶

American English and British English differ due to historical, cultural, and linguistic developments that emerged after the colonization of North America by English settlers in the 17th century. When English speakers migrated to the New World, they carried with them the language and dialects of their time. Over the centuries, as the American colonies developed independently from Britain, their language evolved separately, influenced by diverse factors such as geography, politics, cultural exchange, and contact with other languages.

One major factor contributing to the differences is the influence of other languages on American English. The United States became a melting pot of cultures, incorporating vocabulary and linguistic features from Native American, Spanish, French, Dutch, and German languages, among others. Words like "raccoon", "barbecue", "prairie", and "cookie" reflect these influences. Meanwhile, British English continued to be shaped by interactions with European languages closer to home.

Noah Webster, a key figure in shaping American English, played a significant role in formalizing many of its distinctions. In the late 18th and early 19th centuries, Webster advocated for spelling reforms to simplify and differentiate American English from British English. His dictionary introduced spellings such as "color" instead of "colour" and "theater" instead of "theatre", aiming for a more phonetic and uniquely American style.

Another distinction between American and British English is the use of different words for the same concept; for example:

British English American English Description
Flat Apartment A place to live
Lift Elevator Used to move between floors
Lorry Truck A large vehicle for goods transport
Petrol Gas/Gasoline Fuel for cars
Boot Trunk The storage area of a car
Bonnet Hood The front cover of a car engine
Biscuit Cookie A baked sweet treat
Crisps Chips Thinly sliced fried potatoes
Chips Fries Fried potato strips
Holiday Vacation A break or time off work/school
Jumper Sweater A knitted upper garment
Torch Flashlight A portable light source
Queue Line A row of people waiting
Dustbin Trash can A container for garbage
Trousers Pants Lower body clothing

Application¶

While all these differences could be considered when americanizing an input text, to keep it simple we only focus on the differences in the spelling of certain words (e.g., "color" vs. "colour", "theater" vs. "theatre"). Like normalizing slang terms or emojis, we solve this task using a publicly available dictionary to look up the American spelling of a British word and replace it.

In [33]:
df_be2ae = pd.read_csv(british_to_american)

df_be2ae.head()
Out[33]:
BRITISH ENGLISH AMERICAN ENGLISH
0 africanisation africanization
1 africanise africanize
2 americanisation americanization
3 americanise americanize
4 arabise arabize

We convert the DataFrame to a Python dictionary first to make the lookup easier.

In [34]:
be2ae = df_be2ae.set_index("BRITISH ENGLISH")["AMERICAN ENGLISH"].to_dict()

print(be2ae["organise"])
organize

The auxiliary method americanize_word() helps us handle the case where an input token has no entry in the dictionary (i.e., it is not a British spelling), in which case we return the word unchanged.

In [35]:
def americanize_word(word, word_dict):
    # Return the American spelling if the word has an entry in the dictionary;
    # otherwise, return the word unchanged.
    try:
        return word_dict[word]
    except KeyError:
        return word

Let's normalize a tokenized text by applying americanize_word() to each token in the token list.

In [36]:
tokens = ["The", "programme", "was", "gruelling", "but", "useful", "."]

print([ americanize_word(token, be2ae) for token in tokens ])
['The', 'program', 'was', 'grueling', 'but', 'useful', '.']

Discussion¶

Automatically converting text from British spelling to American spelling is especially useful in scenarios where consistency and alignment with local preferences are important. For instance, in content localization, businesses or websites targeting an American audience may need to convert British spelling to American spelling to match the regional language norms. This is common for e-commerce platforms, product manuals, or promotional content, ensuring that terms like "colour" become "color" or "favour" becomes "favor", making the content more relatable and accessible to local users.

In the academic and research publishing domain, it's common for journals to require submissions in a specific style guide. For example, many American academic publications prefer American English, so authors may need to change spellings like "organise" to "organize" or "defence" to "defense" when submitting their work. This ensures the research aligns with the expectations of the journal and its readers, providing a more standardized approach to international academic writing.

NLP systems also benefit from converting spelling between regional varieties. Chatbots or AI systems designed for customer interaction may need to adjust to local norms to better understand and respond to users. For example, if the NLP model is trained on American English, it might struggle with words like "analyse" or "realise" from British English. By converting these to their American equivalents ("analyze" and "realize"), the system becomes more efficient and accurate in processing user input, improving communication.

Global journalism and multinational corporations are also key use cases for automatic spelling conversion. Media outlets might need to adapt their content depending on the regional focus, such as adjusting a news article's spelling for the U.S. edition. Similarly, multinational companies that operate across different regions may standardize their internal communications or marketing materials in American English, especially if the majority of their stakeholders or customers are based in the U.S. This helps maintain uniformity in corporate communication, ensuring it resonates well with the target audience.

Expressive Lengthenings¶

Expressive lengthenings in text refer to the deliberate extension of letters within a word to emphasize emotion, tone, or intensity. For example, writing "sooo cute" or "nooooo" stretches certain vowels or consonants to convey feelings such as excitement, hesitation, distress, or urgency. These lengthenings mimic spoken intonations, translating non-verbal cues like pitch, volume, and elongation of speech into written form. By doing so, they add an emotional or conversational quality that standard spelling might lack. In digital communication, expressive lengthenings often appear in informal contexts, such as social media posts, texts, or chats. They provide nuance, enabling the writer to convey sarcasm, enthusiasm, or exaggerated reactions without additional explanation. For instance, "yeees" communicates excitement and approval, while "whyyyy" suggests frustration or disbelief. These modifications break away from conventional grammar rules, reflecting the casual and creative nature of modern digital language. This stylistic choice is not merely playful but also functional, fostering a sense of intimacy and connection in text-based interactions. Expressive lengthenings help readers infer emotional undertones, enriching the overall meaning. As a result, they contribute to the evolving nature of written communication, blending textual precision with the dynamic, expressive qualities of spoken language.

Application¶

When trying to normalize expressive lengthenings, we have to consider the fact that many words do indeed contain repeated characters (e.g., "book", "beer", "bookkeeping"). There are also cases where simply removing duplicate characters might actually change the meaning of a word (e.g., "boot" vs. "bot", both being proper English words). We therefore have to take a slightly smarter approach. Firstly, we need to be able to check whether a modification of a word yields a string that is no longer a valid English word. For this, we can first load a dictionary of common English words.

In [37]:
df_vocab = pd.read_csv(english_vocabulary)

df_vocab.head()
Out[37]:
WORD
0 A
1 AA
2 AAA
3 AA's
4 AB

For a quick check whether a string is an English word (at least one covered by the dictionary), let's convert the DataFrame into a Python set.

In [38]:
vocab_ae = set(df_vocab["WORD"])

The dictionary we just loaded covers only "normal" words. However, expressive lengthenings may also be used for Internet slang terms (e.g., "looool"). Luckily, we already loaded a dictionary of slang terms, so we can just add them to our vocabulary of valid words/terms.

In [39]:
vocab_ae.update(set(slang_dict.keys()))

Since the number of additional characters may be arbitrary, we can use a Regular Expression to match substrings where the same character appears at least three times in a row, and then replace them with only two of those characters. As noted above, matching already at two repeated characters and/or reducing repetitions to a single character could turn valid words into invalid ones (e.g., "book" would become "bok"). The code cell below shows the Regular Expression applied to an example word.

In [40]:
print(re.sub(r"(\w)\1{2,}", r"\1\1", "cooool", flags=re.I))
cool

This example works just fine since "cool" does indeed feature two o's. However, if we try:

In [41]:
print(re.sub(r"(\w)\1{2,}", r"\1\1", "looool", flags=re.I))
lool

We won't get the arguably correct result of "lol". To address this issue, the method handle_expressive_lengthening() implements the normalization of expressive lengthenings as a two-step process:

  • Firstly, the method reduces all characters that are repeated at least three times in a row down to two characters. If this change indeed modified the input token, the method checks whether the new token is a valid word by looking it up in the vocabulary. If that is the case, this new token gets returned as the normalization result.

  • Secondly, if the new token is not a valid word, the method uses a second Regular Expression to reduce repeated characters down to only one character. Again, the method then checks whether this further modified token is a valid word and returns it if that is the case. If this step also fails, the method simply returns the initial input token without any changes.

In [42]:
def handle_expressive_lengthening(token, vocabulary):
    token_copy = token
    # No character repeated at least three times in a row ==> nothing to normalize
    if re.search(r"(\w)\1{2,}", token_copy, flags=re.I) is None:
        return token
    # Step 1: reduce all runs of 3+ identical characters down to two characters
    token_copy = re.sub(r"(\w)\1{2,}", r"\1\1", token_copy, flags=re.I)
    if token_copy.lower() in vocabulary:
        return token_copy
    # Step 2: reduce the remaining duplicated characters down to a single character
    token_copy = re.sub(r"(\w)\1", r"\1", token_copy, flags=re.I)
    if token_copy.lower() in vocabulary:
        return token_copy
    else:
        return token

Let's check out some examples to see how the method handle_expressive_lengthening() behaves.

In [43]:
print(handle_expressive_lengthening("looool", vocab_ae))
print(handle_expressive_lengthening("cooooool", vocab_ae))
print(handle_expressive_lengthening("daaaaamn", vocab_ae))
print(handle_expressive_lengthening("niiiiceee", vocab_ae))
print(handle_expressive_lengthening("heeeckkkk", vocab_ae))
lol
cool
damn
nice
heck

And of course, we can also apply the method to an example sentence that has already been tokenized.

In [44]:
tokens = ["The", "movie", "was", "amaaaazing", "looool", "."]

print([ handle_expressive_lengthening(token, vocab_ae) for token in tokens ])
['The', 'movie', 'was', 'amazing', 'lol', '.']

Discussion¶

The implementation of the method handle_expressive_lengthening() makes several simplifying assumptions and is therefore far from perfect. Most obviously, it relies on the vocabulary being complete to reliably check whether a modified token is a valid word. For example, our current vocabulary does not contain named entities such as "Microsoft". So any mention of this word with some expressive lengthening will not be normalized correctly:

In [45]:
print(handle_expressive_lengthening("microsooooft", vocab_ae))
microsooooft

A more subtle (and arguably less likely) case of normalization errors occurs if a word contains multiple expressive lengthenings for different characters, but those characters need to be reduced to different lengths (either one or two). The code cell below shows three examples where only the first one returns the expected result:

In [46]:
print(handle_expressive_lengthening("loooollll", vocab_ae)) # Works because both "o" and "l" need to get reduced to one character each
print(handle_expressive_lengthening("coooollll", vocab_ae)) # Does arguably not work since "cooll" is not a proper word so further reduction
print(handle_expressive_lengthening("beeerrrr", vocab_ae)) # Does not work since neither "beerr" nor "ber" are proper words
lol
col
beeerrrr

In the case of "coooollll" the algorithm first reduces the expressive lengthenings to "cooll". Since this is not a proper word, it further reduced it the "col". While this is a proper word, it is arguably not the correct one (which is most likely "cool"). In the case, of "beeerrrr", both "beerr" after the first reduction step and "ber" after the second step are not valid words found in the vocabulary. There, the method falls back to returning the input token unchanged. In principle, many of such cases can be covered by improving the method handle_expressive_lengthening(). However, this would quickly increase its complexity, and in practice it is often not worth the effort. This includes the consideration that the more complex the method becomes, the more likely it will behave unexpectedly in other situations, including cases that work right now with the simple implementation.


Summary¶

Text normalization is a vital preprocessing step in natural language processing (NLP) that involves transforming raw text into a standardized and consistent format. This process ensures compatibility and enhances the performance of NLP systems by addressing the inconsistencies, variations, and noise present in raw text. Such issues might include misspellings, abbreviations, slang, or irregular punctuation, all of which can hinder the accuracy and reliability of computational models.

The process typically begins with transforming text to lowercase to standardize its appearance, particularly in languages where case does not alter meaning. Another common step is removing unnecessary punctuation and special characters, which often do not contribute significantly to the semantic understanding of the text. Additionally, tokenization is employed to split the text into smaller units, such as words or sentences, which serve as the foundation for subsequent processing. Normalization often includes correcting misspellings or expanding contractions and abbreviations, such as converting "u" to "you" or "can't" to "cannot". In some cases, words are reduced to their root forms through stemming or lemmatization, and uninformative words, like "and" or "the", may be removed if they do not add value to the task.

Text normalization poses several challenges. Languages differ in their grammatical rules, syntax, and cultural nuances, making it essential to tailor normalization strategies for each language. Moreover, preserving context and meaning during normalization is difficult; for instance, indiscriminate removal of words or symbols might strip the text of valuable information, particularly in tasks like sentiment analysis. Social media and informal text add another layer of complexity, as they often include unconventional spellings, slang, and emojis. Named entities, such as names of people or brands, must often be retained in their original form, further complicating the process.

The exact approach to text normalization is highly dependent on the NLP task or application. For sentiment analysis, it is crucial to preserve elements like emojis, contractions, and slang, as these carry emotional nuance. In contrast, tasks like machine translation benefit from stricter uniformity in text, such as consistent casing and corrected spelling, to ensure accurate translations. Similarly, search engines rely on stemming or lemmatization to match user queries with indexed content effectively. Task-specific requirements emphasize the importance of tailoring normalization strategies. Over-normalizing can degrade performance in specialized applications; for instance, stemming might improve search retrieval but could impair readability in tasks like text summarization. A balanced and adaptive approach is essential to align text normalization with the specific goals of the NLP task, ensuring both computational efficiency and semantic fidelity. Ultimately, text normalization serves as a foundational step, bridging the gap between raw text data and sophisticated NLP analysis.
