Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.

Stemming & Lemmatization¶

Consider the following two sentences:

  • "Dogs make the best friends."
  • "A dog makes a good friend."

Semantically, both sentences convey essentially the same message, but syntactically they are very different since the vocabulary differs: "dogs" vs. "dog", "make" vs. "makes", "friends" vs. "friend". This is a big problem when comparing documents or when searching for documents in a database. For example, when one uses "dog" as a search term, both sentences should be returned and not just the second one.

Stemming and lemmatization are two common techniques used in natural language processing (NLP) for text normalization. Both methods aim to reduce words to their base or root forms, but they differ in their approaches and outcomes.

Stemming: Stemming is a process of reducing words to their "stems" by removing prefixes and suffixes, typically through simple heuristic rules. The resulting stems may not always be actual words. The goal of stemming is to normalize words that have the same base meaning but may have different inflections or variations. For example, stemming the words "running" and "runs" would result in the common stem "run." A popular stemming algorithm is the Porter stemming algorithm.

Lemmatization: Lemmatization, on the other hand, is a more advanced technique that aims to transform words to their "lemmas," which are the base or dictionary forms of words. Lemmatization takes into account the morphological analysis of words and considers factors such as part-of-speech (POS) tags to determine the correct lemma. The output of lemmatization is usually a real word that exists in the language. For example, lemmatizing the words "running," "runs," and "ran" would yield the lemma "run." Lemmatization requires more linguistic knowledge and often relies on dictionaries or language-specific resources.
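
As a quick, hedged preview of this difference (using NLTK classes that also appear later in this notebook; NLTK and its WordNet data must be installed for this sketch to run), the code below stems and lemmatizes a few word forms of "run":

In [ ]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

for word in ["running", "runs", "ran"]:
    # The stemmer strips suffixes by rule; the lemmatizer looks words up in WordNet
    # (here we treat every word as a verb for illustration)
    print(f"{word}: stem = {stemmer.stem(word)}, lemma (verb) = {lemmatizer.lemmatize(word, pos='v')}")

In particular, the irregular past tense "ran" is typically only normalized by the lemmatizer, which illustrates the additional linguistic knowledge it relies on.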

The choice between stemming and lemmatization depends on the specific NLP task and its requirements. Stemming is a simpler and faster technique, often used when the exact word form is not critical, such as in information retrieval or indexing tasks. Lemmatization, being more linguistically sophisticated, is preferred in tasks where the base form and the semantic meaning of words are important, such as in machine translation, sentiment analysis, or question-answering systems.

It's important to note that stemming and lemmatization may not always produce the same results, and the choice between them should consider the trade-offs between accuracy and computational complexity.

Both stemming and lemmatization are methods for normalizing documents on a syntactic level. Often the same word is used in different forms depending on its grammatical role in a sentence.

Setting up the Notebook¶

Make Required Imports¶

This notebook requires the import of different Python packages as well as additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been successfully imported.

In [1]:
from src.utils.libimports.stemlem import *
from src.utils.data.files import *

Lemmatization requires information about whether a word is a noun, verb, or adjective. We therefore need a Part-of-Speech tagger to extract this information. The code cell below downloads averaged_perceptron_tagger_eng, a Part-of-Speech tagger of NLTK (in case it is not already available in the current NLTK installation).

In [2]:
nltk.download('averaged_perceptron_tagger_eng')
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/vdw/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
Out[2]:
True

Download Required Data¶

Some code examples in this notebook use data that first needs to be downloaded by running the code cell below. If this code cell throws any error, please check in the configuration file config.yaml whether the URL for downloading datasets is up to date and matches the one on Github. If not, simply download or pull the latest version from Github.

In [3]:
lemmatizer_lexicon, _ = download_dataset("text/lexicons/lemmatization/lemmatizer-lexicon.dat")
File 'data/datasets/text/lexicons/lemmatization/lemmatizer-lexicon.dat' already exists (use 'overwrite=True' to overwrite it).

Stemming¶

Stemming is a process in natural language processing (NLP) that reduces words to their base or root forms, called stems. Stemming algorithms apply heuristic rules to remove prefixes and suffixes from words, aiming to normalize variations of words that share a common root. There are several popular stemming algorithms, each with its own approach and characteristics. The main differences between different stemmers include:

  • Porter Stemmer: The Porter stemming algorithm, developed by Martin Porter, is one of the most widely used stemmers. It applies a series of rules and transformations to remove common English word endings, focusing on the structure of the word rather than its linguistic meaning. The Porter stemmer is known for its simplicity and speed but may produce stems that are not actual words.

  • Snowball Stemmer: The Snowball stemmer, also known as the Porter2 stemmer, is an extension of the Porter stemmer. It provides stemmers for multiple languages, including English, German, Spanish, French, and more (a short sketch of instantiating a non-English stemmer follows after this list). The Snowball stemmer is an improvement over the original Porter stemmer, addressing some of its limitations and offering better performance and accuracy for different languages.

  • Lancaster Stemmer: The Lancaster stemming algorithm, developed by Chris D. Paice, is an aggressive stemming algorithm that focuses on removing prefixes and suffixes from words. It applies a set of rules that are more aggressive than those used in the Porter stemmer, often resulting in shorter stems. The Lancaster stemmer is known for its aggressive stemming behavior and can produce stems that are not recognizable as actual words.

  • Lovins Stemmer: The Lovins stemmer, developed by J. H. Lovins, is an early stemming algorithm that uses a set of rules based on linguistic principles to remove common word endings. It aims to produce stems that are linguistically meaningful and recognizable as real words. The Lovins stemmer is not as widely used as the Porter or Lancaster stemmers but can be useful in certain contexts.

The choice of stemmer depends on the specific NLP task, the language being processed, and the trade-offs between simplicity, speed, accuracy, and the desired level of stemming aggressiveness. It's important to evaluate and compare the performance of different stemmers for a particular application to determine the most suitable one.
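
As mentioned in the Snowball bullet above, NLTK's SnowballStemmer supports several languages. The code cell below is a small optional sketch (not executed as part of this notebook): it lists the supported languages and instantiates a German stemmer. The example words are chosen freely for illustration, and SnowballStemmer is assumed to be available via the wildcard imports at the top of this notebook.

In [ ]:
# Optional sketch: NLTK's SnowballStemmer takes a language name at construction time
print(SnowballStemmer.languages)  # tuple of all supported languages

german_stemmer = SnowballStemmer('german')
for word in ['Häuser', 'laufen', 'gelaufen']:
    # The exact stems depend on the German Snowball rules
    print(word, '->', german_stemmer.stem(word))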

Define Set of Stemmers¶

We first define a few stemmers provided by NLTK. For more stemmers, see http://www.nltk.org/api/nltk.stem.html

In [4]:
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
snowball_stemmer = SnowballStemmer('english')

# Put all stemmers into a list to make their use easier
stemmer_list = [porter_stemmer, snowball_stemmer, lancaster_stemmer]

Define List of Example Words¶

To illustrate the effects of stemming, let's consider a list of individual words instead of a complete text document. This makes it easier to point out the differences between the stemmers. The choice of words below covers relevant cases, including:

  • Plural form of nouns
  • Different verb tenses
  • Irregular verbs (e.g., verbs with an irregular form in the past tense)
  • Irregular adjectives (e.g., adjectives with irregular forms in the comparative and superlative)
In [5]:
word_list = ['only', 'accepted', 'studying','study','studied', 'dogs', 'cats', 
             'running', 'phones', 'viewed', 'presumably', 'crying', 'went', 
             'packed', 'worse', 'best', 'mice', 'friends', 'makes']

Perform Stemming¶

We can now stem each word using all three of our defined stemmers and print the output in a way that makes the differences easy to spot.

In [6]:
for word in word_list:
    print (f"{word}:")
    for stemmer in stemmer_list:
        stemmed_word = stemmer.stem(word)
        print (f"\t{stemmed_word} ({type(stemmer).__name__})")
only:
	onli (PorterStemmer)
	onli (SnowballStemmer)
	on (LancasterStemmer)
accepted:
	accept (PorterStemmer)
	accept (SnowballStemmer)
	acceiv (LancasterStemmer)
studying:
	studi (PorterStemmer)
	studi (SnowballStemmer)
	study (LancasterStemmer)
study:
	studi (PorterStemmer)
	studi (SnowballStemmer)
	study (LancasterStemmer)
studied:
	studi (PorterStemmer)
	studi (SnowballStemmer)
	study (LancasterStemmer)
dogs:
	dog (PorterStemmer)
	dog (SnowballStemmer)
	dog (LancasterStemmer)
cats:
	cat (PorterStemmer)
	cat (SnowballStemmer)
	cat (LancasterStemmer)
running:
	run (PorterStemmer)
	run (SnowballStemmer)
	run (LancasterStemmer)
phones:
	phone (PorterStemmer)
	phone (SnowballStemmer)
	phon (LancasterStemmer)
viewed:
	view (PorterStemmer)
	view (SnowballStemmer)
	view (LancasterStemmer)
presumably:
	presum (PorterStemmer)
	presum (SnowballStemmer)
	presum (LancasterStemmer)
crying:
	cri (PorterStemmer)
	cri (SnowballStemmer)
	cry (LancasterStemmer)
went:
	went (PorterStemmer)
	went (SnowballStemmer)
	went (LancasterStemmer)
packed:
	pack (PorterStemmer)
	pack (SnowballStemmer)
	pack (LancasterStemmer)
worse:
	wors (PorterStemmer)
	wors (SnowballStemmer)
	wors (LancasterStemmer)
best:
	best (PorterStemmer)
	best (SnowballStemmer)
	best (LancasterStemmer)
mice:
	mice (PorterStemmer)
	mice (SnowballStemmer)
	mic (LancasterStemmer)
friends:
	friend (PorterStemmer)
	friend (SnowballStemmer)
	friend (LancasterStemmer)
makes:
	make (PorterStemmer)
	make (SnowballStemmer)
	mak (LancasterStemmer)

Different stemmers will generally yield different outputs depending on their underlying rules, although for our example words only the LancasterStemmer deviates from the other two. Such differences do not automatically make one stemmer better or worse than another.
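
To connect this back to the two sentences from the introduction, the code cell below is a small sketch that stems every token of both sentences. It uses word_tokenize, which we assume is available via the imports at the top of this notebook (it requires NLTK's punkt tokenizer data). After stemming, both sentences contain the stem "dog", so a search for the stemmed query "dog" would match both documents.

In [ ]:
query_stem = porter_stemmer.stem("dog")

for text in ["Dogs make the best friends.", "A dog makes a good friend."]:
    # Tokenize the sentence and stem every token
    stems = [porter_stemmer.stem(token) for token in word_tokenize(text)]
    print(stems)
    print("Matches query stem 'dog':", query_stem in stems)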


Lemmatization¶

A lemmatizer is a tool or algorithm that transforms words into their base or dictionary forms, known as lemmas. Unlike stemming, which simplifies words by removing prefixes and suffixes without considering linguistic context, lemmatization takes into account the morphological analysis of words, part-of-speech (POS) tags, and language-specific rules to produce meaningful and valid lemmas.

Here is a brief summary of how a lemmatizer for NLP typically works:

  • Tokenization: The text is divided into individual words or tokens using tokenization techniques. This is typically a separate step performed before lemmatization; the lemmatizer assumes already tokenized text as input.

  • POS tagging: Each word is assigned a part-of-speech tag, such as noun, verb, adjective, etc. POS tagging helps determine the appropriate lemma based on the word's grammatical role.

  • Lemmatization rules: The lemmatizer applies language-specific rules and patterns to convert words to their lemmas. These rules consider factors like a word's POS tag, its inflections, and other linguistic properties. For example, for English verbs, the lemmatizer would handle verb conjugations to identify the base form.

  • Lookup in dictionary or lexicon: The lemmatizer may consult a dictionary or lexicon that contains information about word forms and their corresponding lemmas. This can be helpful for irregular words that don't follow regular morphological rules.

  • Lemmatization output: The lemmatizer generates the lemma for each word, which represents the base or canonical form of the word. The resulting lemmas are typically real words that exist in the language and are recognized by native speakers.

  • Post-processing: In some cases, additional post-processing steps may be applied to refine or improve the lemmatization results. These steps could include handling special cases, resolving ambiguities, or dealing with out-of-vocabulary terms.

(Note: Both tokenization and POS tagging are topics in their own right, and their detailed discussion is beyond the scope of this notebook. Therefore, in the following we either use toy examples that are already tokenized and POS tagged, or we use existing libraries and methods to perform these two tasks.)

Lemmatization requires linguistic knowledge, language-specific resources (such as dictionaries or lexicons), and morphological analysis to accurately identify and generate the appropriate lemmas. It is a more sophisticated technique compared to stemming and is generally preferred when preserving the semantic meaning and grammatical correctness of words is crucial in NLP tasks like machine translation, information retrieval, or sentiment analysis.

Building a Simple Lemmatizer¶

As mentioned above, most lemmatizers rely on dictionaries and lexicons to look up the lemma of a word given its POS tag. This means that the main effort of building a basic lemmatizer is often creating and curating such dictionaries and lexicons. For resource-rich languages such as English this can be rather straightforward. A resource-rich language is one that has extensive computational and linguistic resources readily available to support natural language processing tasks. These resources include large datasets, annotated corpora, dictionaries, and language models, as well as software tools and frameworks tailored for the language.

The provided file lemmatizer-lexicon.dat has been generated by combining multiple annotated datasets containing English text. While the purposes of these datasets may have differed, they all have in common that each word is annotated with both its POS tag and its lemma. The content of the file lemmatizer-lexicon.dat looks as follows:

...
drive###VERB::drive;;NOUN::drive
drove###VERB::drive
driving###VERB::drive;;ADJ::driving
driven###VERB::drive
drives###VERB::drive;;NOUN::drive
...

Each line in the file starts with an English word, followed by a list of (POS tag, lemma) pairs for that word; different separators ("###", ";;", "::") are used to uniquely identify the words and pairs.

To create a simple lookup from words to lemmas, we can read the file line by line and add the information to a Python dictionary; let's call it word_to_lemma. The keys of the dictionary will be all English words listed in the data file. The values are again dictionaries: the keys of these word-level dictionaries are POS tags, and the values are the actual lemmas. The code cell below generates the dictionary word_to_lemma.

In [7]:
word_to_lemma = {}

with open(lemmatizer_lexicon) as file:
    # Line format: driving###VERB::drive;;ADJ::driving
    for line in file:
        line = line.strip()
        word, lemmas = line.split("###")
        
        if word not in word_to_lemma:
            word_to_lemma[word] = {}
        
        for pos_lemma_pair in lemmas.split(";;"):
            pos, lemma = pos_lemma_pair.split("::")
            word_to_lemma[word][pos] = lemma

To have a look inside the dictionary, we can create a temporary copy that contains only a subset of the words so we can print it in full. The code cell below creates such a simplified dictionary by considering only the words contained in the sentence "she's driving faster than allowed.". Notice that we assume that tokenization has already been performed, and that most tokenizers will split "'s" into a separate token.

In [8]:
word_to_lemma_tmp = {key: word_to_lemma[key] for key in ["she", "'s", "driving", "faster", "than", "allowed", "."]}

print(json.dumps(word_to_lemma_tmp, indent=2))
{
  "she": {
    "PRON": "she",
    "NOUN": "she"
  },
  "'s": {
    "PART": "'s",
    "AUX": "be",
    "VERB": "be",
    "PRON": "we"
  },
  "driving": {
    "VERB": "drive",
    "ADJ": "driving"
  },
  "faster": {
    "ADJ": "fast"
  },
  "than": {
    "SCONJ": "than",
    "NOUN": "than",
    "ADP": "than",
    "ADV": "than"
  },
  "allowed": {
    "VERB": "allow"
  },
  ".": {
    "PUNCT": "."
  }
}

With this dictionary, we can now find all available lemmas for a given word and their respective POS tags; try out the following example, and feel free to check out other words.

In [9]:
word = "running"

print(f"The lemmata for {word} in the dictionary are: {word_to_lemma[word]}")
The lemmata for running in the dictionary are: {'VERB': 'run', 'NOUN': 'run', 'ADJ': 'running'}

Of course, for lemmatizing a word, both the word and its POS tag are the input for the lemmatizer. For this, we use the nested nature of our dictionary: first the word serves as the key for the main dictionary, and then the POS tag serves as the key for the word-level dictionary:

In [10]:
word = "running"
ptag = "VERB"

print(f"The lemma for {word} with the POS tag {ptag} is: {word_to_lemma[word][ptag]}")
The lemma for running with the POS tag VERB is: run

Of course, you can probably already spot a problem. If our dictionary does not contain a word, or the word-level dictionary does not contain the given POS tag, trying to get the lemma of a word will fail and raise an error. In practice, this is a common case since such dictionaries are never guaranteed to be complete. For one, language is constantly evolving, with new words being added. But more commonly, such dictionaries generally do not cover named entities, that is, the names of persons, organizations, companies, locations, and so on. Such names are not expected to be lemmatized anyway. The same holds for non-word tokens such as numbers, abbreviations, punctuation marks, URLs, etc.

However, our handling of the case where no match is found in the lemma dictionary for a given word and POS tag is very pragmatic: we simply return the word unchanged. The method lemmatize() below implements this basic idea by wrapping the access to the lemma dictionary in a try ... except ... block. Thus, if accessing the lemma dictionary fails for a given word and POS tag, a KeyError is raised, and the except block ensures that the input word is returned unchanged.

In [11]:
def lemmatize(word, pos):
    try:
        # Look up the lemma for the lowercased word and the given POS tag
        return word_to_lemma[word.lower()][pos]
    except KeyError:
        # Unknown word or POS tag: return the input word unchanged
        return word
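
As a side note, the same fallback behavior can be expressed without exceptions by chaining dict.get() calls; the sketch below is purely a stylistic alternative and behaves the same way for our dictionary lookup (the name lemmatize_get is our own choice).

In [ ]:
# Alternative sketch: same lookup-with-fallback behavior, using dict.get() with defaults
def lemmatize_get(word, pos):
    return word_to_lemma.get(word.lower(), {}).get(pos, word)

print(lemmatize_get("running", "VERB"))     # known word and POS tag
print(lemmatize_get("runnnnning", "VERB"))  # unknown word: returned unchanged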

Now we can use our dictionary to safely lemmatize any combination of a word and POS tag (even for made-up words and tags). Try it out!

In [12]:
#word = "runnnnning"
word = "running"
ptag = "VERB"

print(f"The lemma for {word} with the POS tag {ptag} is: {lemmatize(word, ptag)}")
The lemma for running with the POS tag VERB is: run

This means we are ready to lemmatize complete text documents. However, recall that we assume that tokenization and POS tagging have already been performed. The code cell below manually defines a sentence as a list of (token/word, POS tag) pairs. Such a list could be the output of a library for tokenization and POS tagging.

In [13]:
# Sentence after Tokenization and POS Tagging
sentence = [
    ("She", "PRON"), 
    ("'s", "AUX"), 
    ("driving", "VERB"), 
    ("faster", "ADJ"), 
    ("than", "SCONJ"), 
    ("allowed", "VERB"),
    (".", "PUNCT")
]

Each (token/word, POS tag) pair contains the required information for running lemmatize() on all items in the list. So let's lemmatize each word/token of our example sentence and look at the results. From our look into word_to_lemma_tmp, we already know that each word/token is an existing key in our lemma dictionary, including the punctuation mark ".". This is not surprising, since all the words in our example sentence are very common and therefore very likely to appear in existing datasets.

In [14]:
lemmas = [lemmatize(word, ptag) for word, ptag in sentence]

print(f"All lemmas for the input sentence: {lemmas}")
All lemmas for the input sentence: ['she', 'be', 'drive', 'fast', 'than', 'allow', '.']

In short, assuming the availability of a suitable dictionary or lexicon, implementing a basic lemmatizer is reasonably straightforward. Of course, more sophisticated lemmatizers go beyond simple dictionary lookups. For example, a lemmatizer may analyze the morphology of words, i.e., their structure in terms of prefixes and suffixes. Some modern lemmatizers use machine learning models trained on large corpora to predict lemmas. These models can account for patterns that might not be explicitly codified and can adapt to new words or usages that may not yet exist in dictionaries. Advanced lemmatizers, especially those based on neural networks, analyze the entire sentence to determine the most appropriate lemma for a word, which is crucial for handling polysemous words (words with multiple meanings). Beyond standard dictionary entries, lemmatizers may also handle non-standard words such as contractions, abbreviations, slang, and misspellings by mapping them to their canonical forms. By combining these techniques, a lemmatizer provides a more nuanced and accurate output than a simple dictionary lookup, especially in complex or ambiguous linguistic contexts.
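
To make the point about context concrete, the small sketch below lemmatizes the word "meeting" in two different sentences. It assumes the nlp object (a loaded spaCy English model) provided by the imports at the top of this notebook and used again in the spaCy section further below; the exact lemmas depend on the loaded model, but the POS tagger will typically assign different tags and therefore different lemmas.

In [ ]:
# Sketch only: the same surface form may receive different lemmas depending on its context
for text in ["We are meeting her tomorrow.", "The meeting was long."]:
    doc = nlp(text)
    for token in doc:
        if token.text.lower() == "meeting":
            print(f"{text!r}: meeting ==[{token.pos_}]==> {token.lemma_}")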

Lemmatization with NLTK¶

Define Lemmatizer Using NLTK¶

The WordNetLemmatizer is a lemmatization tool provided by the Natural Language Toolkit (NLTK), which is a popular library for NLP in Python. NLTK is widely used for various NLP tasks, including lemmatization, and the WordNetLemmatizer is one of the lemmatization options it offers. It is specifically designed to lemmatize English words based on WordNet, a lexical database for English. WordNet organizes words into synsets (sets of synonyms), and each synset is linked to various lemmas representing different word forms. The WordNetLemmatizer in NLTK utilizes WordNet's information and applies lemmatization rules to transform words to their lemmas. It takes into account the part-of-speech (POS) tag of each word and provides options for lemmatizing nouns, verbs, adjectives, and adverbs.

In [15]:
wordnet_lemmatizer = WordNetLemmatizer()

Perform Lemmatization w.r.t. all Word Types¶

The WordNetLemmatizer distinguishes between nouns, verbs, adjectives, and adverbs. This Part-of-Speech information must be provided as input. The four choices of input parameters are n (noun), v (verb), a (adjective), and r (adverb). In the code cell below, we lemmatize each of our example words using these four different word types and inspect the output.

In [16]:
pos_list = ['n', 'v', 'a', 'r']

for word in word_list:
    print (word + ':')
    for pos in pos_list:
        lemmatized_word = wordnet_lemmatizer.lemmatize(word, pos=pos) # default is 'n'
        print ('\t', word, '=[{}]=>'.format(pos), lemmatized_word)
only:
	 only =[n]=> only
	 only =[v]=> only
	 only =[a]=> only
	 only =[r]=> only
accepted:
	 accepted =[n]=> accepted
	 accepted =[v]=> accept
	 accepted =[a]=> accepted
	 accepted =[r]=> accepted
studying:
	 studying =[n]=> studying
	 studying =[v]=> study
	 studying =[a]=> studying
	 studying =[r]=> studying
study:
	 study =[n]=> study
	 study =[v]=> study
	 study =[a]=> study
	 study =[r]=> study
studied:
	 studied =[n]=> studied
	 studied =[v]=> study
	 studied =[a]=> studied
	 studied =[r]=> studied
dogs:
	 dogs =[n]=> dog
	 dogs =[v]=> dog
	 dogs =[a]=> dogs
	 dogs =[r]=> dogs
cats:
	 cats =[n]=> cat
	 cats =[v]=> cat
	 cats =[a]=> cats
	 cats =[r]=> cats
running:
	 running =[n]=> running
	 running =[v]=> run
	 running =[a]=> running
	 running =[r]=> running
phones:
	 phones =[n]=> phone
	 phones =[v]=> phone
	 phones =[a]=> phones
	 phones =[r]=> phones
viewed:
	 viewed =[n]=> viewed
	 viewed =[v]=> view
	 viewed =[a]=> viewed
	 viewed =[r]=> viewed
presumably:
	 presumably =[n]=> presumably
	 presumably =[v]=> presumably
	 presumably =[a]=> presumably
	 presumably =[r]=> presumably
crying:
	 crying =[n]=> cry
	 crying =[v]=> cry
	 crying =[a]=> crying
	 crying =[r]=> crying
went:
	 went =[n]=> went
	 went =[v]=> go
	 went =[a]=> went
	 went =[r]=> went
packed:
	 packed =[n]=> packed
	 packed =[v]=> pack
	 packed =[a]=> packed
	 packed =[r]=> packed
worse:
	 worse =[n]=> worse
	 worse =[v]=> worse
	 worse =[a]=> bad
	 worse =[r]=> worse
best:
	 best =[n]=> best
	 best =[v]=> best
	 best =[a]=> best
	 best =[r]=> best
mice:
	 mice =[n]=> mouse
	 mice =[v]=> mice
	 mice =[a]=> mice
	 mice =[r]=> mice
friends:
	 friends =[n]=> friend
	 friends =[v]=> friends
	 friends =[a]=> friends
	 friends =[r]=> friends
makes:
	 makes =[n]=> make
	 makes =[v]=> make
	 makes =[a]=> makes
	 makes =[r]=> makes

Lemmatization in Practice¶

Usually, we only want to lemmatize each word in a document using its correct word type (i.e., Part-of-Speech). This means that we first need to apply a Part-of-Speech (POS) tagger that tells us the type for each word in a sentence; see the dedicated notebook about POS tagging. In the code cell below, we simply use a POS tagger provided by NLTK.

In [17]:
sentence = "The newest study has shown that cats have mostly a better sense of smell than dogs."

# First, tokenize sentence
token_list = word_tokenize(sentence)

# Second, calculate POS tags for each token
pos_tag_list = pos_tag(token_list)

for pos in pos_tag_list:
    print(pos)
('The', 'DT')
('newest', 'JJS')
('study', 'NN')
('has', 'VBZ')
('shown', 'VBN')
('that', 'IN')
('cats', 'NNS')
('have', 'VBP')
('mostly', 'RB')
('a', 'DT')
('better', 'JJR')
('sense', 'NN')
('of', 'IN')
('smell', 'NN')
('than', 'IN')
('dogs', 'NNS')
('.', '.')

The POS tagger distinguishes several dozen word types. However, we are only interested in whether a word is a noun, verb, adjective, or adverb. We therefore need to map the output of the POS tagger to the 4 valid options "n", "v", "a", and "r"; see above. This is relatively easy to do since we only have to look at the first character of the resulting POS tags: all tags for nouns start with an "N", all tags for verbs start with a "V", all tags for adjectives start with a "J", and all tags for adverbs start with an "R".

In [18]:
print ("\nOutput of NLTK lemmatizer:\n")
for token, tag in pos_tag_list:
    word_type = 'n' # Default if all fails
    tag_simple = tag[0].lower() # Converts, e.g., "VBD" to "v"
    if tag_simple in ['n', 'v', 'r']:
        # If the POS tag starts with "n","v", or "r", we know it's a noun, verb, or adverb
        word_type = tag_simple 
    elif tag_simple in ['j']:
        # If the POS tag starts with a "j", we know it's an adjective
        word_type = 'a' 
    lemmatized_token = wordnet_lemmatizer.lemmatize(token.lower(), pos=word_type)
    print(f"{token} ==[{tag}]==[{word_type}]==> {lemmatized_token}")
Output of NLTK lemmatizer:

The ==[DT]==[n]==> the
newest ==[JJS]==[a]==> new
study ==[NN]==[n]==> study
has ==[VBZ]==[v]==> have
shown ==[VBN]==[v]==> show
that ==[IN]==[n]==> that
cats ==[NNS]==[n]==> cat
have ==[VBP]==[v]==> have
mostly ==[RB]==[r]==> mostly
a ==[DT]==[n]==> a
better ==[JJR]==[a]==> good
sense ==[NN]==[n]==> sense
of ==[IN]==[n]==> of
smell ==[NN]==[n]==> smell
than ==[IN]==[n]==> than
dogs ==[NNS]==[n]==> dog
. ==[.]==[n]==> .
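
For convenience, the steps above (tokenization, POS tagging, tag mapping, and lemmatization) can be bundled into a single helper function. The sketch below shows one way to do this; the name lemmatize_sentence is our own choice and not part of NLTK.

In [ ]:
def lemmatize_sentence(sentence):
    # Tokenize, POS-tag, and lemmatize a sentence with NLTK (helper sketch)
    lemmas = []
    for token, tag in pos_tag(word_tokenize(sentence)):
        # Map the Penn Treebank tag to one of the 4 WordNet word types ('n', 'v', 'a', 'r')
        tag_simple = tag[0].lower()
        if tag_simple in ['n', 'v', 'r']:
            word_type = tag_simple
        elif tag_simple == 'j':
            word_type = 'a'
        else:
            word_type = 'n'  # Fall back to noun for all other tags
        lemmas.append(wordnet_lemmatizer.lemmatize(token.lower(), pos=word_type))
    return lemmas

print(lemmatize_sentence(sentence))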

Lemmatization with spaCy¶

spaCy performs lemmatization by default when processing a document; no additional commands are required. This makes it much more convenient to use than NLTK.

In [19]:
print ("\nOutput of spaCy lemmatizer:\n")
doc = nlp(sentence) # doc is an object, not just a simple list

for token in doc:
    print(f"{token.text} ==[{token.pos_}]==> {token.lemma_}")
Output of spaCy lemmatizer:

The ==[DET]==> the
newest ==[ADJ]==> new
study ==[NOUN]==> study
has ==[AUX]==> have
shown ==[VERB]==> show
that ==[SCONJ]==> that
cats ==[NOUN]==> cat
have ==[VERB]==> have
mostly ==[ADV]==> mostly
a ==[DET]==> a
better ==[ADJ]==> well
sense ==[NOUN]==> sense
of ==[ADP]==> of
smell ==[NOUN]==> smell
than ==[ADP]==> than
dogs ==[NOUN]==> dog
. ==[PUNCT]==> .

Compare the results from NLTK and spaCy. While most words are lemmatized the same way, a noticeable difference is the word "better". Arguably, NLTK does a better job here, as "good" seems to be the more appropriate lemmatized form in this sentence.


Summary¶

Stemming and lemmatization are essential techniques in natural language processing (NLP) that help normalize and reduce words to their base forms. Here is a brief summary of their uses and importance:

  • Stemming:
    • Uses: Stemming is primarily employed in tasks where the exact word form is not crucial, such as information retrieval, indexing, and search engines.
    • Importance: Stemming allows for the reduction of words to their common base form, which helps in matching variations of words, handling inflections, and improving recall in search queries. It reduces the vocabulary size and can enhance computational efficiency.

  • Lemmatization:
    • Uses: Lemmatization is useful in NLP tasks where preserving the semantic meaning and grammatical correctness of words is important, such as machine translation, sentiment analysis, question-answering systems, and language generation.
    • Importance: Lemmatization provides the base or canonical form of words, capturing their underlying meaning. It helps in resolving word variants, handling different inflections, and maintaining the integrity of the language structure. Lemmatization enables better accuracy and precision in language understanding and generation tasks.

  • Overall Importance:

    • Vocabulary Normalization: Stemming and lemmatization help reduce the dimensionality of text data by grouping words with similar meanings. They assist in avoiding redundancy and noise in the data, leading to better generalization and improved performance in NLP models.

    • Language Understanding: By reducing words to their base forms, stemming and lemmatization enhance the ability of NLP systems to understand and process text. They facilitate tasks such as part-of-speech tagging, syntactic parsing, and semantic analysis by providing consistent representations of words.

    • Information Retrieval: Stemming and lemmatization contribute to more effective information retrieval by matching user queries with relevant documents. They improve recall by accounting for different word forms and variations, enabling a broader range of matching possibilities.

    • Text Analysis and Mining: Stemming and lemmatization aid in analyzing and mining large text corpora by simplifying and standardizing word representations. They assist in extracting meaningful patterns, identifying recurring themes, and gaining insights from textual data.

Choosing the appropriate technique (stemming or lemmatization) depends on the specific NLP task, language, and trade-offs between precision, recall, and computational complexity. It is crucial to evaluate and experiment with both techniques to ensure optimal performance and accurate language processing in various NLP applications.

In [ ]: