Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.

Part-of-Speech (POS) Tagging (Basics)¶

Introduction¶

Part-of-Speech (POS) Tags¶

A part-of-speech (POS) tag is a label assigned to a word in a sentence to indicate its grammatical category and syntactic function. It identifies the role the word plays in the structure of a sentence, such as whether it acts as a noun, verb, adjective, adverb, or other categories. POS tagging is a crucial step in natural language processing (NLP) and computational linguistics, as it helps to analyze and understand the grammatical relationships within a sentence. In English, there are generally eight main parts of speech that form the foundation of the language's grammatical system. These include:

  • Nouns: Words that represent people, places, things, or ideas (e.g., "dog", "city").
  • Pronouns: Words that substitute for nouns (e.g., "he", "they").
  • Verbs: Words that express actions, occurrences, or states of being (e.g., "run", "is").
  • Adjectives: Words that describe or modify nouns (e.g., "beautiful", "tall").
  • Adverbs: Words that modify verbs, adjectives, or other adverbs (e.g., "quickly", "very").
  • Prepositions: Words that show relationships between a noun (or pronoun) and another word (e.g., "on", "before").
  • Conjunctions: Words that connect words, phrases, or clauses (e.g., "and", "but").
  • Interjections: Words that express strong emotion or exclamation (e.g., "Wow!", "Ouch!").

Some linguistic frameworks expand this list to include additional categories, such as articles, determiners, and auxiliary verbs, to capture finer grammatical distinctions, as we will see in action later. For instance, determiners like "the" or "some" are often treated as a separate category to emphasize their specific function in identifying or quantifying nouns. With respect to their part of speech, words can be classified into two broad categories:

  • Closed-class Words: Closed-class words refer to prepositions (e.g., "from", "to", "with"), pronouns (e.g., "I/me", "she/her", "he/him"), particles (e.g., "off", "up"), determiners (e.g., "the", "a/an"), conjunctions (e.g., "and", "or", "but"), but also auxiliary verbs (e.g., "can", "have"). These words typically do not carry much semantic meaning in themselves but are considered function words that give sentences a proper grammatical structure. The set of closed-class words is relatively small and, for all intents and purposes, of a fixed size — for example, it is very unlikely that a new preposition or determiner is added to the English language.

  • Open-class Words: Open-class or content words include common nouns (e.g., "cat", "dog", "house"), proper nouns (e.g., "Alice", "Singapore", "Apple"), main verbs (e.g., "go", "swim", "learn"), adjectives (e.g., "nice", "helpful", "tall"), adverbs (e.g., "quickly", "loudly", "always"), but also numbers (e.g., "123", "3.14") and interjections (e.g., "Oh!", "Wow!"). This set of words is practically impossible to enumerate completely, as new words are continuously being invented, borrowed, etc.

Part-of-Speech (POS) Tagging¶

Part-of-speech (POS) tagging is the process of assigning POS tags to each word in a sentence, based on its context and role. These tags indicate the word's syntactic function, such as whether it is a noun, verb, adjective, or other part of speech. POS tagging serves as a foundational step in natural language processing (NLP), enabling machines to analyze and understand the structure of human language. Knowing the POS tags for words in a text is very useful or even crucial for many downstream tasks:

  • Lemmatization: select correct lemma given a word and its POS tag
  • Word Disambiguation: the same word might have different roles depending on the context (e.g., "I saw a bear[noun]. " vs "Bear[verb] with me!")
  • Named Entity Recognition: named entities (i.e., the names of persons, organizations, locations) typically consist of nouns and proper nouns
  • Information Extraction: verbs commonly indicate relations between entities (which are typically nouns or proper nouns)
  • Parsing: information about word classes is useful before creating parse trees
  • Speech synthesis/recognition: the same word might be pronounced differently depending on its POS tag (e.g., noun "DIScount" vs. verb "disCOUNT")
  • Authorship Attribution: the relative frequencies of nouns, verbs, adjectives, etc. often reflect the writing style of a specific author
  • Machine Translation: e.g., reordering of adjectives and nouns in the target language

Modern POS tagging methods range from rule-based approaches to statistical and deep learning models. Early systems relied on predefined grammatical rules and dictionaries, while more advanced methods use machine learning to predict tags based on labeled datasets. Tools like spaCy and NLTK utilize these approaches to provide high accuracy and flexibility in tagging diverse text inputs. These models can handle nuances like idiomatic expressions, slang, or complex sentence structures, making them indispensable for robust language processing.

Setting up the Notebook¶

Make Required Imports¶

This notebook requires importing various Python packages as well as additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been successfully imported.

In [1]:
from src.utils.libimports.postag import *
from src.utils.plotting.postag import *
from src.utils.data.files import *

Download Required Data¶

Some code examples in this notebook use data that first needs to be downloaded by running the code cell below. If this code cell throws an error, please check in the configuration file config.yaml whether the URL for downloading datasets is up to date and matches the one on Github. If not, simply download or pull the latest version from Github.

In [2]:
yelp_review, _ = download_dataset("text/corpora/misc/yelp-reviews-mon-ami-gabi.csv")
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 850k/850k [00:00<00:00, 11.9MiB/s]

POS Tag Sets¶

A POS tag set is a predefined collection of part-of-speech labels used to categorize words based on their grammatical role in a sentence. Different POS tag sets exist because languages vary in their grammatical structures, and linguistic tasks often require varying levels of detail and specificity. More specifically, the reasons for different tag sets are:

  • Linguistic Variability: Languages have unique grammatical structures and features, requiring tailored tag sets. For instance, English heavily relies on word order for meaning, while languages like Russian or Latin use inflectional morphology. Consequently, POS tag sets for each language may need to accommodate these differences.

  • Level of Granularity: Applications such as basic text classification often require only coarse-grained tags (e.g., "noun", "verb"), while applications such as syntactic parsers might need more fine-grained distinctions (e.g., "singular noun", "past-tense verb").

  • Standardization Across Datasets: Some tag sets aim to provide universal applicability across languages, like the Universal POS (UPOS) tag set. Others, like the Penn Treebank or CLAWS tag set, are designed for specific corpora or projects.

For a concrete example, we can look at the two POS tag sets supported by spaCy. Firstly, the Universal POS tags are a set of 17 standardized grammatical categories designed to be language-independent and provide a coarse-grained annotation of parts of speech. They are part of the Universal Dependencies project, which aims to create a uniform framework for linguistic annotation across languages. Here is the list of Universal POS tags with descriptions:

| Tag | Description | Examples |
| --- | --- | --- |
| ADJ | Adjective: Describes a noun or pronoun | big, green |
| ADP | Adposition: Prepositions and postpositions | on, under |
| ADV | Adverb: Modifies verbs, adjectives, or other adverbs | quickly, very |
| AUX | Auxiliary: Verbs that provide grammatical support | is, have |
| CCONJ | Coordinating Conjunction: Links words, phrases, or clauses equally | and, but |
| DET | Determiner: Introduces a noun phrase | the, a |
| INTJ | Interjection: Expresses emotion or exclamation | wow, oh |
| NOUN | Noun: Refers to people, places, things, or concepts | dog, idea |
| NUM | Numeral: Indicates a number | three, 42 |
| PART | Particle: Function words that do not fit other categories | not, to (in "to go") |
| PRON | Pronoun: Substitutes for a noun | she, they |
| PROPN | Proper Noun: Refers to specific names of people, places, or organizations | John, Paris |
| PUNCT | Punctuation: Symbols that delimit text | ., !, ? |
| SCONJ | Subordinating Conjunction: Introduces a subordinate clause | because, although |
| SYM | Symbol: Non-alphanumeric symbols | %, $, + |
| VERB | Verb: Expresses actions, occurrences, or states | run, think |
| X | Other: Words that do not belong to other categories or are hard to classify | |

For a more fine-grained tagging, spaCy also supports the Penn Treebank tag set. The Penn Treebank POS Tag Set is a fine-grained set of 36 part-of-speech tags developed for annotating the Penn Treebank, a corpus of parsed English text. It is widely used in natural language processing (NLP) tasks and is specific to English. The Penn Treebank tags provide detailed distinctions between different grammatical roles, offering greater granularity than the Universal POS tags. The complete list of tags is given in the table below.

| Tag | Description | Examples |
| --- | --- | --- |
| CC | Coordinating conjunction | and, but, or |
| CD | Cardinal number | one, two, 3 |
| DT | Determiner | the, a, an |
| EX | Existential "there" | there |
| FW | Foreign word | d’accord, vis-à-vis |
| IN | Preposition or subordinating conjunction | in, of, like, because |
| JJ | Adjective | big, red, fast |
| JJR | Adjective, comparative | bigger, faster |
| JJS | Adjective, superlative | biggest, fastest |
| LS | List item marker | 1., A., a) |
| MD | Modal | can, could, should |
| NN | Noun, singular or mass | dog, car, music |
| NNS | Noun, plural | dogs, cars, ideas |
| NNP | Proper noun, singular | John, London |
| NNPS | Proper noun, plural | Americans, Rockies |
| PDT | Predeterminer | all, both, half |
| POS | Possessive ending | ’s |
| PRP | Personal pronoun | I, you, he, she, it |
| PRP$ | Possessive pronoun | my, your, his |
| RB | Adverb | quickly, very, not |
| RBR | Adverb, comparative | faster, better |
| RBS | Adverb, superlative | fastest, best |
| RP | Particle | up, off, out |
| SYM | Symbol | %, $, + |
| TO | "to" | to (as in "to go") |
| UH | Interjection | oh, wow, hmm |
| VB | Verb, base form | run, eat |
| VBD | Verb, past tense | ran, ate |
| VBG | Verb, gerund or present participle | running, eating |
| VBN | Verb, past participle | eaten, driven |
| VBP | Verb, non-3rd person singular present | run, eat |
| VBZ | Verb, 3rd person singular present | runs, eats |
| WDT | Wh-determiner | which, that |
| WP | Wh-pronoun | who, what |
| WP$ | Possessive wh-pronoun | whose |
| WRB | Wh-adverb | where, when |
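
As a side note, spaCy can map tags from either tag set to a short human-readable description via spacy.explain(). The small sketch below assumes the spacy module itself is accessible (it is installed in any case, since the nlp pipeline used below relies on it):

In [ ]:
import spacy

# Print short human-readable descriptions for tags from both tag sets
for tag in ["PROPN", "AUX", "NNP", "VBD", "WP$"]:
    print(f"{tag}: {spacy.explain(tag)}")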

Let's look at some examples. POS tagging is part of the default pipeline in spaCy because it is an essential task for many natural language processing applications and serves as a foundation for a wide range of subsequent linguistic analyses. In general, POS tagging is rarely the main application task, but the information about a word's POS tag serves as crucial input for many downstream tasks such as syntactic or dependency parsing, named entity recognition, coreference resolution, and others. The code cell below shows the very simple steps required to use spaCy for POS tagging.

Your turn: Write your own sentences and let them be tagged by spaCy.

In [3]:
sentence = "Alice considered if she should switch from engineering to computer science."
#sentence = "A ticket for a direct flight from Singapore to Germany typically costs around S$1,000."
#sentence = "Wow! The movie was amazing with so many funny scenes and great acting."

# Analyze input text (incl. POS tagging!)
doc = nlp(sentence)

# Print results
for token in doc:
    print(f"{token.text} (Universal: {token.pos_}, Penn: {token.tag_})")
Alice (Universal: PROPN, Penn: NNP)
considered (Universal: VERB, Penn: VBD)
if (Universal: SCONJ, Penn: IN)
she (Universal: PRON, Penn: PRP)
should (Universal: AUX, Penn: MD)
switch (Universal: VERB, Penn: VB)
from (Universal: ADP, Penn: IN)
engineering (Universal: NOUN, Penn: NN)
to (Universal: ADP, Penn: IN)
computer (Universal: NOUN, Penn: NN)
science (Universal: NOUN, Penn: NN)
. (Universal: PUNCT, Penn: .)

POS Tagging Algorithm¶

Challenges¶

What makes the task of automatically assigning each word in a sentence its correct POS tag non-trivial is the fact that many words can be used as different parts of speech. For example, the word "back" can be, depending on the context, an adjective, a noun, an adverb, or a verb; here is one sentence for each case:

  • Adjective: The back door is open.
  • Noun: Her back was aching.
  • Adverb: Please come back soon.
  • Verb: He will back the proposal.

We can directly check this with spaCy as well:

In [4]:
sentences = [
    "The back door is open.",
    "Her back was aching.",
    "Please come back soon.",
    "He will back the proposal."
]

for sentence in sentences:
    print(sentence)
    # Analyze input text (incl. POS tagging!)
    doc = nlp(sentence)
    # Print results
    for token in doc:
        print(f"\t{token.text} (Universal: {token.pos_}, Penn: {token.tag_})")
The back door is open.
	The (Universal: DET, Penn: DT)
	back (Universal: ADJ, Penn: JJ)
	door (Universal: NOUN, Penn: NN)
	is (Universal: AUX, Penn: VBZ)
	open (Universal: ADJ, Penn: JJ)
	. (Universal: PUNCT, Penn: .)
Her back was aching.
	Her (Universal: PRON, Penn: PRP$)
	back (Universal: NOUN, Penn: NN)
	was (Universal: AUX, Penn: VBD)
	aching (Universal: VERB, Penn: VBG)
	. (Universal: PUNCT, Penn: .)
Please come back soon.
	Please (Universal: INTJ, Penn: UH)
	come (Universal: VERB, Penn: VB)
	back (Universal: ADV, Penn: RB)
	soon (Universal: ADV, Penn: RB)
	. (Universal: PUNCT, Penn: .)
He will back the proposal.
	He (Universal: PRON, Penn: PRP)
	will (Universal: AUX, Penn: MD)
	back (Universal: VERB, Penn: VB)
	the (Universal: DET, Penn: DT)
	proposal (Universal: NOUN, Penn: NN)
	. (Universal: PUNCT, Penn: .)

In short, a good POS tagger cannot be implemented purely based on simple lookups. On the other hand, there are many words that indeed have only a single possible POS tag. For example, "Alice" is always a proper noun, "quickly" is always an adverb, "the" is always a determiner, and so on. In fact, 80-85% of all words in the English language are unambiguous and can be assigned only one POS tag. This means that 15-20% of words can be assigned two or more tags. While this might seem like a reasonably low percentage, these 15-20% of words include many very common words. When analyzing a large English text corpus, around 55-65% of word tokens will therefore be ambiguous.
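
We can get a rough feel for these numbers ourselves. The sketch below counts ambiguous word types and tokens in NLTK's Brown corpus; it assumes the nltk package is installed and the brown and universal_tagset resources have been downloaded, neither of which is required elsewhere in this notebook:

In [ ]:
from collections import defaultdict
import nltk
# nltk.download("brown")             # uncomment if the corpus is not yet available
# nltk.download("universal_tagset")  # needed for tagset="universal"
from nltk.corpus import brown

tagged_words = brown.tagged_words(tagset="universal")

# Collect the set of tags observed for each (lowercased) word type
tags_per_word = defaultdict(set)
for word, tag in tagged_words:
    tags_per_word[word.lower()].add(tag)

ambiguous_types = {word for word, tags in tags_per_word.items() if len(tags) > 1}
print(f"Share of ambiguous word types:  {len(ambiguous_types) / len(tags_per_word):.1%}")

# Share of ambiguous word tokens in running text
num_ambiguous_tokens = sum(1 for word, _ in tagged_words if word.lower() in ambiguous_types)
print(f"Share of ambiguous word tokens: {num_ambiguous_tokens / len(tagged_words):.1%}")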

Naive Baseline Algorithm¶

Although many words in an average corpus might be ambiguous, we can use the fact that some words have only a single POS tag to come up with a very simple POS tagging algorithm; this algorithm includes only three straightforward rules (a small sketch follows the list below):

  • Label each unambiguous word with its respective POS tag
  • Label each ambiguous but known word with its most frequent POS tag
  • Label each unknown word as noun (the most frequent tag of open-class words)
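
The code below is a minimal sketch of this baseline; the word-tag counts in the toy frequency table are purely illustrative and would in practice be derived from an annotated corpus:

In [ ]:
# Toy frequency table: word -> {tag: count}
# (illustrative numbers only; real counts come from an annotated corpus)
tag_counts = {
    "the":  {"DET": 1000},
    "back": {"NOUN": 120, "ADV": 90, "ADJ": 40, "VERB": 30},
    "door": {"NOUN": 80},
    "is":   {"AUX": 500},
    "open": {"ADJ": 60, "VERB": 45},
}

def naive_tag(tokens):
    tagged = []
    for token in tokens:
        counts = tag_counts.get(token.lower())
        if counts is None:
            tag = "NOUN"                       # Rule 3: unknown words default to noun
        else:
            tag = max(counts, key=counts.get)  # Rules 1 & 2: pick the (most frequent) known tag
        tagged.append((token, tag))
    return tagged

print(naive_tag("The back door is open".split()))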

Of course, this algorithm — particularly the second rule — requires some kind of dataset to derive how frequent the different POS tags for the same word are. For high-resource languages such as English, such datasets are widely available. It turns out that this naive baseline algorithm achieves an average accuracy of around 92%. While this sounds quite reasonable, there are two important things to consider:

  • Imbalanced errors: The relatively high accuracy is largely due to common/frequent unambiguous words (e.g., "the", "a/an", "and", "or"). However, many of these words are not that interesting for most downstream tasks, as they usually carry little meaningful information; the errors, in turn, tend to fall on the more informative content words.

  • Downstream error propagation: As mentioned before, POS tagging is rarely the final output of a task or application; rather, the tags serve as an important (auxiliary) input for many downstream tasks. Thus, any errors made during POS tagging will propagate and potentially amplify in those downstream tasks.

While there are unsupervised machine learning algorithms for POS tagging, basically all practical implementations of POS taggers rely on supervised algorithms. This, again, requires meaningfully large and suitably annotated datasets for training. For common English, such datasets are widely available, but this can pose a significant challenge for low-resource languages where good training datasets do not (yet) exist. Assuming the availability of a large annotated dataset for training a POS tagger, different algorithms have been successfully used for this task, including:

  • Hidden Markov Models (HMM)
  • Conditional Random Fields (CRF)
  • Neural sequence models (RNNs, Transformers)
  • Large language models (e.g., BERT)
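
As a quick illustration of a pre-trained statistical tagger outside of spaCy, the sketch below uses NLTK's default tagger (an averaged perceptron); it assumes the nltk package and its tokenizer and tagger resources are installed, which is not required elsewhere in this notebook:

In [ ]:
import nltk
# nltk.download("punkt_tab")                       # tokenizer model (older NLTK versions: "punkt")
# nltk.download("averaged_perceptron_tagger_eng")  # tagger model (older NLTK versions: "averaged_perceptron_tagger")

tokens = nltk.word_tokenize("Alice considered if she should switch from engineering to computer science.")
print(nltk.pos_tag(tokens))  # Penn Treebank tags, e.g. ('Alice', 'NNP'), ('considered', 'VBD'), ...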

For English, all these models yield state-of-the-art results with accuracy values of 97-98%. This means that modern POS taggers have reached the "human ceiling", i.e., they perform as well as human annotators. The reason the human ceiling is not at 100% is simply that language can be so ambiguous that even humans may find it difficult to always identify the correct POS tag. For example, consider the following two sentences:

  • "Flying planes can be dangerous."
  • "Fruit flies like a banana"

In the first sentence, "flying" can either be an adjective or a verb. In the second sentence, "flies" can be a noun or a verb, and "like" (depending on the choice for "flies") can either be a verb or a preposition. In all those cases, it is completely up to a reader's or annotator's interpretation. Let's check how spaCy (and the language model it uses) tags both sentences:

In [5]:
sentences = [
    "Flying planes can be dangerous.",
    "Fruit flies like a banana."
]

for sentence in sentences:
    print(sentence)
    # Analyze input text (incl. POS tagging!)
    doc = nlp(sentence)
    # Print results
    for token in doc:
        print(f"\t{token.text} (Universal: {token.pos_}, Penn: {token.tag_})")
Flying planes can be dangerous.
	Flying (Universal: VERB, Penn: VBG)
	planes (Universal: NOUN, Penn: NNS)
	can (Universal: AUX, Penn: MD)
	be (Universal: AUX, Penn: VB)
	dangerous (Universal: ADJ, Penn: JJ)
	. (Universal: PUNCT, Penn: .)
Fruit flies like a banana.
	Fruit (Universal: NOUN, Penn: NN)
	flies (Universal: VERB, Penn: VBZ)
	like (Universal: ADP, Penn: IN)
	a (Universal: DET, Penn: DT)
	banana (Universal: NOUN, Penn: NN)
	. (Universal: PUNCT, Penn: .)

At least when using the en_core_web_sm language model, spaCy tags "flying" as a verb, and "flies"/"like" as a verb/preposition. But keep in mind that a different language model might return different results. That being said, such cases are not overly common. Thus, for high-resource languages such as (common) English, POS tagging is generally considered a solved task. However, there are still challenges when it comes to low-resource languages or special application domains (e.g., those with a very distinct and specialized vocabulary).
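
To check whether a different model changes the analysis, one could load another pre-trained pipeline. The lines below are only a sketch and assume the larger en_core_web_lg model has been installed (e.g., via python -m spacy download en_core_web_lg), which is not required for the rest of this notebook:

In [ ]:
import spacy

# Uncomment after installing the larger model to compare its tags with the output above
# nlp_large = spacy.load("en_core_web_lg")
# for token in nlp_large("Flying planes can be dangerous."):
#     print(f"{token.text} (Universal: {token.pos_}, Penn: {token.tag_})")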


Simple Application Example: Analysis of Restaurant Reviews¶

Lastly, let's look at a very simple but still very useful application of POS tagging within a concrete application scenario. In the following example, we want to analyze 1,000 Yelp reviews about the restaurant "Mon Ami Gabi" in Las Vegas (USA) to see which adjectives are most commonly used. The goal is a word cloud showing the most prominent adjectives used across all 1,000 reviews to get a good picture of what the users think about this restaurant.

  • Link to restaurant on Yelp: https://www.yelp.com/biz/mon-ami-gabi-las-vegas-2

Load reviews from CSV file¶

We use the pandas package for easy handling and reading of CSV files. pandas uses the notion of data frames (df) to denote its data objects.

In [6]:
df = pd.read_csv(yelp_review)

df.head()
Out[6]:
review_number review
0 1 Excellent food, great atmosphere, a bit noisy....
1 2 If you enjoy a little people watching with you...
2 3 affordable, fairly classic french foodsit outs...
3 4 Though heartbroken and a bit aimless on my 22n...
4 5 The food and wine was amazing, but the super h...

The CSV file with the reviews and thus the data frame have two columns: the review number and the text of the review. Since we're only interested in the review texts, we can simply extract them into a list of strings.

In [7]:
reviews = df['review'].tolist() # "review" is the name of the column of interest (see above)

Review analysis¶

For each review, we perform the following steps:

  • Tokenize the review and POS tag all tokens
  • Check for each token whether it is an adjective
  • If a token is an adjective, increase a counter for this adjective
In [8]:
# This dictionary will keep track of the count for each found adjective
adjective_frequencies = {}

# Check each review one by one
for review in tqdm(reviews):
    for token in nlp(review):
        # Ignore tokens that are not adjectives
        if token.pos_ != 'ADJ':
            continue
        # Convert token to lowercase, otherwise "Good" and "good" are considered differently
        adjective = token.text.lower()
        # Update counts/frequencies
        if adjective not in adjective_frequencies:
            adjective_frequencies[adjective] = 1.0
        else:
            adjective_frequencies[adjective] = adjective_frequencies[adjective] + 1.0
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:34<00:00, 28.75it/s]

With adjective_frequencies, we now have a dictionary where the keys are the adjectives and the values represent how often an adjective has occurred in all reviews. Let's have a look at a couple of examples.

In [9]:
# "Good" adjectives
print(adjective_frequencies['great'])
print(adjective_frequencies['amazing'])
print(adjective_frequencies['excellent'])
print()
# "Bad" adjectives
print(adjective_frequencies['disappointed'])
print(adjective_frequencies['pricey'])
print(adjective_frequencies['sad'])
811.0
170.0
155.0

17.0
32.0
9.0

We can see that adjectives associated with a positive sentiment are much more frequently used than adjectives typically associated with a negative sentiment. We can make the argument that "Mon Ami Gabi" is considered to be a good restaurant.
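
Instead of spot-checking a handful of hand-picked adjectives, we can also sort the dictionary by frequency to inspect, say, the ten most common adjectives as a small sanity check before visualizing:

In [ ]:
# Ten most frequent adjectives across all reviews
top10 = sorted(adjective_frequencies.items(), key=lambda item: item[1], reverse=True)[:10]
for adjective, count in top10:
    print(f"{adjective}: {int(count)}")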

Important: Keep in mind that our approach of counting the occurrences of adjectives is, to some extent, simplified. Most importantly, we do not consider negation here. For example, if a review states "The food was not great", we would still count the occurrence of "great". For getting a high-level insight about the sentiment, such limitations are generally acceptable. However, one could refine the analysis, e.g., by ignoring all negated adjectives, as sketched below (you can think about why it's actually not that easy to check whether an adjective is negated or not).
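
As a starting point, one could exploit spaCy's dependency parse: the sketch below flags an adjective as negated if it (or its syntactic head) has a child with the dependency label neg. This is only a rough heuristic — parse attachments vary, so it will miss or over-flag some cases:

In [ ]:
def is_negated(adjective_token):
    # Rough heuristic: a negation particle attached directly to the adjective
    # (e.g., "not great food") or to its syntactic head (e.g., the copula in "was not great")
    if any(child.dep_ == "neg" for child in adjective_token.children):
        return True
    if any(child.dep_ == "neg" for child in adjective_token.head.children):
        return True
    return False

for token in nlp("The food was not great, but the service was excellent."):
    if token.pos_ == 'ADJ':
        print(f"{token.text}: negated = {is_negated(token)}")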

Visualization of Results¶

While the dictionary adjective_frequencies contains all the important information, it is not a very convenient representation or visualization to show to a user looking for some kind of summary of a restaurant. However, information about word frequencies (here: adjectives) lends itself to visualization as a word cloud.

We use a readily available Python package (wordcloud) for convenience. We also provide an auxiliary method show_wordcloud() that generates a word cloud given a dictionary of word frequencies. Feel free to have a look at the method's implementation in src/utill.py.

In [10]:
show_wordcloud(adjective_frequencies)
[Output image: word cloud of the most frequent adjectives across all reviews]

This word cloud arguably now provides a very easy way to capture the overall sentiment about the restaurant.


Summary¶

Part-of-Speech (POS) tagging is a fundamental task in natural language processing (NLP) that involves assigning a grammatical category (e.g., noun, verb, adjective) to each word in a text. This process provides syntactic information about words, which is essential for understanding sentence structure and meaning. POS tagging is typically a preliminary step in many NLP pipelines, helping downstream tasks such as syntactic parsing, information retrieval, and sentiment analysis by providing richer context about the roles words play within sentences.

One of the key challenges in POS tagging is handling the ambiguity of natural language. Many words can serve multiple grammatical roles depending on their context. For example, the word "book" can be a noun ("Read the book") or a verb ("Book a table"). Resolving such ambiguities requires leveraging contextual information, which often involves designing complex algorithms or models that can capture relationships between words in a sentence. Additionally, languages with free word order, rich morphology, or limited labeled data present further difficulties for POS tagging systems.

The importance of POS tagging lies in its ability to enhance the performance of numerous downstream tasks. For instance, in syntactic parsing, POS tags provide a foundational layer that helps disambiguate sentence structures. Similarly, in machine translation, knowing whether a word is a noun or a verb can guide the selection of appropriate equivalents in the target language. POS tagging is also crucial for tasks like named entity recognition (NER), where distinguishing proper nouns from common nouns improves accuracy.

Modern approaches to POS tagging leverage machine learning, especially deep learning, to address these challenges. Models like conditional random fields (CRFs), recurrent neural networks (RNNs), and transformers have significantly improved tagging accuracy, particularly in handling ambiguity and unseen data. However, achieving high accuracy across diverse languages and domains remains an ongoing challenge, making POS tagging a vibrant area of research and development in NLP.

In summary, POS tagging is a vital preprocessing step in NLP with a profound impact on downstream tasks. Its challenges, including lexical ambiguity and variability across languages, have driven advancements in statistical and neural methods. As NLP continues to evolve, robust POS tagging will remain an integral part of building systems that understand and generate human language effectively.

In [ ]: