Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.

Token Indexing with Vocabularies¶

Token indexing using vocabularies in the context of machine learning for text is the process of mapping words, subwords, or characters in a text to unique numerical identifiers based on a predefined vocabulary. A vocabulary is a set of known tokens that a model can recognize, with each token assigned a corresponding index. For example, if a vocabulary consists of {hello: 1, world: 2, machine: 3, learning: 4}, then the phrase "hello world" would be represented as [1, 2]. This transformation allows text data to be processed by machine learning models, which operate on numerical inputs rather than raw text.
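As a minimal sketch of this lookup (the toy dictionary vocab below is purely illustrative and not used elsewhere in this notebook), the mapping boils down to a plain Python dictionary:

# A tiny, hypothetical vocabulary mapping tokens to indices
vocab = {"hello": 1, "world": 2, "machine": 3, "learning": 4}

# "hello world" becomes the index sequence [1, 2]
indices = [vocab[token] for token in "hello world".split()]
print(indices)  # [1, 2]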

Token indexing is essential because it provides a structured way to represent text, enabling machine learning models to process and analyze it efficiently. Without this step, words would have no standardized numerical representation, making it difficult for models to recognize patterns or relationships in language. Additionally, token indexing helps manage computational resources effectively by ensuring that text input is transformed into compact numerical sequences that can be fed into neural networks. Advanced tokenization methods, such as subword tokenization (e.g., Byte Pair Encoding or WordPiece), further enhance this process by breaking down rare or unknown words into smaller, more frequently occurring components.

One of the most important aspects of token indexing is its connection to word embeddings. Once text is converted into token indices, these indices serve as inputs to an embedding layer in a neural network. The embedding layer maps each token index to a dense, high-dimensional vector that captures semantic and syntactic relationships between words. For instance, words with similar meanings, such as "king" and "queen", would have embeddings that are closer in vector space. This transformation is crucial for deep learning models, as it allows them to understand language beyond simple numerical representations.
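As an illustrative sketch only (PyTorch is not imported in this notebook, so the snippet below assumes torch is available and uses made-up sizes), an embedding layer is essentially a lookup table from token indices to dense vectors:

import torch
import torch.nn as nn

# Hypothetical sizes: a vocabulary of 10 tokens, embedded into 5-dimensional vectors
embedding = nn.Embedding(num_embeddings=10, embedding_dim=5)

# The indexed phrase [1, 2] (e.g., "hello world") becomes a 2x5 matrix of dense vectors
vectors = embedding(torch.tensor([1, 2]))
print(vectors.shape)  # torch.Size([2, 5])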

By linking token indexing to word embeddings, machine learning models gain the ability to generalize and make sense of complex linguistic patterns. Pretrained embeddings, such as Word2Vec, GloVe, or contextual embeddings from Transformer models like BERT and GPT, enable models to leverage large amounts of text data to learn meaningful word associations. Ultimately, token indexing serves as the foundation for powerful NLP applications, including text classification, sentiment analysis, machine translation, and chatbots, making it a fundamental component of modern natural language processing.

Setting up the Notebook¶

Make Required Imports¶

This notebook requires importing several Python packages as well as additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been successfully imported.

In [1]:
from src.utils.libimports.textidx import *
from src.utils.data.files import *
from src.text.vectorizing.vocab import *

Download Required Data¶

Some code examples in this notebook use data that first needs to be downloaded by running the code cell below. If this code cell throws any error, please check in the configuration file config.yaml whether the URL for downloading datasets is up to date and matches the one on Github. If not, simply download or pull the latest version from Github.

In [2]:
sentences_pos, _ = download_dataset("text/classification/sentence-polarity/sentence-polarity.pos")
sentences_neg, _ = download_dataset("text/classification/sentence-polarity/sentence-polarity.neg")
File 'data/datasets/text/classification/sentence-polarity/sentence-polarity.pos' already exists (use 'overwrite=True' to overwrite it).
File 'data/datasets/text/classification/sentence-polarity/sentence-polarity.neg' already exists (use 'overwrite=True' to overwrite it).

This notebook also generates new data and saves it as files into a specified output folder. You can change the default output folder in the code cell below to customize your file management.

In [3]:
output_folder = create_folder("data/generated/")

Example Dataset¶

We start with a very small example dataset and later look at an (arguably still small) real-world dataset. This allows us to clearly see and understand all the involved steps and to inspect the intermediate results after each step. For example, given the very small dataset size, we can actually print the complete vocabulary.

Create Simple Classification Dataset¶

The array dataset_news below represents a very simple classification dataset for training a model to predict whether a sentence from a news article should be labeled "politics" or "sports". Since we don't actually train a model here but rather follow each step in detail, this dataset contains only 7 samples.

In [4]:
dataset_news = [
    ("The mayor was elected for this term and the next term.", "politics"),
    ("A mayor's goal for the next term is to win.", "politics"),
    ("The goal for this term was to win the vote.", "politics"),
    ("This term's goals are next term's goals.", "politics"),
    ("The goal of any team player is the win.", "sports"),
    ("A win for the team is a win for each player.", "sports"),
    ("Players vote other players for another term.", "sports")
]

inputs_news = [ tup[0] for tup in dataset_news ]
targets_news = [ tup[1] for tup in dataset_news ]

Prepare Class Labels¶

Assuming that $C$ denotes the number of classes, most algorithms for training classification models in existing libraries — including neural network libraries such as PyTorch — assume that all classes are labeled from $0$ to $C-1$. Right now, our classes are labeled with the strings "sports" and "politics", so we need to convert those labels to adhere to the required format. Since we only have 2 classes in our example, we can do this "manually" by creating two dictionaries that map each string class label to a valid integer class label:

In [5]:
label2index_news = { "politics": 0, "sports": 1 }
index2label_news = { 0: 'politics', 1: 'sports' }

While this works just fine, we have to be careful to avoid mistakes, particularly when the number of classes increases. For one, we have to ensure that the mapping is consistent in both directions. For example, if "politics" maps to $0$, we have to make sure that $0$ maps back to "politics"; this must hold for all $C$ classes. We also have to make sure that we do not map two different class labels to the same index — although this would throw an error when trying to create the dictionary to map from indices back to the class labels since an index can only map to one label.
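If we do create these two dictionaries by hand, a quick round-trip check, sketched below, can catch inconsistent mappings early:

# Sanity check: both dictionaries must have the same size, and every label must
# map back to itself when going through both dictionaries
assert len(label2index_news) == len(index2label_news)
for label, index in label2index_news.items():
    assert index2label_news[index] == label, f"Inconsistent mapping for '{label}'"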

To avoid these kinds of mistakes, we therefore do not create these two mappings manually but automatically. For this, let's first identify the unique set of labels for a given dataset, which is just two class labels for our very simple news dataset. We already have all labels for each data sample as a list. To get all unique labels, we can simply convert this list to a set.

In [6]:
labels_news = set(targets_news)

print(labels_news)
{'sports', 'politics'}

To create a unique index from $0$ to $C-1$ for each class label, we can utilize the built-in enumerate() function to make life easy. It adds a counter to an iterable (e.g., a set, list, tuple, or dictionary) and returns it as an enumerate object, which can be directly converted into a list or iterated through in a loop. It is commonly used in for loops to simultaneously access both the index and the value of each item in the iterable. This improves code readability and eliminates the need to manually maintain a counter when iterating over collections.
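For illustration, this is what enumerate() yields for our set of labels (since sets are unordered, the exact order is not guaranteed):

print(list(enumerate(labels_news)))
# e.g., [(0, 'sports'), (1, 'politics')]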

The code cell below also uses the nice concept of dictionary comprehension in Python. Dictionary comprehensions provide a concise and elegant way to create dictionaries by combining loops and conditional logic in a single line of code. This approach is more readable and efficient compared to using explicit loops for dictionary creation.

In [7]:
label2index_news = { label:index for index, label in enumerate(set(targets_news)) }

print(label2index_news)
{'sports': 0, 'politics': 1}

Notice that this automated approach ensures that we do not accidentally map two different class labels to the same index. The only minor limitation is that we cannot specify which label gets mapped to which index, and vice versa. However, in practice, this basically never matters; it is only important that the indices are between $0$ and $C-1$, which is easily achieved. Of course, if the exact mapping does matter, this can still easily be achieved, for example, by manually creating a list of all class labels in the desired order.
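As a small sketch, sorting the labels first yields a deterministic (here: alphabetical) assignment; the variable label2index_fixed below is only for illustration and not used later:

# Sorting the labels makes the label-to-index assignment deterministic
label2index_fixed = { label: index for index, label in enumerate(sorted(labels_news)) }
print(label2index_fixed)  # {'politics': 0, 'sports': 1}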

We can now map from our string class labels to integer class labels, but we still lack the opposite direction that allows us to map back from the indices to the original class labels. To ensure that both mappings are consistent, we can create the mapping from indices to strings by "reversing" the label2index_news dictionary. Again, using dictionary comprehension, this only takes a single line of code:

In [8]:
index2label_news = { v:k for k,v in label2index_news.items() }

print(index2label_news)
{0: 'sports', 1: 'politics'}

With these two dictionaries, we can now map back and forth between the original string labels and the index labels that serve as input for a classification algorithm. Thus, to convert our list of string labels for all data samples to a vector of label indices, we can use a list comprehension together with the dictionary label2index_news:

In [9]:
target_vector_news = [ label2index_news[label] for label in targets_news ]

print(f"{targets_news} ==> {target_vector_news}")
['politics', 'politics', 'politics', 'politics', 'sports', 'sports', 'sports'] ==> [1, 1, 1, 1, 0, 0, 0]

Once a model is trained with these integer class labels, it will also output integer class labels as its predictions. If we want to convert these integers back to the original string class labels, we can use the dictionary index2label_news. For example, if we assume that the model returns 1 as the predicted class label for some input text, we can simply get the string label using:

In [10]:
print(index2label_news[1])
politics

With respect to the class labels, we are done. The more interesting step is to convert our sentences.

Create Vocabulary & Mappings¶

Similar to the class labels, the overall goal is to map each unique word/token to a unique index, i.e., an integer identifier. And also similar to the labels, given a vocabulary size of $V$, these unique indices must be in the range from $0$ to $V-1$. However, compared to the list of class labels, which we could directly convert to a set, the inputs require some additional steps; more specifically:

  • Create vocabulary: Like for the class labels, the vocabulary is simply the set of unique words — strictly speaking, unique tokens, as the vocabulary may contain punctuation marks, numbers, or other non-standard words. However, in practice, we also often want to count the number of occurrences of each token so that we can potentially limit the vocabulary to the most common tokens.

  • Create mappings: Once we have the vocabulary, we again need to map each token to a unique index, and vice versa. Of course, we again have to make sure that the mappings are consistent. We therefore create both mappings the same way we did for the class labels.

  • Additional considerations: Converting between tokens and indices may involve various application-dependent considerations. The most common one is that we need to decide how to map unseen tokens to a meaningful index. Not only do we typically limit the size of the vocabulary, but even if we did not, a new input text may always contain tokens not present in the vocabulary.

Well, let's do this...

Create Vocabulary¶

Compute Token Frequencies¶

As already mentioned, we often want to filter out (very) rare tokens from the vocabulary. We therefore need to first compute the number of occurrences of all tokens/words in our corpus. Using the Counter class from Python's collections library makes this very easy. This is a specialized dictionary subclass designed to count the occurrences of elements in an iterable, such as a list or string. It stores elements as dictionary keys and their counts as values, making it easy to tally frequencies.

Important: In this step, we also need to decide how we want to preprocess or filter the input text. For example, in the code cell below, we perform case-folding (i.e., converting all words to lowercase). In contrast, we do not perform stemming or lemmatization, and we keep all stopwords, punctuation marks, etc. Which preprocessing steps are performed strongly depends on the exact application context and other assumptions.

In [11]:
# Auxiliary method to preprocess a text string
def preprocess(text):
    return [token.text.lower() for token in nlp(text) ]
    

# Create counter (a specialized dictionary)
token_counter_news = Counter()

for text in inputs_news:
    for token in preprocess(text):
        token_counter_news[token] += 1
        
print(token_counter_news)        
Counter({'the': 8, 'term': 7, '.': 7, 'for': 6, 'win': 5, 'this': 3, 'next': 3, 'a': 3, "'s": 3, 'goal': 3, 'is': 3, 'mayor': 2, 'was': 2, 'to': 2, 'vote': 2, 'goals': 2, 'team': 2, 'player': 2, 'players': 2, 'elected': 1, 'and': 1, 'are': 1, 'of': 1, 'any': 1, 'each': 1, 'other': 1, 'another': 1})
Sort Tokens by Frequency¶

By default, the Counter class does not guarantee that the entries are sorted with respect to their counts. So let's create a sorted list of tuples, where each tuple contains a token and its number of occurrences. As usual, Python makes such things exceedingly easy: a single line of code using the built-in sorted() function suffices. It returns a new sorted list from the elements of any iterable, such as lists, tuples, or strings, without modifying the original iterable. The syntax is sorted(iterable, key=None, reverse=False), where key is an optional parameter specifying a function to determine the sorting criteria, and reverse is a Boolean indicating whether to sort in descending order (default is False for ascending order).

In [12]:
# Sort by word frequency
token_counter_news_sorted = sorted(token_counter_news.items(), key=lambda x: x[1], reverse=True)

print("Number of tokens: {}".format(len(token_counter_news_sorted)))
print(token_counter_news_sorted)
Number of tokens: 27
[('the', 8), ('term', 7), ('.', 7), ('for', 6), ('win', 5), ('this', 3), ('next', 3), ('a', 3), ("'s", 3), ('goal', 3), ('is', 3), ('mayor', 2), ('was', 2), ('to', 2), ('vote', 2), ('goals', 2), ('team', 2), ('player', 2), ('players', 2), ('elected', 1), ('and', 1), ('are', 1), ('of', 1), ('any', 1), ('each', 1), ('other', 1), ('another', 1)]
Limit Vocabulary¶

Since we now have a list where the tokens are sorted with respect to their number of occurrences in descending order, we can easily keep only the k most frequent tokens. Since we only have 27 tokens in our "full" vocabulary, let's consider the top-25 most frequent tokens; see the code cell below. In practice, a "full" vocabulary can easily contain several hundred thousand tokens, with only the top-20k to top-50k tokens being considered.

In [13]:
TOP_TOKENS_NEWS = 25

token_counter_news_sorted_filtered = token_counter_news_sorted[:TOP_TOKENS_NEWS]

print("Number of tokens: {}".format(len(token_counter_news_sorted_filtered)))
print(token_counter_news_sorted_filtered)
Number of tokens: 25
[('the', 8), ('term', 7), ('.', 7), ('for', 6), ('win', 5), ('this', 3), ('next', 3), ('a', 3), ("'s", 3), ('goal', 3), ('is', 3), ('mayor', 2), ('was', 2), ('to', 2), ('vote', 2), ('goals', 2), ('team', 2), ('player', 2), ('players', 2), ('elected', 1), ('and', 1), ('are', 1), ('of', 1), ('any', 1), ('each', 1)]
Create Final Vocabulary¶

To get the final vocabulary, i.e., only the list or set of tokens we want to consider, we can use another list comprehension to extract all the tokens from the (token, count)-pairs:

In [14]:
tokens_news = [ tup[0] for tup in token_counter_news_sorted_filtered ]

print(tokens_news)
['the', 'term', '.', 'for', 'win', 'this', 'next', 'a', "'s", 'goal', 'is', 'mayor', 'was', 'to', 'vote', 'goals', 'team', 'player', 'players', 'elected', 'and', 'are', 'of', 'any', 'each']

Create Mappings¶

Special Tokens¶

Many neural network models working with text require or benefit from special tokens beyond the tokens in the vocabulary derived from a training dataset. Special tokens such as SOS (start of sequence), EOS (end of sequence), PAD (padding), UNK (unknown), SEP (separator), and CLS (classification) play critical roles in preparing text data for neural networks, particularly in models for tasks like text generation, machine translation, and language understanding.

  • SOS and EOS mark the start and end of a sequence, helping models learn the boundaries of input or output text. These tokens guide sequence models in identifying where text begins and ends, which is crucial for tasks such as translation and text generation.
  • PAD tokens are used to standardize the lengths of sequences by padding shorter sequences to match the length of the longest one in a batch, facilitating efficient batch processing.
  • UNK represents out-of-vocabulary words not present in the training set, allowing models to handle unseen words during inference.

In transformer-based models like BERT, SEP separates segments of text within a single input sequence, enabling the model to distinguish between different sentences or clauses. CLS is a special token placed at the beginning of the input and is used to aggregate the representation of the entire sequence for classification tasks. These special tokens help structure input text, improve processing efficiency, and enhance the ability to capture semantic information for various NLP tasks.
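To make these roles a bit more concrete, the small sketch below (purely illustrative and not used later in this notebook) shows typical placements of such tokens around a tokenized sentence, using the angle-bracket naming we adopt below:

# Purely illustrative: typical placements of special tokens around a token list
tokens = ["the", "mayor", "was", "elected"]

# Sequence-to-sequence style: mark the start and end of the sequence
seq2seq_input = ["<SOS>"] + tokens + ["<EOS>"]

# BERT-style classification input: CLS in front, SEP at the end of a segment
bert_style_input = ["<CLS>"] + tokens + ["<SEP>"]

print(seq2seq_input)
print(bert_style_input)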

These are the most common special tokens, but others exist; which special tokens are indeed needed depends on the exact model and task. For our example here, let's consider all six special tokens we have just described by creating a list containing them:

In [15]:
TOKEN_PAD, TOKEN_UNK, TOKEN_SOS, TOKEN_EOS, TOKEN_SEP, TOKEN_CLS = "<PAD>", "<UNK>", "<SOS>", "<EOS>", "<SEP>", "<CLS>"

SPECIAL_TOKENS = [TOKEN_PAD, TOKEN_UNK, TOKEN_SOS, TOKEN_EOS, TOKEN_SEP, TOKEN_CLS]

Note that the exact strings representing the special tokens do not matter. It is only important to ensure that none of the special tokens is likely to already exist in our initial vocabulary derived from the training data. For this reason, we add angled brackets to each token, since tokens such as "pad" or "sos" may already occur in the dataset. Still, instead of <PAD>, we could also use [PAD], <PADDING>, ###PAD###, and so on. In short, there is nothing intrinsically special about the strings for the special tokens. However, our choices above adhere to best practices used in many implementations and educational materials.

Side note: While not mandatory, we use <PAD> as the first special token in the list above. This ensures, in a moment, that <PAD> will be mapped to the index $0$. This index is commonly assumed to be the index for padding tokens. However, it is generally not mandatory, since one can explicitly specify which index indicates padding.
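For example, in PyTorch (not imported in this notebook, so the snippet below is only a sketch with hypothetical sizes), the padding index can be passed explicitly to the embedding layer; the vector at that index is then kept at zero and excluded from gradient updates:

import torch.nn as nn

# Sketch only: an embedding layer that treats index 0 as the padding index
vocab_size, embedding_dim = 100, 8   # hypothetical sizes
embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)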

Token2Index & Index2Token Mappings¶

With our initial vocabulary and the list of special tokens, we can create a mapping from tokens to unique indices the same way as we did for the class labels, using a dictionary comprehension. Note that the dictionary comprehension iterates over the concatenation of the list of special tokens and the list of vocabulary tokens. We chose this order to ensure that the special tokens get the lowest indices, particularly so that <PAD> will get the index $0$. Again, this is not mandatory but a very common best practice.

In [16]:
token2index_news = { token:index for index, token in enumerate(SPECIAL_TOKENS + tokens_news) }

print(token2index_news)
{'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3, '<SEP>': 4, '<CLS>': 5, 'the': 6, 'term': 7, '.': 8, 'for': 9, 'win': 10, 'this': 11, 'next': 12, 'a': 13, "'s": 14, 'goal': 15, 'is': 16, 'mayor': 17, 'was': 18, 'to': 19, 'vote': 20, 'goals': 21, 'team': 22, 'player': 23, 'players': 24, 'elected': 25, 'and': 26, 'are': 27, 'of': 28, 'any': 29, 'each': 30}

With this first dictionary mapping from tokens to indices, we can create the dictionary mapping the indices to tokens by reversing token2index_news using a dictionary comprehension. This ensures that both mappings are consistent using a single line of code.

In [17]:
index2token_news = { v:k for k,v in token2index_news.items() }

print(index2token_news)
{0: '<PAD>', 1: '<UNK>', 2: '<SOS>', 3: '<EOS>', 4: '<SEP>', 5: '<CLS>', 6: 'the', 7: 'term', 8: '.', 9: 'for', 10: 'win', 11: 'this', 12: 'next', 13: 'a', 14: "'s", 15: 'goal', 16: 'is', 17: 'mayor', 18: 'was', 19: 'to', 20: 'vote', 21: 'goals', 22: 'team', 23: 'player', 24: 'players', 25: 'elected', 26: 'and', 27: 'are', 28: 'of', 29: 'any', 30: 'each'}
Additional Considerations: Handling Unknown Words¶

With the dictionary token2index_news, we can now map from tokens to their respective indices. However, this only works if the token is either a known special token or a token from the initial vocabulary. Otherwise, the token is not a valid key in our dictionary, and trying to access it would result in an error. We therefore need to handle this case meaningfully. This is where the special token <UNK> comes into play. Any time we encounter a token that is not in our vocabulary, we treat it as "unknown" and map it to the index of <UNK>. We consider this the default index for unknown tokens, and we have to explicitly specify it for later use:

In [18]:
default_index = token2index_news[TOKEN_UNK]

print(default_index)
1
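With this default index at hand, a safe lookup for a single token can already be done using the dictionary's get() method, which falls back to the default index for unknown tokens:

print(token2index_news.get("mayor", default_index))      # 17 (known token)
print(token2index_news.get("president", default_index))  # 1  (unknown token, mapped to <UNK>)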

Working with the Vocabulary & Mappings¶

With the vocabulary and both mappings created, we can now use them for preparing our text data to serve as input for neural networks. For example, let's see which indices have been assigned to our special tokens. This task comes down to a simple lookup in the dictionary token2index_news.

In [19]:
for special_token in SPECIAL_TOKENS:
    token_index = token2index_news[special_token]
    print("The index of {} in the vocabulary is: {}".format(special_token, token_index))
The index of <PAD> in the vocabulary is: 0
The index of <UNK> in the vocabulary is: 1
The index of <SOS> in the vocabulary is: 2
The index of <EOS> in the vocabulary is: 3
The index of <SEP> in the vocabulary is: 4
The index of <CLS> in the vocabulary is: 5

As you can see, the indices reflect the order of our special tokens in the list SPECIAL_TOKENS. And as mentioned above, having <PAD> mapped to $0$ is often convenient in practice. Of course, we can also map back from indices to the tokens. Let's do this for the first 10 indices.

In [20]:
for idx in range(10):
    print("Token at index {}: {}".format(idx, index2token_news[idx]))
Token at index 0: <PAD>
Token at index 1: <UNK>
Token at index 2: <SOS>
Token at index 3: <EOS>
Token at index 4: <SEP>
Token at index 5: <CLS>
Token at index 6: the
Token at index 7: term
Token at index 8: .
Token at index 9: for

In practice, we don't need to do this for each individual token or index. It's much more convenient to do the encoding for a complete list of tokens. Here, we have to address the case that a token might not have been in our vocabulary and is therefore not a key in the dictionary token2index_news. Recall that we want to map to the index of the special token <UNK> (unknown) in this case. For convenience and easy re-use, let's implement this using an auxiliary method encode():

In [21]:
def encode(tokens: list[str]):
    return [ token2index_news[t] if t in token2index_news else default_index for t in tokens ]

Let's first apply this method to a list of tokens where all tokens are in our vocabulary.

In [22]:
encode(['the', 'mayor', 'was', 'elected'])
Out[22]:
[6, 17, 18, 25]

The code cell below shows an example of what happens if the input list contains a previously unseen token (here: president). As required, unseen tokens get replaced by the index representing our special token <UNK>, which is 1 in our case here.

In [23]:
encode(['the', 'president', 'was', 'elected'])
Out[23]:
[6, 1, 18, 25]

Similarly, we also want a method that decodes a list of indices back to their respective tokens; the method decode() below accomplishes this. Notice that this method also takes a default_token as an input parameter in case of an unknown index. When decoding the output of a neural network, this case should never happen, as the network will never return an unknown index. Still, users can call this method manually with arbitrary indices, and we should handle this case gracefully without throwing an error.

In [24]:
def decode(indices: list[int], default_token="<???>"):
    return [ index2token_news[i] if i in index2token_news else default_token for i in indices ]

The code cell below shows the basic use of this method; here, all indices are mapped to existing tokens.

In [25]:
decode([6, 17, 18, 25, 3])
Out[25]:
['the', 'mayor', 'was', 'elected', '<EOS>']

Of course, if the list of indices contains an index that is not known, it maps to the specified default_token.

In [26]:
decode([6, 99, 18, 25, 3])
Out[26]:
['the', '<???>', 'was', 'elected', '<EOS>']

Vectorize Corpus¶

Finally, we can now vectorize our text documents (i.e., our news article sentences) by simply applying the encode() method to each sentence in our dataset.

In [27]:
input_vectors_news = [ encode(preprocess(text)) for text in inputs_news ]

input_vectors_news
Out[27]:
[[6, 17, 18, 25, 9, 11, 7, 26, 6, 12, 7, 8],
 [13, 17, 14, 15, 9, 6, 12, 7, 16, 19, 10, 8],
 [6, 15, 9, 11, 7, 18, 19, 10, 6, 20, 8],
 [11, 7, 14, 21, 27, 12, 7, 14, 21, 8],
 [6, 15, 28, 29, 22, 23, 16, 6, 10, 8],
 [13, 10, 9, 6, 22, 16, 13, 10, 9, 30, 23, 8],
 [24, 20, 1, 24, 9, 1, 7, 8]]

Important: While each sentence is now a list of indices (i.e., integer values) and thus, strictly speaking, a vector, the indices are still just labels and carry no semantic meaning. As such, it would not make sense, for example, to compute the vector similarity between those vectors (even when assuming both vectors have the same length). However, this is the default representation for text input. Word semantics are added using a word embedding layer, covered in a later notebook.
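As a brief sketch of how such variable-length index vectors are typically prepared for batching (the padded list below is only illustrative and not used later), shorter vectors can be right-padded with the <PAD> index:

# Pad all vectors to the length of the longest one using the <PAD> index (0)
pad_index = token2index_news[TOKEN_PAD]
max_len = max(len(vector) for vector in input_vectors_news)

input_vectors_news_padded = [
    vector + [pad_index] * (max_len - len(vector)) for vector in input_vectors_news
]

print(input_vectors_news_padded[-1])  # last sentence, padded from 8 to 12 indices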

Practical Application¶

Class Implementation¶

These basic steps of (a) creating the vocabulary, (b) creating the mappings between tokens and indices, and (c) handling any additional considerations (e.g., the default index) are very common and almost always exactly the same. It is therefore very useful to consolidate all these steps into their own class for easy re-use. We did so by implementing a class Vocabulary, which you can find in the file src/text/vectorizing/vocab.py. At its core, this class implements all the methods we have seen so far. So let's use our example dataset of news article sentences to see how we can use this class.

In [28]:
vocabulary_news = Vocabulary(tokens_news, SPECIAL_TOKENS)

vocabulary_news.set_default_index(vocabulary_news[TOKEN_UNK])

As this class implements the encode() method we have seen before, we can use it to encode a list of input tokens.

In [29]:
print(vocabulary_news.encode(['the', 'president', 'was', 'elected']))
[ 6  1 18 25]

Conversely, we can use the class method decode() to map a list of indices back to their respective tokens.

In [30]:
print(vocabulary_news.decode([6, 17, 18, 25, 3]))
['the', 'mayor', 'was', 'elected', '<EOS>']

If you think about it, our initial set of string class labels is also just a vocabulary; for our news dataset, it contains only the two tokens "sports" and "politics". As such, we can use the Vocabulary class not only for the tokens but also for the class labels. With this, we can directly benefit from the encode() and decode() methods provided by the class without requiring additional code for handling the class labels:

In [31]:
vocabulary_news_targets = Vocabulary(labels_news)

print(vocabulary_news_targets.encode(["sports", "sports", "politics"]))

print(vocabulary_news_targets.decode([1, 0, 1]))
[0 0 1]
['politics', 'sports', 'politics']

Save Vectorized Dataset & Vocabularies¶

In practice, we often deal with very large datasets. This means that creating the vocabulary and vectorizing the corpus can take a significant amount of time — note that this also includes any potentially time-consuming preprocessing. It is therefore common to treat this as an individual step and save the vectorized dataset to be used for training later.

Save Inputs & Targets¶

In the code cell below, we loop through all sentences in our toy corpus, vectorize each sentence, and save the resulting sequence together with the class label directly to a file as a new line. This has the advantage that there is no need to keep the whole vectorized dataset in memory.

Side note: In the code cells below, we use a naming scheme that reflects the number of tokens in the vocabulary (excluding the special tokens). Such naming schemes can be useful when the same raw input data gets converted into different datasets using different preprocessing steps or vocabulary settings.

In [32]:
output_file = open(f"{output_folder}toy-news-dataset-vectors-{TOP_TOKENS_NEWS}.txt", "w")

for idx, text in enumerate(inputs_news):
    # Get label
    label = vocabulary_news_targets.encode([targets_news[idx]])[0]
    # Get sentence and vectorize it using the vocabulary
    vector = vocabulary_news.encode(preprocess(text))
    # Write sequence and label to file (separate sequence and label using a tab)
    output_file.write(f"{' '.join([str(idx) for idx in vector])}\t{label}\n")
        
output_file.flush()
output_file.close()
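For later use, the saved file can be read back line by line; the sketch below is not part of the original workflow and simply reverses the tab-separated format written above:

# Sketch: read the vectorized dataset back from the tab-separated file written above
with open(f"{output_folder}toy-news-dataset-vectors-{TOP_TOKENS_NEWS}.txt") as in_file:
    for line in in_file:
        vector_str, label_str = line.rstrip("\n").split("\t")
        vector = [int(idx) for idx in vector_str.split()]
        label = int(label_str)
        print(vector, label)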
Save Vocabularies¶

We also need to save the two vocabularies to preserve the mappings between the tokens/labels and their indices. Without them, we would only have a dataset of integer sequences without knowing which tokens or class labels those integers represent. We could still train a model — after all, this is why we vectorize the dataset to begin with — however, we could then not predict the class labels for new sentences, since we would not know how to vectorize those new sentences. For this, we can simply use the pickle library for Python. This library is used for serializing and deserializing Python objects, meaning it converts Python objects into a byte stream (serialization) and reconstructs them back into their original form (deserialization). This is useful for saving objects to a file, sending data over a network, or storing complex data structures for later use. The pickle module supports a wide range of Python data types, including lists, dictionaries, and even custom objects.

The dump() method in the pickle library takes two main arguments: the object to be serialized and the file where the serialized data should be stored. The syntax is pickle.dump(obj, file, protocol=None), where protocol specifies the serialization format (defaulting to the latest protocol available).

In [33]:
with open(f"{output_folder}toy-news-dataset-{TOP_TOKENS_NEWS}.vocab", 'wb') as out_file:
    pickle.dump(vocabulary_news, out_file)

with open(f"{output_folder}/toy-news-dataset-targets-{TOP_TOKENS_NEWS}.vocab", 'wb') as out_file:
    pickle.dump(vocabulary_news_targets, out_file)

Of course, once saved, we can load both vocabularies using the corresponding load() method. The syntax is pickle.load(file), where file is the binary file containing the serialized data. This method reads the stored byte stream and converts it back into the original Python object.

In [34]:
with open(f"{output_folder}toy-news-dataset-{TOP_TOKENS_NEWS}.vocab", "rb") as in_file:
    vocabulary_news = pickle.load(in_file)

with open(f"{output_folder}/toy-news-dataset-targets-{TOP_TOKENS_NEWS}.vocab", "rb") as in_file:
    vocabulary_news_targets = pickle.load(in_file)

Real-World Dataset¶

The Sentence Polarity Dataset is a popular benchmark for binary text classification tasks, particularly for sentiment analysis. It consists of a total of 10,662 positive and negative sentence samples extracted from movie reviews. The dataset contains two classes: one for positive sentiment and another for negative sentiment, with no neutral examples. Each sentence is labeled according to its sentiment polarity — either positive or negative. This dataset is commonly used for training and evaluating machine learning models to understand and classify the sentiment expressed in short text snippets. It serves as a foundation for various NLP applications, including sentiment analysis tools, recommendation systems, and opinion mining. The binary classification task aims to correctly predict whether a given sentence conveys a positive or negative sentiment based solely on its textual content.

The sentences are split across two files according to their sentiment. As such, the sentiment labels can be derived from the file names, more specifically, from their extensions .pos and .neg. We have downloaded both files at the beginning of the notebook and have both file names stored in the variables sentences_pos and sentences_neg.

Read Files & Compute Word Frequencies¶

The first step is again to go through the whole corpus and count the number of occurrences of each token. For really large corpora, this can actually take quite some time. 10k sentences are basically nothing these days, but the purpose of this notebook is not to focus on large-scale data, as the steps would be exactly the same.

In [35]:
token_counter_polarity = Counter()

targets_polarity = []

with tqdm(total=10662) as pbar:
    # Loop over all file names
    for file_name in [sentences_pos, sentences_neg]:
        # Get sentiment label from file name extensions
        label = file_name.split(".")[-1]
        # Loop over each sentence (1 sentence per line)
        with open(file_name) as file:
            for line in file:
                # Update token counts
                for token in preprocess(line):
                    token_counter_polarity[token] += 1            
                # Add label to targets list
                targets_polarity.append(label)
                # Update progress bar
                pbar.update(1)

# Identify set of unique class labels
labels_polarity = set(targets_polarity)
100%|████████████████████████████████████| 10662/10662 [00:53<00:00, 199.75it/s]

Prepare Class Labels¶

When preprocessing the sentences, we also create a list containing the class label of each sentence. We can use this list of labels as input for the Vocabulary class to get our mapping from class labels to class indices, and vice versa.

In [36]:
vocabulary_polarity_targets = Vocabulary(labels_polarity)

print(vocabulary_polarity_targets.token2index)
print(vocabulary_polarity_targets.index2token)
{'neg': 0, 'pos': 1}
{0: 'neg', 1: 'pos'}

Create Vocabulary¶

To create our vocabulary object, we perform exactly the same steps as above. The only difference is that our "full" vocabulary is now larger (although, with less than 20k tokens, still rather small). We therefore limit the vocabulary here to the 10,000 most frequent tokens. We also combine the steps of sorting and filtering the tokens in the same code cell; we already saw what the individual steps do for our simple news article dataset.

In [37]:
# Sort by token frequency
token_counter_polarity_sorted = sorted(token_counter_polarity.items(), key=lambda x: x[1], reverse=True)

# Limit number of tokens to the top-10000 most frequent tokens
TOP_TOKENS_POLARITY = 10000
token_counter_polarity_sorted_filtered = token_counter_polarity_sorted[:TOP_TOKENS_POLARITY]

# Extract final list of tokens
tokens_polarity = [ tup[0] for tup in token_counter_polarity_sorted_filtered ]

With tokens_polarity containing all the tokens we want to capture in our vocabulary, we create a Vocabulary instance using this list. To keep it simple and consistent, we also include the same list of special tokens as before. We also should not forget to set the default index so that unknown tokens are handled appropriately when encoding an input text.

In [38]:
vocabulary_polarity = Vocabulary(tokens_polarity, special_tokens=SPECIAL_TOKENS)

vocabulary_polarity.set_default_index(vocabulary_polarity[TOKEN_UNK])

For illustration, we can use this vocabulary to encode an example sentence to its corresponding list of token indices.

In [39]:
print(vocabulary_polarity.encode(["the", "movie", "was", "not", "that", "good", ",", "but", "i", "left", "the", "cinema", "entertained", "."]))
[   8   26  106   34   19   62    9   21   50  490    8  257 2554    6]

Save Dataset & Vocabularies¶

Lastly, for later use, we can again save all the data to files.

Vectorize and Save Dataset¶

Like before, we use the vocabulary to vectorize all sentences — i.e., convert each sentence to its corresponding list of token indices — and save all vectorized sentences, together with the transformed class labels, to a file.

In [40]:
output_file = open(f"{output_folder}polarity-dataset-vectors-{TOP_TOKENS_POLARITY}.txt", "w")

with tqdm(total=10662) as pbar:
    for file_name in [sentences_pos, sentences_neg]:
        # Get class label from file name    
        label_name = file_name.split(".")[-1]
        # Iterate over all sentences, vectorize and save them
        with open(file_name) as file:
            for line in file:
                label = vocabulary_polarity_targets.encode([label_name])[0]
                vector = vocabulary_polarity.encode(preprocess(line))
                output_file.write(f"{' '.join([str(idx) for idx in vector])}\t{label}\n")
                pbar.update(1)
            
output_file.flush()
output_file.close()            
100%|████████████████████████████████████| 10662/10662 [00:50<00:00, 213.08it/s]

Save Vocabularies¶

We need to remember both the mappings between the tokens and their indices and between the class labels and their indices. The easiest way to do this is once again to simply save both Vocabulary instances using the pickle library.

In [41]:
with open(f"{output_folder}polarity-dataset-{TOP_TOKENS_POLARITY}.vocab", "wb") as out_file:
    pickle.dump(vocabulary_polarity, out_file)

with open(f"{output_folder}polarity-dataset-targets-{TOP_TOKENS_POLARITY}.vocab", 'wb') as out_file:
    pickle.dump(vocabulary_polarity_targets, out_file)

Summary¶

In machine learning, particularly in natural language processing (NLP), converting text into sequences of token IDs based on a predefined vocabulary is essential for training and deploying models. Since machine learning algorithms work with numerical data, raw text must be transformed into a numerical representation that models can process effectively. Tokenization, followed by mapping tokens to unique numerical IDs, enables this conversion while preserving the structure and meaning of the text.

A vocabulary serves as a reference that assigns a unique integer ID to each token (word, subword, or character) found in the dataset. This process helps standardize input data and ensures consistency in how text is represented across different models and tasks. Using token IDs instead of raw words reduces memory consumption and computational complexity, allowing models to handle large-scale text data more efficiently. Additionally, assigning token IDs enables the use of embedding layers, where words with similar meanings can be mapped to nearby points in a continuous vector space, improving model performance.

The importance of converting text into token sequences extends to various NLP applications, such as text classification, machine translation, and sentiment analysis. Pretrained language models like BERT and GPT use subword tokenization techniques (e.g., WordPiece or Byte Pair Encoding) to handle rare and out-of-vocabulary words, ensuring that even unknown words are decomposed into meaningful subunits. This enhances model generalization and allows it to process diverse linguistic patterns more effectively.

Moreover, tokenization and vocabulary-based encoding play a critical role in sequence-based models like recurrent neural networks (RNNs) and transformers. These models rely on structured numerical input to learn contextual relationships between words. The use of token IDs also enables batching, padding, and attention mechanisms, which are crucial for efficient training and inference. Without proper text-to-token conversion, NLP models would struggle to learn meaningful representations, leading to suboptimal performance.

In summary, converting text into token sequences using a vocabulary is a foundational step in NLP and machine learning. It bridges the gap between human language and numerical computation, facilitating effective model training and deployment. By standardizing text representation, improving efficiency, and enabling better generalization, this process ensures that machine learning models can process and understand language in a structured and meaningful way.

In [ ]: