Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.

Text Classification with Recurrent Neural Networks (RNNs)¶

Text classification is a fundamental task in natural language processing (NLP) that involves assigning predefined categories to text data. One effective approach for text classification is using recurrent neural networks (RNNs), which are well-suited for sequential data such as sentences and documents. In this context, we consider the task of classifying news article headlines into categories such as business, education, entertainment, sports, and technology. Since headlines are typically short and lack extensive context, a powerful model is required to capture the semantic relationships between words.

To handle this challenge, we use a specialized type of RNN called a Long Short-Term Memory (LSTM) network. LSTMs address the limitations of traditional RNNs by incorporating memory cells that can retain information over longer sequences, mitigating issues like vanishing gradients. This makes LSTMs particularly effective in understanding the meaning of short but contextually rich text, such as news headlines. By processing each word sequentially and learning the dependencies between them, an LSTM can generate meaningful representations of headlines and use them to accurately predict their categories.

Apart from using LSTM as the RNN variant of choice, we otherwise keep the network model purposefully very simple — this means:

  • No multilayer RNN architecture
  • No bi-directional RNN architecture
  • No attention mechanism

Adding such improvements is relatively straightforward but would add considerable overhead to the otherwise rather simple implementation; here, we want to focus on understanding the core steps when using RNNs for text classification.

Setting up the Notebook¶

Make Required Imports¶

This notebook requires the import of different Python packages but also additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been successfully imported.

In [1]:
from src.text.vectorizing.vocab import Vocabulary
from src.utils.libimports.rnntextclass import *
from src.utils.plotting.nn import *
from src.utils.compute.gpu import *
from src.utils.data.files import *

Download Required Data¶

Some code examples in this notebook use data that first needs to be downloaded by running the code cell below. If this code cell throws any error, please check in the configuration file config.yaml whether the URL for downloading datasets is up to date and matches the one on Github. If not, simply download or pull the latest version from Github.

In [2]:
news, _ = download_dataset("text/classification/news-classification/news-classification-dataset.csv")
File 'data/datasets/text/classification/news-classification/news-classification-dataset.csv' already exists (use 'overwrite=True' to overwrite it).

Checking & Setting Computing Device¶

PyTorch allows training neural networks on a supported GPU to significantly speed up the training process. If you have a supported GPU, feel free to utilize it. However, for this notebook it's certainly not needed, as our dataset is small and our network model is very simple. We provide an auxiliary method to automatically select the best device. It checks if a supported GPU is available and, if so, uses it as the preferred device.

In [3]:
# Select preferred device (GPU, if available; CPU otherwise); you can enforce the use of the CPU
device = select_device(force_cpu=False)

print("Available device: {}".format(device))
Available device: cuda:0

Data Preparation¶

Loading the Dataset¶

In this notebook, we consider a publicly available dataset containing 10,000 news articles from "The Indian Express" online news site. Each news article has a headline, a brief summary or description, and a category (business, education, entertainment, sports, or technology). The text classification model we will train in this notebook is to predict the category of a news article based solely on its headline. Let's first load the dataset from the file into a Pandas DataFrame for further processing. Since we only consider the headline and the category of each article, we drop the column containing the description.

In [4]:
# Load file into Pandas DataFrame
df_news = pd.read_csv(news)
# Drop column "description"
df_news = df_news.drop("description", axis=1)
# Print the first 5 rows of the DataFrame
df_news.head()
Out[4]:
headline category
0 Nirmala Sitharaman to equal Morarji Desai’s re... business
1 ‘Will densify network, want to be at least no.... business
2 Air India group to induct an aircraft every si... business
3 Red Sea woes: Exporters seek increased credit ... business
4 Air India group to induct a plane every 6 days... business

Encoding the News Categories¶

When training a machine learning classifier using PyTorch, class labels must be converted to integers ranging from $0$ to $(\#\text{classes} - 1)$ because PyTorch's loss functions and model structures expect labels in this format. For example, nn.NLLLoss, one of the most commonly used loss functions for classification tasks, requires class labels to be non-negative integers starting from $0$. If labels are not in the expected format, the function may produce indexing errors or unexpected behavior.

To map the 5 string category labels "business", "education", "entertainment", "sports", and "technology" to integers, we can create two simple dictionaries. In the code cell below, the first line creates a dictionary class2category that maps integer indices to unique category names from the "category" column of the DataFrame df_news. It first retrieves all unique category values using df_news["category"].unique(), which returns a list or NumPy array of distinct category names. Then, the enumerate function assigns a unique integer index to each category, starting from $0$. Finally, a dictionary comprehension constructs a dictionary where the keys are these integer indices and the values are the corresponding category names. Once we have the dictionary class2category, we can easily "mirror" it to create category2class by swapping the keys and the values of the class2category.

In [5]:
class2category = { idx:cat for idx, cat in enumerate(df_news["category"].unique()) }
category2class = { v:k for k,v in class2category.items() }

print(class2category)
print(category2class)
{0: 'business', 1: 'education', 2: 'entertainment', 3: 'sports', 4: 'technology'}
{'business': 0, 'education': 1, 'entertainment': 2, 'sports': 3, 'technology': 4}

We can now use the dictionary category2class to map all categories from their original string values to their corresponding integer class labels using map(). The map() method in Pandas is used to apply a function or mapping dictionary to each element of a Pandas Series. It is commonly used for transforming data, such as replacing values based on a dictionary or applying a custom function element-wise.

In [6]:
# Create a copy of the original DataFrame to avoid errors if the code cell is executed multiple times
df_news_mapped = df_news.copy()
# For all categories, map string values to their corresponding integer values
df_news_mapped["category"] = df_news["category"].map(category2class)
# Print the first 5 rows of the DataFrame
df_news_mapped.head()
Out[6]:
headline category
0 Nirmala Sitharaman to equal Morarji Desai’s re... 0
1 ‘Will densify network, want to be at least no.... 0
2 Air India group to induct an aircraft every si... 0
3 Red Sea woes: Exporters seek increased credit ... 0
4 Air India group to induct a plane every 6 days... 0

Lastly, we can extract all category values to get the list of ground truth values later for training and evaluation.

In [7]:
categories = df_news_mapped["category"].tolist()

Encoding the News Headlines¶

When training text classification models using neural networks, we need to convert tokens or words into indices because neural networks operate on numerical data, not raw text. Words are inherently categorical and cannot be directly processed by models like convolutional neural networks (CNNs) or recurrent neural networks (RNNs). By mapping each word to a unique integer index, we create a structured representation that can be used as input to embedding layers or other numerical processing methods.

Creating the Vocabulary¶

We first need to create our vocabulary, i.e., the unique set of tokens or words, derived from our dataset. We can do this by iterating over all news article headlines, preprocessing them, and keeping track of all unique tokens we encounter. We use spaCy to handle the preprocessing: Each headline is tokenized, and each token is lemmatized and converted to lowercase characters. In the code cell below, we use a Counter object to not only keep track of the set of unique tokens but also of their number of occurrences. We use this information later to limit the vocabulary to the most common tokens. In the same loop, we also save all tokenized headlines in the list token_lists.

In [8]:
# Create counter (a specialized dictionary)
token_counter = Counter()
# Create list that will hold all tokenized headlines
token_lists = []

for text in tqdm(df_news_mapped["headline"].tolist()):
    # Use spacy to tokenize + lemmatize + lowercase each headline
    tokens = [ token.lemma_.lower() for token in nlp(text) ]
    # Add tokenized headline to final list of token lists
    token_lists.append(tokens)
    # Update the token counter
    for token in tokens:
        token_counter[token] += 1

print(f"Number of unique tokens: {len(token_counter)}")
100%|████████████████████████████████████| 10000/10000 [00:47<00:00, 211.16it/s]
Number of unique tokens: 14571

Note: The choice of the applied preprocessing steps generally depends on the task. This may mean that lemmatizing and/or lowercasing all words might not be a good idea. In contrast, it might help to remove stopwords — which is not done in the code cell above! You are encouraged to try different preprocessing steps to see how their application affects the final results of the classifier.
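
For instance, a minimal variant of the preprocessing line above that also removes stopwords and punctuation could look as follows (a sketch using spaCy's built-in token attributes; not used in the rest of this notebook):

# Sketch: tokenize + lemmatize + lowercase, but skip stopwords and punctuation
tokens = [ token.lemma_.lower() for token in nlp(text)
           if not token.is_stop and not token.is_punct ]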

In text classification, commonly only the most frequent tokens are considered because they carry the most relevant information while reducing noise and computational complexity. Natural language contains many rare words, typos, or domain-specific terms that may not significantly contribute to classification performance. By focusing on the most frequent tokens, we ensure that the model learns from words that appear often enough to establish meaningful patterns rather than overfitting to rare occurrences. Moreover, limiting the vocabulary size helps improve computational efficiency. Neural networks, especially those using embeddings, require a fixed vocabulary size to manage memory and speed up training. Processing a large vocabulary with many rare words increases the dimensionality of embeddings and model complexity, leading to slower training times and higher risk of overfitting. By filtering out infrequent tokens, the model remains efficient while still capturing the core linguistic features necessary for classification.

Although our dataset is very small, we still limit the final vocabulary to the most frequent tokens across all headlines. By default, we consider only the 2,000 most frequent tokens. But again, feel free to change this value to see if there are any noticeable differences in quality of the trained classification model.

In [9]:
TOP_TOKENS = 2000
# Sort tokens by their frequency in descending order
token_counter_sorted = sorted(token_counter.items(), key=lambda x: x[1], reverse=True)
# Get the TOP_TOKENS most frequent token
token_counter_sorted_filtered = token_counter_sorted[:TOP_TOKENS]

Now we no longer need each token's number of occurrences, so we can just extract the list of tokens.

In [10]:
tokens = [ tup[0] for tup in token_counter_sorted_filtered ]

When using Recurrent Neural Networks (RNNs) for text classification or sequence modeling, special tokens like padding (<PAD>) and unknown words (<UNK>) are essential to ensure consistency and robustness in training. The <PAD> token is used to make all input sequences the same length by adding extra tokens to shorter sequences. Since RNNs process sequences of varying lengths, padding ensures that batches of data can be efficiently processed in parallel, reducing computational complexity. Without padding, every sequence would need to be processed individually, making training slow and inefficient.

The <UNK> token is used to handle words that are not present in the model's vocabulary, either because they are rare, misspelled, or simply unseen during training. Since neural networks require numerical representations for words, an out-of-vocabulary word would otherwise disrupt the model’s ability to process input sequences. By assigning a special <UNK> token, the model can still handle such words in a standardized way, preventing errors and allowing it to generalize better even when encountering new terms during inference.

In [11]:
TOKEN_PAD, TOKEN_UNK = "<PAD>", "<UNK>"

SPECIAL_TOKENS = [TOKEN_PAD, TOKEN_UNK]

We can now create our Vocabulary object. Under the hood, it essentially just implements two dictionaries that map tokens to unique integer indices, and vice versa — very similar to what we have done "manually" for the article categories (see above). The Vocabulary class also provides some useful methods to convert whole strings of tokens into their corresponding token indices for convenience. The method set_default_index() tells the vocabulary to convert any unknown word (i.e., a word that is not in the vocabulary) into the token index reflecting the special token <UNK>.

In [12]:
vocabulary = Vocabulary(tokens, SPECIAL_TOKENS)

vocabulary.set_default_index(vocabulary[TOKEN_UNK])

Note: Since the special token <PAD> will be the first token to be added to the vocabulary, it will get the token index $0$. While there is nothing unique about this value, it has almost become the best practice to use $0$ to mark padding, which also makes some later implementation steps a bit more convenient.

In principle, we could use the vocabulary to convert all lists of tokens to their corresponding lists of token indices. However, in this notebook we follow the other common alternative of doing this conversion on the fly when training our RNN classification model.
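
As a quick illustration (using a hypothetical token list; encode() is the conversion method of the Vocabulary class we will also use later in our Dataset class), an out-of-vocabulary token simply falls back to the index of <UNK>:

# Hypothetical example tokens; "somerareword" is not in the vocabulary
example_tokens = ["india", "market", "somerareword"]
print(vocabulary.encode(example_tokens))
# The last index equals vocabulary[TOKEN_UNK] thanks to set_default_index()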

Splitting Dataset into Training and Test Data¶

To later better observe how the training progresses, we not only want to observe the change in the overall loss over time but also how the model's prediction quality changes for the training data as well as some test data. We therefore need to split our initial dataset into two parts. The train_test_split() function from the scikit-learn library is used to split a dataset into training and testing subsets. It allows machine learning models to be trained on one portion of the data and evaluated on another to assess their generalization performance. The function takes input features (here: token_lists) and target labels (here: categories), along with parameters such as the test size (fraction of data reserved for testing) and a random seed (random_state) to ensure reproducibility; see the code cell below.

In [13]:
X_train, X_test, y_train, y_test = train_test_split(token_lists, categories, test_size=0.25, random_state=42)

Dataset and DataLoader Instances¶

The Dataset and DataLoader classes in the torch library are essential for efficiently handling and processing data when training deep learning models with PyTorch. The Dataset class provides a structured way to load and preprocess data by defining how data samples and their corresponding labels should be accessed. By subclassing torch.utils.data.Dataset, users can create custom datasets that read data from files, apply transformations, and return samples in a standardized format. This modular approach makes it easier to manage large datasets and ensures consistency in data handling.

The DataLoader class, on the other hand, is responsible for efficiently loading data in batches, shuffling it for randomness, and handling multiprocessing for faster data retrieval. Instead of manually iterating over the dataset, DataLoader automates the process by fetching mini-batches, which helps optimize GPU utilization and training efficiency. It also supports features like data augmentation, parallel data loading, and padding for variable-length sequences, making it a powerful tool for deep learning workflows in PyTorch.

Creating a Dataset Subclass & Instance¶

To implement a subclass of the Dataset class, we need to override two essential methods: __len__() and __getitem__(). The __len__() method should return the total number of samples in the dataset, allowing PyTorch to determine the size of the dataset when iterating over it. The __getitem__() method should define how to access a single data sample, given an index. This is where we typically load the data, apply any necessary transformations, and return the sample along with its corresponding label. While we could include additional helper methods for data augmentation, normalization, etc., this is not required for our simple text classification use case here — recall that we already preprocessed our headlines when we created our vocabulary.

The class SequenceClassificationDataset in the code cell below implements a subclass of Dataset. The only minor addition worth pointing out is that the method __getitem__() returns 3 values for a single data sample: the list of token indices x, the class label y, as well as len(x) being the length of the token list x. We will see in a bit why returning the length of the token list here is very convenient.

In [14]:
class SequenceClassificationDataset(Dataset):

    def __init__(self, inputs, targets, vocabulary):
        self.inputs = inputs
        self.targets = targets
        self.vocabulary = vocabulary

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        x = np.asarray(self.vocabulary.encode(self.inputs[index]))
        y = self.targets[index]
        return x, y, len(x)

We can now create 2 instances of our SequenceClassificationDataset class, one for the training data and one for the test data.

In [15]:
dataset_train = SequenceClassificationDataset(X_train, y_train, vocabulary)
dataset_test  = SequenceClassificationDataset(X_test,  y_test,  vocabulary)

Creating DataLoader Instances for Training & Test Data¶

Our dataset class returns the information about a single data sample. If we were to train our RNN classification model later with a batch size of 1, we would be ready to go. However, in practice, we generally prefer training models using batches of larger sizes. Increasing the batch size can make training neural networks more efficient by improving computational efficiency and leveraging hardware acceleration. Larger batch sizes allow for better utilization of parallelism on GPUs or TPUs, as they can process more data points simultaneously, leading to faster computation per iteration. This reduces the time spent per epoch, speeding up the overall training process, especially for large datasets.

However, simply putting multiple headlines into the same batch will generally cause errors since all sequences within a batch must have the same length. When working with batches containing sequences of different lengths in PyTorch, the collate_fn parameter of the DataLoader class is used to customize how individual samples are combined into a batch. By default, the DataLoader stacks individual samples in a batch along the first dimension, but when sequences have varying lengths (such as in NLP tasks), this could cause issues. For example, a sequence of length 5 cannot be stacked with a sequence of length 10 unless they are padded to a consistent length. The collate_fn allows us to define a custom function to handle such cases, such as padding sequences to the same length before batching them.

The collate_fn function receives a list of individual samples from the dataset and is responsible for transforming them into a single batch. For tasks involving sequences of varying lengths, collate_fn typically performs padding to ensure all sequences in a batch have the same length. This idea of padding a batch is implemented by the custom method collate_fn_pad() below. The only additional step this method performs is to also sort all sequences in the batch with respect to their length (without the padding) in descending order. This will have some added benefit later during the actual training.

In [16]:
def collate_fn_pad(batch):
    # Extract sequences, class labels, and sequence lengths from batch
    X, y, lengths = list(map(list, zip(*batch)))
    # Convert all sequences from lists to tensors
    X = [ torch.LongTensor(x) for x in X ]
    # Convert list of class labels and list of lengths to tensors
    y, lengths = torch.LongTensor(y), torch.LongTensor(lengths)
    # Sort sequences w.r.t. their lengths from longest to shortest
    lengths_sorted, sorted_idx = lengths.sort(descending=True)
    # Pad the input sequences (padding value 0 is appended at the end)
    X_padded = pad_sequence(X, batch_first=True, padding_value=0)
    # Return sorted sequences, class labels, and lengths
    return X_padded[sorted_idx], y[sorted_idx], lengths_sorted
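
To see what collate_fn_pad produces, here is a quick check on a hypothetical toy batch of two samples in the (x, y, len(x)) format returned by our Dataset class:

toy_batch = [
    (np.asarray([5, 9, 2]), 0, 3),         # sequence of length 3
    (np.asarray([7, 1, 4, 8, 3]), 2, 5),   # sequence of length 5
]
X_padded, y, lengths = collate_fn_pad(toy_batch)
print(X_padded)   # longest sequence first; the shorter one is padded with 0 (<PAD>)
print(y)          # tensor([2, 0]): labels reordered to match the sorted sequences
print(lengths)    # tensor([5, 3])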

With our custom collate_fn_pad method we can now define the 2 DataLoader instances for the training and test Dataset instances. Note that this also requires us to specify the batch size. Here we go with 128, but you can change the batch size to see how it affects the training — however, note that changing the batch size might also require you to modify the learning rate (see below).

Also note the parameter shuffle=True. Shuffling a training dataset after each epoch helps improve the generalization ability of the model by preventing it from learning spurious patterns that might arise from the order of the data. If the data is not shuffled, the model may become overly sensitive to specific sequences or correlations in the data, especially in cases where the dataset is ordered in some way (e.g., by class or time). Shuffling ensures that the model sees diverse combinations of data in each epoch, making it less likely to overfit to specific patterns tied to the order of the data. Moreover, shuffling helps the optimization process by providing a more representative sample of the entire dataset in each mini-batch, which leads to more robust and stable gradient updates. This randomization reduces the likelihood of the model getting stuck in local minima, as the stochastic nature of gradient descent is preserved. Overall, shuffling promotes better convergence and more reliable training outcomes by ensuring the model is exposed to a variety of data samples in a more randomized and unbiased manner.

In [17]:
batch_size = 128

loader_train = DataLoader(dataset_train, collate_fn=collate_fn_pad, batch_size=batch_size, shuffle=True)
loader_test  = DataLoader(dataset_test , collate_fn=collate_fn_pad, batch_size=batch_size)

Of course, we do not need to shuffle the data loader for the test data. We could, and it would not change the results, but let's save this additional overhead. With the 2 data loaders for the training and test data in place, we can now actually use them to train our RNN-based text classification model.


Model Definition & Training¶

Model Architecture¶

In the following, we consider a very basic RNN-based model architecture for text classification. This means we consider only a single, uni-directional RNN layer. The only extension compared to a vanilla RNN layer is that we are using the more advanced variant of a Long Short-Term Memory (LSTM) layer. However, since such an LSTM layer is provided by the PyTorch library, this change hardly affects the overall code of our model class. As such, our model architecture consists of 4 layers:

  • self.embedding: The embedding layer expects as input sequences of word/token indices and transforms them into the respective vector representations; the definition of the embedding layer is parameterized by the size of the vocabulary (i.e., number of unique tokens) and the embedding size (i.e., the size of the resulting word vectors).
  • self.lstm: The LSTM layer implements the sequential processing of the input sequences. Naturally, the input size of the LSTM layer must match the embedding size of the word vectors. To keep it simple, we fix the size of the hidden state of the LSTM layer to $256$. Lastly, notice the parameter batch_first=True. Setting batch_first=True when creating an nn.LSTM layer ensures that the input tensor follows the shape (batch_size, seq_len, feature_size) instead of the default (seq_len, batch_size, feature_size). This means that the first dimension represents the batch size, making it more intuitive and easier to work with, especially when integrating with other layers like embeddings and fully connected layers. By default, PyTorch's LSTM expects the sequence length to be the first dimension, which aligns more with certain sequence processing models but can be inconvenient when handling batches of sequences. Using batch_first=True allows for a more natural representation of input data and often simplifies indexing and manipulation of tensors in the model. However, the underlying computations remain the same regardless of this setting (see the short shape check after this list).
  • self.linear: We add 1 additional linear hidden layer between the LSTM layer and the final output layer; its input size of $256$ must naturally match the size of the hidden state, and again, we simply fix the output size of this linear hidden layer to $128$.
  • self.out: This last linear layer maps the input from the linear hidden layer to the final output; the output size of the last layer depends on the number of unique classes. Although we know that we have 5 classes, we set this value via an input parameter.
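
The following standalone snippet (with arbitrary toy dimensions) illustrates the tensor shapes involved when batch_first=True; note that the returned hidden state h is not affected by this setting:

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=100, hidden_size=256, batch_first=True)
x = torch.randn(32, 15, 100)    # (batch_size, seq_len, feature_size)
output, (h, c) = lstm(x)
print(output.shape)             # torch.Size([32, 15, 256]): hidden states for all time steps
print(h.shape)                  # torch.Size([1, 32, 256]): last hidden state (not batch-first)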
In [18]:
class RnnTextClassifier(nn.Module):

    def __init__(self, vocab_size, embed_size, output_size):
        super().__init__()
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # LSTM layer
        self.lstm = nn.LSTM(embed_size, 256, batch_first=True)
        # 1 linear hidden layer + output layer
        self.linear = nn.Linear(256, 128)
        self.out = nn.Linear(128, output_size)
    
    def forward(self, inputs, hidden, lengths):
        batch_size, seq_len = inputs.shape
        # Push through embedding layer
        X = self.embedding(inputs)
        X = pack_padded_sequence(X, lengths, batch_first=True)
        # Push through LSTM layer
        _, (h, c) = self.lstm(X, hidden)
        h = h.squeeze(0)
        # Push through linear hidden layers and output layer
        h = F.relu(self.linear(h))
        h = self.out(h)
        # Return log softmax
        return F.log_softmax(h, dim=1)
    
    def init_hidden(self, batch_size, device):
        return (torch.zeros(1, batch_size, 256).to(device), torch.zeros(1, batch_size, 256).to(device))

Apart from the constructor where we define all the layers of our model architecture, we also need to implement two additional methods. Firstly, the method forward() implements the forward pass — it performs all calculations to get the final outputs for an input batch by pushing the batch through all the layers. Several things are worth pointing out here:

  • We use the built-in method pack_padded_sequence to convert our batch of padded sequences into a PackedSequence object. While this is not required, this internal representation of our batch has the advantage that all padding tokens will be skipped during the LSTM computation and thus ignored when calculating the loss and the gradients for training. This avoids the risk that padding tokens may negatively affect the training and the results (see the small demonstration after this list).
  • The LSTM layer self.lstm returns two values: a list of hidden states for all time steps (which we ignore using _), and a tuple containing the last hidden state h and the last cell state c after processing all sequences in the batch. In an LSTM, the hidden state h and cell state c serve different roles. h represents the short-term memory and is directly used for output at each timestep. In contrast, c acts as long-term memory, helping the LSTM retain important information over long sequences. Since h is the direct output of each LSTM step, it is more relevant for downstream tasks like classification, making it the preferred choice for further processing. c is primarily an internal mechanism to prevent vanishing gradients and improve long-term dependencies. However, it is not directly used in output layers because it undergoes additional transformations before influencing h. Since h already incorporates the necessary processed information, passing it through fully connected layers is sufficient for making predictions in tasks like text classification.
  • We use the ReLU function as the nonlinear activation function after the linear hidden layer. Since ReLU itself does not have any learnable parameters, we do not have to define it as its own layer — although the syntax of PyTorch allows us to do so as well.
  • Instead of the softmax function, we use the log_softmax function to get the final output log probabilities. Using log_softmax instead of softmax when training a neural network provides advantages in numerical stability and computational efficiency. Another advantage is that log_softmax works efficiently with the negative log-likelihood loss (nn.NLLLoss; see below), which expects log probabilities as input. When using log_softmax, the loss function can be computed directly without requiring an extra logarithm operation on the softmax output. This reduces computational redundancy and improves training performance, making log_softmax the preferred choice for classification tasks in PyTorch.
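
Below is a small standalone demonstration of the packing step (toy tensors; note that pack_padded_sequence by default expects lengths sorted in descending order, which our collate_fn_pad already guarantees):

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

# Two right-padded sequences with true lengths 3 and 2 (padding index 0)
padded = torch.tensor([[4, 7, 2],
                       [5, 9, 0]])
embedded = nn.Embedding(10, 4)(padded)    # shape: (2, 3, 4)
packed = pack_padded_sequence(embedded, torch.tensor([3, 2]), batch_first=True)
print(packed.batch_sizes)    # tensor([2, 2, 1]): the padded position is never processed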

The last method the RnnTextClassifier implements is init_hidden() which is used to initialize the hidden state before processing each batch. In this implementation, we consider zero initialization. Zero initialization of the hidden state in an RNN means setting the initial hidden state (and cell state in LSTMs) to a tensor of all zeros before processing a sequence. This is a common practice because, at the beginning of a sequence, the model has no prior information, and using zeros ensures a neutral starting point. In PyTorch, this is typically done using torch.zeros, ensuring that each new sequence starts with a fresh hidden state unless explicitly carried over. The main advantage of zero initialization is its simplicity and consistency, as it provides a standardized way to start training without introducing unintended biases. Additionally, it prevents the model from favoring any particular direction initially, allowing it to learn patterns purely from the data. However, in some cases, more advanced initialization techniques (like learned initial states) can improve performance, especially for longer sequences where early-stage information matters.
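
For illustration, a learned initial state (one of the more advanced alternatives mentioned above) could be sketched as follows; this is a hypothetical variant and not used anywhere in this notebook:

import torch
import torch.nn as nn

class LearnedInitLSTM(nn.Module):
    """Hypothetical sketch: an LSTM wrapper with learnable initial states."""

    def __init__(self, input_size, hidden_size=256):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        # Learnable initial hidden and cell states instead of fixed zeros
        self.h0 = nn.Parameter(torch.zeros(1, 1, hidden_size))
        self.c0 = nn.Parameter(torch.zeros(1, 1, hidden_size))

    def init_hidden(self, batch_size, device):
        # Expand the learned states to the current batch size
        return (self.h0.expand(1, batch_size, -1).contiguous().to(device),
                self.c0.expand(1, batch_size, -1).contiguous().to(device))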

With this implementation of our model architecture, we can define our RNN-based text classification model that we want to train and evaluate on our simple news headlines dataset. The constructor of RnnTextClassifier expects three inputs:

  • The input size of the embedding layer, which is simply the size of the vocabulary
  • The size of the word embedding vectors (by default, we fix it to $100$, but you can change this value to see its effects)
  • The size of the output layer, which is the number of different class labels; we can get this number as the size of our category2class or class2category dictionary

To prepare the training, we also move the model to the device on which the training is performed (e.g., on a GPU if a supported GPU is available; otherwise the training is done on the CPU).

In [19]:
# Define model
model = RnnTextClassifier(len(vocabulary), 100, len(category2class))
# Move model to device (i.e., GPU if available, otherwise CPU)
model = model.to(device)
# Print the model
print(model)
RnnTextClassifier(
  (embedding): Embedding(2002, 100)
  (lstm): LSTM(100, 256, batch_first=True)
  (linear): Linear(in_features=256, out_features=128, bias=True)
  (out): Linear(in_features=128, out_features=5, bias=True)
)

Model Training¶

Defining Auxiliary Method¶

To better observe the training behavior, we not only want to check the loss but also the training f1 score and the test f1 score after each epoch. It is therefore useful to wrap the training of a single epoch and the evaluation of the current model into auxiliary methods. The code cell below implements the train_epoch() method to perform one iteration of training over the complete training dataset. This method iterates over all batches in the data loader and performs the model training for each batch. This involves the following core steps:

  • Move the batch (i.e., the sequences and ground truth labels) to the correct device.
  • Initialize the hidden state for the LSTM layer.
  • Calculate the log probabilities as the model output for the current batch.
  • Calculate the loss based on the criterion (i.e., the specified loss function); discussed in more detail later.
  • PyTorch magic: Calculate the gradients using backpropagation and update all weights using the specified optimizer; discussed in more detail later.

Across all batches we aggregate the loss and return the final loss for the epoch.

In [20]:
def train_epoch(model, loader, optimizer, criterion):
    # Initialize epoch loss (cumulative loss for all batches)
    epoch_loss = 0.0
    # Use tqdm to get a nice progress bar
    with tqdm(total=len(loader)) as pbar:
        # Iterate over all batches
        for X_batch, y_batch, lengths_batch in loader:
            batch_size, seq_len = X_batch.shape[0], X_batch.shape[1]
            # Move the batch to the correct device
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            # Initialize the first hidden state h0 (and move to device)
            hidden = model.init_hidden(X_batch.shape[0], device)
            # Get model outputs as log probabilities
            log_probs = model(X_batch, hidden, lengths_batch)
            # Calculate loss
            loss = criterion(log_probs, y_batch)
            ### Pytorch magic! ###
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Keep track of overall epoch loss
            epoch_loss += loss.item()
            # Update progress bar
            pbar.update(1)
    # Return total loss across whole dataset
    return epoch_loss

The evaluate() method in the code cell below implements the calculation of the f1 score for the current model and a given data loader. Similarly to train_epoch(), this method iterates over all batches in the data loader and calculates the output for each batch. Of course, instead of any training using backpropagation, evaluate() calculates the predicted classes based on the log probabilities. By keeping track of all ground truth labels and corresponding predicted labels, we can calculate the f1 score using the built-in method f1_score() provided by the scikit-learn library.

In [21]:
def evaluate(model, loader):
    # Define 2 lists holding all true labels and all predicted labels
    y_true, y_pred = [], []
    # Use tqdm to get a nice progress bar
    with tqdm(total=len(loader)) as pbar:
        # Iterate over all batches
        for X_batch, y_batch, lengths_batch in loader:
            batch_size, seq_len = X_batch.shape[0], X_batch.shape[1]
            # Move the batch to the correct device
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            # Initialize the first hidden state h0 (and move to device)
            hidden = model.init_hidden(X_batch.shape[0], device)
            # Get model outputs as log probabilities
            log_probs = model(X_batch, hidden, lengths_batch)
            # Calculate predicted class labels based on largest log probability values
            y_batch_pred = torch.argmax(log_probs, dim=1)
            # Add the true labels and predicted labels for the batch to the final lists
            y_true += list(y_batch.cpu())
            y_pred += list(y_batch_pred.cpu())
            # Update progress bar
            pbar.update(1)
    # Return f1 score
    return f1_score(y_true, y_pred, average='macro')

Defining Criterion & Optimizer¶

For calculating the gradients using backpropagation and then updating all weights (i.e., all learnable parameters), we need to define two more components:

  • The loss function — commonly called criterion in PyTorch: Since our model outputs log probabilities — recall that we used the log_softmax function in our RnnTextClassifier class — we need to use the negative log-likelihood loss nn.NLLLoss, which expects log probabilities as input.
  • The optimizer that performs the update of the weights w.r.t. the calculated gradients. Here, we are using the Adam optimizer. The Adam (Adaptive Moment Estimation) optimizer is an advanced optimization algorithm that combines the benefits of momentum-based optimization (like SGD with momentum) and adaptive learning rate methods (like RMSprop). It maintains two moving averages for each parameter: the first moment (mean of past gradients) and the second moment (uncentered variance of past gradients). These moment estimates help Adam adaptively adjust learning rates for each parameter, leading to faster and more stable convergence. Adam is typically preferred over basic Gradient Descent because it automatically adjusts learning rates based on gradient history, making it well-suited for complex models and noisy datasets. Unlike standard Gradient Descent, which uses a fixed learning rate and can struggle with local minima or saddle points, Adam adapts to different scales of gradients, improving performance in deep learning tasks. This adaptability, combined with efficient computation and minimal memory requirements, makes Adam one of the most commonly used optimizers in modern deep learning frameworks.
In [22]:
# Define loss function
criterion = nn.NLLLoss()
# Define optimizer for the update step
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
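
As a quick sanity check of this pairing (a standalone toy example): applying log_softmax followed by nn.NLLLoss is numerically equivalent to applying nn.CrossEntropyLoss directly to the raw logits.

import torch
import torch.nn.functional as F

logits = torch.randn(4, 5)                  # batch of 4 samples, 5 classes
targets = torch.tensor([0, 3, 1, 4])        # class labels in [0, 4]
loss_nll = F.nll_loss(F.log_softmax(logits, dim=1), targets)
loss_ce = F.cross_entropy(logits, targets)  # CrossEntropyLoss = LogSoftmax + NLLLoss
print(torch.isclose(loss_nll, loss_ce))     # tensor(True)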

Training the Model¶

We are now ready to actually train our model. In the code cell below, we train the model for $10$ epochs. In each iteration we first call train_epoch() for the training across all samples in the data loader of the training data, and then call evaluate() twice, once over the training data and once over the test data to get both the training and test f1 score. We keep track of both f1 scores as well as the loss after each epoch to later nicely plot those results.

Notice also the two calls model.train() and model.eval(). In PyTorch, the train() and eval() methods are used to set a neural network model into training mode and evaluation mode, respectively. These modes affect certain layers like Dropout and Batch Normalization, which behave differently during training and inference. Calling model.train() enables training mode, meaning that Dropout layers will randomly deactivate neurons to prevent overfitting, and Batch Normalization will compute running statistics based on the current mini-batch. This ensures that the model learns effectively by preventing reliance on specific features and normalizing activations across different training batches. On the other hand, calling model.eval() switches the model to evaluation mode, meaning Dropout is turned off (i.e., neurons are always active), and Batch Normalization uses the precomputed running mean and variance instead of batch statistics. This ensures that predictions are stable and consistent during inference. Forgetting to set eval() during validation or testing can lead to incorrect outputs due to the different behaviors of these layers, making these methods essential for proper model performance.

Note: Since our simple model does not use Dropout or Batch Normalization, we strictly speaking do not need to switch between the training and the evaluation mode. However, it is good practice to always explicitly set the correct mode to make the code easier to maintain. For example, if you were to add a Dropout layer to the model, you would not need to change the training loop at all.
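
The effect of the two modes is easy to see with a standalone toy example using a Dropout layer (purely illustrative; not part of our model):

import torch
import torch.nn as nn

drop = nn.Dropout(p=0.5)
x = torch.ones(8)
drop.train()
print(drop(x))   # roughly half the entries zeroed, survivors scaled by 1/(1-p) = 2.0
drop.eval()
print(drop(x))   # identity mapping: all ones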

In [23]:
num_epochs = 10
results = []

for epoch in range(1, num_epochs+1):
    # Set model into training mode
    model.train()
    # Train model for a single epoch
    epoch_loss = train_epoch(model, loader_train, optimizer, criterion)
    # Set model into evaluation mode
    model.eval()
    # Calculate training and test f1 scores
    f1_train = evaluate(model, loader_train)
    f1_test = evaluate(model, loader_test)
    # Append the epoch loss, training f1 score, and test f1 score to final result list (for plotting later)
    results.append((epoch_loss, f1_train, f1_test))
    # Print progress(epoch loss, training f1 score, and test f1 score)
    print(f"[Epoch {epoch}] loss:\t{epoch_loss:.3f}, f1 train: {f1_train:.3f}, f1 test: {f1_test:.3f}")        
100%|███████████████████████████████████████████| 59/59 [00:00<00:00, 65.93it/s]
100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 103.03it/s]
100%|██████████████████████████████████████████| 20/20 [00:00<00:00, 162.50it/s]
[Epoch 1] loss:	76.174, f1 train: 0.660, f1 test: 0.632
100%|███████████████████████████████████████████| 59/59 [00:00<00:00, 99.29it/s]
100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 162.01it/s]
100%|██████████████████████████████████████████| 20/20 [00:00<00:00, 160.89it/s]
[Epoch 2] loss:	41.224, f1 train: 0.828, f1 test: 0.770
100%|███████████████████████████████████████████| 59/59 [00:00<00:00, 97.83it/s]
100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 106.77it/s]
100%|██████████████████████████████████████████| 20/20 [00:00<00:00, 160.66it/s]
[Epoch 3] loss:	26.167, f1 train: 0.898, f1 test: 0.833
100%|███████████████████████████████████████████| 59/59 [00:00<00:00, 98.19it/s]
100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 159.79it/s]
100%|██████████████████████████████████████████| 20/20 [00:00<00:00, 163.27it/s]
[Epoch 4] loss:	17.002, f1 train: 0.935, f1 test: 0.847
100%|███████████████████████████████████████████| 59/59 [00:00<00:00, 97.68it/s]
100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 160.87it/s]
100%|██████████████████████████████████████████| 20/20 [00:00<00:00, 159.23it/s]
[Epoch 5] loss:	12.727, f1 train: 0.958, f1 test: 0.871
100%|███████████████████████████████████████████| 59/59 [00:00<00:00, 98.90it/s]
100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 160.07it/s]
100%|██████████████████████████████████████████| 20/20 [00:00<00:00, 159.33it/s]
[Epoch 6] loss:	8.844, f1 train: 0.969, f1 test: 0.875
100%|███████████████████████████████████████████| 59/59 [00:00<00:00, 99.26it/s]
100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 162.68it/s]
100%|██████████████████████████████████████████| 20/20 [00:00<00:00, 162.41it/s]
[Epoch 7] loss:	6.246, f1 train: 0.979, f1 test: 0.872
100%|███████████████████████████████████████████| 59/59 [00:00<00:00, 99.16it/s]
100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 161.30it/s]
100%|██████████████████████████████████████████| 20/20 [00:00<00:00, 162.76it/s]
[Epoch 8] loss:	4.233, f1 train: 0.983, f1 test: 0.881
100%|███████████████████████████████████████████| 59/59 [00:00<00:00, 98.08it/s]
100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 158.07it/s]
100%|██████████████████████████████████████████| 20/20 [00:00<00:00, 161.03it/s]
[Epoch 9] loss:	2.844, f1 train: 0.994, f1 test: 0.878
100%|███████████████████████████████████████████| 59/59 [00:00<00:00, 98.92it/s]
100%|██████████████████████████████████████████| 59/59 [00:00<00:00, 162.33it/s]
100%|██████████████████████████████████████████| 20/20 [00:00<00:00, 162.83it/s]
[Epoch 10] loss:	2.084, f1 train: 0.997, f1 test: 0.885

The output above prints the current loss as well as the training and test f1 scores after each epoch. Since we also keep track of all three values for each epoch, we can also plot their trends over time. To this end, we provide an auxiliary method to plot the changes in the loss and the f1 scores across all epochs; just execute the code cell below. Note that the loss is normalized to a maximum of $1$ so as to be on the same scale as the f1 scores.

In [24]:
plot_training_results(results, legend=['Loss (normalized)', 'F1 (train)', 'F1 (test)'])
[Plot: normalized loss, training f1 score, and test f1 score across all epochs]

From the results and the plot you can see that our model overall trains very well: both the loss and the training f1 score greatly improve even in just 10 epochs. In fact, we see an almost perfect training f1 score of $1.0$. In some sense, this is not surprising, as this task is arguably a very simple one: predicting the category of a news headline is unlikely to require a deeper understanding of the text. For example, the mere presence of words such as "budget" or "stock" is probably already a strong indicator of a headline belonging to a business article. The test f1 score improves equally well but stays below the training f1 score. Again, this difference is not very surprising. Overall, the dataset is not very large. This means that splitting it into a training and test set is likely to result in a situation where many words in the test set are not in the training set and are therefore not known to the model.


Discussion¶

In this notebook, we went through a complete practical example of training and evaluating an RNN-based text classification model. We loaded and prepared the dataset for the training, and then created and trained a model over this dataset. The focus was on elaborating on and understanding the required steps as well as highlighting potential caveats. The goal was not to achieve any state-of-the-art results. As such, simplifications and restrictions included:

Simple task & small data. One can argue that the task of predicting the news category for a given headline is rather simple, since the category can most likely be reliably predicted by the presence or absence of words in a headline — that is, the actual word order is not that important, which hardly warrants the use of an RNN compared to more traditional text classification models (e.g., Multinomial Naive Bayes). Also, given the size of the dataset and the vocabulary, baseline models such as Naive Bayes are likely to perform equally well. However, the focus here was to go through a simple practical example, to cover the most important steps and challenges when training an RNN for text classification. This includes the choice of a small dataset so that training remains reasonably fast even when using only the CPU (instead of a GPU).

Simple model. In some sense, we implemented and trained the most basic RNN-based model for text classification tasks. Only by using an LSTM layer instead of a vanilla RNN layer have we made our model slightly more sophisticated. Of course, there are many straightforward extensions to the model one can explore to see if they yield better prediction results (a combined sketch follows the list below), including but not limited to:

  • Additional linear hidden layers. Our current model only uses a single additional linear hidden layer between the LSTM and the output layer. Additional hidden layers in a neural network help with learning by enabling the model to capture and represent more complex patterns and hierarchical relationships in the data. In deep networks, the first few layers typically learn simple features (such as edges in image processing or common word patterns in text), while deeper layers progressively combine these features into more abstract and meaningful representations. This hierarchical learning allows the network to understand intricate structures that a shallow network might struggle to capture. More hidden layers can also improve the network's ability to approximate complex functions, making it more powerful for tasks such as image recognition, language processing, and decision-making. However, deeper networks require careful tuning, as they are more prone to overfitting and require more data and computational resources.
  • Additional Dropout layer. A Dropout layer is a regularization technique used in neural networks to prevent overfitting by randomly setting a fraction of the neurons to zero during training. This means that at each training step, a specified percentage (e.g., $20\%$ or $50\%$) of the neurons in the layer are temporarily "dropped out", forcing the network to learn more robust and generalized patterns rather than relying on specific neurons. By preventing co-adaptation of neurons, dropout improves the model’s ability to generalize to unseen data, leading to better performance on validation and test sets. During inference (i.e., when making predictions), all neurons are used, but their outputs are scaled based on the dropout rate to maintain consistency with training.
  • Bidirectional LSTM layer. A Bidirectional LSTM (BiLSTM) layer is an extension of the standard unidirectional LSTM we have used, where the model processes the input sequence in both forward and backward directions. In a unidirectional LSTM, information flows only from past to future, meaning it can only use previous context to make predictions. In contrast, a BiLSTM consists of two LSTM layers: one processes the sequence from start to end, while the other processes it from end to start. This allows the model to capture dependencies from both past and future contexts, making it particularly useful for tasks where understanding the entire context is crucial, such as machine translation, named entity recognition, and text classification. By leveraging information from both directions, a BiLSTM can improve accuracy and capture long-range dependencies more effectively than a unidirectional LSTM. However, it also requires more computational resources due to the doubled number of parameters. While a bidirectional LSTM can easily be implemented in PyTorch by setting bidirectional=True when defining the nn.LSTM layer, it would also require additional changes in the forward() method of the model.
  • Multilayer LSTM layer. A multilayer LSTM (also known as a stacked LSTM) consists of multiple LSTM layers stacked on top of each other, where the output of one LSTM layer serves as the input for the next. In contrast, a single-layer LSTM has only one LSTM layer processing the input sequence before producing an output. The key benefit of a multilayer LSTM is its ability to capture more complex hierarchical patterns in sequential data. The lower layers typically learn basic features, while deeper layers extract higher-level, abstract representations, allowing the model to understand more intricate dependencies. However, stacking multiple LSTM layers increases computational complexity and the risk of overfitting, requiring careful tuning of hyperparameters and regularization techniques like dropout. Again, PyTorch's nn.LSTM class allows setting the number of stacked layers using the num_layers parameter in the constructor.
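
A hedged sketch of how the constructor could combine these extensions (hyperparameter values are arbitrary assumptions; this is not part of the notebook's implementation):

import torch
import torch.nn as nn

class ExtendedRnnTextClassifier(nn.Module):

    def __init__(self, vocab_size, embed_size, output_size,
                 hidden_size=256, num_layers=2, bidirectional=True, p_dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Stacked + bidirectional LSTM; this dropout applies between stacked layers
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers,
                            bidirectional=bidirectional, dropout=p_dropout,
                            batch_first=True)
        num_directions = 2 if bidirectional else 1
        # Dropout before the linear hidden layer
        self.dropout = nn.Dropout(p_dropout)
        # The hidden layer input grows with the number of directions
        self.linear = nn.Linear(hidden_size * num_directions, 128)
        self.out = nn.Linear(128, output_size)

# Note: forward() and init_hidden() would also require changes, e.g., the hidden
# state shape becomes (num_layers * num_directions, batch_size, hidden_size), and
# the forward and backward hidden states of the last layer must be concatenated.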

Pretrained word embeddings. In our model, the weights of the embedding layer are randomly initialized and updated during the training. Instead of a random initialization, we could also initialize the weights using pretrained word embedding vectors, as sketched below. Using pretrained word embedding vectors in a text classification model offers several advantages, particularly in improving accuracy and reducing training time. Pretrained embeddings, such as Word2Vec, GloVe, or FastText, are trained on massive text corpora and capture rich semantic relationships between words. Instead of learning word representations from scratch, a model can leverage these embeddings to start with a strong foundation of word meanings and contextual similarities. This is especially beneficial when working with limited labeled data, as pretrained embeddings help the model generalize better by incorporating knowledge from large, diverse text sources. Another key benefit is that pretrained embeddings can capture synonymy and contextual similarities between words. This often leads to faster convergence and better generalization, making pretrained embeddings a powerful tool for text classification tasks such as sentiment analysis, spam detection, and topic categorization.
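
A minimal sketch of such an initialization, assuming a hypothetical dictionary pretrained that maps tokens to $100$-dimensional NumPy vectors (e.g., parsed from a GloVe file; not part of this notebook's code):

import numpy as np
import torch

embed_size = 100
# Random fallback vectors for tokens without a pretrained embedding
weights = np.random.normal(scale=0.1, size=(len(vocabulary), embed_size))
for token in tokens:                 # the most frequent tokens collected above
    if token in pretrained:          # `pretrained`: hypothetical token-to-vector dict
        weights[vocabulary[token]] = pretrained[token]
# Copy the weights into the model's embedding layer (optionally freeze them)
model.embedding.weight.data.copy_(torch.from_numpy(weights))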

No hyperparameter tuning. Throughout this notebook we only considered a single data preparation and model training setup, and did not explore how a different setup might have affected the results — including different preprocessing steps, a different vocabulary size, different sizes for the embedding and hidden layers, different activation functions, and so on. For example, this task might benefit from removing stopwords, increasing the size of the vocabulary, or increasing the size of the embedding and/or hidden layer. Performing hyperparameter tuning to potentially find the parameter values that maximize the test f1 score is conceptually relatively straightforward, but it would add a substantial amount of training time. And this was just not within the scope of this notebook.

In short, there are many ways to build upon and potentially improve on the approach for training an RNN-based model for headline classification presented in this notebook. That being said, the core steps that we have covered here would remain the same, namely:

  • Encoding the headlines into lists of token indices using a vocabulary
  • Using padding and packing to support batch processing of sequences with different lengths
  • Creating a model by defining all its layers and how an input batch is pushed through all layers to get the model output
  • Training the model using the common iterative training loop with the support of the PyTorch library

Extensions such as a more complex model architecture or performing hyperparameter tuning to find the best model would naturally require additional code and training time, but would not substantially change these core steps.


Summary¶

Recurrent Neural Networks (RNNs) are a type of artificial neural network designed to handle sequential data, making them well-suited for text classification tasks. Unlike traditional feedforward networks, RNNs have loops that allow information to persist, enabling them to capture dependencies between words in a sequence. This capability is particularly useful in natural language processing (NLP) tasks, where understanding the context and order of words is essential. For example, an RNN can be used to classify emails as spam or non-spam based on their content, sentiment classification of customer reviews, or categorization of news articles by topic.

To better grasp how RNNs work in text classification, it is highly beneficial to go through a simple practical example. Implementing an RNN step by step allows learners to see how data preprocessing, tokenization, embedding layers, and training work in practice. It also provides insight into how hyperparameters like learning rates, sequence lengths, and hidden unit sizes impact performance. By running a small-scale experiment, one can observe firsthand how the network learns from textual data, makes predictions, and improves through backpropagation.

Additionally, working through an example helps in understanding the challenges involved, such as handling imbalanced datasets, overfitting, and choosing appropriate evaluation metrics. Practical implementation encourages experimentation with variations like Long Short-Term Memory (LSTM) networks or Gated Recurrent Units (GRUs), which address some of the shortcomings of vanilla RNNs. Overall, applying RNNs in a hands-on manner reinforces theoretical concepts and builds confidence in using deep learning for text classification tasks.

In [ ]: