Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.

Implementing an ANN from Scratch (NumPy only)¶

This notebook walks through the implementation of a simple artificial neural network (ANN) for handwritten digit classification on the MNIST dataset, built entirely from scratch using only NumPy. The goal is deliberately not to achieve state-of-the-art accuracy or training efficiency. Instead, the focus is on exposing the core mechanics of neural networks by explicitly implementing every component that modern deep-learning frameworks usually abstract away.

The model architecture is intentionally minimal and consists of a sequence of linear (fully connected) layers, ReLU activation functions, and a combined softmax + cross-entropy loss layer. Each layer is implemented as an individual class with a clearly defined forward pass and backward pass, making the flow of data and gradients through the network explicit. By working directly with matrix operations and derivatives, this notebook highlights how backpropagation is realized in practice without relying on automatic differentiation.

A central theme of the notebook is understanding how gradients are computed and propagated efficiently. Rather than forming large Jacobian matrices, the backward passes are derived and implemented in a way that mirrors how real frameworks compute gradients under the hood. The combined softmax and cross-entropy layer, in particular, demonstrates how careful mathematical simplifications lead to both numerical stability and simpler gradient expressions—an approach used by libraries such as PyTorch and TensorFlow.

By the end of the notebook, you should have a clear mental model of what happens during training: how inputs are transformed layer by layer, how the loss quantifies prediction errors, how gradients flow backward through the network, and how parameters are updated via gradient descent. While real-world frameworks offer powerful features such as automatic differentiation, GPU acceleration, and highly optimized kernels, implementing an ANN in pure NumPy provides valuable insight into why these tools are designed the way they are, and just how much complexity they manage on your behalf.

Setting up the Notebook¶

Make Required Imports¶

This notebook requires importing several Python packages as well as additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been imported successfully.

In [1]:
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split

from src.utils.data.files import *
from src.utils.plotting.nn import *

Download MNIST Dataset¶

You can download the MNIST dataset as a '.zip' archive file by running the code cell below. If this code cell throws any error, please check in the configuration file config.yaml whether the URL for downloading datasets is up to date and matches the one on Github. If not, simply download or pull the latest version from Github.

In [2]:
mnist_zip, mnist_folder = download_dataset("images/classification/mnist/mnist.zip")

print(f"mnist.zip downloaded into folder {mnist_folder}")
File 'data/datasets/images/classification/mnist/mnist.zip' already exists (use 'overwrite=True' to overwrite it).
mnist.zip downloaded into folder data/datasets/images/classification/mnist/

Running the code cell below extracts the .zip archive into the four dataset files — two for the training data and two for the test data. You can change the variable mnist_folder to change the folder into which the extracted files are placed; by default the files are placed into the same folder as the .zip file.

In [3]:
#mnist_folder = "/path/to/file/location/"

decompress_file(mnist_zip, target_path=mnist_folder)
Out[3]:
['data/datasets/images/classification/mnist/t10k-images.idx3-ubyte',
 'data/datasets/images/classification/mnist/t10k-labels.idx1-ubyte',
 'data/datasets/images/classification/mnist/train-images.idx3-ubyte',
 'data/datasets/images/classification/mnist/train-labels.idx1-ubyte']

Preliminaries¶

  • This notebook assumes a basic understanding of calculus and the chain rule, including the general concept of backpropagation for training neural networks.
  • The implementations of the different layers briefly introduce the formulas of the underlying operations for the forward and backward pass, but skip the detailed math behind the computation of the gradients. These details are covered in full in separate notebooks discussing in more detail the different types of layers or loss functions.

Load & Prepare Dataset¶

Before we can train a model for handwritten digit recognition, we first have to load the training and test data and convert it into a suitable representation to serve as input for the model. This includes loading the data from the files into NumPy arrays in memory and converting the labels into the representation expected by our loss implementation. In the following, we go through these two main steps in more detail.

Load Images & Labels¶

The MNIST dataset is stored in a special binary file format called IDX, which has a specific header structure. For the image files (like train-images.idx3-ubyte), the first 16 bytes form the header that describes the metadata of the dataset. Here’s a breakdown of those first 16 bytes:

Byte Offset | Length (bytes) | Description
0           | 4              | Magic number (0x00000803) — identifies the file type as a set of images
4           | 4              | Number of images (e.g., 60,000 for the training set)
8           | 4              | Number of rows (28)
12          | 4              | Number of columns (28)

In more detail:

  • Bytes 0-3: Magic number = 0x00000803 (2051 in decimal, which means this is an image file)
  • Bytes 4-7: Number of images (e.g., 60,000 or 10,000)
  • Bytes 8-11: Number of rows per image (28)
  • Bytes 12-15: Number of columns per image (28)

After these 16 bytes, the actual image pixel data starts — where each pixel is stored as an unsigned byte (0 to 255).
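As a quick illustration (not part of the loading pipeline below), the header fields can be inspected directly using Python's struct module; note that the IDX format stores these integers in big-endian byte order. The file path assumes the dataset has already been extracted into mnist_folder above.

In [ ]:
import struct

# Inspect the 16-byte header of the MNIST training image file
with open(mnist_folder + "train-images.idx3-ubyte", "rb") as f:
    header = f.read(16)

# Four 32-bit big-endian unsigned integers: magic number, #images, #rows, #columns
magic, n_images, n_rows, n_cols = struct.unpack(">IIII", header)
print(magic, n_images, n_rows, n_cols)   # expected: 2051 60000 28 28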

The method load_mnist_images() below reads an IDX image file of the MNIST dataset. The frombuffer() method of the NumPy library interprets a buffer (like a byte string or bytearray) as a 1-dimensional NumPy array without copying the data. It allows us to quickly convert raw binary data into a NumPy array by specifying the desired data type (dtype). Since each pixel is stored as an unsigned byte (0 to 255), we can use uint8 as the data type. Also notice the offset=16, which conveniently skips the first 16 bytes of the header.

The frombuffer() method returns a long 1d array containing the pixel values for all images. The method load_mnist_images() therefore reshapes this 1d array into a 2d array where the rows (i.e., the 1st dimension) reflect the number of images and the columns (i.e., the 2nd dimension) reflect the number of pixels per image (i.e., 28$\times$28=784).

In [4]:
def load_mnist_images(path):
    with open(path, "rb") as f:
        # First 16 bytes are magic number, n_images, n_rows, n_columns
        pixels = np.frombuffer(f.read(), dtype=np.uint8, offset=16)
    # Reshape array into (n_images, n_pixels_per_image) and return
    return pixels.reshape(-1, 784)

With this method we can now load the images of the training and the test dataset. Note that this dataset is not very large, so there is no problem holding all images in memory.

In [5]:
X_train = load_mnist_images(mnist_folder+"train-images.idx3-ubyte")
X_test  = load_mnist_images(mnist_folder+"t10k-images.idx3-ubyte")

print(f"Shape (n_images, n_pixels) of training data: {X_train.shape}")
print(f"Shape (n_images, n_pixels) of test data: {X_test.shape}")
Shape (n_images, n_pixels) of training data: (60000, 784)
Shape (n_images, n_pixels) of test data: (10000, 784)

For the training and the evaluation we also need the ground-truth labels, which are stored in two separate files. For the label files in the MNIST dataset (like train-labels.idx1-ubyte), the first 8 bytes form the header that describes the dataset metadata. Here’s the breakdown of those first 8 bytes:

Byte Offset | Length (bytes) | Description
0           | 4              | Magic number (0x00000801) — identifies the file type as labels
4           | 4              | Number of labels (e.g., 60,000 for the training set)

In more detail:

  • Bytes 0-3: Magic number = 0x00000801 (2049 in decimal — means this is a label file)
  • Bytes 4-7: Number of labels (e.g., 60000 or 10000)

After these 8 bytes, the actual label data starts — each label is stored as a single unsigned byte (representing the digit 0–9). Again, let's define a simple method load_mnist_labels() to read the label files and store all labels as NumPy array.

In [6]:
def load_mnist_labels(path):
    with open(path, "rb") as f:
        # First 8 bytes are magic_number, n_images
        labels = np.frombuffer(f.read(), dtype=np.uint8, offset=8)
    return labels

We can now call method load_mnist_labels() twice to load the labels of the training and the test dataset.

In [7]:
y_train = load_mnist_labels(mnist_folder+"train-labels.idx1-ubyte")
y_test  = load_mnist_labels(mnist_folder+"t10k-labels.idx1-ubyte")

print(f"Shape (n_images, ) of training labels: {y_train.shape}")
print(f"Shape (n_images, ) of test labels: {y_test.shape}")
Shape (n_images, ) of training labels: (60000,)
Shape (n_images, ) of test labels: (10000,)

Before we continue, let's have a quick look at some example images. The code cell below plots the first $k$ handwritten digits of the test dataset. For a better visualization the images are plotted in a grid with n_row rows and n_col columns. Thus, for the default values n_row, n_col = 3, 5, the number of shown images is $k=15$. You can change the values of n_row and n_col to see more or fewer example images.

In [8]:
n_row, n_col = 3, 5

fig, axes = plt.subplots(n_row, n_col, figsize=(1.5*n_col,2*n_row))
for i in range(n_row*n_col):
    ax = axes[i//n_col, i%n_col]
    ax.imshow(X_test[i].reshape(28,28), cmap='gray_r')
    ax.set_title('Label: {}'.format(y_test[i]))
    ax.tick_params(axis='both', which='both', bottom=False, top=False, labelbottom=False, right=False, left=False, labelleft=False)
plt.tight_layout()
plt.show()
[Figure: 3×5 grid of example test images with their ground-truth labels]

A very important observation is that the MNIST dataset (and similar datasets containing hand-written digits) contains size-normalized and centered digits in fixed-size images — more specifically:

  • Size-normalized: Each handwritten digit has been scaled so that its shape roughly fits within a standard size, regardless of how large or small the original handwritten digit was. This means tall, narrow, short, or wide digits are all resized to be comparable in scale, which reduces variability caused by different handwriting styles.

  • Centered in fixed-size images: Every digit is placed in the middle of a 28$\times$28 pixel image. The image size is fixed — that is, all images have exactly the same dimensions. The digit itself is centered within the image so that it does not touch the edges and is consistently located, making it easier for machine learning models to learn relevant patterns without worrying about positional shifts.

This preprocessing helps ensure that the variation in the dataset comes mainly from handwriting style rather than inconsistencies in size or position, which simplifies the task for machine learning models and is ultimately the reason why more sophisticated neural network architectures such as CNNs are not required for the task of handwritten digit recognition.
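As a quick sanity check of this centering (purely illustrative and not required for training), we can average all training images; the resulting mean image is a blurry but clearly centered blob.

In [ ]:
import matplotlib.pyplot as plt

# Pixel-wise mean over all training images, reshaped back to 28x28
mean_image = X_train.mean(axis=0).reshape(28, 28)

plt.imshow(mean_image, cmap='gray_r')
plt.title("Mean of all training images")
plt.axis('off')
plt.show()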

Before we can use the dataset for training our model, we have one last step to do. Our model uses a basic implementation of the cross-entropy loss which expects, apart from the softmax probabilities, the ground truth as a one-hot vector for each training sample. However, right now, each label of a training sample is represented by an integer value reflecting the true class of that sample. This means we need to convert the labels for the training data into one-hot vectors, e.g.:

$$\large 6 \rightarrow [0\ 0\ 0\ 0\ 0\ 0\ 1\ 0\ 0\ 0] $$

where the class label $6$ is converted into a one-hot vector with a $1$ at index $6$. This conversion can easily be implemented in NumPy; see the auxiliary method to_one_hot() in the code cell below. This method expects a list or array of shape (batch_size, ) and returns an array of shape (batch_size, n_classes) in which each row is a one-hot vector.

In [9]:
def to_one_hot(labels, n_classes):
    labels = np.asarray(labels)
    batch_size = labels.shape[0]

    one_hot = np.zeros((batch_size, n_classes), dtype=np.float32)
    one_hot[np.arange(batch_size), labels] = 1.0

    return one_hot

In practice, for a really large dataset, we could use this method during training to convert the labels for each batch on the fly. However, since the dataset and therefore the memory requirement is not very large, we can simply convert the labels for the whole training dataset in one go.

In [10]:
y_train_onehot = to_one_hot(y_train, 10)

Notice that we create a new array y_train_onehot instead of overwriting the original one. The reason is that we later want to use y_train and y_test to evaluate the model. We do this using built-in methods from the scikit-learn library, and these methods work directly with the class labels instead of one-hot vectors.

Side note: In practice, virtually all major deep-learning frameworks such as PyTorch and TensorFlow compute the cross-entropy loss directly from integer class labels rather than one-hot vectors, because this is both more efficient and numerically stable. Internally, these libraries fuse the softmax and cross-entropy operations into a single optimized kernel that only needs the index of the correct class, avoiding the explicit construction of large, mostly zero one-hot tensors and unnecessary multiplications. This reduces memory usage, improves performance, and allows the loss to be computed via simple indexed log-probabilities, while still yielding the same gradients as the one-hot formulation. However, this notebook focuses on clarity and the educational aspect.
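To make this side note concrete, the sketch below (purely illustrative, with toy values) shows how the same average cross-entropy loss can be computed directly from integer labels via indexed log-probabilities, without ever building one-hot vectors.

In [ ]:
# Toy batch of 4 samples with 10 classes
logits = np.random.randn(4, 10)
labels = np.array([3, 1, 7, 0])          # integer class labels (no one-hot vectors)

# Log-softmax with the usual stability shift
shifted = logits - logits.max(axis=1, keepdims=True)
log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))

# Pick the log-probability of the correct class for each sample and average
loss = -log_probs[np.arange(len(labels)), labels].mean()
print(loss)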


Model Implementation¶

In this notebook, we purposefully avoid using any deep-learning frameworks. Such frameworks provide highly optimized implementations of core layers, activation functions, loss functions, and so on, which allow complex network architectures to be built by "plugging together" these layer or function classes. This means that we have to implement each of these core components first before we can build the final model.

Layers¶

The nature of the data (i.e., handwritten digits) means that we may achieve good classification results even when just using a basic Artificial Neural Network (ANN) — compared to more challenging image classification tasks that call for more sophisticated architectures such as Convolutional Neural Networks (CNNs). The main advantage is that we only require very basic layers and functions, as ANNs for simple classification tasks often only require linear layers, activation functions (e.g., ReLU), softmax, and cross-entropy to compute the loss. So these are the components we have to implement first.

Linear Layer¶

In simple terms, a layer in a neural network is a collection of neurons (or units) that perform the same operations over the same inputs — or different inputs but in a highly structured and shared way. In a linear layer, each neuron receives the same full input. Linear layers are therefore also called fully-connected layers or dense layers. The operation each neuron of a linear layer performs during the forward pass is to compute the weighted sum of the input features plus a bias. More concretely, for a single (i.e., unbatched) input vector $\mathbf{x}_{i} \in \mathbb{R}^D$, the $k$-th neuron computes:

$$\large y_{ik} = \mathbf{w}_k^\top \mathbf{x}_{i} + b_k $$

where $\mathbf{x}_i = [x_{i1}, x_{i2}, \dots, x_{iD}]^\top$ is the input vector of size $D$, $\mathbf{w}_k = [w_{1k}, w_{2k}, \dots, w_{Dk}]^\top$ is the weight vector associated with that $k$-th neuron, and $b_k$ is its bias. Each neuron therefore produces one scalar output that represents how strongly the input aligns with its learned weight vector, shifted by the bias.

In practice, we compute the forward pass for all neurons and with respect to all inputs in a batch in parallel. We can express this computation using the following matrix notation, where $\mathbf{X} \in \mathbb{R}^{N \times D}$ stacks the $N$ input vectors as rows, $\mathbf{W} \in \mathbb{R}^{D \times M}$ collects the weight vectors of the $M$ neurons as columns, and $\mathbf{b} \in \mathbb{R}^{M}$ contains the biases:

$$\large \mathbf{Y}\ = \mathbf{X}\mathbf{W} + \mathbf{b} $$

Note that the matrix product $\mathbf{XW}$ produces an output of shape $N \times M$, i.e., one output vector per input sample (where $N$ is the number of samples and $M$ the number of neurons). Since the bias is defined per neuron and not per sample, it must be added to every row of this matrix. This is why broadcasting is needed. Broadcasting means that the bias vector $\mathbf{b}$ is conceptually replicated $N$ times, once for each sample in the batch, so that it can be added element-wise to $\mathbf{XW}$. In practice, no actual copying is required; instead, $\mathbf{b}$ is treated as if it had shape $N \times M$, allowing the addition to be carried out efficiently.
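The following minimal shape check, using arbitrary toy dimensions, illustrates how NumPy broadcasts the bias vector over all rows of $\mathbf{XW}$.

In [ ]:
N, D, M = 4, 3, 2                        # batch size, input features, neurons
X = np.random.randn(N, D)
W = np.random.randn(D, M)
b = np.random.randn(M)

Y = X @ W + b                            # b of shape (M,) is broadcast over all N rows
print((X @ W).shape, b.shape, Y.shape)   # (4, 2) (2,) (4, 2)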

As part of the backward pass through the linear layer, our goal is now to compute the gradient $\frac{\partial \mathcal{L}}{\partial \mathbf{X}}$ (i.e., the downstream gradient passed to the previous layer according to the forward pass), as well as the gradients $\frac{\partial \mathcal{L}}{\partial \mathbf{W}}$ and $\frac{\partial \mathcal{L}}{\partial \mathbf{b}}$ to update all weights and biases based on the optimization method (e.g., gradient descent). Skipping the math here, the solutions are in fact very simple and arguably intuitive given the basic rules of backpropagation:

$$ \large \frac{\partial \mathcal{L}}{\partial \mathbf{X}} = \frac{\partial \mathcal{L}}{\partial \mathbf{Y}}\mathbf{W}^\top\ , \qquad \frac{\partial \mathcal{L}}{\partial \mathbf{W}} = \mathbf{X}^\top \frac{\partial\mathcal{L}}{\partial\mathbf{Y}}\ , \qquad \frac{\partial \mathcal{L}}{\partial \mathbf{b}} = \begin{bmatrix} 1 & 1 & \dots & 1 \end{bmatrix} \frac{\partial\mathcal{L}}{\partial\mathbf{Y}} $$

Note that, in practice, we typically do not compute the gradient $\frac{\partial \mathcal{L}}{\partial \mathbf{b}}$ as a product of these two matrices. Since this product yields the same result as simply adding all rows of the upstream gradient $\frac{\partial\mathcal{L}}{\partial\mathbf{Y}}$, the row-wise summation is typically easier and more efficient to implement.
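The small check below, using a random toy upstream gradient, confirms that the ones-vector product and the row-wise sum indeed yield the same bias gradient.

In [ ]:
dY = np.random.randn(4, 3)    # toy upstream gradient (4 samples, 3 neurons)
ones = np.ones((1, 4))

# Both expressions yield the same bias gradient
print(np.allclose(ones @ dY, dY.sum(axis=0)))   # True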

Computing all required gradients during the backward pass through a linear layer comes down to basic matrix operations (i.e., multiplications and sums) that are all very well supported by NumPy. The class Linear in the code cell below represents a direct implementation of the formulas of the forward and the backward pass. Crucially, the class must also store all intermediate data required for the backward pass, such as the input $\mathbf{X}$ from the forward computation. During training, gradients flow backward through the network, and computing the gradient $\frac{\partial \mathcal{L}}{\partial \mathbf{W}}$ depends directly on values seen in the forward pass. This design not only improves computational efficiency but also makes the implementation closely align with the principles of backpropagation used in modern deep learning frameworks.

In [11]:
class Linear:
    
    def __init__(self, in_features, out_features):
        # Weight initialization (He initialization: Gaussian scaled by sqrt(2 / in_features))
        self.W = np.random.randn(in_features, out_features) * np.sqrt(2 / in_features)
        self.b = np.zeros(out_features)

        # Gradients
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)

        # Cache for backward pass
        self.X = None

    def forward(self, X):
        self.X = X
        return X @ self.W + self.b

    def backward(self, dY):        
        self.dW = self.X.T @ dY  # Gradient w.r.t. weights
        self.db = dY.sum(axis=0) # Gradient w.r.t. bias
        dX = dY @ self.W.T       # Gradient w.r.t. input
        return dX
    
    def step(self, lr):
        # Perform one gradient descent update
        self.W -= lr * self.dW
        self.b -= lr * self.db

Note that in NumPy, the @ operator is syntactic sugar for matrix multiplication, introduced to clearly distinguish linear algebra operations from element-wise multiplication (*). When applied to arrays, A @ B follows standard linear algebra rules: for 2-D arrays it performs matrix–matrix multiplication, for 1-D arrays it computes inner products or matrix–vector products depending on the operands, and for higher-dimensional arrays it applies batched matrix multiplication over the leading dimensions using broadcasting. Under the hood, @ is equivalent to calling np.matmul(A, B), ensuring shape-aware multiplication that aligns with mathematical notation used in neural networks and other numerical computing tasks.

Apart from the forward() and backward() methods, the class also features a step() method. After computing the gradients using the backward() method, step() updates the weight matrix $\mathbf{W}$ and the bias vector $\mathbf{b}$ using basic gradient descent — scaling down the gradients using a learning rate lr and subtracting the scaled gradients from the current weights and biases. In most modern frameworks, these weight updates are handled separately to allow for the use of different optimizers beyond basic gradient descent. However, our goal here is to keep things simple and focus on the core concepts.
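As a small, optional sanity check of the backward pass (not part of the model itself), we can compare the analytic gradient dW against a finite-difference estimate. For this we define an auxiliary scalar "loss" $L = \sum_{n,k} Y_{nk}\,\bar{Y}_{nk}$ for some fixed matrix $\bar{Y}$; its gradient with respect to $\mathbf{Y}$ is exactly $\bar{Y}$, so passing $\bar{Y}$ as the upstream gradient into backward() should reproduce the numerical derivative of $L$.

In [ ]:
np.random.seed(0)

layer = Linear(5, 3)
X = np.random.randn(4, 5)
dY = np.random.randn(4, 3)                    # plays the role of the upstream gradient

layer.forward(X)
layer.backward(dY)                            # fills layer.dW analytically

eps = 1e-6
i, j = 2, 1                                   # check a single weight entry
layer.W[i, j] += eps
loss_plus = np.sum(layer.forward(X) * dY)     # L evaluated at W[i,j] + eps
layer.W[i, j] -= 2 * eps
loss_minus = np.sum(layer.forward(X) * dY)    # L evaluated at W[i,j] - eps
layer.W[i, j] += eps                          # restore the original weight

numeric = (loss_plus - loss_minus) / (2 * eps)
print(np.isclose(numeric, layer.dW[i, j]))    # should print True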

ReLU (Rectified Linear Unit)¶

The ReLU (Rectified Linear Unit) is one of the most popular activation functions used in deep learning. Its primary job is to decide whether a neuron should "fire" by introducing non-linearity into a neural network, which allows the model to learn complex patterns in data. Mathematically, it is a simple piecewise function that can be expressed as:

$$\large \text{ReLU}(x) = \max(0, x). $$

where $x$ is a single value in an input vector $\mathbf{x}$ or an input matrix $\mathbf{X}$.

In plain English, if the input $x$ is positive, the function returns $x$ directly; if the input is negative or zero, it returns $0$. From a learning perspective, ReLU introduces non-linearity while largely avoiding the vanishing gradient problem that affects activations like sigmoid or tanh. For positive inputs, its derivative is 1, allowing gradients to flow effectively during backpropagation. However, ReLU has a known drawback called the "dying ReLU" problem: if a neuron consistently receives negative inputs, its output and gradient become zero, and it may stop learning altogether.

On the plus side, the simplicity of the ReLU function also means that computing the gradient is very simple. Again, the gradient is defined piecewise as:

$$\large \frac{d\ \text{ReLU}(x)}{dx} = \begin{cases} 0, & x < 0 \\ 1, & x > 0 \end{cases} $$

In short, for any positive $x$, the gradient is $1$, and for any negative $x$ it is $0$. At $x = 0$, the derivative is strictly speaking not defined because the function has a kink. In practice, when implementing backpropagation, this point is handled by convention: most frameworks simply set the gradient to $0$ (sometimes $1$), but since the probability of a neuron being exactly at zero is negligible, this choice has no practical impact on training. Intuitively, this means gradients flow unchanged through ReLU for positive activations, while neurons with negative pre-activations block the gradient entirely.

These simple operations make the ReLU activation also very easy to implement; see the ReLU class in the code cell below. The expression (self.X > 0) in the backward() method creates a boolean mask that is True where the input was positive and False where it was non-positive. In NumPy, this mask is implicitly converted to 1 (for True) and 0 (for False) during multiplication. As a result, the gradient is passed through unchanged for elements where X > 0 and zeroed out everywhere else. This is exactly how the piecewise gradient of ReLU is applied efficiently without explicitly writing conditional statements.

In [12]:
class ReLU:
    def __init__(self):
        # Cache for backward pass
        self.X = None

    def forward(self, X):
        self.X = X
        return np.maximum(0, X)

    def backward(self, dY):
        # Gradient flows only where X > 0
        dX = dY * (self.X > 0)
        return dX
        
    def step(self, lr):
        # Dummy method for consistent interface
        pass 

Notice that the ReLU class also has a step() method which is, however, empty. This is because ReLU itself has no trainable parameters that need to be updated during training. So while the step() method is not needed here (unlike for the linear layer), ensuring that all layers have the same interface (i.e., the same set of methods) later avoids checking whether a layer has trainable parameters or not — we can simply call step() for all the layers.

Softmax + Cross-Entropy¶

In classification tasks, softmax is typically used as the activation function of the final layer to convert raw model outputs (logits) into a probability distribution over classes. It exponentiates and normalizes the logits so that all outputs are non-negative and sum to one, making them interpretable as class probabilities. Cross-entropy loss then measures how well these predicted probabilities match the true class labels. It penalizes the model heavily when it assigns low probability to the correct class and only mildly when the correct class is already likely. Although softmax and cross-entropy can be considered individual operations (or layers), when combined, the gradient of the loss simplifies significantly, leading to efficient and numerically stable training.

Considering the softmax function as a layer in the network, it is defined as a function $\sigma: \mathbb{R}^{D}\rightarrow \mathbb{R}^{D}$ that maps a real-valued vector of size $D$ into another real-valued vector of size $D$. If we denote the vector $\mathbf{x}\in \mathbb{R}^D$ as the input vector, the $i$-th element of the output vector $\boldsymbol{\sigma}(\mathbf{x})$ is defined as:

$$\large \sigma_i = \boldsymbol{\sigma}(\mathbf{x})_i = \frac{e^{x_i}}{\sum_{k=1}^D e^{x_k} } $$

In simple terms, the softmax function takes in a vector with arbitrary values and normalizes them so that each value gets mapped into the interval $[0, 1]$ and all values sum to $1$, which allows for interpreting $\boldsymbol{\sigma}(\mathbf{x})$ as a probability distribution. The cross-entropy loss $\mathcal{L}$ for a given $D$-dimensional vector $\mathbf{y}$ containing the ground-truth labels is defined as:

$$\large \mathcal{L}_{CE} = - \sum_{i=1}^D y_{i} \log{\sigma_i} $$

where $y_{i}$ is either $0$ or $1$ depending on whether the $i$-th class is the true class ($1$) or a wrong class ($0$). The cross-entropy loss in this form is only applicable to multiclass classification tasks where exactly one class is correct. This naturally implies that $\mathbf{y}$ is a one-hot vector containing only a single $1$ at the position reflecting the correct class label. Note that $D$ here reflects the number of classes, which must match the size of the output of the last linear layer before the softmax.

Although both softmax and cross-entropy have no trainable parameters themselves, we still need to compute the downstream gradient $\frac{\partial\mathcal{L}}{\partial \mathbf{x}}$ to be passed to the last linear layer. As mentioned before, combining softmax and cross-entropy yields a very elegant solution for the gradient — again, we skip all the math underpinning this solution here:

$$\large \frac{\partial\mathcal{L}}{\partial \mathbf{x}} = \boldsymbol{\sigma} - \mathbf{y} $$

While this gradient is with respect to a single data sample, the extension to batched inputs containing multiple samples is straightforward.

The class SoftmaxCrossEntropy below represents a basic implementation of the cross-entropy loss together with a preceding softmax operation. The forward() and backward() methods directly implement all the formulas shown above using only built-in methods of NumPy. However, this implementation adds two practical extensions:

  • Stability trick: The stability trick in the softmax computation consists of subtracting the maximum input value from all logits before applying the exponential, which does not change the final output probabilities but greatly improves numerical stability. Since the exponential function grows very rapidly, large positive logits can cause overflow, while very negative logits can lead to underflow and loss of precision. By shifting the logits so that the largest value becomes $0$, all exponentials are guaranteed to be at most $1$, keeping the computation in a safe numerical range. This works because softmax is invariant to adding or subtracting the same constant from all inputs, making the trick a simple yet essential step for reliable and stable training; a short demonstration follows after this list.

  • Loss and gradient averaging: Dividing the loss and gradients by the batch size ensures that their scale is independent of how many samples are processed at once, making training behavior consistent across different batch sizes. When the loss is defined as the mean over the batch rather than the sum, each sample contributes equally regardless of batch size, and the magnitude of the gradients remains stable as the batch size changes. This helps keep learning rates meaningful and comparable: doubling the batch size does not automatically double the gradient magnitudes or require re-tuning the optimizer. In practice, averaging over the batch leads to more predictable optimization dynamics and simplifies both theoretical reasoning and practical implementation of gradient-based learning.
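The small demonstration below (with deliberately extreme toy logits) shows why the stability trick from the first point matters: the naive softmax overflows and produces nan, while the shifted version yields a valid probability distribution.

In [ ]:
logits = np.array([[1000.0, 1001.0, 1002.0]])

# Naive softmax: exp(1000) overflows to inf, so the result is nan
with np.errstate(over='ignore', invalid='ignore'):
    naive = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)

# Shifted softmax: the largest logit becomes 0, keeping all exponentials <= 1
shifted = np.exp(logits - logits.max(axis=1, keepdims=True))
stable = shifted / shifted.sum(axis=1, keepdims=True)

print(naive)    # [[nan nan nan]]
print(stable)   # [[0.09003057 0.24472847 0.66524096]]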

In [13]:
class SoftmaxCrossEntropy:
    def __init__(self):
        self.probs = None
        self.labels = None

    def forward(self, logits, labels):
        # logits: (batch_size, num_classes) - Raw scores from the previous layer
        # labels: (batch_size, num_classes) - One-hot encoded ground truth
        self.labels = labels
        
        # Numerical stability trick: subtract max logit from all logits
        # This prevents e^x from exploding to infinity.
        exps = np.exp(logits - np.max(logits, axis=1, keepdims=True))
        self.probs = exps / np.sum(exps, axis=1, keepdims=True)
        
        # Avoid log(0) by adding a tiny epsilon
        epsilon = 1e-12
        batch_size = logits.shape[0]
        
        # Compute Cross-Entropy loss
        loss = -np.sum(labels * np.log(self.probs + epsilon)) / batch_size
        return loss

    def backward(self):
        #Returns gradient: (batch_size, num_classes)
        batch_size = self.labels.shape[0]

        # The simplified gradient: (p - y) / batch_size
        grad = (self.probs - self.labels) / batch_size
        return grad

An instance of class SoftmaxCrossEntropy will naturally be the last layer of our final model architecture. It is therefore the only layer which will not receive any upstream gradient. Since we therefore have to treat this layer separately (see below) and it does not feature trainable parameters, this class does not need an empty step() method; although adding one would not cause any harm.
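As a quick, optional sanity check (again with a random toy batch, not part of the training pipeline), we can verify the gradient returned by backward() against a finite-difference estimate of the loss.

In [ ]:
np.random.seed(0)

criterion = SoftmaxCrossEntropy()
logits = np.random.randn(4, 10)
labels = to_one_hot(np.array([3, 1, 7, 0]), 10)

criterion.forward(logits, labels)
grad = criterion.backward()                   # analytic gradient (probs - labels) / batch_size

eps = 1e-6
i, j = 2, 7                                   # check a single logit entry
logits_plus, logits_minus = logits.copy(), logits.copy()
logits_plus[i, j] += eps
logits_minus[i, j] -= eps

numeric = (criterion.forward(logits_plus, labels) - criterion.forward(logits_minus, labels)) / (2 * eps)
print(np.isclose(numeric, grad[i, j]))        # should print True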

Complete Model¶

With implementations for all required components in place, we can now implement our actual classification model. By default — but feel free to change it — the class DigitsClassifier implements a basic ANN with a single hidden layer of $128$ neurons followed by a ReLU activation function (a second hidden layer of the same size is included in the code but commented out). Note that the sequence of linear layers and activation functions is defined as a list self.layers. This allows the forward(), backward(), and step() methods to iterate through all layers and call the forward(), backward(), and step() method of each layer — of course, for the backward pass, we iterate through self.layers back to front. This not only keeps the overall code sleek, it also makes it very easy to add or remove layers without changing the other methods.

In [14]:
class DigitsClassifier:

    def __init__(self, in_features):
        self.layers = [
            Linear(in_features, 128),
            ReLU(),
            #Linear(128, 128),  # Uncomment these lines
            #ReLU(),            # to add another hidden layer
            Linear(128, 10)
        ]
        # Define loss function
        self.criterion = SoftmaxCrossEntropy()

    def forward(self, X, y):
        # Perform forward pass through all layers
        for layer in self.layers:
            X = layer.forward(X)

        # Compute and return loss (softmax + cross-entropy)
        return self.criterion.forward(X, y)

    def backward(self):
        dY = self.criterion.backward()
        for layer in reversed(self.layers):
            dY = layer.backward(dY)

    def step(self, lr):
        for layer in reversed(self.layers):
            layer.step(lr)

    def predict(self, X):
        for layer in self.layers:
            X = layer.forward(X)
        # After the last layer, X contains the logits
        # Return the largest logit for each sample as the predicted class
        return np.argmax(X, axis=1)

After training the model, we also want to use it to make predictions, at least as part of the evaluation. Thus, the class also features the method predict(), which takes in a batch of images and returns the predicted class label $0, 1, \dots, 9$ for each image.

We can now create an instance of our model; note that passing 28*28 instead of 784 makes no difference functionally, but it is a convenient reminder that we are actually dealing with $28\times 28$ pixel images here, although they are flattened to $784$-dimensional vectors.

In [15]:
model = DigitsClassifier(28*28)

Model Training¶

We now have the data and the model ready to start the training. To keep the training code clean and easy to read, we first define a few auxiliary methods which we will then use in the actual training loop. Again, while frameworks such as PyTorch or TensorFlow help with these steps, our goal is a NumPy-only implementation.

Auxiliary Methods¶

Even though the training data contains $60k$ images, it is small enough that we could use the full dataset as a single batch for each training epoch. However, just to show a basic example of how batching can be done — although not in a very optimized way — we actually split the training data into multiple batches using the method create_batches below. If specified by the shuffle argument, the method first shuffles the data before creating the batches. Training data is often shuffled after each epoch to prevent the model from learning spurious patterns caused by the order of the data, such as correlations between consecutive samples. Shuffling ensures that each mini-batch is a different mix of examples across epochs, which leads to more representative gradient estimates, reduces bias during optimization, and generally improves convergence and generalization.

For improved efficiency, instead of returning a list of all batches at once, we use the yield keyword to turn the method into a generator function. Instead of returning a value once and terminating (as return does), yield produces a value and temporarily pauses the function's execution, saving its internal state. When the generator is iterated again, execution resumes exactly where it left off. This allows values to be generated lazily, one at a time, rather than all at once. This will later allow us to iterate through the batches one by one (see the train_epoch() method next).
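The toy generator below, unrelated to MNIST and purely illustrative, shows how yield pauses and resumes a function instead of terminating it.

In [ ]:
def count_up_to(n):
    i = 0
    while i < n:
        yield i      # produce one value, then pause here until the next iteration
        i += 1

print(list(count_up_to(3)))   # [0, 1, 2]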

In [16]:
def create_batches(X, y, batch_size, shuffle=False):
    # Shuffle dataset for batching
    if shuffle is True:
        idx = np.random.permutation(len(X))
        X, y = X[idx], y[idx]
    # Create iterator that loops over batches
    for start in range(0, len(X), batch_size):
        end = start + batch_size
        yield X[start:end], y[start:end]

For training the model, the code cell below defines the auxiliary method train_epoch() that implements the basic training loop for a single epoch — the training of the model with respect to all training samples using mini-batches. This method uses create_batches() to create and iterate through all the batches. Each batch is passed to the forward() method to compute the batch loss and the backward() method to compute all gradients, after which calling step() updates all model weights. During each iteration we accumulate the loss for each batch and return the overall loss for the epoch at the end.

In [17]:
def train_epoch(model, X, y, lr=0.001, batch_size=128, shuffle=True):
    epoch_loss = 0.0
    for X_batch, y_batch in create_batches(X, y, batch_size=batch_size, shuffle=shuffle):
        loss = model.forward(X_batch, y_batch)
        model.backward()
        model.step(lr)
        epoch_loss += loss
    return epoch_loss

Although not required for the training itself, we want to monitor how the prediction performance of our model changes over time, i.e., after each epoch. To this end, the code cell below implements the auxiliary method evaluate() that iterates over all given samples (again, in batches), calculates the predicted labels using the given model, and returns the F1 score based on the predicted and ground-truth labels.

In [18]:
def evaluate(model, X, y, batch_size = 128):
    # Define 2 lists holding all true labels and all predicted labels
    y_true, y_pred = [], []
    for X_batch, y_batch in create_batches(X, y, batch_size=batch_size, shuffle=False):
        y_batch_pred = model.predict(X_batch)
        y_true += list(y_batch)
        y_pred += list(y_batch_pred)
    # Return the final macro F1 score
    return metrics.f1_score(y_true, y_pred, average='macro')

Let's evaluate our currently untrained model on the test data by calling the method evaluate() with the test images and labels.

In [19]:
f1 = evaluate(model, X_test, y_test)

print(f"F1 score: {f1:.3f}")
F1 score: 0.021

While the exact result will differ depending on the random initialization of the model parameters, the F1 score of the untrained model should roughly be around $0.1$. This is because our untrained model is more or less just guessing, and given that we have 10 possible classes (0, 1, 2, ..., 9), the model will be correct about 10% of the time.
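To back this intuition up, the short check below (purely illustrative) computes the macro F1 score of uniformly random guesses over the test labels, which also lands at roughly $0.1$.

In [ ]:
# Predict a uniformly random class for every test sample
rng = np.random.default_rng(0)
y_random = rng.integers(0, 10, size=len(y_test))

print(f"F1 score (random guessing): {metrics.f1_score(y_test, y_random, average='macro'):.3f}")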

Training Loop¶

Using our two auxiliary methods train_epoch() and evaluate(), training the model for several epochs (here: 20) — and evaluating the model after each epoch — becomes very simple, as shown in the code cell below. Notice that we call the evaluate() method twice, once for the training data and once for the test data, to compute the training and test F1 scores, respectively. Comparing both scores generally provides good insights into whether a model might be underfitting or overfitting.

For each epoch (i.e., in each iteration) we store the loss as well as the training and test f1 scores as a triple in the list results for later plotting; but the values are also directly printed after each epoch. So, simply run the code cell below to train the digits classifier model for 20 (by default) epochs. Keep in mind that you can also always go back and change the model architecture (e.g., by adding more linear layers or changing their sizes), the learning rate, or the number of epochs.

In [20]:
n_epochs = 20
results = []

for epoch in range(n_epochs):
    epoch_loss = train_epoch(model, X_train, y_train_onehot)
    # Calculate training and test f1 scores
    f1_train = evaluate(model, X_train, y_train)
    f1_test = evaluate(model, X_test, y_test)
    # Append the epoch loss, training f1 score, and test f1 score to final result list (for plotting later)
    results.append((epoch_loss, f1_train, f1_test))
    # Print progress(epoch loss, training f1 score, and test f1 score)
    print(f"[Epoch {epoch:02d}] loss: {epoch_loss:.3f}, f1 train: {f1_train:.3f}, f1 test: {f1_test:.3f}")      
[Epoch 00] loss: 964.216, f1 train: 0.807, f1 test: 0.808
[Epoch 01] loss: 350.213, f1 train: 0.850, f1 test: 0.845
[Epoch 02] loss: 264.409, f1 train: 0.872, f1 test: 0.866
[Epoch 03] loss: 218.779, f1 train: 0.886, f1 test: 0.877
[Epoch 04] loss: 190.484, f1 train: 0.899, f1 test: 0.891
[Epoch 05] loss: 170.119, f1 train: 0.906, f1 test: 0.893
[Epoch 06] loss: 156.302, f1 train: 0.912, f1 test: 0.898
[Epoch 07] loss: 145.120, f1 train: 0.916, f1 test: 0.903
[Epoch 08] loss: 135.946, f1 train: 0.919, f1 test: 0.903
[Epoch 09] loss: 128.896, f1 train: 0.925, f1 test: 0.909
[Epoch 10] loss: 122.007, f1 train: 0.930, f1 test: 0.912
[Epoch 11] loss: 115.735, f1 train: 0.934, f1 test: 0.915
[Epoch 12] loss: 110.952, f1 train: 0.927, f1 test: 0.908
[Epoch 13] loss: 106.245, f1 train: 0.935, f1 test: 0.917
[Epoch 14] loss: 102.504, f1 train: 0.941, f1 test: 0.921
[Epoch 15] loss: 98.828, f1 train: 0.940, f1 test: 0.921
[Epoch 16] loss: 95.909, f1 train: 0.942, f1 test: 0.922
[Epoch 17] loss: 93.133, f1 train: 0.944, f1 test: 0.922
[Epoch 18] loss: 90.325, f1 train: 0.946, f1 test: 0.925
[Epoch 19] loss: 88.098, f1 train: 0.940, f1 test: 0.919

Even when starting from an untrained model, you should see that the model already achieves an F1 score of over $0.8$ after the first epoch, and then only improves slightly more over time. This is because the task is a relatively easy one even for a simple ANN, as we will discuss more at the end. Since we keep track of the loss as well as the F1 scores regarding the training and the test data, we can also plot the results using the auxiliary method plot_training_results() we provide.

In [21]:
plot_training_results(results, legend=['Loss (normalized)', 'F1 (train)', 'F1 (test)'])
[Figure: normalized training loss and training/test F1 scores plotted over the training epochs]

Discussion¶

Implementing a neural network from scratch using only NumPy without any deep-learning framework or library is very educational in terms of understanding core concepts such as backpropagation and, in general, how deep-learning frameworks work. Of course, in practice, particularly when it comes to more complex models and much larger datasets, you want to use established frameworks such as PyTorch, TensorFlow, or others. Their implementations of all components (e.g., layers, activation functions, loss functions) are highly optimized and unlikely to contain any bugs. This means that you have much less code to write, which also reduces the risk of errors. However, apart from leaner code, deep-learning frameworks also provide other extremely practical benefits:

  • GPU support: Modern deep-learning frameworks allow models to be trained and executed on CPUs and GPUs in a largely transparent manner: the same high-level model definition and training code can be reused, with only minimal changes (often just moving tensors to a device) to switch between hardware backends. Training on a GPU offers significant benefits because GPUs are optimized for massively parallel numerical computations, making them especially efficient for matrix multiplications and tensor operations that dominate deep learning workloads. NumPy only runs on the CPU; however, CuPy is considered a drop-in replacement for NumPy that runs on GPUs (but requires CUDA to be installed); a small sketch follows after this list.

  • Parallel training & inference: Apart from GPU support, most frameworks are designed from the ground up to support parallel training of large models, which is essential as model sizes and datasets continue to grow. They provide built-in abstractions for data parallelism, model parallelism, and distributed training across multiple GPUs or even multiple machines, while automatically handling synchronization, gradient aggregation, and communication. This allows you to scale training efficiently without rewriting core model logic. Parallel training not only reduces wall-clock training time but also makes it feasible to train models that would otherwise exceed the memory or compute limits of a single device, enabling faster experimentation and the development of increasingly powerful models.

  • Extended functionalities: Beyond core model definition and training, deep-learning frameworks provide a rich ecosystem of supporting functionality that greatly simplifies real-world workflows. They offer efficient and flexible data-loading pipelines with batching, shuffling, and parallel preprocessing, which are critical for keeping accelerators fully utilized. Frameworks also include tools for debugging and inspection, such as gradient checking, tensor shape validation, and visualization utilities, making it easier to diagnose and fix issues during development. In addition, they support exporting trained models to standardized formats for deployment, allowing the same model to run efficiently on different devices such as servers, mobile phones, or embedded systems, thereby bridging the gap between research and production.
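As a rough sketch of the GPU point above (hypothetical, since it assumes the cupy package and a CUDA-capable GPU are available), many of the NumPy operations used in this notebook have direct CuPy counterparts with the same names.

In [ ]:
import cupy as cp   # drop-in replacement for many NumPy APIs, runs on the GPU

X = cp.random.randn(64, 784).astype(cp.float32)   # arrays live in GPU memory
W = cp.random.randn(784, 128).astype(cp.float32)

Y = cp.maximum(0, X @ W)    # matrix multiplication and ReLU execute on the GPU
print(Y.shape)              # (64, 128)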

In short, for any practical implementations, you want to utilize existing and mature deep-learning frameworks for training and inference.


Summary¶

This notebook demonstrated how to build and train a simple artificial neural network (ANN) for handwritten digit classification on the MNIST dataset using only NumPy, without relying on high-level deep learning frameworks. The implementation covered the full training pipeline — from forward propagation through linear layers and nonlinear activations to loss computation and parameter updates — making every step of the learning process explicit and transparent.

The primary goal was not to achieve state-of-the-art performance, but to develop an intuitive and practical understanding of how modern deep learning frameworks operate under the hood. By manually implementing components such as linear layers, ReLU activations, and softmax with cross-entropy loss, the notebook exposed the mechanics that libraries like PyTorch or TensorFlow abstract away. This helps demystify training dynamics, gradient flow, and numerical considerations that are often hidden behind concise APIs.

While this notebook focused on the end-to-end implementation of the network, the mathematical derivations of the backward pass for the individual layers are covered in separate, dedicated notebooks. Those notebooks dive deeply into the required calculus and matrix operations, allowing this notebook to emphasize how those gradients are assembled and used during training in practice.

Understanding these fundamentals is crucial for anyone aiming to work seriously with deep learning. A solid grasp of forward and backward passes, gradient computation, and parameter updates enables better debugging, more informed architectural choices, and a deeper appreciation of why certain design patterns dominate modern neural network frameworks.

In [ ]: