from src.utils.libimports.pca import *
from src.utils.plotting.pca import *
from src.utils.data.files import *

# Fetch the two demo datasets. download_dataset() is a project helper whose
# result is unpacked as a pair; only the first element is kept (it is what gets
# passed to pd.read_csv below) and the second element is discarded.
demo_data_2d, _ = download_dataset("tabular/resources/pca-demo-data-2d.csv")
demo_data_3d, _ = download_dataset("tabular/resources/pca-demo-data-3d.csv")

File 'data/datasets/tabular/resources/pca-demo-data-2d.csv' already exists (use 'overwrite=True' to overwrite it).
File 'data/datasets/tabular/resources/pca-demo-data-3d.csv' already exists (use 'overwrite=True' to overwrite it).

# 2D demo data: CSV -> DataFrame -> plain NumPy matrix (one row per sample)
df_2d = pd.read_csv(demo_data_2d)
X_2d = df_2d.to_numpy()

# 3D demo data: same pipeline
df_3d = pd.read_csv(demo_data_3d)
X_3d = df_3d.to_numpy()

# Report (n_samples, n_features) of both matrices
print(f"Shape of 2D dataset: {X_2d.shape}")
print(f"Shape of 3D dataset: {X_3d.shape}")

Shape of 2D dataset: (20, 2)
Shape of 3D dataset: (200, 3)

# Scatter plot of the raw 2D data (plot_data_2d is a project plotting helper)
plot_data_2d(X_2d)

# Pearson correlation matrix of the two feature columns; the off-diagonal
# entry [0][1] is the correlation between x1 and x2.
corr_matrix = np.corrcoef(
    X_2d[:, 0],
    X_2d[:, 1],
)

print(f"Correlation between x1 and x2: {corr_matrix[0][1]:.2f}")

Correlation between x1 and x2: 0.81

# --- Naive projection onto the x1 axis --------------------------------------
# diag(1, 0) keeps the first coordinate and zeroes out the second one
W = np.diag([1, 0])

# Apply the projection to every sample (rows of X_2d)
X_2d_pca = np.matmul(X_2d, W)

# Show original points together with their projections
plot_data_2d(X_2d, X_pca=X_2d_pca)

# --- Naive projection onto the x2 axis --------------------------------------
# diag(0, 1) keeps the second coordinate and zeroes out the first one
W = np.diag([0, 1])

# Apply the projection to every sample (rows of X_2d)
X_2d_pca = np.matmul(X_2d, W)

# Show original points together with their projections
plot_data_2d(X_2d, X_pca=X_2d_pca)

# Fit a 1-component PCA on the 2D data and project in a single step.
# The previous version called .fit() and then fit_transform(), which fits the
# model a second time on the same data; fit_transform() alone is sufficient.
# NOTE(review): PCA comes from the star imports at the top of the file and is
# presumably sklearn.decomposition.PCA - confirm.
pca = PCA(n_components=1)
X_2d_pca = pca.fit_transform(X_2d)

# Map the 1D scores back into the original 2D space so the projected points
# can be drawn next to the original data.
X_2d_pca = pca.inverse_transform(X_2d_pca)

plot_data_2d(X_2d, X_pca=X_2d_pca)

# Plain view of the 3D point cloud (plot_data_3d is a project plotting helper)
plot_data_3d(
    X_3d,
    max_lim=4,
    azim=None,
    repeat=True,
)

# Fit a 2-component PCA on the 3D data
pca = PCA(n_components=2).fit(X_3d)

# Same point cloud, now passing the fitted model with the projection and
# mapping overlays switched on
plot_data_3d(
    X_3d,
    pca=pca,
    show_projection=True,
    show_mapping=True,
    max_lim=4,
    azim=None,
    repeat=True,
)

# Fit a 2-component PCA on the 3D data and project the samples onto it
pca = PCA(n_components=2).fit(X_3d)
X_3d_pca = pca.transform(X_3d)

# Axis labels are LaTeX snippets; raw strings keep the backslashes literal and
# avoid the invalid-escape warning that "\p" triggers in a normal string.
plot_data_2d(
    X_3d_pca,
    x_label=r"$x^\prime$",
    y_label=r"$x^{\prime\prime}$",
    max_lim=4,
)

# Fit a 1-component PCA on the 3D data and plot it with the projection and
# mapping overlays switched on
pca = PCA(n_components=1).fit(X_3d)

plot_data_3d(
    X_3d,
    pca=pca,
    show_projection=True,
    show_mapping=True,
    max_lim=4,
    azim=None,
    repeat=True,
)

# Mean-centering: subtract the per-feature (column) mean from every sample
X_2d_centered = X_2d - X_2d.mean(axis=0)
X_3d_centered = X_3d - X_3d.mean(axis=0)

plot_data_2d(X_2d_centered)

# Feature scaling: divide every column by its standard deviation
X_2d_scaled = X_2d_centered / X_2d_centered.std(axis=0)
X_3d_scaled = X_3d_centered / X_3d_centered.std(axis=0)

plot_data_2d(X_2d_scaled)

def normalize(X):
    """Standardize the columns of ``X`` to zero mean and unit standard deviation.

    The array is modified **in place** and the very same object is returned,
    so callers that still need the original values must pass a copy.

    Columns whose standard deviation is exactly zero (constant features) are
    only centered; they are not divided by zero, so they end up as all zeros
    instead of NaN/inf.

    Args:
        X: Floating-point NumPy array with samples along axis 0.

    Returns:
        ``X`` itself, after centering and scaling.
    """
    # Center data around the mean
    X -= np.mean(X, axis=0)
    # Normalize w.r.t. the standard deviation of each column; a zero standard
    # deviation is replaced by 1 so constant columns stay finite.
    std = np.std(X, axis=0)
    X /= np.where(std == 0, 1.0, std)
    return X

# NOTE: normalize() works in place and returns its argument, so after these
# two calls X_2d / X_3d themselves hold the standardized values and
# X_2d_norm / X_3d_norm are just additional names for the same arrays.
# The cells below that keep using X_2d / X_3d rely on this side effect.
X_2d_norm = normalize(X_2d)
X_3d_norm = normalize(X_3d)

def covariance(X):
    """Return ``X^T X / (n_samples - 1)`` for a data matrix ``X``.

    No centering is done here, so the result is the sample covariance matrix
    only if the columns of ``X`` already have zero mean.

    Args:
        X: NumPy array with samples along axis 0.

    Returns:
        The scaled scatter matrix of ``X``.
    """
    n_samples = X.shape[0]
    scatter = X.T.dot(X)
    return scatter / (n_samples - 1)

# Covariance matrices of the standardized data. X_2d_norm / X_3d_norm are the
# arrays returned by normalize(), i.e. the same objects as X_2d / X_3d.
C_2d = covariance(X_2d_norm)
C_3d = covariance(X_3d_norm)

def first_principal_component(C):
    """Return the largest eigenvalue of ``C`` and its eigenvector.

    Args:
        C: Symmetric square matrix (e.g. a covariance matrix); it is passed to
            ``np.linalg.eigh``.

    Returns:
        A pair ``(value, vector)`` where ``value`` is a length-1 array holding
        the largest eigenvalue and ``vector`` is the matching eigenvector as a
        single-column matrix of shape ``(C.shape[0], 1)``.
    """
    values, vectors = np.linalg.eigh(C)
    # Indices of the eigenvalues from largest to smallest
    descending = np.argsort(values)[::-1]
    # Keep the selection as a length-1 index array so both results keep an
    # explicit axis of size one.
    top = descending[:1]
    return values[top], vectors[:, top]

# First principal component (eigenvector only; the eigenvalue is discarded)
# of the 2D and 3D covariance matrices
_, pc_2d_pc1 = first_principal_component(C_2d)
_, pc_3d_pc1 = first_principal_component(C_3d)

# X_2d holds the standardized values at this point because normalize()
# modified it in place earlier on.
plot_data_2d(X_2d, pcs=pc_2d_pc1)

# 3D data with one principal component requested from the plot helper
pca = PCA(n_components=1).fit(X_3d)

plot_data_3d(
    X_3d,
    pca=pca,
    show_pcs=1,
    pcs_scaling=2,
    max_lim=4,
    azim=None,
    repeat=True,
)

# 3D data with two principal components requested from the plot helper
pca = PCA(n_components=2).fit(X_3d)

plot_data_3d(
    X_3d,
    pca=pca,
    show_pcs=2,
    pcs_scaling=2,
    max_lim=4,
    azim=None,
    repeat=True,
)

def deflate_matrix(X, pc):
    """Subtract from every row of ``X`` its component along ``pc``.

    Args:
        X: Data matrix with one sample per row.
        pc: Direction vector, either 1-D or a single-column matrix
            (``np.outer`` flattens it). For the result to be an orthogonal
            projection, ``pc`` has to have unit length.

    Returns:
        A new array of the same shape as ``X``; ``X`` itself is not modified.
    """
    # Scalar score of every sample along pc
    scores = X @ pc
    # Rank-1 matrix holding the part of each sample that lies along pc
    captured = np.outer(scores, pc)
    return X - captured

# Remove the contribution of the first principal component from both datasets
X_2d_new = deflate_matrix(X_2d, pc_2d_pc1)
X_3d_new = deflate_matrix(X_3d, pc_3d_pc1)

# Deflated 2D data, with the removed component passed along for reference
plot_data_2d(X_2d_new, pcs=pc_2d_pc1)

# The PCA model is fitted on the non-deflated X_3d, while the plot receives
# the deflated points X_3d_new.
pca = PCA(n_components=1).fit(X_3d)

plot_data_3d(
    X_3d_new,
    pca=pca,
    show_pcs=1,
    pcs_scaling=1.0,
    azim=None,
    repeat=True,
)

def pca_matrix_deflation(X, n_components=1):
    """PCA via repeated "find first component, then deflate".

    The input is copied and standardized, then for each requested component
    the leading eigenvector of the current covariance matrix is extracted and
    its contribution is removed from the working matrix.

    Args:
        X: Data matrix with one sample per row. It is not modified.
        n_components: Number of principal components to extract.

    Returns:
        A tuple ``(P, W, V)``:
            P: Standardized data projected onto the components,
                shape ``(n_samples, n_components)``.
            W: Transformation matrix with one component per column.
            V: Eigenvalues in extraction order, shape ``(n_components, 1)``
                (each entry is the length-1 array returned by
                ``first_principal_component``).
    """
    W, V = [], []
    # Mean-center and standardize a float copy of the data matrix so that the
    # in-place normalize() neither touches the caller's array nor fails on
    # integer input.
    X_norm = normalize(np.array(X, dtype=float))
    # Working matrix that gets deflated; deflate_matrix() returns new arrays,
    # so X_norm itself stays intact for the final projection.
    X_residual = X_norm
    # Iteratively find 1st PC and deflate matrix
    for _ in range(n_components):
        # Create covariance matrix
        C = covariance(X_residual)
        # Find 1st principal component
        pc_val, pc_vec = first_principal_component(C)
        W.append(pc_vec.squeeze())
        V.append(pc_val)
        # Deflate matrix
        X_residual = deflate_matrix(X_residual, pc_vec)
    # Convert transformation matrix and vector with eigenvalues to NumPy arrays
    W = np.asarray(W).T
    V = np.asarray(V)
    # Project the *standardized* data (the data the components were derived
    # from) into the lower-dimensional space; projecting the raw input here
    # would mix two different coordinate systems.
    P = X_norm @ W
    return P, W, V

# Reduce the 3D data to 2 dimensions with the matrix-deflation implementation
P, W, V = pca_matrix_deflation(X_3d, n_components=2)

# Axis labels are LaTeX snippets; raw strings keep the backslashes literal and
# avoid the invalid-escape warning that "\p" triggers in a normal string.
plot_data_2d(
    P,
    x_label=r"$x^\prime$",
    y_label=r"$x^{\prime\prime}$",
    max_lim=4,
)

def pca_eigendecomposition(X, n_components=1):
    """PCA via a single eigendecomposition of the covariance matrix.

    Args:
        X: Data matrix with one sample per row. It is not modified.
        n_components: Number of principal components to keep.

    Returns:
        A tuple ``(P, W, V)``:
            P: Standardized data projected onto the components,
                shape ``(n_samples, n_components)``.
            W: Transformation matrix with one eigenvector per column, ordered
                by decreasing eigenvalue.
            V: The matching eigenvalues, shape ``(n_components,)``.
    """
    # Mean-center and standardize a float copy of the data matrix; normalize()
    # works in place, so without the copy the caller's array would be changed.
    X_norm = normalize(np.array(X, dtype=float))
    # Compute covariance matrix
    C = covariance(X_norm)
    # Compute all eigenvalues and eigenvectors
    eigenvals, eigenvecs = np.linalg.eigh(C)
    # Sort the eigenvalues in descending order and keep the indices of the
    # n_components largest ones
    idx = np.argsort(eigenvals)[::-1][:n_components]
    # Extract only the top eigenvalues and eigenvectors (i.e., transformation matrix W)
    V, W = eigenvals[idx], eigenvecs[:, idx]
    # Project the standardized data onto the selected components
    P = np.dot(X_norm, W)
    return P, W, V

# Reduce the 3D data to 2 dimensions with the eigendecomposition implementation
P, W, V = pca_eigendecomposition(X_3d, n_components=2)

# Axis labels are LaTeX snippets; raw strings keep the backslashes literal and
# avoid the invalid-escape warning that "\p" triggers in a normal string.
plot_data_2d(
    P,
    x_label=r"$x^\prime$",
    y_label=r"$x^{\prime\prime}$",
    max_lim=4,
)

def pca_svd(X, n_components=1):
    """PCA via singular value decomposition of the standardized data matrix.

    Args:
        X: Data matrix with one sample per row. It is not modified.
        n_components: Number of principal components to keep. Defaults to 1,
            matching the other PCA implementations in this file.

    Returns:
        A tuple ``(P, W, V)``:
            P: Standardized data projected onto the components,
                shape ``(n_samples, n_components)``.
            W: Transformation matrix with one right-singular vector per
                column.
            V: The leading *singular values* of the standardized data. These
                are not the covariance eigenvalues; with the
                ``X^T X / (n_samples - 1)`` covariance used in this file the
                eigenvalues are ``V**2 / (n_samples - 1)``.
    """
    # Mean-center and standardize a float copy of the data matrix; normalize()
    # works in place, so without the copy the caller's array would be changed.
    X_norm = normalize(np.array(X, dtype=float))
    # Apply SVD directly to the standardized data matrix
    U, S, Vt = np.linalg.svd(X_norm, full_matrices=False)
    # Principal component directions are the rows of Vt
    W = Vt[:n_components].T
    V = S[:n_components]
    # Project data onto the selected principal components
    P = X_norm @ W
    return P, W, V

# Reduce the 3D data to 2 dimensions with the SVD implementation
P, W, V = pca_svd(X_3d, n_components=2)

# Axis labels are LaTeX snippets; raw strings keep the backslashes literal and
# avoid the invalid-escape warning that "\p" triggers in a normal string.
plot_data_2d(
    P,
    x_label=r"$x^\prime$",
    y_label=r"$x^{\prime\prime}$",
    max_lim=4,
)

# Fit a PCA that keeps all 3 components so the full spectrum can be inspected
pca = PCA(n_components=3).fit(X_3d)

# Print the explained variance of every component with 3-digit precision
with np.printoptions(precision=3):
    print(f"Raw eigenvalues: {pca.explained_variance_}")

Raw eigenvalues: [2.177 0.608 0.23 ]

# Print the explained-variance ratios of the fitted model; the print options
# limit the array output to 3 digits of precision inside this block only.
with np.printoptions(precision=3):
    print(f"Normalized eigenvalues: {pca.explained_variance_ratio_}")

Normalized eigenvalues: [0.722 0.201 0.076]

# Per-component explained-variance ratios, handed to the project plot helper
plot_explained_variance(
    pca.explained_variance_ratio_,
    style="bar",
    show_ratios=True,
)

# Running total of the ratios: entry k is the share of variance covered by
# the first k+1 components
cumulative_explained_variance = np.cumsum(
    pca.explained_variance_ratio_
)

# Print with 3 digits of precision
with np.printoptions(precision=3):
    print(cumulative_explained_variance)

[0.722 0.924 1.   ]

# Same plot helper as for the per-component ratios, now fed with the
# cumulative explained-variance values computed above.
plot_explained_variance(cumulative_explained_variance, style="bar", show_ratios=True)

Principal Component Analysis (PCA)¶

Setting up the Notebook¶

Make Required Imports¶

Download Required Data¶

Prepare Data¶

Preliminaries¶

Introduction¶

Basic Goal¶

The Intuition behind PCA¶

The PCA Algorithm¶

Basic Assumptions & Limitations¶

Data Preprocessing Steps¶

Mean-Centering¶

Feature Scaling¶

Finding the 1st Principal Component¶

Finding the 2nd, 3rd, ... Principal Components¶

Approach 1: Matrix Deflation¶

Approach 2: Eigendecomposition¶

Approach 3: Singular Value Decomposition¶

Deciding on $p$¶

Extra: The Missing Math¶

Discussion¶

Challenges & Limitations¶

Extensions & Variants¶

Summary¶