Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.

Logistic Regression — Basics¶

Logistic Regression is one of the foundational techniques in the field of machine learning and statistics, commonly used for solving classification problems. Unlike linear regression, which predicts a continuous value, Logistic Regression focuses on predicting discrete outcomes, such as binary classifications (e.g., yes/no, success/failure). The model is based on the logistic function, or sigmoid curve, which maps predicted values to probabilities between 0 and 1. These probabilities can then be used to assign class labels, making Logistic Regression an intuitive and effective choice for a wide range of applications.

One of the key reasons Logistic Regression is so popular is its simplicity and interpretability. It is straightforward to implement, computationally efficient, and provides meaningful coefficients that indicate the influence of each predictor on the outcome. This clarity makes it a go-to model for beginners in machine learning and for scenarios where understanding the model's behavior is critical, such as in healthcare, finance, and social sciences. Unlike more complex machine learning models like neural networks, Logistic Regression offers a clear and explainable decision boundary.

Logistic Regression is also incredibly versatile. It can handle binary classifications (e.g., determining whether an email is spam or not), multi-class classifications (e.g., identifying handwritten digits), and even ordered outcomes (ordinal Logistic Regression). Additionally, it can incorporate regularization techniques like L1 (Lasso) and L2 (Ridge) to prevent overfitting and enhance performance on large datasets. Its adaptability ensures that it remains relevant despite the rise of more complex models.

Another reason to learn Logistic Regression is its foundational role in understanding advanced machine learning techniques. Many algorithms, such as support vector machines and neural networks, build on concepts introduced in Logistic Regression, such as decision boundaries, probabilities, and optimization techniques like gradient descent. By mastering Logistic Regression, learners gain insights into these underlying principles, making it easier to grasp more complex models.

Finally, Logistic Regression's importance lies in its practical applications and real-world impact. From predicting customer churn in business to diagnosing diseases in medicine, Logistic Regression offers a reliable way to make data-driven decisions. Its ability to balance simplicity with power ensures it remains a cornerstone in the machine learning toolkit. Learning Logistic Regression equips individuals with the skills to tackle numerous classification problems while building a strong foundation for exploring the broader landscape of machine learning.

Setting up the Notebook¶

Make Required Imports¶

This notebook requires the import of different Python packages but also additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been successfully imported.

In [1]:
from src.utils.libimports.logreg import *
from src.utils.data.files import *

Download Required Data¶

Some code examples in this notebook use data that first needs to be downloaded by running the code cell below. If this code cell throws any error, please check in the configuration file config.yaml whether the URL for downloading datasets is up to date and matches the one on Github. If not, simply download or pull the latest version from Github.

In [2]:
csi_gender, _    = download_dataset("tabular/classification/example-csi-gender-classification.csv")
breast_cancer, _ = download_dataset("tabular/classification/breast-cancer-wisconsin-classification.csv")
File 'data/datasets/tabular/classification/example-csi-gender-classification.csv' already exists (use 'overwrite=True' to overwrite it).
File 'data/datasets/tabular/classification/breast-cancer-wisconsin-classification.csv' already exists (use 'overwrite=True' to overwrite it).

Motivating Example¶

Spam emails are unsolicited and often irrelevant messages sent in bulk, usually for advertising, phishing, or spreading malware. These emails typically try to deceive recipients into clicking malicious links, sharing personal information, or purchasing fraudulent products. Spam emails are problematic because they waste time, consume resources, and pose security threats. A large volume of spam clogs email inboxes, making it harder for users to find important messages. Additionally, phishing emails trick users into revealing sensitive information like passwords, banking details, or Social Security numbers, leading to financial loss and identity theft. Malware attachments in spam emails can infect computers, leading to data breaches, ransomware attacks, or system damage. Filtering spam emails is crucial because it protects users from cyber threats, improves productivity, and enhances email security. By using spam filters, organizations and individuals can reduce the risk of scams and phishing attacks, ensuring that only legitimate emails reach their inboxes. Spam detection also helps prevent network congestion and reduces storage costs for email providers.

Let's say we want to build a simple model that classifies whether an email is a spam email or a genuine email. Think about how you typically recognize a spam email, or distinguish a spam email from a genuine email. Arguably, many spam emails can be recognized by features such as:

  • the number of spammy words (e.g., "win", "free", "prize", "buy now", "bitcoin")
  • the number of uppercase words (spam emails often use ALL CAPS to grab your attention, e.g., "BUY NOW!")
  • the number of special characters such as !, $, % (e.g., "Buy now!!! only $100 (90% off!!!)")
  • the number of links (spam emails often contain one or more links to external pages)

Of course, real-world spam filters combine additional and more sophisticated features, but let's limit ourselves to these four features to keep things simple. For one, all features are numerical and can easily and efficiently be extracted using any modern programming language. When it comes to training a Logistic Regression model for this task, those four features also allow us to make an important assumption: the probability of an email being spam only increases (or only decreases) as a feature value goes up. For example, we can generally assume that the more spammy words an email contains, the more likely it is indeed a spam email; we do not assume that the probability of seeing a spam email will decrease once we see even more spammy words. In other words, we can assume a monotonic relationship between our feature values and the probability that an email is spam. We will see in a bit why this is an important assumption to make for Logistic Regression.
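To make this concrete, the code below sketches how such features could be extracted from the raw text of an email. This is only an illustration — the spammy-word list, the regular expressions, and the example email are made up for this sketch and are not part of the notebook's code base.

import re

# Illustrative list of spammy words (a real filter would use a much larger list)
SPAMMY_WORDS = {"win", "free", "prize", "buy now", "bitcoin"}

def extract_features(email_text):
    text_lower = email_text.lower()
    words = email_text.split()
    num_spammy = sum(text_lower.count(word) for word in SPAMMY_WORDS)
    num_uppercase = sum(1 for word in words if word.isupper() and len(word) > 1)
    num_special = len(re.findall(r"[!$%]", email_text))
    num_links = len(re.findall(r"https?://", text_lower))
    return [num_spammy, num_uppercase, num_special, num_links]

print(extract_features("WIN a FREE prize NOW!!! Visit http://example.com ($100 off!)"))
# → [3, 3, 5, 1] for this toy email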

Model Definition¶

Basic Setup¶

Logistic Regression is a statistical and machine learning technique used for classification tasks, particularly binary classification. It predicts the probability of an outcome belonging to one of two classes based on input features. Despite its name, logistic regression is not a regression algorithm in the traditional sense, as it is primarily used for classification rather than predicting continuous values.

Logistic Regression — like Linear Regression — computes the linear combination (or weighted sum) of its numerical features as a core part of the model. Let's denote this linear combination or weighted sum as $z_i$, defined as follows:

$$\large z_i = w_{0} + w_{1}x_{i1} + w_{2}x_{i2} + w_{3}x_{i3} + \dots + w_{d}x_{id} $$

with $i = 1,..., n$, where

  • $n$ is the number of observations (i.e., the number of data samples in our dataset)
  • $w_k$ is the $k$-th coefficient, with $w_0$ being the constant term in the model called bias, offset, or intercept
  • $x_{ij}$ is the $j$-th predictor variable (e.g., assuming a simple spam classifier: the number of spammy words, the number of special characters, the number of uppercase words, the number of links to external pages) of the $i$-th observation

It is easy to see that $z_i$ can take on any arbitrary value between $-\infty$ and $+\infty$. The goal of Logistic Regression is to interpret the model output as a probability, which can then be used to predict whether an unseen data sample is more likely to belong to Class $0$ or Class $1$. For any value to be interpreted as a probability, this value must be in the range $[0, 1]$. We therefore need some function to map $z_i$ to $[0,1]$. In the case of Logistic Regression, this is done using the so-called Sigmoid function $\sigma$, which is defined as:

$$\large \sigma(z_{i}) = \frac{1}{1 + e^{-z_{i}}} $$
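As a quick numeric sanity check, the minimal sketch below evaluates the Sigmoid function at a few points (NumPy should already be available through the notebook's imports; it is imported again here only to keep the sketch self-contained):

import numpy as np

def sigmoid(z):
    # Map any real-valued input to the open interval (0, 1)
    return 1 / (1 + np.exp(-z))

print(sigmoid(np.array([-5, -2, 0, 2, 5])))
# ≈ [0.0067 0.1192 0.5 0.8808 0.9933]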

To see how the function $\sigma$ behaves, let's quickly plot it using the auxiliary method plot_sigmoid() provided in src/utils.py.

In [3]:
plot_sigmoid()
[Figure: plot of the Sigmoid function]

As the plot above shows, the output of the Sigmoid function is always in the range of $[0,1]$. If the input goes towards $-\infty$, the output goes towards $0$; if the input goes towards $+\infty$, the output goes towards $1$. This allows us to interpret $\sigma(z_i)$ as a probability. Since we only have a single value but two classes (Class $0$ and Class $1$), we only need to clarify what $\sigma(z_i)$ is expressing. By default, $\sigma(z_i)$ is considered to be the probability that a data sample $x_i$ belongs to Class $1$. With this, we can define the output of a Logistic Regression model as:

$$ \begin{align} \large y_{i} &= \large P(Y=1|X=x_{i})\\[0.5em] &= \large \frac{1}{1 + e^{-z_{i}}}\\[0.5em] &= \large \frac{1}{1 + e^{-(w_{0} + w_{1}x_{i1} + w_{2}x_{i2} + w_{3}x_{i3} + \dots + w_{d}x_{id})}} \end{align} $$

Bias Trick¶

Before we address the task of finding the best weight values $w_0$, $w_1$, $w_2$, ..., $w_d$, let's first try to make the math a bit easier to work with. Notice how in the formula above weight $w_0$ kind of "sticks out" as it is not associated with any feature value. $w_0$ is called the bias, or the intercept, or the offset. The bias term is essential because it allows the model to produce an output that is not $0$ even if all input feature values $x_{ij}=0$. The bias term gives the model more flexibility to fit the data accurately.

While the bias $w_0$ is in some sense a special weight, we would like to treat it as any other weight to make our notation even simpler. We can accomplish this by introducing a new artificial feature $x_{i0}$ for all data samples. To ensure that this new feature does not change the equations, we need to set all $x_{i0} = 1$. Thus, we can rewrite our equation:

$$ \begin{align} \large y_{i} &= \large \frac{1}{1 + e^{-(w_{0} + w_{1}x_{i1} + w_{2}x_{i2} + w_{3}x_{i3} + \dots + w_{d}x_{id})}}\\[0.5em] &= \large \frac{1}{1 + e^{-(w_{0}x_{i0} + w_{1}x_{i1} + w_{2}x_{i2} + w_{3}x_{i3} + \dots + w_{d}x_{id})}} \end{align} $$

If we treat all features $x_{i0}$, $x_{i1}$, $x_{i2}$, ..., $x_{id}$ as a feature vector $\mathbf{x}_i$, and all corresponding weights $w_0$, $w_1$, $w_2$, ..., $w_d$ as a weight vector $\mathbf{w}$, we can write the sum as the dot product between this feature vector $\mathbf{x}_{i}$ and the weight vector $\mathbf{w}$:

$$\large y_{i} = \frac{1}{1 + e^{-\mathbf{w}^T\mathbf{x_{i}}}} $$

with

$$\large \mathbf{w} = \begin{bmatrix} w_{0} \\ w_{1} \\ w_{2} \\ \vdots \\ w_{d} \end{bmatrix}\ , \qquad \mathbf{x}_{i} = \begin{bmatrix} x_{i0} \\ x_{i1} \\ x_{i2} \\ \vdots \\ x_{id} \end{bmatrix} = \begin{bmatrix} 1 \\ x_{i1} \\ x_{i2} \\ \vdots \\ x_{id} \end{bmatrix} $$

where $\mathbf{x}_{i}$ and $\mathbf{w}$ are vectors of $(d+1)$ elements, that is, $\mathbf{x}_{i} \in \mathbb{R}^{d+1}$ and $\mathbf{w} \in \mathbb{R}^{d+1}$. Recall that $d$ is the number of original input features, but we also introduced the artificial feature $x_{i0}$ to accommodate the bias $w_0$.

Matrix Notation¶

So far, we only considered the equation for Logistic Regression with respect to a single data sample $i$. Of course, we want to find the best weight vector $\mathbf{w}$ with respect to all the data samples in our training dataset. This gives us the following set of equations:

$$ \begin{align} \large y_1 &= \large \sigma(\mathbf{w}^T\mathbf{x}_1) &= \large \frac{1}{1 + e^{-\mathbf{w}^T\mathbf{x}_{1}}}\\[0.5em] \large y_2 &= \large \sigma(\mathbf{w}^T\mathbf{x}_2) &= \large \frac{1}{1 + e^{-\mathbf{w}^T\mathbf{x}_{2}}}\\[0.5em] \large y_3 &= \large \sigma(\mathbf{w}^T\mathbf{x}_3) &= \large \frac{1}{1 + e^{-\mathbf{w}^T\mathbf{x}_{3}}}\\[0.5em] \vdots\\[0.5em] \large y_n &= \large \sigma(\mathbf{w}^T\mathbf{x}_n) &= \large \frac{1}{1 + e^{-\mathbf{w}^T\mathbf{x}_{n}}} \end{align} $$

where $n$ is the number of training data samples. In this system of equations, all $y_i$ and $\mathbf{x}_i$ are constant as they come directly from the training data. We can now write this system of equations using matrix notation to get the final equation to be solved to train or fit a Logistic Regression model: $$\large \mathbf{y} = \sigma(\mathbf{Xw}) = \begin{bmatrix} \sigma(\mathbf{w}^T\mathbf{x}_1) \\ \sigma(\mathbf{w}^T\mathbf{x}_2) \\ \vdots \\ \sigma(\mathbf{w}^T\mathbf{x}_n) \\ \end{bmatrix} $$

where

$$\large \mathbf{y} = \begin{bmatrix} y_{1} \\ y_{2} \\ y_{3} \\ \vdots \\ y_{n} \end{bmatrix}\ , \qquad \mathbf{X} = \begin{bmatrix} 1 & x_{11} & x_{12} & \dots & x_{1d}\\ 1 & x_{21} & x_{22} & \dots & x_{2d} \\ 1 & x_{31} & x_{32} & \dots & x_{3d} \\ \vdots & \vdots & \vdots & \ddots & \vdots\\ 1 & x_{n1} & x_{n2} & \dots & x_{nd} \end{bmatrix} \quad , \text{and} \qquad \mathbf{w} = \begin{bmatrix} w_{0} \\ w_{1} \\ w_{2} \\ \vdots \\ w_{d} \end{bmatrix} $$

Training a Logistic Regression model is finding the most suitable weight vector $\mathbf{w}$. This also means that a Logistic Regression model can be completely described by $\mathbf{w}$ — this makes Logistic Regression a parametric machine learning model.
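As a small illustration of this matrix form, the sketch below evaluates $\sigma(\mathbf{Xw})$ for a tiny made-up data matrix (the numbers are arbitrary and unrelated to the datasets used later in this notebook):

import numpy as np

X = np.array([[1.0, 2.0],
              [1.0, 5.0],
              [1.0, 8.0]])   # 3 samples: bias column (all 1s) plus one feature
w = np.array([-4.0, 1.0])    # weight vector [w0, w1]

y_hat = 1 / (1 + np.exp(-(X @ w)))   # sigma(Xw): one probability per sample
print(y_hat)
# ≈ [0.119 0.731 0.982]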

From Probabilities to Predictions¶

So far, the output $y_i$ of a Logistic Regression model for a data sample $\mathbf{x}_{i}$ is a numerical value from the range $[0, 1]$ that is interpreted as the probability of $\mathbf{x}_{i}$ belonging to Class $1$. To get the actual predictions — either $0$ or $1$ — we need to decide when the probability $y_i$ is high enough to assign $\mathbf{x}_{i}$ to Class $1$. As there is typically no reason to treat both classes differently, and the Sigmoid function is symmetric, the most common threshold is $0.5$. We therefore define the predicted class labels as:

$$\large decision(x_i)= \begin{cases} 1 & \text{if } y_i > 0.5 ,\\[0.5em] 0 & \text{if } y_i \leq 0.5 \end{cases} $$

If you check the formula for the Sigmoid function, its value is $0.5$ if the weighted sum $z_i = 0$. Although the output $y_i$ of a Logistic Regression model is a probability (bounded between 0 and 1 by the sigmoid function), the classification boundary occurs at the point where the probability is 0.5. At this threshold, the decision boundary can therefore be defined by the equation:

$$\large w_{0} + w_{1}x_{i1} + w_{2}x_{i2} + w_{3}x_{i3} + \dots + w_{d}x_{id} = 0 $$

This represents a linear equation in the input features $x_{i1}$, $x_{i2}$, ..., $x_{id}$. Thus, the decision boundary that separates the classes is a straight line (in 2D), a plane (in 3D), or a hyperplane (in higher dimensions). Logistic Regression is therefore considered a linear classifier. Unlike non-linear classifiers (e.g., decision trees, support vector machines with kernel tricks, or neural networks), logistic regression cannot create curved or complex decision boundaries unless the input features are transformed (e.g., using polynomial terms or interaction terms). Without such transformations, the decision boundary remains linear in the original feature space. While the linear nature makes logistic regression simple, interpretable, and computationally efficient, it limits its ability to handle datasets where the classes are not linearly separable. In such cases, feature engineering or more complex models are necessary.
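To illustrate what such a feature transformation looks like, the sketch below uses scikit-learn's PolynomialFeatures (scikit-learn is used later in this notebook anyway); the input values are made up. A Logistic Regression model fitted on the expanded features still learns a linear boundary in those features, which corresponds to a curved boundary in the original feature space.

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X = np.array([[2.0, 3.0],
              [1.0, 5.0]])

# Expand each sample [x1, x2] into [x1, x2, x1^2, x1*x2, x2^2]
X_poly = PolynomialFeatures(degree=2, include_bias=False).fit_transform(X)
print(X_poly)
# [[ 2.  3.  4.  6.  9.]
#  [ 1.  5.  1.  5. 25.]]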


Worked Example: Toy Dataset¶

Throughout this notebook, we will use a very simple dataset to illustrate the concept of Logistic Regression. The scenario is a Crime Scene Investigation: Let's assume we are called to a crime scene and found the shoe print of the suspect. Now, based on the size of the shoe print, we want to predict if the suspect was a man or a woman. In some sense, it is the simplest setup since we have only a single independent variable (i.e., the shoe print size); the dependent variable is the class label: $man$ or $woman$.

To train a Logistic Regression model solving this prediction task for us, we need a training dataset $\mathbf{X} = \{(y_i, \mathbf{x}_i)\}^n_{i=1}$, with $n$ data samples containing the class label $y_i$ with $y_i \in \{\text{man}, \text{woman} \}$ and the shoe print size $\mathbf{x}_i$ for a data sample $i$. Keep in mind that, in general, $\mathbf{x}_i = (x_{i1}, x_{i2}, ..., x_{id})$, that is, each data sample has $d$ features (i.e., independent variables). Since we only have a single feature here, we simply use $x_i$ to denote this feature (i.e., the shoe print size). This means that our equation for this scenario simplifies to:

$$ \begin{align} \large y_i &= \large \frac{1}{ 1 + e^{- ( w_{0} + w_{1}x_{i})}}\\[0.5em] &= \large \frac{1}{ 1 + e^{-\mathbf{w}^T\mathbf{x}_i}}\\[0.5em] \end{align} $$

where

$$\large \mathbf{x}_{i} = \begin{bmatrix} 1 \\ x_{i} \end{bmatrix} \quad , \text{and} \qquad \mathbf{w} = \begin{bmatrix} w_{0} \\ w_{1} \end{bmatrix} $$

We can still use the matrix notation to have a single equation for all $n$ data samples

$$\large \mathbf{y} = \sigma(\mathbf{Xw}) = \begin{bmatrix} \sigma(\mathbf{w}^T\mathbf{x}_1) \\ \sigma(\mathbf{w}^T\mathbf{x}_2) \\ \vdots \\ \sigma(\mathbf{w}^T\mathbf{x}_n) \\ \end{bmatrix} $$

where

$$\large \mathbf{y} = \begin{bmatrix} y_{1} \\ y_{2} \\ y_{3} \\ \vdots \\ y_{n} \end{bmatrix}\ , \qquad \mathbf{X} = \begin{bmatrix} 1 & x_{1}\\ 1 & x_{2}\\ 1 & x_{3}\\ \vdots & \vdots \\ 1 & x_{n} \end{bmatrix} \quad , \text{and} \qquad \mathbf{w} = \begin{bmatrix} w_{0} \\ w_{1} \end{bmatrix} $$

The CSI example dataset we use is very small, containing only 20 samples. Each data sample is a pair of two values: the shoe print size and the sex of a person. In this dataset, the class labels are already numerical — a requirement for most machine learning algorithms — where $1$ represents man and $0$ represents woman.

Loading Dataset¶

The dataset comes as a csv file, which we first load into a pandas DataFrame so we can have a first look at the data.

In [4]:
df_csi = pd.read_csv(csi_gender)

df_csi.head(20)
Out[4]:
shoe print size sex
0 27.6 0
1 28.2 0
2 28.9 0
3 29.7 1
4 29.7 0
5 30.2 0
6 30.4 0
7 31.0 1
8 31.3 1
9 31.3 0
10 31.3 1
11 31.4 1
12 31.4 1
13 31.8 0
14 31.8 1
15 31.8 1
16 31.9 1
17 32.4 1
18 33.6 1
19 34.5 1

Since the data samples are already sorted with respect to the shoe print size, we can already see that the dataset is not linearly separable. In other words, there is no single threshold for the shoe print size where all samples below this threshold are of one class, and all samples above this threshold are of the other class. This, in turn, means that we can already tell that Logistic Regression will never be perfect since Logistic Regression is only a linear classifier.

That being said, we can also see that there is at least a general trend that smaller shoe prints are more likely to be women, while larger shoe prints are more likely to be men — what we generally would expect. Let's first extract the feature matrix $\mathbf{X}$ and the label vector $y$ from the pandas DataFrame. We also need to extend the data matrix $\mathbf{X}$ by adding the constant feature $x_0=1$ to all data samples.

In [5]:
X_csi = df_csi['shoe print size'].to_numpy()
y_csi = df_csi['sex'].to_numpy().squeeze()

# Create artificial feature x0 (all values 1) for bias w0
x0 = np.ones(X_csi.shape[0])

# Add x0 to initial data matrix
X_csi_bias = np.vstack([x0, X_csi]).T

This gives us two versions of the data matrix $\mathbf{X}$:

  • X_csi contains only the input feature (shoe print size); we use this data matrix for plotting and as input for existing implementations such as LogisticRegression from the scikit-learn library (existing implementations handle the bias weight internally)

  • X_csi_bias contains both the input feature and the artificial feature $x_0=1$ for the bias weight $w_0$; we use this data matrix for our "manual" implementation of Logistic Regression from scratch.

Data Visualization¶

Before we continue, let's first have another look at the data. The auxiliary method plot_csi_data() creates a plot where the shoe print size is on the x-axis and the class label ($0$ or $1$) is on the y-axis. In other words, all data samples representing women are on the lower row ($y=0$), and all data samples representing men are in the upper row ($y=1$). This form of visualization may seem a bit odd at first, but it will help us appreciate the inner workings of Logistic Regression later on.

In [6]:
plot_csi_data(X_csi, y_csi)
[Figure: CSI dataset — shoe print size (x-axis) vs. class label (y-axis)]

Again, we can see that a trained Logistic Regression model will never be perfect given this dataset, as there is no way to draw a vertical line that separates all man and woman data samples.


Training a Logistic Regression Model¶

Recall that, given a feature matrix $\mathbf{X}$, the equation for our predictions or estimates $\hat{\mathbf{y}}$ is:

$$\large \hat{\mathbf{y}} = \sigma(\mathbf{Xw}) $$

It is obvious from the formula that the quality of the predictions depends on the choice of the weight vector $\mathbf{w}$. Finding the best choice of $\mathbf{w}$ for a given dataset is the task of training a Logistic Regression model and requires two main components:

  • a loss function to quantify how good or bad a current choice of $\mathbf{w}$ is, and

  • a systematic way to find the $\mathbf{w}$ that minimizes that loss.

Loss Function: How good are our weights $\mathbf{w}$?¶

From the definition of a Logistic Regression model, we know that each data sample in our training data gives us two values:

  • $y_{i}$: the $true$ class label of the $i$-th data sample (e.g., woman ($0$) or man ($1$) in our CSI example)

  • $\hat{y}_{i}$: the $predicted$ class label of the $i$-th data sample returned by the model for a given set of weights $\mathbf{w}$

Of course, ideally, we would like our predictions to equal the true values, i.e., $\hat{y}_i = y_i$. For our CSI example — that is, for all datasets that are not linearly separable — achieving $\hat{y}_i = y_i$ for all training samples will not be possible. The goal of training or fitting a model is therefore to find weights $\mathbf{w}$ that make this equation true for as many data samples as possible. Since we want to find those best weights algorithmically, we need a way to quantify the difference between the predicted and the true class labels with respect to a current set of weights $\mathbf{w}$. We do this in terms of a loss function $L$ (also called error function or cost function):

$$\large L(\hat{\mathbf{y}}, \mathbf{y}) = \text{"single value reflecting the difference between predicted and true class labels"} $$

The loss function for Logistic Regression is the so-called Cross-Entropy (CE) Loss, and defined as follows for an individual data sample:

$$\large L_{CE}(\hat{y}_i, y_i) = - \left[ y_i \log \hat{y_i} + (1-y_i)\log (1 - \hat{y_i}) \right] $$
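As a quick numeric example: if the true label is $y_i = 1$ and the model predicts $\hat{y}_i = 0.9$, the loss is $-\log 0.9 \approx 0.105$; if the model instead predicts $\hat{y}_i = 0.1$, the loss jumps to $-\log 0.1 \approx 2.303$. Confident wrong predictions are thus penalized much more heavily than confident correct ones.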

We can also rewrite the Cross-Entropy Loss with respect to all samples in the training dataset:

$$ \begin{align} \large L_{CE}(\hat{\mathbf{y}}, \mathbf{y}) &= \large -\frac{1}{n} \sum_{i=1}^{n} \left[ y_i \log \hat{y}_i + (1-y_i)\log (1 - \hat{y}_i) \right]\\[0.5em] &= \large - \frac{1}{n} \left[ \mathbf{y}^T \log \hat{\mathbf{y}} + (\mathbf{1}-\mathbf{y})^T\log (\mathbf{1} - \hat{\mathbf{y}}) \right] \end{align} $$

where $$\large \mathbf{1} = \begin{bmatrix} 1 \\ 1 \\ \vdots \\ 1 \end{bmatrix}, \ \quad \text{and} \quad \log \mathbf{v} = \begin{bmatrix} \log v_1 \\ \log v_2 \\ \vdots \\ \log v_n \end{bmatrix} $$

We derive why the Cross-Entropy Loss is the appropriate loss function for Logistic Regression in a separate notebook. However, to get a better intuition of the Cross-Entropy Loss, we can plot its value for all possible values of $\hat{y}_{i}$. Note that we have to consider two cases, one where $y=1$ and the other where $y=0$. The auxiliary method plot_ce_loss() plots the loss function for both cases.

In [7]:
plot_ce_loss()
[Figure: Cross-Entropy Loss as a function of the prediction, for y=1 (left) and y=0 (right)]

For $y_{i}=1$ (left plot), i.e., the true class label is $1$, the Cross-Entropy Loss is $0$ when $\hat{y}_{i}=1$ since the prediction matches the true value. The more $\hat{y}_{i}$ goes towards $0$, the more the loss increases. In fact, the loss goes towards infinity as $\hat{y}_{i}$ approaches $0$. Of course, this behavior of the Cross-Entropy Loss is mirrored for $y_{i}=0$ (right plot).

Using the NumPy library, we can implement the Cross-Entropy Loss with only a few lines of code; note that the method calculate_ce() below takes the data matrix $\mathbf{X}$ and a set of weights $\mathbf{w}$ as input to calculate $\hat{\mathbf{y}}$:

In [8]:
def calculate_ce(X, y, w):
    # Calculate linear signal
    z = np.dot(X, w)
    # Calculate prediction output/probability
    y_hat = 1 / (1 + np.exp(-z))
    # Calculate and return Cross-Entropy Loss
    return (-y * np.log(y_hat) - (1 - y) * np.log(1 - y_hat)).mean()

To show some examples, we calculate the Cross-Entropy Loss for two different sets of weights $\mathbf{w}$. ce_zero is the Cross-Entropy Loss for $w_0 = 0$ and $w_1 = 0$, representing some arbitrary initial values for the weights; ce_good is the Cross-Entropy Loss for $w_0 = -30$ and $w_1 = 1$, representing weights near the minimum of the loss function for comparison.

In [9]:
ce_zero = calculate_ce(X_csi_bias, y_csi, [0, 0])
ce_good = calculate_ce(X_csi_bias, y_csi, [-30, 1])

print(f"Cross-Entropy Loss for   [0, 0]: {ce_zero:.3f}")
print(f"Cross-Entropy Loss for [-30, 1]: {ce_good:.3f}")
Cross-Entropy Loss for   [0, 0]: 0.693
Cross-Entropy Loss for [-30, 1]: 0.447

Unsurprisingly, the loss for "good" $\mathbf{w}$ values is lower than for the arbitrary values. We can also see that the lower loss — which is quite close to the minimum — is still nowhere near $0$. This has two main reasons. Firstly, we already know that our CSI example dataset is not linearly separable, so we will always make at least some errors that add to the Cross-Entropy Loss. And secondly, even if the dataset were linearly separable and we could predict all training samples correctly, the loss would still not reach exactly $0$. This is because $\hat{y}_{i}$ is never exactly $0$ or $1$ because of the Sigmoid function, ignoring any precision issues in practice. The Sigmoid function only converges to $0$ and $1$ for $-\infty$ and $+\infty$ as input.

Without going into the details and reasons why in this notebook, the Cross-Entropy Loss for Logistic Regression is a convex function. This means the Cross-Entropy Loss (again, when used in Logistic Regression) has a unique minimum — there is only a unique set of values for $\mathbf{w}$ that minimize $L_{CE}$. The rather hand-wavy explanation for this is that the Logistic Regression model relies on a simple enough function, and this simplicity carries over to the loss function $L_{CE}$. When used for more complex models such as neural networks, the Cross-Entropy Loss is generally no longer convex, and therefore generally has multiple (local) minima. For Logistic Regression, however, we do not have to worry about this.

Minimizing the Cross-Entropy Loss¶

"Silly" Approach: Random Search¶

The Cross-Entropy Loss function tells us how good or bad a current choice for the set of weights $\mathbf{w}$ is. This means that, if we have two sets of weights $\mathbf{w}$ and $\mathbf{w}^\prime$, we can determine which set yields a lower loss and is therefore the better choice. We can build on this basic observation by testing a large range of different sets of weights and keeping track of which set yields the lowest loss.

The code cell below accomplishes this by testing 1,000 different and (kind of) random choices for $w_0$ and $w_1$ for our CSI dataset. In each iteration, we randomly sample a value for $w_0$ and $w_1$ and calculate the Cross-Entropy Loss using our previously defined function calculate_ce(). If the loss for the current set of weights is lower than the lowest loss so far, we update the variables w0_random_best, w1_random_best, and ce_random_best to keep track of the best set of weights (and the corresponding loss). Of course, after the 1,000 iterations, the variables w0_random_best and w1_random_best will contain the values for the best weights across all iterations.

Your turn: Try different values for num_iterations (e.g.: 10; 100; 10,000) and see how the result changes. In general, the lower the number of iterations, the less likely we are to sample a (very) good set of weights, resulting in a suboptimal loss. In contrast, when increasing the number of iterations, you should observe that the loss starts to converge to the true minimum.

In [10]:
# Set seed to ensure reproducible results
np.random.seed(0)

num_iterations = 1000

# Keep track of all data points for a plot
xs, ys, zs = [], [], []

# Initialize parameters
w0_random_best, w1_random_best, ce_random_best = None, None, float("inf")

for i in range(num_iterations):
    # Select a random value for weights w
    w0_random = np.random.uniform(-34, -26, 1)[0]
    w1_random = np.random.uniform(0.5, 1.5, 1)[0]
    w_random = np.asarray([w0_random, w1_random])
    # Calculate loss for selected weights w
    ce_random = calculate_ce(X_csi_bias, y_csi, w_random)
    # If the loss is lower than the currently best loss, remember all parameters
    if ce_random < ce_random_best:
        ce_random_best = ce_random
        w0_random_best = w0_random
        w1_random_best = w1_random
    # Remember current parameter values and loss for plotting
    xs.append(w0_random)
    ys.append(w1_random)
    zs.append(ce_random)

print("The best random values are: w0={:.3f}, w1={:.3f} (CE loss={:.3f})".format(w0_random_best, w1_random_best, ce_random_best))
The best random values are: w0=-32.976, w1=1.083 (CE loss=0.425)

Notice that in the code cell above we kept track of all the sets of random weights together with their losses. We can therefore plot the result to visualize how the loss depends on the choice of $w_0$ and $w_1$. We provide you with the method plot_random_losses() for that. Just execute the code cell below to see the plot (note: the visualization works arguably best for 1,000 iterations; but you can still plot the results for more or less iterations).

In [11]:
plot_random_losses(xs, ys, zs)
[Figure: Cross-Entropy Loss for randomly sampled values of w0 and w1]

The plot clearly shows that some choices of $w_0$ and $w_1$ are objectively better as they result in a small(er) Cross-Entropy Loss. What is maybe not as obvious from the plot is that there exists a unique choice of weights $w_0$ and $w_1$ with the smallest possible loss. In principle, random search provides a way to find the best values for $\mathbf{w}$. But it is easy to see that this is not a scalable approach in practice for two main reasons:

  • Our CSI example use case has only a single feature (shoe print size). Thus, the set of $\mathbf{w}$ contains only two values $w_0$ and $w_1$. For a dataset with $d$ features, we would need to find the optimal values for $(d+1)$ weights. The problem is that the search space grows exponentially with the number of features, making a random search prohibitively expensive.

  • In the example code above performing random search, the choice of $w_0$ and $w_1$ is not truly random, as the values were sampled from predefined ranges that were known to contain the best weight values. In practice, this information would not be available, and the sampling of the weights would need to be truly random, again vastly increasing the search space.

In short, random search is not a viable approach for training any parametric model, even such a simple model as Logistic Regression and with only a (very) small number of features.

Iterative Optimization using Gradient Descent¶

While a random search to find the best values for $\mathbf{w}$ kind of works conceptually, it is very unsystematic and therefore not practical for real-world applications. In general, training or fitting a parametric model is treated as an optimization problem. For Logistic Regression, as well as for many other machine learning models, this optimization refers to minimizing or maximizing some function. In case of a loss function such as the Cross-Entropy Loss, we want to find the parameters — here: $\mathbf{w}$ — that minimize that function.

Just to give a quick recap, finding the minimum of a function is a well-defined task in calculus. To find the minimum of a multivariate function $f(w_0, w_1, w_2, \dots)$ using calculus, we typically perform the following three steps:

  • (1) Find all 1st Partial Derivatives: Compute the partial derivatives of $f$ with respect to each variable $w_i$. This means that for a function $f(w_0, w_1, w_2, \dots)$, we need to find
$$\large \frac{\partial f}{\partial w_0}\text{, } \frac{\partial f}{\partial w_1}\text{, } \frac{\partial f}{\partial w_2}\text{, } \dots $$
  • (2) Set the Partial Derivatives to Zero and solve for all parameters: Setting all partial derivatives to zero gives us the following system of equations:
$$\large \frac{\partial f}{\partial w_0}=0\text{, } \frac{\partial f}{\partial w_1}=0\text{, } \frac{\partial f}{\partial w_2}=0\text{, } \dots $$

Assuming we have our $(d+1)$ parameters $w_i$ of our Logistic Regression model, this system contains $(d+1)$ equations that are all set to $0$. All solutions — that is, all combinations of values for $w_i$ that result in all partial derivatives being zero — indicate an extreme value (minimum, maximum, or saddle point) of function $f$.

  • (3) Use the Second Partial Derivative Test: Step (2) only tells us if we have found an extreme value. This means we still need to determine if we indeed found a minimum (and not a maximum or saddle point). This is done by calculating the 2nd partial derivatives of function $f$ and checking if their values at the point of the extreme value are positive, negative, or zero. However, in the context of training parametric models such as Logistic Regression, this last check is generally not required since we "know" that our extreme value must be a minimum. Think about why this is the case first, before we provide the answer at the end of the notebook.

So let's see how far we can apply these steps to minimize the Cross-Entropy Loss. We first need to calculate the 1st partial derivatives of the Cross-Entropy Loss $L_{CE}$ with respect to all parameters $w_i$. Recall that the Cross-Entropy Loss over all training data samples is defined as:

$$ \begin{align} \large L_{CE}(\hat{\mathbf{y}}, \mathbf{y}) &= \large -\frac{1}{n} \sum_{i=1}^{n} \left[ y_i \log \hat{y}_i + (1-y_i)\log (1 - \hat{y}_i) \right]\\[0.5em] &= \large -\frac{1}{n} \sum_{i=1}^{n} \left[ y_i \log \sigma(\mathbf{w}^T\mathbf{x}_{i}) + (1-y_i)\log (1 - \sigma(\mathbf{w}^T\mathbf{x}_{i})) \right]\\[0.5em] &= \large -\frac{1}{n} \sum_{i=1}^{n} \left[ y_i \log \left( \frac{1}{ 1 + e^{-\mathbf{w}^T\mathbf{x}_i}} \right) + (1-y_i)\log \left( 1 - \frac{1}{ 1 + e^{-\mathbf{w}^T\mathbf{x}_i}} \right) \right] \end{align} $$

where $\mathbf{w}^T = [w_0, w_1, \dots, w_d]$.

Following Step (1), we now have to calculate all 1st partial derivatives of $L_{CE}$ with respect to all parameters $w_j$. We provide a detailed walkthrough of the math in a separate notebook, and directly provide the solutions here.

$$\large \frac{\partial L_{CE}}{\partial w_j} = \frac{1}{n} \sum_{i=1}^{n} x_{ij} \left(\sigma(\mathbf{w}^T\mathbf{x}_i) - y_i \right) $$

We can, again, use the matrix notation to calculate the 1st derivative of $L_{CE}$ with respect to $\mathbf{w}$, i.e., with respect to all parameters $w_i$, using a single formula:

$$\large \frac{\partial L_{CE}}{\partial \mathbf{w}} = \frac{1}{n}\mathbf{X}^T(\sigma(\mathbf{X}\mathbf{w}) - \mathbf{y}) $$

Having the 1st derivative of the Cross-Entropy Loss, Step (2) would now require setting this derivative to $0$ and solving for $\mathbf{w}$. However, the Sigmoid function $\sigma$ introduces a non-linear relationship between the parameters $\mathbf{w}$ and the predictions $\mathbf{y}$. While the loss remains convex for Logistic Regression, this nonlinearity means the resulting equation cannot be solved analytically. In other words, after setting the 1st derivative to $0$, i.e.:

$$\large \frac{1}{n}\mathbf{X}^T(\sigma(\mathbf{X}\mathbf{w}) - \mathbf{y}) = \mathbf{0} $$

We cannot solve this equation for $\mathbf{w}$ such that $\mathbf{w}$ is isolated on one side of the equation with a closed-form expression on the other side. This means that we have to apply iterative methods to minimize the Cross-Entropy Loss for training or fitting a Logistic Regression model. One of the most general-purpose optimization methods is Gradient Descent. If you are unfamiliar with Gradient Descent, we have a whole topic covering this method. In a nutshell, the gradient of a function $f$ for a given set of parameters — in our case $\mathbf{w}$ — is the vector pointing in the direction of the steepest ascent of $f$ at this point. Mathematically, the gradient $\nabla_{\mathbf{w}}f$ of a function $f$ with respect to parameters $\mathbf{w} = \{w_0, w_1, w_2, \dots, w_d \}$ is defined as follows:

$$\large \nabla_{\mathbf{w}}f = \begin{bmatrix} \frac{\partial f}{\partial w_0}\\ \frac{\partial f}{\partial w_1}\\ \frac{\partial f}{\partial w_2}\\ \vdots \\ \frac{\partial f}{\partial w_d}\\ \end{bmatrix} $$

for a given set of values for $\{w_0, w_1, w_2, \dots, w_d \}$. Using the matrix notation this simplifies to

$$\large \nabla_{\mathbf{w}}f = \frac{\partial f}{\partial \mathbf{w}} $$

for a given set of values for $\mathbf{w}$. Since we already calculated the 1st partial derivatives for our Cross-Entropy Loss function $L_{CE}$, we can therefore calculate its gradient with:

$$\large \nabla_{\mathbf{w}}L_{CE} = \frac{1}{n}\mathbf{X}^T(\sigma(\mathbf{X}\mathbf{w}) - \mathbf{y}) $$

We can directly implement this formula for calculating the gradient using the following Python method calculate_ce_gradient():

In [12]:
def calculate_ce_gradient(X, y, w):
    # Calculate linear signal
    z = np.dot(X, w)
    # Calculate predicted output/probability
    y_hat = 1 / (1 + np.exp(-z))
    # Calculate and return gradient
    return np.dot(X.T, (y_hat - y)) / y.shape[0]

To try out this method, we can run it for our CSI example dataset using some initial values for $\mathbf{w}$, here [0, 0] (i.e., $w_0=0$ and $w_1=0$).

In [13]:
ce_grad_zero = calculate_ce_gradient(X_csi_bias, y_csi, [0, 0])

print(f"Gradients for [0, 0]: {ce_grad_zero}")
Gradients for [0, 0]: [-0.1 -3.6]

If you used [0, 0] as input for the initial choice of $\mathbf{w}$, you should get a gradient of

$$\large \nabla_{\mathbf{w}}L_{CE} = \begin{bmatrix} -0.1\\ -3.6 \end{bmatrix} $$
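As a quick sanity check of the analytic gradient, we can compare it against a numerical gradient computed via central finite differences. This is a minimal sketch that reuses calculate_ce(), X_csi_bias, and y_csi from above; it is not part of the notebook's auxiliary code.

def numerical_ce_gradient(X, y, w, eps=1e-6):
    # Central finite differences: perturb each weight slightly in both directions
    w = np.asarray(w, dtype=float)
    grad = np.zeros_like(w)
    for j in range(len(w)):
        w_plus, w_minus = w.copy(), w.copy()
        w_plus[j] += eps
        w_minus[j] -= eps
        grad[j] = (calculate_ce(X, y, w_plus) - calculate_ce(X, y, w_minus)) / (2 * eps)
    return grad

print(numerical_ce_gradient(X_csi_bias, y_csi, [0, 0]))
# should be very close to the analytic gradient [-0.1, -3.6]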

So how can we use this gradient to change the values of $w_0$ and $w_1$ in such a way that the loss decreases? We can interpret the gradient as follows:

  • Since the gradient $\nabla_{\mathbf{w}}L_{CE}$ points in the direction of the steepest ascent and both of its components are negative, slightly increasing either of the two values $w_0$ and $w_1$ will decrease the loss.

  • Since the component of the gradient for $w_1$ is larger in magnitude than the one for $w_0$, a change in $w_1$ will have a greater effect on the loss than the same change in $w_0$.

  • The absolute values of the gradient do not directly tell us how much we should actually change $w_0$ and $w_1$. This observation brings us to the introduction of the so-called learning rate. The learning rate is nothing but a scaling factor we apply to the gradient to get the value by which we update all $w_i$ (see the small worked example below). Typical values for the learning rate are $0.01 \dots 0.00001$, but the most suitable values can vary widely in practice depending on the task, model, and dataset.
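To make the role of the learning rate concrete, here is one manual update step starting from $\mathbf{w} = [0, 0]$ — a small sketch using the functions and variables defined above; the learning rate value is an arbitrary choice for illustration.

lr = 0.005                                            # learning rate (illustrative value)
w = np.zeros(2)                                       # start from w0 = w1 = 0
grad = calculate_ce_gradient(X_csi_bias, y_csi, w)    # ≈ [-0.1, -3.6]
w = w - lr * grad                                     # update step: move against the gradient
print(w)
# ≈ [0.0005 0.018]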

With our method for calculating the gradient, we can implement the method fit_gradient_descent to train or fit a Logistic Regression model using Gradient Descent. Again, if something is not clear here, you can learn all about Gradient Descent as a separate topic.

In [14]:
def fit_gradient_descent(X, y, lr=0.001, num_iter=10, verbose=False):
    # Initialize w as vector with all elements being 0
    w = np.zeros(X.shape[1])
    
    for i in range(num_iter):
        # Calculate gradient w.r.t. w
        gradient = calculate_ce_gradient(X, y, w)
        # Update step: adjust weights w.r.t. gradient and learning rate
        w -= lr * gradient
        # Print loss every 10% of the iterations
        if verbose == True:
            if(i % (num_iter/10) == 0):
                print('Loss: {:.3f} \t {:.0f}%'.format(calculate_ce(X, y, w), (i / (num_iter/100))))
    # Print final loss
    print('Loss: {:.3f} \t 100%'.format(calculate_ce(X, y, w)))
    # Return final set of weights w
    return w

We have now everything to train or fit a Logistic Regression model for our CSI example dataset. So let's do this by running the code cell below.

In [15]:
w_gradient_descent_fit = fit_gradient_descent(X_csi_bias, y_csi, lr=0.005, num_iter=1000000, verbose=True)

print(f"The best weights using Gradient Descent are: w0={w_gradient_descent_fit[0]:.3f}, w1={w_gradient_descent_fit[1]:.3f}")
Loss: 0.667 	 0%
Loss: 0.574 	 10%
Loss: 0.523 	 20%
Loss: 0.493 	 30%
Loss: 0.474 	 40%
Loss: 0.460 	 50%
Loss: 0.451 	 60%
Loss: 0.444 	 70%
Loss: 0.439 	 80%
Loss: 0.434 	 90%
Loss: 0.431 	 100%
The best weights using Gradient Descent are: w0=-29.704, w1=0.976

If you have run the code cell above with the input parameters lr=0.005 (learning rate) and num_iter=1000000 (maximum number of iterations), you should get as a result the values $w_0=-29.704$ and $w_1=0.976$ — recall that we already hinted that $w_0=-30$ and $w_1=1$ is close to the minimum loss. However, you might have also noticed that running the code cell with 1,000,000 iterations took a few seconds despite our dataset being so small.

Your turn: Run the previous code with a smaller number of iterations to see that the method does not reach the (close to) best values for $w_0$ and $w_1$.

The reason is that near the optimal solution the loss is already very close to the minimum. This means that the gradient at this point is very small, and even more so after scaling it further down by multiplying it with the learning rate. In other words, towards the end of the run, the updates to the weights $w_i$ are almost negligible. To visualize this, we can plot the loss function for the same ranges of $w_0$ and $w_1$ we used for the random search. The plotting of the loss function is implemented in the auxiliary method plot_loss_function() in the file src/utils.py — feel free to check it out. Since the loss depends on the dataset described by $\mathbf{X}$ and $\mathbf{y}$, we also have to pass this information to the method. Run the code cell below to see the result.

In [16]:
# Specify all possible combinations of w0 and w1 we want to consider
w0_range = np.arange(-32, -27, 0.01)
w1_range = np.arange(0.5, 1.5, 0.01)

# Plot loss function
plot_loss_function(X_csi_bias, y_csi, w0_range, w1_range, calculate_ce)
[Figure: Cross-Entropy Loss surface over the considered ranges of w0 and w1]

Of course, the overall shape of the loss surface reflects the point cloud we have seen in the plot for the random search. Only instead of having a fixed number of random choices of $\mathbf{w}$, we now have a continuous surface. Just by looking at the plot above, you might think that the blue area of the surface is flat. However, we already know from the math that the Cross-Entropy Loss has a unique minimum. This means that the surface in the plot above has indeed one unique location where the loss is the smallest, it's just not distinguishable with the naked eye. However, this also means there are many other choices of $\mathbf{w}$ where the loss is almost minimal. For these values of $\mathbf{w}$, the gradient and therefore the update to the $\mathbf{w}$ values will be very, very small.

However, you need to keep in mind that the shape of the loss function depends on the dataset. As such, for other datasets the unique minimum might be more or less pronounced than for our CSI example dataset here. Also, we could only easily plot the loss function because our CSI data has only one input feature, and therefore the model has only two weights $w_0$ and $w_1$. With those learned weights, we can now plot the dataset again, but now including the Sigmoid function described by $w_0$ and $w_1$.

In [17]:
plot_csi_data(X_csi, y_csi, w=w_gradient_descent_fit)
[Figure: CSI dataset with the fitted Sigmoid curve]
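Since our model has only a single input feature, we can also read off the decision boundary directly from the learned weights: it is the shoe print size where $z_i = w_0 + w_1 x_i = 0$, i.e., $x_i = -w_0 / w_1$. A quick check using w_gradient_descent_fit from above:

boundary = -w_gradient_descent_fit[0] / w_gradient_descent_fit[1]
print(f"Decision boundary at a shoe print size of {boundary:.2f}")
# ≈ 30.4 with the weights learned above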

Making Predictions¶

The last remaining step is now to use our trained model — which is completely described by the weights $\mathbf{w}$ — to predict the sex of the suspect, assuming that the shoe print size we found is $32.2$. Making a prediction for an unseen data sample $x_{new}$ simply means calculating $\hat{y}_{new}$ as the probability of belonging to Class $1$. The class decision is then made by checking if $\hat{y}_{new}$ is less than $0.5$ (Class $0$) or greater than or equal to $0.5$ (Class $1$). As usual, this is very easy to implement using the NumPy library, see the code cell below.

In [18]:
def predict(X, w):
    # Calculate linear signal
    z = np.dot(X, w)
    # Calculate prediction output/probability
    y_hat = 1 / (1 + np.exp(-z))
    # Return 0 or 1 depending on probability y_hat
    return (y_hat >= 0.5)*1

Notice that the method predict() accepts as input multiple unseen data samples and as such can return multiple predictions. The *1 at the very end is a simple trick to convert the Boolean values True and False as returned by (y_hat >= 0.5) into $0$ and $1$, where $0$ represents False and $1$ represents True. Finally, it is time to predict the sex of our suspect:

In [19]:
# Create data matrix for our single unseen data sample for our suspect
X_suspect = [[1, 32.2]]

# Calculate predicted class label
prediction_csi = predict(X_suspect, w_gradient_descent_fit)[0]

print(f"The suspect is most likely of Class {prediction_csi} (0=woman, 1-man)")
The suspect is most likely of Class 1 (0=woman, 1-man)

We can also plot the dataset where we assign each data sample a different color depending on the predicted class label. For this we first need to predict the class labels for all data samples in our CSI example dataset using the Logistic Regression model we have trained.

In [20]:
# Predict class labels for each data samples
y_csi_pred = predict(X_csi_bias, w_gradient_descent_fit)

# Plot data samples; color data points w.r.t. predicted class
plot_csi_data(X_csi, y_csi, y_pred=y_csi_pred, w=w_gradient_descent_fit)
[Figure: CSI dataset colored by predicted class, with the fitted Sigmoid curve]

We already know from looking at the data that the dataset is not linearly separable, and therefore our model will make some errors. The plot above also confirms this. Recall that in this visualization all data points on the lower row are women. However, notice the two red data points indicating that two women have been wrongly predicted to be men. Vice versa, looking at the upper row of data points, one man has wrongly been predicted to be a woman.


Practical Applications¶

Logistic Regression is simple enough that training such a model can easily be implemented from scratch, as we have done above. In practice, however, we typically use implementations from popular data science and machine learning libraries. For one, using these implementations simplifies training since much less code needs to be written. For another, existing implementations reduce the risk of coding errors, and they are often optimized to ensure good performance in terms of runtime.

In the following, we will use the scikit-learn library. Scikit-learn is a popular open-source machine learning library in Python, widely used for building and analyzing predictive data models. Built on top of foundational libraries like NumPy, SciPy, and matplotlib, it provides simple and efficient tools for data mining, data analysis, and machine learning tasks. Scikit-learn supports a wide range of supervised and unsupervised learning algorithms, including regression, classification, clustering, and dimensionality reduction. This library comes with its own implementation of LogisticRegression, which we will use in the following examples.

CSI Example Dataset¶

We first look at our CSI example dataset again, although we already found the best weights $\mathbf{w}$. However, let's assume we got this dataset as a csv file to train a Logistic Regression model. As such, the first step is to load the data file. Particularly when working with csv files, the pandas library offers convenient ways to do this:

In [21]:
# Load data file into pandas DataFrame
df_csi = pd.read_csv(csi_gender)

# Show the first 5 entries of the DataFrame
df_csi.head()
Out[21]:
shoe print size sex
0 27.6 0
1 28.2 0
2 28.9 0
3 29.7 1
4 29.7 0

This DataFrame contains the complete dataset including the input feature (shoe print size) and the target outputs (sex). We therefore need to extract our data matrix $\mathbf{X}$ and our label vector $\mathbf{y}$ from the DataFrame. The to_numpy() method in pandas is used to convert a pandas DataFrame or Series into a NumPy array. It provides a way to access the underlying data of these structures in a format that is compatible with NumPy, which is often required for numerical computations or integration with other libraries such as scikit-learn.

In [22]:
X_csi = df_csi[['shoe print size']].to_numpy()
y_csi = df_csi[['sex']].to_numpy().squeeze()

We now have everything in place to train a Logistic Regression model; see the code cell below. In scikit-learn, the fit() method is a fundamental function used to train a machine learning model. It adjusts the model's parameters based on the provided training data and, for supervised learning algorithms, their corresponding target values. This method is used by all models (incl. LogisticRegression) in scikit-learn and is critical for enabling the model to learn patterns from data.

In [23]:
model_csi = LogisticRegression().fit(X_csi, y_csi)

Let's check if the weights from the model (roughly) match the results we got from using our implementation based on Gradient Descent.

In [24]:
w0_csi = model_csi.intercept_[0]
w1_csi = model_csi.coef_[0][0]

print(f"The best are: w0={w0_csi:.3f}, w1={w1_csi:.3f}")
The best are: w0=-32.939, w1=1.080

While probably not identical, the values for $w_0$ and $w_1$ here should be quite similar to the values we got from our own implementation above. Note that scikit-learn's LogisticRegression applies L2 regularization by default (controlled by the parameter C) and uses a different optimization algorithm, so some differences in the learned weights are expected.

A common way to visualize the performance of a classifier is a confusion matrix. A confusion matrix is a table used to evaluate the performance of a classification model by comparing the actual class labels with the predicted labels. It provides a detailed breakdown of correct and incorrect predictions across different classes, helping identify how well a model distinguishes between them. Each row of the matrix represents the true class, while each column represents the predicted class (or vice versa). For convenience, we provide an auxiliary method plot_confusion_matrix() to plot a confusion matrix.

In [25]:
plot_confusion_matrix(y_csi, y_csi_pred, classes=["woman","man"])
[Figure: confusion matrix for the CSI dataset]

The confusion matrix mirrors the result we saw earlier, including the misclassification of two $women$ as $men$ and the misclassification of one $man$ as a $woman$.

Finally, we can use this model to make predictions. Let's assume that the shoe print that we found at the crime scene had a size of 32.2 cm. The predict() method in scikit-learn is used to make predictions based on a trained machine learning model. After a model has been fitted to data using the fit() method, the predict() method takes new input data and returns the predicted output (e.g., class labels, regression values, or cluster assignments) based on the learned parameters.

The predict() method expects a data matrix $\mathbf{X}$ that may contain multiple unseen data samples with multiple features. Although we only have one data sample with only one feature, we still need to represent this single value as a data matrix. Thus, instead of just using 32.2 as input for the predict() method, we need to represent the input as a matrix using [[32.2]]. The output of the predict() method is the list (or vector) of predicted values for all input data samples. Again, since we only have only one data sample, our final prediction is simply the first entry in this list.

In [26]:
gender_suspect = model_csi.predict([[32.2]])[0]

print(f"The estimated height of the suspect is {gender_suspect:.1f} (0=female, 1=male)")
The estimated height of the suspect is 1.0 (0=female, 1=male)

Instead of the (final) class label, we can also ask the model to give us the probability. The predict_proba() method in scikit-learn is used to predict the probabilities of class membership for each sample in a dataset. Unlike the predict() method, which assigns a definitive class label, predict_proba() allows one to see not just the model's prediction, but also the confidence it has in that prediction. So let's do this for our suspect:

In [27]:
gender_suspect_prob = model_csi.predict_proba([[32.2]])[0]

print(f"Probability P(Y=1|X=x) = {gender_suspect_prob[gender_suspect]:.2f}")
Probability P(Y=1|X=x) = 0.86

This means that the model thinks that a shoe print with a size of 32.2 cm belongs to a man with a probability of 86%. Note that predict_proba() returns one probability per class, ordered according to model_csi.classes_ — here, the first entry is $P(Y=0|X=x)$ and the second is $P(Y=1|X=x)$ — which is why we index the result with the predicted class label above.

Breast Cancer Classification¶

The Breast Cancer Classification (Wisconsin) dataset is a widely used benchmark dataset in machine learning and statistical research for the binary classification task of predicting whether a breast tumor is benign or malignant based on various physical and clinical characteristics of the tumor. It is derived from digitized images of fine needle aspirates (FNAs) of breast masses and contains features that describe the morphology of the tumor cells.

The dataset consists of 569 instances, each representing a unique case. It includes 30 numeric features computed from a digitized image of a fine needle aspirate of a breast mass. These features are derived from three types of measurements: mean, standard error, and worst (or largest) values for each cell nucleus. The features measure properties like radius, texture, perimeter, area, smoothness, compactness, concavity, symmetry, and fractal dimension. The outcome (target variable) is binary: 0 for benign tumors and 1 for malignant tumors. The malignant and benign cases are distributed as 212 and 357 instances, respectively, making the dataset approximately 62.7% benign and 37.3% malignant. This balance minimizes bias in the model predictions, though class imbalance techniques might still improve model performance.

The dataset comes as a csv file, so first let's read the file into a pandas DataFrame:

In [28]:
df_bcc = pd.read_csv(breast_cancer)

df_bcc.head()
Out[28]:
radius1 texture1 perimeter1 area1 smoothness1 compactness1 concavity1 concave_points1 symmetry1 fractal_dimension1 ... texture3 perimeter3 area3 smoothness3 compactness3 concavity3 concave_points3 symmetry3 fractal_dimension3 diagnosis
0 20.48 21.46 132.50 1306.0 0.08355 0.08348 0.09042 0.06022 0.1467 0.05177 ... 26.17 161.70 1750.0 0.1228 0.23110 0.31580 0.14450 0.2238 0.07127 1
1 13.15 15.34 85.31 538.9 0.09384 0.08498 0.09293 0.03483 0.1822 0.06207 ... 20.50 97.67 677.3 0.1478 0.22560 0.30090 0.09722 0.3849 0.08633 0
2 10.17 14.88 64.55 311.9 0.11340 0.08061 0.01084 0.01290 0.2743 0.06960 ... 17.45 69.86 368.6 0.1275 0.09866 0.02168 0.02579 0.3557 0.08020 0
3 14.90 22.53 102.10 685.0 0.09947 0.22250 0.27330 0.09711 0.2041 0.06898 ... 27.57 125.40 832.7 0.1419 0.70900 0.90190 0.24750 0.2866 0.11550 1
4 20.73 31.12 135.70 1419.0 0.09469 0.11430 0.13670 0.08646 0.1769 0.05674 ... 47.16 214.00 3432.0 0.1401 0.26440 0.34420 0.16590 0.2868 0.08218 1

5 rows × 31 columns

The first 30 columns represent the independent variables, i.e., the features, organized as follows:

  • 1-10: the mean of the radius, texture, perimeter, etc. values
  • 11-20: the standard errors of the radius, texture, perimeter, etc. values
  • 21-30: the worst (or largest) value of the radius, texture, perimeter, etc.

Notice that no values are missing and all features are numerical, so there is no need for any further cleaning, preprocessing, or encoding steps. The last column contains the class labels $0$ (benign) and $1$ (malignant).
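
If you want to verify these properties yourself, the short sketch below (assuming only the df_bcc DataFrame loaded above and the diagnosis column shown in the table) checks the shape, the class distribution, and the number of missing values:

In [ ]:
# Sanity checks on the loaded DataFrame:
# - overall shape (should be 569 rows and 31 columns)
# - class distribution of the target column 'diagnosis'
# - total number of missing values (should be 0)
print(df_bcc.shape)
print(df_bcc["diagnosis"].value_counts())
print(df_bcc.isna().sum().sum(), "missing values in total")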

To prepare our dataset for training a Logistic Regression model, we first have to extract the data matrix $\mathbf{X}$ and the label vector $\mathbf{y}$ from the DataFrame again. To keep the example a bit simpler, we only consider the first 10 features (i.e., the mean values).

Your turn: You can change all the relevant lines of code below to train a Logistic Regression model that uses all 30 features to see how it affects the results.

In [29]:
X_bcc = df_bcc.iloc[:, 0:10].to_numpy()
y_bcc = df_bcc.iloc[:, -1].to_numpy().squeeze()
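
For the "your turn" variant above, the only change needed is the column selection. A sketch using all 30 features could look like this (the variable names X_bcc_full and y_bcc_full are purely illustrative, chosen to avoid overwriting the 10-feature version):

In [ ]:
# Sketch for the "your turn" exercise: use all 30 features instead of the first 10.
X_bcc_full = df_bcc.iloc[:, 0:30].to_numpy()
y_bcc_full = df_bcc.iloc[:, -1].to_numpy().squeeze()

print(X_bcc_full.shape)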

Instead of using the full dataset for training, as we have done for the CSI example dataset, we split the dataset into a training and a test dataset. Using only the training dataset for actually fitting the Logistic Regression model, we can then use the test dataset to evaluate the quality of the model. The train_test_split() method of the scikit-learn library makes the step of splitting the dataset into a training and a test dataset very easy. In the code cell below we set test_size=0.25. This means that we want to use 25% of the total dataset for testing and the remaining 75% for training.

In [30]:
# Split dataset into training and test data (25% test data)
X_bcc_train, X_bcc_test, y_bcc_train, y_bcc_test = train_test_split(X_bcc, y_bcc, test_size=0.25, random_state=0)

print(f"Size of training dataset: {len(X_bcc_train)}")
print(f"Size of test dataset: {len(X_bcc_test)}")
Size of training dataset: 426
Size of test dataset: 143

Basic Classification¶

With the training dataset prepared, we can fit a Logistic Regression model. Of course, we use the LogisticRegression implementation from the scikit-learn library for this again.

In [31]:
model_bcc = LogisticRegression(max_iter=1000).fit(X_bcc_train, y_bcc_train)

To assess the quality of our trained model, we can use the test dataset. For this, we first use the predict() method of our model to get the predicted class labels for all 143 samples in the test dataset.

In [32]:
y_bcc_pred = model_bcc.predict(X_bcc_test)

We can visualize the result using a confusion matrix, which shows for how many test samples our model makes a correct or a wrong prediction. To this end, we provide the auxiliary method plot_confusion_matrix() to calculate the entries and plot the confusion matrix. Just run the code cell below.

In [33]:
plot_confusion_matrix(y_bcc_test, y_bcc_pred, classes=["benign","malignant"])
(Figure: confusion matrix of the model's predictions on the test set)

Overall, our model performs quite well. For 130 out of the 143 test samples, the model predicts the correct class label, i.e., the true label and the predicted label match. For 13 test samples, the true label and the predicted label do not match; these are the model's errors. This example also shows that not all errors are equally bad. For example, predicting a malignant case as benign can be considered a worse error than predicting a benign case as malignant. However, a deeper discussion about the proper evaluation of classifiers is beyond the scope of this notebook.
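
The same numbers can also be computed without the plotting helper. Here is a minimal sketch using scikit-learn's metrics module (only accuracy_score and confusion_matrix, both part of sklearn.metrics):

In [ ]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Accuracy = fraction of test samples with a correct prediction
print(f"Accuracy: {accuracy_score(y_bcc_test, y_bcc_pred):.3f}")

# Rows = true classes (benign, malignant), columns = predicted classes
print(confusion_matrix(y_bcc_test, y_bcc_pred))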

Classification using Standardized Data¶

If we only care about using a trained model to make predictions, we could stop here. However, important insights also come from trying to understand the model. One of the most fundamental things we often want to know is which features have the most effect on the model's predictions, and which features have little or none. For example, if Feature $A$ is much more important than Feature $B$, and Feature $B$ is very challenging to collect, measure, or otherwise acquire, we could simply ignore Feature $B$. Or, for our Breast Cancer dataset, if the radius of a cell nucleus is a more indicative feature than its area for predicting whether a case is benign or malignant, this can guide where to best focus when trying to improve the data collection.

The importance of features in a trained Logistic Regression model can be determined directly from the coefficients (i.e., the weights), as the absolute value of the coefficient reflects the strength of a feature's influence on the model:

  • Features with larger absolute coefficients have a stronger effect.
  • Features with smaller absolute coefficients have a weaker effect.

However, the coefficients are only meaningfully comparable if the values of all features are of a similar range. For example, if one feature has values ranging from, say, $0.001$ to $0.01$, and another feature has values ranging from $100$ to $1,000$, their respective coefficients are likely to be of very different magnitudes. If the original data has features with values of very different magnitudes, we first have to perform feature scaling. A very common approach is Standardization (Z-score Normalization), a statistical technique used to scale data so that it has a mean of 0 and a standard deviation of 1. This method ensures that features with different units or ranges are normalized to a consistent scale, which is particularly important in machine learning algorithms that are sensitive to feature scaling, such as Logistic Regression. For each feature $x$ in the dataset, z-score normalization transforms the value of each data point $x_i$ as follows:

$$\large z_i = \frac{x_i - \mu}{\sigma} $$

Where:

  • $z_i$ is the normalized value for the data point $x_i$
  • $x_i$ is the original value of the data point
  • $\mu$ is the mean of the feature values
  • $\sigma$ is the standard deviation of the feature values

After normalization the mean of the transformed feature is $0$, and the standard deviation of the transformed feature is $1$. The StandardScaler class in the scikit-learn library is a preprocessing tool used to perform z-score normalization on a dataset.

In [34]:
sc = StandardScaler()

X_bcc_train_scaled = sc.fit_transform(X_bcc_train)
X_bcc_test_scaled = sc.transform(X_bcc_test)
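
As a quick sanity check, the sketch below (using only NumPy and the arrays from the previous cell) verifies that each standardized training feature now has a mean close to 0 and a standard deviation close to 1, and that fit_transform() corresponds to the z-score formula above:

In [ ]:
# Each standardized training feature should have mean ~0 and standard deviation ~1.
print(X_bcc_train_scaled.mean(axis=0).round(2))
print(X_bcc_train_scaled.std(axis=0).round(2))

# Manual z-score normalization gives (up to floating-point error) the same result.
X_manual = (X_bcc_train - X_bcc_train.mean(axis=0)) / X_bcc_train.std(axis=0)
print(np.allclose(X_manual, X_bcc_train_scaled))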

After standardizing all features, the remaining steps are the same. First, training the model...

In [35]:
model_bcc_scaled = LogisticRegression(max_iter=10000).fit(X_bcc_train_scaled, y_bcc_train)

Secondly, predicting the class labels for all test samples...

In [36]:
y_bcc_scaled_pred = model_bcc_scaled.predict(X_bcc_test_scaled)

Lastly, evaluating the model by plotting the confusion matrix...

In [37]:
plot_confusion_matrix(y_bcc_test, y_bcc_scaled_pred, classes=["benign","malignant"])
(Figure: confusion matrix of the standardized model's predictions on the test set)

If you compare both confusion matrices, you will notice some minor differences. However, in principle, the performance of a Logistic Regression model trained purely by minimizing the Cross-Entropy loss is not affected by whether or not the data is standardized. So why do we see some differences here? There are several possible reasons for that:

  • Impact on Optimization: Logistic regression models in scikit-learn typically use numerical optimization to find the coefficients that best fit the data. If the features are not standardized, their scales can vary significantly. This scale disparity makes the optimization landscape more irregular, potentially leading to slower convergence or suboptimal solutions. Standardization (e.g., scaling features to have zero mean and unit variance) ensures all features contribute equally to the optimization, making the process more efficient and robust.

  • Effect on Regularization: Scikit-learn's LogisticRegression applies L2 regularization by default (L1 and elastic net are also available) to prevent overfitting. Regularization penalizes large coefficients, but the penalty assumes that all features are on a similar scale. Without standardization, features with larger magnitudes disproportionately influence the penalty, skewing the learned coefficients. This can result in less accurate or interpretable models. Standardizing the data ensures the regularization term treats all features fairly, leading to more meaningful coefficients (a small comparison sketch follows after this list).

  • Numerical Stability: Standardization improves numerical stability in computation. When features have vastly different scales, the calculations required for matrix inversion or gradient updates in the optimization process may suffer from floating-point errors. Standardizing features mitigates this risk, ensuring more stable and reliable results.
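
To illustrate the regularization point, here is a hedged sketch (not part of the original workflow) that compares the coefficients of the default L2-regularized model with those of an unregularized one on the standardized training data. Note that penalty=None requires a fairly recent scikit-learn version; older releases use penalty="none" instead.

In [ ]:
# Compare coefficients with and without L2 regularization (default C=1.0).
# penalty=None is available in newer scikit-learn versions; older ones use "none".
model_l2 = LogisticRegression(max_iter=10000).fit(X_bcc_train_scaled, y_bcc_train)
model_noreg = LogisticRegression(penalty=None, max_iter=10000).fit(X_bcc_train_scaled, y_bcc_train)

print("With L2 regularization:", model_l2.coef_[0].round(2))
print("Without regularization:", model_noreg.coef_[0].round(2))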

Now that we trained a model using standardized data, we can look at the coefficients (i.e., the weights) to see which features are more important — that is, have larger effects on the model predictions. First, let's extract the weights for convenience.

In [38]:
weights = model_bcc_scaled.coef_[0]

weights is a simple array containing the coefficients of all features: $\text{weights} = [w_1, w_2, \dots, w_{10}]$, assuming we use only the first 10 features of the original dataset. Note that $w_0$ is missing here, as we are only interested in the weights associated with an input feature.
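
If you also want to inspect the bias term $w_0$, it is stored separately in the fitted model; a short sketch (reusing model_bcc_scaled from above):

In [ ]:
# The bias term w_0 lives in intercept_, the feature weights in coef_.
print("w_0 (intercept):", model_bcc_scaled.intercept_[0])
print("Shape of coef_:", model_bcc_scaled.coef_.shape)  # (1, number_of_features)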

The code cell below connects the feature names with the weights as well as with the absolute values of the weights. We need the absolute values of the weights later to determine a feature's importance: the importance of a feature does not depend on the sign of its weight but only on its absolute value.

In [39]:
# Create a list of tuples (name, weight, abs(weight)) for all features
features = [ (feature, weights[i], np.abs(weights[i])) for i, feature in enumerate(df_bcc.columns[0:10]) ]

# Print the 3 values for the first 5 tuples
for feature in features[:5]:
    print(feature)
('radius1', 1.0573166077553142, 1.0573166077553142)
('texture1', 1.2786172974665637, 1.2786172974665637)
('perimeter1', 1.0363185603156182, 1.0363185603156182)
('area1', 1.0612367302087162, 1.0612367302087162)
('smoothness1', 0.5040887984602725, 0.5040887984602725)

Lastly, we can sort all features by the absolute values of their weights, reflected by the 3rd value in the tuple for each feature. Sorting a list of tuples based on the value of some tuple entry is very easy to do in Python. The code cell below accomplishes this for our use case.

In [40]:
features_ranked = sorted(features, key=lambda x: x[2], reverse=True)

So let's have a look at the feature ranking:

In [41]:
for feature in features_ranked:
    print(f"[Importance: {feature[2]:.2f}] {feature[0]}")
[Importance: 1.68] concave_points1
[Importance: 1.28] texture1
[Importance: 1.09] concavity1
[Importance: 1.06] area1
[Importance: 1.06] radius1
[Importance: 1.04] perimeter1
[Importance: 0.50] smoothness1
[Importance: 0.36] fractal_dimension1
[Importance: 0.29] symmetry1
[Importance: 0.12] compactness1

The results show that the feature concave_points1 (i.e., its value for a data sample) has the most effect on the model's predictions. In contrast, compactness1 has only a limited effect on the predictions.
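
As a final illustration of how such a ranking might be used in practice, the hedged sketch below (reusing the scaled data and features_ranked from above; helper names are purely illustrative) retrains the model with only the five highest-ranked features and reports the test accuracy:

In [ ]:
from sklearn.metrics import accuracy_score

# Indices (within the first 10 columns) of the 5 highest-ranked features
feature_names = list(df_bcc.columns[0:10])
top5_idx = [feature_names.index(name) for name, weight, abs_weight in features_ranked[:5]]

# Retrain and evaluate using only these 5 features
model_top5 = LogisticRegression(max_iter=10000).fit(X_bcc_train_scaled[:, top5_idx], y_bcc_train)
y_top5_pred = model_top5.predict(X_bcc_test_scaled[:, top5_idx])

print(f"Test accuracy with the top 5 features: {accuracy_score(y_bcc_test, y_top5_pred):.3f}")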


Summary¶

Logistic Regression is a statistical method used for modeling the probability of a binary or categorical outcome based on one or more predictor variables. Despite its name, it is not a regression technique in the traditional sense but rather a classification algorithm. Logistic Regression uses a Logistic function (or sigmoid curve) to map any real-valued input to a probability between 0 and 1, making it ideal for binary classification tasks like spam detection, medical diagnosis, and credit risk assessment.

The model estimates the relationship between the independent variables (features) and the dependent variable (outcome) using a set of coefficients. These coefficients are found by maximizing the likelihood of observing the given data, a process known as Maximum Likelihood Estimation (MLE). Logistic Regression can handle both linear relationships and, with feature engineering or extensions, non-linear relationships, making it versatile and applicable to various scenarios.

Learning Logistic Regression is important because it serves as a foundational tool in both statistics and machine learning. It is simple yet effective, offering interpretable results that help in understanding how different factors influence the predicted outcome. For example, the coefficients in Logistic Regression can reveal the relative importance of features in determining the outcome, which is especially valuable in fields like healthcare or social sciences, where interpretability is critical.

Moreover, Logistic Regression introduces key concepts like regularization, likelihood functions, and classification thresholds, which are fundamental to more advanced machine learning algorithms. It also demonstrates the importance of preprocessing techniques, such as standardization and feature scaling, which are common practices across many models. By learning Logistic Regression, practitioners gain a strong conceptual and practical foundation for tackling real-world classification problems and progressing to more sophisticated methods.

In [ ]: