Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.
Linear Regression¶
Linear Regression is a fundamental statistical and machine learning model used to understand and predict relationships between variables. At its core, Linear Regression models the relationship between an independent variable (or variables) and a dependent variable, assuming a linear relationship between them. This simple yet powerful technique helps us make sense of data by identifying patterns and trends, which can be crucial in fields as diverse as economics, biology, engineering, and social sciences. By finding the "line of best fit" through a set of data points, Linear Regression allows us to make predictions, providing insight into the dynamics of real-world phenomena.
The significance of Linear Regression lies not only in its simplicity but also in its interpretability. As one of the first algorithms introduced in machine learning courses, Linear Regression is accessible to learners new to data science and is often used as a building block for more advanced models. It allows us to quantify the relationship between variables and measure how changes in one can affect another. For instance, in predicting housing prices, Linear Regression can help model how factors like location, size, and age influence the price of a property. This ease of interpretation makes Linear Regression highly useful in fields where understanding the nature of relationships is as important as making accurate predictions.
Moreover, learning Linear Regression is essential because it introduces concepts that are foundational to more complex algorithms, such as regularization techniques, bias-variance tradeoff, and model evaluation metrics. These concepts play a crucial role across machine learning methods, making Linear Regression a natural entry point for these important ideas. While modern machine learning includes advanced techniques such as neural networks and ensemble methods, Linear Regression remains a valuable tool, especially when data is limited or when computational simplicity is desired.
In short, Linear Regression is a versatile and interpretable model that provides valuable insights into the relationships between variables. Its broad applications, from predicting economic trends to studying scientific data, make it one of the most widely used and foundational techniques in data analysis and machine learning.
Setting up the Notebook¶
Make Required Imports¶
This notebook requires the import of different Python packages but also additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been successfully imported.
from src.utils.libimports.linreg import *
from src.utils.plotting.linreg import *
from src.utils.data.files import *
Download Required Data¶
Some code examples in this notebook use data that first needs to be downloaded by running the code cell below. If this code cell throws any error, please check in the configuration file config.yaml whether the URL for downloading datasets is up to date and matches the one on Github. If not, simply download or pull the latest version from Github.
file_hdb_resale, data_folder = download_dataset("tabular/regression/example-hdb-resale-price-prediction.csv")
csi_example, data_folder = download_dataset("tabular/regression/example-csi-height-prediction.csv")
File 'data/datasets/tabular/regression/example-hdb-resale-price-prediction.csv' already exists (use 'overwrite=True' to overwrite it). File 'data/datasets/tabular/regression/example-csi-height-prediction.csv' already exists (use 'overwrite=True' to overwrite it).
Motivating Example¶
Let's assume you want to buy an apartment and are browsing listings on a property website. Each available listing for an apartment has a price. However, assuming you are not an expert in the property market, it is not obvious whether the listed price is actually a good price (i.e., a bargain) or a bad price (i.e., a rip-off). Of course, you know that apartments typically get more expensive if they are
- newer
- larger (size of living area)
- higher (in case of high-rise buildings)
- have more bedrooms / bathrooms
- closer to the city center
- etc.
but it is not really (i.e., quantitatively) clear how those features of an apartment affect its price. For example, is an apartment that has twice the living size of another apartment also twice as expensive on average? Is it more or less than twice as much? Also, just because it is a common assumption, does the distance to the city center really matter that much in reality? Having a good model to describe the relationship between the features of apartments and their prices allows us to address two core tasks:
Price Prediction: If we have learned such a model, we can apply it to other apartments for which we do not know the true price to get (hopefully) a good estimate. This could be very useful information for an apartment owner to decide on a meaningful asking price when putting the apartment up for sale. In contrast, a buyer can use the model to assess whether the asking price for an apartment listing might be a good deal by checking if the asking price is below the predicted estimate or not.
Feature Importance: Apart from making good predictions, a good model also provides some insights into what the model has learned. One of the most common and useful insights is the importance of features. For example, the model might be able to tell us that the distance to the city center is in fact not an important predictor for the price of an apartment. Sellers can use this information to highlight any features in a listing that are more likely to attract buyers, while buyers can assess how much money they can potentially save by focusing on relevant features (e.g., by moving from 3-bedroom to 2-bedroom apartments).
There is a wide range of such regression models that find a function or mapping between features and a numerical target outcome. Linear Regression is a model that uses a linear equation to describe the relationship between, here, the features of apartments and their prices.
Model Definition¶
Linear regression is a statistical model to estimate or describe the relationship between a dependent variable $y$ and one or more independent variables $x_1$, $x_2$, ..., $x_d$. The model is linear as the relationship between $y$ and all $x_i$ is described by the following linear equation:
$$\large y_i = w_{0} + w_{1}x_{i1} + w_{2}x_{i2} + w_{3}x_{i3} + \dots + w_{d}x_{id} + e_{i} $$with $i = 1,..., n$, where
- $n$ is the number of observations (i.e., the number of records in our dataset)
- $y_i$ is the $i$-th response (e.g., the resale price of an apartment)
- $w_k$ is the $k$-th coefficient, with $w_0$ being the constant term in the model called bias, offset, or intercept
- $x_{ij}$ is the $j$-th predictor variable (e.g., the size of an apartment, or the floor height) of the $i$-th observation
- $e_i$ is the noise term, disturbance term, or error variable of the $i$-th observation
The noise terms $e_{i}$ are intrinsic to any real-world data and, of course, random. Therefore, when building a Linear Regression model (e.g., for predicting the $y$ for a new record that was not part of our training data) we have to make the important assumption that the noise terms, on average, cancel out. In mathematical terms, we assume that the expected value of the noise terms is $E(e_i) = 0$. That means that the model will typically predict $y$ values (e.g., the price of an apartment) that are too high or too low compared to the true values for the records in the data, but across all training records, those errors cancel out. With this important assumption, we can use a Linear Regression model to calculate the $estimated$ or $predicted$ output $\hat{y}_{i}$ for the $i$-th data record as follows:
$$\large \hat{y}_{i} = E(y_{i}) = w_{0} + w_{1}x_{i1} + w_{2}x_{i2} + w_{3}x_{i3} + \dots + w_{d}x_{id} $$In other words, the estimate or prediction of our model is the expected value for our $i$-th record given our assumption of $E(e_i) = 0$.
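To make this concrete, here is a minimal numeric sketch of such a prediction for a single apartment. The weights and feature values below are made up purely for illustration and are not estimated from any real dataset.
# Hypothetical weights (illustrative only): bias, weight for size in sqm, weight for floor number
w0, w1, w2 = 50000.0, 4000.0, 1500.0
# Hypothetical feature values of one apartment: 90 sqm, 10th floor
x1, x2 = 90.0, 10.0
# Prediction for this single record: y_hat = w0 + w1*x1 + w2*x2
y_hat = w0 + w1 * x1 + w2 * x2
print(y_hat)  # 50000 + 360000 + 15000 = 425000.0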
Important: The Linear Regression model relies on multiple important underlying assumptions. We will cover those in a separate notebook.
Bias Trick¶
Before we address the task of finding the best weight values $w_0$, $w_1$, $w_2$, ..., $w_d$, let's first try to make the math a bit easier to work with. For example, we can write our initial equation as follows:
$$ \begin{align} \large y_i &= \large w_{0} + w_{1}x_{i1} + w_{2}x_{i2} + w_{3}x_{i3} + \dots + w_{d}x_{id} + e_{i} \\[0.5em] &= \large w_{0} + \sum_{j=1}^{d} w_{j}x_{ij} + e_{i} \end{align} $$Notice how weight $w_0$ kind of "sticks out" as it is not associated with any feature value. $w_0$ is called the bias, or the intercept, or the offset. The bias term is essential because it allows the model to accurately represent the relationship between input features and the target variable. Without a bias term, the regression line would be forced to pass through the origin, meaning that when all features are zero, the target prediction would also be zero. This assumption is often unrealistic, as the relationship between features and the target variable rarely aligns with the origin, especially in complex datasets. The bias term enables the model to shift vertically, giving it more flexibility to fit the data accurately. Of course, for our motivating example using apartment prices this might not be as relevant since an apartment where all feature values are 0 (e.g., including its size) is not meaningful.
While the bias $w_0$ is in some sense a special weight, we would like to treat it like any other weight to make our notation even simpler. We can accomplish this by introducing a new artificial feature $x_{i0}$ for all data samples. To ensure that this new feature does not change the equations, we need to set all $x_{i0} = 1$. Thus, we can rewrite our equation:
$$ \begin{align} \large y_i &= \large w_{0}x_{i0} + w_{1}x_{i1} + w_{2}x_{i2} + w_{3}x_{i3} + \dots + w_{d}x_{id} + e_{i} \\[0.5em] &= \large \sum_{j=0}^{d} w_{j}x_{ij} + e_{i} \end{align} $$with $x_{i0} = 1$. Notice that the index $j$ of the sum now starts at zero as it also includes the bias term. If we treat all features $x_{i0}$, $x_{i1}$, $x_{i2}$, ..., $x_{id}$ as a feature vector $\mathbf{x}_i$, and all corresponding weights $w_0$, $w_1$, $w_2$, ..., $w_d$ as a weight vector $\mathbf{w}$, we can write the sum as the dot product between this feature vector $\mathbf{x}_{i}$ and the weight vector $\mathbf{w}$:
$$\large y_i = \mathbf{w}^{T}\mathbf{x}_{i} + e_{i} $$with
$$\large \mathbf{w} = \begin{bmatrix} w_{0} \\ w_{1} \\ w_{2} \\ \vdots \\ w_{d} \end{bmatrix}\ , \qquad \mathbf{x}_{i} = \begin{bmatrix} x_{i0} \\ x_{i1} \\ x_{i2} \\ \vdots \\ x_{id} \end{bmatrix} = \begin{bmatrix} 1 \\ x_{i1} \\ x_{i2} \\ \vdots \\ x_{id} \end{bmatrix} $$where $\mathbf{x}_{i}$ and $\mathbf{w}$ are vectors of $(d+1)$ elements, that is, $\mathbf{x}_{i} \in \mathbb{R}^{d+1}$ and $\mathbf{w} \in \mathbb{R}^{d+1}$. Recall that $d$ is the number of original input features, but we also introduced the artificial feature $x_{i0}$ to accommodate the bias $w_0$.
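As a small sketch (with arbitrary illustrative numbers), the bias trick simply means prepending the constant feature $x_{i0} = 1$ to every feature vector so that the whole linear combination, including the bias, becomes a single dot product:
# Sketch of the bias trick with arbitrary illustrative numbers
w = np.array([70.0, 3.6])           # weight vector [w0, w1]
x_raw = np.array([30.0])            # original feature vector with a single feature
x = np.concatenate(([1.0], x_raw))  # prepend the artificial feature x0 = 1
# w^T x now includes the bias automatically: 70 + 3.6 * 30 = 178
print(np.dot(w, x))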
Matrix Notation¶
So far, we only considered the linear equation for Linear Regression with respect to a single data sample $i$. Of course, we want to find the best weight vector $\mathbf{w}$ with respect to all the data samples in our training dataset. This gives us the following set of linear equations:
$$ \begin{align} \large y_1 &= \large \mathbf{w}^{T}\mathbf{x}_{1} + e_{1}\\[0.5em] \large y_2 &= \large \mathbf{w}^{T}\mathbf{x}_{2} + e_{2}\\[0.5em] \large y_3 &= \large \mathbf{w}^{T}\mathbf{x}_{3} + e_{3}\\[0.5em] \vdots\\[0.5em] \large y_n &= \large \mathbf{w}^{T}\mathbf{x}_{n} + e_{n} \end{align} $$where $n$ is the number of training data samples. In this system of equations, all $y_i$ and $\mathbf{x}_i$ are constant as they come directly from training data. All errors $e_i$ are also constant, although we do not need their values — when training a Linear Regression model, we ignore the errors by relying on the assumption that $E(e_i) = 0$, as we will see later.
In short, we now have a system of linear equations where the number of equations is $n$ and the number of unknowns is $(d+1)$, that is, all weights $w_0$, $w_1$, $w_2$, ..., $w_d$ in the weight vector $\mathbf{w}$. As long as we have more data samples than features (i.e., $n > d$, which is the same as $n \geq d+1$), there is a unique choice of $\mathbf{w}$ that best fits this system of linear equations (important: we also have to assume that the equations are linearly independent; we discuss what this means for using Linear Regression in practice later).
We can now write this system of linear equations using matrix notation to get the final equation to be solved to train or fit a Linear Regression model:
$$\large \mathbf{y} = \mathbf{X}\mathbf{w} + \mathbf{e} $$with
$$\large \mathbf{y} = \begin{bmatrix} y_{1} \\ y_{2} \\ y_{3} \\ \vdots \\ y_{n} \end{bmatrix}\ , \qquad \mathbf{X} = \begin{bmatrix} 1 & x_{11} & x_{12} & \dots & x_{1d}\\ 1 & x_{21} & x_{22} & \dots & x_{2d} \\ 1 & x_{31} & x_{32} & \dots & x_{3d} \\ \vdots & \vdots & \vdots & \ddots & \vdots\\ 1 & x_{n1} & x_{n2} & \dots & x_{nd} \end{bmatrix} \qquad \mathbf{w} = \begin{bmatrix} w_{0} \\ w_{1} \\ w_{2} \\ \vdots \\ w_{d} \end{bmatrix} \quad , \text{and} \qquad \mathbf{e} = \begin{bmatrix} e_{1} \\ e_{2} \\ e_{3} \\ \vdots \\ e_{n} \end{bmatrix} $$Training a Linear Regression model is finding the most suitable weight vector $\mathbf{w}$. This also means that a Linear Regression model can be completely described by $\mathbf{w}$ — this makes Linear Regression a parametric machine learning model.
Given that the estimate or prediction of our model is the expected value for our $i$-th record given our assumption of $E(e_i) = 0$, we can therefore formulate the equation for the predictions $\hat{\mathbf{y}}$ as follows:
$$\large \hat{\mathbf{y}} = E(\mathbf{y}) = \mathbf{X}\mathbf{w} $$Worked Example: Toy Dataset¶
Throughout this notebook, we will use a very simple dataset to illustrate the concept of Linear Regression. The scenario is a Crime Scene Investigation: Let's assume we are called to a crime scene and find a shoe print of the suspect. However, the shoe print size is generally not very useful information when looking for suspects or asking the public for hints. Much more common is to publicize the height of a person. Now, arguably, taller people also have larger feet and therefore wear larger shoes, and it is not unreasonable to assume a linear relationship between a person's height and his or her shoe size. So how can we find a good estimate of the suspect's height given the shoe print we found?
Answering this question can be modeled using Linear Regression. In some sense, it is the simplest setup since we have only a single independent variable (i.e., the shoe print size); the dependent variable is, of course, the height of the person. To actually model the relationship between the shoe print size and height, we now need a dataset $\mathbf{X} = \{(y_i, \mathbf{x}_i)\}^n_{i=1}$, with $n$ records containing the height $y_i$ and the shoe print size $x_i$ for a record $i$. Keep in mind that, in general, $\mathbf{x}_i = (x_{i1}, x_{i2}, ..., x_{id})$, that is, each record has $d$ features (i.e., independent variables). Since we only have a single feature here, we simply use $x_i$ to denote this feature (i.e., the shoe print size). This means that our equation for this scenario simplifies to:
$$\large y_i = w_{0} + w_{1}x_{i} + e_{i} $$We can still use the matrix notation to have a single equation for all $n$ data samples:
$$\large \mathbf{y} = \mathbf{X}\mathbf{w} + \mathbf{e} $$with
$$\large \mathbf{y} = \begin{bmatrix} y_{1} \\ y_{2} \\ y_{3} \\ \vdots \\ y_{n} \end{bmatrix}\ , \qquad \mathbf{X} = \begin{bmatrix} 1 & x_{1}\\ 1 & x_{2}\\ 1 & x_{3}\\ \vdots & \vdots \\ 1 & x_{n} \end{bmatrix} \qquad \mathbf{w} = \begin{bmatrix} w_{0} \\ w_{1} \end{bmatrix}\ \quad , \text{and} \qquad \mathbf{e} = \begin{bmatrix} e_{1} \\ e_{2} \\ e_{3} \\ \vdots \\ e_{n} \end{bmatrix} $$Of course, the data matrix $\mathbf{X}$ has only two columns, one for our single feature $x$ (shoe print size) and one for the constant feature for the bias. The vector $\mathbf{w}$ also reduces to only two elements, the bias $w_0$ and the weight or coefficient for feature $x$. Since we only have one feature here, we omit the second index. That means that $x_i$ represents the feature value for the $i$-th data sample.
In practice, we would create such a dataset $X$ by measuring the shoe print size $x$ and height $y$ for a random set of people. Since both shoe print sizes and heights of adults are unlikely to vary that much, we probably would not need to find too many people. We only need to ensure that our set contains people of varying height. However, for this notebook, we use a different approach: We pretend that we already know the relationship between $x$ and $y$. In other words, we know the true values $w_{0}$ and $w_{1}$, as well as the distribution of the random error variable $e$. More specifically, we assume:
- $w_{0} = 70$
- $w_{1} = 3.6$
- $e_{i} \sim \mathcal{N}(0, 5)$
The distribution of the random variable being a normal distribution with mean 0 and a fixed standard deviation $\sigma$ for all records reflects our important assumption of $E(e_i) = 0$.
So let's define these three important values for $w_{0}$, $w_{1}$, and $\sigma$:
w0_true = 70
w1_true = 3.6
w_true = np.asarray([w0_true, w1_true])
sigma = 5
Important: Again, keep in mind that in practice we are actually looking for the values of $w_{0}$ and $w_{1}$. Here we assume those values are given to us to generate a toy dataset. Note that these values are in line with a real-world dataset, so they are not completely randomly chosen. However, feel free to play with those values to modify the generated toy dataset.
We first create a set of 20 random shoe print sizes measured in centimeters. The minimum size (27 cm) and maximum size (35 cm) reflect a common range for the shoe print sizes of adults. In the code cell below, we use the function np.random.uniform to uniformly sample 20 shoe print sizes between 27 and 35 centimeters. We also round each size to one decimal place.
# Set seed to ensure reproducible results
np.random.seed(1)
# Generate 20 shoe print sizes between 27 and 35 centimeters
num_samples = 20
X_data = np.random.uniform(27, 35, num_samples).round(1)
# Create artificial feature x0 (all values 1) for bias w0
x0 = np.ones(X_data.shape[0])
# Add x0 to initial data matrix
X = np.vstack([x0, X_data]).T
# Print the array for the 20 random shoe print sizes
print(X_data)
[30.3 32.8 27. 29.4 28.2 27.7 28.5 29.8 30.2 31.3 30.4 32.5 28.6 34. 27.2 32.4 30.3 31.5 28.1 28.6]
With the array of shoe print sizes as well as our values for $w_{0}$, $w_{1}$, and $\sigma$, we can now create a height value for each shoe print size. We do this by simply evaluating the equation:
$$\large \mathbf{y} = \mathbf{X}\mathbf{w} + \mathbf{e} $$The code cell below accomplishes this. We use the method np.random.normal to randomly sample the error variables from a Normal Distribution with a mean $\mu=0$ and a standard deviation of $\sigma = 5$. Again, we round all values to one decimal place.
# Set seed to ensure reproducible results
np.random.seed(2)
# Generate random errors
errors = np.random.normal(0, sigma, len(X))
# Calculate y based on the X and the known linear relationship + include the errors
y = (np.dot(X, w_true) + errors).round(1)
# Print the array for the 20 computed heights based on the given linear relationship between x and y
print(y)
[177. 187.8 156.5 184. 162.6 165.5 175.1 171.1 173.4 178.1 182.2 198.5 173.2 186.8 170.6 183.7 179. 189.3 167.4 173. ]
Now that we have both X and y — and given that our dataset has only one feature — we can also plot the dataset using a simple scatter plot (see the Data Visualization section below). The x axis in the plot reflects the shoe print sizes, and the y axis reflects the corresponding heights. Of course, for different values of $w_{0}$, $w_{1}$, and $\sigma$, but also for different random seeds, the dataset and therefore the plot will be different.
Save Data for Later Use¶
The code cell below stores the data of our CSI example use case as a csv file for later use in a different notebook. We use the pandas package to make things easier for us.
# Combine the feature and output array into a single data matrix
data = [ (X_data[i], y[i]) for i, y_i in enumerate(y) ]
# Convert data matrix into a pandas DataFrame (not needed but simplifies the saving)
df = pd.DataFrame(data, columns=['shoe print size', 'height'])
# We can also have a quick look into the DataFrame to check if the data looks alright
df.head()
|   | shoe print size | height |
|---|---|---|
| 0 | 30.3 | 177.0 |
| 1 | 32.8 | 187.8 |
| 2 | 27.0 | 156.5 |
| 3 | 29.4 | 184.0 |
| 4 | 28.2 | 162.6 |
A DataFrame comes with different auxiliary methods to save its content to a file. One of those methods is to_csv() which, as the name suggests, stores a DataFrame as a csv file.
df.to_csv(f"{data_folder}my-csi-example-data.csv", index=None)
Data Visualization¶
Since our CSI example dataset has only a single feature, we can easily visualize the data distribution using a scatter plot. To this end, we provide the method plot_csi_data(); you can have a look into the file src/utils.py to check out the code of the method, but that is not required.
plot_csi_data(X_data, y)
A very important observation from the plot is that the relationship between the shoe print size and the height of a person is indeed roughly linear. This is an important assumption, and we will cover it in more detail in a separate notebook.
Now, recall from the definitions above that a Linear Regression model tries to find a line that best fits the data (of course, it is only a line here since we have a single input feature; for two features, the model gives us a plane, and for more features a hyperplane). Since we know the true values for $w_{0}$ and $w_{1}$, we can actually also include the regression line in the plot. Again, we can use the auxiliary method plot_csi_data() for this:
plot_csi_data(X_data, y, regression_lines=[(w_true, "red", "True Relationship")])
Not surprisingly, the line representing the true relationship between the input feature (shoe print size) and target output (height) reflects the overall trend of the data distribution.
Recall that all the predictions $\hat{y}_i$ of a Linear Regression model will lie on that regression line (or plane, or hyperplane). Since the blue data points reflect our true output values $y_i$ from our dataset, we can now also plot the errors $e_i$ that our model makes for each sample in the dataset. Each error $e_i$ for a data sample $\mathbf{x}_i$ is the difference between the true value $y_i$ and the predicted value $\hat{y}_i$. Let's again visualize this by plotting the data samples, the regression line, as well as all the errors represented by dashed black lines:
plot_csi_data(X_data, y, regression_lines=[(w_true, "red", "True Relationship")], show_errors=True)
Just by looking at this plot, it seems intuitive that the best regression line — that is, the line that best fits the overall data distribution — is the line that minimizes the errors between the predicted and true values. And we can also see that, in practice, the average error will never be $0$, as this would require that all data samples lie exactly on the regression line. As for our CSI example data, this is arbitrarily unlikely to ever happen when working with real-world data.
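To connect the plot back to the numbers, the optional sketch below computes these errors explicitly as the differences between the measured heights and the heights predicted by the true regression line (it re-uses the variables X, y, and w_true defined above):
# Errors w.r.t. the true regression line (optional sketch using variables defined above)
y_hat_true = np.dot(X, w_true)  # predictions that lie exactly on the true line
errors_true = y - y_hat_true    # e_i = y_i - y_hat_i, the dashed lines in the plot above
print(errors_true.round(1))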
However, we are now ready to train or fit a Linear Regression model.
Training a Linear Regression Model¶
Recall that, given a feature matrix $\mathbf{X}$, the equation for our predictions or estimates $\hat{\mathbf{y}}$ is:
$$\large \mathbf{\hat{y}} = \mathbf{X}\mathbf{w} $$It is obvious from the formula that the quality of the predictions depends on the choice of the weight vector $\mathbf{w}$. Finding the best choice of $\mathbf{w}$ for a given dataset is referred to as training a Linear Regression model and requires two main components:
a loss function to quantify how good or bad a current choice of $\mathbf{w}$ is, and
a systematic way to find the $\mathbf{w}$ that minimizes the loss.
Loss Function: How good are our weights $\mathbf{w}$?¶
From the definitions of a Linear Regression model, we know that each record in our training data gives us two values:
$y_{i}$: the $true$ value of the dependent or output variable of the $i$-th record (e.g., the price of an apartment or the height of a person)
$\hat{y}_{i}$: the $estimated$ or $predicted$ value of the dependent or target variable of the $i$-th record
Of course, ideally, we would like our predictions to equal the true values, i.e., $\hat{y}_i = y_i$. For our CSI example, this would mean that all measured heights would indeed lie on a line. However, real-world data will never be that perfect, where "perfect" here refers to no noise/error and 100% adherence to all underlying assumptions (again, check out the separate notebook focusing on those assumptions). This means that we are looking for weights $\mathbf{w}$ that yield predictions $\hat{y}_{i}$ close to the true values $y_{i}$. Since we want to find those best weights algorithmically, we need a way to quantify the difference between the predicted and the true values with respect to a current set of weights $\mathbf{w}$. We do this in terms of a loss function L (also called error function or cost function):
$$\large L(\hat{\mathbf{y}}, \mathbf{y}) = \text{"single value reflecting the difference between estimates/predictions and true values"} $$With $\hat{y}_{i}$ and $y_{i}$ being simple numerical values, quantifying the difference between those two values is straightforward. The most common approach is the Mean Squared Error (MSE) Loss, defined as follows:
$$\large L_{MSE}(\hat{\mathbf{y}}, \mathbf{y}) = \frac{1}{n}\sum_i^n (\hat{y}_i - y_i)^2 = \frac{1}{n}\sum_i^n e_{i}^{2} $$In plain words, the MSE loss is the average over all squared error terms $e_{i}$ for all $n$ records in the training data. By replacing $\hat{y}_{i}$ according to our formula, we get:
$$\large L_{MSE}(\hat{\mathbf{y}}, \mathbf{y}) = \frac{1}{n}\sum_i^n \left[ (w_{0} + w_{1}x_{i1} + w_{2}x_{i2} + w_{3}x_{i3} + \dots + w_{d}x_{id}) - y_{i}\right]^2 $$Again, we can also write the MSE loss using matrix notation as follows:
$$\large L_{MSE}(\hat{\mathbf{y}}, \mathbf{y}) = \frac{1}{n} \lVert \mathbf{X}\mathbf{w} - \mathbf{y} \rVert_{2}^{2} $$where $\lVert \cdot \rVert_{2}$ is the Euclidean norm with $\lVert x \rVert_{2} = \sqrt{x_{1}^{2} + x_{2}^{2} + \dots + x_{p}^{2}}$ for a vector $x = (x_{1}, x_{2}, \dots, x_{p})$
The code cell below shows a simple implementation of the MSE loss function using numpy:
def calculate_mse(X, y, w):
# Calculate weighted sum of data and weight vector using the dot product
y_hat = np.dot(X, w)
# Calculate and return the MSE using NumPy methods to implement the MSE formula
return np.sum(np.square(y - y_hat)) / len(X)
Since we know the true linear relationship described by w_true, we can use the method above to calculate the MSE loss for w_true:
mse_true = calculate_mse(X, y, w_true)
print("The mse loss w.r.t. w0={:.1f}, w1={:.1f} is {:.2f})".format(w0_true, w1_true, mse_true))
The mse loss w.r.t. w0=70.0, w1=3.6 is 30.63)
We can also calculate the MSE loss for other choices of $\mathbf{w}$ — feel free to change the values for w0_random and w1_random below and see how it affects the MSE loss.
# Pick some random weights (feel free to change those)
w0_random, w1_random = -20, 4.2
mse_random = calculate_mse(X, y, [w0_random, w1_random])
print("The mse loss w.r.t. w0={:.1f}, w1={:.1f} is {:.2f})".format(w0_random, w1_random, mse_random))
The mse loss w.r.t. w0=-20.0, w1=4.2 is 5067.81)
For most chosen values, you should see a significant increase in the MSE loss, which naturally indicates a worse choice of $\mathbf{w}$. However, there are some values of w0_random and w1_random that will yield a lower MSE loss than shown in the previous code cell. Probably unsurprisingly, those values will be quite similar to w0_true and w1_true. Still, you may be wondering why we can achieve a lower loss compared to using the $\textbf{w}$ that describes the true linear relationship between the input and the output. Think about it before reading on; we will come back to this later.
Minimizing the Loss¶
With the MSE loss function in place, we can now train or fit a Linear Regression model for a given training dataset. Training or fitting a parametric model such as Linear Regression means finding the best model parameters — in the case of Linear Regression and many other models, the set of weights — that minimize the loss function. The MSE loss function for Linear Regression is a very simple function in the sense that there exists a very straightforward way to find the best values for the model weights $\mathbf{w}$. However, we will look at three different approaches.
"Silly" Approach: Random Search¶
The MSE loss function tells us how good or bad a current choice for the set of weights $\mathbf{w}$ is. This implies that, if we have two sets of weights $\mathbf{w}$ and $\mathbf{w}^\prime$, we can determine which set yields a lower loss and is therefore the better choice. We can build on this basic observation by testing a large range of different sets of weights and keeping track of which set yields the lowest loss.
The code cell below accomplishes this by testing 1,000 different and (kind of) random choices for $w_0$ and $w_1$ for our CSI dataset. In each iteration, we randomly sample a value for $w_0$ and $w_1$ and calculate the MSE loss using our previously defined function calculate_mse(). If the loss for the current set of weights is lower than the lowest loss so far, we update the variables w0_random_best, w1_random_best, and mse_random_best to keep track of the best set of weights (and the corresponding loss). Of course, after the 1,000 iterations, the variables w0_random_best and w1_random_best will contain the values of the best weights across all iterations.
Your turn: Try different values for num_iterations (e.g.: 10; 100; 10,000) and see how the result changes. In general, the lower the number of iterations, the less likely we are to sample a (very) good set of weights, resulting in a suboptimal loss. In contrast, when increasing the number of iterations, you should observe that the loss starts to converge to the true minimum.
# Set seed to ensure reproducible results
np.random.seed(0)
num_iterations = 1000
# Keep track of all data points for a plot
xs, ys, zs = [], [], []
# Initialize parameters
w0_random_best, w1_random_best, mse_random_best = None, None, float("inf")
for i in range(num_iterations):
# Select a random value for weights w
w0_random = np.random.uniform(0.0, 100.0, 1)[0]
w1_random = np.random.uniform(0.0, 6.0, 1)[0]
w_random = np.asarray([w0_random, w1_random])
# Calculate loss for selected weights w
mse_random = calculate_mse(X, y, w_random)
# If the loss is lower than the currently best loss, remember all parameters
if mse_random < mse_random_best:
mse_random_best = mse_random
w0_random_best = w0_random
w1_random_best = w1_random
# Remember current parameter values and loss for plotting
xs.append(w0_random)
ys.append(w1_random)
zs.append(mse_random)
print("The best random values are: w0={:.3f}, w1={:.3f} (MSE loss={:.3f})".format(w0_random_best, w1_random_best, mse_random_best))
The best random values are: w0=53.618, w1=4.088 (MSE loss=28.553)
Notice that in the code cell above we kept track of all the sets of random weights together with their losses. We can therefore plot the results to visualize how the loss depends on the choice of $w_0$ and $w_1$. We provide you with the method plot_random_losses() for that. Just execute the code cell below to see the plot (note: the visualization arguably works best for 1,000 iterations, but you can still plot the results for more or fewer iterations).
plot_random_losses(xs, ys, zs)
The plot clearly shows that some choices of $w_0$ and $w_1$ are objectively better as they result in a small(er) loss. What is maybe not as obvious from the plot is that there exists a unique choice of weights $w_0$ and $w_1$ with the smallest possible loss. One thing you might have already noticed is that — at least for 1,000 iterations or more — the final loss is in fact smaller than the loss we got when using the $w_0$ and $w_1$ representing the true relationship between the input features and the output values. Again, think about why this is before we discuss it later.
Discussion. In principle, random search provides a way to find the best values for $\mathbf{w}$. But it is easy to see that this is not a scalable approach in practice for two main reasons:
Our CSI example use case has only a single feature (shoe print size). Thus, the set of $\mathbf{w}$ contains only two values, $w_0$ and $w_1$. For a dataset with $d$ features, we would need to find the optimal values for $(d+1)$ weights. The problem is that the search space grows exponentially with the number of features, making a random search prohibitively expensive.
In the example code above performing random search, the choices of $w_0$ and $w_1$ are not truly random as they were sampled from predefined and rather narrow ranges that were known to contain the best weight values. In practice, this information would not be available, and the sampling of the weights would need to be truly random, again vastly increasing the search space.
In short, random search is not a viable approach for training any parametric model, even for such a simple model as Linear Regression with only a (very) small number of features.
Analytical Approach: Normal Equation¶
While a random search to find the best values for $\mathbf{w}$ kind of works conceptually, it is very unsystematic and therefore not practical for real-world applications. In general, training or fitting a parametric model is treated as an optimization problem. For Linear Regression, as well as for many other machine learning models, this optimization refers to minimizing or maximizing some function. In the case of a loss function such as the MSE loss, we want to find the parameters — here: $\mathbf{w}$ — that minimize that function.
Just to give a quick recap, finding the minimum of a function is a well-defined task in calculus. To find the minimum of a multivariate function $f(w_0, w_1, w_2, \dots)$ using calculus, we typically perform the following three steps:
- (1) Find all 1st Partial Derivatives: Compute the partial derivatives of $f$ with respect to each variable $w_i$. This means that for a function $f(w_0, w_1, w_2, \dots)$, we need to find $\frac{\partial f}{\partial w_0}, \frac{\partial f}{\partial w_1}, \frac{\partial f}{\partial w_2}, \dots$
- (2) Set the Partial Derivatives to Zero and solve for all parameters: Setting all partial derivatives to zero gives us the following system of equations: $\frac{\partial f}{\partial w_0} = 0,\ \frac{\partial f}{\partial w_1} = 0,\ \frac{\partial f}{\partial w_2} = 0,\ \dots$
Assuming we have our $(d+1)$ parameters $w_i$ of our Linear Regression Model, this system contains $(d+1)$ equations that are all set to $0$. All solutions — that is, all combinations of values for $w_i$ that result in all partial derivatives being zero — indicate an extreme value (minimum, maximum, or saddle point) of function $f$.
- (3) Use the Second Partial Derivative Test: Step (2) only tells us if we have found an extreme value. This means we still need to determine if we indeed found a minimum (and not a maximum or saddle point). This is done by calculating the 2nd partial derivatives of function $f$ and checking if their values at the point of the extreme value are positive, negative, or zero. However, in the context of training parametric models such as Linear Regression, this last check is generally not required since we "know" that our extreme value must be a minimum. Think about why this is first, before we provide the answer at the end of the notebook.
Following this series of steps, we first need to calculate the 1st derivatives of the MSE loss function $L_{MSE}$ with respect to all parameters $w_i$. Recall that we have different ways to write the MSE loss function, either as the sum of errors over all data samples or using matrix notation:
$$\large L_{MSE}(\hat{\mathbf{y}}, \mathbf{y}) = \frac{1}{n}\sum_i^n (\hat{y}_i - y_i)^2 = \frac{1}{n} \lVert \mathbf{X}\mathbf{w} - \mathbf{y} \rVert_{2}^{2} $$Similarly, we can first compute the 1st partial derivative of $L_{MSE}$ with respect to any parameter $w_j$. If you are interested in the math behind it, we go through all steps in detail in a separate notebook. Here we directly provide the solution for the 1st derivative:
$$ \begin{align} \large \frac{\partial L_{MSE}}{\partial w_j} &= \large \frac{2}{n} \sum_{i=1}^n x_{ij} \left( \mathbf{w}^T\mathbf{x}_{i} - y_{i} \right) \end{align} $$We can, again, use the matrix notation to calculate the 1st derivative of $L_{MSE}$ with respect to $\mathbf{w}$, i.e., with respect to all parameters $w_i$, using a single formula:
$$ \large \frac{\partial L_{MSE}}{\partial \mathbf{w}} = \frac{2}{n}\mathbf{X}^T(\mathbf{X}\mathbf{w} - \mathbf{y}) $$With the first derivative in place, we can move to Step (2), which requires us to set the 1st partial derivative to $0$ and solve for $\mathbf{w}$. In other words, we have the equation
$$ \large \frac{2}{n}\mathbf{X}^T(\mathbf{X}\mathbf{w} - \mathbf{y}) = \mathbf{0} $$where $\mathbf{0}$ is the zero vector (i.e., a vector with all elements being $0$), and then solve for $\mathbf{w}$. Again, we cover the involved math in full detail in the dedicated notebook. Solving for $\mathbf{w}$ gives us the following equation:
$$\large \begin{align} \large \mathbf{w} &= \large \left(\mathbf{X}^T\mathbf{X} \right)^{-1}\mathbf{X}^T\mathbf{y}\\[0.5em] &= \mathbf{X}^{\dagger}\mathbf{y} \end{align} $$with $\mathbf{X}^{\dagger} = \left(\mathbf{X}^T\mathbf{X} \right)^{-1}\mathbf{X}^T$ called the pseudo-inverse of $\textbf{X}$. As a result, we now have a single equation — the so-called Normal Equation — that allows us to find the weights $\mathbf{w}$ that minimize our loss function $L_{MSE}$. Notice that on the right-hand side of the equation there are only variables that derive directly from our training dataset: the data matrix $\mathbf{X}$ containing the feature values of all data samples, and the vector $\mathbf{y}$ with all true output values.
Let's implement the Normal Equation in a Python method. With the methods provided by numpy for easily calculating the product of matrices and vectors, as well as calculating the inverse and transpose of matrices, this is a very simple task. The method fit_analytically() implements the Normal Equation step by step to make it easier to map the parts of the formula of the Normal Equation to the lines of code in the body of the method.
def fit_analytically(X, y):
# Calculate the dot product between the transpose of X and X
XTX = np.dot(X.T, X)
# Calculate the inverse of result of the dot product calculation
inverse = np.linalg.inv(XTX)
# Calculate the pseudo inverse
pseudo_inverse = np.dot(inverse, X.T)
# Calculate and return the final parameter values
return np.dot(pseudo_inverse, y)
We can now use this function to calculate the best values for the weights $\mathbf{w}$ for our CSI dataset.
w_analytical_fit = fit_analytically(X, y)
print("The best are: w0={:.3f}, w1={:.3f}".format(w_analytical_fit[0], w_analytical_fit[1]))
The best are: w0=49.466, w1=4.251
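As an optional sanity check (a sketch, not part of the original workflow), NumPy's built-in least-squares helpers should recover essentially the same weights as our step-by-step implementation; they are also generally more numerically stable than explicitly inverting $\mathbf{X}^T\mathbf{X}$:
# Optional cross-check using NumPy's built-in solvers (sketch)
w_pinv = np.dot(np.linalg.pinv(X), y)            # pseudo-inverse in a single call
w_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)  # least-squares solver
print("pinv: ", w_pinv.round(3))
print("lstsq:", w_lstsq.round(3))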
Consequently, we can also calculate the loss using the best values, i.e., the smallest loss possible for this given dataset:
mse_analytical_fit = calculate_mse(X, y, w_analytical_fit)
print("The the smallest loss is: {:.3f}".format(mse_analytical_fit))
The the smallest loss is: 27.939
When comparing the results with the true values for $w_0$ and $w_1$, you should notice two things:
- The learned weights $\mathbf{w}$ do not match the true weights $w_0=70$ and $w_1=3.6$
- The loss for the learned weights $\mathbf{w}$ is actually lower than the loss for the true weights
We can also visually confirm this by plotting the CSI dataset with both the true and the predicted regression lines to see the difference:
plot_csi_data(X_data, y, regression_lines=[(w_true, "red", "True Relationship"), (w_analytical_fit, "green", "Predicted Relationship")])
So what is going on? Recall that we created our CSI dataset with random noise, and we assume that the average error is zero, i.e., $E(e_i) = 0$. However, this will never be perfectly true in the real world. Even for our dataset, which we explicitly created with zero-mean noise (e.g., $e_{i} \sim \mathcal{N}(0, 5)$ by default), it is arbitrarily unlikely that the average error will be exactly 0, particularly for only a small number of samples.
Let's actually calculate the average error. For this, we only need to subtract the predicted values from the true values and compute the average/mean across all differences. We can do this in a few lines of code:
y_theoretical = np.dot(X, w_true)
average_error = np.mean(y - y_theoretical)
print(f"Average error for {num_samples} samples: {average_error:.3f}")
Average error for 20 samples: -1.044
Ideally, this result should be zero to reflect our assumption of $E(e_i) = 0$. This result is an important reminder that a training dataset typically cannot capture all aspects of a real-world phenomenon, for various reasons:
Limited Sample Size: Real-world data is vast and complex, while datasets used for training are often smaller subsets due to constraints in data collection, storage, and processing. As a result, training datasets can miss rare or unusual cases, edge cases, or certain patterns that only appear infrequently in the real world.
Selection Bias: The process of selecting data for the dataset can inadvertently lead to biases. For example, if the data is collected from a specific location, time period, or demographic, it may not generalize well to other contexts. This bias limits the dataset's representation of the full diversity of the real-world phenomenon.
Dynamic Nature of Real-World Phenomena: Many real-world processes are constantly changing (e.g., consumer behavior, weather patterns, language). A dataset collected at one point in time or in a specific context might not account for changes over time, making it challenging for models to generalize to new data.
Noise and Measurement Error: Real-world data collection often involves noise and inaccuracies due to limitations in sensors, human error, or other external factors. Training datasets may have noisy data or incorrect labels, making it difficult to capture the true structure of the phenomenon accurately.
Complexity and High Dimensionality: Many real-world phenomena involve a vast number of variables and complex relationships that are difficult to represent in a dataset fully. Some variables may be unknown, unmeasurable, or not included in the dataset, leading to an incomplete view of the underlying patterns.
For our CSI example dataset, the most relevant reasons are the limited data size as well as the noise and measurement errors.
Try for yourself: Change the number of generated samples for the toy dataset by modifying the variable num_samples. For example, increase the value from 20 (default) to 200, and run all following code cells up to this point again. If you significantly increase the number of generated samples, you should make the following observations:
- The learned weights and the true weights will be more similar
- The loss for the learned weights and the loss for the true weights will be more similar
- The lines showing the true relationship and the predicted relationship will be closer together
- The average error will be closer to 0
Alternatively, you can keep the number of data samples the same but lower the noise level $\sigma$. More specifically, decreasing $\sigma$ will show similar effects as increasing the number of data samples.
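If you prefer not to re-run the earlier cells manually, the optional sketch below illustrates the same effect in one go. It re-uses the data-generation recipe and the fit_analytically() method from above with a larger (hypothetical) sample size of 200, and it does not overwrite the variables X and y used in the rest of the notebook:
# Optional sketch: generate a larger toy dataset and refit (does not overwrite X, y from above)
np.random.seed(1)
X_data_big = np.random.uniform(27, 35, 200).round(1)
X_big = np.vstack([np.ones(len(X_data_big)), X_data_big]).T
np.random.seed(2)
y_big = (np.dot(X_big, w_true) + np.random.normal(0, sigma, len(X_big))).round(1)
w_fit_big = fit_analytically(X_big, y_big)
print("Learned weights with 200 samples:", w_fit_big.round(3))
print("True weights:", w_true)
print("Average error:", np.mean(y_big - np.dot(X_big, w_true)).round(3))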
Numerical Method: Gradient Descent¶
Using the Normal Equation to find the best values for the weights $\mathbf{w}$ is naturally the preferred approach to train or fit a Linear Regression model. It turns out, however, that such an analytical approach is limited to very simple models such as Linear Regression. For more complex models with more complex loss functions, setting the 1st partial derivatives of the loss function to zero and solving for $\mathbf{w}$ in closed form is not possible. For example, this is already true for Logistic Regression and the accompanying Cross-Entropy Loss.
In these cases, we need to rely on iterative optimization methods to find the best $\mathbf{w}$. One of the most general-purpose optimization methods is Gradient Descent. If you are unfamiliar with Gradient Descent, we have a whole topic covering this method. In a nutshell, the gradient of a function $f$ for a given set of parameters — in our case $\mathbf{w}$ — is the vector pointing in the direction of the steepest ascent of $f$ at this point. Mathematically, the gradient $\nabla_{\mathbf{w}}f$ of a function $f$ with respect to parameters $\mathbf{w} = \{w_0, w_1, w_2, \dots, w_d \}$ is defined as follows:
$$\large \nabla_{\mathbf{w}}f = \begin{bmatrix} \frac{\partial f}{\partial w_0}\\ \frac{\partial f}{\partial w_1}\\ \frac{\partial f}{\partial w_2}\\ \vdots \\ \frac{\partial f}{\partial w_d}\\ \end{bmatrix} $$for a given set of values for $\{w_0, w_1, w_2, \dots, w_d \}$. Using the matrix notation this simplifies to
$$\large \nabla_{\mathbf{w}}f = \frac{\partial f}{\partial \mathbf{w}} $$for a given set of values for $\mathbf{w}$. Since we already calculated the 1st partial derivatives of our MSE loss function $L_{MSE}$, we can therefore calculate its gradient with:
$$\large \nabla_{\mathbf{w}}L_{MSE} = \frac{2}{n}\mathbf{X}^T(\mathbf{X}\mathbf{w} - \mathbf{y}) $$We can directly implement this formula for calculating the gradient using the following Python method calculate_mse_gradient:
def calculate_mse_gradient(X, y, w):
# Calculate hypothesis (i.e., y_hat)
h = np.dot(X, w)
# Calculate and return the final gradient
return 2 * np.dot(X.T, (h - y)) / y.shape[0]
Let's calculate the gradient for our CSI example dataset. As initial values for the parameters, we set $w_0 = 10$ and $w_1 = 10$. Of course, you can try different values to see how they change the resulting gradient.
w0_init = 10
w1_init = 10
gradient_init = calculate_mse_gradient(X, y, [w0_init, w1_init])
print(f"Gradient for w0={w0_init}, w1={w1_init}: {gradient_init}")
Gradient for w0=10, w1=10: [ 265.32 7987.171]
So how can we use this gradient to change the values of $w_0$ and $w_1$ in such a way that the loss decreases? Assuming the default values of $w_0 = 10$ and $w_1 = 10$, we can interpret the gradient
$$\large \nabla_{\mathbf{w}}L_{MSE} = \begin{bmatrix} 265.32\\ 7987.17 \end{bmatrix} $$as follows:
Since the gradient $\nabla_{\mathbf{w}}L_{MSE}$ points in the direction of the steepest ascent and both components are positive, slightly increasing either of the two values will increase the loss. This means we have to decrease the current values of $w_0$ and $w_1$ to lower the loss.
Since the component of the gradient for $w_1$ is larger than the one for $w_0$, a change in $w_1$ will have a greater effect on the loss than the same change in $w_0$.
The absolute values of the gradient do not directly tell us by how much we should actually change $w_0$ and $w_1$. Knowing from our analytical solution that the best values are $w_0=49.5$ and $w_1=4.3$, it seems obvious that simply subtracting the two values of the gradient from the values of $w_0$ and $w_1$ would be way too much of a change.
The last issue brings us to the introduction of the so-called learning rate. The learning rate is nothing but a scaling factor we apply to the gradient to get the value by which we update all $w_i$. As we can already see from the example above, typical values for the learning rate are $0.01 \dots 0.00001$, but the most suitable values can vary widely in practice depending on the task, model, and dataset.
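To see what a single update looks like before we wrap everything into a loop, here is a small optional sketch of one Gradient Descent step; it uses the calculate_mse_gradient() and calculate_mse() methods defined above and an illustrative learning rate:
# One illustrative Gradient Descent update step (optional sketch)
lr = 0.001                             # illustrative learning rate
w_current = np.asarray([10.0, 10.0])   # same starting point as above
gradient = calculate_mse_gradient(X, y, w_current)
w_updated = w_current - lr * gradient  # step against the gradient direction
print("Loss before update:", calculate_mse(X, y, w_current).round(3))
print("Loss after update: ", calculate_mse(X, y, w_updated).round(3))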
With our method for calculating the gradient, we can implement the method fit_gradient_descent() to train or fit a Linear Regression model using Gradient Descent. Again, if something is not clear here, you can learn all about Gradient Descent in a separate topic.
def fit_gradient_descent(X, y, lr=0.001, num_iter=1000, verbose=False):
# Initialize w as vector with all elements being 0
w = np.zeros(X.shape[1])
for i in range(num_iter):
# Calculate gradient w.r.t. w
gradient = calculate_mse_gradient(X, y, w)
# Update step: adjust weights w.r.t. gradient and learning rate
w -= lr * gradient
# Print loss every 10% of the iterations
if verbose == True:
if(i % (num_iter/10) == 0):
print('Loss: {:.3f} \t {:.0f}%'.format(calculate_mse(X, y, w), (i / (num_iter/100))))
# Print final loss
print('Loss: {:.3f} \t 100%'.format(calculate_mse(X, y, w)))
# Return final set of weights w
return w
Let's run Gradient Descent for 100 iterations using this method:
w_gradient_descent_fit = fit_gradient_descent(X, y, num_iter=100, verbose=True)
print("The best weights using Gradient Descent are are: w0={:.3f}, w1={:.3f}".format(w_gradient_descent_fit[0], w_gradient_descent_fit[1]))
Loss: 20185.579 0% Loss: 284.539 10% Loss: 41.149 20% Loss: 38.171 30% Loss: 38.133 40% Loss: 38.131 50% Loss: 38.129 60% Loss: 38.127 70% Loss: 38.125 80% Loss: 38.124 90% Loss: 38.122 100% The best weights using Gradient Descent are are: w0=0.237, w1=5.888
With the initial values of $w_0 = w_1 = 0$ we see a very large loss. However, by updating the weights in each iteration, the loss goes down step by step. Notice that the loss decreases very quickly at the beginning, but this slows down significantly in the later iterations. So while the algorithm keeps getting closer to the optimal values for $w_0$ and $w_1$, the rate of convergence becomes very slow. After 100 iterations, the values of $w_0$ and $w_1$ as well as the loss are still fairly different from the optimal values we already know.
So let's try more iterations...
num_iters = [10, 100, 1000, 10000, 100000]
solutions = []
for num_iter in num_iters:
print(f"Fitting model using {num_iter} iterations...")
w = fit_gradient_descent(X, y, num_iter=num_iter)
solutions.append((f"Gradient Descent ({num_iter} iterations)", w))
print("DONE")
solutions.append(("Optimal solution", w_analytical_fit))
Fitting model using 10 iterations... Loss: 420.876 100% Fitting model using 100 iterations... Loss: 38.122 100% Fitting model using 1000 iterations... Loss: 37.969 100% Fitting model using 10000 iterations... Loss: 36.563 100% Fitting model using 100000 iterations... Loss: 29.842 100% DONE
The results show that it takes 100,000 iterations to get a loss that is at least close to the minimum loss of $27.939$ from above. For visualization, we can show all corresponding regression lines in the same plot.
regression_lines = [ (w, None, label) for label, w in solutions ]
plot_csi_data(X_data, y, regression_lines=regression_lines)
As the plot clearly shows, the more iterations Gradient Descent is allowed to run, the closer it gets to the true/optimal solution. Notice, however, that even after 100,000 iterations, it still has not converged to the optimal solution. The reason is that near the optimal solution the loss is already very close to the minimum. This means that the gradient at this point is very small, and even more so after scaling it further down by multiplying it with the learning rate. In other words, after 100,000 iterations, the updates to the weights $w_i$ are almost negligible.
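To make this concrete, the optional sketch below compares the magnitude of the gradient near the optimal weights (from the Normal Equation) with the gradient at the earlier starting point $w_0 = w_1 = 10$; near the optimum the gradient, and therefore each update, becomes tiny:
# Compare gradient magnitudes near the optimum vs. far away from it (optional sketch)
grad_near_optimum = calculate_mse_gradient(X, y, w_analytical_fit)
grad_far_away = calculate_mse_gradient(X, y, np.asarray([10.0, 10.0]))
print("Gradient norm near the optimum:", np.linalg.norm(grad_near_optimum))
print("Gradient norm at w0=10, w1=10: ", np.linalg.norm(grad_far_away))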
To visualize this, we can plot the loss function for the same ranges of $w_0$ and $w_1$ we used for the random search. The plotting of the loss function is implemented in the auxiliary method plot_loss_function() in the file src/utils.py — feel free to check it out. Since the loss depends on the dataset described by $\mathbf{X}$ and $\mathbf{y}$, we also have to pass this information to the method. Run the code cell below to see the result.
# Specify all possible combinations of w0 and w1 we want to consider
w0_range = np.arange(0.0, 100.0, 1)
w1_range = np.arange(0.0, 6.0, 0.1)
# Plot loss function
plot_loss_function(X, y, w0_range, w1_range, calculate_mse)
Of course, the overall shape of the loss surface reflects the shape of the data points we have seen in the plot for the random search. Only instead of having a fixed number of random choices of $\mathbf{w}$, we now have a continuous surface. Just by looking at the plot above, you might think that the blue area of the surface is flat. However, we already know from the math that the MSE loss function has a unique minimum. This means that the surface in the plot above has indeed one unique location where the loss is the smallest; it's just not distinguishable with the naked eye. However, this also means there are many other choices of $\mathbf{w}$ where the loss is almost minimal. For these values of $\mathbf{w}$, the gradient and therefore the update to the $\mathbf{w}$ values will be very, very small.
However, you need to keep in mind that the shape of the loss function depends on the dataset. As such, for other datasets the unique minimum might be more or less pronounced than for our CSI example dataset here. Also, we could only easily plot the loss function because our CSI data has only one input feature, and therefore the model has only two weights $w_0$ and $w_1$.
Making Predictions¶
Whatever method we use to minimize the MSE loss function $L_{MSE}$, the result is expected to be the set of weights $\mathbf{w}$ reflecting that minimum (or at least values near the minimum in the case of iterative methods). With those weights $\mathbf{w}$, we can now predict/estimate the height of a person for a given shoe print size. In the case of a regression task, making a prediction simply comes down to calculating $\hat{y}_{new}$ for any unseen data sample $x_{new}$. Thus, implementing the method predict() is trivial:
def predict(X, w):
return np.dot(X, w)
Notice that the method predict() accepts multiple unseen data samples as input and as such can return multiple predictions. So let's see what height our model would predict for the suspect who left a shoe print of size $32.2$:
# Create data matrix for our single unseen data sample for our suspect
X_suspect = [[1, 32.2]]
# Calculate predicted height
prediction_csi = predict(X_suspect, w_analytical_fit)[0]
print(f"The estimated height of the suspect is {prediction_csi:.1f}")
The estimated height of the suspect is 186.3
Practical Applications¶
Linear Regression is simple enough that training such a model can easily be implemented from scratch, either analytically using the Normal Equation or using iterative methods such as Gradient Descent, as we have done above. In practice, however, we typically use implementations from popular data science and machine learning libraries. For one, using these implementations simplifies training as we have to write much less code. For another, existing implementations reduce the risk of coding errors, and they are often optimized to ensure good performance in terms of runtime.
In the following, we will use scikit-learn. Scikit-learn is a popular open-source machine learning library in Python, widely used for building and analyzing predictive data models. Built on top of foundational libraries like NumPy, SciPy, and matplotlib, it provides simple and efficient tools for data mining, data analysis, and machine learning tasks. Scikit-learn supports a wide range of supervised and unsupervised learning algorithms, including regression, classification, clustering, and dimensionality reduction. This library comes with its own implementation of LinearRegression, which we will use in the following examples.
CSI Example Dataset¶
We first look again at our CSI example dataset, even though we have already found the best weights $\mathbf{w}$. This time, let's assume we received this dataset as a csv file to train a Linear Regression model. As such, the first step is to load the data file. Particularly when working with csv files, the pandas library offers convenient ways to do this:
# Load data file into pandas DataFrame
df_csi = pd.read_csv(csi_example)
# Show the first 5 entries of the DataFrame
df_csi.head()
|   | shoe print size | height |
|---|---|---|
| 0 | 30.3 | 177.0 |
| 1 | 32.8 | 187.8 |
| 2 | 27.0 | 156.5 |
| 3 | 29.4 | 184.0 |
| 4 | 28.2 | 162.6 |
This DataFrame contains the complete dataset including the input feature (shoe print size) and the target outputs (height). We therefore need to extract our data matrix $\mathbf{X}$ and our label vector $\mathbf{y}$ from the DataFrame. The to_numpy() method in pandas is used to convert a pandas DataFrame or Series into a NumPy array. It provides a way to access the underlying data of these structures in a format that is compatible with NumPy, which is often required for numerical computations or integration with other libraries such as scikit-learn.
# Extract the input feature(s) as data matrix X
X_csi = df_csi[['shoe print size']].to_numpy()
# Extract the target outputs as label vector y
y_csi = df_csi[['height']].to_numpy().squeeze()
We now have everything in place to train a Linear Regression model; see the code cell below. In scikit-learn, the fit() method is a fundamental function used to train a machine learning model. It adjusts the model's parameters based on the provided training data and, for supervised learning algorithms, their corresponding target values. This method is provided by all models (incl. LinearRegression) in scikit-learn and is critical for enabling the model to learn patterns from data.
model_csi = LinearRegression().fit(X_csi, y_csi)
Now that the model is trained (or fitted), we can also have a look at the learned weights $\mathbf{w}$. In the Linear Regression implementation of scikit-learn, these weights can be accessed as follows:
- The bias (or intercept, or offset) $w_0$ is stored in the attribute intercept_
- The other weights or coefficients $w_i$ for all features are stored in the array attribute coef_
So let's extract both $w_0$ and $w_1$ from the model. Since the CSI example dataset has only one input feature, the array coef_ has only one entry.
w0_csi = model_csi.intercept_
w1_csi = model_csi.coef_[0]
print("The best are: w0={:.3f}, w1={:.3f}".format(w0_csi, w1_csi))
The best are: w0=49.466, w1=4.251
The weights match the results we already obtained when implementing the Normal Equation from scratch and applying it to the CSI example dataset. This is, of course, unsurprising since scikit-learn's LinearRegression also computes the closed-form least-squares solution under the hood.
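As a quick sanity check, the sketch below recomputes the weights with the Normal Equation directly in NumPy and compares them against the attributes of the fitted model_csi. This is only an illustration; np.linalg.inv is used for readability, even though a solver such as np.linalg.lstsq would be numerically more robust.
# Illustrative sketch: compare the Normal Equation solution with scikit-learn's weights
import numpy as np

# Design matrix with an explicit bias column of ones prepended to the feature(s)
X_design = np.hstack([np.ones((len(X_csi), 1)), X_csi])

# Normal Equation: w = (X^T X)^{-1} X^T y
w_closed_form = np.linalg.inv(X_design.T @ X_design) @ X_design.T @ y_csi

print(f"Normal Equation: w0={w_closed_form[0]:.3f}, w1={w_closed_form[1]:.3f}")
print(f"scikit-learn:    w0={model_csi.intercept_:.3f}, w1={model_csi.coef_[0]:.3f}")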
Finally, we can use this model to make predictions. Let's assume that the shoe print we found at the crime scene had a size of 32.2 cm. The predict() method in scikit-learn is used to make predictions with a trained machine learning model. After a model has been fitted to data using the fit() method, the predict() method takes new input data and returns the predicted output (e.g., class labels, regression values, or cluster assignments) based on the learned parameters.
The predict() method expects a data matrix $\mathbf{X}$ that may contain multiple unseen data samples with multiple features. Although we only have one data sample with only one feature, we still need to represent this single value as a data matrix. Thus, instead of just using 32.2 as input for the predict() method, we need to represent the input as the matrix [[32.2]]. The output of the predict() method is the list (or vector) of predicted values for all input data samples. Again, since we have only one data sample, our final prediction is simply the first entry in this list.
height_suspect = model_csi.predict([[32.2]])[0]
print(f"The estimated height of the suspect is {height_suspect:.1f} cm")
The estimated height of the suspect is 186.3 cm
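Since predict() accepts a whole data matrix, we can just as well estimate heights for several shoe print sizes in one call. The sizes in the following sketch are made up purely for illustration.
# Hypothetical shoe print sizes (cm) for which we want height estimates
shoe_sizes = [[28.5], [30.0], [33.1]]

# A single call to predict() returns one estimate per input sample
heights = model_csi.predict(shoe_sizes)

for size, height in zip(shoe_sizes, heights):
    print(f"Shoe print size {size[0]} cm => estimated height {height:.1f} cm")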
Simple Apartment Price Prediction¶
We motivated the purpose of Linear Regression with the goal of building a model for predicting the resale prices of apartments. So let's end with this task in a simplified context. The Housing and Development Board (HDB) is Singapore's public housing authority. Established in 1960, it is responsible for planning, developing, and managing public housing in Singapore. HDB flats are home to more than 80% of the population, making them a key pillar of Singapore's housing policy. HDB flats can be sold on the resale market after a Minimum Occupation Period (MOP) of five years. Information about resale transactions, including the price as well as basic information about each sold flat, is publicly available.
Disclaimer: To keep this example very basic, the dataset used has been simplified. In particular, non-numerical but potentially important features such as the location of flats (specified by their postal address) have been removed. In general, the publicly available data also lacks much other information that might affect the resale price of apartments. In short, the prediction results of our model won't be great.
Anyway, let's get started as usual by loading the data about the resale transactions from a csv file using the pandas library.
# Load data file into pandas DataFrame
df_hdb = pd.read_csv(file_hdb_resale)
print(f"The dataset contains {df_hdb.shape[0]} transactions")
# Show the first 5 entries of the DataFrame
df_hdb.head()
The dataset contains 20000 transactions
| | month | num_rooms | storey | area_sqm | remaining_lease | price |
|---|---|---|---|---|---|---|
| 0 | 6 | 5 | 14 | 123.0 | 74 | 930000.0 |
| 1 | 6 | 4 | 28 | 122.0 | 74 | 1090000.0 |
| 2 | 10 | 5 | 2 | 117.0 | 53 | 565888.0 |
| 3 | 5 | 3 | 5 | 68.0 | 56 | 368000.0 |
| 4 | 1 | 5 | 10 | 125.0 | 69 | 715000.0 |
We can see that the dataset has five input features describing each resale transaction:
- month: The month (as a value from 1-12) in which the apartment was sold (note that the whole dataset covers only the transactions of a single year)
- num_rooms: The number of rooms as a value from 1-5
- storey: The storey on which the apartment is located within its building
- area_sqm: The size of the apartment expressed by its area in square meters
- remaining_lease: The remaining lease of an HDB apartment refers to the number of years left on the 99-year leasehold of the flat. All HDB flats in Singapore are sold on a 99-year lease, starting from the date of completion or lease commencement.
Ignoring month, we would arguably expect all features of an apartment to be (more or less) positively correlated with its resale price. For example, the higher up or the larger an apartment, the higher its price is likely to be. We can calculate the correlation between all input features and price to actually confirm this. Such a correlation analysis is generally strongly recommended. Recall that Linear Regression assumes that there indeed exists a linear relationship between the input features and the target output.
Using existing libraries, calculating the correlation between the features and price is very simple. The corr() method in the pandas library is used to calculate the pairwise correlation of columns in a DataFrame. It measures the linear relationship between numerical columns, returning a correlation matrix. Here, we ignore most pairwise correlations and focus only on the correlations between price and all other columns.
corr = df_hdb.corr()[['price']]
print(corr)
                    price
month            0.038175
num_rooms        0.642531
storey           0.428576
area_sqm         0.619492
remaining_lease  0.448259
price            1.000000
Beyond just printing the correlations, the seaborn library also provides useful methods to visualize the result as a heatmap.
plt.figure()
sns.heatmap(corr, cmap="Blues", annot=True)
plt.show()
We can see that the results indeed match our initial expectation that most features show a (somewhat) positive correlation with price. Values between 0.3 and 0.5 are generally considered to describe a low correlation, while values between 0.5 and 0.7 describe a moderate correlation. This is already an indicator that our Linear Regression model is unlikely to perform very well. Also as expected, month is kind of an outlier, showing basically no correlation with price. This is not surprising since, at least within a single year, general economic trends as well as supply and demand in the housing market are unlikely to vary much. If anything, the slightly positive correlation value reflects the slight increase in the average resale price of HDB apartments over the year.
To prepare our dataset for training a Linear Regression model, we again first have to extract the data matrix $\mathbf{X}$ and the label vector $\mathbf{y}$ from the DataFrame.
X_hdb = df_hdb[['month', 'num_rooms', 'storey', 'area_sqm', 'remaining_lease']].to_numpy()
y_hdb = df_hdb[['price']].to_numpy().squeeze()
Instead of using the full dataset for training, as we have done for the CSI example dataset, we split the dataset into a training and a test dataset. Using only the training dataset for actually fitting the Linear Regression model, we can then use the test dataset to evaluate the quality of the model. The train_test_split() method of the scikit-learn library makes the step of splitting the dataset into a training and a test dataset very easy. In the code cell below we set test_size=0.25. This means that we want to use 25% of the total dataset for testing and the remaining 75% for training.
# Split dataset into training and test data (25% test data)
X_hdb_train, X_hdb_test, y_hdb_train, y_hdb_test = train_test_split(X_hdb, y_hdb, test_size=0.25, random_state=0)
print(f"Size of training dataset: {len(X_hdb_train)}")
print(f"Size of test dataset: {len(X_hdb_test)}")
Size of training dataset: 15000
Size of test dataset: 5000
With the training dataset in place, we can fit a Linear Regression model. Of course, we again use the LinearRegression implementation from the scikit-learn library for this.
model_hdb = LinearRegression().fit(X_hdb_train, y_hdb_train)
The code cell below simply prints the bias weight $w_0$ and all the feature weights $w_1, \dots , w_5$ for the five input features.
# Print bias weight w0
print(f"w0 = {model_hdb.intercept_:.2f}")
# Print all feature weights
for i, w in enumerate(model_hdb.coef_):
print(f"w{i+1} = {w:.2f} (feature: {df_hdb.columns[i]})")
w0 = -193705.77
w1 = 1978.43 (feature: month)
w2 = 36666.52 (feature: num_rooms)
w3 = 9071.72 (feature: storey)
w4 = 3300.71 (feature: area_sqm)
w5 = 2882.37 (feature: remaining_lease)
To assess the quality of our trained model, we can use the test dataset. For this, we first use the predict() method of our model to get the predicted resale prices for all apartments in the test dataset.
y_hdb_pred = model_hdb.predict(X_hdb_test)
To quantify the difference between the true and predicted prices, we could directly calculate the MSE as we did when computing the loss. However, let's tweak this a little bit and calculate the Root Mean Squared Error (RMSE), which is simply the square root of the MSE:
$$\large RMSE = \sqrt{ \frac{1}{n}\sum_{i=1}^n (\hat{y}_i - y_i)^2 } $$

The advantage is that the RMSE has the same "unit" as our target price. In other words, we can directly interpret the RMSE as a dollar value. Calculating the RMSE is very easy since the scikit-learn library provides the method root_mean_squared_error() for this. So let's calculate and print the RMSE of our trained model using the test dataset:
print("Root Mean Squared Error (RSME): %.2f" % root_mean_squared_error(y_hdb_test, y_hdb_pred))
Root Mean Squared Error (RSME): 98918.85
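To connect the formula above to code, here is a minimal sketch that recomputes the RMSE by hand with NumPy. It assumes y_hdb_test and y_hdb_pred from the previous cells and should yield (up to floating-point precision) the same value as root_mean_squared_error().
# RMSE computed directly from its definition: the square root of the mean squared error
import numpy as np

rmse_manual = np.sqrt(np.mean((y_hdb_pred - y_hdb_test) ** 2))
print(f"Manually computed RMSE: {rmse_manual:.2f}")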
We can see that the RMSE is around 100,000. This means that, on average, our predicted resale price is around 100,000 dollars off from the true resale price. This is arguably not a good result. Without digging much deeper into this example, we can at least take a brief look at the kinds of errors our model makes; after all, the RMSE is only a single value representing an aggregate over all errors.
In src.utils we provide an auxiliary method plot_truth_vs_predictions() to plot the true vs. the predicted resale prices. If our model predicted all prices correctly, all data points in the plot would lie on the diagonal (i.e., the red line in the plot). Any deviation from the diagonal represents a wrong prediction.
plot_truth_vs_predictions(y_hdb_test, y_hdb_pred)
The plot above tells us that our model performs particularly poorly when predicting the resale prices of expensive apartments. In these cases, our model severely underestimates the price, resulting in (very) large errors. The main reason for this is that the input features and the target output do not exhibit a strong, let alone perfect, linear relationship. Another likely reason is outliers, potentially caused by erroneous data collection. For example, look at the blue data point on the very left, where the true price is around 100,000 dollars and the estimated price is around 500,000 dollars.
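To complement the plot with some concrete numbers, the sketch below lists the test transactions with the largest absolute prediction errors. This is just one simple way to inspect the errors and again assumes y_hdb_test and y_hdb_pred from the previous cells.
# Residuals: positive values mean the model underestimates the true price
import numpy as np

residuals = y_hdb_test - y_hdb_pred

# Show the five transactions with the largest absolute errors
worst = np.argsort(np.abs(residuals))[::-1][:5]
for i in worst:
    print(f"true: {y_hdb_test[i]:>10.0f}   predicted: {y_hdb_pred[i]:>10.0f}   error: {residuals[i]:>10.0f}")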
However, a more detailed exploration is beyond the scope here. We also do not address the aspect of feature importance, that is, which of the five input features has the most impact on the prediction results. In a separate notebook, we provide a complete practical example for training and properly evaluating a Linear Regression model on a real-world dataset.
Summary¶
Linear Regression is one of the simplest and most widely used techniques in statistics and machine learning. It models the relationship between one or more independent variables (features) and a dependent variable (target) by fitting a linear equation to the data. The goal is to find the best-fitting line (or hyperplane in higher dimensions) that minimizes the error between predicted and actual values. This error is often measured using metrics like Mean Squared Error (MSE).
Linear regression can be categorized into two types: Simple Linear Regression (with one independent variable) and Multiple Linear Regression (with multiple independent variables). The model assumes a linear relationship between the variables. Despite its simplicity, it is highly interpretable and can serve as a powerful baseline for regression problems in various fields, including finance, healthcare, and marketing. For data scientists, AI engineers, and machine learning students, understanding linear regression is critical because it provides the foundational principles behind more complex machine learning models.
In real-world applications, linear regression is not only used as a predictive tool but also for its interpretability. For example, it helps determine the strength and significance of relationships between variables, enabling actionable insights. This makes it invaluable in domains like economics and healthcare, where understanding the impact of individual variables is as important as making accurate predictions. Learning about linear regression also prepares students and professionals to understand advanced techniques, including polynomial regression, logistic regression, and generalized linear models. It serves as a stepping stone for mastering more sophisticated algorithms like neural networks, which often involve nonlinear transformations but retain similar optimization principles.
In summary, linear regression is fundamental to data science and machine learning. Its simplicity, interpretability, and utility across disciplines make it an essential topic for anyone aiming to build a strong foundation in these fields. By mastering linear regression, practitioners can better understand data, evaluate relationships, and prepare for more advanced modeling techniques.