Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.
Linear Regression — Assumptions & Caveats¶
From a purely algorithmic point of view, a Linear Regression model can be trained on any data representing a supervised regression task. However, the introductory notebook also indicated that the model relies on certain assumptions that must hold to ensure the validity of the model's predictions and inferences. If the assumptions are violated, the model's outputs may be unreliable or misleading. Most fundamentally, Linear Regression assumes that there indeed exists a (sufficiently prominent) relationship between the input features $\mathbf{x}_i$ and the target variable $y_i$, which was described by the following equation
$$\large y_i = w_{0} + w_{1}x_{i1} + w_{2}x_{i2} + w_{3}x_{i3} + \dots + w_{d}x_{id} + e_{i} $$for the $i$-th training data sample. We also already saw that the model assumes that the expectation value of the noise terms is $E(e_i) = 0$, i.e., that the noise terms, on average, cancel out.
In this notebook, we cover in more detail all fundamental assumptions a Linear Regression model makes, and how we can check these assumptions before training a model. Keep in mind that often not all the assumptions will (perfectly) hold in practice. Minor violations may still allow the model to perform reasonably well but might require additional adjustments (e.g., data transformation steps). However, major violations may necessitate a different modeling approach beyond Linear Regression.
Preface: Time Series Data vs Cross-Sectional Data¶
The likelihood that an assumption of a Linear Regression model is (severely) violated depends on the nature of the data. We can distinguish between two types of data. On the one hand, time series data focuses on observations of a single subject or variable at different times, usually at uniform intervals. It is the data of the same variable over a period of time such as hours, days, months, etc. Time series data takes the form of $\mathbf{x}_t$, where $\mathbf{x}$ represents the feature vector and $t$ the time. A typical example of time series data is the price of a stock. Here, $\mathbf{x}_t$ may be the feature vector containing the Open-High-Low-Close (OHLC) data for a day $t$. Time series data is typically unidimensional, considering only one subject (e.g., the price of a stock) over time.
On the other hand, cross-sectional data refers to data that captures information about multiple subjects at a specific point in time or within a short, non-repetitive time frame. This data does not account for temporal changes; instead, it provides a snapshot of the variables of interest at a single moment. The dataset of resale transactions of apartments within a single year we used in the introductory notebook is a typical example. Here, $\mathbf{x}_i$ and $\mathbf{x}_j$ represent two independent feature vectors for two different transactions in the dataset.
In this notebook, we focus on cross-sectional data as it is more common for regression tasks in machine learning. This also means that we focus on the assumptions and violations that are most relevant for cross-sectional data. For example, some assumptions almost always hold for cross-sectional data and are much more likely to be violated in the case of time series data.
Note: Apart from time series data and cross-sectional data, there is also panel data which represents a combination of time series and cross-sectional data.
Setting up the Notebook¶
Make Required Imports¶
This notebook requires the import of different Python packages but also additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have successfully been imported.
from src.utils.libimports.linreg import *
from src.utils.plotting.linreg import *
from src.utils.data.files import *
Download Required Data¶
Some code examples in this notebook use data that first needs to be downloaded by running the code cell below. If this code cell throws any error, please check in the configuration file config.yaml whether the URL for downloading datasets is up to date and matches the one on Github. If not, simply download or pull the latest version from Github.
demo_data_basic, _ = download_dataset("tabular/resources/linreg-demo-data-basic.csv")
demo_data_multicoll, _ = download_dataset("tabular/resources/linreg-demo-data-multicollinearity.csv")
File 'data/datasets/tabular/resources/linreg-demo-data-basic.csv' already exists (use 'overwrite=True' to overwrite it).
File 'data/datasets/tabular/resources/linreg-demo-data-multicollinearity.csv' already exists (use 'overwrite=True' to overwrite it).
Motivation: Anscombe's Quartet¶
Anscombe's Quartet is a set of four distinct datasets that have nearly identical statistical properties but appear very different when graphed. It was created by the statistician Francis Anscombe in 1973 to demonstrate the importance of graphing data before analyzing it and to show that relying solely on summary statistics can be misleading. Each dataset in the quartet has almost the same:
- Mean of the x-values and y-values.
- Variance of the x-values and y-values.
- Correlation between x and y (around 0.816).
- Linear regression line: $y = 3 + 0.5x$.
- Coefficient of determination ($R^2 \approx 0.67$).
Create & Plot Datasets¶
Let's first create these datasets so we can plot them and train a Linear Regression model on them. The data is taken directly from the Wikipedia page about Anscombe's Quartet.
x_anscombe = np.asarray([10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5])
y_anscombe1 = np.asarray([8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68])
y_anscombe2 = np.asarray([9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74])
y_anscombe3 = np.asarray([7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73])
x_anscombe4 = np.asarray([8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8])
y_anscombe4 = np.asarray([6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89])
datasets = {
'1': (x_anscombe, y_anscombe1),
'2': (x_anscombe, y_anscombe2),
'3': (x_anscombe, y_anscombe3),
'4': (x_anscombe4, y_anscombe4)
}
Let's plot the four datasets. For this, we provide you with an auxiliary method plot_anscombes_quartet(). Apart from plotting the different data points, this method also computes the mean $\mu$, the standard deviation $\sigma$ and the correlation coefficient $r$, and displays their values in the respective plots.
plot_anscombes_quartet(datasets, show_regression_line=False)
Just by looking at the four different distributions, it is obvious that the four datasets differ significantly. However, notice that all means, standard deviations, and correlation coefficients are identical (at least, at the given precision).
Fit Linear Regression Models¶
The initial description of Anscombe's Quartet also states that all datasets have the same regression line $y = 3 + 0.5x$. So let's check this by training a Linear Regression model on each of the four datasets using the sklearn.linear_model.LinearRegression implementation of Linear Regression.
for (label, (x, y)) in datasets.items():
# Fit Linear Regression model for current dataset
model = LinearRegression().fit(x.reshape(-1, 1), y)
# Print model parameters: bias/intercept (w0) and slope (w1)
print(f"Model parameters for Dataset {label}: w0={model.intercept_:.2f}, w1={model.coef_[0]:.2f}")
Model parameters for Dataset 1: w0=3.00, w1=0.50
Model parameters for Dataset 2: w0=3.00, w1=0.50
Model parameters for Dataset 3: w0=3.00, w1=0.50
Model parameters for Dataset 4: w0=3.00, w1=0.50
The results confirm that all four datasets have the same regression line $y = 3 + 0.5x$, where $w_0 = 3$ and $w_1 = 0.5$. We can also plot all four datasets again, but this time include the regression line in each plot.
plot_anscombes_quartet(datasets, show_regression_line=True)
Anscombe's Quartet: Summary¶
Anscombe's Quartet provides important lessons about the limitations and risks of training a linear regression model on a dataset without proper exploration and visualization. Here's what Anscombe's Quartet teaches us about this process:
Statistical summaries alone can be misleading: Even when datasets have identical summary statistics (mean, variance, correlation, regression line), they can represent very different underlying relationships. Relying solely on numerical measures like correlation or $R^2$ values without visualizing the data can lead to incorrect conclusions about the model's accuracy or appropriateness. You might think a linear model fits well, but in reality, the true relationship could be non-linear or heavily influenced by outliers.
Visualizing the data is crucial: Graphical representations can reveal patterns, trends, and anomalies (e.g., outliers, non-linear relationships) that summary statistics alone cannot detect. Before training a linear regression model, it is essential to visualize the data (e.g., using scatterplots). This can help determine whether a linear relationship exists or whether other relationships (curvilinear, exponential, etc.) or factors (outliers) need to be accounted for.
Model appropriateness: The data in Anscombe's Quartet shows that different types of relationships (e.g., linear, non-linear, or one driven by an outlier) can produce similar regression results even when some of them are inappropriate for a linear model. It is important to assess whether a linear model is the right choice for the data. Anscombe's Quartet illustrates how, in some cases, a non-linear model may be more suitable, and a linear regression model could give misleading results if the data does not follow a linear pattern.
Outliers can significantly impact models: In one dataset of Anscombe’s Quartet, a single outlier drastically changes the regression line, even though the overall statistics remain unchanged. Outliers can heavily influence linear regression models, leading to biased estimates of the regression coefficients. It is important to detect and consider removing or adjusting for outliers to avoid skewing the results.
Residual analysis is essential: Checking residuals (the difference between observed and predicted values) can help in identifying whether a model is appropriate. A thorough analysis of the residuals can reveal patterns like non-linearity or heteroscedasticity (non-constant variance of residuals), which could indicate the model is not well-suited for the data. Anscombe’s Quartet shows that similar summary statistics can still result in very different residual patterns.
In short, Anscombe's Quartet emphasizes the need for deeper exploratory analysis, visualization, and validation when training any linear regression model. In the following, we delve deeper into the assumptions of Linear Regression models and how we can test a given dataset to see if these assumptions hold.
Assumptions¶
Through this section, we will use some demo data to see how we can check for violations of any assumptions. The code cell below loads this demo data into a pandas DataFrame for later use.
df = pd.read_csv(demo_data_basic)
df.head()
| | x1 | y1 | x2 | y2 | x3 | y3 | x4 | y4 |
|---|---|---|---|---|---|---|---|---|
| 0 | 1.000000 | 57.792157 | 1.000000 | 9.921570 | 1.000000 | 54.264052 | 1.000000 | 52.761631 |
| 1 | 1.018036 | 53.745562 | 1.018036 | -31.715100 | 1.018036 | 52.952465 | 1.018036 | 51.805231 |
| 2 | 1.036072 | 55.526394 | 1.036072 | -15.074240 | 1.036072 | 53.604224 | 1.036072 | 51.250539 |
| 3 | 1.054108 | 59.357950 | 1.054108 | 22.077106 | 1.054108 | 54.997414 | 1.054108 | 50.924701 |
| 4 | 1.072144 | 58.283035 | 1.072144 | 10.166992 | 1.072144 | 54.682652 | 1.072144 | 50.280927 |
Assumption 1 (A1): Linearity¶
Definition¶
Linear Regression assumes that the true underlying relationship between the dependent variable (or target variable, or output) $y$ and the independent variables (or features, or attributes) $X$ is (mostly) linear in the parameters $\mathbf{w}$. Recall that this is given by the basic equation of a Linear Regression model:
$$\large y_i = h(\mathbf{x}_i;\mathbf{w}) = w_{0} + w_{1}x_{i1} + w_{2}x_{i2} + \dots + w_{d}x_{id} + e_{i} $$In other words, the value of $y_i$ derives from a linear transformation $h(\mathbf{x}_i)$ of the features $x_{i1}$, ..., $x_{id}$; $x_{i0}$ is our constant feature with $x_{i0}=1$.
Important: Note that the true relationship should be linear in the weights $\mathbf{w}$, not in the feature vector $\mathbf{x}$. For example, this means that models that include feature values raised to a power (e.g., $x_{1}^{2}$, $x_{2}^{3}$) do not violate the linearity assumption. This leads to the concept of Polynomial Linear Regression; see the separate notebooks for more details.
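To illustrate that being linear in the weights is what matters, here is a minimal sketch (with invented coefficients and data, not part of the demo datasets) that fits a standard Linear Regression model on a quadratic relationship simply by adding $x^2$ as an additional feature:
import numpy as np
from sklearn.linear_model import LinearRegression

# Invented quadratic relationship: y = 2 + 0.5*x + 1.5*x^2 plus some noise
rng = np.random.default_rng(0)
x = np.linspace(-3, 3, 200)
y = 2 + 0.5 * x + 1.5 * x**2 + rng.normal(0, 1, size=x.shape)

# The design matrix contains x and x^2; the model is still linear in the weights w0, w1, w2
X_poly = np.column_stack([x, x**2])
model = LinearRegression().fit(X_poly, y)
print(f"w0={model.intercept_:.2f}, w1={model.coef_[0]:.2f}, w2={model.coef_[1]:.2f}")  # close to 2, 0.5, 1.5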
Checking the Assumption¶
There are different ways to check linearity in linear regression, but one of the most common and simple methods is to use scatter plots. A scatter plot is a graphical representation of the relationship between two variables, where each point represents an observation. To check linearity, you can plot the dependent variable against each independent variable, and look for a clear and consistent pattern. Ideally, you should see a straight line or a cloud of points that is roughly symmetrical around a straight line. If you see a curved or wavy pattern, or a cluster of points that is not evenly distributed, then you may have a nonlinear relationship.
Let's first look at an example of a very prominent linear relationship, using the two columns x1 and x2 from the DataFrame with demo data.
x1, y1 = df['x1'].to_numpy(), df['y1'].to_numpy()
x2, y2 = df['x2'].to_numpy(), df['y2'].to_numpy()
fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(14, 5))
ax1.scatter(x1, y1, color='blue', alpha=0.6, label='Data points')
ax2.scatter(x2, y2, color='blue', alpha=0.6, label='Data points')
ax1.set_title("Linear Relationship")
ax2.set_title("Non-Linear Relationship")
plt.show()
Another way to assess the strength of a linear relationship between two variables — here an input feature and the target output — is to calculate the Pearson correlation coefficient between these two variables. The Pearson correlation coefficient, typically denoted by $r$, is a statistical measure that quantifies the strength and direction of a linear relationship between two variables $x$ and $y$, and is defined as:
$$\large r = \frac{\sum_{i=1}^{n} \left( x_i - \bar{x} \right) \left( y_i - \bar{y} \right)}{\sqrt{\sum_{i=1}^{n} \left( x_i - \bar{x} \right)^2} \sqrt{\sum_{i=1}^{n} \left( y_i - \bar{y} \right)^2}} $$where $x_i$ and $y_i$ are data points of variables $x$ and $y$, respectively. $\bar{x}$ and $\bar{y}$ are the mean (average) values of $x$ and $y$, respectively. Lastly, $n$ is the total number of data points. The range of $r$ is $-1 \leq r \leq +1$, where
- $r=1$ represents a perfect positive linear relationship; as one variable increases, the other increases in a perfectly linear fashion.
- $0 < r < 1$ indicates a positive linear relationship; higher values indicate stronger correlation.
- $r=0$ means there is no linear relationship between the two variables.
- $-1 < r < 0$ indicates a negative linear relationship; higher absolute values indicate stronger correlation.
- $r=−1$ represents a perfect negative linear relationship; as one variable increases, the other decreases in a perfectly linear fashion.
The (absolute) values of $r$ are generally interpreted as follows
- $0.7$ to $1.0$ (or $-0.7$ to $-1.0$): Strong positive (or negative) correlation.
- $0.3$ to $0.7$ (or $-0.3$ to $-0.7$): Moderate positive (or negative) correlation.
- $0.0$ to $0.3$ (or $-0.0$ to $-0.3$): Weak positive (or negative) correlation.
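As a quick sanity check of the formula above, here is a minimal sketch that computes $r$ directly from its definition; the function name pearson_r is our own choice, and the result should match the off-diagonal entries of np.corrcoef() used in the next code cells.
import numpy as np

def pearson_r(x, y):
    # Direct implementation of the formula above
    x_c, y_c = x - x.mean(), y - y.mean()
    return (x_c * y_c).sum() / np.sqrt((x_c**2).sum() * (y_c**2).sum())

print(pearson_r(x1, y1))  # should match np.corrcoef(x1, y1)[0, 1] computed below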
Let's calculate the correlation coefficients for the two demo datasets from the scatter plots above. For this, we can use an existing method provided by NumPy. The corrcoef() method is used to calculate the Pearson correlation coefficients between variables. It computes the correlation matrix, a square matrix that shows the correlation coefficients between multiple variables or datasets.
We first consider the dataset where the scatter plot told us that data exhibits a clear linear relationship:
print(np.corrcoef(x1, y1))
[[1.         0.90402285]
 [0.90402285 1.        ]]
Notice that the result is a $2\times 2$ matrix containing the correlation coefficients between all combinations of x1 and y1. The two entries on the diagonal reflect the correlation between x1 and x1, as well as between y1 and y1. Naturally, the correlation of a variable with itself will always yield a perfect linear correlation of $r=1$. The off-diagonal entries are the $r$ values between x1 and y1, and vice versa. Since the calculation of the Pearson correlation coefficient is symmetric, both values are the same. With a value of $r=0.9$, x1 and y1 show a strong positive correlation, as expected from the scatter plot.
Now the same calculation for x2 and y2:
print(np.corrcoef(x2, y2))
[[1.         0.24524613]
 [0.24524613 1.        ]]
According to the definitions above, x2 and y2 show a weak positive correlation with a correlation coefficient of $r=0.25$. The reason for the value not being closer to $0$ is that the non-linear relationship between x2 and y2 is not symmetric (cf. the corresponding scatter plot above). While the correlation coefficient is easy to compute and yields a simple value as a result, it can be challenging to interpret in practice, as it does not capture as much information as a scatter plot. In short, a scatter plot generally provides more expressive information about whether two variables have a (strong or weak) linear relationship.
To sum up, when aiming to train a Linear Regression model on a dataset, we want to make sure that most input features show a reasonably strong linear relationship with the target variable. Common approaches to check this are visualizing the relationships using scatter plots and/or calculating the pairwise correlation coefficients between each input feature and the target variable.
Assumption 2 (A2): Normality and Homoscedasticity¶
Definition¶
In Linear Regression, normality and homoscedasticity are two key assumptions that ensure the validity of statistical inferences about the model. These assumptions pertain to the residuals (errors), which are the differences between observed and predicted values.
The normality assumption states that the residuals should be normally distributed. This means the errors should form a bell-shaped curve when plotted, with most values concentrated near zero and fewer values further away. If the residuals deviate significantly from normality, it may indicate issues such as outliers, skewness, or missing variables in the model.
The homoscedasticity assumption requires that the residuals have constant variance across all levels of the independent variables. In other words, the spread (or scatter) of the residuals should remain roughly the same regardless of the predicted values or the values of the independent variables. Violations of this assumption, known as heteroscedasticity, can lead to inefficient estimates and unreliable statistical tests, as the standard errors of the coefficients may be biased.
In mathematical terms, both assumptions combined mean that the residuals or errors $e_i$ are normally distributed with a mean of $0$ and a constant variance $\sigma^2$: $$\large e_i \sim N(0, \sigma^2) $$
If these assumptions hold, the expected value of the error terms is $E(e_i) = 0$. Note that the reversed statement is not true — that is, $E(e_i) = 0$ does not necessarily imply $e_i \sim N(0, \sigma^2)$, as other error distributions may also yield an expected value of $0$.
Important: Strictly speaking, the normality and homoscedasticity assumptions would also hold for a mean different from $0$, i.e., $e_i \sim N(\mu, \sigma^2)$ with $\mu \neq 0$. To ensure that $E(e_i) = 0$ we have to make an additional assumption (cf. Assumption 3 below).
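To make these assumptions more tangible, the short sketch below generates three small synthetic examples — one where A2 holds, one with non-constant variance, and one with skewed (non-normal) errors. The coefficients and noise parameters are invented for illustration and are not the recipe used to create the demo data in this notebook.
import numpy as np

rng = np.random.default_rng(42)
x = np.linspace(1, 10, 500)

# A2 holds: errors drawn from N(0, sigma^2) with a constant sigma
y_ok = 50 + 2 * x + rng.normal(0, 2, size=x.shape)

# Homoscedasticity violated: the error standard deviation grows with x
y_heteroscedastic = 50 + 2 * x + rng.normal(0, 0.5 * x)

# Normality violated: errors drawn from a skewed (exponential) distribution
y_skewed = 50 + 2 * x + rng.exponential(scale=3, size=x.shape)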
Checking the Assumption¶
There are two basic approaches to check if the assumptions of normality and homoscedasticity hold. One approach considers the analysis or inspection of the data before training a Linear Regression model, while the other approach analyzes the residuals after training a Linear Regression model. Without any model training, we can again use scatter plots to visualize the relationship between an input variable and the target variable to visually assess if the residuals are likely to be normally distributed.
For this, we compare three datasets: one dataset with normally distributed errors and a constant variance, one dataset with normally distributed errors but a non-constant variance, and one dataset with non-normally distributed errors. All three demo datasets are again contained in our pandas DataFrame:
x3, y3 = df['x3'].to_numpy(), df['y3'].to_numpy() # normal + non-constant variance
x4, y4 = df['x4'].to_numpy(), df['y4'].to_numpy() # non-normal
Let's create the three corresponding scatter plots showing the relationships between the input variable $x$ and the target variable $y$ and compare them side-by-side:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 5))
ax1.scatter(x1, y1, color='blue', alpha=0.6)
ax2.scatter(x3, y3, color='blue', alpha=0.6)
ax3.scatter(x4, y4, color='blue', alpha=0.6)
ax1.set_title("Normal + Constant Variance")
ax2.set_title("Normal + Non-Contstant Variance")
ax3.set_title("Non-Normal")
plt.show()
For checking the assumptions of normality and homoscedasticity by directly analyzing the residuals, we now have to train a Linear Regression model for each of the three datasets. Using the scikit-learn implementation of Linear Regression, this is very easy to do:
X1 = x1.reshape(-1, 1)
X3 = x3.reshape(-1, 1)
X4 = x4.reshape(-1, 1)
model1 = LinearRegression().fit(X1, y1)
model3 = LinearRegression().fit(X3, y3)
model4 = LinearRegression().fit(X4, y4)
With the trained models, we can now get the predicted values for each input sample and calculate the respective residuals:
y1_pred = model1.predict(X1)
y3_pred = model3.predict(X3)
y4_pred = model4.predict(X4)
residuals1 = y1 - y1_pred
residuals3 = y3 - y3_pred
residuals4 = y4 - y4_pred
Again, we can use scatter plots to visualize the results. However, instead of plotting the data distribution like before, we now plot the predicted values against the residuals. In other words, the x-axis represents the predicted values, and the y-axis their respective residuals (i.e., the difference between the true value and the predicted value). The code cell below generates the scatter plots for all three datasets for a side-by-side comparison:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 5))
ax1.scatter(y1_pred, residuals1, color='green', alpha=0.6)
ax2.scatter(y3_pred, residuals3, color='green', alpha=0.6)
ax3.scatter(y4_pred, residuals4, color='green', alpha=0.6)
ax1.set_title("Normal + Constant Variance")
ax2.set_title("Normal + Non-Contstant Variance")
ax3.set_title("Non-Normal")
plt.show()
As shown in the left plot, if the residuals are normally distributed with a constant variance, the scatter plot, simply speaking, shows no discernible pattern: there are no clusters or lumps, no interesting shapes, and all points are equally distributed around $0$ with respect to the y-axis. If the residuals are normally distributed but their variance is not constant (middle plot), we are more likely to see certain patterns or shapes in the scatter plot. In this concrete case, the demo data was generated in such a way that the larger the value of $x_i$, the larger the variance when sampling $e_i$ for generating $y_i$. We can clearly see this relationship in the middle scatter plot. Lastly, the scatter plot on the right represents the results for the dataset with a non-normal distribution of errors. Particularly notice that the points are no longer equally distributed around $0$ with respect to the y-axis. In practice, this would mean that the model would systematically over- or underestimate the true output values.
Another way to check if residuals are (close to) normally distributed is the use of QQ plots. A QQ plot (Quantile-Quantile plot) is a powerful graphical tool used in statistics to assess how well a dataset matches a specific theoretical distribution or to compare two datasets' distributions. It is constructed by plotting the quantiles of the dataset against the quantiles of the reference distribution. If the two distributions are similar, the points on the plot will align closely along a 45-degree diagonal line. By visually comparing distributions, QQ plots offer an intuitive way to assess assumptions about data distribution. The primary purpose of a QQ plot is to identify deviations from the expected distribution. These deviations might indicate skewness, heavy tails, or other differences that can affect statistical analyses or modeling. For instance, when checking for normality, a QQ plot can reveal whether data deviates from the bell curve shape, such as exhibiting asymmetry or extreme values. Patterns in the plot can also provide clues about the nature of these differences, making it a versatile diagnostic tool.
For our use case, we want to compare the distribution of the residuals with the Normal Distribution. The statsmodels library provides the method qqplot() for creating QQ plots. Notice in the code cell below the input parameter dist=stats.norm to specify that the distribution we want to compare the residuals with is the Normal Distribution. Again, we show the QQ plots for all three datasets to see the differences in the respective plots.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 5))
sm.qqplot(residuals1, line='45', fit=True, dist=stats.norm, ax=ax1, markerfacecolor='g', markeredgecolor='g', alpha=0.6)
sm.qqplot(residuals3, line='45', fit=True, dist=stats.norm, ax=ax2, markerfacecolor='g', markeredgecolor='g', alpha=0.6)
sm.qqplot(residuals4, line='45', fit=True, dist=stats.norm, ax=ax3, markerfacecolor='g', markeredgecolor='g', alpha=0.6)
ax1.set_title("Normal + Constant Variance")
ax2.set_title("Normal + Non-Contstant Variance")
ax3.set_title("Non-Normal")
plt.show()
In the left QQ plot for the dataset with a constant error variance, all points align almost perfectly with the diagonal. This means that the distribution of the residuals (or errors) follows a Normal Distribution almost exactly. In contrast, in the right QQ plot, we can see a significant deviation of the points from the diagonal. We can therefore conclude that the distribution of the residuals does not follow a Normal Distribution.
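Besides these visual checks, formal statistical tests can complement the diagnosis. As a hedged sketch (the usual 0.05 significance threshold is a common but arbitrary convention), we can apply the Shapiro-Wilk test for normality and the Breusch-Pagan test for heteroscedasticity to the residuals computed above; small p-values suggest rejecting normality or constant variance, respectively.
from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan

for label, (X_d, resid) in {"1": (X1, residuals1), "3": (X3, residuals3), "4": (X4, residuals4)}.items():
    # Shapiro-Wilk: null hypothesis = residuals are normally distributed
    _, p_normality = stats.shapiro(resid)
    # Breusch-Pagan: null hypothesis = residuals have constant variance
    _, p_constant_var, _, _ = het_breuschpagan(resid, sm.add_constant(X_d))
    print(f"Dataset {label}: Shapiro-Wilk p={p_normality:.3f}, Breusch-Pagan p={p_constant_var:.3f}")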
Assumption 3 (A3): Strict Exogeneity¶
Definitions¶
In the context of regression models, strict exogeneity refers to a condition where the residual or error term $e_i$ in the regression model is uncorrelated with all the independent variables (i.e., our input features) in the dataset for all data samples. Mathematically, this can be expressed as follows:
$$\large E(e_i | \mathbf{X}) = E(e_i | \mathbf{x}_{1}, \mathbf{x}_{2}, \dots , \mathbf{x}_{n}) = 0 $$where $n$ is the total number of data samples. In other words:
No Correlation between errors and features: The error term has no systematic relationship with the predictors. This means the errors do not contain information about the predictors, implying that the predictors fully capture all relevant information about $y$, other than random noise.
No Feedback from $y$ to features: The predictors are not influenced by the outcome $y$. This prevents issues such as reverse causality, where the dependent variable could impact one or more predictors, which would lead to correlation between the predictors and the error term.
Consistency of OLS Estimators: Strict exogeneity ensures that the Ordinary Least Squares (OLS) estimators are consistent and unbiased. When this assumption holds, the estimators will converge to the true parameter values as the sample size increases.
Given the Law of Total Expectation, we get $E\left[E(e_i|\mathbf{X}) \right] = E(e_i) \Rightarrow E(e_i) = 0$, which is one of our core assumptions.
There are four main causes why the independent variables and errors might be correlated:
- Errors in variables
- Omitted variables
- Lagged dependent variables
- Simultaneous equations bias
In the following, we focus on the first two causes as they are the most relevant to common regression problems based on cross-sectional data (e.g., the prediction of house prices we have seen in the introductory notebook). In contrast, lagged dependent variables may occur in time series data (e.g., stock market prices) where the price or return $y_t$ of a stock on a day $t$ often depends on the past return $y_{t-1}$. Simultaneous equations bias occurs in econometric models when two or more equations are determined jointly, and one of the explanatory variables (independent variables) in an equation is also determined by the dependent variable of another equation in the system. We ignore these two causes as they generally do not apply to most common regression problems in machine learning.
Errors in variables. Suppose we have a simple Linear Regression problem like our CSI example from the introductory notebook where we want to predict the height $y_i$ of a suspect based on the size of a shoe print $x_i$ found at the crime scene (since we only have one feature, we can drop the feature index and simply write $x_{i}$ instead of $x_{i1}$ to keep it simple). This means that our Linear Regression model to estimate the true relationship between $x_i$ and $y_i$ is:
$$\large y_i = w_{0} + w_{1}x_{i} + e_{i} $$To create a training dataset, we need to collect pairs of people's shoe print sizes and heights. Now, we also assume that we did a sloppy job measuring the size of the shoe prints — this also includes the practical issue that the shoes of the same person might noticeably vary in size. Let's call these imprecise measurements $\tilde{x}_i$. Using $\tilde{x}_i$, our model for estimating the true relationship becomes:
$$\large y_i = w_{0} + w_{1}\tilde{x}_{i} + \left[w_{1}(x_i - \tilde{x}_i) + e_i \right] $$since our imprecise measurements $\tilde{x}_i$ are the only ones we observe. This means that the model we now fit is:
$$\large y_i = w_{0} + w_{1}\tilde{x}_{i} + v_{i} $$where $v_{i} = w_{1}(x_i - \tilde{x}_i) + e_i$. This means that the model we train contains the measurement error $(x_i - \tilde{x}_i)$ in its error term. If the correlation between the imprecise measurement and that difference is zero, i.e., $Corr(\tilde{x}_i, (x_i - \tilde{x}_i)) = 0$, this is not a problem. However, if $Corr(\tilde{x}_i, (x_i - \tilde{x}_i)) \neq 0$ — which happens easily in practice, for example when random noise simply adds on top of the true values or when we systematically misalign or misread the scale when measuring the shoe print sizes — our estimate from $y_i = w_{0} + w_{1}\tilde{x}_{i} + v_{i}$ is biased and inconsistent compared to the estimate from the true relationship $y_i = w_{0} + w_{1}x_{i} + e_{i}$.
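A small simulation with invented numbers (not real shoe print data) illustrates this: adding random noise on top of the true measurements makes $\tilde{x}_i$ correlated with $(x_i - \tilde{x}_i)$, and the estimated slope is systematically attenuated toward zero.
import numpy as np
from sklearn.linear_model import LinearRegression

# Invented example: true relationship height = 100 + 5 * (shoe print size)
rng = np.random.default_rng(0)
x_true = rng.uniform(20, 35, size=1000)                 # true shoe print sizes (cm)
y = 100 + 5 * x_true + rng.normal(0, 3, size=1000)      # heights with purely random noise

# Sloppy measurements: random noise added on top of the true values
x_noisy = x_true + rng.normal(0, 3, size=1000)

w1_exact = LinearRegression().fit(x_true.reshape(-1, 1), y).coef_[0]
w1_noisy = LinearRegression().fit(x_noisy.reshape(-1, 1), y).coef_[0]
print(f"Slope with exact measurements: {w1_exact:.2f}")
print(f"Slope with noisy measurements: {w1_noisy:.2f}  (attenuated toward zero)")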
Omitted variables. Consider the Linear Regression problem with two input features (i.e., the independent variables) $x_{i1}$ and $x_{i2}$. Thus, our model to estimate the true relationship between the input features and the target output (i.e., the dependent variables) is:
$$\large y_i = w_{0} + w_{1}x_{i1} + w_{2}x_{i2} + e_{i} $$Assuming that $e_i$ is a truly random error, $y_i$ is completely explained by $x_{i1}$ and $x_{i2}$ (and of course their weights $\mathbf{w}$). In other words, everything that is not explained by the model goes into the error. So, what happens if we consider only feature $x_{i1}$ to fit a Linear Regression model? Since $y_i$ is still influenced by both features but only explained by $x_{i1}$ in the model, feature $x_{i2}$ goes into the error. In other words, our model for estimating the true relationship becomes:
$$\large y_i = w_{0} +w_{1}x_{i1} + (w_{2}x_{i2} + e_i) $$The model fit is therefore now:
$$\large y_i = w_{0} + w_{1}x_{i1} + v_{i} $$where $v_{i} = w_{2}x_{i2} + e_i$. Again, this is not a problem as long as $v_{i}$ is a truly random error, i.e., there is no correlation between the input feature $x_{i1}$ and the error $v_{i}$: $Corr(x_{i1}, v_{i}) = 0$. This in turn requires that there is no correlation between $x_{i1}$ and $x_{i2}$, that is, $Corr(x_{i1}, x_{i2}) = 0$. However, in practice, the input features are typically not completely uncorrelated.
Consider the (oversimplified) Linear Regression task of predicting the price of a car based on the weight of the car ($x_{i1}$) and the power of its engine ($x_{i2}$). Both features $x_{i1}$ and $x_{i2}$ are likely to be correlated, as heavier cars typically have a larger engine with more power. This means that using only one of the features would result in a biased model. Let's say we use only the weight $x_{i1}$. What would happen if we fit a Linear Regression model? The model would pick up the effects of both the weight and the engine power in the coefficient $w_{1}$ for the weight (feature $x_{i1}$). In other words, the estimated weight $w_{1}$ would pick up the effect of $x_{i1}$ on $y_i$ plus, through the association between $x_{i1}$ and $x_{i2}$, part of the effect of $x_{i2}$ on $y_{i}$.
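Again, a small simulation with invented coefficients can make the effect of such an omitted variable visible: when the correlated feature is left out, its effect is partly absorbed by the coefficient of the remaining feature.
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(1)
n = 2000
weight = rng.normal(1500, 300, size=n)                  # x1: car weight in kg (invented)
power = 0.08 * weight + rng.normal(0, 15, size=n)       # x2: engine power in kW, correlated with weight
price = 5000 + 20 * weight + 150 * power + rng.normal(0, 2000, size=n)

full = LinearRegression().fit(np.column_stack([weight, power]), price)
reduced = LinearRegression().fit(weight.reshape(-1, 1), price)

print(f"Full model:    w1={full.coef_[0]:.1f}, w2={full.coef_[1]:.1f}")  # close to the true 20 and 150
print(f"Reduced model: w1={reduced.coef_[0]:.1f}")                       # absorbs part of the omitted power effect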
Checking the Assumption¶
Unfortunately, there is no straightforward test — like a plot for A1 and A2 — to see if the assumption of strict exogeneity holds. After all, in practice, we typically cannot know if all features have been collected correctly (errors in variables) or whether the training dataset does indeed contain all the features that influence the target variable (omitted variables). In fact, one can make the argument that strict exogeneity is never fully satisfied in practice, since it is very unlikely that a training dataset indeed includes all the relevant features. Take the example of predicting the price of cars from above. Even if we included features such as a car's brand, model, range, number of seats, engine type (petrol/electric/hybrid), and so on, there might be many other factors that influence the price of the car. These can include general economic trends, current policies (e.g., import tariffs), and others.
One might get an indication that the assumption of strict exogeneity is (severely) violated if the trained model performs poorly despite the observation that other assumptions such as linearity (A1) as well as normality and homoscedasticity (A2) do hold. Simply speaking, the model should yield good predictions but it does not. A very common cause for this is that the model lacks important features to fully explain the output. But again, this is more an indication than a fool-proof way to check the assumption of strict exogeneity. In practice, we therefore have to ensure that our training dataset is of high quality and captures the most relevant features.
Note: There are ways to check for strict exogeneity which require combining statistical tests (e.g., residual analysis, Hausman test) and careful theoretical reasoning about the data-generating process. If you suspect violations, you can consider using Instrumental Variables (IV) or the Generalized Method of Moments (GMM) to obtain consistent estimates. However, these approaches are rather complex and typically assume a (very) small number of features. They are therefore beyond our scope here.
Assumption 4 (A4): Low Multicollinearity¶
Definition¶
Multicollinearity refers to a situation in Linear Regression where two or more independent variables (i.e., features) are highly correlated. To give an example, let's go back to the task of predicting the price of a car based on the weight of the car ($x_{i1}$) and the power of its engine ($x_{i2}$). We already argued that both features $x_{i1}$ and $x_{i2}$ are likely to be correlated, as heavier cars typically have a larger engine with more power. To appreciate the problems that multicollinearity can cause, we first consider the case of perfect multicollinearity. Perfect multicollinearity occurs when one or more independent variables in a regression model are exact linear combinations of each other. For our car price example, let's assume that the power of a car's engine in Kilowatt is always a quarter of the car's weight in Kilogram — of course, this is not realistic and only used as an example. This means that $x_{i2} = 0.25 x_{i1}$. We can therefore rewrite the original model
$$\large y_i = w_{0} + w_{1}x_{i1} + w_{2}x_{i2} + e_i $$to the following equation by incorporating this relationship between $x_{i1}$ and $x_{i2}$:
$$ \begin{align} \large y_i &= \large w_{0} + w_{1}x_{i1} + w_{2}(0.25 x_{i1}) + e_i\\[0.5em] &= \large w_{0} + (w_{1} + 0.25 w_{2})x_{i1} + e_i\\[0.5em] \end{align} $$This means that our model has conceptually collapsed to
$$\large y_i = w_{0} + w^\prime x_{i1} + e_i $$with $w^\prime = w_{1} + 0.25 w_{2}$. Therefore, although we can estimate $w^\prime$ uniquely, there is no way to estimate $w_{1}$ and $w_{2}$ uniquely. $w^\prime = w_{1} + 0.25 w_{2}$ is one equation with two unknowns, and there are infinitely many sets of solutions for a given value of $w^\prime$. In other words, the weights $w_{1}$ and $w_{2}$ are indeterminate and their standard errors are infinite, making it mathematically impossible to estimate their individual contributions to the dependent variable. For this reason, perfect multicollinearity must be addressed before fitting a Linear Regression model.
If multicollinearity is less than perfect — also called imperfect multicollinearity or high multicollinearity — the weights can be determined. However, they are likely to have large standard errors, which means the coefficients cannot be estimated with great precision or accuracy. This can undermine fitting a Linear Regression model in several ways:
Small changes in the data can lead to large changes in the estimated weights; this makes it difficult to interpret the weights reliably, as they may not reflect the true relationship between the independent variables and the dependent variable.
It becomes challenging to ascertain which independent variable is truly influencing the dependent variable; this can lead to misleading interpretations about the relationships among the variables.
Multicollinearity can contribute to overfitting, where the model learns noise in the training data rather than the underlying relationship; this can negatively impact the model's performance on new, unseen data.
In summary, multicollinearity violates the assumption of independence among predictors in Linear Regression, leading to inflated standard errors, unstable estimates, and difficulties in interpretation and model validation.
Checking the Assumption¶
To see if a dataset shows signs of (high) multicollinearity, the most straightforward way is to calculate the Pearson correlation coefficient between all pairs of features. To give an example, let's first load a separate demo dataset from a file into a pandas DataFrame:
df_mc = pd.read_csv(demo_data_multicoll)
# Show the first entries of the DataFrame
df_mc.head()
| | x1 | x2 | x3 | x4 | x5 | y |
|---|---|---|---|---|---|---|
| 0 | 0.496714 | -0.173585 | 0.357787 | -1.276229 | 0.541989 | 0.702740 |
| 1 | -0.138264 | -0.766306 | 0.560785 | -1.261162 | 0.355263 | -1.893866 |
| 2 | 0.647689 | 1.276507 | 1.083051 | -0.606520 | 3.442609 | 13.734696 |
| 3 | 1.523030 | 3.005297 | 1.053802 | -0.706882 | 5.112901 | 23.920814 |
| 4 | -0.234153 | -0.746669 | -1.377669 | 1.701185 | -3.502008 | -11.390148 |
Using existing libraries, calculating the correlation between the features as well as the output variable is very simple. The corr() method in the pandas library is used to calculate the pairwise correlation of columns in a DataFrame. It measures the linear relationship between numerical columns, returning a correlation matrix.
correlation_matrix = df_mc.corr()
print(correlation_matrix)
          x1        x2        x3        x4        x5         y
x1  1.000000  0.914840  0.190840 -0.252812  0.740295  0.884503
x2  0.914840  1.000000  0.170268 -0.236759  0.785111  0.925800
x3  0.190840  0.170268  1.000000 -0.837603  0.743990  0.517800
x4 -0.252812 -0.236759 -0.837603  1.000000 -0.687005 -0.486640
x5  0.740295  0.785111  0.743990 -0.687005  1.000000  0.953232
y   0.884503  0.925800  0.517800 -0.486640  0.953232  1.000000
While the output of the code cell above shows all pairwise correlations, we can make the results more illustrative by plotting the correlation matrix as a heatmap where different colors reflect different values for the correlation coefficients. For convenience, we provide an auxiliary method plot_correlation_matrix() for this.
plot_correlation_matrix(correlation_matrix)
Several insights from this plot are worth mentioning:
Of course, the diagonal of the correlation matrix shows perfect correlations of $1$, since the diagonal represents the correlation of each variable with itself.
The features $x_1$, $x_2$, and $x_5$ are highly correlated with the target variable $y$ ($x_3$ and $x_4$ show a moderate positive and negative correlation, respectively). This indicates that these features are likely to be useful predictors when training or fitting a Linear Regression model over this dataset.
Some features show a high pairwise correlation. For example, $x_1$ and $x_2$ have a high positive correlation of $0.91$, while $x_3$ and $x_4$ have a high negative correlation of $-0.84$. The higher the correlation between features, the higher risk of negative effects due to multicollinearity.
The main limitation of looking at the correlation matrix is that it only shows pairwise correlations between the variables (i.e., features and target output). However, in principle, a feature might be a linear combination of multiple other features, which would not show up in the correlation matrix. In the case of a perfect linear combination, one way to identify potential problems is to check the rank of the data matrix $X$. The rank of a matrix is the maximum number of linearly independent rows or columns in the matrix. In practical terms, the rank reveals how many of the rows or columns contain unique, non-redundant information. If the rank is less than the smaller dimension, some rows or columns are linear combinations of others.
Let's check this for our dataset. For this, we first need to extract the data matrix (and the label vector) from the pandas DataFrame.
X_mc = df_mc[['x1', 'x2', 'x3', 'x4', 'x5']].to_numpy()
y_mc = df_mc[['y']].to_numpy().squeeze()
The matrix_rank() method in the NumPy library is used to compute the rank of a matrix. It evaluates the number of linearly independent rows or columns in the matrix. This method is particularly useful in numerical computing, as it uses singular value decomposition (SVD) to determine the rank, which is a robust and reliable approach for handling numerical errors.
num_features = X_mc.shape[1]
num_ranks = np.linalg.matrix_rank(X_mc)
print(f"The rank of the data matrix with {num_features} is {num_ranks}")
The rank of the data matrix with 5 features is 4
Since the rank of the data matrix is 4, i.e., one lower than the number of features, we can tell that one feature can be derived as a linear combination of some other features. In fact, this demo dataset was generated such that $x_5 = x_2 + 2x_3$. In real-world datasets — assuming they are properly collected, curated, cleaned, preprocessed, etc. — such dependencies are very uncommon. This is particularly true for perfect linear combinations. If they are only close to perfect, the data matrix will still have full rank. Trying to identify whether linear combinations of multiple features allow us to "nearly" derive another feature is not commonly done in practice, at least in the context of machine learning and using Linear Regression to make predictions over data with many features. In real-world datasets, if there are no high pairwise correlations, the likelihood of a high correlation between linear combinations of features is typically very low.
Apart from pairwise correlations, there are other methods to check if some data may show signs of high multicollinearity, including statistical tests regarding the statistical significance of the weights, the examination of partial correlations, auxiliary regression models, using eigenvalues and the condition index, or assessing the tolerance and the variance inflation factor (see the sketch below for the latter). However, a detailed discussion of these methods is beyond the scope of this notebook.
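As one example of these diagnostics, the variance inflation factor (VIF) is readily available in statsmodels. The sketch below computes the VIF for each feature of the demo dataset; a common rule of thumb flags values above roughly 5 to 10, and the perfectly collinear feature x5 leads to extremely large (or infinite) values.
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

# Add a constant column so the auxiliary regressions include an intercept
X_with_const = sm.add_constant(X_mc)
for idx, name in enumerate(['x1', 'x2', 'x3', 'x4', 'x5'], start=1):
    # For the (nearly) perfectly collinear features, the VIF may be extremely large or infinite
    print(f"VIF({name}) = {variance_inflation_factor(X_with_const, idx):.2f}")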
Whether and to what extent multicollinearity is a problem — ignoring perfect multicollinearity, which is very rare in practice — depends on the exact task of a Linear Regression problem. It is commonly argued that if the sole purpose of regression analysis is prediction or forecasting, then multicollinearity is not a serious problem because the higher the $R^2$, the better the prediction. This assumes, however, that any unseen data for which we want to predict the target value must "look like" the training data. In other words, any unseen data must exhibit (more or less) the same multicollinearity characteristics. This also means that multicollinearity tends to be less of a problem the larger the dataset is. If there is indeed a meaningful reason to address high/severe multicollinearity in a dataset, there are some rule-of-thumb procedures that can be applied, which, however, come with their own challenges and limitations, for example:
Dropping a feature. The simplest way to remove or reduce multicollinearity in the data is to remove a feature that shows a high correlation (positive or negative) with other features. The first problem is that there is no well-defined threshold for when a pairwise correlation is too high. A commonly cited number is $0.8$. For our dataset above, this means that we have two pairs of features to look at: $x_1$/$x_2$ and $x_3$/$x_4$. Since $x_2$ shows a higher correlation with $y$ than $x_1$, and $x_3$ a higher (absolute) correlation than $x_4$, we could drop $x_1$ and $x_4$ to remove all severe multicollinearity. However, there are two obvious problems. Firstly, since the multicollinearity was not perfect, we do lose some information and our model is likely to perform worse regarding the quality of its predictions. And secondly, our task might require us to evaluate the importance of all the features.
Using a-priori information. Let's again consider our example task of predicting the price of a car based on its weight ($x_{i1}$) and engine power ($x_{i2}$):
$$\large y_i = w_{0} + w_{1}x_{i1} + w_{2}x_{i2} + e_i $$But suppose now that we have the a-priori belief that $w_{2} = 0.2w_{1}$; that is, the rate of change of the price of a car with respect to its engine power is $2/10$ of the corresponding rate with respect to its weight (we ignore whether this is a realistic belief). We can then run the following regression:
$$ \begin{align} \large y_i &= \large w_{0} + w_{1}x_{i1} + 0.2w_{1}x_{i2} + e_i\\[0.5em] &= \large w_{0} + w_{1}x_{i3} + e_i \end{align} $$where $x_{i3} = x_{i1} + 0.2x_{i2}$. Once we obtain an estimate of $w_{1}$, we can estimate $w_{2}$ from the postulated relationship between $w_{1}$ and $w_{2}$. Of course, it is not obvious where such a-priori information might come from. It could come from previous empirical work in which the collinearity problem happens to be less serious, or from the relevant theory underlying the field of study. But again, these methods as well as other procedures to address multicollinearity are not the focus here.
Side note: Many implementations of Linear Regression provided by libraries are able to fit a model even in the presence of perfect multicollinearity in the data. To show this, we can use the LinearRegression implementation from scikit-learn to fit a model on the demo dataset from above, which we know exhibits perfect multicollinearity (i.e., the data matrix is not of full rank):
model_mc = LinearRegression().fit(X_mc, y_mc)
print(f"Bias weight w0: {model_mc.intercept_}")
print(f"feature weights: {model_mc.coef_}")
Bias weight w0: -0.06660304530454864
feature weights: [2.56506811 1.82128583 0.5053176  0.93715092 2.83192103]
Even if you execute the code cell above multiple times, the output will always be the same, although we said that perfect multicollinearity causes (some of) the weights to be indeterminate as there are infinitely many solutions. In this case, the implementation used by scikit-learn under the hood not only finds weights $\mathbf{w}$ that minimize the MSE loss function, but among all such solutions it picks the one that minimizes the length of the weight vector $\mathbf{w}$, i.e., the set of weights that minimizes the Euclidean 2-norm $\lVert \mathbf{w} \rVert_{2}$. This essentially puts a constraint on the absolute values of the individual weights and ensures that the overall solution is unique. Restricting the choice of weights $\mathbf{w}$ is a common example of the general concept of regularization.
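We can verify this minimum-norm behavior on the demo data. Since $x_5 = x_2 + 2x_3$, the vector $(0, 1, 2, 0, -1)$ lies (up to rounding in the CSV file) in the null space of the data matrix, so adding any multiple of it to the weights leaves the predictions unchanged but increases the norm of the weight vector. The sketch below assumes model_mc and X_mc from the cells above.
import numpy as np

# x5 = x2 + 2*x3 implies that v = (0, 1, 2, 0, -1) is (numerically) in the null space of X_mc
v = np.array([0.0, 1.0, 2.0, 0.0, -1.0])
print("Max |X_mc @ v|:", np.abs(X_mc @ v).max())  # practically zero (up to rounding in the CSV)

# An alternative set of weights that yields (numerically) identical predictions
w_alt = model_mc.coef_ + 1.0 * v
pred_orig = model_mc.predict(X_mc)
pred_alt = X_mc @ w_alt + model_mc.intercept_
print("Max prediction difference:", np.abs(pred_orig - pred_alt).max())

print("Norm of scikit-learn weights:", np.linalg.norm(model_mc.coef_))
print("Norm of alternative weights: ", np.linalg.norm(w_alt))  # strictly larger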
Assumption 5 (A5): No Autocorrelation of Residuals¶
Definition¶
The assumption of no autocorrelation in linear regression models means that the residuals (errors) are independent of each other. This implies that the error term for one observation should not be correlated with the error term for another. Autocorrelation often arises in time series data where the error at time $t$ might influence the error at time $t+1$, thus violating this assumption. In contrast, autocorrelation is not as prevalent in regular cross-sectional datasets. We therefore skip a more detailed discussion of this assumption.
Checking the Assumption¶
Autocorrelation can be detected using Autocorrelation Function (ACF) plots and Partial Autocorrelation Function (PACF) plots. A statistical test to detect the presence of autocorrelation is the Durbin-Watson test (see the sketch below). But again, this is beyond the scope of this notebook as the focus is on cross-sectional data, which typically does not contain any autocorrelation.
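For completeness, the Durbin-Watson statistic is easy to compute with statsmodels; values near 2 indicate no first-order autocorrelation, while values toward 0 or 4 indicate positive or negative autocorrelation, respectively. Keep in mind that the statistic is only meaningful if the observations have a natural ordering (e.g., time).
from statsmodels.stats.stattools import durbin_watson

# Values close to 2 indicate little to no first-order autocorrelation of the residuals
print(f"Durbin-Watson statistic (dataset 1): {durbin_watson(residuals1):.2f}")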
Caveats¶
So far, we looked at the fundamental assumptions of Linear Regression models and how — to the extent it is meaningfully possible — they can be checked for any (severe) violation. In the following, we briefly cover two additional considerations for training Linear Regression models in practice. They are closely related to the core assumptions but are worthwhile to look into explicitly.
Caveat 1 (C1): Extreme Values¶
Extreme values in a regression dataset are data points that significantly differ from other observations in the dataset or deviate markedly from the expected behavior based on the regression model. These can manifest in various ways and are broadly categorized as outliers, leverage points, and influential points. Understanding and identifying these extreme values is critical, as they can disproportionately affect the results of a regression analysis. One can distinguish between three main types of extreme values:
Outliers: These are data points where the dependent variable's value (i.e., the target output) is significantly different from the values predicted by the regression model. Outliers are identified by large residuals (the difference between observed and predicted values). Outliers may bias the coefficients, leading to a misrepresentation of the relationship between variables.
Leverage Points: These are observations with extreme values for one or more independent variables (i.e., features), making them far from the center of the predictor space. High leverage points can have a disproportionate influence on the fitted regression line but are not necessarily outliers. High-leverage points can distort the regression line, even if their residuals are small.
Influential Points: Influential points are extreme observations that significantly affect the regression model's fit. While not all leverage points or outliers are influential, a combination of the two often leads to an influential point. Influential points can undermine the stability of the model, making it less generalizable to new data.
We already saw an example of an outlier in Anscombe's Quartet at the beginning. Let's plot the relevant dataset here again:
# Fit Linear Regression model to draw Regression Line
model_anscombe3 = LinearRegression().fit(x_anscombe.reshape(-1, 1), y_anscombe3)
w0_outlier, w1_outlier = model_anscombe3.intercept_, model_anscombe3.coef_[0]
# Plot dataset incl. regression line
plt.figure()
plt.scatter(x_anscombe, y_anscombe3, color='blue', alpha=0.6)
plt.gca().axline(xy1=(0, w0_outlier), slope=w1_outlier, color='r', lw=2)
plt.xlabel("x", fontsize=16)
plt.ylabel("y", fontsize=16)
plt.show()
It is easy to see that without the outlier at $x=13$, the regression line would go directly through the remaining data points. However, this single data point alone affects the location of the regression line quite noticeably. The reason why Linear Regression is rather sensitive to outliers is the use of the MSE loss function. Recall that the MSE:
$$\large L_{MSE}(\hat{\mathbf{y}}, \mathbf{y}) = \frac{1}{n}\sum_{i=1}^{n} (\hat{y}_i - y_i)^{2} $$calculates the squared residuals, i.e., the squared distances between the predicted and true values. Since outliers, by definition, have large residuals, they significantly affect the value of the MSE. Of course, the total number of data points matters. The rarer the outliers are, the less of an effect they have on the model training.
Handling extreme values is often not trivial and depends on the cause of their occurrence. On the one hand, if extreme values are due to errors during data collection (e.g., manual data entry mistakes, faulty sensors), they should be identified (e.g., via statistical tests or scatter plots as shown above) and then removed. However, extreme values can also be genuine and provide valuable insights about unusual observations or real-world phenomena. The key is to understand their nature and their impact on the model to make informed decisions about how to address them. If and how to deal with extreme values is typically a main part of the Exploratory Data Analysis (EDA) and data preprocessing phase when building machine learning models.
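If genuine outliers cannot simply be removed, one option — among several — is to use a loss that penalizes large residuals less severely than the MSE. As a hedged sketch, fitting scikit-learn's HuberRegressor on the third Anscombe dataset yields a line that follows the bulk of the points more closely than ordinary least squares:
from sklearn.linear_model import HuberRegressor, LinearRegression

X_a3 = x_anscombe.reshape(-1, 1)
ols = LinearRegression().fit(X_a3, y_anscombe3)
huber = HuberRegressor().fit(X_a3, y_anscombe3)

# The Huber loss downweights the single outlier, so the fitted line is much less affected by it
print(f"OLS:   w0={ols.intercept_:.2f}, w1={ols.coef_[0]:.2f}")
print(f"Huber: w0={huber.intercept_:.2f}, w1={huber.coef_[0]:.2f}")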
Caveat 2 (C2): Interpolation vs Extrapolation¶
Consider the plot below showing a highly non-linear data distribution describing some true relationship between an input feature $x$ and a target variable $y$; in fact, this demo dataset was generated using the sine function with some added noise.
Since this dataset clearly violates the Linearity assumption, using a Linear Regression model to fit the data would be unsuitable. However, now let's assume that we were only able to collect a dataset for a limited range of feature values, say $3 \leq x \leq 7$. As a consequence, we would only have a partial view of the true relationship between $x$ and $y$. The plot below visualizes the dataset that we would now see; the very light blue data points indicate the data samples now missing from the dataset given the restricted range of $x$ values:
This dataset now shows a nearly linear relationship between $x$ and $y$, thus arguably no longer violating the Linearity assumption. Thus, training or fitting a Linear Regression model on this partial dataset is perfectly meaningful, even though the model will not capture the true relationship between $x$ and $y$ over the full range of $x$ values. In fact, as long as we use our model to predict the output for unseen data points within the range $3 \leq x \leq 7$, the predictions are likely to be reasonably good.
This very simple example already highlights two important aspects we need to consider:
Dataset quality. In practice, there is always the risk that a training dataset does not capture the full relationship between the input features and the output value. Notice that this not only refers to the size of the dataset. For example, even if our partial dataset from above had 10 times the number of data points, if they are all in the range $3 \leq x \leq 7$, it still won't capture the full true relationship. In other words, we not only prefer large training datasets but also representative datasets — that is, datasets containing data points across all meaningful feature values.
Careful extrapolation. Regression models such as Linear Regression allow us to extrapolate target values. For our partial dataset above, this means that we can make predictions for values of $x$ that are (much) smaller than $3$ or (much) larger than $7$. The model will happily return a prediction for any $x$ value. This also means that we can input feature values that are not meaningful at all. For example, consider the example of Linear Regression for predicting a car's price based on the car's weight and engine power. Once the model is trained, it will predict car prices even for negative values of the weight or the engine power. It is up to us to decide how much we can trust predictions for unseen data points that do not come from the same distribution as the training data.
In short, as long as unseen data comes from the same distribution — that is, for our example, all unseen data points have an $x$ value in the range $3 \leq x \leq 7$ — a Linear Regression model is only interpolating the training data and the predictions are likely to be good. However, if we want to use the model to extrapolate beyond the training data to make predictions for unseen data outside the distribution of the training data, we have to carefully decide how meaningful the predictions are likely to be. This decision highly depends on the exact regression task and the available dataset.
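The sketch below mimics this scenario with synthetic sine-like data (the parameters are invented and do not reproduce the plots above): a Linear Regression model fitted only on the range $3 \leq x \leq 7$ interpolates well inside that range but extrapolates poorly outside of it.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Synthetic data loosely resembling the description above (invented parameters)
rng = np.random.default_rng(0)
x_full = np.linspace(0, 10, 500)
y_full = np.sin((x_full - 5) / 2) + rng.normal(0, 0.05, size=x_full.shape)

# Train only on the restricted range 3 <= x <= 7, where the relationship is roughly linear
mask = (x_full >= 3) & (x_full <= 7)
model = LinearRegression().fit(x_full[mask].reshape(-1, 1), y_full[mask])

# Compare the error inside the training range (interpolation) and outside it (extrapolation)
mse_inside = mean_squared_error(y_full[mask], model.predict(x_full[mask].reshape(-1, 1)))
mse_outside = mean_squared_error(y_full[~mask], model.predict(x_full[~mask].reshape(-1, 1)))
print(f"MSE inside  [3, 7] (interpolation): {mse_inside:.3f}")
print(f"MSE outside [3, 7] (extrapolation): {mse_outside:.3f}")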
Summary¶
Linear regression is a foundational statistical method used to model relationships between a dependent variable and one or more independent variables. Its utility depends on several key assumptions, which, when violated, can compromise the model's reliability and interpretability. Checking these assumptions ensures the validity of the model's results and its predictive power.
Adhering to these assumptions ensures the model's interpretability and the reliability of its predictions. When the assumptions are satisfied, the coefficient estimates are unbiased, and the model's predictions are consistent and efficient. Violations can lead to incorrect conclusions about relationships between variables, reduced predictive accuracy, and unreliable inferences. Thus, validating these assumptions is not just a theoretical exercise; it is a necessary step for building models that are not only statistically sound but also meaningful in real-world applications.