import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.tree import plot_tree

from src.models.trees.rf import SeleneRandomForestClassifier, SeleneRandomForestRegressor
from src.utils.data.files import *

# Fetch the two CSV datasets used below. download_dataset is not defined in this file
# (presumably it comes from the wildcard import of src.utils.data.files — confirm).
# Only the first element of each returned pair is kept; it is later passed to pd.read_csv.
creadit_loans, _ = download_dataset("tabular/classification/example-loan-default-data.csv")
breast_cancer, _ = download_dataset("tabular/classification/breast-cancer-wisconsin-classification.csv")

File 'data/datasets/tabular/classification/example-loan-default-data.csv' already exists (use 'overwrite=True' to overwrite it).
File 'data/datasets/tabular/classification/breast-cancer-wisconsin-classification.csv' already exists (use 'overwrite=True' to overwrite it).

# Load the loan toy dataset into a DataFrame
df_loan = pd.read_csv(creadit_loans)

# Preview the first 10 rows (only displayed when run as the last expression of a notebook cell)
df_loan.head(10)

# Columns that act as prediction targets and therefore must not be part of the feature matrix
target_columns = ["DEFAULTED", "LOAN_AMOUNT"]
# Select the feature columns once so that the matrices and the feature names cannot drift apart
df_loan_features = df_loan.drop(columns=target_columns)

# Classification task: predict DEFAULTED from the feature columns
X_classification = df_loan_features.to_numpy()
y_classification = df_loan["DEFAULTED"].to_numpy()

# Regression task: predict LOAN_AMOUNT from the same feature columns (independent copy of the matrix)
X_regression = X_classification.copy()
y_regression = df_loan["LOAN_AMOUNT"].to_numpy()

# Take the names from the columns actually used instead of a hard-coded positional slice,
# so they stay correct if the column order or the number of features changes
feature_names = df_loan_features.columns.to_numpy()

print(feature_names)

['ANNUAL_INCOME' 'CREDIT_SCORE' 'DEBT_RATIO' 'YEARS_EMPLOYED']

# Seed NumPy's global random number generator before fitting
np.random.seed(1)

# Fit a single scikit-learn Decision Tree on the complete toy classification dataset
sklearn_classifier = DecisionTreeClassifier().fit(X_classification, y_classification)

# Draw the fitted tree, labeling the split nodes with the feature names
plt.figure(figsize=(8, 8))
plot_tree(sklearn_classifier, filled=True, feature_names=feature_names)
plt.show()

# Preview the original rows for comparison with the bootstrap sample below
df_loan.head(10)

# Draw as many rows as the original DataFrame has, with replacement (fixed seed for reproducibility)
df_loan_boostrap = df_loan.sample(n=len(df_loan), replace=True, random_state=42)

# Preview the bootstrap sample; the same row may show up more than once
df_loan_boostrap.head(10)

def create_bootstrap_sample(X, y):
    """Draw a bootstrap sample of the same size as the dataset, with replacement.

    The row indices are drawn with ``np.random.choice``, i.e. from NumPy's
    global random state, so a preceding ``np.random.seed`` call makes the
    result reproducible.

    Args:
        X: Feature matrix with one row per sample (converted with ``np.asarray``).
        y: Target values, one per row of ``X`` (converted with ``np.asarray``).

    Returns:
        Tuple ``(X_bootstrap, y_bootstrap)`` holding the sampled rows of ``X``
        and the matching entries of ``y``.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of samples.
    """
    # Normalize the inputs so that positional row indexing below works for lists etc. as well
    X, y = np.asarray(X), np.asarray(y)
    # Get the number of samples from the shape of the original feature matrix
    n_samples = X.shape[0]
    # Refuse mismatched inputs; otherwise a longer y would be silently truncated
    if y.shape[0] != n_samples:
        raise ValueError(
            f"X and y must contain the same number of samples, got {n_samples} and {y.shape[0]}"
        )
    # Sample randomly n_samples indices 0..(n_samples-1) with replacement
    random_sample_indices = np.random.choice(n_samples, n_samples, replace=True)
    # Use the same indices for both arrays so features and targets stay aligned
    return X[random_sample_indices], y[random_sample_indices]

# Draw one bootstrap sample from the toy classification data
X_bootstrap, y_bootstrap = create_bootstrap_sample(X_classification, y_classification)

# Print the shapes of the sampled feature matrix and target vector
print('Shape of X_bootstrap: {}'.format(X_bootstrap.shape))
print('Shape of y_bootstrap: {}'.format(y_bootstrap.shape))

Shape of X_bootstrap: (10, 4)
Shape of y_bootstrap: (10,)

# Preview the full dataset for comparison with the feature-sampled view below
df_loan.head(10)

# Randomly sample 2 of the feature names without replacement (fixed seed for reproducibility)
sampled_features = (
    pd.Series(feature_names)
    .sample(n=2, replace=False, random_state=42)
    .tolist()
)

# Create DataFrame that contains only the sampled feature columns
df_loan_sampled = df_loan[sampled_features]

# Preview the reduced DataFrame
df_loan_sampled.head(10)

def sample_feature_indices(X, max_features: int=None):
    """Return a random subset of the column indices of ``X`` (without replacement).

    The indices are drawn with ``np.random.choice``, i.e. from NumPy's global
    random state.

    Args:
        X: Feature matrix; its second dimension gives the number of available features.
        max_features: Upper bound on the number of sampled indices. ``None`` or a
            value below 1 keeps all features; a value larger than the number
            of available features is capped to that number.

    Returns:
        1-D array with the sampled column indices in random order.
    """
    # Get the number of available features
    n_available = X.shape[1]
    # By default every feature is part of the sample (in shuffled order)
    sample_size = n_available
    # max_features=1 is a valid request for a single feature and must not be ignored
    if max_features is not None and max_features >= 1:
        sample_size = min(n_available, max_features)
    # Return a random sample of available indices of size sample_size (without replacement)
    return np.random.choice(np.arange(n_available), size=sample_size, replace=False)

# Draw five independent samples of 2 feature indices to show that the selection varies between calls
for _ in range(5):
    sampled_indices = sample_feature_indices(X_classification, max_features=2)
    print(f"Sampled feature indices: {sampled_indices}")

Sampled feature indices: [3 1]
Sampled feature indices: [3 1]
Sampled feature indices: [1 3]
Sampled feature indices: [3 0]
Sampled feature indices: [3 2]

def train_random_forest_classifer(X, y, n_estimators: int=100, max_depth: int=None, min_samples_split: int=2, max_features: int=None):
    """Train a Random Forest classifier represented as a plain list of Decision Trees.

    Every tree is fitted on its own bootstrap sample of ``(X, y)``. Feature
    sampling is not done here; ``max_features`` is handed to the Decision Tree
    implementation.

    Args:
        X: Feature matrix of the training data.
        y: Class labels of the training data.
        n_estimators: Number of trees to train.
        max_depth: Passed to ``DecisionTreeClassifier``.
        min_samples_split: Passed to ``DecisionTreeClassifier``.
        max_features: Passed to ``DecisionTreeClassifier``.

    Returns:
        List with ``n_estimators`` fitted ``DecisionTreeClassifier`` instances.
    """
    def fit_single_tree():
        # Each tree gets a fresh bootstrap sample of the training data
        X_sample, y_sample = create_bootstrap_sample(X, y)
        estimator = DecisionTreeClassifier(
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            max_features=max_features,
        )
        # fit() hands back the fitted estimator
        return estimator.fit(X_sample, y_sample)

    # The list of independently trained trees is the Random Forest
    return [fit_single_tree() for _ in range(n_estimators)]

# Seed NumPy's global random number generator (used by the bootstrap sampling) for reproducibility
np.random.seed(11)

# Train the hand-rolled Random Forest classifier with its default settings
random_forest_classifier = train_random_forest_classifer(X_classification, y_classification)

# The forest is a plain list, so its length is the number of trained trees
print(f"Number of classification trees in Random Forest: {len(random_forest_classifier)}")

Number of classification trees in Random Forest: 100

def train_random_forest_regressor(X, y, n_estimators: int=100, max_depth: int=None, min_sample_split: int=2, max_features: int=None):
    """Train a Random Forest regressor represented as a plain list of Decision Trees.

    Every tree is fitted on its own bootstrap sample of ``(X, y)``. Feature
    sampling is not done here; ``max_features`` is handed to the Decision Tree
    implementation.

    Args:
        X: Feature matrix of the training data.
        y: Target values of the training data.
        n_estimators: Number of trees to train.
        max_depth: Passed to ``DecisionTreeRegressor``.
        min_sample_split: Passed to ``DecisionTreeRegressor`` as ``min_samples_split``.
        max_features: Passed to ``DecisionTreeRegressor``.

    Returns:
        List with ``n_estimators`` fitted ``DecisionTreeRegressor`` instances.
    """
    def fit_single_tree():
        # Each tree gets a fresh bootstrap sample of the training data
        X_sample, y_sample = create_bootstrap_sample(X, y)
        estimator = DecisionTreeRegressor(
            max_depth=max_depth,
            min_samples_split=min_sample_split,
            max_features=max_features,
        )
        # fit() hands back the fitted estimator
        return estimator.fit(X_sample, y_sample)

    # The list of independently trained trees is the Random Forest
    return [fit_single_tree() for _ in range(n_estimators)]

# Seed NumPy's global random number generator (used by the bootstrap sampling) for reproducibility
np.random.seed(11)

# Train the hand-rolled Random Forest regressor with its default settings
random_forest_regressor = train_random_forest_regressor(X_regression, y_regression)

# The forest is a plain list, so its length is the number of trained trees
print(f"Number of regression trees in Random Forest: {len(random_forest_regressor)}")

Number of regression trees in Random Forest: 100

# Plot the first three trees of the forest, one figure per tree, to compare their structure
for idx in range(3):
    plt.figure(figsize=(8, 8))
    plot_tree(random_forest_classifier[idx], filled=True, feature_names=feature_names)
    plt.show()

def predict_classes(X, random_forest_classifier):
    """Predict class labels by majority vote over all trees of the forest.

    Labels may be of any type that ``np.unique`` can sort (negative integers,
    floats, strings, ...). If several labels receive the same number of votes,
    the smallest label wins.

    Args:
        X: Feature matrix of the samples to classify.
        random_forest_classifier: Iterable of fitted trees, each offering ``predict(X)``.

    Returns:
        1-D array with one predicted label per sample.

    Raises:
        ValueError: If the forest does not contain any tree.
    """
    trees = list(random_forest_classifier)
    if not trees:
        raise ValueError("random_forest_classifier must contain at least one tree")
    # Pass X to all trees to get all predictions (one row per tree, one column per sample)
    votes = np.asarray([tree.predict(X) for tree in trees])
    majority_labels = []
    # Transposing yields the votes of all trees for one sample at a time
    for sample_votes in votes.T:
        # np.unique returns sorted labels, so argmax (first maximum) resolves ties to the smallest label
        labels, counts = np.unique(sample_votes, return_counts=True)
        majority_labels.append(labels[np.argmax(counts)])
    return np.array(majority_labels)

# Classify the toy samples with the hand-rolled forest (majority vote over all trees)
y_pred_classes = predict_classes(X_classification, random_forest_classifier)

print(f"Predicted classes for toy dataset: {y_pred_classes}")

Predicted classes for toy dataset: [1 1 0 1 0 0 1 0 0 1]

def predict_values(X, random_forest_regressor):
    """Predict target values by averaging the predictions of all trees of the forest.

    Args:
        X: Feature matrix of the samples to predict.
        random_forest_regressor: Iterable of fitted trees, each offering ``predict(X)``.

    Returns:
        Array with the mean prediction per sample.
    """
    # Collect the prediction vector of every tree
    per_tree_predictions = [tree.predict(X) for tree in random_forest_regressor]
    # Stack them as rows: one row per tree, one column per sample
    prediction_matrix = np.asarray(per_tree_predictions)
    # Averaging over the tree axis leaves one value per sample
    return prediction_matrix.mean(axis=0)

# Predict the toy targets with the hand-rolled forest (mean over all trees)
y_pred_values = predict_values(X_regression, random_forest_regressor)

print(f"Predicted values for toy dataset: {y_pred_values}")

Predicted values for toy dataset: [24.04 32.59 30.78 14.26 28.46 36.38 21.56 54.68 33.84 15.81]

# Fit the project's Random Forest classifier on the toy classification dataset
classifier = SeleneRandomForestClassifier().fit(X_classification, y_classification)

# Store the predictions under the name that is printed below; previously they were assigned
# to y_pred_values, so the print showed the stale result of the hand-rolled forest instead
y_pred_classes = classifier.predict(X_classification)

print(f"Predicted classes for toy dataset: {y_pred_classes}")

Predicted classes for toy dataset: [1 1 0 1 0 0 1 0 0 1]

# Fit the project's Random Forest regressor on the toy regression dataset
regressor = SeleneRandomForestRegressor().fit(X_regression, y_regression)

# Predict the targets of the training samples themselves
y_pred_values = regressor.predict(X_regression)

print(y_pred_values)

[24.99 31.94 31.46 13.78 26.13 36.2  21.39 57.09 33.58 15.07]

# Load the breast cancer dataset into a DataFrame
df_cancer = pd.read_csv(breast_cancer)

# Preview the first rows (only displayed when run as the last expression of a notebook cell)
df_cancer.head()

# All columns except the last one are used as features; the last column is the target
X_cancer = df_cancer.iloc[:, 0:-1].to_numpy()
y_cancer = df_cancer.iloc[:, -1].to_numpy().squeeze()

# Split dataset in training and test data (25% test data); fixed random_state for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, test_size=0.25, random_state=0)

print(f"Size of training dataset: {len(X_train)}")
print(f"Size of test dataset: {len(X_test)}")

Size of training dataset: 426
Size of test dataset: 143

# Baseline: fit 20 Decision Trees on the identical training data and compare their structure
for _ in range(20):
    # Train Decision Tree classifier using original training data
    tree = DecisionTreeClassifier().fit(X_train, y_train)
    # Print core features of trained Decision Tree
    # (feature index of root node, total of number of nodes in Decision Tree)
    print(f"root feature: {tree.tree_.feature[0]},\t#nodes: {tree.tree_.node_count}")

root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33
root feature: 22,	#nodes: 33

# Fit 20 Decision Trees, each on a different bootstrap sample, and compare their structure
for _ in range(20):
    # Create a new bootstrap sample (a fresh SeleneRandomForestClassifier is built only to call its sampling method)
    X_t, y_t = SeleneRandomForestClassifier().create_bootstrap_sample(X_train, y_train)
    # Train Decision Tree classifier using bootstrap sample
    tree = DecisionTreeClassifier().fit(X_t, y_t)
    # Print core features of trained Decision Tree
    # (feature index of root node, total of number of nodes in Decision Tree)
    print(f"root feature: {tree.tree_.feature[0]},\t#nodes: {tree.tree_.node_count}")

root feature: 22,	#nodes: 23
root feature: 27,	#nodes: 21
root feature: 22,	#nodes: 27
root feature: 22,	#nodes: 23
root feature: 22,	#nodes: 27
root feature: 22,	#nodes: 31
root feature: 22,	#nodes: 27
root feature: 27,	#nodes: 25
root feature: 22,	#nodes: 25
root feature: 23,	#nodes: 25
root feature: 22,	#nodes: 29
root feature: 22,	#nodes: 25
root feature: 27,	#nodes: 23
root feature: 27,	#nodes: 31
root feature: 22,	#nodes: 25
root feature: 22,	#nodes: 21
root feature: 7,	#nodes: 27
root feature: 22,	#nodes: 27
root feature: 22,	#nodes: 17
root feature: 22,	#nodes: 25

# Fit 20 Decision Trees, each on a different bootstrap sample and with feature sampling enabled
for _ in range(20):
    # Create a new bootstrap sample (a fresh SeleneRandomForestClassifier is built only to call its sampling method)
    X_t, y_t = SeleneRandomForestClassifier().create_bootstrap_sample(X_train, y_train)
    # Train a Decision Tree classifier with a limited max_features argument (here: 6)
    tree = DecisionTreeClassifier(max_features=6).fit(X_t, y_t)
    # Print core features of trained Decision Tree
    # (feature index of root node, total of number of nodes in Decision Tree)
    print(f"root feature: {tree.tree_.feature[0]},\t#nodes: {tree.tree_.node_count}")

root feature: 7,	#nodes: 29
root feature: 0,	#nodes: 39
root feature: 7,	#nodes: 37
root feature: 22,	#nodes: 33
root feature: 6,	#nodes: 35
root feature: 27,	#nodes: 37
root feature: 27,	#nodes: 31
root feature: 6,	#nodes: 29
root feature: 2,	#nodes: 37
root feature: 22,	#nodes: 37
root feature: 20,	#nodes: 33
root feature: 22,	#nodes: 35
root feature: 22,	#nodes: 35
root feature: 2,	#nodes: 31
root feature: 20,	#nodes: 37
root feature: 7,	#nodes: 31
root feature: 20,	#nodes: 39
root feature: 22,	#nodes: 27
root feature: 20,	#nodes: 35
root feature: 22,	#nodes: 31

# Train the project's Random Forest classifier on the training split with max_features=6
selene_rf = SeleneRandomForestClassifier(max_features=6).fit(X_train, y_train)

# Predict the labels of the held-out test split
y_pred = selene_rf.predict(X_test)

# Report precision, recall and F1 score per class on the test split
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98        93
           1       0.98      0.96      0.97        50

    accuracy                           0.98       143
   macro avg       0.98      0.97      0.98       143
weighted avg       0.98      0.98      0.98       143

# Train scikit-learn's Random Forest classifier with the same max_features setting for comparison
sklearn_rf = RandomForestClassifier(max_features=6).fit(X_train, y_train)

# Predict the labels of the held-out test split
y_pred = sklearn_rf.predict(X_test)

# Report precision, recall and F1 score per class on the test split
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97        93
           1       0.96      0.94      0.95        50

    accuracy                           0.97       143
   macro avg       0.96      0.96      0.96       143
weighted avg       0.96      0.97      0.96       143

# Train a single scikit-learn Decision Tree on the training split for comparison with the forests
sklearn_dt = DecisionTreeClassifier().fit(X_train, y_train)

# Predict the labels of the held-out test split
y_pred = sklearn_dt.predict(X_test)

# Report precision, recall and F1 score per class on the test split
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.94      0.95        93
           1       0.89      0.94      0.91        50

    accuracy                           0.94       143
   macro avg       0.93      0.94      0.93       143
weighted avg       0.94      0.94      0.94       143

	ANNUAL_INCOME	CREDIT_SCORE	DEBT_RATIO	YEARS_EMPLOYED	DEFAULTED	LOAN_AMOUNT
0	45	620	0.42	2	1	28
1	82	710	0.21	8	1	32
2	60	680	0.35	5	0	30
3	38	590	0.51	1	1	12
4	95	760	0.18	12	0	15
5	72	700	0.29	7	0	38
6	50	640	0.47	3	1	20
7	11	790	0.15	15	0	70
8	67	675	0.33	6	0	34
9	40	605	0.49	2	1	15

	ANNUAL_INCOME	CREDIT_SCORE	DEBT_RATIO	YEARS_EMPLOYED	DEFAULTED	LOAN_AMOUNT
0	45	620	0.42	2	1	28
1	82	710	0.21	8	1	32
2	60	680	0.35	5	0	30
3	38	590	0.51	1	1	12
4	95	760	0.18	12	0	15
5	72	700	0.29	7	0	38
6	50	640	0.47	3	1	20
7	11	790	0.15	15	0	70
8	67	675	0.33	6	0	34
9	40	605	0.49	2	1	15

	ANNUAL_INCOME	CREDIT_SCORE	DEBT_RATIO	YEARS_EMPLOYED	DEFAULTED	LOAN_AMOUNT
6	50	640	0.47	3	1	20
3	38	590	0.51	1	1	12
7	11	790	0.15	15	0	70
4	95	760	0.18	12	0	15
6	50	640	0.47	3	1	20
9	40	605	0.49	2	1	15
2	60	680	0.35	5	0	30
6	50	640	0.47	3	1	20
7	11	790	0.15	15	0	70
4	95	760	0.18	12	0	15

	ANNUAL_INCOME	CREDIT_SCORE	DEBT_RATIO	YEARS_EMPLOYED	DEFAULTED	LOAN_AMOUNT
0	45	620	0.42	2	1	28
1	82	710	0.21	8	1	32
2	60	680	0.35	5	0	30
3	38	590	0.51	1	1	12
4	95	760	0.18	12	0	15
5	72	700	0.29	7	0	38
6	50	640	0.47	3	1	20
7	11	790	0.15	15	0	70
8	67	675	0.33	6	0	34
9	40	605	0.49	2	1	15

	CREDIT_SCORE	YEARS_EMPLOYED
0	620	2
1	710	8
2	680	5
3	590	1
4	760	12
5	700	7
6	640	3
7	790	15
8	675	6
9	605	2

Random Forests

Setting up the Notebook

Make Required Imports

Download Required Data

Preparation of Toy Dataset

Quick Recap: Decision Trees

Basic Idea

Pros and Cons

Random Forest — Core Components

Bootstrap Sampling (Bagging)

Feature Sampling

Training the Model

Classifier Training

Regressor Training

Making Predictions

Predicting Classes

Predicting Values

Worked Example

Practical Implementation

Real-World Dataset

Discussion: Pros & Cons

Summary

	radius1	texture1	perimeter1	area1	smoothness1	compactness1	concavity1	concave_points1	symmetry1	fractal_dimension1	...	texture3	perimeter3	area3	smoothness3	compactness3	concavity3	concave_points3	symmetry3	fractal_dimension3	diagnosis
0	20.48	21.46	132.50	1306.0	0.08355	0.08348	0.09042	0.06022	0.1467	0.05177	...	26.17	161.70	1750.0	0.1228	0.23110	0.31580	0.14450	0.2238	0.07127	1
1	13.15	15.34	85.31	538.9	0.09384	0.08498	0.09293	0.03483	0.1822	0.06207	...	20.50	97.67	677.3	0.1478	0.22560	0.30090	0.09722	0.3849	0.08633	0
2	10.17	14.88	64.55	311.9	0.11340	0.08061	0.01084	0.01290	0.2743	0.06960	...	17.45	69.86	368.6	0.1275	0.09866	0.02168	0.02579	0.3557	0.08020	0
3	14.90	22.53	102.10	685.0	0.09947	0.22250	0.27330	0.09711	0.2041	0.06898	...	27.57	125.40	832.7	0.1419	0.70900	0.90190	0.24750	0.2866	0.11550	1
4	20.73	31.12	135.70	1419.0	0.09469	0.11430	0.13670	0.08646	0.1769	0.05674	...	47.16	214.00	3432.0	0.1401	0.26440	0.34420	0.16590	0.2868	0.08218	1