%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.tree import plot_tree

from src.utils.data.files import *
from src.models.trees.dtree import SeleneDecisionTreeClassifier, SeleneDecisionTreeRegressor

# Fetch the two CSV datasets used in this notebook. Only the first element of
# each returned pair is kept; it is later handed to pd.read_csv.
loan_data ,_     = download_dataset("tabular/classification/example-loan-default-data.csv")
breast_cancer, _ = download_dataset("tabular/classification/breast-cancer-wisconsin-classification.csv")

File 'data/datasets/tabular/classification/example-loan-default-data.csv' already exists (use 'overwrite=True' to overwrite it).
File 'data/datasets/tabular/classification/breast-cancer-wisconsin-classification.csv' already exists (use 'overwrite=True' to overwrite it).

# Load the loan toy dataset into a DataFrame
df_loan = pd.read_csv(loan_data)

# Preview the first 15 rows
df_loan.head(15)

# Classification task: every column except the two target columns is a feature
X_classification = df_loan.drop(["DEFAULTED", "LOAN_AMOUNT"], axis=1).to_numpy()
y_classification = df_loan["DEFAULTED"].to_numpy()

# Regression task: same feature matrix, but LOAN_AMOUNT is the target
X_regression = df_loan.drop(["DEFAULTED", "LOAN_AMOUNT"], axis=1).to_numpy()
y_regression = df_loan["LOAN_AMOUNT"].to_numpy()

# Short display names passed to plot_tree
# NOTE(review): assumed to follow the column order of the feature matrix -- confirm
feature_names = ["INCOME", "SCORE", "RATIO", "YEARS"]

# Seed NumPy's global RNG before fitting the reference tree
np.random.seed(1)

# Fit scikit-learn's decision tree on the classification data
sklearn_classifier = DecisionTreeClassifier().fit(X_classification, y_classification)

# Draw the fitted tree
plt.figure(figsize=(8, 8))
plot_tree(sklearn_classifier, filled=True, feature_names=feature_names)
plt.show()

def compute_thresholds(feature_values):
    """Return the candidate split thresholds for a single feature.

    The thresholds are the midpoints between each pair of neighbouring
    distinct values of the feature, in ascending order.

    Args:
        feature_values: 1-D array-like with the values of one feature.

    Returns:
        NumPy array of midpoints; empty when there are fewer than two
        distinct values.
    """
    # np.unique removes duplicates and returns the values sorted
    distinct = np.unique(feature_values)
    lower, upper = distinct[:-1], distinct[1:]
    # Midpoint between every pair of consecutive distinct values
    return (lower + upper) / 2.0

# Extract the YEARS_EMPLOYED column as a NumPy array
years = df_loan.YEARS_EMPLOYED.to_numpy()
print(f"Years of emplayment for all customers:\n{years}\n")

# Candidate split thresholds for this feature
years_thresholds = compute_thresholds(years)
print(f"All meaningful thresholds of years of employment:\n{years_thresholds}")

Years of emplayment for all customers:
[ 2  8  5  1 12  7  3 15  6  2]

All meaningful thresholds of years of employment:
[ 1.5  2.5  4.   5.5  6.5  7.5 10.  13.5]

def generate_split(feature_values, threshold):
    """Partition sample indices by comparing a feature against a threshold.

    Args:
        feature_values: NumPy array with the values of one feature.
        threshold: value to compare against.

    Returns:
        Tuple (indices_left, indices_right): indices of the samples with
        value <= threshold, and indices of the samples with value > threshold.
    """
    goes_left = feature_values <= threshold
    goes_right = feature_values > threshold
    return np.nonzero(goes_left)[0], np.nonzero(goes_right)[0]

# Show the left/right index partition produced by every candidate threshold
for threshold in years_thresholds:
    left, right = generate_split(years, threshold)
    print(f"Left: {left}  Right: {right}")

Left: [3]  Right: [0 1 2 4 5 6 7 8 9]
Left: [0 3 9]  Right: [1 2 4 5 6 7 8]
Left: [0 3 6 9]  Right: [1 2 4 5 7 8]
Left: [0 2 3 6 9]  Right: [1 4 5 7 8]
Left: [0 2 3 6 8 9]  Right: [1 4 5 7]
Left: [0 2 3 5 6 8 9]  Right: [1 4 7]
Left: [0 1 2 3 5 6 8 9]  Right: [4 7]
Left: [0 1 2 3 4 5 6 8 9]  Right: [7]

def compute_gini_score_node(t):
    """Return the Gini score of a node.

    Args:
        t: sequence of class labels of the samples in the node.

    Returns:
        1 minus the sum of the squared relative class frequencies.
    """
    # Number of occurrences of each distinct class label in the node
    _, class_counts = np.unique(t, return_counts=True)
    class_fractions = class_counts / len(t)
    return 1 - np.sum(np.square(class_fractions))

# Size of the artificial two-class node
num_samples = 10

# Sweep from "all 1s" to "all 0s" and print the Gini score of each distribution
for num_no in range (0, num_samples+1):
    distribution = [0]*num_no + [1]*(num_samples-num_no)
    print(f"Gini({distribution}) =  {compute_gini_score_node(distribution):.3f}")

Gini([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) =  0.000
Gini([0, 1, 1, 1, 1, 1, 1, 1, 1, 1]) =  0.180
Gini([0, 0, 1, 1, 1, 1, 1, 1, 1, 1]) =  0.320
Gini([0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) =  0.420
Gini([0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) =  0.480
Gini([0, 0, 0, 0, 0, 1, 1, 1, 1, 1]) =  0.500
Gini([0, 0, 0, 0, 0, 0, 1, 1, 1, 1]) =  0.480
Gini([0, 0, 0, 0, 0, 0, 0, 1, 1, 1]) =  0.420
Gini([0, 0, 0, 0, 0, 0, 0, 0, 1, 1]) =  0.320
Gini([0, 0, 0, 0, 0, 0, 0, 0, 0, 1]) =  0.180
Gini([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) =  0.000

# Same sweep as before with 20 samples, collecting the scores for plotting
num_samples, gini_scores = 20, []

# Entry k holds the Gini score of a node with k zeros and (num_samples - k) ones
for num_no in range (0, num_samples+1):
    gini_scores.append(compute_gini_score_node([0]*num_no + [1]*(num_samples-num_no)))

# Plot the Gini score against the number of zeros in the node
plt.figure()
plt.plot(range(0, len(gini_scores)), gini_scores, marker='o', ls='-', lw=3)
plt.xlabel("Distribution", fontsize=14)
plt.ylabel("Gini Score", fontsize=14)
plt.show()

def compute_gini_score_split(t_left, t_right):
    """Return the Gini score of a split.

    Args:
        t_left: class labels of the samples in the left child.
        t_right: class labels of the samples in the right child.

    Returns:
        Average of the two child Gini scores, each weighted by the child's
        share of the samples.
    """
    n_left, n_right = len(t_left), len(t_right)
    n_total = n_left + n_right
    # Each child contributes in proportion to the number of samples it holds
    weighted_left = n_left / n_total * compute_gini_score_node(t_left)
    weighted_right = n_right / n_total * compute_gini_score_node(t_right)
    return weighted_left + weighted_right

# Score two hand-made splits of the same 10 labels (5 zeros... per child lists below)
gini_split1 = compute_gini_score_split([0, 0, 0, 0, 0, 1], [0, 1, 1, 1])
gini_split2 = compute_gini_score_split([0, 0], [0, 0, 0, 0, 1, 1, 1, 1])

# The lower score marks the better split
print(f"Gini score of Split 1: {gini_split1:.2f}")
print(f"Gini score of Split 2: {gini_split2:.2f}")

Gini score of Split 1: 0.32
Gini score of Split 2: 0.40

def compute_mse_score_node(y):
    """Return the mean squared error (MSE) of a node.

    Args:
        y: target values of the samples in the node.

    Returns:
        Mean of the squared deviations of the targets from their mean.
    """
    # Deviation of every target from the node's mean target
    deviations = y - np.mean(y)
    # Sum of squared deviations, averaged over the number of samples
    return np.sum(np.square(deviations)) / len(y)

# Extract the LOAN_AMOUNT column as a NumPy array
amounts = df_loan.LOAN_AMOUNT.to_numpy()
print(f"All loan ammount (x10k):\n{amounts}\n")

# MSE of the unsplit node containing all loan amounts
mse_amounts = compute_mse_score_node(amounts)
print(f"MSE of all loan amounts: {mse_amounts}")

All loan ammount (x10k):
[28 32 30 12 15 38 20 70 34 15]

MSE of all loan amounts: 255.84

def compute_mse_score_split(y_left, y_right):
    """Return the MSE score of a split.

    Args:
        y_left: target values of the samples in the left child.
        y_right: target values of the samples in the right child.

    Returns:
        Average of the two child MSE scores, each weighted by the child's
        share of the samples.
    """
    n_left, n_right = len(y_left), len(y_right)
    n_total = n_left + n_right
    # Each child contributes in proportion to the number of samples it holds
    weighted_left = n_left / n_total * compute_mse_score_node(y_left)
    weighted_right = n_right / n_total * compute_mse_score_node(y_right)
    return weighted_left + weighted_right

# Score two hand-made splits of the ten loan amounts
mse_split1 = compute_mse_score_split([28, 32, 30, 12, 15], [38, 20, 70, 34, 15])
mse_split2 = compute_mse_score_split([15, 12, 15, 20], [32, 28, 70, 38, 30, 34])

# The lower score marks the better split
print(f"MSE of Split 1: {mse_split1:.2f}")
print(f"MSE of Split 2: {mse_split2:.2f}")

MSE of Split 1: 219.84
MSE of Split 2: 127.03

def find_best_feature_split(x, y, split_scoring_func):
    """Find the lowest-scoring threshold split for a single feature.

    Args:
        x: NumPy array with the values of one feature.
        y: NumPy array of targets, aligned with x.
        split_scoring_func: callable taking (y_left, y_right) and returning
            a score where lower is better.

    Returns:
        Tuple (best_score, best_threshold, best_split), where best_split is
        the (indices_left, indices_right) pair of the winning threshold.
        When the feature has no candidate threshold the result is
        (np.inf, None, None).
    """
    # Running winner as (score, threshold, split)
    winner = (np.inf, None, None)
    for candidate in compute_thresholds(x):
        partition = generate_split(x, candidate)
        left_idx, right_idx = partition
        # Score the targets that end up on either side of the candidate
        candidate_score = split_scoring_func(y[left_idx], y[right_idx])
        # Strict comparison: on ties the earlier (smaller) threshold is kept
        if candidate_score < winner[0]:
            winner = (candidate_score, candidate, partition)
    return winner

# Find best split for the feature in column 1 of X_classification. Going by the
# column order of the loan dataset this is the credit score, not the years of
# employment (that feature sits in column 3).
best_score, best_threshold, best_split = find_best_feature_split(X_classification[:,1], y_classification, compute_gini_score_split)

# The split is a pair of index arrays: samples going left and samples going right
indices_left, indices_right = best_split

print(f"Best score:\t{best_score:.3g}")
print(f"Best threshold:\t{best_threshold:.3g}")
print(f"Best split:\t{indices_left}, {indices_right}")

Best score:	0.167
Best threshold:	658
Best split:	[0 3 6 9], [1 2 4 5 7 8]

def sample_feature_indices(X, max_features=None):
    """Randomly pick column indices of X without replacement.

    Args:
        X: 2-D array whose second dimension is the number of features.
        max_features: upper bound on how many indices to draw; None draws
            every column index (in random order).

    Returns:
        NumPy array of distinct column indices drawn with NumPy's global RNG.
    """
    total = X.shape[1]
    # Never ask for more indices than there are columns
    how_many = total if max_features is None else min(total, max_features)
    return np.random.choice(np.arange(total), size=how_many, replace=False)

# Draw five independent samples of (at most) two feature indices
for _ in range(5):
    sampled_feature_indices = sample_feature_indices(X_classification, max_features=2)
    print(f"Sampled indices: {sampled_feature_indices}")

Sampled indices: [1 2]
Sampled indices: [0 2]
Sampled indices: [3 2]
Sampled indices: [2 3]
Sampled indices: [3 2]

def find_best_split(X, y, split_scoring_func, max_features=None):
    """Find the lowest-scoring split over a (possibly sampled) set of features.

    Args:
        X: 2-D NumPy feature matrix; each column is one feature.
        y: NumPy array of targets, aligned with the rows of X.
        split_scoring_func: callable taking (y_left, y_right) and returning
            a score where lower is better.
        max_features: upper bound on the number of features to consider;
            None considers every feature.

    Returns:
        Tuple (best_score, best_feature, best_threshold, best_split).
        best_feature is a column index of X and best_split the
        (indices_left, indices_right) pair of the winning split. When none of
        the considered features admits a split, the result is
        (np.inf, None, None, None).
    """
    # Initialize the return values
    best_score, best_feature, best_threshold, best_split = np.inf, None, None, None
    # Only the sampled columns are candidates, so max_features actually
    # restricts the search. With max_features=None all columns are visited.
    for feature in sample_feature_indices(X, max_features=max_features):
        # Best split achievable on this column alone
        score, threshold, split = find_best_feature_split(X[:, feature], y, split_scoring_func)
        # Strict comparison: a feature without any valid threshold reports a
        # score of np.inf and must never replace the (None) initial values
        if score < best_score:
            # int() keeps the returned index a plain Python integer
            best_score, best_feature, best_split, best_threshold = score, int(feature), split, threshold
    # Return the best split together with the relevant information
    return best_score, best_feature, best_threshold, best_split

# Find best split over the features (max_features is left at its default of None)
best_score, best_feature, best_threshold, best_split = find_best_split(X_classification, y_classification, compute_gini_score_split)

# The split is a pair of index arrays: samples going left and samples going right
indices_left, indices_right = best_split

print(f"Best score:\t{best_score:.3g}")
print(f"Best feature:\t{best_feature} (index)")
print(f"Best threshold:\t{best_threshold:.3g}")
print(f"Best split:\t{indices_left}, {indices_right}")

Best score:	0.167
Best feature:	3 (index)
Best threshold:	4
Best split:	[0 3 6 9], [1 2 4 5 7 8]

# Seed NumPy's global RNG before fitting
np.random.seed(12)

# Fit the project's decision tree classifier on the loan data
classifier = SeleneDecisionTreeClassifier().fit(X_classification, y_classification)

# Print the textual representation of the fitted tree
print(classifier)

 X[3], threshold=4, score=0.5, samples=[1 1 0 1 0 0 1 0 0 1]
--- Leaf, gini: 0.000, samples: [1 1 1 1]
--- X[0], threshold=77, score=0.278, samples=[1 0 0 0 0 0]
------ Leaf, gini: 0.000, samples: [0 0 0 0]
------ X[3], threshold=10, score=0.5, samples=[1 0]
--------- Leaf, gini: 0.000, samples: [1]
--------- Leaf, gini: 0.000, samples: [0]

# Seed NumPy's global RNG before fitting
np.random.seed(8)

# Fit scikit-learn's classifier on the same data for comparison
sklearn_classifier = DecisionTreeClassifier().fit(X_classification, y_classification)

# Draw the fitted scikit-learn tree
plt.figure(figsize=(8, 8))
plot_tree(sklearn_classifier, filled=True, feature_names=feature_names)
plt.show()

# Re-seed before fitting the regression tree
np.random.seed(0)

# Fit the project's decision tree regressor on the loan amounts
regressor = SeleneDecisionTreeRegressor().fit(X_regression, y_regression)

# Print the textual representation of the fitted tree
print(regressor)

 X[0], threshold=24.5, score=256, samples=[28 32 30 12 15 38 20 70 34 15]
--- Leaf, gini: 0.000, samples: [70]
--- X[2], threshold=0.445, score=80.8, samples=[28 32 30 12 15 38 20 34 15]
------ X[1], threshold=735, score=51.9, samples=[28 32 30 15 38 34]
--------- X[3], threshold=5.5, score=11.8, samples=[28 32 30 38 34]
------------ X[0], threshold=52.5, score=1, samples=[28 30]
--------------- Leaf, gini: 0.000, samples: [28]
--------------- Leaf, gini: 0.000, samples: [30]
------------ X[1], threshold=705, score=6.22, samples=[32 38 34]
--------------- X[3], threshold=6.5, score=4, samples=[38 34]
------------------ Leaf, gini: 0.000, samples: [34]
------------------ Leaf, gini: 0.000, samples: [38]
--------------- Leaf, gini: 0.000, samples: [32]
--------- Leaf, gini: 0.000, samples: [15]
------ X[1], threshold=622, score=10.9, samples=[12 20 15]
--------- X[1], threshold=598, score=2.25, samples=[12 15]
------------ Leaf, gini: 0.000, samples: [12]
------------ Leaf, gini: 0.000, samples: [15]
--------- Leaf, gini: 0.000, samples: [20]

# Same seed as the previous regressor fit
np.random.seed(0)

# Fit the project's regressor again, this time passing max_depth=2
regressor = SeleneDecisionTreeRegressor(max_depth=2).fit(X_regression, y_regression)

# Print the textual representation of the fitted tree
print(regressor)

 X[0], threshold=24.5, score=256, samples=[28 32 30 12 15 38 20 70 34 15]
--- Leaf, gini: 0.000, samples: [70]
--- X[2], threshold=0.445, score=80.8, samples=[28 32 30 12 15 38 20 34 15]
------ Leaf, gini: 51.917, samples: [28 32 30 15 38 34]
------ Leaf, gini: 10.889, samples: [12 20 15]

# Seed NumPy's global RNG before fitting
np.random.seed(2)

# Fit scikit-learn's regressor with the same depth limit for comparison
sklearn_regressor = DecisionTreeRegressor(max_depth=2).fit(X_regression, y_regression)

# Draw the fitted scikit-learn tree
plt.figure(figsize=(8, 8))
plot_tree(sklearn_regressor, filled=True, feature_names=feature_names)
plt.show()

def get_targets_for_sample(node, x):
    """Return the targets stored in the leaf that a sample ends up in.

    Args:
        node: tree node to start from. Inner nodes are read through
            feature_idx, threshold, left_child and right_child; leaves
            through y. is_leaf() tells the two apart.
        x: indexable feature vector of one sample.

    Returns:
        The y attribute of the leaf reached by the sample.
    """
    current = node
    # Walk down until a leaf is reached
    while not current.is_leaf():
        # Values up to and including the threshold go left, larger ones right
        if x[current.feature_idx] <= current.threshold:
            current = current.left_child
        else:
            current = current.right_child
    return current.y

def get_targets(X, decision_tree):
    """Return, for every sample in X, the targets of the leaf it falls into.

    Args:
        X: iterable of feature vectors, one per sample.
        decision_tree: root node of the tree to traverse.

    Returns:
        List with one entry per sample, in the order of X.
    """
    leaf_targets = []
    for sample in X:
        leaf_targets.append(get_targets_for_sample(decision_tree, sample))
    return leaf_targets

# Fit a depth-limited classifier so that some leaves hold mixed labels
classifier = SeleneDecisionTreeClassifier(max_depth=2).fit(X_classification, y_classification)

# Print the leaf targets reached by every training sample
for i, targets in enumerate(get_targets(X_classification, classifier.tree)):
    print(f"Targets for Sample {i}:\t{targets}")

Targets for Sample 0:	[1 1 1 1]
Targets for Sample 1:	[1 0]
Targets for Sample 2:	[0 0 0 0]
Targets for Sample 3:	[1 1 1 1]
Targets for Sample 4:	[1 0]
Targets for Sample 5:	[0 0 0 0]
Targets for Sample 6:	[1 1 1 1]
Targets for Sample 7:	[0 0 0 0]
Targets for Sample 8:	[0 0 0 0]
Targets for Sample 9:	[1 1 1 1]

# Fit a depth-limited regressor on the loan amounts
regressor = SeleneDecisionTreeRegressor(max_depth=2).fit(X_regression, y_regression)

# Print the leaf targets reached by every training sample
for i, targets in enumerate(get_targets(X_regression, regressor.tree)):
    print(f"Targets for Sample {i}:\t{targets}")

Targets for Sample 0:	[28 32 30 15 38 34]
Targets for Sample 1:	[28 32 30 15 38 34]
Targets for Sample 2:	[28 32 30 15 38 34]
Targets for Sample 3:	[12 20 15]
Targets for Sample 4:	[28 32 30 15 38 34]
Targets for Sample 5:	[28 32 30 15 38 34]
Targets for Sample 6:	[12 20 15]
Targets for Sample 7:	[70]
Targets for Sample 8:	[28 32 30 15 38 34]
Targets for Sample 9:	[12 20 15]

def predict_classes(X, decision_tree_classifier):
    """Predict a class label for every sample in X.

    Args:
        X: iterable of feature vectors, one per sample.
        decision_tree_classifier: fitted classifier exposing its root node
            as the attribute tree.

    Returns:
        List with, per sample, the most frequent label among the targets of
        the leaf the sample falls into. np.bincount is used for counting, so
        the labels must be non-negative integers.
    """
    predictions = []
    for leaf_labels in get_targets(X, decision_tree_classifier.tree):
        # Position of the largest count is the majority class
        label_counts = np.bincount(leaf_labels)
        predictions.append(label_counts.argmax())
    return predictions

# Fit the project's classifier without a depth limit argument
classifier = SeleneDecisionTreeClassifier().fit(X_classification, y_classification)

# Print the predicted class of every training sample
for i, label in enumerate(predict_classes(X_classification, classifier)):
    print(f"Predicted class for Sample {i}:\t{label}")

Predicted class for Sample 0:	1
Predicted class for Sample 1:	1
Predicted class for Sample 2:	0
Predicted class for Sample 3:	1
Predicted class for Sample 4:	0
Predicted class for Sample 5:	0
Predicted class for Sample 6:	1
Predicted class for Sample 7:	0
Predicted class for Sample 8:	0
Predicted class for Sample 9:	1

def predict_values(X, decision_tree_regressor):
    """Predict a numeric value for every sample in X.

    Args:
        X: iterable of feature vectors, one per sample.
        decision_tree_regressor: fitted regressor exposing its root node as
            the attribute tree.

    Returns:
        List with, per sample, the mean of the targets of the leaf the
        sample falls into.
    """
    per_sample_targets = get_targets(X, decision_tree_regressor.tree)
    # One mean per sample, in the order of X
    return list(map(np.mean, per_sample_targets))

# Fit the project's regressor on the loan amounts
regressor = SeleneDecisionTreeRegressor().fit(X_regression, y_regression)

# A regression prediction is the mean of the targets in the sample's leaf, so
# predict_values is the helper to use here; predict_classes would instead
# return the most frequent integer target of the leaf.
for i, value in enumerate(predict_values(X_regression, regressor)):
    print(f"Predicted value for Sample {i}:\t{value}")

Predicted value for Sample 0:	28
Predicted value for Sample 1:	32
Predicted value for Sample 2:	30
Predicted value for Sample 3:	12
Predicted value for Sample 4:	15
Predicted value for Sample 5:	38
Predicted value for Sample 6:	20
Predicted value for Sample 7:	70
Predicted value for Sample 8:	34
Predicted value for Sample 9:	15

# Load the breast cancer dataset into a DataFrame
df_cancer = pd.read_csv(breast_cancer)

# Preview the first rows
df_cancer.head()

# All columns but the last are features; the last column is the target
X_cancer = df_cancer.iloc[:, 0:-1].to_numpy()
y_cancer = df_cancer.iloc[:, -1].to_numpy().squeeze()

# Split dataset in training and test data (25% test data); the fixed
# random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, test_size=0.25, random_state=0)

print(f"Size of training dataset: {len(X_train)}")
print(f"Size of test dataset: {len(X_test)}")

Size of training dataset: 426
Size of test dataset: 143

# Train the project's classifier on the training split
selene_decision_tree = SeleneDecisionTreeClassifier().fit(X_train, y_train)

# Predict the labels of the held-out test split
y_pred = selene_decision_tree.predict(X_test)

# Per-class precision, recall and F1 on the test split
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.91      0.94        93
           1       0.85      0.94      0.90        50

    accuracy                           0.92       143
   macro avg       0.91      0.93      0.92       143
weighted avg       0.93      0.92      0.92       143

# Reference model: the target is a class label and the predictions are fed to
# classification_report, so scikit-learn's classifier is the matching
# counterpart (a regressor would emit float predictions instead of labels)
sklearn_decision_tree = DecisionTreeClassifier().fit(X_train, y_train)

# Predict the labels of the held-out test split
y_pred = sklearn_decision_tree.predict(X_test)

# Per-class precision, recall and F1 on the test split
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.94      0.95        93
           1       0.89      0.94      0.91        50

    accuracy                           0.94       143
   macro avg       0.93      0.94      0.93       143
weighted avg       0.94      0.94      0.94       143

	radius1	texture1	perimeter1	area1	smoothness1	compactness1	concavity1	concave_points1	symmetry1	fractal_dimension1	...	texture3	perimeter3	area3	smoothness3	compactness3	concavity3	concave_points3	symmetry3	fractal_dimension3	diagnosis
0	20.48	21.46	132.50	1306.0	0.08355	0.08348	0.09042	0.06022	0.1467	0.05177	...	26.17	161.70	1750.0	0.1228	0.23110	0.31580	0.14450	0.2238	0.07127	1
1	13.15	15.34	85.31	538.9	0.09384	0.08498	0.09293	0.03483	0.1822	0.06207	...	20.50	97.67	677.3	0.1478	0.22560	0.30090	0.09722	0.3849	0.08633	0
2	10.17	14.88	64.55	311.9	0.11340	0.08061	0.01084	0.01290	0.2743	0.06960	...	17.45	69.86	368.6	0.1275	0.09866	0.02168	0.02579	0.3557	0.08020	0
3	14.90	22.53	102.10	685.0	0.09947	0.22250	0.27330	0.09711	0.2041	0.06898	...	27.57	125.40	832.7	0.1419	0.70900	0.90190	0.24750	0.2866	0.11550	1
4	20.73	31.12	135.70	1419.0	0.09469	0.11430	0.13670	0.08646	0.1769	0.05674	...	47.16	214.00	3432.0	0.1401	0.26440	0.34420	0.16590	0.2868	0.08218	1

Decision Trees — Implementation from Scratch¶

Setting up the Notebook¶

Make Required Imports¶

Download Required Data¶

Preparation of Toy Dataset¶

Preliminaries¶

Quick Recap: Decision Trees¶

Implementation¶

Generating Splits¶

Scoring Splits¶

Classification Tasks¶

Regression Tasks¶

Finding the Best Split¶

Recursive Splitting¶

Basic Approach¶

Stopping Criteria¶

Practical Implementation¶

Predicting Classes¶

Worked Example¶

Summary¶

	ANNUAL_INCOME	CREDIT_SCORE	DEBT_RATIO	YEARS_EMPLOYED	DEFAULTED	LOAN_AMOUNT
0	45	620	0.42	2	1	28
1	82	710	0.21	8	1	32
2	60	680	0.35	5	0	30
3	38	590	0.51	1	1	12
4	95	760	0.18	12	0	15
5	72	700	0.29	7	0	38
6	50	640	0.47	3	1	20
7	11	790	0.15	15	0	70
8	67	675	0.33	6	0	34
9	40	605	0.49	2	1	15