Disclaimer: This Jupyter Notebook contains content generated with the assistance of AI. While every effort has been made to review and validate the outputs, users should independently verify critical information before relying on it. The SELENE notebook repository is constantly evolving. We recommend downloading or pulling the latest version of this notebook from Github.
Decision Trees — CART (Classification and Regression Trees)¶
The Classification and Regression Trees (CART) algorithm was introduced by Breiman, Friedman, Olshen, and Stone in 1984, and it is one of the most influential methods for building decision trees in machine learning. Like most other decision tree learning algorithms, CART recursively partitions the feature space into regions that are as homogeneous as possible with respect to the target variable. What makes CART unique is its use of binary splits and impurity reduction as a unifying principle for both classification and regression. While some earlier algorithms, such as ID3, focused only on classification with categorical features, CART was designed from the start to handle both discrete and continuous variables, making it more versatile. Its greedy approach — evaluating all possible features and split points to choose the best one at each step — ensures that the tree can adapt to the structure of the data, even if the true relationship between variables is complex and nonlinear.
In practice, CART is highly relevant because it is not only a powerful standalone model but also the foundation of many modern ensemble methods, such as Random Forests and Gradient Boosted Trees. These ensemble methods have become state-of-the-art for tabular data tasks across industries, from finance and healthcare to recommendation systems and fraud detection. Without CART, these widely used techniques would not exist in their current form. Its influence extends beyond tree-based ensembles, since the ideas of recursive partitioning and greedy search for splits remain central in machine learning.
Learning about CART is important for two reasons. First, it provides a clear and intuitive understanding of how decision trees work, which makes it easier to interpret more complex algorithms that build on them. Decision trees are also among the most interpretable machine learning models, making them invaluable in contexts where transparency and explainability are required. Second, studying CART helps practitioners appreciate the trade-offs between interpretability, computational efficiency, and predictive power. This understanding is essential when deciding whether to use a simple tree, an ensemble, or alternative models for a given problem. So let's dive in and see how the algorithm works.
Setting up the Notebook¶
Make Required Imports¶
This notebook requires the import of different Python packages but also additional Python modules that are part of the repository. If a package is missing, use your preferred package manager (e.g., conda or pip) to install it. If the code cell below runs without any errors, all required packages and modules have been successfully imported.
from src.utils.libimports.cart import *
from src.utils.plotting.cart import *
from src.utils.data.files import *
from src.models.trees.cart import *
Download Required Data¶
Some code examples in this notebook use data that first needs to be downloaded by running the code cell below. If this code cell throws an error, please check in the configuration file config.yaml whether the URL for downloading datasets is up to date and matches the one on GitHub. If not, simply download or pull the latest version from GitHub.
bank_data, _ = download_dataset("tabular/classification/example-credit-default-data.csv")
cart_data, _ = download_dataset("tabular/classification/example-data-cart-overfitting-underfitting.csv")
File 'data/datasets/tabular/classification/example-credit-default-data.csv' already exists (use 'overwrite=True' to overwrite it).
File 'data/datasets/tabular/classification/example-data-cart-overfitting-underfitting.csv' already exists (use 'overwrite=True' to overwrite it).
Quick Recap: Decision Trees¶
Let's consider the same toy example scenario we used in the notebooks introducing Decision Trees. In this scenario, a bank has a dataset of customers' information and whether they defaulted on their credit or not. Using this data, the bank wants to train some classification model to predict if future customers will likely default on their credit — and therefore potentially reject a credit application. Each customer is described by four attributes: Age, Education (Bachelor, Masters, or PhD), Marital_Status (Single or Married), and Annual_Income. The attribute Credit_Default (Yes or No) indicates if the customer defaulted on their credit or not. To keep it simple but also work with a complete dataset, this toy dataset contains only 9 customers. Let's load the data from the file into a pandas DataFrame and have a look:
df_bank = pd.read_csv(bank_data)
df_bank.head(10)
| | ID | Age | Education | Marital_Status | Annual_Income | Credit_Default |
|---|---|---|---|---|---|---|
| 0 | 1 | 23 | Masters | Single | 75000 | NO |
| 1 | 2 | 35 | Bachelor | Married | 50000 | YES |
| 2 | 3 | 26 | Masters | Single | 70000 | NO |
| 3 | 4 | 41 | PhD | Single | 95000 | NO |
| 4 | 5 | 18 | Bachelor | Single | 40000 | YES |
| 5 | 6 | 55 | Masters | Married | 85000 | YES |
| 6 | 7 | 30 | Bachelor | Single | 60000 | YES |
| 7 | 8 | 35 | PhD | Married | 60000 | NO |
| 8 | 9 | 28 | PhD | Married | 60000 | NO |
Let's also extract each feature as an array of values since we will be using those later.
ages = df_bank.Age.to_numpy()
educations = df_bank.Education.to_numpy()
incomes = df_bank.Annual_Income.to_numpy()
mstatuses = df_bank.Marital_Status.to_numpy()
targets = df_bank.Credit_Default.to_numpy()
The basic intuition behind Decision Trees is that they split data into increasingly homogeneous groups by asking a series of simple questions. At each step, the algorithm chooses a feature and a threshold (for numeric data) or a grouping (for categorical data) that best separates the data according to some impurity measure — depending on the task (classification or regression) and the chosen Decision Tree learning algorithm (here: CART). Each split creates a branch in the tree, leading to subsets of the data that are more uniform with respect to the target variable. This process continues recursively until the subsets are sufficiently "pure" or other stopping conditions are met. The result is a tree-like structure where following the branches from the root to a leaf corresponds to applying a sequence of if-then rules.
The figure below shows a possible Decision Tree for our toy dataset. The internal nodes (blue boxes) contain the condition that is checked to choose the branch to follow. For example, the root node asks the Yes/No question "Is the marital status 'single'?". If the answer is, say, "Yes/True", we follow the left branch, checking the condition of the child node and so on. The leaf nodes (green boxes) represent the end points of decision paths, where no further splitting occurs. A leaf node contains the final output or prediction for the data that reaches it. For classification tasks, a leaf node typically holds the most common class label, and for regression tasks, it stores the average or mean value of the target variable.
To see how this Decision Tree matches our toy dataset, let's go through an example using the last data sample (28, "PhD", "Married", 60000). According to the tree, we first need to check if the marital status is "Single". Since this is false, we follow the right branch and now need to check if the education is "PhD". This is true, so we follow the left branch, reaching a leaf node with the target label "NO". This indeed matches the value of the target variable in the dataset. You can try the same process for all 9 samples in our toy dataset. It will work for all of them.
We also saw in the introductory notebook about Decision Trees, that there is generally not just one tree that matches a training dataset. For example, the Decision Tree in the figure below is equivalent to the previous one in the sense that it also perfectly matches the toy dataset — again, if in doubt, you can convince yourself by checking all 9 data samples again.
The important observation here is that different Decision Trees for the same dataset may also "behave" differently. On the one hand, the second tree is arguably smaller and will therefore make faster predictions (i.e., better efficiency). More importantly, both trees may yield different predictions for the same unseen data sample. For example, consider the data sample (50, "PhD", "Single", 70000). Using the first Decision Tree and following the branch from the root to the leaf will return a "NO". The second Decision Tree, in contrast, will return a "YES". Again, you are encouraged to check for yourself by tracing the corresponding branches in both trees.
Strictly speaking, the "optimal" tree is the one that performs best on an arbitrarily large unseen dataset. However, finding the optimal Decision Tree is NP-hard because the search space of all possible trees grows exponentially with the number of features and samples. At each node, the algorithm must decide both which feature to split on and where to place the split threshold. Since these choices compound recursively at every node, the number of possible tree structures quickly becomes astronomically large. Determining the optimal tree (e.g., the smallest tree with minimum misclassification error) would require exhaustively searching through this combinatorial space. It has been shown that constructing a minimum-size decision tree consistent with training data is equivalent to solving certain known NP-complete problems. This means there is no known polynomial-time algorithm to guarantee finding the globally optimal tree — well, unless P = NP. Practical algorithms therefore use greedy heuristics, splitting on the locally best feature at each node, which produces good — but not provably optimal — trees within reasonable computation time. Different learning algorithms may therefore yield different Decision Trees, and in this notebook, we look at the very popular CART algorithm.
If anything is unclear up to this point, we recommend having another look at the introductory notebook about Decision Trees.
Original CART Algorithm¶
In this notebook, we distinguish between the CART algorithm as proposed by Breiman, Friedman, Olshen, and Stone in 1984, and practical implementations of the algorithm. The original algorithm provides a mathematically elegant and conceptually simple framework for learning decision trees. The original description focuses on the core principles: recursive binary splitting based on impurity measures, full tree growth followed by cost-complexity pruning, and well-defined statistical foundations. It assumes idealized conditions, such as complete data and exhaustive evaluation of all possible splits, in order to present a clear and rigorous formulation of the method.
In practice, however, datasets are messy, high-dimensional, and often incomplete, which makes directly applying the theoretical algorithm inefficient or impractical. Real-world implementations introduce adaptations — such as handling missing values, adding early stopping rules, and optimizing split searches — that deviate from the pristine, textbook description but make the method scalable and robust. Thus, the original version of the algorithm serves as the foundational blueprint, while practical versions are engineered extensions designed to meet the complexities of applied machine learning. We will cover such practical applications in later sections.
In the following, let's first focus on the original CART algorithm. The core part of CART is to find the best split for a current internal node, starting at the root node — this step is shared by other Decision Tree learning algorithms as well, and it is where the algorithms mostly differ. We can divide the task of finding the best split into two subtasks: (1) enumerating all possible splits we want to consider, and (2) scoring all possible splits to find the best one. Again, different learning algorithms implement both steps differently, so let's see how CART does them and provide a simple implementation of the core parts from scratch.
Enumerating all Possible Splits¶
One of the key characteristics of the CART algorithm is that it considers only binary splits. This makes trees simpler, more interpretable, and computationally efficient. Each decision is reduced to a Yes/No question — e.g., "feature $\leq$ threshold?" for ordinal/numerical values or "feature $\in$ category?" for nominal values — which is easy to understand and visualize. Binary splits also streamline the optimization process, since the algorithm only needs to search for one threshold at a time, rather than evaluating all possible multi-way partitions. Despite this restriction, binary splits do not reduce the expressive power of the model, since any multi-way split can be represented as a sequence of binary ones. This approach provides a consistent framework for handling both continuous and categorical features while ensuring that the resulting trees remain flexible and capable of capturing complex decision boundaries.
When enumerating all possible splits, we need to consider the type of each feature. Since "feature $\leq$ threshold?" questions assume some meaningful partial ordering, we consider ordinal, interval, and ratio features together as non-nominal features. In contrast, nominal feature values do not allow for meaningful "less than or equal" comparisons. Thus, we need to handle nominal features separately to reflect "feature $\in$ category?" questions. Let's start with non-nominal features.
Non-Nominal Features¶
The CART algorithm (and many other Decision Tree algorithms) does not require full arithmetic comparison but solely relies on comparing feature values. In the case of non-nominal features (ordinal, interval, ratio), this comes down to "feature $\leq$ threshold?" comparisons. Note that we can always convert ordinal string values into ordinal integer values and preserve the partial order during this conversion. Again, since CART only relies on comparisons, the exact integer values do not matter (as long as the partial order is preserved).
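For example, one possible order-preserving conversion of the feature "Education" could look like the small sketch below (the concrete integer values and the variable name educations_ordinal are an arbitrary choice for illustration; only the ordering matters):

# Map each education level to an integer that preserves the natural ranking
education_order = {"Bachelor": 0, "Masters": 1, "PhD": 2}
educations_ordinal = np.array([education_order[e] for e in educations])
print(f"Ordinal encoding of 'Education': {educations_ordinal}")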
The candidate thresholds for splitting a non-nominal feature are determined directly from the training data. Specifically, the feature values are first sorted, and the algorithm considers potential thresholds only at the midpoints between consecutive distinct values. For example, if the unique and sorted values of a feature are $[x_1, x_2, \dots, x_n]$, with $x_i < x_{i+1}$, then each candidate threshold $t_i$ is chosen as

$$t_i = \frac{x_i + x_{i+1}}{2}, \quad i = 1, \dots, n-1$$
This ensures that no training sample lies exactly on the threshold while still covering all distinct ways the data can be split. As an example, the figure below shows all unique values of the feature "Age" (black dots) plotted onto a coordinate line, with the midpoints as the candidate thresholds added as blue dashed lines (incl. the actual threshold value). Note that the value $35$ appears twice, resulting in 8 unique values and therefore 7 midpoints/thresholds.
Let's actually implement the computation of all candidate thresholds in Python. The method compute_thresholds() in the code cell below takes in a list of numerical values for a single feature — recall that we assume that ordinal string values have been converted into numerical values. First, we use the NumPy method np.unique() to identify all unique values (in case of duplicates). Conveniently, this method returns all unique values as a sorted array. The last line of the method implements the formula above, but for all midpoints in parallel. This avoids any loop and generally performs much faster.
def compute_thresholds(feature_values):
# Get unique values to handle duplicates; return values will already be sorted
values_sorted = np.unique(feature_values)
    # Compute and return all midpoints between consecutive values
return (values_sorted[:-1] + values_sorted[1:]) / 2.0
Let's run this method for the two numerical features "Age" and "Annual_Income"; the results for "Age" naturally reflect the figure above.
print(f"Candidate thresholds for feature 'Age': {compute_thresholds(ages)}")
print(f"Candidate thresholds for feature 'Annual_Income': {compute_thresholds(incomes)}")
Candidate thresholds for feature 'Age': [20.5 24.5 27. 29. 32.5 38. 48. ]
Candidate thresholds for feature 'Annual_Income': [45000. 55000. 65000. 72500. 80000. 90000.]
Now that we can compute all candidate thresholds, we only need another method to compute the actual split for a given threshold. This is very easy to do since all we need to do is compare all feature values with the given threshold to see if a data sample should follow the left or the right branch. To this end, the method generate_split_general() in the code cell below uses the method np.where() to find all the indices in the array of feature values which are less than or equal to, respectively greater than, the threshold. Note that each binary split is represented by these two lists of indices. This makes it very easy to later extract the corresponding data samples.
def generate_split_general(feature_values, threshold):
indices_left = np.where(feature_values <= threshold)[0]
indices_right = np.where(feature_values > threshold)[0]
return indices_left, indices_right
Below we show an example for using this method to compute the split with respect to feature "Annual_Income" and a given threshold. Change the variable income_threshold and see how it affects the split. For example, if you pick a value below the minimum annual income or above the maximum annual income, one of the two arrays of the split will be empty. Of course, in practice, we would only use the midpoints as possible thresholds.
income_threshold = 65000
print(f"Split of feature 'Annual_Income' using {income_threshold} as threshold': {generate_split_general(incomes, income_threshold)}")
Split of feature 'Annual_Income' using 65000 as threshold': (array([1, 4, 6, 7, 8]), array([0, 2, 3, 5]))
In short, enumerating all possible splits for non-nominal features is quite straightforward, as we only rely on "feature $\leq$ threshold?" comparisons, and computing all candidate thresholds is also easy to implement. Of course, you can already see how this can cause some problems in practice. If we have a large dataset with a large number of unique feature values, we also have a large number of candidate thresholds and therefore possible splits. More specifically, if a feature has $k$ unique values, we get $(k\!-\!1)$ candidate thresholds and splits — representing all possible midpoints. Practical implementations of the CART algorithm typically use additional strategies to improve efficiency, as we will discuss later.
Nominal Features¶
Nominal values only allow checking if two values are the same or not; there is no partial order defined over the values. As such, comparisons of the form "feature $\leq$ threshold?" are not applicable to nominal values. Instead, we need to define a split based on partitions, where a partition consists of two disjoint subsets of feature values — recall, we only perform binary splits — whose union is the set of all unique values of a feature. For example, let's consider "Education" as a nominal feature — even though it is arguably an ordinal feature. Across the 9 data samples in our toy dataset, we have 3 unique values: "Bachelor", "Masters", and "PhD". With 3 unique values, we can generate 3 meaningful partitions, resulting in the following 3 possible splits:
Note that here two partitions $\{A, B\}$ and $\{B, A\}$ are equivalent since it makes no difference whether a subset indicates the left or the right branch of a split. For example, in the previous figure, we have the split $(\{Bachelor, Masters\}, \{PhD\})$, and therefore do not need to also consider the split $(\{PhD\}, \{Bachelor, Masters\})$. Assuming again that $k$ denotes the number of unique values of a nominal feature, the number $p$ of possible partitions can then be calculated as:

$$p = \frac{2^k - 2}{2} = 2^{k-1} - 1$$
Explanation: In principle, without loss of generality, the left subset of the partition can be derived from the powerset (i.e., the set of all subsets) of the set of unique feature values. However, we need to ignore the empty set and the complete set, hence the $-2$. And given that two partitions $\{A, B\}$ and $\{B, A\}$ are equivalent for our purposes, we only need one half of the remaining subsets, hence the factor $1/2$.
This intuition also helps us to implement the task of finding all possible (and meaningful) partitions. At its core, the method compute_partitions() below takes all feature values as input, computes the set of unique feature values, and generates all meaningful subsets as the left subset of a partition. As we only have binary partitions, the right subset can be derived by computing the set difference between the whole set of unique values and the left subset. To avoid equivalent partitions, the method uses the variable seen_subsets to keep track of subsets already considered for a partition.
def compute_partitions(feature_values):
partitions = []
# Compute the set of all unique feature values
values = set(feature_values)
# Compute all subsets of features values (ignoring empty set and the full set)
subsets = chain.from_iterable(combinations(values, r) for r in range(1, len(values)))
# Initialize set of subset we have already seen
seen_subsets = set()
# Loop over all subsets and consider them as the "left branch condition"
for left in subsets:
# Convert subset from tuple to true set
left = set(left)
# Compute "right branch condition" using the set difference
right = values - left
# If we have already seen the right subset, we can stop
if frozenset(right) in seen_subsets:
break
# Add current left subset to set of seen subsets
seen_subsets.add(frozenset(left))
# Add current split to final result list
partitions.append((left, right))
# Return all possible splits
return partitions
If we run the method compute_partitions() over our feature values for "Education", we get the same 3 partitions we illustrated above — a partition might be flipped compared to the figure since two partitions $\{A, B\}$ and $\{B, A\}$ are equivalent. Of course, we can also run the method for "Marital_Status", but this feature has only 2 unique values, giving us only one possible split.
print(f"All partitions for education level:\n{compute_partitions(educations)}\n")
print(f"All partitions for marital statuses:\n{compute_partitions(mstatuses)}")
All partitions for education level:
[({'PhD'}, {'Bachelor', 'Masters'}), ({'Bachelor'}, {'PhD', 'Masters'}), ({'Masters'}, {'PhD', 'Bachelor'})]
All partitions for marital statuses:
[({'Single'}, {'Married'})]
As before for the non-nominal features, we also define a method that generates the actual split with respect to a feature and a given partition; see the method generate_split_nominal() in the code cell below. Notice how the lines of code that find the sample indices for the left and the right branch implement the "feature $\in$ category?" comparison, where the category is the left or the right subset of the partition.
def generate_split_nominal(feature_values, partition):
indices_left = [ idx for idx, val in enumerate(feature_values) if val in partition[0] ]
indices_right = [ idx for idx, val in enumerate(feature_values) if val in partition[1] ]
return indices_left, indices_right
The example code below applies this method to the values of feature "Education" for the partition $(\{Bachelor\}, \{Masters, PhD\})$. You can also try the equivalent partition $(\{Masters, PhD\}, \{Bachelor\})$. The resulting split will be the same, only with the two subsets flipped with respect to the resulting left and right branch of the split, which does not affect the learning algorithm.
partition = ({'Bachelor'}, {'Masters', 'PhD'})
#partition = ({'Masters', 'PhD'}, {'Bachelor'})
print(f"Split of feature 'Education' using {partition} as partition': {generate_split_nominal(educations, partition)}")
Split of feature 'Education' using ({'Bachelor'}, {'PhD', 'Masters'}) as partition': ([1, 4, 6], [0, 2, 3, 5, 7, 8])
One important consideration is that the number of possible splits grows exponentially, i.e., it is in $O(2^k)$. For example, if a nominal attribute has only 10 unique values, we already have 511 possible splits based on the formula above. This results in a search space that quickly becomes too large for practical purposes, and once again we will have to see how available implementations include strategies to address this issue. The original CART algorithm will indeed consider all possible splits.
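To get a feeling for this growth, the short snippet below simply evaluates the formula $2^{k-1}-1$ for increasing $k$ and, for small $k$, double-checks the result against the explicit enumeration of compute_partitions() from above (this is just an illustrative sanity check):

# Number of possible binary splits of a nominal feature with k unique values
for k in range(2, 11):
    num_splits = 2**(k - 1) - 1
    # For small k, verify the formula against the explicit enumeration from above
    if k <= 5:
        assert len(compute_partitions(range(k))) == num_splits
    print(f"k = {k:2d}: {num_splits} possible splits")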
Scoring all Splits¶
As mentioned at the beginning, finding the optimal Decision Tree is NP-hard since the search space grows exponentially. Decision Tree learning algorithms such as CART therefore use a local heuristic to identify the best split at each internal node. Instead of exhaustively searching all possible trees, CART applies a greedy, node-by-node approach where the best split is chosen locally using suitable measures (discussed in a moment). This makes training feasible on real-world datasets while still producing trees that capture meaningful patterns. Although local heuristics do not guarantee the globally optimal tree, they strike a practical balance between efficiency and predictive performance.
To get an intuitive idea of what we (might) mean by the best split, consider our toy example for the banking use case, where we are now trying to find the first split at the root of our Decision Tree. Recall that we check all features, and for each feature, all possible thresholds or partitions. Out of all these possible splits, the figure below shows two of them. The left split checks if the value for "Marital_Status" is "Single", while the right split checks if the value for "Education" is "Bachelor". For both splits, the figure also shows the number of target values ("YES" or "NO") for the data samples that follow the left or right branch.
Assuming, we would only decide between these two alternatives, which split would you prefer? Think a moment before you continue.
Intuitively, the right split seems to be the preferred one. The argument here is that the right split results in a partition of the dataset where the target values within each branch are much less mixed up. Indeed, the left branch of the right split is associated with 3 samples, and all of them are of the same class ("NO"). But also the right branch is not very mixed up since out of the 6 data samples only one of them is of class "NO", and the 5 other samples are of class "YES". In contrast, the left split yields a partition where the class distribution is much more mixed up for both branches.
Decision Tree learning algorithms such as CART therefore introduce the notion of impurity to provide a way to measure how mixed or uncertain the class labels are within a node. A node is considered "pure" if all its samples belong to the same class, while it is "impure" if it contains a diverse mixture of classes. By minimizing impurity, CART ensures that each split creates child nodes that are more homogeneous in terms of their class distribution. This concept is essential because the ultimate goal of a Decision Tree is to partition the feature space into regions where predictions can be made with high confidence. Without an impurity measure, there would be no systematic way to judge whether one split produces “better” child nodes than another. Thus, impurity functions serve as the guiding heuristic that drives the greedy search for splits, enabling the tree to progressively reduce uncertainty and approximate a decision boundary in the data.
We now need to formalize the notion of impurity to implement it as part of the learning algorithm. The implementation will naturally depend on whether the target variable is categorical (for classification tasks) or numerical (for regression tasks). We cover both alternatives in the following subsections.
Classification Tasks¶
The previous figure showing the two alternative splits is an example of a classification task, more specifically, a binary classification task with the two classes "YES" and "NO". Here, we say that a node has a high impurity if the classes of the data samples in this node are very mixed. In contrast, a node has a low impurity if one class is clearly dominating the distribution of classes for this node — or only one class is actually present.
There are different measures to formalize this notion of impurity in case of class labels. One of the most commonly used measures is the Gini score (or Gini impurity) because it is both computationally efficient and effective in practice. The Gini score evaluates how well a split separates the data into homogeneous groups and is defined for a node $t$ as:

$$Gini(t) = 1 - \sum_{c \in C} P(c|t)^2$$
where $C$ is the set of all classes, and $P(c|t)$ is the relative frequency of class $c$ in node $t$. For example, if a node $t$ has a class distribution of [1 "NO", 5 "YES"] (see figure above), the relative frequencies would be $P(\text{"NO"}|t) = 1/6$ and $P(\text{"YES"}|t) = 5/6$. This would give us a Gini score for node $t$ of:

$$Gini(t) = 1 - \left( \left(\tfrac{1}{6}\right)^2 + \left(\tfrac{5}{6}\right)^2 \right) = 1 - \tfrac{26}{36} = \tfrac{10}{36} \approx 0.278$$
Using NumPy, we can very easily implement the computation of the Gini score for a node; see the method compute_gini_score_node() in the code cell below. It utilizes the NumPy method np.unique(), which returns the sorted unique elements of an array, effectively removing duplicates. By default, it outputs only the unique values, but it can also provide additional information depending on optional parameters. The parameter return_counts=True makes the function return a second array that contains the count of each unique element in the input, showing how many times each value occurs. This is, in fact, the only information that we need to compute all $P(c|t)$, besides the total number of elements. To compute the sum of squared relative frequencies, we also use NumPy methods to make our life very easy here.
def compute_gini_score_node(t):
    # Count the number of occurrences of each output class in the node
_, counts = np.unique(t, return_counts=True)
# Calculate and return the Gini score
return 1 - np.sum(np.square(counts/len(t)))
To see some examples, we can compute the Gini score for various distributions of 2 classes within a node. In the code cell below, you can set the total number of samples in the node by modifying the value of the variable num_samples. The loop then generates all possible distributions in terms of the number of "YES" and "NO" values. For each distribution, we use the method compute_gini_score_node() to compute the corresponding Gini score.
num_samples = 6
for num_no in range (0, num_samples+1):
distribution = ["NO"]*num_no + ["YES"]*(num_samples-num_no)
print(f"Gini({distribution}) = {compute_gini_score_node(distribution):.3f}")
Gini(['YES', 'YES', 'YES', 'YES', 'YES', 'YES']) = 0.000
Gini(['NO', 'YES', 'YES', 'YES', 'YES', 'YES']) = 0.278
Gini(['NO', 'NO', 'YES', 'YES', 'YES', 'YES']) = 0.444
Gini(['NO', 'NO', 'NO', 'YES', 'YES', 'YES']) = 0.500
Gini(['NO', 'NO', 'NO', 'NO', 'YES', 'YES']) = 0.444
Gini(['NO', 'NO', 'NO', 'NO', 'NO', 'YES']) = 0.278
Gini(['NO', 'NO', 'NO', 'NO', 'NO', 'NO']) = 0.000
Alternatively, we can also tweak the code to plot the Gini scores for the distributions as a graph. This makes it more convenient to try larger values for num_samples and inspect the result. With num_samples=20, we already get a pretty smooth curve, making it very clear which kind of distribution yields which kind of Gini score.
num_samples, gini_scores = 20, []
for num_no in range (0, num_samples+1):
gini_scores.append(compute_gini_score_node(["NO"]*num_no + ["YES"]*(num_samples-num_no)))
plt.figure()
plt.plot(range(0, len(gini_scores)), gini_scores, marker='o', ls='-', lw=3)
plt.xlabel("Distribution", fontsize=14)
plt.ylabel("Gini Score", fontsize=14)
plt.show()
From the previous results, it is very clear that the lower the Gini score the better. In fact, if the node is completely pure — that is, the node only contains data samples from the same class — the Gini score is $0$. The maximum Gini score occurs when the classes are distributed as evenly as possible at a node. For a binary classification, this maximum is $0.5$, which happens when both classes are equally likely (as can be seen in the figure above). For more than two classes, the maximum impurity increases, reaching its theoretical upper bound at

$$Gini_{max} = 1 - \frac{1}{|C|}$$
where $|C|$ is the number of classes, achieved when all classes have equal probability ($1/|C|$ each). Thus, the maximum Gini score does depend on the number of classes, and it approaches 1 as the number of classes grows. However, for the learning algorithm we only care about the relative score to find the smallest one(s).
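As a quick (illustrative) check of this bound, we can feed perfectly balanced nodes for 2 to 5 classes into compute_gini_score_node() from above and compare the result against $1 - 1/|C|$; the class labels below are made up for this purpose:

# Gini score of a perfectly balanced node compared to the bound 1 - 1/|C|
for num_classes in range(2, 6):
    balanced_node = [f"CLASS_{c}" for c in range(num_classes) for _ in range(10)]
    print(f"|C| = {num_classes}: Gini = {compute_gini_score_node(balanced_node):.3f}, bound = {1 - 1/num_classes:.3f}")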
So far, we can compute the Gini score for a node. But what we are really interested in is the Gini score of a split. After all, we want to find the split with the lowest Gini score. The Gini score of a binary split is computed by first calculating the scores of both child nodes $t_{\text{\textit{left}}}$ and $t_{\text{\textit{right}}}$. For the split, the overall Gini score is the weighted average of the scores of $t_{\text{\textit{left}}}$ and $t_{\text{\textit{right}}}$, with weights given by the fraction of samples in each node. This weighting ensures that larger child nodes contribute more to the split's impurity than smaller ones, reflecting the true distribution of the data. The reason for this computation is that decision trees aim to minimize impurity across the dataset, so evaluating splits with respect to class proportions and sample sizes gives a fair measure of how well a split separates the classes. More formally, the Gini score for a binary split is defined as:

$$Gini_{\text{\textit{split}}} = \frac{n_{\text{\textit{left}}}}{n} \cdot Gini(t_{\text{\textit{left}}}) + \frac{n_{\text{\textit{right}}}}{n} \cdot Gini(t_{\text{\textit{right}}})$$
where $n_{\text{\textit{left}}}$ is the number of data samples in the left child node $t_{\text{\textit{left}}}$, and $n_{\text{\textit{right}}}$ is the number of data samples in the right child node $t_{\text{\textit{right}}}$; $n$ is the combined number of data samples, i.e., $n = n_{\text{\textit{left}}} + n_{\text{\textit{right}}}$. Of course, this definition can trivially be generalized beyond binary splits. To show an example, let's compute the Gini scores for the two alternative splits from before, starting with the one where we check if the marital status of a customer is "Single" or not.
Computing the Gini score simply boils down to plugging the respective numbers into the formula. For example, with a total of $n=9$ samples, and $5$ samples in the left and $4$ samples in the right node, the corresponding weights are $5/9$ and $4/9$. Similarly, all relative frequencies $P(c|t)$ stem directly from the number of occurrences of each class label in both nodes:

$$Gini_{\text{\textit{split}}} = \frac{5}{9}\left(1 - \left(\tfrac{3}{5}\right)^2 - \left(\tfrac{2}{5}\right)^2\right) + \frac{4}{9}\left(1 - \left(\tfrac{2}{4}\right)^2 - \left(\tfrac{2}{4}\right)^2\right) = \frac{5}{9} \cdot 0.48 + \frac{4}{9} \cdot 0.5 \approx 0.49$$
Knowing that the maximum Gini score in case of binary classification is $0.5$, a score of around $0.49$ clearly indicates a pretty bad split. In this simple example it is also obvious why we would expect a bad score: The distribution in the right node is with 2x "NO" and 2x "YES" completely balanced (i.e., the worst case), but also the left node is still very impure, showing a distribution of class labels that is far from homogeneous. Let's do the same computation for the second split, where the condition checks if the value for "Education" is "Bachelor" or not.
Of course, the procedure is exactly the same, giving us the following formula and result:

$$Gini_{\text{\textit{split}}} = \frac{3}{9}\left(1 - 1^2\right) + \frac{6}{9}\left(1 - \left(\tfrac{1}{6}\right)^2 - \left(\tfrac{5}{6}\right)^2\right) = \frac{3}{9} \cdot 0 + \frac{6}{9} \cdot 0.278 \approx 0.19$$
The Gini score of this split is now significantly lower, overall confirming our intuition that the second split is the preferred one. Again, we can directly implement the formula of the Gini score; see the method compute_gini_score_split() in the code cell below. The input of the method is two lists or arrays containing the class labels of the data samples reaching the left and right child node, respectively. Naturally, this method uses the method compute_gini_score_node() to compute the Gini score of the left and right child nodes, and then computes the weighted sum of both with respect to the number of samples in each child node.
def compute_gini_score_split(t_left, t_right):
# Calculate the Gini score for the left and right node
gini_score_left = compute_gini_score_node(t_left)
gini_score_right = compute_gini_score_node(t_right)
# Calculate and return the weighted average Gini score
return len(t_left)/(len(t_left)+len(t_right))*gini_score_left \
+ len(t_right)/(len(t_left)+len(t_right))*gini_score_right
In the code cell below, we use the method compute_gini_score_split() to compute once more the Gini score for two example splits as before.
gini_mstatus = compute_gini_score_split(["NO", "NO", "NO", "YES", "YES"], ["NO", "NO", "YES", "YES"])
gini_education = compute_gini_score_split(["NO", "NO", "NO"], ["NO", "YES", "YES", "YES", "YES", "YES"])
print(f"Gini score of split s_mstatus: {gini_mstatus:.2f}")
print(f"Gini score of split s_education: {gini_education:.2f}")
Gini score of split s_mstatus: 0.49
Gini score of split s_education: 0.19
Of course, the numbers match the results we got from our "manual" calculation of the Gini scores.
We now have all the methods in place to enumerate all possible splits for a given feature, and to measure the impurity of a single split for classification tasks. Naturally, the Gini score as a measure of impurity assumes a distribution of nominal class labels as the input. We therefore still need another measure of impurity for distributions of numerical target values, i.e., for regression tasks. So let's look at this next.
Regression Tasks¶
In a regression task, we deal with numerical target values. This makes the notion of the impurity of a node and a split very intuitive: the more similar the target values within a node, the lower the impurity. The similarity between the target values is commonly measured using the Residual Sum of Squares (RSS), which computes the sum of squared distances between the target values of all samples in a node and their mean. The RSS of a split $RSS_{split}$ is defined as:

$$RSS_{\text{\textit{split}}} = \sum_{k=1}^{K} \sum_{i \in R_k} \left( y_i - \mu_{R_k} \right)^2$$
where $K$ is the number of child nodes (for CART: $K=2$), $R_k$ is the set of samples in child node $k$, $y_i$ is the target value of sample $i$, and $\mu_{R_k}$ is the mean of the target values of the data samples in $R_k$. For example, let's again consider our small banking dataset. However, we now assume that we want to use the features "Age", "Education", and "Marital_Status" to predict "Annual_Income" as the target, thus making it a regression task. In short, the dataset now looks as follows:
df_bank.drop(["ID", "Credit_Default"], axis=1).head(10)
| | Age | Education | Marital_Status | Annual_Income |
|---|---|---|---|---|
| 0 | 23 | Masters | Single | 75000 |
| 1 | 35 | Bachelor | Married | 50000 |
| 2 | 26 | Masters | Single | 70000 |
| 3 | 41 | PhD | Single | 95000 |
| 4 | 18 | Bachelor | Single | 40000 |
| 5 | 55 | Masters | Married | 85000 |
| 6 | 30 | Bachelor | Single | 60000 |
| 7 | 35 | PhD | Married | 60000 |
| 8 | 28 | PhD | Married | 60000 |
To show an example of computing the RSS score for different splits, we consider the same two alternative splits for the root node that we used before (see above). The only difference now is, of course, that we have to consider the target values of the data samples in the left and right child nodes of each split. The figure illustrates this.
Let's start with the left split, checking if the marital status of a customer is "Single" or not. According to the formula for the RSS score, we first need to compute the means of both child nodes, which we can compute as shown below — note that in the following we ignore the 000 of the salaries to ease presentation:
With both means available, we can now compute the RSS score for the left split by simply plugging the numbers into the given formula:
Of course, to get the score for the split on the right, checking if the education level is "Bachelor" or not, we perform the exact same steps, starting with computing the two means:
Now we can once more compute the RSS score of that split using the given formula:
Since the RSS score of the split on the right side checking the education level is lower, this split would be preferred as it results in child nodes that are less impure (compared to the child nodes resulting from the split checking the marital status of bank customers). This means we now also have a way to score and find the best split in case of numerical targets for regression tasks, which is arguably more straightforward than for classification tasks.
Like for Gini scores, we can also implement a simple method that computes the RSS score for a split given the target values from both child nodes. Since we look at the CART algorithm, we limit ourselves to binary splits, but the implementation can easily be extended to splits with more than two branches. The method compute_rss_score_split() first computes the means for both arrays representing the target values of the left and right child node; it then computes the RSS scores for both child nodes before adding them together and returning the final score. By using basic methods provided by the NumPy library, all steps are very easy and very efficient to implement.
def compute_rss_score_split(t_left, t_right):
# Compute the mean of both child nodes
mean_left = np.mean(t_left)
mean_right = np.mean(t_right)
    # Calculate the RSS score for the left and right node
rss_score_left = np.sum(np.square(t_left - mean_left))
rss_score_right = np.sum(np.square(t_right - mean_right))
# Calculate and return the final RSS score
return rss_score_left + rss_score_right
Let's apply the method on two inputs representing the two alternative splits from before.
rss_mstatus = compute_rss_score_split([75, 70, 95, 40, 60], [50, 85, 60, 65])
rss_education = compute_rss_score_split([50, 40, 60], [75, 70, 95, 85, 60, 65])
print(f"RSS score of split s_mstatus: {rss_mstatus:.2f}")
print(f"RSS score of split s_education: {rss_education:.2f}")
RSS score of split s_mstatus: 2280.00
RSS score of split s_education: 1050.00
Unsurprisingly, we get the same RSS scores as we got from computing "by hand".
Putting it all Together¶
Enumerating all possible splits and appropriately scoring them with respect to the level of impurity are the core steps of the CART algorithm — and of most other Decision Tree learning algorithms. Finding the best split for a current internal node essentially only means putting the relevant methods together. To show this, let's convert our toy dataset into a proper training dataset containing a feature matrix X and target vector y. This also includes removing the column "ID" as it arguably does not represent a meaningful feature to be considered for the training.
X = df_bank.drop(["ID", "Credit_Default"], axis=1).to_numpy()
y = df_bank["Credit_Default"].to_numpy()
print(f"Feature matrix X:\n{X}\n")
print(f"Target vector y: {y}")
Feature matrix X:
[[23 'Masters' 'Single' 75000]
 [35 'Bachelor' 'Married' 50000]
 [26 'Masters' 'Single' 70000]
 [41 'PhD' 'Single' 95000]
 [18 'Bachelor' 'Single' 40000]
 [55 'Masters' 'Married' 85000]
 [30 'Bachelor' 'Single' 60000]
 [35 'PhD' 'Married' 60000]
 [28 'PhD' 'Married' 60000]]

Target vector y: ['NO' 'YES' 'NO' 'NO' 'YES' 'YES' 'YES' 'NO' 'NO']
To find the overall best split, let's first define a method that finds the best split with respect to a single feature. The method find_best_feature_split() below takes as input a list of feature values, the list of target values, as well as the information about the type of the feature, mainly whether it is "nominal" or not. The method needs this information to determine whether it has to compute all thresholds (for non-nominal features) or all partitions (for nominal features). The measure of impurity (Gini or RSS) depends on the type of the target variable; to keep things simple, the implementation below assumes a classification task and always uses the Gini score. Beyond that, the method then simply iterates through all possible thresholds or partitions — collectively referred to as criterions in the code — and computes and scores each resulting split. During this loop, the method keeps track of all the core information describing the current best split, and returns this information at the end.
def find_best_feature_split(x, y, feature_type):
## Initialize the return values
best_score, best_criterion, best_split = np.inf, None, None
# Create splits depending on the feature being nominal or nonnominal
if feature_type == "nominal":
criterions = compute_partitions(x)
generate_split = generate_split_nominal
else:
criterions = compute_thresholds(x)
generate_split = generate_split_general
# Check all thresholds/partitions to find the one yielding the lowest Gini score
for criterion in criterions:
# Generate the split for the current threshold/partition
split = generate_split(x, criterion)
# Split the target values w.r.t. indices
y_left, y_right = y[split[0]], y[split[1]]
# Compute the Gini score for the current split
score = compute_gini_score_split(y_left, y_right)
# Keep track of the key information of the split with the lowest Gini score
if score < best_score:
best_score, best_criterion, best_split = score, criterion, split
# Return key information of best split
return best_score, best_criterion, best_split
The code cell below allows you to execute the method find_best_feature_split() for each of the four input features. Notice, again, that we treat "Education" as a nominal feature here for convenience. In practice, we could treat "Education" as an ordinal (i.e., non-nominal) feature by converting each string value into an integer value such that we preserve the natural order/ranking between the values.
score, criterion, split = find_best_feature_split(ages, y, "ratio")
#score, criterion, split = find_best_feature_split(educations, y, "nominal") # Should return the lowest score
#score, criterion, split = find_best_feature_split(mstatuses, y, "nominal")
#score, criterion, split = find_best_feature_split(incomes, y, "ratio")
print(f"Gini score: {score:.2f}")
print(f"Criterion: {criterion}")
print(f"Split: {split}")
Gini score: 0.42
Criterion: 20.5
Split: (array([4]), array([0, 1, 2, 3, 5, 6, 7, 8]))
Finding the best split across all features works in a similar way, only that we now (a) iterate over each individual feature, (b) use the method find_best_feature_split() to find the best split regarding the current feature, and (c) keep track of the feature that yields the lowest Gini score. The method find_best_split() in the code cell below implements these basic steps. Of course, this method now needs the types of all features as additional input.
def find_best_split(X, y, feature_types):
# Initialize the return values
    best_score, best_column, best_threshold, best_split = np.inf, None, None, None
# Check for each feature (i.e., each column in X), which split has the best (lowest) score
for column in range(X.shape[1]):
# Extract feature values from datasets
x = X[:,column]
# Calculate the best split for the current column/feature
score, threshold, split = find_best_feature_split(x, y, feature_types[column])
# Keep track of the key information of the split with the lowest Gini score
if score <= best_score:
best_score, best_column, best_split, best_threshold = score, column, split, threshold
# Return the best split together with the relevant information
return best_score, best_column, best_threshold, best_split
Applying this method to our training dataset represents finding the first best split at the root node. It turns out that we indeed should check first if the education of a bank customer is "Bachelor" or not.
score, column, criterion, split = find_best_split(X, y, ["ratio", "nominal", "nominal", "ratio"])
print(f"Gini score: {score:.2f}")
print(f"Column: {column} ({df_bank.columns[column+1]})")
print(f"Criterion: {criterion}")
print(f"Split: {split}")
Gini score: 0.19
Column: 1 (Education)
Criterion: ({'Bachelor'}, {'PhD', 'Masters'})
Split: ([1, 4, 6], [0, 2, 3, 5, 7, 8])
This, of course, gives us only the root split and not the full Decision Tree. After all, we already know that the right child node is still impure and therefore requires at least one more split. It is arguably easy to see how this process continues recursively: Both partitions $[1, 4, 6]$ and $[0, 2, 3, 5, 7, 8]$ identify the subsets of data samples that will form the new data matrix X and target vector y for the two new child nodes. However, these are rather implementation details and involve important design decisions regarding the learning algorithm. The important, but arguably obvious, detail is when this recursive process of splitting stops. We stop splitting a node if the node is 100% pure — that is, either if all target labels are of the same class, or if all target values are the same.
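To make the recursion a bit more concrete, the following sketch performs one more step by hand, reusing the methods and variables defined above (the intermediate names such as y_left, X_right, and split_right are introduced here purely for illustration):

# Split the training data according to the best root split found above
indices_left, indices_right = split
y_left, y_right = y[indices_left], y[indices_right]
X_right = X[indices_right]
# The left child node is already pure, so it becomes a leaf
print(f"Left child targets: {y_left} (Gini: {compute_gini_score_node(y_left):.2f})")
# The right child node is still impure, so we recursively search for its best split
print(f"Right child targets: {y_right} (Gini: {compute_gini_score_node(y_right):.2f})")
score, column, criterion, split_right = find_best_split(X_right, y_right, ["ratio", "nominal", "nominal", "ratio"])
print(f"Best split for right child: column {column}, criterion {criterion}, Gini score {score:.2f}")

This second split should recover the check on "Age" with threshold $48$ that also shows up in the full tree printed further below.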
We also skipped how we can use a Decision Tree to actually make predictions. But again, this just comes down to recursively checking internal nodes to see which branch to follow until we reach a leaf node, which is also easy to implement. If you are interested, we provide a complete implementation of the CART algorithm training full Decision Trees from scratch. Most of the methods in the class SeleneCART are the ones we introduced above. The method fit() to train the Decision Tree mimics the API from scikit-learn. Note that this implementation only supports classification tasks, although it could easily be extended to numerical targets as well. Let's use this implementation to train a full Decision Tree for our small banking dataset.
cart = SeleneCART(["ratio", "nominal", "nominal", "ratio"]).fit(X, y)
The class SeleneCART also provides a simple method to "print" the learned Decision Trees using text characters only.
print(cart)
X[1], criterion: ({'Bachelor'}, {'PhD', 'Masters'}), gini: 0.494, samples: ['NO' 'YES' 'NO' 'NO' 'YES' 'YES' 'YES' 'NO' 'NO']
--- Leaf, gini: 0.000, samples: ['YES' 'YES' 'YES']
--- X[0], criterion: 48.0, gini: 0.278, samples: ['NO' 'NO' 'NO' 'YES' 'NO' 'NO']
------ Leaf, gini: 0.000, samples: ['NO' 'NO' 'NO' 'NO' 'NO']
------ Leaf, gini: 0.000, samples: ['YES']
If you carefully check the output above, you will notice that the Decision Tree that we have just learned is one of the possible trees we visualized at the beginning as an example; see the figure below. The only minor difference is that the threshold we used for "Age" for the second split in the figure is $50$. However, since the implementation computes the midpoints between all unique feature values, the actual threshold is $48$.
Although we do not explicitly discuss the methods involved in making predictions, we can still show an example. Again, we align our implementation with the API of scikit-learn and name the method predict(). To make a prediction, we first have to create a new data sample. So let's just create the sample (50, "PhD", "Single", 70000) we have already used before to show that two different trees may yield different predictions for the same data sample.
x_test = [50, "PhD", "Single", 70000]
print(f"Prediction for {x_test}: {cart.predict([x_test]).tolist()}")
Prediction for [50, 'PhD', 'Single', 70000]: ['YES']
Summary: We have now covered all essential ingredients required to train a Decision Tree using the CART algorithm. During this section, you may have already noticed some challenges and other concerns. Most importantly, the number of possible thresholds or partitions can be very large, particularly for large datasets where we can expect a large number of unique values (nominal or non-nominal). While this search space does not grow exponentially with the number of data samples $n$, it potentially still requires checking many splits to find the best one. Also, right now we have to explicitly specify the type of each feature. While not completely impractical, common implementations of the CART algorithm avoid this step and also consider other issues such as missing values.
Tree Pruning¶
The main component of the original CART algorithm covered in this section builds a "full" Decision Tree. However, a fully grown decision tree is prone to overfitting because it keeps splitting the data until each leaf is as pure as possible, often down to single training examples. While this minimizes training error, it also means the tree is capturing not just the true underlying patterns but also the random noise and idiosyncrasies in the dataset. As a result, the tree becomes overly complex, with many branches that describe very specific conditions that may not hold in new, unseen data. This over-specialization reduces the model's ability to generalize. Instead of learning broad, robust rules, the tree essentially memorizes the training data structure. Consequently, although the full tree achieves low error on the training set, it typically performs poorly on test data because the patterns it encodes do not reflect the true distribution but rather artifacts of the sample. We will illustrate this problem later in more detail.
To counter this, the original CART algorithm uses cost-complexity pruning (also called weakest-link pruning) as a way to simplify an overly large decision tree while balancing accuracy and model complexity. The idea is to start with a large, fully grown tree that may overfit the training data, and then iteratively prune it back by removing subtrees that provide the least improvement in predictive performance relative to their complexity. For each candidate subtree, CART defines a cost-complexity measure that combines the misclassification error (or impurity) with a penalty proportional to the number of terminal nodes (leaves). Formally, this is expressed as $R_\alpha(T) = R(T) + \alpha |T|$, where $R(T)$ is the error of the tree $T$, $|T|$ is the number of leaves, and $\alpha \geq 0$ is a complexity parameter. Increasing $\alpha$ encourages smaller trees by penalizing complexity more heavily. The algorithm generates a nested sequence of trees by progressively pruning the weakest subtrees (those whose removal increases the cost-complexity criterion the least). Finally, cross-validation is used to select the $\alpha$ (and corresponding pruned tree) that best generalizes to unseen data.
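As a small numerical illustration of this criterion (the error rates and leaf counts below are made-up values, not derived from our toy dataset), we can see how increasing $\alpha$ shifts the preference from a large tree towards a smaller one:

# Cost-complexity criterion R_alpha(T) = R(T) + alpha * |T| for two hypothetical trees
candidate_trees = {"full tree": (0.05, 12), "pruned tree": (0.10, 4)}  # (error R(T), number of leaves |T|)
for alpha in [0.0, 0.01, 0.05]:
    costs = {name: round(error + alpha * num_leaves, 3) for name, (error, num_leaves) in candidate_trees.items()}
    preferred = min(costs, key=costs.get)
    print(f"alpha = {alpha:.2f}: costs = {costs}, preferred tree: {preferred}")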
The pruning strategy of the original CART algorithm is considered post-pruning since it first grows the full tree and then prunes subtrees. Most practical implementations of the CART algorithm utilize more efficient pre-pruning strategies. Such strategies use different measures to decide when a node should no longer be split, even if it is not pure. In short, pre-pruning avoids growing a full Decision Tree. We therefore omit a more detailed discussion of cost-complexity pruning (post-pruning) here but motivate and introduce the concept of pre-pruning in the next section.
Practical Implementations & Optimizations¶
The original CART algorithm is a clean and theoretically elegant framework for training decision trees. However, in practice, directly following the algorithm can be computationally expensive and sometimes impractical on modern large-scale datasets. For example, CART considers all possible splits for continuous features and even explores surrogate splits for handling missing values—steps that are feasible for small datasets but become prohibitively slow when working with millions of samples and high-dimensional data.
As a result, practical implementations often introduce approximations and optimizations to make decision tree training scalable. These include restricting the set of candidate thresholds (e.g., using quantiles or histograms), simplifying how categorical variables and missing values are handled, and using heuristics instead of exhaustive search. While these modifications deviate from the theoretical purity of CART, they allow decision trees—and tree-based ensembles like Random Forests and Gradient Boosted Trees—to remain efficient, robust, and highly effective on large structured datasets.
Let's look at some of the most common extensions and modifications of the original CART algorithm.
Handling of Categorical Features¶
The basic CART algorithm supports both categorical (nominal and ordinal) and numerical (interval and ratio) features. While ordinal features can easily be converted into order-preserving numerical features, nominal features have to be treated separately since enumerating all possible splits works differently. Instead of finding all possible (and meaningful) thresholds, nominal features are split based on partitioning the feature values. This has some practical consequences, most importantly with respect to performance.
Ease of Use¶
Although arguably a minor concern, recall that in our example implementation above, we had to explicitly specify the type for each feature: [ratio, nominal, nominal, ratio] for the four features "Age", "Education", "Marital_Status", and "Annual_Income". This is only needed because we have to treat nominal features differently from the other types. Note that the type of a feature cannot be reliably derived by the algorithm itself as it generally requires domain knowledge about the dataset. For example, even if a phone number is represented as a number, it is usually considered a nominal value (i.e., just a label) since there is no meaningful order defined over phone numbers. Similarly for strings, we already discussed that treating the feature "Education" as a nominal or ordinal feature is part of the decision process when analysing the data or using it for model training.
As such, many implementations of the CART algorithm address this by expecting only numerical features. This means that the handling of categorical features must be done as part of the data preparation step. For ordinal features, we already saw that this is relatively easy; we only need to ensure that the order/ranking of feature values is preserved. Converting nominal features into numerical ones is typically less straightforward. Nominal data, which represents categories without inherent order (e.g., colors, names, product types), must be converted into numerical form for most machine learning algorithms. The most common strategy is one-hot encoding, where each category becomes a binary feature (1 if present, 0 otherwise). While simple and widely used, it can lead to high-dimensional data if there are many unique categories. More advanced methods include target encoding (replacing categories with the mean of the target variable for each category), frequency/count encoding (replacing categories with their frequency of occurrence), and embedding-based encoding (learning dense vector representations of categories, often used in deep learning). Each method has its pros and cons, balancing trade-offs between dimensionality, interpretability, and model suitability, and the choice depends on the dataset characteristics.
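As a small illustration, the sketch below applies one-hot encoding to a tiny, made-up customer table that reuses the feature names from earlier; the values are hypothetical and only serve to show the mechanics.
# Minimal sketch: one-hot encoding nominal features with pandas (toy values for illustration)
import pandas as pd

df_demo = pd.DataFrame({
    "Age": [25, 42, 37],
    "Education": ["Bachelor", "Masters", "Phd"],
    "Marital_Status": ["Single", "Married", "Single"],
    "Annual_Income": [40000, 85000, 62000],
})

# Each category of the nominal features becomes its own binary (0/1) column
df_encoded = pd.get_dummies(df_demo, columns=["Education", "Marital_Status"])
print(df_encoded)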
Of course, if the implementation of the CART algorithm only accepts numerical inputs, there is no longer any need to explicitly specify the types of features. We do not even have to distinguish between interval and ratio features since CART only performs comparisons of the form "feature $\leq$ threshold?" and no arithmetic operations. Limiting the CART algorithm to numerical inputs not only makes it a bit easier to use and implement (after all, we can remove all the bits that are required to handle nominal data), it also helps to address performance considerations, as we will see next.
Performance & Efficiency¶
The basic CART algorithm has to evaluate $(k-1)$ and $(2^k-1)$ possible splits for non-nominal and nominal features, respectively, with $k$ being the number of unique values of a feature. Particularly the number of $(2^k-1)$ possible splits for nominal features can become a problem in practice if the number of unique feature values $k$ is large. One solution, as we just saw, is to convert a nominal feature into one or more numerical features using appropriate encoding strategies (one-hot encoding, target encoding, frequency encoding, etc.), which can bring the number of splits down to $(k-1)$. Besides this approach, there are other strategies to address performance issues. Again, we can distinguish between non-nominal and nominal features.
Non-Nominal Features¶
Popular implementations such as the DecisionTreeClassifier and DecisionTreeRegressor classes of the scikit-learn library do indeed consider and evaluate all $(k-1)$ possible splits. But again, if the number of unique values $k$ is very large (which is not unlikely for numerical features and large datasets), this can quickly degrade training performance. Also, if many values and therefore many thresholds are very similar, many of those thresholds will result in splits with very similar or even identical Gini or RSS scores. Thus, the basic approach is to only evaluate a subset of all possible thresholds.
A practical workaround to accomplish this is to approximate the continuous feature distribution by binning values into a fixed number of intervals (bins) or by building a histogram. Instead of testing every possible split, the algorithm only considers thresholds at the bin boundaries or representative values (like bin means). This reduces the number of candidate splits from potentially thousands (or more) to just the number of bins, making the algorithm significantly faster. The figure below shows an example of binning by adapting the previous example, which used all midpoints between unique values as thresholds. Here, we split the feature values into bins of size $3$, and use only the midpoints between the boundaries of adjacent bins as thresholds.
How the bins or histograms are generated depends on the exact algorithm, with fixed-count binning (all bins contain the same number of data samples), fixed-width binning (all bins cover the same range in the feature space), and quantile-based binning (bins can have different widths depending on the data distribution) being the most common strategies. The overall idea is the same: limiting the number of splits that need to be evaluated in a meaningful and systematic manner. Approaches such as binning, where the thresholds are means over several feature values, can also help to reduce overfitting due to small fluctuations or noise in the data, and Decision Trees can be rather susceptible to noise (see the introductory notebooks about Decision Trees for more details).
On the other hand, there is always the risk of approximation errors if we miss a threshold that would actually give us the best split. There is also no obvious choice for the number of bins or the granularity of the histogram. For example, while using only a few bins means more computational savings, it also means a higher risk of approximation errors (and vice versa when using many bins). Histogram-based approximations are therefore often used in tree-based ensemble methods such as LightGBM, CatBoost, or XGBoost, where the whole model comprises many trees and the approximation errors across all trees more or less cancel out.
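The sketch below illustrates the effect of such an approximation on a single numeric feature: instead of evaluating midpoints between all unique values, only a handful of quantile-based bin boundaries are considered as candidate thresholds. The feature values are randomly generated purely for illustration.
# Minimal sketch: reducing the number of candidate thresholds via quantile-based binning
import numpy as np

rng = np.random.default_rng(0)
feature = rng.normal(size=10_000)  # a numeric feature with many unique values

# Exhaustive approach: midpoints between all sorted unique values
unique_vals = np.unique(feature)
exhaustive_thresholds = (unique_vals[:-1] + unique_vals[1:]) / 2

# Approximation: only the boundaries of a fixed number of quantile-based bins
n_bins = 32
binned_thresholds = np.quantile(feature, np.linspace(0, 1, n_bins + 1)[1:-1])

print(f"{len(exhaustive_thresholds)} exhaustive vs. {len(binned_thresholds)} binned candidate thresholds")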
Nominal Features¶
In the case of nominal features, the fundamental goal is to somehow restrict the number of considered splits to avoid the maximum number of $(2^k-1)$ possible splits. The most basic strategy is to impose an ordering through target encoding. Target encoding is a method for converting nominal (categorical) features into numerical values by replacing each category with a statistic derived from the target variable. Instead of creating many binary indicators (as in one-hot encoding), target encoding captures the relationship between the category and the target in a single numeric value, which can be more compact and predictive. For regression tasks, target encoding usually replaces each category with the mean target value of the samples belonging to that category. For binary classification tasks, it typically replaces each category with the proportion of samples of the positive class; for multi-class classification, one can encode a category with a vector of probabilities across all classes.
To get a better idea, let's look at an oversimplified example of our bank customer data. The dataset in the table below shows only one input feature, "Education", and the two possible targets "Credit_Limit" (regression) and "Credit_Default" (binary classification).
| Education | Credit_Limit | Credit_Default |
|---|---|---|
| Bachelor | 200 | YES (1) |
| Bachelor | 220 | NO (0) |
| Masters | 150 | YES (1) |
| Masters | 180 | NO (0) |
| Masters | 240 | NO (0) |
| Phd | 300 | NO (0) |
For the regression task of predicting the credit limit, we compute the mean target value for each education level: "Bachelor" $\rightarrow (200+220)/2 = 210$, "Masters" $\rightarrow (150+180+240)/3 = 190$, and "Phd" $\rightarrow (300)/1 = 300$. For the classification task of predicting whether customers are likely to default on the credit, we compute the proportion of defaults for each education level: "Bachelor" $\rightarrow (1+0)/2 = 0.5$, "Masters" $\rightarrow (1+0+0)/3 = 0.33$, and "Phd" $\rightarrow (0)/1 = 0$ (we just need a single new feature since we only have two classes). The resulting dataset containing the two new features encoding "Education" is shown below:
| Education | Credit_Limit | Credit_Default | Edu_enc_regression | Edu_enc_classification |
|---|---|---|---|---|
| Bachelor | 200 | YES (1) | 210 | 0.5 |
| Bachelor | 220 | NO (0) | 210 | 0.5 |
| Masters | 150 | YES (1) | 190 | 0.33 |
| Masters | 180 | NO (0) | 190 | 0.33 |
| Masters | 240 | NO (0) | 190 | 0.33 |
| Phd | 300 | NO (0) | 300 | 0.0 |
After the encoding, we can treat the information about the education level of the customers as a numerical value, bringing the number of possible splits down to $(k-1)$. While target encoding is a common strategy applied during data preprocessing/preparation, it can also be implemented as part of the Decision Tree learning algorithm itself. Note that practical implementations often do not use raw averages (especially if a category has very few samples), but apply smoothing techniques or compute values using cross-validation to prevent data leakage. Similar to non-nominal features, methods such as binning can be applied to further reduce the number of splits to be evaluated.
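The target encoding from the toy table above can be reproduced in a few lines. The sketch below uses plain group-wise means without smoothing or cross-validation, so it is only meant to show the mechanics.
# Minimal sketch: target encoding the toy "Education" feature with pandas group-wise means
import pandas as pd

df_toy = pd.DataFrame({
    "Education":      ["Bachelor", "Bachelor", "Masters", "Masters", "Masters", "Phd"],
    "Credit_Limit":   [200, 220, 150, 180, 240, 300],
    "Credit_Default": [1, 0, 1, 0, 0, 0],
})

# Regression: replace each category with the mean Credit_Limit of that category
df_toy["Edu_enc_regression"] = df_toy.groupby("Education")["Credit_Limit"].transform("mean")

# Binary classification: replace each category with its default rate (mean of the 0/1 target)
df_toy["Edu_enc_classification"] = df_toy.groupby("Education")["Credit_Default"].transform("mean")

print(df_toy)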
Handling of Missing Values¶
The original CART algorithm assumes that all feature values are available when evaluating candidate splits, which means it cannot naturally handle missing values in the training data. This is problematic because real-world datasets almost always contain missing entries due to errors in data collection, privacy restrictions, or unrecorded measurements. The most straightforward solution is to handle missing values as part of the cleaning and preparation of the training data (e.g., by discarding data samples with missing values or by applying suitable imputation strategies). However, the simplicity of CART and other Decision Tree learning algorithms also allows for handling missing values as part of the learning algorithm itself.
"Best Guess" Strategy¶
This approach first finds the best split by ignoring all missing values. Once we have identified the feature associated with the best split, and this feature does indeed have missing values for some set $S$ of data samples, we check whether moving all samples in $S$ to the left or the right child yields a lower Gini score, move them there, and continue as usual. The only thing we need to remember is where we moved the data samples at this particular split (i.e., internal node). When making predictions, if an unseen data sample reaches this internal node and has a missing value for the relevant feature, we follow the branch we remembered during training.
This extension to the original CART algorithm is quite straightforward and easy to implement. However, there is always the chance that for some data samples with missing values we might make a very bad decision. The general assumption is that each prediction takes a lot of decisions into account and one single decision will only have a limited effect. Also, when making predictions, an unseen data sample with a missing value for feature $x_i$ might not even reach an internal node that checks $x_i$.
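To make the mechanics concrete, here is a small, self-contained sketch of the decision made at a single node: for a fixed feature and threshold, the missing-value samples are routed to whichever child yields the lower weighted Gini score. The helper functions and data are illustrative and not part of the repository code.
# Minimal sketch: "best guess" routing of missing values at a single split (illustrative only)
import numpy as np

def gini(labels):
    # Gini impurity of a label vector
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

def weighted_gini(y_left, y_right):
    n = len(y_left) + len(y_right)
    return (len(y_left) * gini(y_left) + len(y_right) * gini(y_right)) / n

def best_guess_direction(x, y, threshold):
    # Split the non-missing samples as usual
    missing = np.isnan(x)
    go_left = x[~missing] <= threshold
    y_left, y_right, y_missing = y[~missing][go_left], y[~missing][~go_left], y[missing]
    # Try sending all missing samples left, then right, and keep the lower-impurity option
    score_left = weighted_gini(np.concatenate([y_left, y_missing]), y_right)
    score_right = weighted_gini(y_left, np.concatenate([y_right, y_missing]))
    return "left" if score_left <= score_right else "right"

x_demo = np.array([1.0, 2.0, np.nan, 4.0, np.nan, 6.0])
y_demo = np.array([0, 0, 0, 1, 0, 1])
print(best_guess_direction(x_demo, y_demo, threshold=3.0))  # the chosen direction is remembered for prediction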
Surrogate Splits¶
Surrogate splits are a strategy used in Decision Tree training (e.g., in the original CART algorithm) to handle missing values in feature data. The main idea is that when the best split for a node (the primary split) cannot be applied because a data point is missing the required feature value, the algorithm looks for an alternative feature — called a surrogate split — that partitions the data in a way most similar to the primary split. In other words, surrogate splits act as "stand-ins", mimicking the decision boundary of the primary split as closely as possible. During training, the algorithm identifies surrogate splits by ranking other features according to how well their splits agree with the primary split. At inference time, if a sample is missing the value for the primary feature, the tree checks the surrogates in order until one can be applied, ensuring that the sample still gets routed down a branch. This approach maintains predictive power without discarding incomplete data, while also giving insights into feature correlations since good surrogates often capture similar information to the primary split.
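The following sketch shows one simple way such a ranking could be computed: for a given primary split, each candidate surrogate feature is scored by the fraction of samples it routes to the same side as the primary split (its agreement). The data and the helper function are hypothetical and only illustrate the idea.
# Minimal sketch: scoring surrogate split candidates by their agreement with a primary split
import numpy as np

def best_surrogate_threshold(x_surrogate, primary_goes_left):
    # Find the threshold on the surrogate feature whose split agrees most often
    # with the left/right assignment of the primary split
    best_agreement, best_threshold = 0.0, None
    for t in np.unique(x_surrogate)[:-1]:
        agreement = np.mean((x_surrogate <= t) == primary_goes_left)
        agreement = max(agreement, 1.0 - agreement)  # a surrogate may also mirror the split
        if agreement > best_agreement:
            best_agreement, best_threshold = agreement, t
    return best_agreement, best_threshold

rng = np.random.default_rng(0)
x_primary = rng.normal(size=200)
x_correlated = x_primary + 0.1 * rng.normal(size=200)  # strongly related feature -> good surrogate
x_unrelated = rng.normal(size=200)                     # unrelated feature -> poor surrogate

primary_goes_left = x_primary <= 0.0
print(best_surrogate_threshold(x_correlated, primary_goes_left))
print(best_surrogate_threshold(x_unrelated, primary_goes_left))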
The main limitation of surrogate splits is that they add computational overhead. For every split in the tree, the algorithm must search for and rank alternative splits across other features, which increases training time — especially in high-dimensional datasets. At inference time, evaluating surrogate splits can also slow down predictions since multiple fallbacks may need to be checked for each missing value. Another downside is that surrogate splits are only as good as the correlations between features. If no other feature strongly mimics the primary split, the surrogate will perform poorly, potentially sending samples down suboptimal branches. This can reduce accuracy compared to more robust missing value handling methods (e.g., imputation). Moreover, surrogate splits may amplify feature biases, since correlated features tend to dominate both primary and surrogate choices, limiting diversity in the decision process. Finally, surrogate splits do not fundamentally solve the missing data problem — they only provide a workaround — so if missingness is systematic (not random), the tree can still inherit that bias.
Handling Overfitting & Underfitting: Pre-Pruning¶
The original CART algorithm grows a full tree and then prunes it back using cost-complexity pruning. This is done because a fully grown decision tree tends to overfit the training data, capturing noise and irregularities that do not generalize well. In short, fully grown Decision Trees are prone to overfitting. To illustrate this idea, let's look at another toy dataset consisting of 33 data samples with two features $x_1$ and $x_2$ for a binary classification task. As usual, we first load the data from file into a Pandas DataFrame and then convert it into NumPy arrays to represent the feature matrix X and the target vector y:
# Load data file into Pandas DataFrame
df_cart = pd.read_csv(cart_data)
# Convert DataFrame into feature matrix and target vector
X = df_cart.drop(["y"], axis=1).to_numpy()
y = df_cart["y"].to_numpy()
We provide the auxiliary method plot_cart_data() to show the data distribution; just run the code cell below to generate the plot. The colors of the data samples indicate the class labels. Note that the plots do not show axis labels since they are not needed in the following.
plot_cart_data(X, y)
Overall, the dataset shows a fairly straightforward distribution with respect to the classes. However, the top-most red data sample seems a little bit "out of place" as it sits within a generally blue region of the distribution. If such data samples are outliers due to noise, they can negatively affect the test performance of fully grown Decision Trees. Let's actually use this dataset to train different Decision Trees to get a better intuition for this phenomenon.
To train a Decision Tree, we use the DecisionTreeClassifier class of the scikit-learn library. Since our two features are both numerical, and by default the class returns a fully grown tree, it behaves just like the original CART algorithm, except that no post-pruning is applied by default. As the code cell below shows, training a Decision Tree using the DecisionTreeClassifier class basically just requires a single line of code. Once the Decision Tree is trained, we can also visualize it using another auxiliary method, plot_decision_tree().
tree_full = DecisionTreeClassifier().fit(X, y)
plot_decision_tree(tree_full)
Since the dataset does not contain multiple data samples with the same feature values but different class labels, all leaf nodes are completely pure. We can also see that the tree already looks fairly large despite the small dataset size and overall simple data distribution. In fact, the main reason for this is the single red outlier. To better visualize this, we can again plot all data samples but now also include the resulting decision boundaries of the trained tree. The method plot_cart_data() accomplishes this if a trained model (here, a Decision Tree) is provided as input parameter.
plot_cart_data(X, y, model=tree_full)
Firstly, notice that the decision boundaries are a combination of horizontal and vertical lines because each decision at an internal node is a simple comparison based on a single feature (and threshold). But more importantly, the decision boundaries show this small red area. Every new, unseen data sample that falls into this area would be labeled red when it probably should be blue considering the overall distribution. This fully grown Decision Tree will arguably not generalize well over unseen data because it fitted the noise in the training data; in short, the tree is overfitting.
While the original CART algorithm addresses this issue by pruning a fully grown tree (i.e., post-pruning), most modern libraries implement pre-pruning strategies by including additional stopping rules to prevent overfitting and reduce computational cost. In general, these rules decide whether an internal node — even if the node is not pure — should be split further or converted into a leaf node. Early stopping rules (also called pre-pruning criteria) are conditions that prevent a decision tree from growing too deep or too complex during training. Instead of first growing a full tree and pruning later, these rules stop the splitting process once further growth is unlikely to improve generalization. Common rules include:
Maximum depth: Stop splitting once a predefined tree depth is reached. This limits model complexity and ensures the tree does not grow arbitrarily deep.
Minimum samples per split / per leaf: Require a node to have at least a certain number of training samples to be split. Similarly, enforce that each leaf must contain at least a minimum number of samples. This avoids overly specific rules based on very small subsets of data.
Maximum number of leaves / nodes: Impose a global cap on the number of leaves or nodes in the tree. Once this limit is reached, the tree stops growing.
Impurity threshold: Stop splitting if the reduction in impurity (e.g., Gini index, variance) from the best candidate split is smaller than a chosen threshold. This avoids unnecessary splits that add little predictive value.
Early stopping via validation set: Some implementations check performance on a held-out validation set during training, and stop splitting when additional splits no longer improve validation accuracy.
In practice, most libraries (incl. scikit-learn) allow a combination of these rules, giving fine-grained control over the balance between tree complexity and generalization. In the following, to give an example, we consider the maximum depth of a Decision Tree as a stopping rule. For this, the DecisionTreeClassifier class of the scikit-learn library features the input argument max_depth. In the code cell below, we train and plot another Decision Tree over our toy dataset but now enforcing a maximum depth of $3$.
model_maxdepth3 = DecisionTreeClassifier(max_depth=3).fit(X, y)
plot_decision_tree(model_maxdepth3)
By restricting maximum depth, the Decision Tree is now missing the "lowest" subtree of the fully grown tree. Again, to better appreciate the effect of this change, let's plot the new decision boundaries for the pre-pruned tree.
plot_cart_data(X, y, model=model_maxdepth3)
These decision boundaries are arguably better: they still capture the overall data distribution rather well, but they are no longer prone to the noise. We can therefore expect this tree to generalize better over unseen data compared to the fully grown tree. Of course, limiting the maximum depth too much will also cause problems. To see this, we train and plot a third Decision Tree but now enforce a maximum depth of only $2$.
model_maxdepth2 = DecisionTreeClassifier(max_depth=2).fit(X, y)
plot_decision_tree(model_maxdepth2)
Further limiting the maximum depth naturally speeds up the training since we perform fewer splits. However, the resulting tree is also likely to no longer have enough complexity/capacity to fit the training data well. This means our model is underfitting. We can best see this when looking at the decision boundaries once more.
plot_cart_data(X, y, model=model_maxdepth2)
This Decision Tree now fails to capture more general trends in the data distribution; see bottom-right corner of the plot where the three red data samples should arguably form a red area.
Beyond max_depth, the DecisionTreeClassifier class provides other arguments to define different types of stopping rules, including min_samples_split (the minimum number of samples required to split an internal node), min_samples_leaf (the minimum number of samples required to be at a leaf node), min_impurity_decrease (a node will be split only if this split induces a decrease of the impurity greater than or equal to this value), and others, which can also be combined for fine-grained rules. The downside of this flexibility of having various parameters to define stopping rules is the challenge of finding optimal values for those parameters. This is typically done using methods such as cross-validation, where different trees are trained with different stopping rules and evaluated on some unseen validation dataset.
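As a quick illustration of combining several stopping rules in one model, the sketch below trains another tree on the toy data from above; the chosen parameter values are arbitrary and only serve as an example.
# Minimal sketch: combining several pre-pruning rules (illustrative parameter values)
model_prepruned = DecisionTreeClassifier(
    max_depth=4,                 # never grow deeper than 4 levels
    min_samples_leaf=2,          # every leaf must contain at least 2 samples
    min_impurity_decrease=0.01,  # ignore splits with a negligible impurity reduction
).fit(X, y)
plot_decision_tree(model_prepruned)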
Using cross-validation to tune early stopping parameters in Decision Tree training (such as maximum depth, minimum samples per split, or minimum impurity decrease) can be computationally expensive because it requires training multiple trees repeatedly. For each candidate parameter setting, the algorithm must grow a tree on multiple folds of the training data and then evaluate its performance on the validation folds. Since decision tree construction involves recursively evaluating many potential splits at each node (which is itself computationally heavy, especially for large datasets or high-dimensional data), repeating this process across many folds and parameter combinations quickly multiplies the cost. Moreover, early stopping parameters interact in non-trivial ways — e.g., a shallower maximum depth might behave differently depending on minimum split size — so exploring the parameter space often requires testing many combinations. This means the cross-validation procedure essentially simulates training dozens or even hundreds of full trees, making it significantly more expensive than training a single tree without tuning.
A common best practice to address the computational cost of cross-validation in Decision Tree parameter tuning is to limit the search space and use more efficient search strategies. Instead of exhaustively trying all parameter combinations (grid search), practitioners often rely on randomized search or Bayesian optimization, which sample or prioritize promising parameter regions while reducing the number of models trained. Another strategy is to use smaller subsets of the data during initial tuning to quickly narrow down plausible ranges, and then refine the search on the full dataset. Additionally, practitioners often use simpler validation strategies such as a single train–validation split, stratified sampling, or k-fold cross-validation with fewer folds (e.g., 3 instead of 10) when datasets are large. Finally, using ensemble methods like Random Forests or Gradient Boosted Trees (see below), which have built-in robustness and regularization, can reduce the need for fine-grained tuning of early stopping parameters in a single tree, thereby improving efficiency without losing much predictive power.
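To make this more tangible, here is a minimal sketch of a randomized search over a few pre-pruning parameters using scikit-learn's RandomizedSearchCV on the toy data from above; the parameter ranges, the number of sampled combinations, and the number of folds are arbitrary choices for illustration.
# Minimal sketch: randomized search over pre-pruning parameters (illustrative settings)
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    "max_depth": [2, 3, 4, 5, None],
    "min_samples_leaf": [1, 2, 5],
    "min_impurity_decrease": [0.0, 0.01, 0.05],
}

search = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=0),
    param_distributions=param_distributions,
    n_iter=20,  # sample only 20 parameter combinations instead of the full grid
    cv=3,       # fewer folds keep the tuning relatively cheap
    random_state=0,
)
search.fit(X, y)
print(search.best_params_)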
Integration into Ensembles¶
The original CART algorithm is a standalone tree learner. Individual Decision Trees tend to suffer from high variance: small changes in the training data can lead to very different splits and, therefore, completely different trees. This makes them unstable and prone to overfitting, especially when the tree is allowed to grow deep without pruning or early stopping. Another limitation is that Decision Trees can be biased toward features with many possible splits (e.g., categorical variables with many levels), which may not necessarily carry the most predictive power. Furthermore, they often produce decision boundaries that are axis-aligned and piecewise constant (see the decision boundary plots above), which may not capture more complex relationships between features.
Therefore, in practice, Decision Tree learning algorithms (and particularly CART or CART-style trees) are often used as base learners in ensembles. The basic idea of using Decision Trees as base learners in ensemble methods is that, although individual trees are weak and unstable predictors, their weaknesses can be overcome by combining many of them. Methods like bagging (e.g., Random Forests) reduce variance by averaging the predictions of many trees trained on different bootstrap samples, while boosting (e.g., Gradient Boosted Trees) reduces bias by sequentially training trees that correct the errors of previous ones. In both cases, the ensemble leverages the diversity and adaptability of trees to produce a much stronger overall model. When aggregated in an ensemble, Decision Trees achieve high predictive accuracy, robustness to overfitting, and better generalization compared to a single tree, while still retaining some interpretability (e.g., feature importance measures). This makes them particularly powerful base learners in modern machine learning. In short, while individual Decision Trees are intuitive, fast to train, and easy to interpret, they often perform worse than ensemble methods in terms of predictive accuracy.
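As a brief illustration, the sketch below compares a single fully grown tree with a bagging ensemble of such trees (a Random Forest) on the toy data from above using cross-validation; the number of trees and folds are arbitrary choices.
# Minimal sketch: single tree vs. a bagging ensemble of trees on the toy data from above
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

single_tree = DecisionTreeClassifier(random_state=0)
forest = RandomForestClassifier(n_estimators=100, random_state=0)  # 100 trees on bootstrap samples

print("single tree:  ", cross_val_score(single_tree, X, y, cv=3).mean())
print("random forest:", cross_val_score(forest, X, y, cv=3).mean())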
Summary. The table below provides a brief summary of the main differences between the original CART learning algorithm and (most) practical implementations. The differences mainly address practical considerations such as training and inference efficiency, but also data-related issues such as missing values.
| Aspect | Original CART (Breiman et al., 1984) | Practical Implementations (e.g., scikit-learn, XGBoost) |
|---|---|---|
| Splitting Criterion | Binary splits based on impurity (Gini, variance) | Same, but may include optimizations (e.g., approximate splits, histogram binning) |
| Handling Missing Values | Surrogate splits | Imputation during preprocessing, specialized split rules, or default split directions |
| Tree Growth | Grows full tree, post-pruning | Typically pre-pruning through early stopping with constraints (but post-pruning still applicable) |
| Categorical Features | Arbitrary partitions of categories | Data encoding during preprocessing (e.g., one-hot encoding), grouping, or restricted partitions |
| Computational Cost | Exhaustive search for best split | Approximation techniques (quantiles, binning) to improve speed |
| Interpretability | Simple yes/no binary splits | Still binary, but may be constrained for readability in practice |
| Usage | Standalone decision tree | Often used as base learners in ensembles (Random Forests, Gradient Boosted Trees) |
Summary¶
The Classification and Regression Trees (CART) algorithm, introduced by Breiman et al. in 1984, is one of the foundational methods for training decision trees. CART formalized the idea of recursively splitting data into subsets that are increasingly homogeneous with respect to the target variable, using measures of impurity such as the Gini index for classification or variance reduction for regression. By applying binary splits at each node, CART constructs an interpretable tree structure that can capture nonlinear relationships and complex interactions between features without requiring extensive preprocessing.
One of CART's key contributions is the framework of cost-complexity pruning, which prevents trees from growing overly large and overfitting the training data. Instead of stopping early based on heuristics, CART first grows a large tree and then prunes it back by balancing accuracy against model simplicity. This principle of controlling model complexity remains central in decision tree learning today. CART also introduced systematic approaches to handle practical issues such as missing values (through surrogate splits), and the ability to work with both categorical and numerical data, which made it widely applicable.
However, there are important differences between the original CART algorithm and modern practical implementations found in libraries like scikit-learn, XGBoost, or LightGBM. While CART is a clean, theoretical framework, practical implementations often make approximations for efficiency. For example, instead of evaluating all possible splits for continuous features, many implementations consider only candidate thresholds (e.g., midpoints between sorted unique values, or quantile-based bins). Similarly, surrogate splits are rarely implemented in practice due to their computational cost. Instead, missing values are often handled through imputation, specialized split rules, or default directions in the tree. These adaptations allow decision tree training to scale to large datasets with high-dimensional feature spaces.
Despite being a relatively old algorithm, decision trees remain highly relevant. Their interpretability makes them valuable on their own, but more importantly, they serve as the foundation for powerful ensemble methods such as Random Forests and Gradient Boosted Trees (e.g., XGBoost, LightGBM, CatBoost). These ensemble approaches combine many trees to achieve robustness and accuracy far beyond that of a single tree, and they consistently achieve state-of-the-art results on structured, tabular data — often outperforming deep learning in these domains. This enduring success highlights both the elegance of CART's original design and its lasting influence on modern machine learning.