I’m taking a machine learning course with an assignment to implement the fit method of a DecisionTreeClassifier. Here are the links for more information:
https://stepik.org/lesson/333977/step/8?auth=login&unit=317388
https://stepik.org/lesson/333977/step/3?auth=login&unit=317388
Here’s my code:
import numpy as np
import pandas as pd


class MyTreeClf:
    def __init__(self, max_depth=5, min_samples_split=2, max_leafs=20):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.tree = None
        self.leafs_cnt = 0

    def node_entropy(self, probs):
        # Shannon entropy; zero probabilities are skipped to avoid log2(0)
        return -np.sum([p * np.log2(p) for p in probs if p > 0])

    def node_ig(self, x_col, y, split_value):
        # Information gain of splitting x_col at split_value
        left_mask = x_col <= split_value
        right_mask = x_col > split_value
        if len(x_col[left_mask]) == 0 or len(x_col[right_mask]) == 0:
            return 0
        left_probs = np.bincount(y[left_mask]) / len(y[left_mask])
        right_probs = np.bincount(y[right_mask]) / len(y[right_mask])
        entropy_after = (len(y[left_mask]) / len(y) * self.node_entropy(left_probs)
                         + len(y[right_mask]) / len(y) * self.node_entropy(right_probs))
        entropy_before = self.node_entropy(np.bincount(y) / len(y))
        return entropy_before - entropy_after

    def get_best_split(self, X: pd.DataFrame, y: pd.Series):
        # Candidate thresholds are midpoints between consecutive unique values of each column
        best_col, best_split_value, best_ig = None, None, -np.inf
        for col in X.columns:
            sorted_unique_values = np.sort(X[col].unique())
            for i in range(1, len(sorted_unique_values)):
                split_value = (sorted_unique_values[i - 1] + sorted_unique_values[i]) / 2
                ig = self.node_ig(X[col], y, split_value)
                if ig > best_ig:
                    best_ig = ig
                    best_col = col
                    best_split_value = split_value
        return best_col, best_split_value, best_ig

    def fit(self, X: pd.DataFrame, y: pd.Series, depth=0):
        if depth == 0:
            self.tree = {}
        best_col, best_split_value, best_ig = self.get_best_split(X, y)
        if (depth < self.max_depth and len(y) >= self.min_samples_split
                and self.leafs_cnt < self.max_leafs and best_col is not None):
            left_mask = X[best_col] <= best_split_value
            right_mask = X[best_col] > best_split_value
            self.tree[depth] = {'col': best_col, 'split': best_split_value, 'left': {}, 'right': {}}
            self.fit(X[left_mask], y[left_mask], depth + 1)
            self.fit(X[right_mask], y[right_mask], depth + 1)
        else:
            # Leaf node: store the majority class and count the leaf
            class_label = y.mode()[0]
            self.tree[depth] = {'class': class_label}
            self.leafs_cnt += 1
I’ve verified that the node_entropy, node_ig, and get_best_split methods work correctly. However, the fit method ends up with leafs_cnt = 4 on the first sample, while the expected result is 2.
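For reference, this is the kind of quick sanity check I ran on the helper methods. The values are made up (not the grader’s data), but a 50/50 class distribution should give an entropy of 1.0 bit, and a threshold that separates the classes perfectly should give an information gain equal to that parent entropy:

import pandas as pd

clf = MyTreeClf()
print(clf.node_entropy([0.5, 0.5]))  # 1.0: a 50/50 split carries one bit of entropy

x_col = pd.Series([1.0, 2.0, 3.0, 4.0])
y_check = pd.Series([0, 0, 1, 1])
print(clf.node_ig(x_col, y_check, 2.5))  # 1.0: the threshold 2.5 separates the classes perfectly
print(clf.get_best_split(pd.DataFrame({'f': x_col}), y_check))  # best column 'f', threshold 2.5, gain 1.0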
The test covers three datasets together with the parameters max_depth, min_samples_split, and max_leafs. The parameters given for the first sample are:
Sample Input: {"max_depth": 3, "min_samples_split": 2, "max_leafs": 1}
Sample Output: 2 (leafs_cnt)
Unfortunately, they didn’t provide the datasets themselves.
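For anyone reproducing this locally: the DataFrame below is a made-up stand-in, since the real course datasets aren’t published. Only the constructor arguments come from the first sample; the grader checks leafs_cnt, which should be 2 on its own data:

import pandas as pd

# Hypothetical data: the real Stepik datasets are not available,
# so this only shows how the class is driven and which attribute is checked.
X = pd.DataFrame({'col_0': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
                  'col_1': [5.0, 3.0, 6.0, 1.0, 2.0, 4.0]})
y = pd.Series([0, 0, 1, 1, 0, 1])

clf = MyTreeClf(max_depth=3, min_samples_split=2, max_leafs=1)  # first sample's parameters
clf.fit(X, y)
print(clf.leafs_cnt)  # the grader expects this to equal 2 on its first dataset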