From 276e92df77e0cbfb0312806ce664e8469a782d7c Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Wed, 23 Aug 2023 15:49:28 -0400 Subject: [PATCH 1/6] FIX remove class weight & optimize node splitting --- sklearn/ensemble/_forest.py | 15 --------------- sklearn/tree/_tree.pyx | 2 +- 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 398fd31ab2be2..5a252178b958b 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1252,17 +1252,6 @@ def partial_fit(self, X, y, sample_weight=None, classes=None): self.n_outputs_ = y.shape[1] - y, expanded_class_weight = self._validate_y_class_weight(y) - - if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: - y = np.ascontiguousarray(y, dtype=DOUBLE) - - if expanded_class_weight is not None: - if sample_weight is not None: - sample_weight = sample_weight * expanded_class_weight - else: - sample_weight = expanded_class_weight - if not self.bootstrap and self.max_samples is not None: raise ValueError( "`max_sample` cannot be set if `bootstrap=False`. 
" @@ -1367,10 +1356,6 @@ def partial_fit(self, X, y, sample_weight=None, classes=None): else: self._set_oob_score_and_attributes(X, y) - # Decapsulate classes_ attributes - if hasattr(self, "classes_") and self.n_outputs_ == 1: - self.n_classes_ = self.n_classes_[0] - self.classes_ = self.classes_[0] return self def predict(self, X): diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index d3b7cb4c26f18..2e281a3d2ad62 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -319,7 +319,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if not first: # push reached leaf nodes onto stack - for key, value in reversed(sorted(self.initial_roots.items())): + for key, value in sorted(self.initial_roots.items()): end += value[0] update_stack.push({ "start": start, From fd833878d35fbd553c10b0363dd67a42e374b09d Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Wed, 23 Aug 2023 23:37:44 -0400 Subject: [PATCH 2/6] FIX revert changes & correct classes param --- sklearn/ensemble/_forest.py | 19 +++++++++++++++++++ sklearn/tree/_tree.pyx | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 5a252178b958b..8fc5739450367 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1252,6 +1252,21 @@ def partial_fit(self, X, y, sample_weight=None, classes=None): self.n_outputs_ = y.shape[1] + classes = self.classes_ + if self.n_outputs_ == 1: + classes = [classes] + + y, expanded_class_weight = self._validate_y_class_weight(y, classes) + + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + + if expanded_class_weight is not None: + if sample_weight is not None: + sample_weight = sample_weight * expanded_class_weight + else: + sample_weight = expanded_class_weight + if not self.bootstrap and self.max_samples is not None: raise ValueError( "`max_sample` cannot be set if `bootstrap=False`. 
" @@ -1356,6 +1371,10 @@ def partial_fit(self, X, y, sample_weight=None, classes=None): else: self._set_oob_score_and_attributes(X, y) + # Decapsulate classes_ attributes + if hasattr(self, "classes_") and self.n_outputs_ == 1: + self.n_classes_ = self.n_classes_[0] + self.classes_ = self.classes_[0] return self def predict(self, X): diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 2e281a3d2ad62..d3b7cb4c26f18 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -319,7 +319,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if not first: # push reached leaf nodes onto stack - for key, value in sorted(self.initial_roots.items()): + for key, value in reversed(sorted(self.initial_roots.items())): end += value[0] update_stack.push({ "start": start, From b08b6cdf4526e4c2dcab6bee9c4927904dbac6f4 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Wed, 23 Aug 2023 23:57:55 -0400 Subject: [PATCH 3/6] FIX decouple classes param --- sklearn/ensemble/_forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 8fc5739450367..3ca1a2d347623 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1345,7 +1345,7 @@ def partial_fit(self, X, y, sample_weight=None, classes=None): verbose=self.verbose, class_weight=self.class_weight, n_samples_bootstrap=n_samples_bootstrap, - classes=classes, + classes=classes[0], ) for i, t in enumerate(self.estimators_) ) From b92c9eefe3bf8e3eced6cd6151fe616adff660f6 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Thu, 24 Aug 2023 10:05:07 -0400 Subject: [PATCH 4/6] FIX remove unnecessary resize for update --- sklearn/tree/_tree.pyx | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index d3b7cb4c26f18..22c97e54ca50d 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -268,16 +268,6 @@ cdef class 
DepthFirstTreeBuilder(TreeBuilder): # check input X, y, sample_weight = self._check_input(X, y, sample_weight) - # Initial capacity - cdef int init_capacity - - if tree.max_depth <= 10: - init_capacity = (2 ** (tree.max_depth + 1)) - 1 - else: - init_capacity = 2047 - - tree._resize(init_capacity) - # Parameters cdef Splitter splitter = self.splitter cdef SIZE_t max_depth = self.max_depth @@ -286,10 +276,19 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SIZE_t min_samples_split = self.min_samples_split cdef double min_impurity_decrease = self.min_impurity_decrease + # Initial capacity + cdef int init_capacity cdef bint first = 0 if self.initial_roots is None: # Recursive partition (without actual recursion) splitter.init(X, y, sample_weight, missing_values_in_feature_mask) + + if tree.max_depth <= 10: + init_capacity = (2 ** (tree.max_depth + 1)) - 1 + else: + init_capacity = 2047 + + tree._resize(init_capacity) first = 1 cdef SIZE_t start = 0 From f1578aa80d267d0f19bbf9b91c56078dda995f56 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 24 Aug 2023 16:21:52 -0400 Subject: [PATCH 5/6] Make methods inline Signed-off-by: Adam Li --- sklearn/tree/_splitter.pxd | 2 +- sklearn/tree/_splitter.pyx | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 7da118347414a..d1731894bb134 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -126,7 +126,7 @@ cdef class Splitter(BaseSplitter): # Methods that allow modifications to stopping conditions cdef bint check_presplit_conditions( self, - SplitRecord current_split, + SplitRecord* current_split, SIZE_t n_missing, bint missing_go_to_left, ) noexcept nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 608e582685aca..982c68455040d 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -327,9 +327,9 @@ cdef class Splitter(BaseSplitter): return 
self.criterion.node_impurity() - cdef bint check_presplit_conditions( + cdef inline bint check_presplit_conditions( self, - SplitRecord current_split, + SplitRecord* current_split, SIZE_t n_missing, bint missing_go_to_left, ) noexcept nogil: @@ -356,7 +356,7 @@ cdef class Splitter(BaseSplitter): return 0 - cdef bint check_postsplit_conditions( + cdef inline bint check_postsplit_conditions( self ) noexcept nogil: """Check stopping conditions after evaluating the split. @@ -571,7 +571,7 @@ cdef inline int node_split_best( else: n_left = current_split.pos - splitter.start n_right = end_non_missing - current_split.pos + n_missing - if splitter.check_presplit_conditions(current_split, n_missing, missing_go_to_left) == 1: + if splitter.check_presplit_conditions(&current_split, n_missing, missing_go_to_left) == 1: continue criterion.update(current_split.pos) @@ -914,7 +914,7 @@ cdef inline int node_split_random( current_split.pos = partitioner.partition_samples(current_split.threshold) # Reject if min_samples_leaf is not guaranteed - if splitter.check_presplit_conditions(current_split, 0, 0) == 1: + if splitter.check_presplit_conditions(&current_split, 0, 0) == 1: continue # Evaluate split From 3eb84ae84f239ff6f024aa5a2b52815ee60127e4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 24 Aug 2023 16:38:01 -0400 Subject: [PATCH 6/6] Make more methods inline for optimized runtime Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 22c97e54ca50d..cd664dc4adc38 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1147,7 +1147,7 @@ cdef class BaseTree: return node_id - cdef SIZE_t _update_node( + cdef inline SIZE_t _update_node( self, SIZE_t parent, bint is_left,