Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scarliles/honesty #69

Draft
wants to merge 72 commits into
base: submodulev3
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 47 commits
Commits
Show all changes
72 commits
Select commit Hold shift + click to select a range
8c09f7f
init split condition injection
SamuelCarliles3 Feb 16, 2024
ecfc9b1
wip
SamuelCarliles3 Feb 16, 2024
0c3d5c0
wip
SamuelCarliles3 Feb 16, 2024
5fd12a2
wip
SamuelCarliles3 Feb 20, 2024
b593ee0
injection progress
SamuelCarliles3 Feb 27, 2024
180fac3
injection progress
SamuelCarliles3 Feb 27, 2024
c207c3e
split injection refactoring
SamuelCarliles3 Feb 27, 2024
7cc71c1
added condition parameter passthrough prototype
SamuelCarliles3 Feb 29, 2024
2470d49
some tidying
SamuelCarliles3 Feb 29, 2024
ee3399f
more tidying
SamuelCarliles3 Feb 29, 2024
a079e4f
splitter injection refactoring
SamuelCarliles3 Mar 10, 2024
5397b66
cython injection due diligence, converted min_sample and monotonic_cs…
SamuelCarliles3 Mar 15, 2024
44f1d57
tree tests pass huzzah!
SamuelCarliles3 Mar 18, 2024
4f19d53
added some splitconditions to header
SamuelCarliles3 Mar 18, 2024
cb71be0
commented out some sample code that was substantially increasing peak…
SamuelCarliles3 Mar 21, 2024
e34be5c
added vector resize
SamuelCarliles3 Apr 9, 2024
aac802e
wip
SamuelCarliles3 Apr 10, 2024
c12f2fd
Merge branch 'submodulev3' into scarliles/splitter-injection-redux
SamuelCarliles3 Apr 15, 2024
a7f5e92
settling injection memory management for now
SamuelCarliles3 Apr 15, 2024
7a70a0b
added regression forest benchmark
SamuelCarliles3 Apr 22, 2024
d9ad68a
Merge pull request #2 from ssec-jhu/scarliles/regression-benchmark
SamuelCarliles3 Apr 22, 2024
893d588
ran black for linting check
SamuelCarliles3 Apr 23, 2024
548493c
Merge branch 'submodulev3' of github.com:ssec-jhu/scikit-learn into s…
SamuelCarliles3 Apr 23, 2024
e4b53ff
Merge branch 'submodulev3' into scarliles/regression-benchmark
SamuelCarliles3 Apr 23, 2024
089d901
Merge branch 'neurodata:submodulev3' into submodulev3
SamuelCarliles3 Apr 24, 2024
3ba5f74
Merge branch 'submodulev3' of github.com:ssec-jhu/scikit-learn into s…
SamuelCarliles3 Apr 24, 2024
cf285c1
Merge branch 'scarliles/splitter-injection-redux' into scarliles/regr…
SamuelCarliles3 Apr 24, 2024
ffc6328
Merge pull request #3 from ssec-jhu/scarliles/regression-benchmark
SamuelCarliles3 Apr 24, 2024
87c90fd
initial pass at refactoring DepthFirstTreeBuilder.build
SamuelCarliles3 May 23, 2024
51da586
some renaming to make closure pattern more obvious
SamuelCarliles3 May 28, 2024
6c117a2
added SplitRecordFactory
SamuelCarliles3 May 28, 2024
c7b675b
Merge branch 'scarliles/update-node-refactor2' into scarliles/update-…
SamuelCarliles3 May 28, 2024
9e7b131
SplitRecordFactory progress
SamuelCarliles3 May 28, 2024
a017669
build loop refactor
SamuelCarliles3 May 29, 2024
4325b0a
add_or_update tweak
SamuelCarliles3 May 29, 2024
78c3a1b
reverted to back out build body refactor
SamuelCarliles3 May 30, 2024
b8cc636
refactor baby step
SamuelCarliles3 May 30, 2024
f225658
update node refactor more baby steps
SamuelCarliles3 May 30, 2024
bc17634
wip
SamuelCarliles3 Jun 14, 2024
c949182
added EventBroker class
SamuelCarliles3 Jun 16, 2024
247c4fc
added initial event firing to node_split_best
SamuelCarliles3 Jun 17, 2024
71da148
removed some old commented out code
SamuelCarliles3 Jun 17, 2024
a1fa950
honesty wip
SamuelCarliles3 Jun 30, 2024
ff0dfed
honesty wip
SamuelCarliles3 Jun 30, 2024
db4c947
honesty wip
SamuelCarliles3 Jul 1, 2024
2e87134
honesty wip
SamuelCarliles3 Jul 1, 2024
03c95d9
honesty wip
SamuelCarliles3 Jul 1, 2024
69fc530
honesty wip
SamuelCarliles3 Jul 3, 2024
61dfd0f
honesty wip
SamuelCarliles3 Jul 5, 2024
29a52be
Merge remote-tracking branch 'neurodata/submodulev3' into submodulev3
SamuelCarliles3 Jul 5, 2024
cf52ff5
broke sort functions, partitioners out of _splitter.pyx
SamuelCarliles3 Jul 5, 2024
8e433a6
refactored partitioner
SamuelCarliles3 Jul 6, 2024
09a8ec5
fixed some unintended commented out lines in SparsePartitioner
SamuelCarliles3 Jul 6, 2024
6bb7a33
Merge branch 'scarliles/defuse-partitioner' into scarliles/honesty
SamuelCarliles3 Jul 8, 2024
a2030a8
importing _honest_tree from treeple
SamuelCarliles3 Jul 10, 2024
64688e5
honesty wip
SamuelCarliles3 Jul 18, 2024
febf5e9
honesty wip
SamuelCarliles3 Jul 22, 2024
5e7d07d
honesty wip
SamuelCarliles3 Jul 31, 2024
2c4e992
honesty wip
SamuelCarliles3 Aug 1, 2024
2346e4d
honesty wip
SamuelCarliles3 Aug 4, 2024
551fcf1
honesty wip
SamuelCarliles3 Aug 4, 2024
f1fb747
honesty wip
SamuelCarliles3 Aug 4, 2024
2f2d15a
honest partition testing wip
SamuelCarliles3 Aug 9, 2024
cd79492
honest leaf validity test working
SamuelCarliles3 Aug 10, 2024
53cf65c
honest prediction wip
SamuelCarliles3 Aug 22, 2024
a9e065b
honest prediction wip
SamuelCarliles3 Aug 24, 2024
80c391d
honest prediction passing tests
SamuelCarliles3 Aug 24, 2024
9b5651e
hacked in working honest predict_proba, progress on honest regression
SamuelCarliles3 Aug 30, 2024
cbb23ee
first draft honest forest passing tests
SamuelCarliles3 Sep 3, 2024
c565d65
honesty wip
SamuelCarliles3 Sep 5, 2024
2316e4c
treeple-compatibility tweaks
SamuelCarliles3 Sep 8, 2024
71cacf3
might testing wip
SamuelCarliles3 Sep 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 44 additions & 1 deletion asv_benchmarks/benchmarks/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,58 @@
GradientBoostingClassifier,
HistGradientBoostingClassifier,
RandomForestClassifier,
RandomForestRegressor,
)

from .common import Benchmark, Estimator, Predictor
from .datasets import (
_20newsgroups_highdim_dataset,
_20newsgroups_lowdim_dataset,
_synth_classification_dataset,
_synth_regression_dataset,
_synth_regression_sparse_dataset,
)
from .utils import make_gen_classif_scorers
from .utils import make_gen_classif_scorers, make_gen_reg_scorers


class RandomForestRegressorBenchmark(Predictor, Estimator, Benchmark):
    """Fit/predict benchmarks for ``RandomForestRegressor``.

    The benchmark is parametrized over the input representation
    (``"dense"`` or ``"sparse"``) and the ``n_jobs`` values exposed by
    ``Benchmark.n_jobs_vals``.
    """

    param_names = ["representation", "n_jobs"]
    params = (["dense", "sparse"], Benchmark.n_jobs_vals)

    def setup_cache(self):
        """Delegate cache setup to the base benchmark classes."""
        super().setup_cache()

    def make_data(self, params):
        """Return the synthetic regression dataset for ``params``.

        Only the representation component of ``params`` selects the dataset.
        """
        representation, _ = params

        if representation == "sparse":
            return _synth_regression_sparse_dataset()
        return _synth_regression_dataset()

    def make_estimator(self, params):
        """Build the forest to benchmark; more trees when the data size is large."""
        _, n_jobs = params

        n_estimators = 100
        if Benchmark.data_size == "large":
            n_estimators = 500

        return RandomForestRegressor(
            n_estimators=n_estimators,
            min_samples_split=10,
            max_features="log2",
            n_jobs=n_jobs,
            random_state=0,
        )

    def make_scorers(self):
        """Attach the generic regression scorers to this benchmark."""
        make_gen_reg_scorers(self)


class RandomForestClassifierBenchmark(Predictor, Estimator, Benchmark):
Expand Down
29 changes: 29 additions & 0 deletions sklearn/tree/_events.pxd
adam2392 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Authors: Samuel Carliles <[email protected]>
#
# License: BSD 3 clause

# See _events.pyx for details.

from libcpp.vector cimport vector
from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t

# Integer identifier of an event kind. EventBroker uses this value as the
# index into its per-event listener table.
ctypedef int EventType
# Opaque pointer to handler-owned state; it is stored in the closure and
# passed back to the handler function unchanged.
ctypedef void* EventHandlerEnv
# Opaque pointer to the event payload; each handler casts it to the payload
# struct that matches the event type it handles.
ctypedef void* EventData
# Signature shared by all event handlers. Handlers run without the GIL and
# return a boolean that EventBroker.fire_event combines across listeners.
ctypedef bint (*EventHandlerFunction)(
EventType event_type,
EventHandlerEnv handler_env,
EventData event_data
) noexcept nogil

cdef struct EventHandlerClosure:
    # callback invoked by the broker when a subscribed event fires
    EventHandlerFunction f
    # environment pointer handed back to ``f`` as its ``handler_env`` argument
    EventHandlerEnv e

cdef class EventHandler:
    # event types this handler subscribes to; EventBroker matches against it
    # NOTE(review): the handler subclasses in _honesty.pyx also assign
    # ``self._event_types``, which is not declared here -- confirm
    cdef int[:] event_types
    # callback/environment pair copied into the broker's listener table
    cdef EventHandlerClosure c

cdef class EventBroker:
    # listener table: outer index is the event type, inner vector holds the
    # closures registered for that event
    cdef vector[vector[EventHandlerClosure]] listeners

    # invoke the closures registered for ``event_type`` with ``event_data``
    cdef bint fire_event(self, EventType event_type, EventData event_data) noexcept nogil
30 changes: 30 additions & 0 deletions sklearn/tree/_events.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@

# Authors: Samuel Carliles <[email protected]>
#
# License: BSD 3 clause


cdef class EventBroker:
    # Dispatch table from event type to the handler closures subscribed to it.
    #
    # ``listeners`` (declared in the .pxd) is indexed by event type; each slot
    # holds the closures invoked, in registration order, when that event fires.

    def __cinit__(self, EventHandler[:] listeners, int[:] event_types):
        # Build the listener table.
        #
        # Parameters
        # ----------
        # listeners : handlers to register; may be None, in which case every
        #     event type gets an empty listener list.
        # event_types : the event types this broker dispatches.
        #
        # Raises
        # ------
        # ValueError if an event type is negative (it is used as a vector index).
        cdef Py_ssize_t i, j, k
        cdef Py_ssize_t n_types = event_types.shape[0]
        cdef Py_ssize_t table_size = n_types + 1
        cdef int e
        cdef EventHandler handler

        # The table is indexed by event *value*, so it must cover the largest
        # value, not merely the number of event types.
        for i in range(n_types):
            e = event_types[i]
            if e < 0:
                raise ValueError(f"event types must be non-negative, got {e}")
            if e + 1 > table_size:
                table_size = e + 1

        self.listeners.resize(table_size)

        for i in range(n_types):
            e = event_types[i]
            # start from an empty slot so a repeated event type does not
            # register the same handlers twice
            self.listeners[e].clear()

            if listeners is None:
                continue

            for j in range(listeners.shape[0]):
                handler = listeners[j]
                # a handler subscribes via its declared ``event_types``
                for k in range(handler.event_types.shape[0]):
                    if handler.event_types[k] == e:
                        self.listeners[e].push_back(handler.c)
                        break

    cdef bint fire_event(self, EventType event_type, EventData event_data) noexcept nogil:
        # Invoke every closure registered for ``event_type``.
        #
        # Returns True when all invoked handlers returned True. Dispatch stops
        # at the first handler that returns False (short-circuit), and False
        # is returned. An event type with no slot in the table has no
        # listeners and therefore yields True.
        cdef size_t i, n
        cdef EventHandlerClosure closure

        if event_type < 0 or <size_t>event_type >= self.listeners.size():
            return True

        n = self.listeners[event_type].size()
        for i in range(n):
            closure = self.listeners[event_type][i]
            if not closure.f(event_type, closure.e, event_data):
                return False

        return True
59 changes: 59 additions & 0 deletions sklearn/tree/_honesty.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Authors: Samuel Carliles <[email protected]>
#
# License: BSD 3 clause

# See _honesty.pyx for details.

from .._events cimport EventHandler
from .._splitter cimport Partitioner, NodeSplitEvent, NodeSortFeatureEventData, NodeSplitEventData
from .._splitter cimport SplitConditionEnv, SplitConditionFunction, SplitConditionClosure, SplitCondition
from .._tree cimport TreeBuildEvent, TreeBuildSetActiveParentEventData, TreeBuildAddNodeEventData

from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t

from libcpp.vector cimport vector


cdef struct Interval:
    # first position of this node's range in the honest sample index buffer
    intp_t start_idx
    # number of samples in the range
    intp_t n
    # feature the node splits on
    intp_t feature
    # start of right child
    intp_t split_idx
    # threshold the node splits at
    float64_t split_value

cdef struct HonestEnv:
    # honest-set feature matrix, sample index buffer and feature value buffer
    # NOTE(review): presumably the same buffers the partitioner below
    # operates on -- confirm
    const float32_t[:, :] X
    intp_t[::1] samples
    float32_t[::1] feature_values

    # one Interval per node id, written by the ADD_NODE handler
    vector[Interval] tree
    # entry of ``tree`` for the parent of the node being split; NULL when the
    # active node is the root
    Interval* active_parent
    # working interval for the node currently being split
    Interval active_node
    # non-zero when the active node is the left child of its parent
    intp_t active_is_left
    # partitioner used to sort and partition the honest samples
    Partitioner partitioner

cdef class Honesty:
    # Bundles the honest-set state with the event handlers and split
    # conditions that keep it in sync with tree construction.
    cdef:
        # Attributes of an extension type must be declared with ``cdef``.
        # They are ``public`` so Python-level code can read the assembled
        # lists -- NOTE(review): confirm Python access is wanted.
        public list splitter_event_handlers  # handlers for splitter events
        public list split_conditions         # split conditions, honest one first
        public list tree_event_handlers      # handlers for tree-build events

        # state shared (by pointer) with the handlers and conditions above
        HonestEnv env
        Partitioner partitioner

# Environment for the honest min-samples-leaf split condition.
# The name matches its uses in HonestMinSamplesLeafCondition and in
# _honest_min_sample_leaf_condition ("Samples", plural).
cdef struct MinSamplesLeafConditionEnv:
    # minimum number of honest samples required in each child
    intp_t min_samples
    # shared honest-set state the condition inspects
    HonestEnv* honest_env


# Handler for NodeSplitEvent.SORT_FEATURE; adds no attributes beyond
# those of EventHandler.
cdef class NodeSortFeatureHandler(EventHandler):
    pass

# Handler for TreeBuildEvent.ADD_NODE; adds no attributes beyond those
# of EventHandler.
cdef class AddNodeHandler(EventHandler):
    pass

# Handler for TreeBuildEvent.SET_ACTIVE_PARENT; adds no attributes
# beyond those of EventHandler.
cdef class SetActiveParentHandler(EventHandler):
    pass

# Split condition enforcing a minimum leaf size on the honest set.
cdef class HonestMinSamplesLeafCondition(SplitCondition):
    # environment passed (by address) to the condition function
    cdef MinSamplesLeafConditionEnv _env
208 changes: 208 additions & 0 deletions sklearn/tree/_honesty.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
from libc.math cimport floor, log2, pow


cdef class Honesty:
    # Assembles the handlers and split conditions that track the honest set
    # during tree construction. The honest handlers/conditions are placed
    # first, followed by any caller-supplied ones.

    def __cinit__(
        self,
        Partitioner honest_partitioner,
        list splitter_event_handlers,
        list split_conditions,
        list tree_event_handlers,
        intp_t min_samples_leaf
    ):
        # Parameters
        # ----------
        # honest_partitioner : partitioner operating on the honest samples.
        # splitter_event_handlers, split_conditions, tree_event_handlers :
        #     extra handlers/conditions appended after the honest ones;
        #     None is treated as an empty list.
        # min_samples_leaf : minimum honest samples required in each child.

        # Populate the declared ``partitioner`` attribute as well as the env
        # field, so the extension type itself references the partitioner.
        self.partitioner = honest_partitioner
        self.env.partitioner = honest_partitioner

        # ``[x] + None`` raises TypeError, so normalise missing lists first.
        if splitter_event_handlers is None:
            splitter_event_handlers = []
        if split_conditions is None:
            split_conditions = []
        if tree_event_handlers is None:
            tree_event_handlers = []

        self.splitter_event_handlers = (
            [NodeSortFeatureHandler(&self.env)] + splitter_event_handlers
        )
        self.split_conditions = (
            [HonestMinSamplesLeafCondition(min_samples_leaf, &self.env)] + split_conditions
        )
        self.tree_event_handlers = (
            [SetActiveParentHandler(&self.env), AddNodeHandler(&self.env)] + tree_event_handlers
        )


# Handler for TreeBuildEvent.SET_ACTIVE_PARENT.
#
# Resets ``env.active_node`` to cover the honest samples of the node about to
# be split: the whole sample buffer for the root, otherwise the left or right
# part of the parent's interval. Returns False when the parent node id has no
# entry in ``env.tree``; True otherwise (including for foreign event types).
cdef bint _handle_set_active_parent(
    EventType event_type,
    EventHandlerEnv handler_env,
    EventData event_data
) noexcept nogil:
    if event_type != TreeBuildEvent.SET_ACTIVE_PARENT:
        return True

    cdef HonestEnv* env = <HonestEnv*>handler_env
    cdef TreeBuildSetActiveParentEventData* data = <TreeBuildSetActiveParentEventData*>event_data
    cdef Interval* node = &env.active_node
    cdef Interval* parent

    # Compare as signed values: vector.size() is unsigned, and an unsigned
    # comparison would turn the negative root parent id into a huge value and
    # wrongly reject the root.
    if data.parent_node_id >= <intp_t>env.tree.size():
        return False

    env.active_is_left = data.child_is_left

    node.feature = -1
    node.split_idx = 0
    node.split_value = NAN

    if data.parent_node_id < 0:
        # no parent: the active node is the root and spans every sample
        env.active_parent = NULL
        node.start_idx = 0
        node.n = env.samples.shape[0]
    else:
        # NOTE(review): this pointer refers into ``env.tree`` and is
        # invalidated if the vector is resized later -- confirm it is not
        # dereferenced after an ADD_NODE event grows the tree.
        parent = &(env.tree[data.parent_node_id])
        env.active_parent = parent
        if env.active_is_left:
            node.start_idx = parent.start_idx
            node.n = parent.split_idx - parent.start_idx
        else:
            node.start_idx = parent.split_idx
            # ``split_idx`` is an absolute position, so the right child gets
            # what remains of the parent's range after the left child
            node.n = parent.n - (parent.split_idx - parent.start_idx)

    env.partitioner.init_node_split(node.start_idx, node.start_idx + node.n)

    return True

cdef class SetActiveParentHandler(EventHandler):
    # Binds _handle_set_active_parent to TreeBuildEvent.SET_ACTIVE_PARENT.
    def __cinit__(self, HonestEnv* env):
        # closure: callback plus the shared honest environment
        self.c.e = env
        self.c.f = _handle_set_active_parent

        # subscription list read by the event broker
        # NOTE(review): ``_event_types`` is not declared on EventHandler in
        # _events.pxd, and a Python list is assigned to an ``int[:]``
        # attribute -- confirm both compile and coerce as intended
        self._event_types = [TreeBuildEvent.SET_ACTIVE_PARENT]
        self.event_types = self._event_types


# Handler for NodeSplitEvent.SORT_FEATURE.
#
# Records the feature now being evaluated on the active node, resets the
# node's split bookkeeping and sorts the honest samples on that feature.
# Always returns True.
cdef bint _handle_sort_feature(
    EventType event_type,
    EventHandlerEnv handler_env,
    EventData event_data
) noexcept nogil:
    if event_type != NodeSplitEvent.SORT_FEATURE:
        return True

    cdef HonestEnv* env = <HonestEnv*>handler_env
    cdef NodeSortFeatureEventData* data = <NodeSortFeatureEventData*>event_data
    cdef Interval* node = &env.active_node

    node.feature = data.feature
    # ``split_idx`` is an absolute position into the sample buffer (the leaf
    # condition computes ``split_idx - start_idx`` as the left count), so the
    # scan for a new feature must restart at the node's first sample, not 0.
    node.split_idx = node.start_idx
    node.split_value = NAN
    env.partitioner.sort_samples_and_feature_values(node.feature)

    return True

cdef class NodeSortFeatureHandler(EventHandler):
    # Binds _handle_sort_feature to NodeSplitEvent.SORT_FEATURE.
    def __cinit__(self, HonestEnv* env):
        # closure: callback plus the shared honest environment
        self.c.e = env
        self.c.f = _handle_sort_feature

        # subscription list read by the event broker
        # NOTE(review): ``_event_types`` is not declared on EventHandler in
        # _events.pxd, and a Python list is assigned to an ``int[:]``
        # attribute -- confirm both compile and coerce as intended
        self._event_types = [NodeSplitEvent.SORT_FEATURE]
        self.event_types = self._event_types


# Handler for TreeBuildEvent.ADD_NODE.
#
# Records the new node's interval in ``env.tree`` (growing the vector when
# needed), derives its sample range from its parent, locates the split
# position within the honest samples and partitions them accordingly.
# Returns True once the node has been recorded (and for foreign event types).
cdef bint _handle_add_node(
    EventType event_type,
    EventHandlerEnv handler_env,
    EventData event_data
) noexcept nogil:
    if event_type != TreeBuildEvent.ADD_NODE:
        return True

    # ``env`` must be declared before anything reads through it
    cdef HonestEnv* env = <HonestEnv*>handler_env
    cdef TreeBuildAddNodeEventData* data = <TreeBuildAddNodeEventData*>event_data
    cdef Interval* interval
    cdef Interval* parent
    cdef float64_t feature_value
    cdef intp_t i, end, n_left
    cdef intp_t size = <intp_t>env.tree.size()
    cdef intp_t new_size = size

    if data.node_id >= size:
        # As a heuristic, assume a complete tree and add levels (s -> 2s + 1)
        # until the node id fits. This also grows an empty tree, for which a
        # log2-based step would add nothing.
        # NOTE(review): resizing invalidates pointers into ``env.tree`` such
        # as ``env.active_parent`` -- confirm none is dereferenced afterwards.
        while new_size <= data.node_id:
            new_size = 2 * new_size + 1
        env.tree.resize(new_size)

    interval = &(env.tree[data.node_id])
    interval.feature = data.feature
    interval.split_value = data.split_value

    if data.parent_node_id < 0:
        # the node being added is the tree root
        interval.start_idx = 0
        interval.n = env.samples.shape[0]
    else:
        parent = &(env.tree[data.parent_node_id])

        if data.is_left:
            interval.start_idx = parent.start_idx
            interval.n = parent.split_idx - parent.start_idx
        else:
            interval.start_idx = parent.split_idx
            # ``split_idx`` is absolute: the right child takes what remains of
            # the parent's range after the left child
            interval.n = parent.n - (parent.split_idx - parent.start_idx)

    # *we* don't need to sort to find the split pos we'll need for partitioning,
    # but the partitioner internals are so stateful we had better just do it
    # to ensure that it's in the expected state
    # NOTE(review): confirm ``data.feature`` is a valid column for leaf nodes.
    env.partitioner.init_node_split(interval.start_idx, interval.start_idx + interval.n)
    env.partitioner.sort_samples_and_feature_values(interval.feature)

    # count n_left to find split pos; check the range before reading X so an
    # empty or fully-left interval never reads past its end
    n_left = 0
    i = interval.start_idx
    end = interval.start_idx + interval.n

    while i < end:
        feature_value = env.X[env.samples[i], interval.feature]
        # samples with value <= threshold go left, matching the comparison
        # used by the honest min-samples-leaf condition; NaN stops the scan
        if isnan(feature_value) or not (feature_value <= interval.split_value):
            break
        n_left += 1
        i += 1

    interval.split_idx = interval.start_idx + n_left

    env.partitioner.partition_samples_final(
        interval.split_idx, interval.split_value, interval.feature, env.partitioner.n_missing
    )

    # report success explicitly; falling off the end would return 0 (False)
    return True

cdef class AddNodeHandler(EventHandler):
    # Binds _handle_add_node to TreeBuildEvent.ADD_NODE.
    def __cinit__(self, HonestEnv* env):
        # closure: callback plus the shared honest environment
        self.c.e = env
        self.c.f = _handle_add_node

        # subscription list read by the event broker
        # NOTE(review): ``_event_types`` is not declared on EventHandler in
        # _events.pxd, and a Python list is assigned to an ``int[:]``
        # attribute -- confirm both compile and coerce as intended
        self._event_types = [TreeBuildEvent.ADD_NODE]
        self.event_types = self._event_types


# Split condition: accept a candidate split only if both children would hold
# at least ``min_samples`` samples of the *honest* set.
#
# ``split_pos`` and the incoming ``n_missing`` describe the structure set and
# are ignored; the split position in the honest set is found by scanning the
# sorted honest samples against ``split_value``. Returns True to accept the
# split, False to reject it.
cdef bint _honest_min_sample_leaf_condition(
    Splitter splitter,
    intp_t split_feature,
    intp_t split_pos,
    float64_t split_value,
    intp_t n_missing,
    bint missing_go_to_left,
    float64_t lower_bound,
    float64_t upper_bound,
    SplitConditionEnv split_condition_env
) noexcept nogil:
    cdef MinSamplesLeafConditionEnv* env = <MinSamplesLeafConditionEnv*>split_condition_env
    cdef HonestEnv* honest_env = env.honest_env
    # the active node and the data live on the honest env, not on ``env``
    cdef Interval* node = &honest_env.active_node

    cdef intp_t min_samples_leaf = env.min_samples
    cdef intp_t end, end_non_missing, n_left, n_right

    # we don't care about n_missing in the structure set
    n_missing = honest_env.partitioner.n_missing
    end = node.start_idx + node.n
    end_non_missing = end - n_missing

    # the scan position is absolute; never start before the node's own range
    if node.split_idx < node.start_idx:
        node.split_idx = node.start_idx

    # we don't care about split_pos in the structure set,
    # need to scan forward in the honest set based on split_value to find it
    # NOTE(review): the scan only moves forward, so it presumably relies on
    # candidate thresholds for a feature arriving in ascending order -- confirm
    while (
        node.split_idx < end
        and honest_env.X[honest_env.samples[node.split_idx], node.feature] <= split_value
    ):
        node.split_idx += 1

    if missing_go_to_left:
        n_left = node.split_idx - node.start_idx + n_missing
        n_right = end_non_missing - node.split_idx
    else:
        n_left = node.split_idx - node.start_idx
        n_right = end_non_missing - node.split_idx + n_missing

    # Reject if min_samples_leaf is not guaranteed
    if n_left < min_samples_leaf or n_right < min_samples_leaf:
        return False

    return True

cdef class HonestMinSamplesLeafCondition(SplitCondition):
    # Binds _honest_min_sample_leaf_condition to its environment.
    def __cinit__(self, intp_t min_samples, HonestEnv* env):
        # environment read by the condition function
        self._env.honest_env = env
        self._env.min_samples = min_samples

        # closure: the environment is passed by address, so it lives on
        # this object
        self.c.e = &self._env
        self.c.f = _honest_min_sample_leaf_condition
Loading