
Boosting method implementation (XGBoost) #1209

Merged: 29 commits, Jul 26, 2024
Commits
c6a2de5
first iteration
RomanKharkovskoy Nov 27, 2023
1175797
added booster hyperparameter for xgboost in search_space.py, changed m…
RomanKharkovskoy Nov 28, 2023
c1ac9e5
Added XGBoost implementation without DMatrix
RomanKharkovskoy Dec 5, 2023
fd44563
Edited default params for XGBoost
RomanKharkovskoy Dec 5, 2023
ed27ff6
Added L1 and L2 regularization
RomanKharkovskoy Dec 5, 2023
e89e7f7
added 2 parameters for tuning
RomanKharkovskoy Dec 19, 2023
f67dd4d
added tree_method
RomanKharkovskoy Dec 19, 2023
516fafd
changed xgbreg to xgboostreg
RomanKharkovskoy Dec 19, 2023
a673b38
pep8 fix
RomanKharkovskoy Dec 19, 2023
55cba3a
convert to dataframe implementation
RomanKharkovskoy Dec 26, 2023
cc887a9
regression params
RomanKharkovskoy Dec 26, 2023
2c3ed2c
changed fit/predict for regression
RomanKharkovskoy Dec 28, 2023
49b27e8
added eval metrics
RomanKharkovskoy Jan 10, 2024
f07b9c5
added use_eval_set
RomanKharkovskoy Jan 10, 2024
7e085b6
added xgboost to default operations
RomanKharkovskoy Jan 10, 2024
71ef29f
xgboost included in composing
RomanKharkovskoy Jan 31, 2024
5a9ffce
pep8 fix
RomanKharkovskoy Jan 31, 2024
83499fb
changed test where xgboost was excluded
RomanKharkovskoy Feb 12, 2024
c5f4b72
pep8 fix
RomanKharkovskoy Feb 12, 2024
604b747
Update after rebase
aPovidlo Jul 23, 2024
74545d0
Update feature importance
aPovidlo Jul 23, 2024
a0df6ce
Fixing bug & adding early stopping param
aPovidlo Jul 24, 2024
1640891
Fixing unit test and integration tests
aPovidlo Jul 24, 2024
9225097
Separate setting eval_metric, fixes with gblinear booster strategy, f…
aPovidlo Jul 24, 2024
743cb2c
fix unit test
aPovidlo Jul 24, 2024
12ab217
bug fix
aPovidlo Jul 24, 2024
4590147
fix feature importance
aPovidlo Jul 24, 2024
5a5d0ea
fix feature importance for catboost
aPovidlo Jul 24, 2024
f20d98e
Automated autopep8 fixes
github-actions[bot] Jul 24, 2024
2 changes: 1 addition & 1 deletion fedot/api/api_utils/presets.py
@@ -53,7 +53,7 @@ def filter_operations_by_preset(self, data_type: Optional[DataTypesEnum] = None)
        excluded = ['mlp', 'svc', 'svr', 'arima', 'exog_ts', 'text_clean',
                    'lda', 'qda', 'lgbm', 'one_hot_encoding',
                    'resample', 'stl_arima']
-        excluded_tree = ['xgboost', 'xgbreg']
+        excluded_tree = []

        if '*' in preset_name:
            self.modification_using = True
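The net effect: `xgboost` and `xgbreg` are no longer force-excluded from tree presets. A minimal, self-contained sketch of the filtering pattern (toy operation list and names; not FEDOT's actual preset code):

```python
# Toy illustration of preset filtering: anything in the exclusion list
# is removed from the operations the composer may use.
all_operations = ['rf', 'dt', 'catboost', 'xgboost', 'xgboostreg']

excluded_tree = []  # previously ['xgboost', 'xgbreg']
available = [op for op in all_operations if op not in excluded_tree]
print(available)  # the XGBoost operations now survive preset filtering
```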
7 changes: 5 additions & 2 deletions fedot/core/operations/evaluation/boostings.py
@@ -3,7 +3,8 @@
from fedot.core.data.data import InputData, OutputData
from fedot.core.operations.evaluation.evaluation_interfaces import EvaluationStrategy
from fedot.core.operations.evaluation.operation_implementations.models.boostings_implementations import \
-    FedotCatBoostClassificationImplementation, FedotCatBoostRegressionImplementation
+    FedotCatBoostClassificationImplementation, FedotCatBoostRegressionImplementation, \
+    FedotXGBoostClassificationImplementation, FedotXGBoostRegressionImplementation
from fedot.core.operations.operation_parameters import OperationParameters
from fedot.core.repository.tasks import TaskTypesEnum
from fedot.utilities.random import ImplementationRandomStateHandler
@@ -12,7 +13,9 @@
class BoostingStrategy(EvaluationStrategy):
    __operations_by_types = {
        'catboost': FedotCatBoostClassificationImplementation,
-        'catboostreg': FedotCatBoostRegressionImplementation
+        'catboostreg': FedotCatBoostRegressionImplementation,
+        'xgboost': FedotXGBoostClassificationImplementation,
+        'xgboostreg': FedotXGBoostRegressionImplementation
    }

    def __init__(self, operation_type: str, params: Optional[OperationParameters] = None):
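The strategy resolves an operation-type string to an implementation class through this dictionary. A self-contained sketch of that dispatch (stub classes stand in for the real FEDOT implementations):

```python
# Stubs standing in for the implementations imported above.
class FedotCatBoostClassificationImplementation: ...
class FedotCatBoostRegressionImplementation: ...
class FedotXGBoostClassificationImplementation: ...
class FedotXGBoostRegressionImplementation: ...

operations_by_types = {
    'catboost': FedotCatBoostClassificationImplementation,
    'catboostreg': FedotCatBoostRegressionImplementation,
    'xgboost': FedotXGBoostClassificationImplementation,
    'xgboostreg': FedotXGBoostRegressionImplementation,
}


def implementation_for(operation_type: str):
    # Mirrors the lookup BoostingStrategy performs on its operation type.
    if operation_type not in operations_by_types:
        raise ValueError(f'Unsupported boosting operation: {operation_type}')
    return operations_by_types[operation_type]


print(implementation_for('xgboostreg').__name__)
```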
fedot/core/operations/evaluation/operation_implementations/models/boostings_implementations.py
@@ -5,6 +5,7 @@
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from matplotlib import pyplot as plt
+from xgboost import XGBClassifier, XGBRegressor

from fedot.core.data.data import InputData
from fedot.core.data.data_split import train_test_data_setup
@@ -13,6 +14,127 @@
from fedot.core.utils import default_fedot_data_dir


class FedotXGBoostImplementation(ModelImplementation):
    __operation_params = ['n_jobs', 'use_eval_set']

    def __init__(self, params: Optional[OperationParameters] = None):
        super().__init__(params)

        # Everything except the framework-level keys is passed to the XGBoost model.
        self.model_params = {k: v for k, v in self.params.to_dict().items() if k not in self.__operation_params}
        self.model = None
        self.features_names = None

    def fit(self, input_data: InputData):
        if self.params.get('enable_categorical'):
            input_data = input_data.get_not_encoded_data()
            self.features_names = input_data.features_names

        if self.params.get('use_eval_set'):
            # Hold out part of the training data as an evaluation set
            # (required for early stopping).
            train_input, eval_input = train_test_data_setup(input_data)

            train_input = self.convert_to_dataframe(train_input, identify_cats=self.params.get('enable_categorical'))
            eval_input = self.convert_to_dataframe(eval_input, identify_cats=self.params.get('enable_categorical'))

            train_x, train_y = train_input.drop(columns=['target']), train_input['target']
            eval_x, eval_y = eval_input.drop(columns=['target']), eval_input['target']

            self.model.eval_metric = self.set_eval_metric(self.classes_)

            self.model.fit(X=train_x, y=train_y, eval_set=[(eval_x, eval_y)], verbose=self.model_params['verbosity'])
        else:
            train_data = self.convert_to_dataframe(input_data, identify_cats=self.params.get('enable_categorical'))
            self.features_names = input_data.features_names
            train_x, train_y = train_data.drop(columns=['target']), train_data['target']

            self.model.fit(X=train_x, y=train_y, verbose=self.model_params['verbosity'])

        return self.model

    def predict(self, input_data: InputData):
        if self.params.get('enable_categorical'):
            input_data = input_data.get_not_encoded_data()

        input_data = self.convert_to_dataframe(input_data, self.params.get('enable_categorical'))
        train_x, _ = input_data.drop(columns=['target']), input_data['target']
        prediction = self.model.predict(train_x)

        return prediction

    def check_and_update_params(self):
        early_stopping_rounds = self.params.get('early_stopping_rounds')
        use_eval_set = self.params.get('use_eval_set')

        # Early stopping only makes sense when an eval set is available.
        if isinstance(early_stopping_rounds, int) and not use_eval_set:
            self.params.update(early_stopping_rounds=False)

        booster = self.params.get('booster')
        enable_categorical = self.params.get('enable_categorical')

        # The linear booster does not work with categorical features.
        if booster == 'gblinear' and enable_categorical:
            self.params.update(enable_categorical=False)

    def get_feature_importance(self) -> list:
        return self.model.feature_importances_

    def plot_feature_importance(self, importance_type='weight'):
        model_output = self.model.get_booster().get_score()
        features_names = self.features_names
        plot_feature_importance(features_names, model_output.values())

    @staticmethod
    def convert_to_dataframe(data: Optional[InputData], identify_cats: bool):
        dataframe = pd.DataFrame(data=data.features)
        dataframe['target'] = data.target

        if identify_cats and data.categorical_idx is not None:
            for col in dataframe.columns[data.categorical_idx]:
                dataframe[col] = dataframe[col].astype('category')

        if data.numerical_idx is not None:
            for col in dataframe.columns[data.numerical_idx]:
                dataframe[col] = dataframe[col].astype('float')

        return dataframe

    @staticmethod
    def set_eval_metric(n_classes):
        if n_classes is None:  # no class labels -> regression
            eval_metric = 'rmse'
        elif len(n_classes) < 3:  # two classes -> binary classification
            eval_metric = 'auc'
        else:  # multiclass
            eval_metric = 'mlogloss'

        return eval_metric


class FedotXGBoostClassificationImplementation(FedotXGBoostImplementation):
    def __init__(self, params: Optional[OperationParameters] = None):
        super().__init__(params)
        self.classes_ = None
        self.model = XGBClassifier(**self.model_params)

    def fit(self, input_data: InputData):
        self.classes_ = np.unique(np.array(input_data.target))
        return super().fit(input_data=input_data)

    def predict_proba(self, input_data: InputData):
        if self.params.get('enable_categorical'):
            input_data = input_data.get_not_encoded_data()

        input_data = self.convert_to_dataframe(input_data, self.params.get('enable_categorical'))
        train_x, _ = input_data.drop(columns=['target']), input_data['target']
        prediction = self.model.predict_proba(train_x)
        return prediction


class FedotXGBoostRegressionImplementation(FedotXGBoostImplementation):
    def __init__(self, params: Optional[OperationParameters] = None):
        super().__init__(params)
        self.classes_ = None
        self.model = XGBRegressor(**self.model_params)


class FedotCatBoostImplementation(ModelImplementation):
    __operation_params = ['use_eval_set', 'n_jobs']

@@ -76,6 +198,13 @@ def load_model(self, path):
        self.model = CatBoostClassifier()
        self.model.load_model(path)

    def get_feature_importance(self) -> (list, list):
        """ Return feature importance -> (feature_id (string), feature_importance (float)) """
        return self.model.get_feature_importance(prettified=True)

    def plot_feature_importance(self):
        plot_feature_importance(self.model.feature_names_, self.model.feature_importances_)


class FedotCatBoostClassificationImplementation(FedotCatBoostImplementation):
    def __init__(self, params: Optional[OperationParameters] = None):
@@ -91,21 +220,18 @@ def predict_proba(self, input_data: InputData):
        prediction = self.model.predict_proba(input_data.get_not_encoded_data().features)
        return prediction

-    def get_feature_importance(self):
-        return self.model.get_feature_importance(prettified=True)
-
-    def plot_feature_importance(self):
-        fi = pd.DataFrame(index=self.model.feature_names_)
-        fi['importance'] = self.model.feature_importances_
-
-        fi.loc[fi['importance'] > 0.1].sort_values('importance').plot(
-            kind='barh', figsize=(16, 9), title='Feature Importance'
-        )
-
-        plt.show()


class FedotCatBoostRegressionImplementation(FedotCatBoostImplementation):
    def __init__(self, params: Optional[OperationParameters] = None):
        super().__init__(params)
        self.model = CatBoostRegressor(**self.model_params)


def plot_feature_importance(feature_names, feature_importance):
    fi = pd.DataFrame(index=feature_names)
    fi['importance'] = feature_importance

    fi.loc[fi['importance'] > 0.1].sort_values('importance').plot(
        kind='barh', figsize=(16, 9), title='Feature Importance')

    plt.show()
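For reference, the `convert_to_dataframe` steps above can be exercised standalone. This sketch mocks the `InputData` fields with plain arrays and shows the dtypes XGBoost receives when `enable_categorical` is on (toy data; field names copied from the diff):

```python
import numpy as np
import pandas as pd

# Stand-ins for InputData.features / .target / .categorical_idx / .numerical_idx.
features = np.array([[1.5, 0], [2.0, 1], [3.2, 0], [0.7, 1]])
target = np.array([0, 1, 0, 1])
categorical_idx = [1]  # column 1 holds category codes
numerical_idx = [0]

# Same steps as FedotXGBoostImplementation.convert_to_dataframe.
dataframe = pd.DataFrame(data=features)
dataframe['target'] = target
for col in dataframe.columns[categorical_idx]:
    dataframe[col] = dataframe[col].astype('category')
for col in dataframe.columns[numerical_idx]:
    dataframe[col] = dataframe[col].astype('float')

print(dataframe.dtypes)  # column 0 -> float64, column 1 -> category
```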
51 changes: 46 additions & 5 deletions fedot/core/pipelines/tuning/search_space.py
@@ -134,23 +134,43 @@ def get_parameters_dict(self):
                'sampling-scope': [[True, False]],
                'type': 'categorical'}
        },
-        'xgbreg': {
+        'xgboostreg': {
            'max_depth': {
                'hyperopt-dist': hp.uniformint,
-                'sampling-scope': [1, 11],
+                'sampling-scope': [1, 7],
                'type': 'discrete'},
            'learning_rate': {
                'hyperopt-dist': hp.loguniform,
                'sampling-scope': [1e-3, 1],
                'type': 'continuous'},
            'subsample': {
                'hyperopt-dist': hp.uniform,
-                'sampling-scope': [0.05, 1.0],
+                'sampling-scope': [0.05, 0.99],
                'type': 'continuous'},
            'min_child_weight': {
                'hyperopt-dist': hp.uniformint,
                'sampling-scope': [1, 21],
                'type': 'discrete'},
+            'booster': {
+                'hyperopt-dist': hp.choice,
+                'sampling-scope': [['gbtree', 'dart', 'gblinear']],
+                'type': 'categorical'},
+            'lambda': {
+                'hyperopt-dist': hp.uniformint,
+                'sampling-scope': [0, 1000],
+                'type': 'discrete'},
+            'alpha': {
+                'hyperopt-dist': hp.uniformint,
+                'sampling-scope': [0, 1000],
+                'type': 'discrete'},
+            'colsample_bytree': {
+                'hyperopt-dist': hp.uniform,
+                'sampling-scope': [1e-4, 1],
+                'type': 'continuous'},
+            'scale_pos_weight': {
+                'hyperopt-dist': hp.uniformint,
+                'sampling-scope': [1, 20],
+                'type': 'continuous'},
        },
        'xgboost': {
            'max_depth': {
@@ -168,7 +188,27 @@
            'min_child_weight': {
                'hyperopt-dist': hp.uniformint,
                'sampling-scope': [1, 21],
-                'type': 'discrete'}
+                'type': 'discrete'},
+            'booster': {
+                'hyperopt-dist': hp.choice,
+                'sampling-scope': [['gbtree', 'dart', 'gblinear']],
+                'type': 'categorical'},
+            'lambda': {
+                'hyperopt-dist': hp.uniformint,
+                'sampling-scope': [0, 1000],
+                'type': 'discrete'},
+            'alpha': {
+                'hyperopt-dist': hp.uniformint,
+                'sampling-scope': [0, 1000],
+                'type': 'discrete'},
+            'colsample_bytree': {
+                'hyperopt-dist': hp.uniform,
+                'sampling-scope': [1e-4, 1],
+                'type': 'continuous'},
+            'scale_pos_weight': {
+                'hyperopt-dist': hp.uniformint,
+                'sampling-scope': [1, 20],
+                'type': 'continuous'},
        },
        'svr': {
            'C': {
@@ -791,6 +831,7 @@ def get_parameters_dict(self):
            parameters_per_operation.update(self.custom_search_space)
        else:
            for operation_name, operation_dct in self.custom_search_space.items():
-                parameters_per_operation[operation_name].update(
-                    operation_dct)
+                parameters_per_operation[operation_name].update(operation_dct)

        return parameters_per_operation
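For orientation, the `xgboostreg` entry above corresponds roughly to the following hyperopt expressions. This is illustrative only: FEDOT builds the space through its own tuner adapters, and the exact handling of the `loguniform` bounds is an assumption here.

```python
import numpy as np
from hyperopt import hp

# Rough hyperopt equivalent of the 'xgboostreg' search-space entry.
xgboostreg_space = {
    'max_depth': hp.uniformint('max_depth', 1, 7),
    'learning_rate': hp.loguniform('learning_rate', np.log(1e-3), np.log(1)),
    'subsample': hp.uniform('subsample', 0.05, 0.99),
    'min_child_weight': hp.uniformint('min_child_weight', 1, 21),
    'booster': hp.choice('booster', ['gbtree', 'dart', 'gblinear']),
    'lambda': hp.uniformint('lambda', 0, 1000),
    'alpha': hp.uniformint('alpha', 0, 1000),
    'colsample_bytree': hp.uniform('colsample_bytree', 1e-4, 1),
    'scale_pos_weight': hp.uniformint('scale_pos_weight', 1, 20),
}
```

Note that `scale_pos_weight` is declared with `hp.uniformint` but typed `'continuous'` in the diff; the integer distribution is what the `hyperopt-dist` field requests.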
18 changes: 15 additions & 3 deletions fedot/core/repository/data/default_operation_params.json
@@ -6,10 +6,22 @@
"n_jobs": 1
},
"xgboost": {
"eval_metric": "mlogloss",
"nthread": 1,
"n_jobs": 1,
"verbose": 0
"verbosity": 0,
"booster": "gbtree",
"tree_method": "auto",
"enable_categorical": true,
"use_eval_set": true,
"early_stopping_rounds": 30
},
"xgboostreg": {
"n_jobs": 1,
andreygetmanov marked this conversation as resolved.
Show resolved Hide resolved
"verbosity": 0,
"booster": "gbtree",
"tree_method": "auto",
"enable_categorical": true,
"use_eval_set": true,
"early_stopping_rounds": 30
},
"catboost": {
"allow_writing_files": false,
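These defaults feed the model constructor after the implementation strips its framework-level keys (`n_jobs` and `use_eval_set`, per `__operation_params` above). A sketch under the assumption of a recent xgboost (>= 1.6), where `early_stopping_rounds` and `enable_categorical` are constructor arguments:

```python
from xgboost import XGBClassifier

# Defaults as declared in the JSON above.
defaults = {
    "n_jobs": 1, "verbosity": 0, "booster": "gbtree", "tree_method": "auto",
    "enable_categorical": True, "use_eval_set": True, "early_stopping_rounds": 30,
}

# As in FedotXGBoostImplementation: framework-level keys are split off,
# the remainder becomes XGBoost constructor arguments.
operation_params = {'n_jobs', 'use_eval_set'}
model_params = {k: v for k, v in defaults.items() if k not in operation_params}

model = XGBClassifier(**model_params)
# With early_stopping_rounds set, fit() needs an eval_set, which is why
# use_eval_set defaults to true alongside it.
```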
10 changes: 5 additions & 5 deletions fedot/core/repository/data/model_repository.json
@@ -469,17 +469,17 @@
        ]
    },
    "xgboost": {
-        "meta": "sklearn_class",
+        "meta": "boosting_class",
        "presets": ["*tree"],
        "tags": [
-            "tree", "non-default", "non_linear"
+            "tree", "non_linear"
        ]
    },
-    "xgbreg": {
-        "meta": "sklearn_regr",
+    "xgboostreg": {
+        "meta": "boosting_regr",
        "presets": ["*tree"],
        "tags": [
-            "tree", "non_multi", "non-default", "non_linear"
+            "tree", "non_multi", "non_linear"
        ]
    },
    "cnn": {
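The `meta` switch from `sklearn_class`/`sklearn_regr` to `boosting_class`/`boosting_regr` is what reroutes these models from the generic scikit-learn strategies to the new `BoostingStrategy`. A hypothetical illustration of that routing (the mapping below is illustrative, not FEDOT's literal registry):

```python
# Hypothetical sketch: the repository's 'meta' field groups models under
# a shared evaluation strategy; 'boosting_*' metas route to the
# BoostingStrategy whose dispatch table appears earlier in this PR.
META_TO_STRATEGY = {
    'boosting_class': 'BoostingStrategy',
    'boosting_regr': 'BoostingStrategy',
}
print(META_TO_STRATEGY['boosting_class'])
```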
2 changes: 1 addition & 1 deletion test/integration/api/test_api_utils.py
@@ -97,7 +97,7 @@ def test_init_assumption_with_inappropriate_available_operations():

    train_input, _, _ = get_dataset(task_type='classification')
    train_input = DataPreprocessor().obligatory_prepare_for_fit(train_input)
-    available_operations = ['linear', 'xgboost', 'lagged']
+    available_operations = ['linear', 'xgboostreg', 'lagged']

    initial_assumptions = AssumptionsBuilder \
        .get(train_input) \
2 changes: 1 addition & 1 deletion test/unit/api/test_assumption_builder.py
@@ -104,7 +104,7 @@ def test_assumptions_builder_unsuitable_available_operations():

    train_input, _, _ = get_dataset(task_type='classification')
    train_input = DataPreprocessor().obligatory_prepare_for_fit(train_input)
-    available_operations = ['linear', 'xgboost', 'lagged']
+    available_operations = ['linear', 'lagged', 'xgboostreg']

    default_builder = UniModalAssumptionsBuilder(train_input)
    checked_builder = UniModalAssumptionsBuilder(train_input) \
2 changes: 1 addition & 1 deletion test/unit/api/test_presets.py
@@ -13,7 +13,7 @@ def test_presets_classification():
    task = Task(TaskTypesEnum.classification)
    class_operations = get_operations_for_task(task=task, mode='all')

-    excluded_tree = ['xgboost', 'xgbreg']
+    excluded_tree = []
    filtered_operations = set(class_operations).difference(set(excluded_tree))
    available_operations = list(filtered_operations)

3 changes: 2 additions & 1 deletion test/unit/data_operations/test_time_series_operations.py
@@ -359,7 +359,8 @@ def test_tuner_correctly_work_with_window_size_selector():

    assert autotuned_window != tuner_tuned_window
    # check that WindowSizeSelector runs twice due to tuner graph copying in initialization
-    assert sum(check_window_size_selector_logging(records)) == 2
+    sum_records = sum(check_window_size_selector_logging(records))
+    assert sum_records == 2 or sum_records == 3


@pytest.mark.parametrize(('length', 'features_count', 'target_count', 'window_size'),