Boosting method implementation (XGBoost) (#1209)
Updates in the XGBoost implementation:
- XGBoost migrated from SkLearnEvaluationStrategy to a separate BoostingStrategy.
- Support for both encoded and unencoded categorical features.
- Support for splitting data into train and eval sets to avoid overfitting the model.
- Support for the 'gbtree', 'dart', and 'gblinear' boosters.
- Added an early-stopping parameter for model fitting.
- Automated hyperparameter checks, with automatic correction of inconsistent values.
- Updates to default_params & search_space.
- New feature: plot feature importance after fitting (a usage sketch follows this list).
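A minimal sketch of how the updated operation can be exercised through the FEDOT API. The toy data and the `predefined_model='xgboost'` call are illustrative assumptions, not part of this commit:

# Hedged usage sketch: fit a single-node 'xgboost' pipeline via the FEDOT API.
import numpy as np
from fedot.api.main import Fedot

rng = np.random.default_rng(42)
x = rng.random((100, 5))     # toy features
y = rng.integers(0, 2, 100)  # toy binary target

model = Fedot(problem='classification', timeout=1)
model.fit(features=x, target=y, predefined_model='xgboost')
prediction = model.predict(features=x)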

--

author: @RomanKharkovskoy 
co-author: @aPovidlo
RomanKharkovskoy authored Jul 26, 2024
1 parent d33ca9e commit 80eba8e
Showing 10 changed files with 216 additions and 33 deletions.
2 changes: 1 addition & 1 deletion fedot/api/api_utils/presets.py
@@ -53,7 +53,7 @@ def filter_operations_by_preset(self, data_type: Optional[DataTypesEnum] = None)
excluded = ['mlp', 'svc', 'svr', 'arima', 'exog_ts', 'text_clean',
'lda', 'qda', 'lgbm', 'one_hot_encoding',
'resample', 'stl_arima']
-excluded_tree = ['xgboost', 'xgbreg']
excluded_tree = []

if '*' in preset_name:
self.modification_using = True
7 changes: 5 additions & 2 deletions fedot/core/operations/evaluation/boostings.py
@@ -3,7 +3,8 @@
from fedot.core.data.data import InputData, OutputData
from fedot.core.operations.evaluation.evaluation_interfaces import EvaluationStrategy
from fedot.core.operations.evaluation.operation_implementations.models.boostings_implementations import \
-FedotCatBoostClassificationImplementation, FedotCatBoostRegressionImplementation
FedotCatBoostClassificationImplementation, FedotCatBoostRegressionImplementation, \
FedotXGBoostClassificationImplementation, FedotXGBoostRegressionImplementation
from fedot.core.operations.operation_parameters import OperationParameters
from fedot.core.repository.tasks import TaskTypesEnum
from fedot.utilities.random import ImplementationRandomStateHandler
@@ -12,7 +13,9 @@
class BoostingStrategy(EvaluationStrategy):
__operations_by_types = {
'catboost': FedotCatBoostClassificationImplementation,
-'catboostreg': FedotCatBoostRegressionImplementation
'catboostreg': FedotCatBoostRegressionImplementation,
'xgboost': FedotXGBoostClassificationImplementation,
'xgboostreg': FedotXGBoostRegressionImplementation
}

def __init__(self, operation_type: str, params: Optional[OperationParameters] = None):
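For orientation, a hedged sketch of the dispatch this class performs: the operation name coming from the pipeline is resolved against the mapping above, and unknown names fail fast. The function name `_convert_to_operation` mirrors the pattern of FEDOT's other evaluation strategies and is an assumption here, not code from this commit:

# Illustrative dispatch sketch (not the committed code).
def _convert_to_operation(operation_type: str):
    operations_by_types = {
        'catboost': 'FedotCatBoostClassificationImplementation',
        'catboostreg': 'FedotCatBoostRegressionImplementation',
        'xgboost': 'FedotXGBoostClassificationImplementation',
        'xgboostreg': 'FedotXGBoostRegressionImplementation',
    }
    if operation_type not in operations_by_types:
        # Fail fast instead of silently falling back to another strategy
        raise ValueError(f'Impossible to obtain boosting strategy for {operation_type}')
    return operations_by_types[operation_type]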
fedot/core/operations/evaluation/operation_implementations/models/boostings_implementations.py
@@ -5,6 +5,7 @@
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from matplotlib import pyplot as plt
from xgboost import XGBClassifier, XGBRegressor

from fedot.core.data.data import InputData
from fedot.core.data.data_split import train_test_data_setup
@@ -13,6 +14,127 @@
from fedot.core.utils import default_fedot_data_dir


class FedotXGBoostImplementation(ModelImplementation):
__operation_params = ['n_jobs', 'use_eval_set']

def __init__(self, params: Optional[OperationParameters] = None):
super().__init__(params)

self.model_params = {k: v for k, v in self.params.to_dict().items() if k not in self.__operation_params}
self.model = None
self.features_names = None

def fit(self, input_data: InputData):
if self.params.get('enable_categorical'):
input_data = input_data.get_not_encoded_data()
self.features_names = input_data.features_names

if self.params.get('use_eval_set'):
train_input, eval_input = train_test_data_setup(input_data)

train_input = self.convert_to_dataframe(train_input, identify_cats=self.params.get('enable_categorical'))
eval_input = self.convert_to_dataframe(eval_input, identify_cats=self.params.get('enable_categorical'))

train_x, train_y = train_input.drop(columns=['target']), train_input['target']
eval_x, eval_y = eval_input.drop(columns=['target']), eval_input['target']

self.model.eval_metric = self.set_eval_metric(self.classes_)

self.model.fit(X=train_x, y=train_y, eval_set=[(eval_x, eval_y)], verbose=self.model_params['verbosity'])
else:
train_data = self.convert_to_dataframe(input_data, identify_cats=self.params.get('enable_categorical'))
self.features_names = input_data.features_names
train_x, train_y = train_data.drop(columns=['target']), train_data['target']

self.model.fit(X=train_x, y=train_y, verbose=self.model_params['verbosity'])

return self.model

def predict(self, input_data: InputData):
if self.params.get('enable_categorical'):
input_data = input_data.get_not_encoded_data()

input_data = self.convert_to_dataframe(input_data, self.params.get('enable_categorical'))
train_x, _ = input_data.drop(columns=['target']), input_data['target']
prediction = self.model.predict(train_x)

return prediction

def check_and_update_params(self):
early_stopping_rounds = self.params.get('early_stopping_rounds')
use_eval_set = self.params.get('use_eval_set')

# Early stopping needs a validation set to monitor; disable it otherwise
if isinstance(early_stopping_rounds, int) and not use_eval_set:
self.params.update(early_stopping_rounds=False)

booster = self.params.get('booster')
enable_categorical = self.params.get('enable_categorical')

# The 'gblinear' booster cannot handle native categorical features
if booster == 'gblinear' and enable_categorical:
self.params.update(enable_categorical=False)

def get_feature_importance(self) -> list:
return self.model.feature_importances_

def plot_feature_importance(self, importance_type='weight'):
model_output = self.model.get_booster().get_score()
features_names = self.features_names
plot_feature_importance(features_names, model_output.values())

@staticmethod
def convert_to_dataframe(data: Optional[InputData], identify_cats: bool):
dataframe = pd.DataFrame(data=data.features)
dataframe['target'] = data.target

# Cast categorical columns to pandas 'category' dtype so that XGBoost
# can consume them natively when enable_categorical is set
if identify_cats and data.categorical_idx is not None:
for col in dataframe.columns[data.categorical_idx]:
dataframe[col] = dataframe[col].astype('category')

if data.numerical_idx is not None:
for col in dataframe.columns[data.numerical_idx]:
dataframe[col] = dataframe[col].astype('float')

return dataframe

@staticmethod
def set_eval_metric(n_classes):
if n_classes is None:  # no class labels -> regression
eval_metric = 'rmse'
elif len(n_classes) < 3:  # two classes -> binary classification
eval_metric = 'auc'
else:  # three or more classes -> multiclass
eval_metric = 'mlogloss'

return eval_metric


class FedotXGBoostClassificationImplementation(FedotXGBoostImplementation):
def __init__(self, params: Optional[OperationParameters] = None):
super().__init__(params)
self.classes_ = None
self.model = XGBClassifier(**self.model_params)

def fit(self, input_data: InputData):
self.classes_ = np.unique(np.array(input_data.target))
return super().fit(input_data=input_data)

def predict_proba(self, input_data: InputData):
if self.params.get('enable_categorical'):
input_data = input_data.get_not_encoded_data()

input_data = self.convert_to_dataframe(input_data, self.params.get('enable_categorical'))
train_x, _ = input_data.drop(columns=['target']), input_data['target']
prediction = self.model.predict_proba(train_x)
return prediction


class FedotXGBoostRegressionImplementation(FedotXGBoostImplementation):
def __init__(self, params: Optional[OperationParameters] = None):
super().__init__(params)
self.classes_ = None
self.model = XGBRegressor(**self.model_params)


class FedotCatBoostImplementation(ModelImplementation):
__operation_params = ['use_eval_set', 'n_jobs']

@@ -76,6 +198,13 @@ def load_model(self, path):
self.model = CatBoostClassifier()
self.model.load_model(path)

def get_feature_importance(self) -> pd.DataFrame:
""" Return a table with feature ids (string) and feature importances (float) """
return self.model.get_feature_importance(prettified=True)

def plot_feature_importance(self):
plot_feature_importance(self.model.feature_names_, self.model.feature_importances_)


class FedotCatBoostClassificationImplementation(FedotCatBoostImplementation):
def __init__(self, params: Optional[OperationParameters] = None):
@@ -91,21 +220,18 @@ def predict_proba(self, input_data: InputData):
prediction = self.model.predict_proba(input_data.get_not_encoded_data().features)
return prediction

-def get_feature_importance(self):
-return self.model.get_feature_importance(prettified=True)
-
-def plot_feature_importance(self):
-fi = pd.DataFrame(index=self.model.feature_names_)
-fi['importance'] = self.model.feature_importances_
-
-fi.loc[fi['importance'] > 0.1].sort_values('importance').plot(
-kind='barh', figsize=(16, 9), title='Feature Importance'
-)
-
-plt.show()


class FedotCatBoostRegressionImplementation(FedotCatBoostImplementation):
def __init__(self, params: Optional[OperationParameters] = None):
super().__init__(params)
self.model = CatBoostRegressor(**self.model_params)


def plot_feature_importance(feature_names, feature_importance):
fi = pd.DataFrame(index=feature_names)
fi['importance'] = feature_importance

fi.loc[fi['importance'] > 0.1].sort_values('importance').plot(
kind='barh', figsize=(16, 9), title='Feature Importance')

plt.show()
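Once a model is fitted, using the new module-level helper is a one-liner; the feature names and importances below are hypothetical:

# Hypothetical call: only features with importance > 0.1 end up on the plot.
plot_feature_importance(feature_names=['age', 'income', 'city'],
                        feature_importance=[0.42, 0.35, 0.05])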
51 changes: 46 additions & 5 deletions fedot/core/pipelines/tuning/search_space.py
@@ -134,23 +134,43 @@ def get_parameters_dict(self):
'sampling-scope': [[True, False]],
'type': 'categorical'}
},
-'xgbreg': {
'xgboostreg': {
'max_depth': {
'hyperopt-dist': hp.uniformint,
-'sampling-scope': [1, 11],
'sampling-scope': [1, 7],
'type': 'discrete'},
'learning_rate': {
'hyperopt-dist': hp.loguniform,
'sampling-scope': [1e-3, 1],
'type': 'continuous'},
'subsample': {
'hyperopt-dist': hp.uniform,
-'sampling-scope': [0.05, 1.0],
'sampling-scope': [0.05, 0.99],
'type': 'continuous'},
'min_child_weight': {
'hyperopt-dist': hp.uniformint,
'sampling-scope': [1, 21],
'type': 'discrete'},
'booster': {
'hyperopt-dist': hp.choice,
'sampling-scope': [['gbtree', 'dart', 'gblinear']],
'type': 'categorical'},
'lambda': {
'hyperopt-dist': hp.uniformint,
'sampling-scope': [0, 1000],
'type': 'discrete'},
'alpha': {
'hyperopt-dist': hp.uniformint,
'sampling-scope': [0, 1000],
'type': 'discrete'},
'colsample_bytree': {
'hyperopt-dist': hp.uniform,
'sampling-scope': [1e-4, 1],
'type': 'continuous'},
'scale_pos_weight': {
'hyperopt-dist': hp.uniformint,
'sampling-scope': [1, 20],
'type': 'continuous'},
},
'xgboost': {
'max_depth': {
@@ -168,7 +188,27 @@
'min_child_weight': {
'hyperopt-dist': hp.uniformint,
'sampling-scope': [1, 21],
-'type': 'discrete'}
'type': 'discrete'},
'booster': {
'hyperopt-dist': hp.choice,
'sampling-scope': [['gbtree', 'dart', 'gblinear']],
'type': 'categorical'},
'lambda': {
'hyperopt-dist': hp.uniformint,
'sampling-scope': [0, 1000],
'type': 'discrete'},
'alpha': {
'hyperopt-dist': hp.uniformint,
'sampling-scope': [0, 1000],
'type': 'discrete'},
'colsample_bytree': {
'hyperopt-dist': hp.uniform,
'sampling-scope': [1e-4, 1],
'type': 'continuous'},
'scale_pos_weight': {
'hyperopt-dist': hp.uniformint,
'sampling-scope': [1, 20],
'type': 'continuous'},
},
'svr': {
'C': {
@@ -791,6 +831,7 @@ def get_parameters_dict(self):
parameters_per_operation.update(self.custom_search_space)
else:
for operation_name, operation_dct in self.custom_search_space.items():
-parameters_per_operation[operation_name].update(operation_dct)
parameters_per_operation[operation_name].update(
operation_dct)

return parameters_per_operation
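For context, a minimal, self-contained hyperopt sketch mirroring the shape of the entries above; the log-bound conversion for `hp.loguniform` and the dummy objective are assumptions, since the actual tuner code is not part of this diff:

# Hedged sketch: sampling xgboost-style hyperparameters with hyperopt.
import numpy as np
from hyperopt import fmin, hp, tpe

space = {
    'max_depth': hp.uniformint('max_depth', 1, 7),
    # hp.loguniform takes log-bounds: exp(log(1e-3)) = 1e-3, exp(0) = 1
    'learning_rate': hp.loguniform('learning_rate', np.log(1e-3), np.log(1)),
    'booster': hp.choice('booster', ['gbtree', 'dart', 'gblinear']),
}

# Dummy objective; a real tuner would fit a pipeline and return its metric.
best = fmin(fn=lambda params: params['learning_rate'], space=space,
            algo=tpe.suggest, max_evals=10)
print(best)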
18 changes: 15 additions & 3 deletions fedot/core/repository/data/default_operation_params.json
@@ -6,10 +6,22 @@
"n_jobs": 1
},
"xgboost": {
"eval_metric": "mlogloss",
"nthread": 1,
"n_jobs": 1,
"verbose": 0
"verbosity": 0,
"booster": "gbtree",
"tree_method": "auto",
"enable_categorical": true,
"use_eval_set": true,
"early_stopping_rounds": 30
},
"xgboostreg": {
"n_jobs": 1,
"verbosity": 0,
"booster": "gbtree",
"tree_method": "auto",
"enable_categorical": true,
"use_eval_set": true,
"early_stopping_rounds": 30
},
"catboost": {
"allow_writing_files": false,
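The defaults above pair `use_eval_set` with `early_stopping_rounds`. A hedged sketch of the underlying XGBoost behaviour they enable, on synthetic data (passing `early_stopping_rounds` to the constructor assumes xgboost >= 1.6):

# Hold out an eval set and stop boosting once 'auc' stalls for 30 rounds.
import numpy as np
from xgboost import XGBClassifier

rng = np.random.default_rng(0)
x, y = rng.random((200, 5)), rng.integers(0, 2, 200)
train_x, eval_x = x[:160], x[160:]
train_y, eval_y = y[:160], y[160:]

model = XGBClassifier(booster='gbtree', eval_metric='auc',
                      early_stopping_rounds=30, verbosity=0)
model.fit(train_x, train_y, eval_set=[(eval_x, eval_y)], verbose=False)
print(model.best_iteration)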
10 changes: 5 additions & 5 deletions fedot/core/repository/data/model_repository.json
@@ -469,17 +469,17 @@
]
},
"xgboost": {
"meta": "sklearn_class",
"meta": "boosting_class",
"presets": ["*tree"],
"tags": [
"tree", "non-default", "non_linear"
"tree", "non_linear"
]
},
"xgbreg": {
"meta": "sklearn_regr",
"xgboostreg": {
"meta": "boosting_regr",
"presets": ["*tree"],
"tags": [
"tree", "non_multi", "non-default", "non_linear"
"tree", "non_multi", "non_linear"
]
},
"cnn": {
2 changes: 1 addition & 1 deletion test/integration/api/test_api_utils.py
@@ -97,7 +97,7 @@ def test_init_assumption_with_inappropriate_available_operations():

train_input, _, _ = get_dataset(task_type='classification')
train_input = DataPreprocessor().obligatory_prepare_for_fit(train_input)
-available_operations = ['linear', 'xgboost', 'lagged']
available_operations = ['linear', 'xgboostreg', 'lagged']

initial_assumptions = AssumptionsBuilder \
.get(train_input) \
2 changes: 1 addition & 1 deletion test/unit/api/test_assumption_builder.py
@@ -104,7 +104,7 @@ def test_assumptions_builder_unsuitable_available_operations():

train_input, _, _ = get_dataset(task_type='classification')
train_input = DataPreprocessor().obligatory_prepare_for_fit(train_input)
-available_operations = ['linear', 'xgboost', 'lagged']
available_operations = ['linear', 'lagged', 'xgboostreg']

default_builder = UniModalAssumptionsBuilder(train_input)
checked_builder = UniModalAssumptionsBuilder(train_input) \
2 changes: 1 addition & 1 deletion test/unit/api/test_presets.py
@@ -13,7 +13,7 @@ def test_presets_classification():
task = Task(TaskTypesEnum.classification)
class_operations = get_operations_for_task(task=task, mode='all')

-excluded_tree = ['xgboost', 'xgbreg']
excluded_tree = []
filtered_operations = set(class_operations).difference(set(excluded_tree))
available_operations = list(filtered_operations)

3 changes: 2 additions & 1 deletion test/unit/data_operations/test_time_series_operations.py
@@ -359,7 +359,8 @@ def test_tuner_correctly_work_with_window_size_selector():

assert autotuned_window != tuner_tuned_window
# check that WindowSizeSelector runs twice due to tuner graph copying in initialization
-assert sum(check_window_size_selector_logging(records)) == 2
sum_records = sum(check_window_size_selector_logging(records))
assert sum_records == 2 or sum_records == 3


@pytest.mark.parametrize(('length', 'features_count', 'target_count', 'window_size'),
