diff --git a/fedot/api/api_utils/presets.py b/fedot/api/api_utils/presets.py
index 934dcde918..e42f64e236 100644
--- a/fedot/api/api_utils/presets.py
+++ b/fedot/api/api_utils/presets.py
@@ -53,7 +53,7 @@ def filter_operations_by_preset(self, data_type: Optional[DataTypesEnum] = None
 
         excluded = ['mlp', 'svc', 'svr', 'arima', 'exog_ts', 'text_clean', 'lda', 'qda', 'lgbm', 'one_hot_encoding',
                     'resample', 'stl_arima']
-        excluded_tree = ['xgboost', 'xgbreg']
+        excluded_tree = []
 
         if '*' in preset_name:
             self.modification_using = True
diff --git a/fedot/core/operations/evaluation/boostings.py b/fedot/core/operations/evaluation/boostings.py
index d74a63a27b..80435ff472 100644
--- a/fedot/core/operations/evaluation/boostings.py
+++ b/fedot/core/operations/evaluation/boostings.py
@@ -3,7 +3,8 @@
 from fedot.core.data.data import InputData, OutputData
 from fedot.core.operations.evaluation.evaluation_interfaces import EvaluationStrategy
 from fedot.core.operations.evaluation.operation_implementations.models.boostings_implementations import \
-    FedotCatBoostClassificationImplementation, FedotCatBoostRegressionImplementation
+    FedotCatBoostClassificationImplementation, FedotCatBoostRegressionImplementation, \
+    FedotXGBoostClassificationImplementation, FedotXGBoostRegressionImplementation
 from fedot.core.operations.operation_parameters import OperationParameters
 from fedot.core.repository.tasks import TaskTypesEnum
 from fedot.utilities.random import ImplementationRandomStateHandler
@@ -12,7 +13,9 @@
 
 class BoostingStrategy(EvaluationStrategy):
     __operations_by_types = {
         'catboost': FedotCatBoostClassificationImplementation,
-        'catboostreg': FedotCatBoostRegressionImplementation
+        'catboostreg': FedotCatBoostRegressionImplementation,
+        'xgboost': FedotXGBoostClassificationImplementation,
+        'xgboostreg': FedotXGBoostRegressionImplementation
     }
 
     def __init__(self, operation_type: str, params: Optional[OperationParameters] = None):
diff --git a/fedot/core/operations/evaluation/operation_implementations/models/boostings_implementations.py b/fedot/core/operations/evaluation/operation_implementations/models/boostings_implementations.py
index ce80becaf2..3fad7ecc46 100644
--- a/fedot/core/operations/evaluation/operation_implementations/models/boostings_implementations.py
+++ b/fedot/core/operations/evaluation/operation_implementations/models/boostings_implementations.py
@@ -5,6 +5,7 @@
 import pandas as pd
 from catboost import CatBoostClassifier, CatBoostRegressor, Pool
 from matplotlib import pyplot as plt
+from xgboost import XGBClassifier, XGBRegressor
 
 from fedot.core.data.data import InputData
 from fedot.core.data.data_split import train_test_data_setup
@@ -13,6 +14,127 @@
 from fedot.core.utils import default_fedot_data_dir
 
 
+class FedotXGBoostImplementation(ModelImplementation):
+    __operation_params = ['n_jobs', 'use_eval_set']
+
+    def __init__(self, params: Optional[OperationParameters] = None):
+        super().__init__(params)
+
+        # Everything except the service parameters is forwarded to the XGBoost model itself
+        self.model_params = {k: v for k, v in self.params.to_dict().items() if k not in self.__operation_params}
+        self.model = None
+        self.features_names = None
+
+    def fit(self, input_data: InputData):
+        if self.params.get('enable_categorical'):
+            input_data = input_data.get_not_encoded_data()
+        self.features_names = input_data.features_names
+
+        if self.params.get('use_eval_set'):
+            train_input, eval_input = train_test_data_setup(input_data)
+
+            train_input = self.convert_to_dataframe(train_input, identify_cats=self.params.get('enable_categorical'))
+            eval_input = self.convert_to_dataframe(eval_input, identify_cats=self.params.get('enable_categorical'))
+
+            train_x, train_y = train_input.drop(columns=['target']), train_input['target']
+            eval_x, eval_y = eval_input.drop(columns=['target']), eval_input['target']
+
+            self.model.eval_metric = self.set_eval_metric(self.classes_)
+
+            self.model.fit(X=train_x, y=train_y, eval_set=[(eval_x, eval_y)], verbose=self.model_params['verbosity'])
+        else:
+            train_data = self.convert_to_dataframe(input_data, identify_cats=self.params.get('enable_categorical'))
+            self.features_names = input_data.features_names
+            train_x, train_y = train_data.drop(columns=['target']), train_data['target']
+
+            self.model.fit(X=train_x, y=train_y, verbose=self.model_params['verbosity'])
+
+        return self.model
+
+    def predict(self, input_data: InputData):
+        if self.params.get('enable_categorical'):
+            input_data = input_data.get_not_encoded_data()
+
+        input_data = self.convert_to_dataframe(input_data, self.params.get('enable_categorical'))
+        train_x, _ = input_data.drop(columns=['target']), input_data['target']
+        prediction = self.model.predict(train_x)
+
+        return prediction
+
+    def check_and_update_params(self):
+        early_stopping_rounds = self.params.get('early_stopping_rounds')
+        use_eval_set = self.params.get('use_eval_set')
+
+        # Early stopping requires a validation set, so it is dropped when no eval set is used
+        if isinstance(early_stopping_rounds, int) and not use_eval_set:
+            self.params.update(early_stopping_rounds=False)
+
+        booster = self.params.get('booster')
+        enable_categorical = self.params.get('enable_categorical')
+
+        # The gblinear booster has no native support for categorical features
+        if booster == 'gblinear' and enable_categorical:
+            self.params.update(enable_categorical=False)
+
+    def get_feature_importance(self) -> list:
+        return self.model.feature_importances_
+
+    def plot_feature_importance(self, importance_type='weight'):
+        model_output = self.model.get_booster().get_score(importance_type=importance_type)
+        features_names = self.features_names
+        plot_feature_importance(features_names, list(model_output.values()))
+
+    @staticmethod
+    def convert_to_dataframe(data: Optional[InputData], identify_cats: bool):
+        dataframe = pd.DataFrame(data=data.features)
+        dataframe['target'] = data.target
+
+        if identify_cats and data.categorical_idx is not None:
+            for col in dataframe.columns[data.categorical_idx]:
+                dataframe[col] = dataframe[col].astype('category')
+
+        if data.numerical_idx is not None:
+            for col in dataframe.columns[data.numerical_idx]:
+                dataframe[col] = dataframe[col].astype('float')
+
+        return dataframe
+
+    @staticmethod
+    def set_eval_metric(n_classes):
+        if n_classes is None:  # no class labels -> regression
+            eval_metric = 'rmse'
+        elif len(n_classes) < 3:  # two class labels -> binary classification
+            eval_metric = 'auc'
+        else:  # multiclass classification
+            eval_metric = 'mlogloss'
+
+        return eval_metric
+
+
+class FedotXGBoostClassificationImplementation(FedotXGBoostImplementation):
+    def __init__(self, params: Optional[OperationParameters] = None):
+        super().__init__(params)
+        self.classes_ = None
+        self.model = XGBClassifier(**self.model_params)
+
+    def fit(self, input_data: InputData):
+        self.classes_ = np.unique(np.array(input_data.target))
+        return super().fit(input_data=input_data)
+
+    def predict_proba(self, input_data: InputData):
+        if self.params.get('enable_categorical'):
+            input_data = input_data.get_not_encoded_data()
+
+        input_data = self.convert_to_dataframe(input_data, self.params.get('enable_categorical'))
+        train_x, _ = input_data.drop(columns=['target']), input_data['target']
+        prediction = self.model.predict_proba(train_x)
+        return prediction
+
+
+class FedotXGBoostRegressionImplementation(FedotXGBoostImplementation):
+    def __init__(self, params: Optional[OperationParameters] = None):
+        super().__init__(params)
+        self.classes_ = None
+        self.model = XGBRegressor(**self.model_params)
+
+
 class FedotCatBoostImplementation(ModelImplementation):
     __operation_params = ['use_eval_set', 'n_jobs']
@@ -76,6 +198,13 @@ def load_model(self, path):
         self.model = CatBoostClassifier()
         self.model.load_model(path)
 
+    def get_feature_importance(self) -> (list, list):
+        """ Return feature importance -> (feature_id (string), feature_importance (float)) """
+        return self.model.get_feature_importance(prettified=True)
+
+    def plot_feature_importance(self):
+        plot_feature_importance(self.model.feature_names_, self.model.feature_importances_)
+
 
 class FedotCatBoostClassificationImplementation(FedotCatBoostImplementation):
     def __init__(self, params: Optional[OperationParameters] = None):
@@ -91,21 +220,18 @@ def predict_proba(self, input_data: InputData):
         prediction = self.model.predict_proba(input_data.get_not_encoded_data().features)
         return prediction
 
-    def get_feature_importance(self):
-        return self.model.get_feature_importance(prettified=True)
-
-    def plot_feature_importance(self):
-        fi = pd.DataFrame(index=self.model.feature_names_)
-        fi['importance'] = self.model.feature_importances_
-
-        fi.loc[fi['importance'] > 0.1].sort_values('importance').plot(
-            kind='barh', figsize=(16, 9), title='Feature Importance'
-        )
-
-        plt.show()
-
 
 class FedotCatBoostRegressionImplementation(FedotCatBoostImplementation):
     def __init__(self, params: Optional[OperationParameters] = None):
         super().__init__(params)
         self.model = CatBoostRegressor(**self.model_params)
+
+
+def plot_feature_importance(feature_names, feature_importance):
+    fi = pd.DataFrame(index=feature_names)
+    fi['importance'] = feature_importance
+
+    fi.loc[fi['importance'] > 0.1].sort_values('importance').plot(
+        kind='barh', figsize=(16, 9), title='Feature Importance')
+
+    plt.show()
diff --git a/fedot/core/pipelines/tuning/search_space.py b/fedot/core/pipelines/tuning/search_space.py
index 3f7f85074a..dca6aa1892 100644
--- a/fedot/core/pipelines/tuning/search_space.py
+++ b/fedot/core/pipelines/tuning/search_space.py
@@ -134,10 +134,10 @@ def get_parameters_dict(self):
                     'sampling-scope': [[True, False]],
                     'type': 'categorical'}
             },
-            'xgbreg': {
+            'xgboostreg': {
                 'max_depth': {
                     'hyperopt-dist': hp.uniformint,
-                    'sampling-scope': [1, 11],
+                    'sampling-scope': [1, 7],
                     'type': 'discrete'},
                 'learning_rate': {
                     'hyperopt-dist': hp.loguniform,
@@ -145,12 +145,32 @@ def get_parameters_dict(self):
                     'type': 'continuous'},
                 'subsample': {
                     'hyperopt-dist': hp.uniform,
-                    'sampling-scope': [0.05, 1.0],
+                    'sampling-scope': [0.05, 0.99],
                     'type': 'continuous'},
                 'min_child_weight': {
                     'hyperopt-dist': hp.uniformint,
                     'sampling-scope': [1, 21],
                     'type': 'discrete'},
+                'booster': {
+                    'hyperopt-dist': hp.choice,
+                    'sampling-scope': [['gbtree', 'dart', 'gblinear']],
+                    'type': 'categorical'},
+                'lambda': {
+                    'hyperopt-dist': hp.uniformint,
+                    'sampling-scope': [0, 1000],
+                    'type': 'discrete'},
+                'alpha': {
+                    'hyperopt-dist': hp.uniformint,
+                    'sampling-scope': [0, 1000],
+                    'type': 'discrete'},
+                'colsample_bytree': {
+                    'hyperopt-dist': hp.uniform,
+                    'sampling-scope': [1e-4, 1],
+                    'type': 'continuous'},
+                'scale_pos_weight': {
+                    'hyperopt-dist': hp.uniformint,
+                    'sampling-scope': [1, 20],
+                    'type': 'continuous'},
             },
             'xgboost': {
                 'max_depth': {
@@ -168,7 +188,27 @@ def get_parameters_dict(self):
                 'min_child_weight': {
                     'hyperopt-dist': hp.uniformint,
                     'sampling-scope': [1, 21],
-                    'type': 'discrete'}
+                    'type': 'discrete'},
+                'booster': {
+                    'hyperopt-dist': hp.choice,
+                    'sampling-scope': [['gbtree', 'dart', 'gblinear']],
+                    'type': 'categorical'},
+                'lambda': {
+                    'hyperopt-dist': hp.uniformint,
+                    'sampling-scope': [0, 1000],
+                    'type': 'discrete'},
+                'alpha': {
+                    'hyperopt-dist': hp.uniformint,
+                    'sampling-scope': [0, 1000],
+                    'type': 'discrete'},
+                'colsample_bytree': {
+                    'hyperopt-dist': hp.uniform,
+                    'sampling-scope': [1e-4, 1],
+                    'type': 'continuous'},
+                'scale_pos_weight': {
+                    'hyperopt-dist': hp.uniformint,
+                    'sampling-scope': [1, 20],
+                    'type': 'continuous'},
             },
             'svr': {
                 'C': {
@@ -791,6 +831,7 @@ def get_parameters_dict(self):
                 parameters_per_operation.update(self.custom_search_space)
             else:
                 for operation_name, operation_dct in self.custom_search_space.items():
-                    parameters_per_operation[operation_name].update(operation_dct)
+                    parameters_per_operation[operation_name].update(
+                        operation_dct)
 
         return parameters_per_operation
diff --git a/fedot/core/repository/data/default_operation_params.json b/fedot/core/repository/data/default_operation_params.json
index 4076b4d26e..9ec23c118b 100644
--- a/fedot/core/repository/data/default_operation_params.json
+++ b/fedot/core/repository/data/default_operation_params.json
@@ -6,10 +6,22 @@
     "n_jobs": 1
   },
   "xgboost": {
-    "eval_metric": "mlogloss",
-    "nthread": 1,
     "n_jobs": 1,
-    "verbose": 0
+    "verbosity": 0,
+    "booster": "gbtree",
+    "tree_method": "auto",
+    "enable_categorical": true,
+    "use_eval_set": true,
+    "early_stopping_rounds": 30
+  },
+  "xgboostreg": {
+    "n_jobs": 1,
+    "verbosity": 0,
+    "booster": "gbtree",
+    "tree_method": "auto",
+    "enable_categorical": true,
+    "use_eval_set": true,
+    "early_stopping_rounds": 30
   },
   "catboost": {
     "allow_writing_files": false,
diff --git a/fedot/core/repository/data/model_repository.json b/fedot/core/repository/data/model_repository.json
index d6a3c60c38..727c37ea69 100644
--- a/fedot/core/repository/data/model_repository.json
+++ b/fedot/core/repository/data/model_repository.json
@@ -469,17 +469,17 @@
       ]
     },
     "xgboost": {
-      "meta": "sklearn_class",
+      "meta": "boosting_class",
       "presets": ["*tree"],
       "tags": [
-        "tree", "non-default", "non_linear"
+        "tree", "non_linear"
       ]
     },
-    "xgbreg": {
-      "meta": "sklearn_regr",
+    "xgboostreg": {
+      "meta": "boosting_regr",
       "presets": ["*tree"],
       "tags": [
-        "tree", "non_multi", "non-default", "non_linear"
+        "tree", "non_multi", "non_linear"
       ]
     },
     "cnn": {
diff --git a/test/integration/api/test_api_utils.py b/test/integration/api/test_api_utils.py
index 1909c4c221..42b3e4a147 100644
--- a/test/integration/api/test_api_utils.py
+++ b/test/integration/api/test_api_utils.py
@@ -97,7 +97,7 @@ def test_init_assumption_with_inappropriate_available_operations():
     train_input, _, _ = get_dataset(task_type='classification')
     train_input = DataPreprocessor().obligatory_prepare_for_fit(train_input)
 
-    available_operations = ['linear', 'xgboost', 'lagged']
+    available_operations = ['linear', 'xgboostreg', 'lagged']
 
     initial_assumptions = AssumptionsBuilder \
         .get(train_input) \
diff --git a/test/unit/api/test_assumption_builder.py b/test/unit/api/test_assumption_builder.py
index a56bcbe34f..23195168fd 100644
--- a/test/unit/api/test_assumption_builder.py
+++ b/test/unit/api/test_assumption_builder.py
@@ -104,7 +104,7 @@ def test_assumptions_builder_unsuitable_available_operations():
     train_input, _, _ = get_dataset(task_type='classification')
     train_input = DataPreprocessor().obligatory_prepare_for_fit(train_input)
 
-    available_operations = ['linear', 'xgboost', 'lagged']
+    available_operations = ['linear', 'lagged', 'xgboostreg']
 
     default_builder = UniModalAssumptionsBuilder(train_input)
     checked_builder = UniModalAssumptionsBuilder(train_input) \
diff --git a/test/unit/api/test_presets.py b/test/unit/api/test_presets.py
index 5da842afb5..8d0bdfee06 100644
--- a/test/unit/api/test_presets.py
+++ b/test/unit/api/test_presets.py
@@ -13,7 +13,7 @@ def test_presets_classification():
     task = Task(TaskTypesEnum.classification)
     class_operations = get_operations_for_task(task=task, mode='all')
 
-    excluded_tree = ['xgboost', 'xgbreg']
+    excluded_tree = []
     filtered_operations = set(class_operations).difference(set(excluded_tree))
     available_operations = list(filtered_operations)
diff --git a/test/unit/data_operations/test_time_series_operations.py b/test/unit/data_operations/test_time_series_operations.py
index 277884edce..fb56a2b003 100644
--- a/test/unit/data_operations/test_time_series_operations.py
+++ b/test/unit/data_operations/test_time_series_operations.py
@@ -359,7 +359,8 @@ def test_tuner_correctly_work_with_window_size_selector():
     assert autotuned_window != tuner_tuned_window
 
-    # check that WindowSizeSelector runs twice due to tuner graph copying in initialization
-    assert sum(check_window_size_selector_logging(records)) == 2
+    # check that WindowSizeSelector runs two or three times due to tuner graph copying in initialization
+    sum_records = sum(check_window_size_selector_logging(records))
+    assert sum_records in (2, 3)
 
 
 @pytest.mark.parametrize(('length', 'features_count', 'target_count', 'window_size'),
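
Usage note (not part of the patch). A minimal sketch of how the 'xgboost' operation registered above could be exercised once the patch is applied. It assumes FEDOT's PipelineBuilder and InputData APIs as shipped in recent releases; the synthetic data and all variable names are illustrative only. With the defaults introduced in default_operation_params.json (use_eval_set=true, early_stopping_rounds=30), fit() splits off an eval set internally and enables early stopping.

import numpy as np

from fedot.core.data.data import InputData
from fedot.core.pipelines.pipeline_builder import PipelineBuilder
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.core.repository.tasks import Task, TaskTypesEnum

# Synthetic binary-classification table: 200 rows, 5 numeric features.
rng = np.random.default_rng(42)
features = rng.normal(size=(200, 5))
target = (features[:, 0] + features[:, 1] > 0).astype(int)

data = InputData(idx=np.arange(len(target)), features=features, target=target,
                 task=Task(TaskTypesEnum.classification), data_type=DataTypesEnum.table)

# Single-node pipeline on the newly registered boosting operation.
pipeline = PipelineBuilder().add_node('xgboost').build()
pipeline.fit(data)
labels = pipeline.predict(data, output_mode='labels')
print(labels.predict[:10])

The same sketch works for regression by swapping in 'xgboostreg' and a TaskTypesEnum.regression task, which routes through FedotXGBoostRegressionImplementation instead.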