Boosting method implementation (XGBoost) (#1209)
Updates in the XGBoost implementation:
- XGBoost migrated from SkLearnEvaluationStrategy to a separate BoostingStrategy.
- Support for both encoded and unencoded categorical features.
- Support for splitting data into train and eval sets to avoid overfitting the model.
- Support for the 'gbtree', 'dart', and 'gblinear' boosters.
- Added an early-stopping parameter for model fitting.
- Automated hyperparameter checks, with automatic correction of inconsistent values.
- Updates to default_params & search_space.
- New feature: plot feature importance after fitting (a usage sketch follows this list).
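A minimal sketch of how the updated operation can be exercised through the FEDOT API. The toy data and the `predefined_model='xgboost'` call are illustrative assumptions, not part of this commit:

# Hedged usage sketch: fit a single-node 'xgboost' pipeline via the FEDOT API.
import numpy as np
from fedot.api.main import Fedot

rng = np.random.default_rng(42)
x = rng.random((100, 5))     # toy features
y = rng.integers(0, 2, 100)  # toy binary target

model = Fedot(problem='classification', timeout=1)
model.fit(features=x, target=y, predefined_model='xgboost')
prediction = model.predict(features=x)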

--

author: @RomanKharkovskoy 
co-author: @aPovidlo
RomanKharkovskoy authored Jul 26, 2024
1 parent d33ca9e commit 80eba8e
Showing 10 changed files with 216 additions and 33 deletions.
2 changes: 1 addition & 1 deletion fedot/api/api_utils/presets.py
@@ -53,7 +53,7 @@ def filter_operations_by_preset(self, data_type: Optional[DataTypesEnum] = None)
excluded = ['mlp', 'svc', 'svr', 'arima', 'exog_ts', 'text_clean',
'lda', 'qda', 'lgbm', 'one_hot_encoding',
'resample', 'stl_arima']
-excluded_tree = ['xgboost', 'xgbreg']
excluded_tree = []

if '*' in preset_name:
self.modification_using = True
7 changes: 5 additions & 2 deletions fedot/core/operations/evaluation/boostings.py
@@ -3,7 +3,8 @@
from fedot.core.data.data import InputData, OutputData
from fedot.core.operations.evaluation.evaluation_interfaces import EvaluationStrategy
from fedot.core.operations.evaluation.operation_implementations.models.boostings_implementations import \
-FedotCatBoostClassificationImplementation, FedotCatBoostRegressionImplementation
FedotCatBoostClassificationImplementation, FedotCatBoostRegressionImplementation, \
FedotXGBoostClassificationImplementation, FedotXGBoostRegressionImplementation
from fedot.core.operations.operation_parameters import OperationParameters
from fedot.core.repository.tasks import TaskTypesEnum
from fedot.utilities.random import ImplementationRandomStateHandler
@@ -12,7 +13,9 @@
class BoostingStrategy(EvaluationStrategy):
__operations_by_types = {
'catboost': FedotCatBoostClassificationImplementation,
-'catboostreg': FedotCatBoostRegressionImplementation
'catboostreg': FedotCatBoostRegressionImplementation,
'xgboost': FedotXGBoostClassificationImplementation,
'xgboostreg': FedotXGBoostRegressionImplementation
}

def __init__(self, operation_type: str, params: Optional[OperationParameters] = None):
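For orientation, a hedged sketch of the dispatch this class performs: the operation name coming from the pipeline is resolved against the mapping above, and unknown names fail fast. The function name `_convert_to_operation` mirrors the pattern of FEDOT's other evaluation strategies and is an assumption here, not code from this commit:

# Illustrative dispatch sketch (not the committed code).
def _convert_to_operation(operation_type: str):
    operations_by_types = {
        'catboost': 'FedotCatBoostClassificationImplementation',
        'catboostreg': 'FedotCatBoostRegressionImplementation',
        'xgboost': 'FedotXGBoostClassificationImplementation',
        'xgboostreg': 'FedotXGBoostRegressionImplementation',
    }
    if operation_type not in operations_by_types:
        # Fail fast instead of silently falling back to another strategy
        raise ValueError(f'Impossible to obtain boosting strategy for {operation_type}')
    return operations_by_types[operation_type]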
fedot/core/operations/evaluation/operation_implementations/models/boostings_implementations.py
@@ -5,6 +5,7 @@
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from matplotlib import pyplot as plt
from xgboost import XGBClassifier, XGBRegressor

from fedot.core.data.data import InputData
from fedot.core.data.data_split import train_test_data_setup
@@ -13,6 +14,127 @@
from fedot.core.utils import default_fedot_data_dir


class FedotXGBoostImplementation(ModelImplementation):
__operation_params = ['n_jobs', 'use_eval_set']

def __init__(self, params: Optional[OperationParameters] = None):
super().__init__(params)

self.model_params = {k: v for k, v in self.params.to_dict().items() if k not in self.__operation_params}
self.model = None
self.features_names = None

def fit(self, input_data: InputData):
if self.params.get('enable_categorical'):
input_data = input_data.get_not_encoded_data()
self.features_names = input_data.features_names

if self.params.get('use_eval_set'):
train_input, eval_input = train_test_data_setup(input_data)

train_input = self.convert_to_dataframe(train_input, identify_cats=self.params.get('enable_categorical'))
eval_input = self.convert_to_dataframe(eval_input, identify_cats=self.params.get('enable_categorical'))

train_x, train_y = train_input.drop(columns=['target']), train_input['target']
eval_x, eval_y = eval_input.drop(columns=['target']), eval_input['target']

self.model.eval_metric = self.set_eval_metric(self.classes_)

self.model.fit(X=train_x, y=train_y, eval_set=[(eval_x, eval_y)], verbose=self.model_params['verbosity'])
else:
train_data = self.convert_to_dataframe(input_data, identify_cats=self.params.get('enable_categorical'))
self.features_names = input_data.features_names
train_x, train_y = train_data.drop(columns=['target']), train_data['target']

self.model.fit(X=train_x, y=train_y, verbose=self.model_params['verbosity'])

return self.model

def predict(self, input_data: InputData):
if self.params.get('enable_categorical'):
input_data = input_data.get_not_encoded_data()

input_data = self.convert_to_dataframe(input_data, self.params.get('enable_categorical'))
train_x, _ = input_data.drop(columns=['target']), input_data['target']
prediction = self.model.predict(train_x)

return prediction

def check_and_update_params(self):
early_stopping_rounds = self.params.get('early_stopping_rounds')
use_eval_set = self.params.get('use_eval_set')

# Early stopping needs a validation set to monitor; disable it otherwise
if isinstance(early_stopping_rounds, int) and not use_eval_set:
self.params.update(early_stopping_rounds=False)

booster = self.params.get('booster')
enable_categorical = self.params.get('enable_categorical')

# The 'gblinear' booster cannot handle native categorical features
if booster == 'gblinear' and enable_categorical:
self.params.update(enable_categorical=False)

def get_feature_importance(self) -> list:
return self.model.feature_importances_

def plot_feature_importance(self, importance_type='weight'):
model_output = self.model.get_booster().get_score()
features_names = self.features_names
plot_feature_importance(features_names, model_output.values())

@staticmethod
def convert_to_dataframe(data: Optional[InputData], identify_cats: bool):
dataframe = pd.DataFrame(data=data.features)
dataframe['target'] = data.target

# Cast categorical columns to pandas 'category' dtype so that XGBoost
# can consume them natively when enable_categorical is set
if identify_cats and data.categorical_idx is not None:
for col in dataframe.columns[data.categorical_idx]:
dataframe[col] = dataframe[col].astype('category')

if data.numerical_idx is not None:
for col in dataframe.columns[data.numerical_idx]:
dataframe[col] = dataframe[col].astype('float')

return dataframe

@staticmethod
def set_eval_metric(n_classes):
if n_classes is None:  # no class labels -> regression
eval_metric = 'rmse'
elif len(n_classes) < 3:  # two classes -> binary classification
eval_metric = 'auc'
else:  # three or more classes -> multiclass
eval_metric = 'mlogloss'

return eval_metric


class FedotXGBoostClassificationImplementation(FedotXGBoostImplementation):
def __init__(self, params: Optional[OperationParameters] = None):
super().__init__(params)
self.classes_ = None
self.model = XGBClassifier(**self.model_params)

def fit(self, input_data: InputData):
self.classes_ = np.unique(np.array(input_data.target))
return super().fit(input_data=input_data)

def predict_proba(self, input_data: InputData):
if self.params.get('enable_categorical'):
input_data = input_data.get_not_encoded_data()

input_data = self.convert_to_dataframe(input_data, self.params.get('enable_categorical'))
train_x, _ = input_data.drop(columns=['target']), input_data['target']
prediction = self.model.predict_proba(train_x)
return prediction


class FedotXGBoostRegressionImplementation(FedotXGBoostImplementation):
def __init__(self, params: Optional[OperationParameters] = None):
super().__init__(params)
self.classes_ = None
self.model = XGBRegressor(**self.model_params)


class FedotCatBoostImplementation(ModelImplementation):
__operation_params = ['use_eval_set', 'n_jobs']

@@ -76,6 +198,13 @@ def load_model(self, path):
self.model = CatBoostClassifier()
self.model.load_model(path)

def get_feature_importance(self) -> pd.DataFrame:
""" Return a table with feature ids (string) and feature importances (float) """
return self.model.get_feature_importance(prettified=True)

def plot_feature_importance(self):
plot_feature_importance(self.model.feature_names_, self.model.feature_importances_)


class FedotCatBoostClassificationImplementation(FedotCatBoostImplementation):
def __init__(self, params: Optional[OperationParameters] = None):
@@ -91,21 +220,18 @@ def predict_proba(self, input_data: InputData):
prediction = self.model.predict_proba(input_data.get_not_encoded_data().features)
return prediction

-def get_feature_importance(self):
-return self.model.get_feature_importance(prettified=True)
-
-def plot_feature_importance(self):
-fi = pd.DataFrame(index=self.model.feature_names_)
-fi['importance'] = self.model.feature_importances_
-
-fi.loc[fi['importance'] > 0.1].sort_values('importance').plot(
-kind='barh', figsize=(16, 9), title='Feature Importance'
-)
-
-plt.show()


class FedotCatBoostRegressionImplementation(FedotCatBoostImplementation):
def __init__(self, params: Optional[OperationParameters] = None):
super().__init__(params)
self.model = CatBoostRegressor(**self.model_params)


def plot_feature_importance(feature_names, feature_importance):
fi = pd.DataFrame(index=feature_names)
fi['importance'] = feature_importance

fi.loc[fi['importance'] > 0.1].sort_values('importance').plot(
kind='barh', figsize=(16, 9), title='Feature Importance')

plt.show()
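Once a model is fitted, using the new module-level helper is a one-liner; the feature names and importances below are hypothetical:

# Hypothetical call: only features with importance > 0.1 end up on the plot.
plot_feature_importance(feature_names=['age', 'income', 'city'],
                        feature_importance=[0.42, 0.35, 0.05])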
51 changes: 46 additions & 5 deletions fedot/core/pipelines/tuning/search_space.py
@@ -134,23 +134,43 @@ def get_parameters_dict(self):
'sampling-scope': [[True, False]],
'type': 'categorical'}
},
-'xgbreg': {
'xgboostreg': {
'max_depth': {
'hyperopt-dist': hp.uniformint,
-'sampling-scope': [1, 11],
'sampling-scope': [1, 7],
'type': 'discrete'},
'learning_rate': {
'hyperopt-dist': hp.loguniform,
'sampling-scope': [1e-3, 1],
'type': 'continuous'},
'subsample': {
'hyperopt-dist': hp.uniform,
-'sampling-scope': [0.05, 1.0],
'sampling-scope': [0.05, 0.99],
'type': 'continuous'},
'min_child_weight': {
'hyperopt-dist': hp.uniformint,
'sampling-scope': [1, 21],
'type': 'discrete'},
'booster': {
'hyperopt-dist': hp.choice,
'sampling-scope': [['gbtree', 'dart', 'gblinear']],
'type': 'categorical'},
'lambda': {
'hyperopt-dist': hp.uniformint,
'sampling-scope': [0, 1000],
'type': 'discrete'},
'alpha': {
'hyperopt-dist': hp.uniformint,
'sampling-scope': [0, 1000],
'type': 'discrete'},
'colsample_bytree': {
'hyperopt-dist': hp.uniform,
'sampling-scope': [1e-4, 1],
'type': 'continuous'},
'scale_pos_weight': {
'hyperopt-dist': hp.uniformint,
'sampling-scope': [1, 20],
'type': 'continuous'},
},
'xgboost': {
'max_depth': {
@@ -168,7 +188,27 @@
'min_child_weight': {
'hyperopt-dist': hp.uniformint,
'sampling-scope': [1, 21],
-'type': 'discrete'}
'type': 'discrete'},
'booster': {
'hyperopt-dist': hp.choice,
'sampling-scope': [['gbtree', 'dart', 'gblinear']],
'type': 'categorical'},
'lambda': {
'hyperopt-dist': hp.uniformint,
'sampling-scope': [0, 1000],
'type': 'discrete'},
'alpha': {
'hyperopt-dist': hp.uniformint,
'sampling-scope': [0, 1000],
'type': 'discrete'},
'colsample_bytree': {
'hyperopt-dist': hp.uniform,
'sampling-scope': [1e-4, 1],
'type': 'continuous'},
'scale_pos_weight': {
'hyperopt-dist': hp.uniformint,
'sampling-scope': [1, 20],
'type': 'continuous'},
},
'svr': {
'C': {
@@ -791,6 +831,7 @@ def get_parameters_dict(self):
parameters_per_operation.update(self.custom_search_space)
else:
for operation_name, operation_dct in self.custom_search_space.items():
-parameters_per_operation[operation_name].update(operation_dct)
parameters_per_operation[operation_name].update(
operation_dct)

return parameters_per_operation
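For context, a minimal, self-contained hyperopt sketch mirroring the shape of the entries above; the log-bound conversion for `hp.loguniform` and the dummy objective are assumptions, since the actual tuner code is not part of this diff:

# Hedged sketch: sampling xgboost-style hyperparameters with hyperopt.
import numpy as np
from hyperopt import fmin, hp, tpe

space = {
    'max_depth': hp.uniformint('max_depth', 1, 7),
    # hp.loguniform takes log-bounds: exp(log(1e-3)) = 1e-3, exp(0) = 1
    'learning_rate': hp.loguniform('learning_rate', np.log(1e-3), np.log(1)),
    'booster': hp.choice('booster', ['gbtree', 'dart', 'gblinear']),
}

# Dummy objective; a real tuner would fit a pipeline and return its metric.
best = fmin(fn=lambda params: params['learning_rate'], space=space,
            algo=tpe.suggest, max_evals=10)
print(best)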
18 changes: 15 additions & 3 deletions fedot/core/repository/data/default_operation_params.json
@@ -6,10 +6,22 @@
"n_jobs": 1
},
"xgboost": {
"eval_metric": "mlogloss",
"nthread": 1,
"n_jobs": 1,
"verbose": 0
"verbosity": 0,
"booster": "gbtree",
"tree_method": "auto",
"enable_categorical": true,
"use_eval_set": true,
"early_stopping_rounds": 30
},
"xgboostreg": {
"n_jobs": 1,
"verbosity": 0,
"booster": "gbtree",
"tree_method": "auto",
"enable_categorical": true,
"use_eval_set": true,
"early_stopping_rounds": 30
},
"catboost": {
"allow_writing_files": false,
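The defaults above pair `use_eval_set` with `early_stopping_rounds`. A hedged sketch of the underlying XGBoost behaviour they enable, on synthetic data (passing `early_stopping_rounds` to the constructor assumes xgboost >= 1.6):

# Hold out an eval set and stop boosting once 'auc' stalls for 30 rounds.
import numpy as np
from xgboost import XGBClassifier

rng = np.random.default_rng(0)
x, y = rng.random((200, 5)), rng.integers(0, 2, 200)
train_x, eval_x = x[:160], x[160:]
train_y, eval_y = y[:160], y[160:]

model = XGBClassifier(booster='gbtree', eval_metric='auc',
                      early_stopping_rounds=30, verbosity=0)
model.fit(train_x, train_y, eval_set=[(eval_x, eval_y)], verbose=False)
print(model.best_iteration)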
10 changes: 5 additions & 5 deletions fedot/core/repository/data/model_repository.json
@@ -469,17 +469,17 @@
]
},
"xgboost": {
"meta": "sklearn_class",
"meta": "boosting_class",
"presets": ["*tree"],
"tags": [
"tree", "non-default", "non_linear"
"tree", "non_linear"
]
},
"xgbreg": {
"meta": "sklearn_regr",
"xgboostreg": {
"meta": "boosting_regr",
"presets": ["*tree"],
"tags": [
"tree", "non_multi", "non-default", "non_linear"
"tree", "non_multi", "non_linear"
]
},
"cnn": {
2 changes: 1 addition & 1 deletion test/integration/api/test_api_utils.py
@@ -97,7 +97,7 @@ def test_init_assumption_with_inappropriate_available_operations():

train_input, _, _ = get_dataset(task_type='classification')
train_input = DataPreprocessor().obligatory_prepare_for_fit(train_input)
-available_operations = ['linear', 'xgboost', 'lagged']
available_operations = ['linear', 'xgboostreg', 'lagged']

initial_assumptions = AssumptionsBuilder \
.get(train_input) \
2 changes: 1 addition & 1 deletion test/unit/api/test_assumption_builder.py
@@ -104,7 +104,7 @@ def test_assumptions_builder_unsuitable_available_operations():

train_input, _, _ = get_dataset(task_type='classification')
train_input = DataPreprocessor().obligatory_prepare_for_fit(train_input)
-available_operations = ['linear', 'xgboost', 'lagged']
available_operations = ['linear', 'lagged', 'xgboostreg']

default_builder = UniModalAssumptionsBuilder(train_input)
checked_builder = UniModalAssumptionsBuilder(train_input) \
2 changes: 1 addition & 1 deletion test/unit/api/test_presets.py
@@ -13,7 +13,7 @@ def test_presets_classification():
task = Task(TaskTypesEnum.classification)
class_operations = get_operations_for_task(task=task, mode='all')

-excluded_tree = ['xgboost', 'xgbreg']
excluded_tree = []
filtered_operations = set(class_operations).difference(set(excluded_tree))
available_operations = list(filtered_operations)

3 changes: 2 additions & 1 deletion test/unit/data_operations/test_time_series_operations.py
@@ -359,7 +359,8 @@ def test_tuner_correctly_work_with_window_size_selector():

assert autotuned_window != tuner_tuned_window
# check that WindowSizeSelector runs twice due to tuner graph copying in initialization
-assert sum(check_window_size_selector_logging(records)) == 2
sum_records = sum(check_window_size_selector_logging(records))
assert sum_records == 2 or sum_records == 3


@pytest.mark.parametrize(('length', 'features_count', 'target_count', 'window_size'),
