From 7a3946a95ed633e02e67d0cf0111652eabf93ce0 Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov <a.stebenkov75@yandex.ru>
Date: Thu, 15 Aug 2024 16:34:33 +0300
Subject: [PATCH] Fix bug with
 test_api_fit_predict_with_pseudo_large_dataset_with_label_correct

---
 .../data_operations/sklearn_transformations.py      | 10 ++--------
 fedot/preprocessing/categorical.py                  |  1 +
 fedot/preprocessing/preprocessing.py                | 13 ++++++++++---
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py
index 3485586fa4..9b6b3d2c8c 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py
@@ -282,10 +282,7 @@ def fit(self, input_data: InputData):
 
         if data_type_is_table(input_data):
             categorical_idx = input_data.categorical_idx.tolist()
-            numerical_idx = np.setdiff1d(
-                np.concatenate((input_data.numerical_idx, input_data.encoded_idx)),
-                categorical_idx
-            ).tolist()
+            numerical_idx = np.setdiff1d(input_data.numerical_idx, categorical_idx).tolist()
             # Tabular data contains categorical features
             numerical, categorical = divide_data_categorical_numerical(input_data, categorical_idx, numerical_idx)
 
@@ -317,10 +314,7 @@ def transform(self, input_data: InputData) -> OutputData:
 
         if data_type_is_table(input_data) and input_data.categorical_idx is not None:
             self.categorical_ids = input_data.categorical_idx.tolist()
-            self.non_categorical_ids = np.setdiff1d(
-                np.concatenate((input_data.numerical_idx, input_data.encoded_idx)),
-                self.categorical_ids
-            ).tolist()
+            self.non_categorical_ids = np.setdiff1d(input_data.numerical_idx, self.categorical_ids).tolist()
 
             numerical, categorical = divide_data_categorical_numerical(
                 input_data, self.categorical_ids, self.non_categorical_ids
diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py
index 2a2226d524..07c70de0c9 100644
--- a/fedot/preprocessing/categorical.py
+++ b/fedot/preprocessing/categorical.py
@@ -55,6 +55,7 @@ def fit(self, input_data: InputData):
 
             # Remove binary columns from categorical_idx
             input_data.categorical_idx = [idx for idx in input_data.categorical_idx if idx not in binary_ids_to_convert]
+            input_data.categorical_idx = np.array(input_data.categorical_idx)
             self.binary_ids_to_convert = binary_ids_to_convert
             
             # TODO: Add log.message with binary ids
diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py
index 9fee4db89f..0752e17b3e 100644
--- a/fedot/preprocessing/preprocessing.py
+++ b/fedot/preprocessing/preprocessing.py
@@ -237,8 +237,7 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str,
             self.types_correctors[source_name].convert_data_for_predict(data)
 
         feature_type_ids = data.supplementary_data.col_type_ids['features']
-        data.numerical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]]))
-        data.categorical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[str]]))
+        data.numerical_idx, data.categorical_idx = self._update_num_and_cats_ids(feature_type_ids)
 
         # TODO andreygetmanov target encoding must be obligatory for all data types
         if data_type_is_text(data):
@@ -252,6 +251,9 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str,
             else:
                 data = self.binary_categorical_processors[source_name].transform(data)
 
+            feature_type_ids = data.supplementary_data.col_type_ids['features']
+            data.numerical_idx, data.categorical_idx = self._update_num_and_cats_ids(feature_type_ids)
+
         return data
 
     def _prepare_optional(self, pipeline, data: InputData, source_name: str):
@@ -560,7 +562,6 @@ def reduce_memory_size(self, data: InputData) -> InputData:
         def reduce_mem_usage_np(arr, initial_types):
             reduced_columns = OptimisedFeature()
 
-
             for i in range(arr.shape[1]):
                 col = arr[:, i]
                 init_type = _convertable_types[initial_types[i]]
@@ -601,3 +602,9 @@ def reduce_mem_usage_np(arr, initial_types):
             data.target = reduce_mem_usage_np(data.target, data.supplementary_data.col_type_ids['target'])
 
         return data
+
+    def _update_num_and_cats_ids(self, feature_type_ids):
+        """Return index arrays of int/float-typed and of str-typed entries in feature_type_ids."""
+        numerical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]]))
+        categorical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[str]]))
+        return numerical_idx, categorical_idx