From 7a3946a95ed633e02e67d0cf0111652eabf93ce0 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Thu, 15 Aug 2024 16:34:33 +0300 Subject: [PATCH] Fix bug with test_api_fit_predict_with_pseudo_large_dataset_with_label_correct --- .../data_operations/sklearn_transformations.py | 10 ++-------- fedot/preprocessing/categorical.py | 1 + fedot/preprocessing/preprocessing.py | 13 ++++++++++--- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index 3485586fa4..9b6b3d2c8c 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -282,10 +282,7 @@ def fit(self, input_data: InputData): if data_type_is_table(input_data): categorical_idx = input_data.categorical_idx.tolist() - numerical_idx = np.setdiff1d( - np.concatenate((input_data.numerical_idx, input_data.encoded_idx)), - categorical_idx - ).tolist() + numerical_idx = np.setdiff1d(input_data.numerical_idx, categorical_idx).tolist() # Tabular data contains categorical features numerical, categorical = divide_data_categorical_numerical(input_data, categorical_idx, numerical_idx) @@ -317,10 +314,7 @@ def transform(self, input_data: InputData) -> OutputData: if data_type_is_table(input_data) and input_data.categorical_idx is not None: self.categorical_ids = input_data.categorical_idx.tolist() - self.non_categorical_ids = np.setdiff1d( - np.concatenate((input_data.numerical_idx, input_data.encoded_idx)), - self.categorical_ids - ).tolist() + self.non_categorical_ids = np.setdiff1d(input_data.numerical_idx, self.categorical_ids).tolist() numerical, categorical = divide_data_categorical_numerical( input_data, self.categorical_ids, self.non_categorical_ids diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index 2a2226d524..07c70de0c9 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -55,6 +55,7 @@ def fit(self, input_data: InputData): # Remove binary columns from categorical_idx input_data.categorical_idx = [idx for idx in input_data.categorical_idx if idx not in binary_ids_to_convert] + input_data.categorical_idx = np.array(input_data.categorical_idx) self.binary_ids_to_convert = binary_ids_to_convert # TODO: Add log.message with binary ids diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 9fee4db89f..0752e17b3e 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -237,8 +237,7 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str, self.types_correctors[source_name].convert_data_for_predict(data) feature_type_ids = data.supplementary_data.col_type_ids['features'] - data.numerical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]])) - data.categorical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[str]])) + data.numerical_idx, data.categorical_idx = self._update_num_and_cats_ids(feature_type_ids) # TODO andreygetmanov target encoding must be obligatory for all data types if data_type_is_text(data): @@ -252,6 +251,9 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str, else: data = self.binary_categorical_processors[source_name].transform(data) + feature_type_ids = data.supplementary_data.col_type_ids['features'] + data.numerical_idx, data.categorical_idx = self._update_num_and_cats_ids(feature_type_ids) + return data def _prepare_optional(self, pipeline, data: InputData, source_name: str): @@ -560,7 +562,6 @@ def reduce_memory_size(self, data: InputData) -> InputData: def reduce_mem_usage_np(arr, initial_types): reduced_columns = OptimisedFeature() - for i in range(arr.shape[1]): col = arr[:, i] init_type = _convertable_types[initial_types[i]] @@ -601,3 +602,9 @@ def reduce_mem_usage_np(arr, initial_types): data.target = reduce_mem_usage_np(data.target, data.supplementary_data.col_type_ids['target']) return data + + def _update_num_and_cats_ids(self, feature_type_ids): + numerical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]])) + categorical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[str]])) + + return numerical_idx, categorical_idx