Commit

PR fixes
* got rid of str variables for types in the preprocessor
* improved the define_column_types function in data_types.py
IIaKyJIuH committed Dec 29, 2022
1 parent 4cad6b4 commit cd5a712
Showing 12 changed files with 943 additions and 922 deletions.
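The common thread of the diff: per-column type info is now stored as integer IDs looked up through the TYPE_TO_ID mapping from fedot/preprocessing/data_types.py, instead of stringified type names such as str(int). A minimal sketch of such a mapping, assuming a plausible set of supported types (the exact contents of data_types.py may differ):

# Hypothetical sketch: map each supported column type to a stable integer ID,
# so type checks become exact integer comparisons instead of substring
# searches in strings like "<class 'str'>".
import numpy as np

_TYPES = [int, float, str, type(None), np.datetime64]  # assumed type list
TYPE_TO_ID = {type_: _id for _id, type_ in enumerate(_TYPES)}
ID_TO_TYPE = {_id: type_ for type_, _id in TYPE_TO_ID.items()}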
9 changes: 5 additions & 4 deletions fedot/core/data/data_preprocessing.py
@@ -5,6 +5,7 @@

from fedot.core.data.data import InputData, data_type_is_table, data_type_is_ts, data_type_is_multi_ts
from fedot.core.repository.dataset_types import DataTypesEnum
+from fedot.preprocessing.data_types import TYPE_TO_ID


def data_type_is_suitable_for_preprocessing(data: InputData) -> bool:
@@ -76,11 +77,11 @@ def find_categorical_columns(table: np.ndarray, column_types: dict = None):

categorical_ids = []
non_categorical_ids = []
-    for column_id, type_name in enumerate(column_types):
-        if 'str' in str(type_name):
-            categorical_ids.append(column_id)
+    for col_id, col_type_id in enumerate(column_types):
+        if col_type_id == TYPE_TO_ID[str]:
+            categorical_ids.append(col_id)
         else:
-            non_categorical_ids.append(column_id)
+            non_categorical_ids.append(col_id)

return categorical_ids, non_categorical_ids

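With integer IDs, splitting a table's columns reduces to exact comparisons against TYPE_TO_ID[str]. A usage sketch for the function above (the table contents and type IDs are illustrative):

import numpy as np

table = np.array([[1.0, 'a', 3], [2.0, 'b', 4]], dtype=object)
column_type_ids = [TYPE_TO_ID[float], TYPE_TO_ID[str], TYPE_TO_ID[int]]

categorical_ids, non_categorical_ids = find_categorical_columns(table, column_type_ids)
# categorical_ids == [1], non_categorical_ids == [0, 2]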
@@ -10,6 +10,7 @@
from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import \
DataOperationImplementation
from fedot.core.operations.operation_parameters import OperationParameters
+from fedot.preprocessing.data_types import TYPE_TO_ID


class OneHotEncodingImplementation(DataOperationImplementation):
@@ -31,9 +32,9 @@ def fit(self, input_data: InputData):
:return encoder: trained encoder (optional output)
"""
features = input_data.features
-        features_types = input_data.supplementary_data.column_types.get('features')
+        features_type_ids = input_data.supplementary_data.column_types.get('features')
         categorical_ids, non_categorical_ids = find_categorical_columns(features,
-                                                                        features_types)
+                                                                        features_type_ids)

# Indices of columns with categorical and non-categorical features
self.categorical_ids = categorical_ids
@@ -75,11 +76,11 @@ def _update_column_types(self, output_data: OutputData):
if self.categorical_ids:
# There are categorical features in the table
col_types = output_data.supplementary_data.column_types['features']
-            numerical_columns = [t_name for t_name in col_types if 'str' not in t_name]
+            numerical_columns = [t_name for t_name in col_types if t_name != TYPE_TO_ID[str]]

# Calculate new binary columns number after encoding
encoded_columns_number = output_data.predict.shape[1] - len(numerical_columns)
-            numerical_columns.extend([str(int)] * encoded_columns_number)
+            numerical_columns.extend([TYPE_TO_ID[int]] * encoded_columns_number)

output_data.supplementary_data.column_types['features'] = numerical_columns
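The column arithmetic above works because non-string columns pass through one-hot encoding unchanged, so every extra column in predict must be a freshly encoded binary indicator typed as int. A worked sketch with assumed shapes:

# 4 source columns [float, str, float, str]; suppose the two str columns
# expand into 5 binary indicator columns, so predict has 2 + 5 = 7 columns.
col_types = [TYPE_TO_ID[float], TYPE_TO_ID[str], TYPE_TO_ID[float], TYPE_TO_ID[str]]
numerical_columns = [t_id for t_id in col_types if t_id != TYPE_TO_ID[str]]   # 2 IDs kept
encoded_columns_number = 7 - len(numerical_columns)                           # 5 new columns
numerical_columns.extend([TYPE_TO_ID[int]] * encoded_columns_number)          # 7 IDs total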

@@ -153,7 +154,7 @@ def _update_column_types(self, output_data: OutputData):
# Categorical features were in the dataset
col_types = output_data.supplementary_data.column_types['features']
for categorical_id in self.categorical_ids:
-                col_types[categorical_id] = str(int)
+                col_types[categorical_id] = TYPE_TO_ID[int]

output_data.supplementary_data.column_types['features'] = col_types

@@ -13,6 +13,7 @@
from fedot.core.operations.evaluation.operation_implementations. \
implementation_interfaces import DataOperationImplementation, EncodedInvariantImplementation
from fedot.core.operations.operation_parameters import OperationParameters
+from fedot.preprocessing.data_types import TYPE_TO_ID


class ComponentAnalysisImplementation(DataOperationImplementation):
@@ -87,8 +88,8 @@ def update_column_types(output_data: OutputData) -> OutputData:
"""Update column types after applying PCA operations
"""

-        n_rows, n_cols = output_data.predict.shape
-        output_data.supplementary_data.column_types['features'] = [str(float) * n_cols]
+        _, n_cols = output_data.predict.shape
+        output_data.supplementary_data.column_types['features'] = [TYPE_TO_ID[float] * n_cols]
return output_data
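One detail worth flagging in the hunk above: in [TYPE_TO_ID[float] * n_cols] the multiplication happens inside the brackets, so the result is a one-element list holding a scaled integer ID rather than n_cols float IDs (the replaced [str(float) * n_cols] had the same bracket placement). The presumably intended form:

# Bracket placement matters: with, say, TYPE_TO_ID[float] == 1 and n_cols == 3
[TYPE_TO_ID[float] * n_cols]   # -> [3], one element with a meaningless ID
[TYPE_TO_ID[float]] * n_cols   # -> [1, 1, 1], one float ID per component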


@@ -127,6 +128,7 @@ class FastICAImplementation(ComponentAnalysisImplementation):
Args:
params: OperationParameters with the hyperparameters
"""

def __init__(self, params: Optional[OperationParameters]):
super().__init__(params)
self.pca = FastICA(**self.params.to_dict())
@@ -195,7 +197,7 @@ def _update_column_types(self, source_features_shape, output_data: OutputData):
if cols_number_added > 0:
# There are new columns in the table
col_types = output_data.supplementary_data.column_types['features']
-            col_types.extend([str(float)] * cols_number_added)
+            col_types.extend([TYPE_TO_ID[float]] * cols_number_added)
output_data.supplementary_data.column_types['features'] = col_types


@@ -7,12 +7,13 @@
from sklearn.decomposition import TruncatedSVD

from fedot.core.data.data import InputData, OutputData
-from fedot.core.log import LoggerAdapter, default_log
+from fedot.core.log import default_log
from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import (
DataOperationImplementation
)
from fedot.core.operations.operation_parameters import OperationParameters
from fedot.core.repository.dataset_types import DataTypesEnum
+from fedot.preprocessing.data_types import TYPE_TO_ID


class LaggedImplementation(DataOperationImplementation):
@@ -135,12 +136,12 @@ def _update_column_types(self, output_data: OutputData):
"""

features_n_rows, features_n_cols = output_data.predict.shape
-        features_column_types = [str(float)] * features_n_cols
+        features_column_types = [TYPE_TO_ID[float]] * features_n_cols
column_types = {'features': features_column_types}

if output_data.target is not None and len(output_data.target.shape) > 1:
target_n_rows, target_n_cols = output_data.target.shape
-            column_types.update({'target': [str(float)] * target_n_cols})
+            column_types.update({'target': [TYPE_TO_ID[float]] * target_n_cols})
output_data.supplementary_data.column_types = column_types

def _apply_transformation_for_fit(self, input_data: InputData, features: np.array, target: np.array,
137 changes: 69 additions & 68 deletions fedot/core/operations/model.py
@@ -1,68 +1,69 @@
-import numpy as np
-
-from fedot.core.data.data import OutputData
-from fedot.core.operations.operation import Operation
-from fedot.core.repository.dataset_types import DataTypesEnum
-from fedot.core.repository.operation_types_repository import OperationTypesRepository
-from fedot.core.repository.tasks import TaskTypesEnum
-
-
-class Model(Operation):
-    """Class with ``fit``/``predict`` methods defining the evaluation strategy for the task
-    Args:
-        operation_type: name of the model
-    """
-
-    def __init__(self, operation_type: str):
-        super().__init__(operation_type=operation_type)
-        self.operations_repo = OperationTypesRepository('model')
-
-    @staticmethod
-    def assign_tabular_column_types(output_data: OutputData, output_mode: str) -> OutputData:
-        """Assign types for tabular data obtained from model predictions.\n
-        By default, all types of model predictions for tabular data can be clearly defined
-        """
-        if output_data.data_type is not DataTypesEnum.table:
-            # No column data types info for non-tabular data
-            return output_data
-
-        is_regression_task = output_data.task.task_type is TaskTypesEnum.regression
-        is_ts_forecasting_task = output_data.task.task_type is TaskTypesEnum.ts_forecasting
-
-        predict_shape = np.array(output_data.predict).shape
-        # Add information about features
-        if is_regression_task or is_ts_forecasting_task:
-            if len(predict_shape) < 2:
-                column_info = {'features': [str(float)] * predict_shape[0]}
-            else:
-                column_info = {'features': [str(float)] * predict_shape[1]}
-        else:
-            if len(predict_shape) < 2:
-                output_data.predict = output_data.predict.reshape((-1, 1))
-                predict_shape = output_data.predict.shape
-            # Classification task or clustering
-            if output_mode == 'labels':
-                column_info = {'features': [str(int)] * predict_shape[1]}
-            else:
-                column_info = {'features': [str(float)] * predict_shape[1]}
-
-        # Add information about target
-        target_shape = output_data.target.shape if output_data.target is not None else None
-        if target_shape is None:
-            # There is no target column in output data
-            output_data.supplementary_data.column_types = column_info
-            return output_data
-
-        if is_regression_task or is_ts_forecasting_task:
-            if len(target_shape) > 1:
-                column_info.update({'target': [str(float)] * target_shape[1]})
-            else:
-                # Array present "time series"
-                column_info.update({'target': [str(float)] * len(output_data.target)})
-        else:
-            # Classification task or clustering
-            column_info.update({'target': [str(int)] * predict_shape[1]})
-
-        output_data.supplementary_data.column_types = column_info
-        return output_data
+import numpy as np
+
+from fedot.core.data.data import OutputData
+from fedot.core.operations.operation import Operation
+from fedot.core.repository.dataset_types import DataTypesEnum
+from fedot.core.repository.operation_types_repository import OperationTypesRepository
+from fedot.core.repository.tasks import TaskTypesEnum
+from fedot.preprocessing.data_types import TYPE_TO_ID
+
+
+class Model(Operation):
+    """Class with ``fit``/``predict`` methods defining the evaluation strategy for the task
+    Args:
+        operation_type: name of the model
+    """
+
+    def __init__(self, operation_type: str):
+        super().__init__(operation_type=operation_type)
+        self.operations_repo = OperationTypesRepository('model')
+
+    @staticmethod
+    def assign_tabular_column_types(output_data: OutputData, output_mode: str) -> OutputData:
+        """Assign types for tabular data obtained from model predictions.\n
+        By default, all types of model predictions for tabular data can be clearly defined
+        """
+        if output_data.data_type is not DataTypesEnum.table:
+            # No column data types info for non-tabular data
+            return output_data
+
+        is_regression_task = output_data.task.task_type is TaskTypesEnum.regression
+        is_ts_forecasting_task = output_data.task.task_type is TaskTypesEnum.ts_forecasting
+
+        predict_shape = np.array(output_data.predict).shape
+        # Add information about features
+        if is_regression_task or is_ts_forecasting_task:
+            if len(predict_shape) < 2:
+                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[0]}
+            else:
+                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]}
+        else:
+            if len(predict_shape) < 2:
+                output_data.predict = output_data.predict.reshape((-1, 1))
+                predict_shape = output_data.predict.shape
+            # Classification task or clustering
+            if output_mode == 'labels':
+                column_info = {'features': [TYPE_TO_ID[int]] * predict_shape[1]}
+            else:
+                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]}
+
+        # Add information about target
+        target_shape = output_data.target.shape if output_data.target is not None else None
+        if target_shape is None:
+            # There is no target column in output data
+            output_data.supplementary_data.column_types = column_info
+            return output_data
+
+        if is_regression_task or is_ts_forecasting_task:
+            if len(target_shape) > 1:
+                column_info.update({'target': [TYPE_TO_ID[float]] * target_shape[1]})
+            else:
+                # Array present "time series"
+                column_info.update({'target': [TYPE_TO_ID[float]] * len(output_data.target)})
+        else:
+            # Classification task or clustering
+            column_info.update({'target': [TYPE_TO_ID[int]] * predict_shape[1]})
+
+        output_data.supplementary_data.column_types = column_info
+        return output_data
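A consequence of the labels/probabilities branch above: the same classifier yields different feature type IDs depending on output_mode. A small sketch with assumed shapes:

import numpy as np

probs = np.random.rand(10, 3)                    # class probability output
labels = probs.argmax(axis=1).reshape((-1, 1))   # label output, reshaped to 2D

# output_mode == 'labels' gives one int column; otherwise one float column per class
types_for_labels = [TYPE_TO_ID[int]] * labels.shape[1]   # 1 ID
types_for_probs = [TYPE_TO_ID[float]] * probs.shape[1]   # 3 IDs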
4 changes: 2 additions & 2 deletions fedot/preprocessing/categorical.py
@@ -7,7 +7,7 @@

from fedot.core.data.data import InputData
from fedot.core.data.data_preprocessing import find_categorical_columns
-from fedot.preprocessing.data_types import NAME_CLASS_INT, FEDOT_STR_NAN
+from fedot.preprocessing.data_types import TYPE_TO_ID, FEDOT_STR_NAN


class BinaryCategoricalPreprocessor:
@@ -91,7 +91,7 @@ def transform(self, input_data: InputData) -> InputData:
# Update features types
features_types = copied_data.supplementary_data.column_types['features']
for converted_column_id in self.binary_ids_to_convert:
-            features_types[converted_column_id] = NAME_CLASS_INT
+            features_types[converted_column_id] = TYPE_TO_ID[int]
return copied_data

def _train_encoder(self, column: pd.Series, column_id: int):
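After BinaryCategoricalPreprocessor label-encodes a binary string column, its entry in the features type list flips from the str ID to the int ID. A sketch with an illustrative two-column table (the index list is assumed to come from fit):

features_types = [TYPE_TO_ID[float], TYPE_TO_ID[str]]  # column 1 held 'yes'/'no'
binary_ids_to_convert = [1]                            # collected during fit

for converted_column_id in binary_ids_to_convert:
    features_types[converted_column_id] = TYPE_TO_ID[int]

# features_types is now [TYPE_TO_ID[float], TYPE_TO_ID[int]]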