Skip to content

Commit

Permalink
Adding docstrings for reduce_memory_size and renaming OptimisedFeature to OptimisedFeatures
Browse files Browse the repository at this point in the history
  • Loading branch information
aPovidlo committed Aug 20, 2024
1 parent fca7ef6 commit 5a7cd7a
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 7 deletions.
7 changes: 5 additions & 2 deletions fedot/core/data/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class Data:
idx: np.ndarray
task: Task
data_type: DataTypesEnum
features: Union[np.ndarray, OptimisedFeature]
features: Union[np.ndarray, OptimisedFeatures]
categorical_features: Optional[np.ndarray] = None
categorical_idx: Optional[np.ndarray] = None
numerical_idx: Optional[np.ndarray] = None
Expand Down Expand Up @@ -683,7 +683,10 @@ class OutputData(Data):


@dataclass
class OptimisedFeature:
class OptimisedFeatures:
"""``Data`` type for optimised storage data.
It based on numpy ndarray, but the features storages in list of np.ndarray with own optimal dtype
"""
_columns: list = field(default_factory=list, init=False)
_shape: tuple = field(default=(0, 0), init=False)
_nbytes: int = 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from fedot.core.data.data import InputData, OutputData, OptimisedFeature
from fedot.core.data.data import InputData, OutputData, OptimisedFeatures
from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import (
DataOperationImplementation
)
Expand Down Expand Up @@ -161,7 +161,7 @@ def _apply_label_encoder(self, data: np.ndarray):
if isinstance(data, np.ndarray):
data[:, column_id] = transformed_column

elif isinstance(data, OptimisedFeature):
elif isinstance(data, OptimisedFeatures):
data._columns[column_id] = transformed_column

def get_params(self) -> OperationParameters:
Expand Down
14 changes: 14 additions & 0 deletions fedot/preprocessing/base_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,20 @@ def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalD
"""
raise AbstractMethodNotImplementError

@abstractmethod
def reduce_memory_size(self, data: InputData) -> InputData:
    """
    Reduce the memory consumption of the given ``InputData``.

    The optimisation works as follows:
        - get the defined type of each feature from preprocessing (e.g. int);
        - find the minimum and maximum values in this feature;
        - pick the narrowest suitable dtype and cast the feature to it
          (e.g. a feature with only the unique values 0 and 1 fits ``np.bool_``;
          a feature whose values all lie between 0 and 100 fits ``np.int8``).

    Args:
        data: input data whose features should be stored more compactly.

    Returns:
        The input data with features converted to narrower dtypes.

    Raises:
        AbstractMethodNotImplementError: always; subclasses must override.
    """
    raise AbstractMethodNotImplementError

@staticmethod
def mark_as_preprocessed(data: Union[InputData, MultiModalData], *, is_obligatory: bool = True):
"""
Expand Down
3 changes: 3 additions & 0 deletions fedot/preprocessing/dummy_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,6 @@ def restore_index(self, input_data: InputData, result: OutputData) -> OutputData
def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalData]
) -> Union[InputData, MultiModalData]:
return test_data

def reduce_memory_size(self, data: InputData) -> InputData:
    """No-op implementation: the dummy preprocessor performs no memory
    optimisation and returns ``data`` unchanged."""
    return data
6 changes: 3 additions & 3 deletions fedot/preprocessing/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from golem.core.paths import copy_doc
from sklearn.preprocessing import LabelEncoder

from fedot.core.data.data import InputData, np_datetime_to_numeric, OptimisedFeature
from fedot.core.data.data import InputData, np_datetime_to_numeric, OptimisedFeatures
from fedot.core.data.data import OutputData, data_type_is_table, data_type_is_text, data_type_is_ts
from fedot.core.data.data_preprocessing import (
data_has_categorical_features,
Expand Down Expand Up @@ -558,10 +558,10 @@ def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalD
last_id = len(input_data.idx)
input_data.idx = np.arange(last_id, last_id + input_data.task.task_params.forecast_length)
return test_data

@copy_doc(BasePreprocessor.reduce_memory_size)
def reduce_memory_size(self, data: InputData) -> InputData:
def reduce_mem_usage_np(arr, initial_types):
reduced_columns = OptimisedFeature()
reduced_columns = OptimisedFeatures()

for i in range(arr.shape[1]):
col = arr[:, i]
Expand Down

0 comments on commit 5a7cd7a

Please sign in to comment.