From 5a7cd7aa9a1236ffa213ec2ef5f44807d6d8ace2 Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Tue, 20 Aug 2024 19:33:09 +0300
Subject: [PATCH] Adding docstrings to reduce_memory_size and OptimisedFeatures

---
 fedot/core/data/data.py                       |  7 +++++--
 .../data_operations/categorical_encoders.py   |  4 ++--
 fedot/preprocessing/base_preprocessing.py     | 14 ++++++++++++++
 fedot/preprocessing/dummy_preprocessing.py    |  3 +++
 fedot/preprocessing/preprocessing.py          |  6 +++---
 5 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py
index eda9440313..8e475bc629 100644
--- a/fedot/core/data/data.py
+++ b/fedot/core/data/data.py
@@ -42,7 +42,7 @@ class Data:
     idx: np.ndarray
     task: Task
     data_type: DataTypesEnum
-    features: Union[np.ndarray, OptimisedFeature]
+    features: Union[np.ndarray, OptimisedFeatures]
     categorical_features: Optional[np.ndarray] = None
     categorical_idx: Optional[np.ndarray] = None
     numerical_idx: Optional[np.ndarray] = None
@@ -683,7 +683,10 @@ class OutputData(Data):
 
 
 @dataclass
-class OptimisedFeature:
+class OptimisedFeatures:
+    """``Data`` type for optimised data storage.
+    It is based on numpy ndarray, but the features are stored in a list of np.ndarray, each with its own optimal dtype.
+    """
     _columns: list = field(default_factory=list, init=False)
     _shape: tuple = field(default=(0, 0), init=False)
     _nbytes: int = 0
diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
index c2ffac8e93..057702c6ba 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
@@ -4,7 +4,7 @@
 import numpy as np
 from sklearn.preprocessing import LabelEncoder, OneHotEncoder
 
-from fedot.core.data.data import InputData, OutputData, OptimisedFeature
+from fedot.core.data.data import InputData, OutputData, OptimisedFeatures
 from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import (
     DataOperationImplementation
 )
@@ -161,7 +161,7 @@ def _apply_label_encoder(self, data: np.ndarray):
 
             if isinstance(data, np.ndarray):
                 data[:, column_id] = transformed_column
-            elif isinstance(data, OptimisedFeature):
+            elif isinstance(data, OptimisedFeatures):
                 data._columns[column_id] = transformed_column
 
     def get_params(self) -> OperationParameters:
diff --git a/fedot/preprocessing/base_preprocessing.py b/fedot/preprocessing/base_preprocessing.py
index 7871af8fc4..56c238ffb9 100644
--- a/fedot/preprocessing/base_preprocessing.py
+++ b/fedot/preprocessing/base_preprocessing.py
@@ -192,6 +192,20 @@ def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalD
         """
         raise AbstractMethodNotImplementError
 
+    @abstractmethod
+    def reduce_memory_size(self, data: InputData) -> InputData:
+        """
+        Reduces the memory consumption of InputData.
+
+        It works as follows:
+        - Getting the defined type of the feature from preprocessing (e.g. int);
+        - Finding the minimum and maximum values in this feature;
+        - Finding a suitable type and changing to it
+        (e.g.: a feature with unique values 0 and 1 fits np.bool_;
+        a feature with all values between 0 and 100 fits np.int8);
+        """
+        raise AbstractMethodNotImplementError
+
     @staticmethod
     def mark_as_preprocessed(data: Union[InputData, MultiModalData], *, is_obligatory: bool = True):
         """
diff --git a/fedot/preprocessing/dummy_preprocessing.py b/fedot/preprocessing/dummy_preprocessing.py
index d3c4206e34..4d0d1cd456 100644
--- a/fedot/preprocessing/dummy_preprocessing.py
+++ b/fedot/preprocessing/dummy_preprocessing.py
@@ -63,3 +63,6 @@ def restore_index(self, input_data: InputData, result: OutputData) -> OutputData
     def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalData]
                                        ) -> Union[InputData, MultiModalData]:
         return test_data
+
+    def reduce_memory_size(self, data: InputData) -> InputData:
+        return data
diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py
index 434da34928..1f85e4e824 100644
--- a/fedot/preprocessing/preprocessing.py
+++ b/fedot/preprocessing/preprocessing.py
@@ -8,7 +8,7 @@
 from golem.core.paths import copy_doc
 from sklearn.preprocessing import LabelEncoder
 
-from fedot.core.data.data import InputData, np_datetime_to_numeric, OptimisedFeature
+from fedot.core.data.data import InputData, np_datetime_to_numeric, OptimisedFeatures
 from fedot.core.data.data import OutputData, data_type_is_table, data_type_is_text, data_type_is_ts
 from fedot.core.data.data_preprocessing import (
     data_has_categorical_features,
@@ -558,10 +558,10 @@ def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalD
             last_id = len(input_data.idx)
             input_data.idx = np.arange(last_id, last_id + input_data.task.task_params.forecast_length)
         return test_data
-
+
     @copy_doc(BasePreprocessor.reduce_memory_size)
     def reduce_memory_size(self, data: InputData) -> InputData:
         def reduce_mem_usage_np(arr, initial_types):
-            reduced_columns = OptimisedFeature()
+            reduced_columns = OptimisedFeatures()
             for i in range(arr.shape[1]):
                 col = arr[:, i]