Skip to content

Commit

Permalink
Adding docstrings for reduce_memory_size and renaming OptimisedFeature to OptimisedFeatures
Browse files Browse the repository at this point in the history
  • Loading branch information
aPovidlo committed Aug 20, 2024
1 parent fca7ef6 commit 5a7cd7a
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 7 deletions.
7 changes: 5 additions & 2 deletions fedot/core/data/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class Data:
idx: np.ndarray
task: Task
data_type: DataTypesEnum
features: Union[np.ndarray, OptimisedFeature]
features: Union[np.ndarray, OptimisedFeatures]
categorical_features: Optional[np.ndarray] = None
categorical_idx: Optional[np.ndarray] = None
numerical_idx: Optional[np.ndarray] = None
Expand Down Expand Up @@ -683,7 +683,10 @@ class OutputData(Data):


@dataclass
class OptimisedFeature:
class OptimisedFeatures:
"""``Data`` type for optimised storage data.
It based on numpy ndarray, but the features storages in list of np.ndarray with own optimal dtype
"""
_columns: list = field(default_factory=list, init=False)
_shape: tuple = field(default=(0, 0), init=False)
_nbytes: int = 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from fedot.core.data.data import InputData, OutputData, OptimisedFeature
from fedot.core.data.data import InputData, OutputData, OptimisedFeatures
from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import (
DataOperationImplementation
)
Expand Down Expand Up @@ -161,7 +161,7 @@ def _apply_label_encoder(self, data: np.ndarray):
if isinstance(data, np.ndarray):
data[:, column_id] = transformed_column

elif isinstance(data, OptimisedFeature):
elif isinstance(data, OptimisedFeatures):
data._columns[column_id] = transformed_column

def get_params(self) -> OperationParameters:
Expand Down
14 changes: 14 additions & 0 deletions fedot/preprocessing/base_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,20 @@ def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalD
"""
raise AbstractMethodNotImplementError

@abstractmethod
def reduce_memory_size(self, data: InputData) -> InputData:
    """
    Reduce the memory consumption of the given ``InputData``.

    The optimisation works as follows:
        - get the defined type of each feature from preprocessing (e.g. int);
        - find the minimum and maximum values in this feature;
        - pick the narrowest suitable dtype and cast the feature to it
          (e.g. a feature with only the unique values 0 and 1 fits ``np.bool_``;
          a feature whose values all lie between 0 and 100 fits ``np.int8``).

    Args:
        data: input data whose features should be stored more compactly.

    Returns:
        The input data with features converted to narrower dtypes.

    Raises:
        AbstractMethodNotImplementError: always; subclasses must override.
    """
    raise AbstractMethodNotImplementError

@staticmethod
def mark_as_preprocessed(data: Union[InputData, MultiModalData], *, is_obligatory: bool = True):
"""
Expand Down
3 changes: 3 additions & 0 deletions fedot/preprocessing/dummy_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,6 @@ def restore_index(self, input_data: InputData, result: OutputData) -> OutputData
def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalData]
) -> Union[InputData, MultiModalData]:
return test_data

def reduce_memory_size(self, data: InputData) -> InputData:
    """No-op implementation: the dummy preprocessor performs no memory
    optimisation and returns ``data`` unchanged."""
    return data
6 changes: 3 additions & 3 deletions fedot/preprocessing/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from golem.core.paths import copy_doc
from sklearn.preprocessing import LabelEncoder

from fedot.core.data.data import InputData, np_datetime_to_numeric, OptimisedFeature
from fedot.core.data.data import InputData, np_datetime_to_numeric, OptimisedFeatures
from fedot.core.data.data import OutputData, data_type_is_table, data_type_is_text, data_type_is_ts
from fedot.core.data.data_preprocessing import (
data_has_categorical_features,
Expand Down Expand Up @@ -558,10 +558,10 @@ def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalD
last_id = len(input_data.idx)
input_data.idx = np.arange(last_id, last_id + input_data.task.task_params.forecast_length)
return test_data

@copy_doc(BasePreprocessor.reduce_memory_size)
def reduce_memory_size(self, data: InputData) -> InputData:
def reduce_mem_usage_np(arr, initial_types):
reduced_columns = OptimisedFeature()
reduced_columns = OptimisedFeatures()

for i in range(arr.shape[1]):
col = arr[:, i]
Expand Down

0 comments on commit 5a7cd7a

Please sign in to comment.