optimizations and style fixes
IIaKyJIuH committed Feb 21, 2023
1 parent ed0b89c commit 8f05890
Showing 5 changed files with 31 additions and 53 deletions.
6 changes: 3 additions & 3 deletions fedot/core/data/data_preprocessing.py
@@ -14,9 +14,9 @@ def data_type_is_suitable_for_preprocessing(data: InputData) -> bool:
 
 def replace_inf_with_nans(input_data: InputData):
     features = input_data.features
-    has_infs = (features == np.inf) | (features == -np.inf)
-    if np.any(has_infs):
-        features[has_infs] = np.nan
+    inf_idxs: Tuple[np.ndarray, ...] = ((features == np.inf) | (features == -np.inf)).nonzero()
+    if len(inf_idxs[0]):
+        features[inf_idxs] = np.nan
 
 
 def replace_nans_with_empty_strings(input_data: InputData):
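
For readers skimming the diff: `ndarray.nonzero()` returns a tuple of per-axis index arrays, NumPy accepts that tuple directly for fancy-indexing assignment, and `len(inf_idxs[0])` is a constant-time emptiness check on the row-index array. A minimal standalone sketch with made-up values (not the repository's code):

    import numpy as np

    features = np.array([[1.0, np.inf],
                         [-np.inf, 4.0]])

    inf_idxs = ((features == np.inf) | (features == -np.inf)).nonzero()
    print(inf_idxs)           # (array([0, 1]), array([1, 0]))
    if len(inf_idxs[0]):      # any infs at all?
        features[inf_idxs] = np.nan
    print(features)           # [[ 1. nan]
                              #  [nan  4.]]
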
@@ -1,5 +1,5 @@
 from copy import deepcopy
-from typing import Optional
+from typing import Optional, Tuple
 
 import numpy as np
 import pandas as pd
@@ -137,9 +137,9 @@ def transform(self, input_data: InputData) -> OutputData:
         # If categorical features exist - transform them inplace in InputData
         for categorical_id in self.categorical_ids:
             categorical_column = input_data.features[:, categorical_id]
-            has_nan: np.ndarray = pd.isna(categorical_column)
+            nan_idxs: Tuple[np.ndarray, ...] = pd.isna(categorical_column).nonzero()
 
-            transformed = self._apply_label_encoder(categorical_column, categorical_id, has_nan)
+            transformed = self._apply_label_encoder(categorical_column, categorical_id, nan_idxs)
             copied_data.features[:, categorical_id] = transformed
 
         output_data = self._convert_to_output(copied_data,
@@ -168,21 +168,21 @@ def _fit_label_encoders(self, input_data: InputData):
             self.encoders.update({categorical_id: le})
 
     def _apply_label_encoder(self, categorical_column: np.ndarray, categorical_id: int,
-                             has_nan: np.ndarray) -> np.ndarray:
+                             nan_idxs: Tuple[np.ndarray, ...]) -> np.ndarray:
         """ Apply fitted LabelEncoder for column transformation
         :param categorical_column: numpy array with categorical features
         :param categorical_id: index of current categorical column
-        :param has_nan: bool array of gap elements in the ``categorical_column``
+        :param nan_idxs: indices of gap elements in the ``categorical_column``
         """
         column_encoder = self.encoders[categorical_id]
         column_encoder.classes_ = pd.unique(np.concatenate((column_encoder.classes_, categorical_column)))
 
         transformed_column = column_encoder.transform(categorical_column)
-        if len(has_nan) > 0:
+        if len(nan_idxs[0]):
             # Store np.nan values
             transformed_column = transformed_column.astype(object)
-            transformed_column[has_nan] = np.nan
+            transformed_column[nan_idxs] = np.nan
 
         return transformed_column
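
The guard change above is more than style. The old `has_nan` was a boolean mask covering the whole column, so `len(has_nan) > 0` held for every non-empty column and the object-dtype conversion ran even when there were no gaps; `len(nan_idxs[0])` counts the actual gap positions instead. A toy check (standalone, not repository code):

    import numpy as np

    mask = np.array([False, False, False])   # a column with no gaps
    print(len(mask) > 0)                     # True  -> the old branch fired anyway
    print(len(mask.nonzero()[0]))            # 0     -> the new branch is skipped
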
4 changes: 2 additions & 2 deletions fedot/core/repository/json_evaluation.py
@@ -1,7 +1,7 @@
 from importlib import import_module
 from typing import Union, TYPE_CHECKING, List
 
-# imports are required for the eval
+# imports are required by the function beneath
 from fedot.core.repository.dataset_types import DataTypesEnum
 from fedot.core.repository.tasks import TaskTypesEnum
@@ -39,7 +39,7 @@ def import_enums_from_str(field_value: str) -> Union[List[DataTypesEnum],
     Returns:
         list of either class:`DataTypesEnum` or class:`TaskTypesEnum` values
     """
-    enums = [full_val.split('.') for full_val in field_value.strip('][').split(', ') if full_val != '']
+    enums = [full_val.split('.') for full_val in field_value.strip('][').split(', ') if full_val]
     return [
         getattr(globals()[data_type], value)
         for (data_type, value) in enums]
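
`if full_val` is the idiomatic truthiness form of `if full_val != ''` and drops the single empty string produced when the bracketed list is empty. To see what `import_enums_from_str` parses, a standalone sketch with a stub enum (the member names and values here are made up; the real module resolves the class name through its own globals()):

    from enum import Enum

    class DataTypesEnum(Enum):
        table = 'table'
        ts = 'ts'

    field_value = '[DataTypesEnum.table, DataTypesEnum.ts]'
    enums = [full_val.split('.') for full_val in field_value.strip('][').split(', ') if full_val]
    print([getattr(globals()[data_type], value) for (data_type, value) in enums])
    # [<DataTypesEnum.table: 'table'>, <DataTypesEnum.ts: 'ts'>]
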
50 changes: 15 additions & 35 deletions fedot/preprocessing/categorical.py
@@ -33,26 +33,23 @@ def fit(self, input_data: InputData):
             return self
 
         binary_ids_to_convert = []
-        number_of_columns = input_data.features.shape[-1]
-        for column_id in range(number_of_columns):
-            pd_column = pd.Series(input_data.features[:, column_id], copy=True)
-            has_nan = pd_column.isna()
-            if has_nan.sum() and column_id in categorical_ids:
+        for column_id, column in enumerate(input_data.features.T):
+            pd_column = pd.Series(column, copy=True)
+            is_nan = pd_column.isna()
+            column_uniques = pd_column.unique()
+            if is_nan.sum() and column_id in categorical_ids:
                 # This categorical column has nans
-                replaced_column, _ = replace_nans_with_fedot_nans(pd_column, has_nan)
-                column_uniques = replaced_column.unique()
+                pd_column[is_nan] = FEDOT_STR_NAN
 
                 if len(column_uniques) <= 3:
                     # There is a column with binary categories and gaps
                     self.binary_features_with_nans.append(column_id)
                     binary_ids_to_convert.append(column_id)
-                    self._train_encoder(replaced_column, column_id)
+                    self._train_encoder(pd_column, column_id)
             else:
-                column_uniques = pd_column.unique()
                 if len(column_uniques) <= 2 and column_id in categorical_ids:
                     # Column contains binary string feature
                     binary_ids_to_convert.append(column_id)
 
                     # Train encoder for current column
                     self._train_encoder(pd_column, column_id)
 
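A note on the rewritten loop: iterating `features.T` hands out one column per step as a NumPy view, replacing the index bookkeeping (`shape[-1]`, `range`, `features[:, column_id]`) with a single `enumerate`. A toy illustration (standalone, made-up values):

    import numpy as np

    features = np.array([['a', 'x'],
                         ['b', 'y'],
                         ['a', 'x']], dtype=object)

    for column_id, column in enumerate(features.T):
        # each `column` shares memory with `features` (it is a view, not a copy)
        print(column_id, column.base is not None, np.unique(column))
    # 0 True ['a' 'b']
    # 1 True ['x' 'y']
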
@@ -67,26 +64,15 @@ def transform(self, input_data: InputData) -> InputData:
             # There are no binary categorical features
             return input_data
 
-        converted_features = []
-        number_of_columns = input_data.features.shape[-1]
-        for column_id in range(number_of_columns):
+        copied_data = deepcopy(input_data)
+        for column_id, column in enumerate(copied_data.features.T):
             if column_id in self.binary_ids_to_convert:
                 # If the column contains nans - replace them with the special fedot nan string
-                pd_column = pd.Series(input_data.features[:, column_id])
-                has_nan = pd_column.isna()
-                replaced_column, has_nan = replace_nans_with_fedot_nans(pd_column, has_nan)
+                nan_idxs: Tuple[np.ndarray, ...] = pd.isna(column).nonzero()
+                column[nan_idxs] = FEDOT_STR_NAN
 
                 # Convert into integers
-                converted_column = self._apply_encoder(replaced_column, column_id, has_nan)
-            else:
-                # Stay column the same
-                converted_column = input_data.features[:, column_id]
-
-            converted_features.append(converted_column.reshape((-1, 1)))
-
-        # Store transformed features
-        copied_data = deepcopy(input_data)
-        copied_data.features = np.hstack(converted_features)
+                column[:] = self._apply_encoder(column, column_id, nan_idxs)
 
         # Update features types
         features_types = copied_data.supplementary_data.column_types['features']
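
The transform rewrite leans on the same view behaviour: because each `column` is a view into the deep-copied array, `column[:] = ...` writes results straight into `copied_data.features`, replacing the old collect, reshape and `np.hstack` rebuild. A standalone sketch (toy array, not the repository's InputData):

    import numpy as np
    from copy import deepcopy

    original = np.array([['yes', 1.0],
                         ['no', 2.0]], dtype=object)

    copied = deepcopy(original)
    for column_id, column in enumerate(copied.T):
        if column_id == 0:
            # slice-assignment through the view rewrites `copied` in place
            column[:] = [1, 0]

    print(copied)     # [[1 1.0]
                      #  [0 2.0]]
    print(original)   # unchanged
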
@@ -117,22 +103,16 @@ def _train_encoder(self, column: pd.Series, column_id: int):
         # Store fitted label encoder for transform method
         self.binary_encoders.update({column_id: encoder})
 
-    def _apply_encoder(self, column: pd.Series, column_id: int, has_nan: pd.Series) -> np.ndarray:
+    def _apply_encoder(self, column: np.ndarray, column_id: int, nan_idxs: Tuple[np.ndarray, ...]) -> np.ndarray:
         """ Apply already fitted encoders """
         encoder = self.binary_encoders[column_id]
         # Extend encoder classes if the column contains categories not previously encountered
         encoder.classes_ = np.unique(np.concatenate((encoder.classes_, column)))
 
         converted = encoder.transform(column)
-        if len(has_nan) > 0:
+        if len(nan_idxs[0]):
             # Column has nans in its structure - replace them after conversion
             converted = converted.astype(float)
-            converted[has_nan] = np.nan
+            converted[nan_idxs] = np.nan
 
         return converted
-
-
-def replace_nans_with_fedot_nans(column: pd.Series, has_nan: pd.Series) -> Tuple[pd.Series, pd.Series]:
-    # Add new category - 'fedot_nan' after converting it will be replaced by nans
-    column[has_nan] = FEDOT_STR_NAN
-    return column, has_nan
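
Putting the surviving pieces together, a standalone sketch of the whole binary-column path: placeholder substitution, `classes_` extension, encoding, NaN restoration. It assumes scikit-learn's LabelEncoder and an object-dtype column; 'fedot_nan' stands in for the FEDOT_STR_NAN constant and the data is made up:

    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import LabelEncoder

    column = np.array(['yes', 'no', None, 'yes'], dtype=object)

    encoder = LabelEncoder().fit(['no', 'yes'])
    nan_idxs = pd.isna(column).nonzero()   # remember the gap positions first
    column[nan_idxs] = 'fedot_nan'         # placeholder so transform() accepts the column

    # extend classes_ so the placeholder is not rejected as an unseen label
    encoder.classes_ = np.unique(np.concatenate((encoder.classes_, column)))

    converted = encoder.transform(column)
    if len(nan_idxs[0]):
        converted = converted.astype(float)   # integer codes cannot hold np.nan
        converted[nan_idxs] = np.nan          # restore the gaps
    print(converted)                          # [ 2.  1. nan  2.]
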
10 changes: 4 additions & 6 deletions fedot/preprocessing/data_types.py
@@ -217,8 +217,8 @@ def _retain_columns_info_without_types_conflicts(self, data: InputData):
         data.features = self.remove_incorrect_features(data.features, self.string_columns_transformation_failed)
 
         data.supplementary_data.column_types['features'] = [
-            col_type
-            for col_id, col_type in enumerate(data.supplementary_data.column_types['features'])
+            col_type_id
+            for col_id, col_type_id in enumerate(data.supplementary_data.column_types['features'])
             if col_id not in self.string_columns_transformation_failed
         ]
 
@@ -429,9 +429,7 @@ def define_column_types(table: np.ndarray):
     table_of_types[nans] = TYPE_TO_ID[type(None)]
 
     columns_info = {}
-    for column_id in range(n_columns):
-        col_types = table_of_types[:, column_id]
-
+    for column_id, col_types in enumerate(table_of_types.T):
         unique_col_types, unique_col_types_number = np.unique(col_types, return_counts=True)
 
         if len(unique_col_types) > 1:
@@ -445,7 +443,7 @@
         ]
 
         # Store information about nans in the target
-        nan_ids = np.where(nans[:, column_id])[0]
+        nan_ids = np.nonzero(nans[:, column_id])[0]
         columns_info.update({column_id: {'types': unique_col_types,
                                          'str_number': str_number,
                                          'int_number': int_number,
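
For context on the `define_column_types` loop: each cell of `table_of_types` holds an integer type id, and `np.unique(..., return_counts=True)` tallies how many cells of each type a column contains (`np.nonzero` is simply the spelled-out equivalent of single-argument `np.where`). A toy run with a made-up type encoding:

    import numpy as np

    # made-up ids standing in for TYPE_TO_ID: 0=str, 1=float, 2=NoneType
    table_of_types = np.array([[0, 1],
                               [0, 2],
                               [1, 1]])

    for column_id, col_types in enumerate(table_of_types.T):
        unique_col_types, unique_col_types_number = np.unique(col_types, return_counts=True)
        print(column_id, dict(zip(unique_col_types.tolist(), unique_col_types_number.tolist())))
    # 0 {0: 2, 1: 1}
    # 1 {1: 2, 2: 1}
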
