Commit

PR fixes
* got rid of str variables for types in the preprocessor
* improved the define_column_types function in data_types.py
IIaKyJIuH committed Dec 29, 2022
1 parent 4cad6b4 commit cd5a712
Showing 12 changed files with 943 additions and 922 deletions.
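The common thread of the diff: per-column type info is now stored as integer IDs looked up through the TYPE_TO_ID mapping from fedot/preprocessing/data_types.py, instead of stringified type names such as str(int). A minimal sketch of such a mapping, assuming a plausible set of supported types (the exact contents of data_types.py may differ):

# Hypothetical sketch: map each supported column type to a stable integer ID,
# so type checks become exact integer comparisons instead of substring
# searches in strings like "<class 'str'>".
import numpy as np

_TYPES = [int, float, str, type(None), np.datetime64]  # assumed type list
TYPE_TO_ID = {type_: _id for _id, type_ in enumerate(_TYPES)}
ID_TO_TYPE = {_id: type_ for type_, _id in TYPE_TO_ID.items()}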
9 changes: 5 additions & 4 deletions fedot/core/data/data_preprocessing.py
@@ -5,6 +5,7 @@

from fedot.core.data.data import InputData, data_type_is_table, data_type_is_ts, data_type_is_multi_ts
from fedot.core.repository.dataset_types import DataTypesEnum
+from fedot.preprocessing.data_types import TYPE_TO_ID


def data_type_is_suitable_for_preprocessing(data: InputData) -> bool:
@@ -76,11 +77,11 @@ def find_categorical_columns(table: np.ndarray, column_types: dict = None):

categorical_ids = []
non_categorical_ids = []
-    for column_id, type_name in enumerate(column_types):
-        if 'str' in str(type_name):
-            categorical_ids.append(column_id)
+    for col_id, col_type_id in enumerate(column_types):
+        if col_type_id == TYPE_TO_ID[str]:
+            categorical_ids.append(col_id)
         else:
-            non_categorical_ids.append(column_id)
+            non_categorical_ids.append(col_id)

return categorical_ids, non_categorical_ids

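With integer IDs, splitting a table's columns reduces to exact comparisons against TYPE_TO_ID[str]. A usage sketch for the function above (the table contents and type IDs are illustrative):

import numpy as np

table = np.array([[1.0, 'a', 3], [2.0, 'b', 4]], dtype=object)
column_type_ids = [TYPE_TO_ID[float], TYPE_TO_ID[str], TYPE_TO_ID[int]]

categorical_ids, non_categorical_ids = find_categorical_columns(table, column_type_ids)
# categorical_ids == [1], non_categorical_ids == [0, 2]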
@@ -10,6 +10,7 @@
from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import \
DataOperationImplementation
from fedot.core.operations.operation_parameters import OperationParameters
+from fedot.preprocessing.data_types import TYPE_TO_ID


class OneHotEncodingImplementation(DataOperationImplementation):
@@ -31,9 +32,9 @@ def fit(self, input_data: InputData):
:return encoder: trained encoder (optional output)
"""
features = input_data.features
-        features_types = input_data.supplementary_data.column_types.get('features')
+        features_type_ids = input_data.supplementary_data.column_types.get('features')
         categorical_ids, non_categorical_ids = find_categorical_columns(features,
-                                                                        features_types)
+                                                                        features_type_ids)

# Indices of columns with categorical and non-categorical features
self.categorical_ids = categorical_ids
@@ -75,11 +76,11 @@ def _update_column_types(self, output_data: OutputData):
if self.categorical_ids:
# There are categorical features in the table
col_types = output_data.supplementary_data.column_types['features']
-            numerical_columns = [t_name for t_name in col_types if 'str' not in t_name]
+            numerical_columns = [t_name for t_name in col_types if t_name != TYPE_TO_ID[str]]

# Calculate new binary columns number after encoding
encoded_columns_number = output_data.predict.shape[1] - len(numerical_columns)
-            numerical_columns.extend([str(int)] * encoded_columns_number)
+            numerical_columns.extend([TYPE_TO_ID[int]] * encoded_columns_number)

output_data.supplementary_data.column_types['features'] = numerical_columns
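The column arithmetic above works because non-string columns pass through one-hot encoding unchanged, so every extra column in predict must be a freshly encoded binary indicator typed as int. A worked sketch with assumed shapes:

# 4 source columns [float, str, float, str]; suppose the two str columns
# expand into 5 binary indicator columns, so predict has 2 + 5 = 7 columns.
col_types = [TYPE_TO_ID[float], TYPE_TO_ID[str], TYPE_TO_ID[float], TYPE_TO_ID[str]]
numerical_columns = [t_id for t_id in col_types if t_id != TYPE_TO_ID[str]]   # 2 IDs kept
encoded_columns_number = 7 - len(numerical_columns)                           # 5 new columns
numerical_columns.extend([TYPE_TO_ID[int]] * encoded_columns_number)          # 7 IDs total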

@@ -153,7 +154,7 @@ def _update_column_types(self, output_data: OutputData):
# Categorical features were in the dataset
col_types = output_data.supplementary_data.column_types['features']
for categorical_id in self.categorical_ids:
-                col_types[categorical_id] = str(int)
+                col_types[categorical_id] = TYPE_TO_ID[int]

output_data.supplementary_data.column_types['features'] = col_types

@@ -13,6 +13,7 @@
from fedot.core.operations.evaluation.operation_implementations. \
implementation_interfaces import DataOperationImplementation, EncodedInvariantImplementation
from fedot.core.operations.operation_parameters import OperationParameters
+from fedot.preprocessing.data_types import TYPE_TO_ID


class ComponentAnalysisImplementation(DataOperationImplementation):
@@ -87,8 +88,8 @@ def update_column_types(output_data: OutputData) -> OutputData:
"""Update column types after applying PCA operations
"""

-        n_rows, n_cols = output_data.predict.shape
-        output_data.supplementary_data.column_types['features'] = [str(float) * n_cols]
+        _, n_cols = output_data.predict.shape
+        output_data.supplementary_data.column_types['features'] = [TYPE_TO_ID[float] * n_cols]
return output_data
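One detail worth flagging in the hunk above: in [TYPE_TO_ID[float] * n_cols] the multiplication happens inside the brackets, so the result is a one-element list holding a scaled integer ID rather than n_cols float IDs (the replaced [str(float) * n_cols] had the same bracket placement). The presumably intended form:

# Bracket placement matters: with, say, TYPE_TO_ID[float] == 1 and n_cols == 3
[TYPE_TO_ID[float] * n_cols]   # -> [3], one element with a meaningless ID
[TYPE_TO_ID[float]] * n_cols   # -> [1, 1, 1], one float ID per component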


@@ -127,6 +128,7 @@ class FastICAImplementation(ComponentAnalysisImplementation):
Args:
params: OperationParameters with the hyperparameters
"""

def __init__(self, params: Optional[OperationParameters]):
super().__init__(params)
self.pca = FastICA(**self.params.to_dict())
@@ -195,7 +197,7 @@ def _update_column_types(self, source_features_shape, output_data: OutputData):
if cols_number_added > 0:
# There are new columns in the table
col_types = output_data.supplementary_data.column_types['features']
-            col_types.extend([str(float)] * cols_number_added)
+            col_types.extend([TYPE_TO_ID[float]] * cols_number_added)
output_data.supplementary_data.column_types['features'] = col_types


@@ -7,12 +7,13 @@
from sklearn.decomposition import TruncatedSVD

from fedot.core.data.data import InputData, OutputData
-from fedot.core.log import LoggerAdapter, default_log
+from fedot.core.log import default_log
from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import (
DataOperationImplementation
)
from fedot.core.operations.operation_parameters import OperationParameters
from fedot.core.repository.dataset_types import DataTypesEnum
+from fedot.preprocessing.data_types import TYPE_TO_ID


class LaggedImplementation(DataOperationImplementation):
@@ -135,12 +136,12 @@ def _update_column_types(self, output_data: OutputData):
"""

features_n_rows, features_n_cols = output_data.predict.shape
-        features_column_types = [str(float)] * features_n_cols
+        features_column_types = [TYPE_TO_ID[float]] * features_n_cols
column_types = {'features': features_column_types}

if output_data.target is not None and len(output_data.target.shape) > 1:
target_n_rows, target_n_cols = output_data.target.shape
-            column_types.update({'target': [str(float)] * target_n_cols})
+            column_types.update({'target': [TYPE_TO_ID[float]] * target_n_cols})
output_data.supplementary_data.column_types = column_types

def _apply_transformation_for_fit(self, input_data: InputData, features: np.array, target: np.array,
137 changes: 69 additions & 68 deletions fedot/core/operations/model.py
@@ -1,68 +1,69 @@
-import numpy as np
-
-from fedot.core.data.data import OutputData
-from fedot.core.operations.operation import Operation
-from fedot.core.repository.dataset_types import DataTypesEnum
-from fedot.core.repository.operation_types_repository import OperationTypesRepository
-from fedot.core.repository.tasks import TaskTypesEnum
-
-
-class Model(Operation):
-    """Class with ``fit``/``predict`` methods defining the evaluation strategy for the task
-    Args:
-        operation_type: name of the model
-    """
-
-    def __init__(self, operation_type: str):
-        super().__init__(operation_type=operation_type)
-        self.operations_repo = OperationTypesRepository('model')
-
-    @staticmethod
-    def assign_tabular_column_types(output_data: OutputData, output_mode: str) -> OutputData:
-        """Assign types for tabular data obtained from model predictions.\n
-        By default, all types of model predictions for tabular data can be clearly defined
-        """
-        if output_data.data_type is not DataTypesEnum.table:
-            # No column data types info for non-tabular data
-            return output_data
-
-        is_regression_task = output_data.task.task_type is TaskTypesEnum.regression
-        is_ts_forecasting_task = output_data.task.task_type is TaskTypesEnum.ts_forecasting
-
-        predict_shape = np.array(output_data.predict).shape
-        # Add information about features
-        if is_regression_task or is_ts_forecasting_task:
-            if len(predict_shape) < 2:
-                column_info = {'features': [str(float)] * predict_shape[0]}
-            else:
-                column_info = {'features': [str(float)] * predict_shape[1]}
-        else:
-            if len(predict_shape) < 2:
-                output_data.predict = output_data.predict.reshape((-1, 1))
-                predict_shape = output_data.predict.shape
-            # Classification task or clustering
-            if output_mode == 'labels':
-                column_info = {'features': [str(int)] * predict_shape[1]}
-            else:
-                column_info = {'features': [str(float)] * predict_shape[1]}
-
-        # Add information about target
-        target_shape = output_data.target.shape if output_data.target is not None else None
-        if target_shape is None:
-            # There is no target column in output data
-            output_data.supplementary_data.column_types = column_info
-            return output_data
-
-        if is_regression_task or is_ts_forecasting_task:
-            if len(target_shape) > 1:
-                column_info.update({'target': [str(float)] * target_shape[1]})
-            else:
-                # Array present "time series"
-                column_info.update({'target': [str(float)] * len(output_data.target)})
-        else:
-            # Classification task or clustering
-            column_info.update({'target': [str(int)] * predict_shape[1]})
-
-        output_data.supplementary_data.column_types = column_info
-        return output_data
+import numpy as np
+
+from fedot.core.data.data import OutputData
+from fedot.core.operations.operation import Operation
+from fedot.core.repository.dataset_types import DataTypesEnum
+from fedot.core.repository.operation_types_repository import OperationTypesRepository
+from fedot.core.repository.tasks import TaskTypesEnum
+from fedot.preprocessing.data_types import TYPE_TO_ID
+
+
+class Model(Operation):
+    """Class with ``fit``/``predict`` methods defining the evaluation strategy for the task
+    Args:
+        operation_type: name of the model
+    """
+
+    def __init__(self, operation_type: str):
+        super().__init__(operation_type=operation_type)
+        self.operations_repo = OperationTypesRepository('model')
+
+    @staticmethod
+    def assign_tabular_column_types(output_data: OutputData, output_mode: str) -> OutputData:
+        """Assign types for tabular data obtained from model predictions.\n
+        By default, all types of model predictions for tabular data can be clearly defined
+        """
+        if output_data.data_type is not DataTypesEnum.table:
+            # No column data types info for non-tabular data
+            return output_data
+
+        is_regression_task = output_data.task.task_type is TaskTypesEnum.regression
+        is_ts_forecasting_task = output_data.task.task_type is TaskTypesEnum.ts_forecasting
+
+        predict_shape = np.array(output_data.predict).shape
+        # Add information about features
+        if is_regression_task or is_ts_forecasting_task:
+            if len(predict_shape) < 2:
+                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[0]}
+            else:
+                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]}
+        else:
+            if len(predict_shape) < 2:
+                output_data.predict = output_data.predict.reshape((-1, 1))
+                predict_shape = output_data.predict.shape
+            # Classification task or clustering
+            if output_mode == 'labels':
+                column_info = {'features': [TYPE_TO_ID[int]] * predict_shape[1]}
+            else:
+                column_info = {'features': [TYPE_TO_ID[float]] * predict_shape[1]}
+
+        # Add information about target
+        target_shape = output_data.target.shape if output_data.target is not None else None
+        if target_shape is None:
+            # There is no target column in output data
+            output_data.supplementary_data.column_types = column_info
+            return output_data
+
+        if is_regression_task or is_ts_forecasting_task:
+            if len(target_shape) > 1:
+                column_info.update({'target': [TYPE_TO_ID[float]] * target_shape[1]})
+            else:
+                # Array present "time series"
+                column_info.update({'target': [TYPE_TO_ID[float]] * len(output_data.target)})
+        else:
+            # Classification task or clustering
+            column_info.update({'target': [TYPE_TO_ID[int]] * predict_shape[1]})
+
+        output_data.supplementary_data.column_types = column_info
+        return output_data
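A consequence of the labels/probabilities branch above: the same classifier yields different feature type IDs depending on output_mode. A small sketch with assumed shapes:

import numpy as np

probs = np.random.rand(10, 3)                    # class probability output
labels = probs.argmax(axis=1).reshape((-1, 1))   # label output, reshaped to 2D

# output_mode == 'labels' gives one int column; otherwise one float column per class
types_for_labels = [TYPE_TO_ID[int]] * labels.shape[1]   # 1 ID
types_for_probs = [TYPE_TO_ID[float]] * probs.shape[1]   # 3 IDs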
4 changes: 2 additions & 2 deletions fedot/preprocessing/categorical.py
@@ -7,7 +7,7 @@

from fedot.core.data.data import InputData
from fedot.core.data.data_preprocessing import find_categorical_columns
-from fedot.preprocessing.data_types import NAME_CLASS_INT, FEDOT_STR_NAN
+from fedot.preprocessing.data_types import TYPE_TO_ID, FEDOT_STR_NAN


class BinaryCategoricalPreprocessor:
@@ -91,7 +91,7 @@ def transform(self, input_data: InputData) -> InputData:
# Update features types
features_types = copied_data.supplementary_data.column_types['features']
for converted_column_id in self.binary_ids_to_convert:
-            features_types[converted_column_id] = NAME_CLASS_INT
+            features_types[converted_column_id] = TYPE_TO_ID[int]
return copied_data

def _train_encoder(self, column: pd.Series, column_id: int):
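After BinaryCategoricalPreprocessor label-encodes a binary string column, its entry in the features type list flips from the str ID to the int ID. A sketch with an illustrative two-column table (the index list is assumed to come from fit):

features_types = [TYPE_TO_ID[float], TYPE_TO_ID[str]]  # column 1 held 'yes'/'no'
binary_ids_to_convert = [1]                            # collected during fit

for converted_column_id in binary_ids_to_convert:
    features_types[converted_column_id] = TYPE_TO_ID[int]

# features_types is now [TYPE_TO_ID[float], TYPE_TO_ID[int]]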