From 5f3fbb258f4adf85f6889646ee57fe42f2d77f9b Mon Sep 17 00:00:00 2001 From: Sergey Kasyanov Date: Tue, 8 Aug 2023 14:49:32 +0300 Subject: [PATCH 01/14] Refactor data splitters logic Fix #1119 Add support of MultiModalData in cv_folds Delete some code that solves problems are solved in the new DataSourceSplitter or in new cv_folds --- fedot/api/api_utils/params.py | 3 +- fedot/core/data/data_split.py | 313 +++++++----------- .../objective/data_objective_advisor.py | 40 --- .../objective/data_source_splitter.py | 69 ++-- fedot/core/validation/split.py | 177 +++------- 5 files changed, 195 insertions(+), 407 deletions(-) delete mode 100644 fedot/core/optimisers/objective/data_objective_advisor.py diff --git a/fedot/api/api_utils/params.py b/fedot/api/api_utils/params.py index 718fabe28d..8cd24dfd10 100644 --- a/fedot/api/api_utils/params.py +++ b/fedot/api/api_utils/params.py @@ -56,9 +56,8 @@ def accept_and_apply_recommendations(self, input_data: Union[InputData, MultiMod input_data: data for preprocessing recommendations: dict with recommendations """ - # TODO fix multimodality + if isinstance(input_data, MultiModalData): - self['cv_folds'] = None # there are no support for multimodal data now for data_source_name, values in input_data.items(): self.accept_and_apply_recommendations(input_data[data_source_name], recommendations[data_source_name]) diff --git a/fedot/core/data/data_split.py b/fedot/core/data/data_split.py index f086cd9519..5fc86d5189 100644 --- a/fedot/core/data/data_split.py +++ b/fedot/core/data/data_split.py @@ -1,246 +1,155 @@ from copy import deepcopy -from typing import Tuple, Union +from typing import Tuple, Optional, Union +import numpy as np from sklearn.model_selection import train_test_split from fedot.core.data.data import InputData from fedot.core.data.multi_modal import MultiModalData from fedot.core.repository.dataset_types import DataTypesEnum -from fedot.core.repository.tasks import TaskTypesEnum -def _split_time_series(data: 
InputData, task, *args, **kwargs): - """ Split time series data into train and test parts - - :param data: InputData object to split - :param task: task to solve - """ - - input_features = data.features - input_target = data.target - forecast_length = task.task_params.forecast_length - - if kwargs.get('validation_blocks') is not None: - # It is required to split data for in-sample forecasting - forecast_length = forecast_length * kwargs.get('validation_blocks') - x_train = input_features[:-forecast_length] - x_test = input_features - - y_train = input_target[:-forecast_length] - y_test = input_target[-forecast_length:] +def _split_input_data_by_indexes(origin_input_data: Union[InputData, MultiModalData], index, reset_idx=False): + """ The function get InputData or MultiModalData and return + only data with indexes in index, not in idx + f.e. index = [0, 1, 2, 3] == input_data.features[[0, 1, 2, 3], :] + :param index: indexes that needed in output data + :param reset_idx: set to True for idx is range (0, len(data)) + """ + + if isinstance(origin_input_data, MultiModalData): + data = MultiModalData() + for key in origin_input_data: + data[key] = _split_input_data_by_indexes(origin_input_data[key], + index=index, + reset_idx=reset_idx) + return data + elif isinstance(origin_input_data, InputData): + target = np.take(origin_input_data.target, index, 0) + features = np.take(origin_input_data.features, index, 0) + + if reset_idx: + idx = np.arange(0, len(target)) + else: + idx = np.take(origin_input_data.idx, index, 0) + + data = InputData(idx=idx, + features=features, + target=target, + task=deepcopy(origin_input_data.task), + data_type=origin_input_data.data_type, + supplementary_data=origin_input_data.supplementary_data) + return data else: - # Source time series divide into two parts - x_train = input_features[:-forecast_length] - x_test = input_features[:-forecast_length] - - y_train = input_target[:-forecast_length] - y_test = input_target[-forecast_length:] - - 
idx_train = data.idx[:-forecast_length] - idx_test = data.idx[-forecast_length:] - - # Prepare data to train the operation - train_data = InputData(idx=idx_train, features=x_train, target=y_train, - task=task, data_type=DataTypesEnum.ts, - supplementary_data=data.supplementary_data) - - test_data = InputData(idx=idx_test, features=x_test, target=y_test, - task=task, data_type=DataTypesEnum.ts, - supplementary_data=data.supplementary_data) - return train_data, test_data + raise TypeError(f'Unknown data type {type(origin_input_data)}') -def _split_multi_time_series(data: InputData, task, *args, **kwargs): - """ Split multi_ts time series data into train and test parts +def _split_time_series(data: InputData, + validation_blocks: Optional[int] = None, + **kwargs): + """ Split time series data into train and test parts :param data: InputData object to split - :param task: task to solve + :param validation_blocks: validation blocks are used for test """ - input_features = data.features - input_target = data.target - forecast_length = task.task_params.forecast_length - - if kwargs.get('validation_blocks') is not None: - # It is required to split data for in-sample forecasting - forecast_length = forecast_length * kwargs.get('validation_blocks') - x_train = input_features[:-forecast_length] - x_test = input_features + forecast_length = data.task.task_params.forecast_length + if validation_blocks is not None: + forecast_length *= validation_blocks - y_train = input_target[:-forecast_length] - y_test = input_target[-forecast_length:, 0] + target_length = len(data.target) + train_data = _split_input_data_by_indexes(data, index=np.arange(0, target_length - forecast_length)) + test_data = _split_input_data_by_indexes(data, index=np.arange(target_length - forecast_length, target_length)) + if validation_blocks is None: + # for in-sample + test_data.features = train_data.features else: - # Source time series divide into two parts - x_train = input_features[:-forecast_length] - 
x_test = input_features[:-forecast_length] - - y_train = input_target[:-forecast_length] - y_test = input_target[-forecast_length:, 0] - - idx_train = data.idx[:-forecast_length] - idx_test = data.idx[-forecast_length:] - - # Prepare data to train the operation - train_data = InputData(idx=idx_train, features=x_train, target=y_train, - task=task, data_type=DataTypesEnum.multi_ts, - supplementary_data=data.supplementary_data) - - test_data = InputData(idx=idx_test, features=x_test, target=y_test, - task=task, data_type=DataTypesEnum.multi_ts, - supplementary_data=data.supplementary_data) + # for out-of-sample + test_data.features = data.features return train_data, test_data -def _split_any(data: InputData, task, data_type, split_ratio, with_shuffle=False, **kwargs): - """ Split any data into train and test parts +def _split_any(data: InputData, + split_ratio: float, + shuffle: Optional[bool] = None, + stratify: Optional[bool] = None, + random_seed: Optional[int] = None, + **kwargs): + """ Split any data except timeseries into train and test parts :param data: InputData object to split - :param task: task to solve - :param split_ratio: threshold for partitioning - :param data_type type of data to split - :param with_shuffle: is data needed to be shuffled or not + :param split_ratio: share of train data between 0 and 1 + :param shuffle: is data needed to be shuffled or not + :param stratify: make stratified sample or not + :param random_seed: random_seed for shuffle """ - if not 0. < split_ratio < 1.: - raise ValueError('Split ratio must belong to the interval (0; 1)') - random_state = 42 - - # Predictors and target - input_features = data.features - input_target = data.target - idx = data.idx - if task.task_type == TaskTypesEnum.classification and with_shuffle: - stratify = input_target + if stratify and shuffle: + stratify_labels = data.target + test_size = round(len(data.target) * (1. 
- split_ratio)) + labels_num = np.unique(stratify_labels).shape[0] + if test_size < labels_num: + split_ratio = 1 - labels_num / len(data.target) else: - stratify = None + stratify_labels = None - idx_train, idx_test, x_train, x_test, y_train, y_test = \ - train_test_split(idx, - input_features, - input_target, - test_size=1. - split_ratio, - shuffle=with_shuffle, - random_state=random_state, - stratify=stratify) + train_ids, test_ids = train_test_split(np.arange(0, len(data.target)), + test_size=1. - split_ratio, + shuffle=shuffle, + random_state=random_seed, + stratify=stratify_labels) # Prepare data to train the operation - train_data = InputData(idx=idx_train, features=x_train, target=y_train, - task=task, data_type=data_type, - supplementary_data=data.supplementary_data) - - test_data = InputData(idx=idx_test, features=x_test, target=y_test, - task=task, data_type=data_type, - supplementary_data=data.supplementary_data) - - return train_data, test_data - - -def _split_table(data: InputData, task, split_ratio, with_shuffle=False, **kwargs): - """ Split table data into train and test parts - - :param data: InputData object to split - :param task: task to solve - :param split_ratio: threshold for partitioning - :param with_shuffle: is data needed to be shuffled or not - """ - return _split_any(data, task, DataTypesEnum.table, split_ratio, with_shuffle) - - -def _split_image(data: InputData, task, split_ratio, with_shuffle=False, **kwargs): - """ Split image data into train and test parts - - :param data: InputData object to split - :param task: task to solve - :param split_ratio: threshold for partitioning - :param with_shuffle: is data needed to be shuffled or not - """ - - return _split_any(data, task, DataTypesEnum.image, split_ratio, with_shuffle) - - -def _split_text(data: InputData, task, split_ratio, with_shuffle=False, **kwargs): - """ Split text data into train and test parts - - :param data: InputData object to split - :param task: task to solve - 
:param split_ratio: threshold for partitioning - :param with_shuffle: is data needed to be shuffled or not - """ - - return _split_any(data, task, DataTypesEnum.text, split_ratio, with_shuffle) - - -def _train_test_single_data_setup(data: InputData, split_ratio=0.8, - shuffle_flag=False, **kwargs) -> Tuple[InputData, InputData]: - """ Function for train and test split - - :param data: InputData for train and test splitting - :param split_ratio: threshold for partitioning - :param shuffle_flag: is data needed to be shuffled or not - - :return train_data: InputData for train - :return test_data: InputData for validation - """ - # Split into train and test - if data is not None: - task = data.task - - split_func_dict = { - DataTypesEnum.multi_ts: _split_multi_time_series, - DataTypesEnum.ts: _split_time_series, - DataTypesEnum.table: _split_table, - DataTypesEnum.image: _split_image, - DataTypesEnum.text: _split_text - } - - split_func = split_func_dict.get(data.data_type, _split_table) - - train_data, test_data = split_func(data, task, split_ratio, - with_shuffle=shuffle_flag, - **kwargs) - else: - raise ValueError('InputData must be not empty') - - # Store additional information - train_data.supplementary_data = deepcopy(data.supplementary_data) - test_data.supplementary_data = deepcopy(data.supplementary_data) - return train_data, test_data - - -def _train_test_multi_modal_data_setup(data: MultiModalData, split_ratio=0.8, - shuffle_flag=False, **kwargs) -> Tuple[MultiModalData, MultiModalData]: - train_data = MultiModalData() - test_data = MultiModalData() - for node in data.keys(): - data_part = data[node] - train_data_part, test_data_part = train_test_data_setup(data_part, split_ratio, shuffle_flag, **kwargs) - train_data[node] = train_data_part - test_data[node] = test_data_part + train_data = _split_input_data_by_indexes(data, index=train_ids) + test_data = _split_input_data_by_indexes(data, index=test_ids) return train_data, test_data -def 
train_test_data_setup(data: Union[InputData, MultiModalData], split_ratio=0.8, - shuffle_flag=False, **kwargs) -> Tuple[Union[InputData, MultiModalData], - Union[InputData, MultiModalData]]: +def train_test_data_setup(data: Union[InputData, MultiModalData], + split_ratio: float = 0.8, + shuffle: bool = False, + stratify: bool = True, + random_seed: int = 42, + validation_blocks: Optional[int] = None) -> Tuple[Union[InputData, MultiModalData], + Union[InputData, MultiModalData]]: """ Function for train and test split for both InputData and MultiModalData - Args: - data: data for train and test splitting - split_ratio: threshold for partitioning - shuffle_flag: is data needed to be shuffled or not - kwargs: additional optional parameters such as number of validation blocks + :param data: InputData object to split + :param split_ratio: share of train data between 0 and 1 + :param shuffle: is data needed to be shuffled or not + :param stratify: make stratified sample or not + :param random_seed: random_seed for shuffle + :param validation_blocks: validation blocks are used for test - Returns: - data for train, data for validation + :return: data for train, data for validation """ + input_arguments = {'split_ratio': split_ratio, + 'shuffle': shuffle, + 'stratify': stratify, + 'random_seed': random_seed, + 'validation_blocks': validation_blocks} if isinstance(data, InputData): - train_data, test_data = _train_test_single_data_setup(data, split_ratio, shuffle_flag, **kwargs) + split_func_dict = {DataTypesEnum.multi_ts: _split_time_series, + DataTypesEnum.ts: _split_time_series, + DataTypesEnum.table: _split_any, + DataTypesEnum.image: _split_any, + DataTypesEnum.text: _split_any} + + if data.data_type not in split_func_dict: + raise TypeError((f'Unknown data type {type(data)}. 
Supported data types:' + f' {", ".join(str(x) for x in split_func_dict)}')) + + split_func = split_func_dict[data.data_type] + train_data, test_data = split_func(data, **input_arguments) elif isinstance(data, MultiModalData): - train_data, test_data = _train_test_multi_modal_data_setup(data, split_ratio, shuffle_flag, **kwargs) + train_data, test_data = MultiModalData(), MultiModalData() + for node in data.keys(): + train_data[node], test_data[node] = train_test_data_setup(data[node], **input_arguments) else: - raise ValueError(f'Dataset {type(data)} is not supported') + raise ValueError((f'Dataset {type(data)} is not supported. Supported types:' + 'InputData, MultiModalData')) return train_data, test_data diff --git a/fedot/core/optimisers/objective/data_objective_advisor.py b/fedot/core/optimisers/objective/data_objective_advisor.py deleted file mode 100644 index 48972aaf8a..0000000000 --- a/fedot/core/optimisers/objective/data_objective_advisor.py +++ /dev/null @@ -1,40 +0,0 @@ -import numpy as np -from sklearn.model_selection import StratifiedKFold, KFold -from sklearn.model_selection._split import _BaseKFold -from typing import Type - -from fedot.core.data.data import InputData -from fedot.core.repository.tasks import TaskTypesEnum - - -class DataObjectiveAdvisor: - def __init__(self, threshold: float = 0.5): - """ - Advisor for DataObjectiveBuilder for choice some parameters based on input_data - - :param threshold: threshold level for difference between uniform probabilities and real probabilities - """ - self.threshold = threshold - - def propose_kfold(self, input_data: InputData) -> Type[_BaseKFold]: - """ - Method to choose he most suitable strategy for making folds - - :param input_data: data to analyse - """ - if input_data.task.task_type is TaskTypesEnum.classification and self.check_imbalance(input_data): - return StratifiedKFold - else: - return KFold - - def check_imbalance(self, input_data: InputData) -> bool: - """ - Checks data for imbalance - if 
probability of any class lower than uniform probability in threshold times it - returns true - :param input_data: data to analyse - - """ - _, counts = np.unique(input_data.target, return_counts=True) - probabilities = counts / input_data.target.shape[0] - uniform_probability = 1 / input_data.num_classes - return np.any(np.abs(uniform_probability - probabilities) / uniform_probability > self.threshold) diff --git a/fedot/core/optimisers/objective/data_source_splitter.py b/fedot/core/optimisers/objective/data_source_splitter.py index 756c2151d8..88d17a329e 100644 --- a/fedot/core/optimisers/objective/data_source_splitter.py +++ b/fedot/core/optimisers/objective/data_source_splitter.py @@ -1,5 +1,5 @@ from functools import partial -from typing import Optional +from typing import Optional, Union from golem.core.log import default_log @@ -7,11 +7,10 @@ from fedot.core.data.data import InputData from fedot.core.data.data_split import train_test_data_setup from fedot.core.data.multi_modal import MultiModalData -from fedot.core.optimisers.objective.data_objective_advisor import DataObjectiveAdvisor from fedot.core.optimisers.objective.data_objective_eval import DataSource from fedot.core.repository.tasks import TaskTypesEnum -from fedot.core.validation.split import tabular_cv_generator, ts_cv_generator from fedot.remote.remote_evaluator import RemoteEvaluator, init_data_for_remote_execution +from fedot.core.validation.split import cv_generator class DataSourceSplitter: @@ -31,32 +30,45 @@ def __init__(self, cv_folds: Optional[int] = None, validation_blocks: Optional[int] = None, split_ratio: Optional[float] = None, - shuffle: bool = False): + shuffle: bool = True, + stratify: bool = True, + random_seed: int = 42): self.cv_folds = cv_folds self.validation_blocks = validation_blocks self.split_ratio = split_ratio self.shuffle = shuffle - self.advisor = DataObjectiveAdvisor() + self.stratify = stratify + self.random_seed = random_seed self.log = default_log(self) - def 
build(self, data: InputData) -> DataSource: - # Shuffle data - if self.shuffle and data.task.task_type is not TaskTypesEnum.ts_forecasting: - data.shuffle() - + def build(self, data: Union[InputData, MultiModalData]) -> DataSource: # Check split_ratio - split_ratio = self.split_ratio or default_data_split_ratio_by_task[data.task.task_type] - if not (0 < split_ratio < 1): - raise ValueError(f'split_ratio is {split_ratio} but should be between 0 and 1') + self.split_ratio = self.split_ratio or default_data_split_ratio_by_task[data.task.task_type] + if not (0 < self.split_ratio < 1): + raise ValueError(f'split_ratio is {self.split_ratio} but should be between 0 and 1') + + # Check cv_folds and do holdout if cv_folds less than 2 + if self.cv_folds is not None and self.cv_folds < 2: + self.cv_folds = None + + # Calculate the number of validation blocks for timeseries forecasting + if data.task.task_type is TaskTypesEnum.ts_forecasting and self.validation_blocks is None: + self._propose_cv_folds_and_validation_blocks(data, self.split_ratio) - # Calculate the number of validation blocks - if self.validation_blocks is None and data.task.task_type is TaskTypesEnum.ts_forecasting: - self._propose_cv_folds_and_validation_blocks(data, split_ratio) + # Forbid stratify for nonclassification tasks + if data.task.task_type is not TaskTypesEnum.classification: + self.stratify = False # Split data if self.cv_folds is not None: self.log.info("K-folds cross validation is applied.") - data_producer = self._build_kfolds_producer(data) + data_producer = partial(cv_generator, + data=data, + shuffle=self.shuffle, + cv_folds=self.cv_folds, + random_seed=self.random_seed, + stratify=self.stratify, + validation_blocks=self.validation_blocks) else: self.log.info("Hold out validation is applied.") data_producer = self._build_holdout_producer(data) @@ -73,29 +85,18 @@ def _build_holdout_producer(self, data: InputData) -> DataSource: that always returns same data split. 
Equivalent to 1-fold validation. """ - split_ratio = self.split_ratio or default_data_split_ratio_by_task[data.task.task_type] - train_data, test_data = train_test_data_setup(data, split_ratio, validation_blocks=self.validation_blocks) + train_data, test_data = train_test_data_setup(data, + split_ratio=self.split_ratio, + stratify=self.stratify, + random_seed=self.random_seed, + shuffle=self.shuffle, + validation_blocks=self.validation_blocks) if RemoteEvaluator().is_enabled: init_data_for_remote_execution(train_data) return partial(self._data_producer, train_data, test_data) - def _build_kfolds_producer(self, data: InputData) -> DataSource: - if isinstance(data, MultiModalData): - raise NotImplementedError('Cross-validation is not supported for multi-modal data') - if data.task.task_type is TaskTypesEnum.ts_forecasting: - # Perform time series cross validation - cv_generator = partial(ts_cv_generator, data, - self.cv_folds, - self.validation_blocks, - self.log) - else: - cv_generator = partial(tabular_cv_generator, data, - self.cv_folds, - self.advisor.propose_kfold(data)) - return cv_generator - def _propose_cv_folds_and_validation_blocks(self, data, split_ratio): data_shape = data.target.shape[0] forecast_length = data.task.task_params.forecast_length diff --git a/fedot/core/validation/split.py b/fedot/core/validation/split.py index b5bab02a7a..e6a1aaca1a 100644 --- a/fedot/core/validation/split.py +++ b/fedot/core/validation/split.py @@ -1,27 +1,14 @@ -from typing import Iterator, Optional, Tuple, Type +from typing import Iterator, Optional, Tuple, Union import numpy as np -from golem.core.log import LoggerAdapter, default_log + +from fedot.core.data.multi_modal import MultiModalData +from fedot.core.repository.tasks import TaskTypesEnum from sklearn.model_selection import KFold, TimeSeriesSplit -from sklearn.model_selection._split import _BaseKFold +from sklearn.model_selection._split import StratifiedKFold from fedot.core.data.data import InputData -from 
fedot.core.data.data_split import train_test_data_setup -from fedot.core.repository.dataset_types import DataTypesEnum - - -class OneFoldInputDataSplit: - """ Perform one fold split (hold out) for InputData structures """ - - def __init__(self): - pass - - @staticmethod - def input_split(input_data: InputData, **kwargs): - # Train test split - train_input, test_input = train_test_data_setup(input_data, **kwargs) - - yield train_input, test_input +from fedot.core.data.data_split import _split_input_data_by_indexes class TsInputDataSplit(TimeSeriesSplit): @@ -40,127 +27,59 @@ class TsInputDataSplit(TimeSeriesSplit): train - [1, 2, 3, 4, 5, 6, 7, 8] test - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] """ - def __init__(self, validation_blocks: int, **params): - super().__init__(**params) - self.validation_blocks = validation_blocks - self.params = params + def __init__(self, n_splits: int, test_size: int): + super().__init__(gap=0, n_splits=n_splits, test_size=test_size) - def input_split(self, input_data: InputData) -> Iterator[Tuple[InputData, InputData]]: - """ Splitting into datasets for train and validation using + def split(self, data: np.ndarray, *args) -> Iterator[Tuple[InputData, InputData]]: + """ Define indexes for train and validation using "in-sample forecasting" algorithm - :param input_data: InputData for splitting + :param data: InputData for splitting """ - # Transform InputData into numpy array - data_for_split = np.array(input_data.target) - - for train_ids, test_ids in super().split(data_for_split): - if len(train_ids) <= len(test_ids): - raise ValueError("Train size will be too small with selected number of folds and validation blocks") - # Return train part by ids - train_features, train_target = _ts_data_by_index(train_ids, train_ids, input_data) - train_data = InputData(idx=np.arange(0, len(train_target)), - features=train_features, target=train_target, - task=input_data.task, - data_type=input_data.data_type, - 
supplementary_data=input_data.supplementary_data) - # Unit all ids for "in-sample validation" - all_ids = np.hstack((train_ids, test_ids)) - # In-sample validation dataset - val_features, val_target = _ts_data_by_index(all_ids, all_ids, input_data) - validation_data = InputData(idx=np.arange(0, len(val_target)), - features=val_features, target=val_target, - task=input_data.task, - data_type=input_data.data_type, - supplementary_data=input_data.supplementary_data) + for train_ids, test_ids in super().split(data): + new_test_ids = np.hstack((train_ids, test_ids)) + yield train_ids, new_test_ids - yield train_data, validation_data - -def tabular_cv_generator(data: InputData, - folds: int, - split_method: Type[_BaseKFold] = KFold) -> Iterator[Tuple[InputData, InputData]]: +def cv_generator(data: Union[InputData, MultiModalData], + cv_folds: Optional[int] = None, + shuffle: bool = False, + random_seed: int = 42, + stratify: bool = True, + validation_blocks: Optional[int] = None) -> Iterator[Tuple[Union[InputData, MultiModalData], + Union[InputData, MultiModalData]]]: """ The function for splitting data into a train and test samples - in the InputData format for KFolds cross validation. The function + in the InputData format for cross validation. The function return a generator of tuples, consisting of a pair of train, test. :param data: InputData for train and test splitting - :param folds: number of folds - :param split_method: method to split data (f.e. 
stratify KFold) - - :return Iterator[InputData, InputData]: return split train/test data + :param shuffle: is data need shuffle + :param cv_folds: number of folds + :param random_seed: random seed for shuffle + :param stratify: `True` to make stratified samples for classification task + :param validation_blocks: validation blocks for timeseries data, + + :return Iterator[Tuple[Union[InputData, MultiModalData], + Union[InputData, MultiModalData]]]: return split train/test data """ - kf = split_method(n_splits=folds, shuffle=True, random_state=42) - - for train_idxs, test_idxs in kf.split(data.features, data.target): - train_features, train_target = _table_data_by_index(train_idxs, data) - test_features, test_target = _table_data_by_index(test_idxs, data) - - idx_for_train = np.arange(0, len(train_features)) - idx_for_test = np.arange(0, len(test_features)) - - train_data = InputData(idx=idx_for_train, - features=train_features, - target=train_target, - task=data.task, - data_type=data.data_type, - supplementary_data=data.supplementary_data) - test_data = InputData(idx=idx_for_test, - features=test_features, - target=test_target, - task=data.task, - data_type=data.data_type, - supplementary_data=data.supplementary_data) + # Define base class for generate cv folds + if data.task.task_type is TaskTypesEnum.ts_forecasting: + if validation_blocks is None: + raise ValueError('validation_blocks is None') + horizon = data.task.task_params.forecast_length * validation_blocks + kf = TsInputDataSplit(n_splits=cv_folds, test_size=horizon) + reset_idx = True + elif data.task.task_type is TaskTypesEnum.classification and stratify: + kf = StratifiedKFold(n_splits=cv_folds, shuffle=shuffle, random_state=random_seed) + reset_idx = False + else: + kf = KFold(n_splits=cv_folds, shuffle=shuffle, random_state=random_seed) + reset_idx = False + + # Split + for train_ids, test_ids in kf.split(data.target, data.target): + train_data = _split_input_data_by_indexes(data, train_ids, 
reset_idx=reset_idx) + test_data = _split_input_data_by_indexes(data, test_ids, reset_idx=reset_idx) yield train_data, test_data - - -def ts_cv_generator(data: InputData, folds: int, - validation_blocks: int = 1, log: Optional[LoggerAdapter] = None) \ - -> Iterator[Tuple[InputData, InputData]]: - """ Splitting data for time series cross validation - - :param data: source InputData with time series data type - :param folds: number of folds - :param validation_blocks: number of validation block per each fold - :param log: log object - """ - if not log: - log = default_log(prefix=__name__) - validation_blocks = int(validation_blocks) - # Forecast horizon for each fold - horizon = data.task.task_params.forecast_length * validation_blocks - - try: - tscv = TsInputDataSplit(gap=0, validation_blocks=validation_blocks, - n_splits=folds, test_size=horizon) - for train_data, test_data in tscv.input_split(data): - yield train_data, test_data - except ValueError: - log.info(f'Time series length too small for cross validation with {folds} folds. 
Perform one fold validation') - # Perform one fold validation (folds parameter will be ignored) - - one_fold_split = OneFoldInputDataSplit() - for train_data, test_data in one_fold_split.input_split(data, validation_blocks=validation_blocks): - yield train_data, test_data - - -def _table_data_by_index(index, values: InputData): - """ Allow to get tabular data by indexes of elements """ - features = values.features[index, :] - target = np.take(values.target, index) - - return features, target - - -def _ts_data_by_index(train_ids, test_ids, data): - """ Allow to get time series data by indexes of elements """ - features = data.features[train_ids] - target = data.target[test_ids] - - # Use only the first time-series as target for multi_ts - if data.data_type == DataTypesEnum.multi_ts: - target = target[:, 0] - - return features, target From b5444f31e0a44a7c8a2d2973a798c398cad202a2 Mon Sep 17 00:00:00 2001 From: Sergey Kasyanov Date: Tue, 8 Aug 2023 14:51:12 +0300 Subject: [PATCH 02/14] Rename module split.py to cv_folds.py --- fedot/core/optimisers/objective/data_source_splitter.py | 2 +- fedot/core/validation/{split.py => cv_folds.py} | 0 test/unit/data/test_data_split.py | 2 +- test/unit/optimizer/test_pipeline_objective_eval.py | 2 +- test/unit/validation/test_table_cv.py | 2 +- test/unit/validation/test_time_series_cv.py | 2 +- 6 files changed, 5 insertions(+), 5 deletions(-) rename fedot/core/validation/{split.py => cv_folds.py} (100%) diff --git a/fedot/core/optimisers/objective/data_source_splitter.py b/fedot/core/optimisers/objective/data_source_splitter.py index 88d17a329e..addc09d4cb 100644 --- a/fedot/core/optimisers/objective/data_source_splitter.py +++ b/fedot/core/optimisers/objective/data_source_splitter.py @@ -10,7 +10,7 @@ from fedot.core.optimisers.objective.data_objective_eval import DataSource from fedot.core.repository.tasks import TaskTypesEnum from fedot.remote.remote_evaluator import RemoteEvaluator, init_data_for_remote_execution -from 
fedot.core.validation.split import cv_generator +from fedot.core.validation.cv_folds import cv_generator class DataSourceSplitter: diff --git a/fedot/core/validation/split.py b/fedot/core/validation/cv_folds.py similarity index 100% rename from fedot/core/validation/split.py rename to fedot/core/validation/cv_folds.py diff --git a/test/unit/data/test_data_split.py b/test/unit/data/test_data_split.py index 5cf904921e..fb1865cf29 100644 --- a/test/unit/data/test_data_split.py +++ b/test/unit/data/test_data_split.py @@ -12,7 +12,7 @@ from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.tasks import Task, TaskTypesEnum, TsForecastingParams from fedot.core.utils import split_data -from fedot.core.validation.split import tabular_cv_generator, ts_cv_generator +from fedot.core.validation.cv_folds import tabular_cv_generator, ts_cv_generator from test.unit.pipelines.test_decompose_pipelines import get_classification_data from test.unit.tasks.test_forecasting import get_ts_data diff --git a/test/unit/optimizer/test_pipeline_objective_eval.py b/test/unit/optimizer/test_pipeline_objective_eval.py index cfb3f86444..6e176ec4bf 100644 --- a/test/unit/optimizer/test_pipeline_objective_eval.py +++ b/test/unit/optimizer/test_pipeline_objective_eval.py @@ -17,7 +17,7 @@ from fedot.core.repository.quality_metrics_repository import ClassificationMetricsEnum, MetricsRepository, \ RegressionMetricsEnum from fedot.core.repository.tasks import Task, TaskTypesEnum -from fedot.core.validation.split import tabular_cv_generator, OneFoldInputDataSplit +from fedot.core.validation.cv_folds import tabular_cv_generator, OneFoldInputDataSplit from test.integration.models.test_model import classification_dataset from test.unit.tasks.test_forecasting import get_simple_ts_pipeline from test.unit.validation.test_table_cv import sample_pipeline diff --git a/test/unit/validation/test_table_cv.py b/test/unit/validation/test_table_cv.py index 17eff41c7b..6b4df58eac 100644 
--- a/test/unit/validation/test_table_cv.py +++ b/test/unit/validation/test_table_cv.py @@ -22,7 +22,7 @@ from fedot.core.repository.quality_metrics_repository import ClassificationMetricsEnum from fedot.core.repository.tasks import Task, TaskTypesEnum from fedot.core.utils import fedot_project_root -from fedot.core.validation.split import tabular_cv_generator +from fedot.core.validation.cv_folds import tabular_cv_generator from test.integration.models.test_model import classification_dataset from test.unit.tasks.test_classification import get_iris_data, pipeline_simple diff --git a/test/unit/validation/test_time_series_cv.py b/test/unit/validation/test_time_series_cv.py index 0fcba28d53..e96811e0b5 100644 --- a/test/unit/validation/test_time_series_cv.py +++ b/test/unit/validation/test_time_series_cv.py @@ -15,7 +15,7 @@ from fedot.core.repository.quality_metrics_repository import \ MetricsRepository, RegressionMetricsEnum from fedot.core.repository.tasks import TsForecastingParams -from fedot.core.validation.split import ts_cv_generator +from fedot.core.validation.cv_folds import ts_cv_generator from test.unit.tasks.test_forecasting import get_simple_ts_pipeline, get_ts_data log = default_log(prefix=__name__) From 2a3a17e5f147a0bd3e8f2067ad9b6a9bd6ccc1dd Mon Sep 17 00:00:00 2001 From: Sergey Kasyanov Date: Tue, 8 Aug 2023 14:56:36 +0300 Subject: [PATCH 03/14] Move cv_folds.py to fedot/core/data --- fedot/core/{validation => data}/cv_folds.py | 0 fedot/core/validation/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename fedot/core/{validation => data}/cv_folds.py (100%) delete mode 100644 fedot/core/validation/__init__.py diff --git a/fedot/core/validation/cv_folds.py b/fedot/core/data/cv_folds.py similarity index 100% rename from fedot/core/validation/cv_folds.py rename to fedot/core/data/cv_folds.py diff --git a/fedot/core/validation/__init__.py b/fedot/core/validation/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 From 
b73107780610b2a8fb2adae4d529d74e39b9c378 Mon Sep 17 00:00:00 2001 From: Sergey Kasyanov Date: Tue, 8 Aug 2023 17:44:11 +0300 Subject: [PATCH 04/14] Add tests Also some fixes in new code --- fedot/core/data/cv_folds.py | 4 +- fedot/core/data/data_split.py | 12 +- .../objective/data_source_splitter.py | 43 ++++--- test/unit/data/test_data_split.py | 107 ++++++++++++++++-- 4 files changed, 137 insertions(+), 29 deletions(-) diff --git a/fedot/core/data/cv_folds.py b/fedot/core/data/cv_folds.py index e6a1aaca1a..8b2db786fd 100644 --- a/fedot/core/data/cv_folds.py +++ b/fedot/core/data/cv_folds.py @@ -72,9 +72,11 @@ def cv_generator(data: Union[InputData, MultiModalData], kf = TsInputDataSplit(n_splits=cv_folds, test_size=horizon) reset_idx = True elif data.task.task_type is TaskTypesEnum.classification and stratify: - kf = StratifiedKFold(n_splits=cv_folds, shuffle=shuffle, random_state=random_seed) + kf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_seed) reset_idx = False else: + if not shuffle: + random_seed = None kf = KFold(n_splits=cv_folds, shuffle=shuffle, random_state=random_seed) reset_idx = False diff --git a/fedot/core/data/data_split.py b/fedot/core/data/data_split.py index 5fc86d5189..8ed142c99c 100644 --- a/fedot/core/data/data_split.py +++ b/fedot/core/data/data_split.py @@ -7,6 +7,7 @@ from fedot.core.data.data import InputData from fedot.core.data.multi_modal import MultiModalData from fedot.core.repository.dataset_types import DataTypesEnum +from fedot.core.repository.tasks import TaskTypesEnum def _split_input_data_by_indexes(origin_input_data: Union[InputData, MultiModalData], index, reset_idx=False): @@ -73,9 +74,9 @@ def _split_time_series(data: InputData, def _split_any(data: InputData, split_ratio: float, - shuffle: Optional[bool] = None, - stratify: Optional[bool] = None, - random_seed: Optional[int] = None, + shuffle: bool, + stratify: bool, + random_seed: int, **kwargs): """ Split any data except timeseries into train 
and test parts @@ -87,6 +88,7 @@ def _split_any(data: InputData, """ if stratify and shuffle: + # check that there are enough labels for stratify stratify_labels = data.target test_size = round(len(data.target) * (1. - split_ratio)) labels_num = np.unique(stratify_labels).shape[0] @@ -126,6 +128,10 @@ def train_test_data_setup(data: Union[InputData, MultiModalData], :return: data for train, data for validation """ + + if data.task.task_type is TaskTypesEnum.classification and stratify: + shuffle = True + input_arguments = {'split_ratio': split_ratio, 'shuffle': shuffle, 'stratify': stratify, diff --git a/fedot/core/optimisers/objective/data_source_splitter.py b/fedot/core/optimisers/objective/data_source_splitter.py index addc09d4cb..b52af67912 100644 --- a/fedot/core/optimisers/objective/data_source_splitter.py +++ b/fedot/core/optimisers/objective/data_source_splitter.py @@ -10,7 +10,7 @@ from fedot.core.optimisers.objective.data_objective_eval import DataSource from fedot.core.repository.tasks import TaskTypesEnum from fedot.remote.remote_evaluator import RemoteEvaluator, init_data_for_remote_execution -from fedot.core.validation.cv_folds import cv_generator +from fedot.core.data.cv_folds import cv_generator class DataSourceSplitter: @@ -42,22 +42,35 @@ def __init__(self, self.log = default_log(self) def build(self, data: Union[InputData, MultiModalData]) -> DataSource: - # Check split_ratio + # define split_ratio self.split_ratio = self.split_ratio or default_data_split_ratio_by_task[data.task.task_type] - if not (0 < self.split_ratio < 1): - raise ValueError(f'split_ratio is {self.split_ratio} but should be between 0 and 1') - # Check cv_folds and do holdout if cv_folds less than 2 - if self.cv_folds is not None and self.cv_folds < 2: - self.cv_folds = None + # Check cv_folds + if self.cv_folds is not None: + if not isinstance(self.cv_folds, int): + if self.cv_folds % 1 != 0: + raise ValueError(f"cv_folds is not integer: {self.cv_folds}") + self.cv_folds = 
int(self.cv_folds) + if self.cv_folds < 2: + self.cv_folds = None + if self.cv_folds > data.target.shape[0] - 1: + raise ValueError((f"cv_folds ({self.cv_folds}) is greater than" + f" the maximum allowed count {data.target.shape[0] - 1}")) # Calculate the number of validation blocks for timeseries forecasting if data.task.task_type is TaskTypesEnum.ts_forecasting and self.validation_blocks is None: - self._propose_cv_folds_and_validation_blocks(data, self.split_ratio) + self._propose_cv_folds_and_validation_blocks(data) + + # Check split_ratio + if self.cv_folds is None and not (0 < self.split_ratio < 1): + raise ValueError(f'split_ratio is {self.split_ratio} but should be between 0 and 1') - # Forbid stratify for nonclassification tasks if data.task.task_type is not TaskTypesEnum.classification: + # Forbid stratify for nonclassification tasks self.stratify = False + else: + if self.stratify: + self.shuffle = True # Split data if self.cv_folds is not None: @@ -97,7 +110,7 @@ def _build_holdout_producer(self, data: InputData) -> DataSource: return partial(self._data_producer, train_data, test_data) - def _propose_cv_folds_and_validation_blocks(self, data, split_ratio): + def _propose_cv_folds_and_validation_blocks(self, data): data_shape = data.target.shape[0] forecast_length = data.task.task_params.forecast_length # check that cv folds may be realized @@ -119,14 +132,14 @@ def _propose_cv_folds_and_validation_blocks(self, data, split_ratio): " Cross validation is switched off.")) if self.cv_folds is None: - test_shape = int(data_shape * (1 - split_ratio)) + test_shape = int(data_shape * (1 - self.split_ratio)) if forecast_length > test_shape: - split_ratio = 1 - forecast_length / data_shape + self.split_ratio = 1 - forecast_length / data_shape self.log.info((f"Forecast length ({forecast_length}) is greater than test length" f" ({test_shape}) defined by split ratio." 
- f" Split ratio is changed to {split_ratio}.")) - test_share = 1 - split_ratio - self.split_ratio = split_ratio + f" Split ratio is changed to {self.split_ratio}.")) + test_share = 1 - self.split_ratio + self.split_ratio = self.split_ratio else: test_share = 1 / (self.cv_folds + 1) self.validation_blocks = int(data_shape * test_share // forecast_length) diff --git a/test/unit/data/test_data_split.py b/test/unit/data/test_data_split.py index fb1865cf29..2d2264721c 100644 --- a/test/unit/data/test_data_split.py +++ b/test/unit/data/test_data_split.py @@ -12,7 +12,7 @@ from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.tasks import Task, TaskTypesEnum, TsForecastingParams from fedot.core.utils import split_data -from fedot.core.validation.cv_folds import tabular_cv_generator, ts_cv_generator +from fedot.core.data.cv_folds import cv_generator from test.unit.pipelines.test_decompose_pipelines import get_classification_data from test.unit.tasks.test_forecasting import get_ts_data @@ -22,10 +22,10 @@ IMAGE_SIMPLE = {'train_features_size': (8, 5, 5, 2), 'test_features_size': (2, 5, 5, 2), 'test_idx': (8, 9)} -def get_tabular_classification_data(): +def get_tabular_classification_data(length=10, class_count=2): task = Task(TaskTypesEnum.classification) - features = np.full((10, 5), 1, dtype=float) - target = np.repeat(np.array([1, 2]), 5).reshape((-1, 1)) + features = np.full((length, 5), 1, dtype=float) + target = np.repeat(np.array(list(range(1, class_count + 1))), length // class_count).reshape((-1, 1)) input_data = InputData(idx=np.arange(0, len(features)), features=features, target=target, task=task, data_type=DataTypesEnum.table) return input_data @@ -106,6 +106,18 @@ def get_balanced_data_to_test_mismatch(): return input_data +def check_shuffle(sample): + unique = np.unique(np.diff(sample.idx)) + test_result = len(unique) > 1 or np.min(unique) > 1 + return test_result + + +def check_stratify(train, test): + deltas = 
[np.unique(np.sort(train.target), return_counts=True)[1], + np.unique(np.sort(test.target), return_counts=True)[1]] + return np.allclose(*[delta / sum(delta) for delta in deltas]) + + def test_split_data(): dataframe = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6], @@ -126,7 +138,7 @@ def test_split_data(): def test_default_train_test_simple(data_generator: Callable, expected_output: dict): """ Check if simple splitting perform correctly for all used in FEDOT data types """ input_data = data_generator() - train_data, test_data = train_test_data_setup(input_data) + train_data, test_data = train_test_data_setup(input_data, stratify=False) assert train_data.features.shape == expected_output['train_features_size'] assert test_data.features.shape == expected_output['test_features_size'] @@ -149,7 +161,7 @@ def test_advanced_time_series_splitting(): # test StratifiedKFold [(DataSourceSplitter(cv_folds=3, shuffle=True), get_imbalanced_data_to_test_mismatch()), # test KFold - # (DataSourceSplitter(cv_folds=3, shuffle=True), get_balanced_data_to_test_mismatch()), + (DataSourceSplitter(cv_folds=3, shuffle=True), get_balanced_data_to_test_mismatch()), # test hold-out (DataSourceSplitter(shuffle=True), get_imbalanced_data_to_test_mismatch()), ]) @@ -192,19 +204,63 @@ def test_multivariate_time_series_splitting_correct(): assert len(test_series_data.features) == 20 assert np.allclose(test_series_data.target, np.array([16, 17, 18, 19])) +@pytest.mark.parametrize(('datas_funs', 'cv_folds', 'shuffle', 'stratify'), + [# classification + stratify + shuffle + cv_folds + ([partial(get_tabular_classification_data, 100, 5)] * 3, 2, True, True), + # classification + shuffle + cv_folds + ([partial(get_tabular_classification_data, 100, 5)] * 3, 2, True, False), + # classification + stratify + shuffle + ([partial(get_tabular_classification_data, 100, 5)] * 3, None, True, True), + # classification + shuffle + ([partial(get_tabular_classification_data, 100, 5)] * 3, None, True, False), + # 
classification + ([partial(get_tabular_classification_data, 100, 5)] * 3, None, False, False), + # timeseries + cv_folds + ([partial(get_ts_data_to_forecast, 10, 100)] * 3, 2, False, False), + # timeseries + ([partial(get_ts_data_to_forecast, 10, 100)] * 3, None, False, False), + ]) +def test_multimodal_data_splitting_is_correct(datas_funs, cv_folds, shuffle, stratify): + mdata = MultiModalData({f'data_{i}': data_fun() for i, data_fun in enumerate(datas_funs)}) + data_splitter = DataSourceSplitter(cv_folds=cv_folds, shuffle=shuffle, stratify=stratify) + data_producer = data_splitter.build(mdata) + keys = tuple(mdata.keys()) + + for samples in data_producer(): + for sample in samples: + assert isinstance(sample, MultiModalData) + + # keys should be the same + assert set(keys) == set(sample.keys()) + + # idx should be the same + idx = [np.reshape(x.idx, (-1, 1)) for x in sample.values()] + assert np.all(np.diff(np.concatenate(idx, 1), 1) == 0) + + # shuffle should be done + if shuffle: + for key in keys: + assert check_shuffle(sample[key]) + + # stratify should be done + if stratify: + for key in keys: + assert check_stratify(samples[0][key], samples[1][key]) + + @pytest.mark.parametrize("cv_generator, data", - [(partial(tabular_cv_generator, folds=5), + [(partial(cv_generator, cv_folds=5), get_classification_data()[0]), - (partial(ts_cv_generator, folds=3, validation_blocks=2), + (partial(cv_generator, cv_folds=3, validation_blocks=2), get_ts_data()[0])]) def test_cv_generator_works_stable(cv_generator, data): """ Test if ts cv generator works stable (always return same folds) """ idx_first = [] idx_second = [] - for row in cv_generator(data=data): + for row in cv_generator(data=data, stratify=False): idx_first.append(row[1].idx) - for row in cv_generator(data=data): + for row in cv_generator(data=data, stratify=False): idx_second.append(row[1].idx) for i in range(len(idx_first)): @@ -231,3 +287,34 @@ def 
test_data_splitting_defines_validation_blocks_correctly(forecast_length, cv_ assert data_source_splitter.cv_folds == check_cv_folds assert data_source_splitter.split_ratio == check_split_ratio assert data_source_splitter.validation_blocks == check_validation_blocks + + +@pytest.mark.parametrize(('cv_folds', 'shuffle', 'stratify', 'data_classes'), + [(2, True, True, 2), # simple case + (2, False, True, 2), # should work without error + (5, True, True, 4), # more folds and more classes + ]) +def test_stratify(cv_folds, shuffle, stratify, data_classes): + data = get_tabular_classification_data(length=100, class_count=data_classes) + data_splitter = DataSourceSplitter(cv_folds=cv_folds, shuffle=shuffle, stratify=stratify) + data_producer = data_splitter.build(data) + + for train, test in data_producer(): + assert check_stratify(train, test) + + +@pytest.mark.parametrize(('is_shuffle', 'shuffle', 'cv_folds', 'data'), + [(True, True, 2, + get_tabular_classification_data(length=100, class_count=4)), # cv_folds classification + (True, True, None, + get_tabular_classification_data(length=100, class_count=4)), # holdout classification + (False, True, 2, get_ts_data_to_forecast(10, 100)), # cv_folds timeseries + (False, True, None, get_ts_data_to_forecast(10, 100)), # holdout timeseries + ]) +def test_shuffle(is_shuffle, cv_folds, shuffle, data): + data_splitter = DataSourceSplitter(cv_folds=cv_folds, shuffle=shuffle, stratify=False) + data_producer = data_splitter.build(data) + + for samples in data_producer(): + for sample in samples: + assert check_shuffle(sample) == is_shuffle From 4950dbe972bfa6e7b931538bc5d2b606da5c9a69 Mon Sep 17 00:00:00 2001 From: Sergey Kasyanov Date: Wed, 9 Aug 2023 10:04:20 +0300 Subject: [PATCH 05/14] Add tests and fix some tests --- fedot/core/data/cv_folds.py | 2 +- test/unit/data/test_data_split.py | 18 ++++++---- .../optimizer/test_pipeline_objective_eval.py | 36 ++++++++++--------- test/unit/validation/test_table_cv.py | 26 +++----------- 
test/unit/validation/test_time_series_cv.py | 22 ++---------- 5 files changed, 40 insertions(+), 64 deletions(-) diff --git a/fedot/core/data/cv_folds.py b/fedot/core/data/cv_folds.py index 8b2db786fd..915599f831 100644 --- a/fedot/core/data/cv_folds.py +++ b/fedot/core/data/cv_folds.py @@ -43,7 +43,7 @@ def split(self, data: np.ndarray, *args) -> Iterator[Tuple[InputData, InputData] def cv_generator(data: Union[InputData, MultiModalData], - cv_folds: Optional[int] = None, + cv_folds: int, shuffle: bool = False, random_seed: int = 42, stratify: bool = True, diff --git a/test/unit/data/test_data_split.py b/test/unit/data/test_data_split.py index 2d2264721c..022003dfe9 100644 --- a/test/unit/data/test_data_split.py +++ b/test/unit/data/test_data_split.py @@ -158,11 +158,8 @@ def test_advanced_time_series_splitting(): @pytest.mark.parametrize('data_splitter, data', - # test StratifiedKFold [(DataSourceSplitter(cv_folds=3, shuffle=True), get_imbalanced_data_to_test_mismatch()), - # test KFold (DataSourceSplitter(cv_folds=3, shuffle=True), get_balanced_data_to_test_mismatch()), - # test hold-out (DataSourceSplitter(shuffle=True), get_imbalanced_data_to_test_mismatch()), ]) def test_data_splitting_without_shape_mismatch(data_splitter: DataSourceSplitter, data: InputData): @@ -204,11 +201,14 @@ def test_multivariate_time_series_splitting_correct(): assert len(test_series_data.features) == 20 assert np.allclose(test_series_data.target, np.array([16, 17, 18, 19])) + @pytest.mark.parametrize(('datas_funs', 'cv_folds', 'shuffle', 'stratify'), [# classification + stratify + shuffle + cv_folds - ([partial(get_tabular_classification_data, 100, 5)] * 3, 2, True, True), + ([partial(get_tabular_classification_data, 100, 5)] * 3, 4, True, True), # classification + shuffle + cv_folds - ([partial(get_tabular_classification_data, 100, 5)] * 3, 2, True, False), + ([partial(get_tabular_classification_data, 100, 5)] * 3, 4, True, False), + # classification + cv_folds + 
([partial(get_tabular_classification_data, 100, 5)] * 3, 4, False, False), # classification + stratify + shuffle ([partial(get_tabular_classification_data, 100, 5)] * 3, None, True, True), # classification + shuffle @@ -216,7 +216,7 @@ def test_multivariate_time_series_splitting_correct(): # classification ([partial(get_tabular_classification_data, 100, 5)] * 3, None, False, False), # timeseries + cv_folds - ([partial(get_ts_data_to_forecast, 10, 100)] * 3, 2, False, False), + ([partial(get_ts_data_to_forecast, 10, 100)] * 3, 3, False, False), # timeseries ([partial(get_ts_data_to_forecast, 10, 100)] * 3, None, False, False), ]) @@ -225,6 +225,7 @@ def test_multimodal_data_splitting_is_correct(datas_funs, cv_folds, shuffle, str data_splitter = DataSourceSplitter(cv_folds=cv_folds, shuffle=shuffle, stratify=stratify) data_producer = data_splitter.build(mdata) keys = tuple(mdata.keys()) + features_dimensity = [subdata.features.shape[1:] for subdata in mdata.values()] for samples in data_producer(): for sample in samples: @@ -237,6 +238,10 @@ def test_multimodal_data_splitting_is_correct(datas_funs, cv_folds, shuffle, str idx = [np.reshape(x.idx, (-1, 1)) for x in sample.values()] assert np.all(np.diff(np.concatenate(idx, 1), 1) == 0) + # dimensity of features should be the same + splitted_data_features_dimensity =[subdata.features.shape[1:] for subdata in sample.values()] + assert features_dimensity == splitted_data_features_dimensity + # shuffle should be done if shuffle: for key in keys: @@ -248,7 +253,6 @@ def test_multimodal_data_splitting_is_correct(datas_funs, cv_folds, shuffle, str assert check_stratify(samples[0][key], samples[1][key]) - @pytest.mark.parametrize("cv_generator, data", [(partial(cv_generator, cv_folds=5), get_classification_data()[0]), diff --git a/test/unit/optimizer/test_pipeline_objective_eval.py b/test/unit/optimizer/test_pipeline_objective_eval.py index 6e176ec4bf..052d67d667 100644 --- a/test/unit/optimizer/test_pipeline_objective_eval.py 
+++ b/test/unit/optimizer/test_pipeline_objective_eval.py @@ -17,7 +17,6 @@ from fedot.core.repository.quality_metrics_repository import ClassificationMetricsEnum, MetricsRepository, \ RegressionMetricsEnum from fedot.core.repository.tasks import Task, TaskTypesEnum -from fedot.core.validation.cv_folds import tabular_cv_generator, OneFoldInputDataSplit from test.integration.models.test_model import classification_dataset from test.unit.tasks.test_forecasting import get_simple_ts_pipeline from test.unit.validation.test_table_cv import sample_pipeline @@ -75,12 +74,12 @@ def empty_datasource(): ) def test_pipeline_objective_evaluate_with_different_metrics(classification_dataset, pipeline): for metric in ClassificationMetricsEnum: - one_fold_split = OneFoldInputDataSplit() - data_split = partial(one_fold_split.input_split, input_data=classification_dataset) + data_producer = DataSourceSplitter(cv_folds=None).build(classification_dataset) check_pipeline = deepcopy(pipeline) - objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), data_split) + objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), + data_producer=data_producer) fitness = objective_eval(pipeline) - act_fitness = actual_fitness(data_split, check_pipeline, metric) + act_fitness = actual_fitness(data_producer, check_pipeline, metric) assert fitness.valid assert fitness.value is not None assert np.isclose(fitness.value, act_fitness.value, atol=1e-8), metric.name @@ -88,11 +87,11 @@ def test_pipeline_objective_evaluate_with_different_metrics(classification_datas def test_pipeline_objective_evaluate_with_empty_pipeline(classification_dataset): pipeline = empty_pipeline() - - data_split = partial(OneFoldInputDataSplit().input_split, input_data=classification_dataset) + data_producer = DataSourceSplitter(cv_folds=None).build(classification_dataset) metric = ClassificationMetricsEnum.ROCAUC_penalty - objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), data_split) + 
objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), + data_producer=data_producer) with pytest.raises(AttributeError): objective_eval(pipeline) @@ -100,10 +99,11 @@ def test_pipeline_objective_evaluate_with_empty_pipeline(classification_dataset) def test_pipeline_objective_evaluate_with_cv_fold(classification_dataset): pipeline = sample_pipeline() - cv_fold = partial(tabular_cv_generator, classification_dataset, folds=5) + data_producer = DataSourceSplitter(cv_folds=5).build(classification_dataset) metric = ClassificationMetricsEnum.logloss - objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), cv_fold) + objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), + data_producer=data_producer) fitness = objective_eval(pipeline) assert fitness.valid assert fitness.value is not None @@ -123,16 +123,20 @@ def test_pipeline_objective_evaluate_with_empty_datasource(classification_datase def test_pipeline_objective_evaluate_with_time_constraint(classification_dataset): pipeline = sample_pipeline() - data_split = partial(OneFoldInputDataSplit().input_split, input_data=classification_dataset) + data_producer = DataSourceSplitter(cv_folds=None).build(classification_dataset) metric = ClassificationMetricsEnum.ROCAUC_penalty time_constraint = datetime.timedelta(seconds=0.0001) - objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), data_split, time_constraint=time_constraint) + objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), + data_producer=data_producer, + time_constraint=time_constraint) fitness = objective_eval(pipeline) assert not fitness.valid time_constraint = datetime.timedelta(seconds=300) - objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), data_split, time_constraint=time_constraint) + objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), + data_producer=data_producer, + time_constraint=time_constraint) fitness = objective_eval(pipeline) assert 
fitness.valid assert fitness.value is not None @@ -147,9 +151,9 @@ def test_pipeline_objective_evaluate_with_invalid_metrics(classification_dataset with pytest.raises(Exception): pipeline = sample_pipeline() - data_split = partial(OneFoldInputDataSplit().input_split, input_data=classification_dataset) - - objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metrics), data_split) + data_producer = DataSourceSplitter(cv_folds=None).build(classification_dataset) + objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metrics), + data_producer=data_producer) objective_eval(pipeline) diff --git a/test/unit/validation/test_table_cv.py b/test/unit/validation/test_table_cv.py index 6b4df58eac..7888f0f939 100644 --- a/test/unit/validation/test_table_cv.py +++ b/test/unit/validation/test_table_cv.py @@ -1,18 +1,16 @@ import logging from datetime import timedelta -from functools import partial import pytest + from golem.core.tuning.simultaneous import SimultaneousTuner from sklearn.metrics import roc_auc_score as roc_auc -from sklearn.model_selection import KFold, StratifiedKFold from fedot.api.main import Fedot from fedot.core.composer.composer_builder import ComposerBuilder from fedot.core.data.data import InputData from fedot.core.data.data_split import train_test_data_setup from fedot.core.optimisers.objective import PipelineObjectiveEvaluate -from fedot.core.optimisers.objective.data_objective_advisor import DataObjectiveAdvisor from fedot.core.optimisers.objective.metrics_objective import MetricsObjective from fedot.core.pipelines.node import PipelineNode from fedot.core.pipelines.pipeline import Pipeline @@ -22,7 +20,7 @@ from fedot.core.repository.quality_metrics_repository import ClassificationMetricsEnum from fedot.core.repository.tasks import Task, TaskTypesEnum from fedot.core.utils import fedot_project_root -from fedot.core.validation.cv_folds import tabular_cv_generator +from fedot.core.optimisers.objective.data_source_splitter import 
DataSourceSplitter from test.integration.models.test_model import classification_dataset from test.unit.tasks.test_classification import get_iris_data, pipeline_simple @@ -44,32 +42,18 @@ def get_classification_data(): def test_cv_multiple_metrics_evaluated_correct(classification_dataset): pipeline = sample_pipeline() - cv_folds = partial(tabular_cv_generator, classification_dataset, folds=5) + data_producer = DataSourceSplitter(cv_folds=5).build(classification_dataset) metrics = [ClassificationMetricsEnum.ROCAUC_penalty, ClassificationMetricsEnum.accuracy, ClassificationMetricsEnum.logloss] - objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metrics), cv_folds) + objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metrics), + data_producer=data_producer) actual_values = objective_eval(pipeline).values all_metrics_correct = all(0 < abs(x) <= 1 for x in actual_values) assert all_metrics_correct -def test_kfold_advisor_works_correct_in_balanced_case(): - data = get_classification_data() - advisor = DataObjectiveAdvisor() - split_type = advisor.propose_kfold(data) - assert split_type == KFold - - -def test_kfold_advisor_works_correct_in_imbalanced_case(): - data = get_classification_data() - data.target[:-int(len(data.target) * 0.1)] = 0 - advisor = DataObjectiveAdvisor() - split_type = advisor.propose_kfold(data) - assert split_type == StratifiedKFold - - def test_cv_min_kfolds_raise(): task = Task(task_type=TaskTypesEnum.classification) models_repo = OperationTypesRepository() diff --git a/test/unit/validation/test_time_series_cv.py b/test/unit/validation/test_time_series_cv.py index e96811e0b5..dac782f062 100644 --- a/test/unit/validation/test_time_series_cv.py +++ b/test/unit/validation/test_time_series_cv.py @@ -15,7 +15,7 @@ from fedot.core.repository.quality_metrics_repository import \ MetricsRepository, RegressionMetricsEnum from fedot.core.repository.tasks import TsForecastingParams -from fedot.core.validation.cv_folds import ts_cv_generator 
+from fedot.core.data.cv_folds import cv_generator from test.unit.tasks.test_forecasting import get_simple_ts_pipeline, get_ts_data log = default_log(prefix=__name__) @@ -50,7 +50,8 @@ def test_ts_cv_generator_correct(): validation_horizon = validation_elements_per_fold * folds i = 0 - for train_data, test_data in ts_cv_generator(time_series, folds, validation_blocks, log): + for train_data, test_data in cv_generator(time_series, cv_folds=folds, + validation_blocks=validation_blocks): train_len = len(train_data.idx) assert train_len == ts_len - validation_horizon validation_horizon -= validation_elements_per_fold @@ -58,23 +59,6 @@ def test_ts_cv_generator_correct(): assert i == folds -def test_cv_folds_too_large_correct(): - """ Checks whether cases where the number of folds is too large, causing - the number of elements to be validated to be greater than the number of elements - in the time series itself, are adequately handled - - In this case a hold-out validation with 1 fold and 3 validation blocks must be performed - """ - folds = 50 - forecast_len, validation_blocks, time_series = configure_experiment() - - i = 0 - for train_data, test_data in ts_cv_generator(time_series, folds, validation_blocks, log): - i += 1 - assert len(train_data.idx) == 85 - assert i == 1 - - def test_tuner_cv_correct(): """ Checks if the tuner works correctly when using cross validation for From c9b2928950521ac660f09e4289b295fea00adab4 Mon Sep 17 00:00:00 2001 From: Sergey Kasyanov Date: Wed, 9 Aug 2023 15:38:28 +0300 Subject: [PATCH 06/14] Fixes Fix calls of changed functions, add correct parameters to some test, fix data splitting functions --- .../kc2_sourcecode_defects_classification.py | 2 +- docs/source/basics/multi_modal_tasks.rst | 2 +- docs/source/basics/ts_forecasting.rst | 2 +- examples/advanced/automl/h2o_example.py | 2 +- examples/advanced/automl/tpot_example.py | 2 +- examples/advanced/multi_modal_pipeline.py | 2 +- .../advanced/multimodal_text_num_example.py | 2 +- 
fedot/core/data/cv_folds.py | 13 +--- fedot/core/data/data_split.py | 73 ++++++++++++++----- .../sklearn_transformations.py | 4 +- .../objective/data_source_splitter.py | 23 ++++-- test/integration/api/test_api_utils.py | 2 +- test/integration/api/test_main_api.py | 4 +- test/unit/data/test_data_split.py | 4 +- .../test_pipeline_preprocessing.py | 2 +- test/unit/preprocessing/test_preprocessors.py | 6 +- test/unit/tasks/test_classification.py | 8 +- 17 files changed, 97 insertions(+), 56 deletions(-) diff --git a/cases/kc2_sourcecode_defects_classification.py b/cases/kc2_sourcecode_defects_classification.py index e055cd384f..dbe3fbfeb2 100644 --- a/cases/kc2_sourcecode_defects_classification.py +++ b/cases/kc2_sourcecode_defects_classification.py @@ -17,7 +17,7 @@ def get_kc2_data(): encoded = (target == 'yes').astype(int) data.target = encoded - train, test = train_test_data_setup(data, shuffle_flag=True) + train, test = train_test_data_setup(data, shuffle=True) return train, test diff --git a/docs/source/basics/multi_modal_tasks.rst b/docs/source/basics/multi_modal_tasks.rst index b988f78f11..e7792da981 100644 --- a/docs/source/basics/multi_modal_tasks.rst +++ b/docs/source/basics/multi_modal_tasks.rst @@ -15,7 +15,7 @@ FEDOT's API supports multimodal data from the box. The only thing you need is to data = MultiModalData.from_csv(file_path='multimodal_dataset.csv', task='classification', target_columns='target_column', text_columns=['text_col1', 'text_col2'], columns_to_drop=['col_to_drop1', 'col_to_drop2'], index_col=None) - fit_data, predict_data = train_test_data_setup(data, shuffle_flag=True, split_ratio=0.7) + fit_data, predict_data = train_test_data_setup(data, shuffle=True, split_ratio=0.7) Using ``from_csv()`` method, you should define the task type, and target columns. FEDOT can find text columns automatically, but you can set them manually. You can also select columns which will be dropped from the original dataset. 
By default, FEDOT reads the first column of every dataset as an index column. If there is no index columns in the dataset, you should set ``index_col=None``. Initialize the FEDOT object and define the type of modeling problem. diff --git a/docs/source/basics/ts_forecasting.rst b/docs/source/basics/ts_forecasting.rst index 4a57a5729a..4d006d3fe2 100644 --- a/docs/source/basics/ts_forecasting.rst +++ b/docs/source/basics/ts_forecasting.rst @@ -188,7 +188,7 @@ Train test split ~~~~~~~~~~~~~~~~ To split InputData use ``train_test_data_setup`` method. -``split_ratio`` and ``shuffle_flag`` are ignored for time-series forecasting. +``split_ratio``, ``shuffle``, and ``stratify`` are ignored for time-series forecasting. .. autofunction:: fedot.core.data.data_split.train_test_data_setup diff --git a/examples/advanced/automl/h2o_example.py b/examples/advanced/automl/h2o_example.py index 4659561728..b098c223fb 100644 --- a/examples/advanced/automl/h2o_example.py +++ b/examples/advanced/automl/h2o_example.py @@ -51,7 +51,7 @@ def h2o_classification_pipeline_evaluation(): pipeline_path = "h2o_class" data = get_iris_data() pipeline = pipeline_h2o_class() - train_data, test_data = train_test_data_setup(data, shuffle_flag=True) + train_data, test_data = train_test_data_setup(data, shuffle=True) pipeline.fit(input_data=train_data) results = pipeline.predict(input_data=test_data, output_mode="full_probs") diff --git a/examples/advanced/automl/tpot_example.py b/examples/advanced/automl/tpot_example.py index 425b068482..2574c224a2 100644 --- a/examples/advanced/automl/tpot_example.py +++ b/examples/advanced/automl/tpot_example.py @@ -41,7 +41,7 @@ def tpot_classification_pipeline_evaluation(): pipeline_path = "tpot_class" data = get_iris_data() pipeline = pipeline_tpot_class() - train_data, test_data = train_test_data_setup(data, shuffle_flag=True) + train_data, test_data = train_test_data_setup(data, shuffle=True) pipeline.fit(input_data=train_data) results =
pipeline.predict(input_data=test_data, output_mode="full_probs") diff --git a/examples/advanced/multi_modal_pipeline.py b/examples/advanced/multi_modal_pipeline.py index 8780d3fc3f..933e9e05ab 100644 --- a/examples/advanced/multi_modal_pipeline.py +++ b/examples/advanced/multi_modal_pipeline.py @@ -74,7 +74,7 @@ def run_multi_modal_pipeline(files_path: str, visualization=False) -> float: data = prepare_multi_modal_data(files_path, task, images_size) - fit_data, predict_data = train_test_data_setup(data, shuffle_flag=True, split_ratio=0.6) + fit_data, predict_data = train_test_data_setup(data, shuffle=True, split_ratio=0.6) automl_model = Fedot(problem='classification', timeout=15) pipeline = automl_model.fit(features=fit_data, diff --git a/examples/advanced/multimodal_text_num_example.py b/examples/advanced/multimodal_text_num_example.py index e8b48d915d..ba000b67a6 100644 --- a/examples/advanced/multimodal_text_num_example.py +++ b/examples/advanced/multimodal_text_num_example.py @@ -25,7 +25,7 @@ def run_multi_modal_example(file_path: str, visualization=False, with_tuning=Tru task = 'classification' path = Path(fedot_project_root(), file_path) data = MultiModalData.from_csv(file_path=path, task=task, target_columns='variety', index_col=None) - fit_data, predict_data = train_test_data_setup(data, shuffle_flag=True, split_ratio=0.7) + fit_data, predict_data = train_test_data_setup(data, shuffle=True, split_ratio=0.7) automl_model = Fedot(problem=task, timeout=10, with_tuning=with_tuning) automl_model.fit(features=fit_data, diff --git a/fedot/core/data/cv_folds.py b/fedot/core/data/cv_folds.py index 915599f831..58a77afc81 100644 --- a/fedot/core/data/cv_folds.py +++ b/fedot/core/data/cv_folds.py @@ -65,23 +65,18 @@ def cv_generator(data: Union[InputData, MultiModalData], """ # Define base class for generate cv folds + retain_first_target = False if data.task.task_type is TaskTypesEnum.ts_forecasting: - if validation_blocks is None: - raise 
ValueError('validation_blocks is None') horizon = data.task.task_params.forecast_length * validation_blocks kf = TsInputDataSplit(n_splits=cv_folds, test_size=horizon) - reset_idx = True + retain_first_target = True elif data.task.task_type is TaskTypesEnum.classification and stratify: kf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_seed) - reset_idx = False else: - if not shuffle: - random_seed = None kf = KFold(n_splits=cv_folds, shuffle=shuffle, random_state=random_seed) - reset_idx = False # Split for train_ids, test_ids in kf.split(data.target, data.target): - train_data = _split_input_data_by_indexes(data, train_ids, reset_idx=reset_idx) - test_data = _split_input_data_by_indexes(data, test_ids, reset_idx=reset_idx) + train_data = _split_input_data_by_indexes(data, train_ids) + test_data = _split_input_data_by_indexes(data, test_ids, retain_first_target=retain_first_target) yield train_data, test_data diff --git a/fedot/core/data/data_split.py b/fedot/core/data/data_split.py index 8ed142c99c..dd5a94dce1 100644 --- a/fedot/core/data/data_split.py +++ b/fedot/core/data/data_split.py @@ -10,7 +10,9 @@ from fedot.core.repository.tasks import TaskTypesEnum -def _split_input_data_by_indexes(origin_input_data: Union[InputData, MultiModalData], index, reset_idx=False): +def _split_input_data_by_indexes(origin_input_data: Union[InputData, MultiModalData], + index, + retain_first_target=False): """ The function get InputData or MultiModalData and return only data with indexes in index, not in idx f.e. 
index = [0, 1, 2, 3] == input_data.features[[0, 1, 2, 3], :] @@ -23,16 +25,15 @@ def _split_input_data_by_indexes(origin_input_data: Union[InputData, MultiModalD for key in origin_input_data: data[key] = _split_input_data_by_indexes(origin_input_data[key], index=index, - reset_idx=reset_idx) + retain_first_target=retain_first_target) return data elif isinstance(origin_input_data, InputData): + idx = np.take(origin_input_data.idx, index, 0) target = np.take(origin_input_data.target, index, 0) features = np.take(origin_input_data.features, index, 0) - if reset_idx: - idx = np.arange(0, len(target)) - else: - idx = np.take(origin_input_data.idx, index, 0) + if retain_first_target and len(target.shape) > 1: + target = target[:, 0] data = InputData(idx=idx, features=features, @@ -59,8 +60,9 @@ def _split_time_series(data: InputData, forecast_length *= validation_blocks target_length = len(data.target) - train_data = _split_input_data_by_indexes(data, index=np.arange(0, target_length - forecast_length)) - test_data = _split_input_data_by_indexes(data, index=np.arange(target_length - forecast_length, target_length)) + train_data = _split_input_data_by_indexes(data, index=np.arange(0, target_length - forecast_length),) + test_data = _split_input_data_by_indexes(data, index=np.arange(target_length - forecast_length, target_length), + retain_first_target=True) if validation_blocks is None: # for in-sample @@ -87,15 +89,7 @@ def _split_any(data: InputData, :param random_seed: random_seed for shuffle """ - if stratify and shuffle: - # check that there are enough labels for stratify - stratify_labels = data.target - test_size = round(len(data.target) * (1. - split_ratio)) - labels_num = np.unique(stratify_labels).shape[0] - if test_size < labels_num: - split_ratio = 1 - labels_num / len(data.target) - else: - stratify_labels = None + stratify_labels = data.target if stratify else None train_ids, test_ids = train_test_split(np.arange(0, len(data.target)), test_size=1. 
- split_ratio, @@ -110,6 +104,43 @@ def _split_any(data: InputData, return train_data, test_data +def _are_stratification_allowed(data: Union[InputData, MultiModalData], split_ratio: float) -> bool: + """ Check that stratification may be done + :param data: data for split + :param split_ratio: relation between train data length and all data length + :return: + stratify - stratification is allowed""" + # check task_type + if data.task.task_type is not TaskTypesEnum.classification: + return False + + try: + # fast way + classes = np.unique(data.target, return_counts=True) + except: + # slow way + from collections import Counter + classes = Counter(data.target) + classes = [list(classes), list(classes.values())] + + # check that there are enough labels for two samples + if not all(x > 1 for x in classes[1]): + if __debug__: + return False + else: + raise ValueError(("There is the only value for some classes:" + f" {', '.join(str(val) for val, count in zip(*classes) if count == 1)}." + f" Can not do correct data split for {data.task.task_type.name} task.")) + + # check that split ratio allows to set all classes to both samples + test_size = round(len(data.target) * (1. 
- split_ratio)) + labels_count = len(classes[0]) + if test_size < labels_count: + return False + + return True + + def train_test_data_setup(data: Union[InputData, MultiModalData], split_ratio: float = 0.8, shuffle: bool = False, @@ -129,8 +160,12 @@ def train_test_data_setup(data: Union[InputData, MultiModalData], :return: data for train, data for validation """ - if data.task.task_type is TaskTypesEnum.classification and stratify: - shuffle = True + # check that stratification may be done + stratify &= _are_stratification_allowed(data, split_ratio) + # stratification is allowed only with shuffle + shuffle |= stratify + # shuffle is allowed only with random_seed and vise versa + random_seed = (random_seed or 42) if shuffle else None input_arguments = {'split_ratio': split_ratio, 'shuffle': shuffle, diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index 68db60ce42..59ab97c6df 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -362,7 +362,9 @@ def _find_binary_features(self, numerical_features: np.array): # Calculate unique values per column (excluding nans) for column_id, col in enumerate(df): unique_values = df[col].dropna().unique() - if len(unique_values) <= 2: + # TODO: test data processed without information about train data + # it may lead to errors + if len(unique_values) == 2: # Current numerical column has only two values column_info = {column_id: {'min': min(unique_values), 'max': max(unique_values)}} diff --git a/fedot/core/optimisers/objective/data_source_splitter.py b/fedot/core/optimisers/objective/data_source_splitter.py index b52af67912..909a67dcfb 100644 --- 
a/fedot/core/optimisers/objective/data_source_splitter.py +++ b/fedot/core/optimisers/objective/data_source_splitter.py @@ -5,7 +5,7 @@ from fedot.core.constants import default_data_split_ratio_by_task from fedot.core.data.data import InputData -from fedot.core.data.data_split import train_test_data_setup +from fedot.core.data.data_split import train_test_data_setup, _are_stratification_allowed from fedot.core.data.multi_modal import MultiModalData from fedot.core.optimisers.objective.data_objective_eval import DataSource from fedot.core.repository.tasks import TaskTypesEnum @@ -65,12 +65,21 @@ def build(self, data: Union[InputData, MultiModalData]) -> DataSource: if self.cv_folds is None and not (0 < self.split_ratio < 1): raise ValueError(f'split_ratio is {self.split_ratio} but should be between 0 and 1') - if data.task.task_type is not TaskTypesEnum.classification: - # Forbid stratify for nonclassification tasks - self.stratify = False - else: - if self.stratify: - self.shuffle = True + # Forbid stratify for nonclassification tasks + self.stratify &= data.task.task_type is TaskTypesEnum.classification + if self.stratify: + # check that stratification can be done + # for cross validation split ratio is defined as validation_size / train_size + split_ratio = self.split_ratio if self.cv_folds is None else (1 - 1 / (self.cv_folds + 1)) + self.stratify = _are_stratification_allowed(data, split_ratio) + if not self.stratify: + self.log.info(f"Stratification data splitting is disabled.") + + # Stratification can not be done without shuffle + self.shuffle |= self.stratify + + # Random seed depends on shuffle + self.random_seed = (self.random_seed or 42) if self.shuffle else None # Split data if self.cv_folds is not None: diff --git a/test/integration/api/test_api_utils.py b/test/integration/api/test_api_utils.py index f64277681b..36b3794294 100644 --- a/test/integration/api/test_api_utils.py +++ b/test/integration/api/test_api_utils.py @@ -35,7 +35,7 @@ def 
test_output_binary_classification_correct(): data = get_binary_classification_data() random.seed(1) - train_data, test_data = train_test_data_setup(data, shuffle_flag=True) + train_data, test_data = train_test_data_setup(data, shuffle=True) model = Fedot(problem=task_type, timeout=0.1) model.fit(train_data, predefined_model='logit') diff --git a/test/integration/api/test_main_api.py b/test/integration/api/test_main_api.py index 99db329f58..4cc4c51548 100644 --- a/test/integration/api/test_main_api.py +++ b/test/integration/api/test_main_api.py @@ -75,7 +75,7 @@ def get_dataset(task_type: str, validation_blocks: Optional[int] = None, n_sampl data = get_iris_data() else: data = get_synthetic_classification_data(n_samples=n_samples, n_features=n_features, random_state=42) - train_data, test_data = train_test_data_setup(data, shuffle_flag=True) + train_data, test_data = train_test_data_setup(data, shuffle=True) threshold = 0.95 elif task_type == 'clustering': data = get_synthetic_input_data(n_samples=100) @@ -110,7 +110,7 @@ def load_categorical_unimodal(): dataset_path = 'test/data/classification_with_categorical.csv' full_path = os.path.join(str(fedot_project_root()), dataset_path) data = InputData.from_csv(full_path) - train_data, test_data = train_test_data_setup(data, shuffle_flag=True) + train_data, test_data = train_test_data_setup(data, shuffle=True) return train_data, test_data diff --git a/test/unit/data/test_data_split.py b/test/unit/data/test_data_split.py index 022003dfe9..82e1dc4270 100644 --- a/test/unit/data/test_data_split.py +++ b/test/unit/data/test_data_split.py @@ -262,9 +262,9 @@ def test_cv_generator_works_stable(cv_generator, data): """ Test if ts cv generator works stable (always return same folds) """ idx_first = [] idx_second = [] - for row in cv_generator(data=data, stratify=False): + for row in cv_generator(data=data, stratify=False, random_seed=None): idx_first.append(row[1].idx) - for row in cv_generator(data=data, stratify=False): + for 
row in cv_generator(data=data, stratify=False, random_seed=None): idx_second.append(row[1].idx) for i in range(len(idx_first)): diff --git a/test/unit/preprocessing/test_pipeline_preprocessing.py b/test/unit/preprocessing/test_pipeline_preprocessing.py index e875017ff4..dcba57ba9e 100644 --- a/test/unit/preprocessing/test_pipeline_preprocessing.py +++ b/test/unit/preprocessing/test_pipeline_preprocessing.py @@ -247,7 +247,7 @@ def test_data_with_mixed_types_per_column_processed_correctly(): processed correctly. """ input_data = data_with_mixed_types_in_each_column() - train_data, test_data = train_test_data_setup(input_data, split_ratio=0.9) + train_data, test_data = train_test_data_setup(input_data, split_ratio=0.9, stratify=False) pipeline = Pipeline(PipelineNode('dt')) pipeline = correct_preprocessing_params(pipeline, categorical_max_uniques_th=5) diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py index 35936dc766..038b9f44af 100644 --- a/test/unit/preprocessing/test_preprocessors.py +++ b/test/unit/preprocessing/test_preprocessors.py @@ -114,7 +114,7 @@ def data_with_complicated_types(): input_data = InputData(idx=np.arange(18), features=features, target=target, task=task, data_type=DataTypesEnum.table) - return train_test_data_setup(input_data, split_ratio=0.9) + return train_test_data_setup(input_data, split_ratio=0.9, stratify=False) def test_column_types_converting_correctly(): @@ -146,7 +146,7 @@ def test_column_types_process_correctly(): """ data = data_with_mixed_types_in_each_column() - train_data, test_data = train_test_data_setup(data, split_ratio=0.9) + train_data, test_data = train_test_data_setup(data, split_ratio=0.9, stratify=False) # Remove target from test sample test_data.target = None @@ -223,7 +223,7 @@ def test_binary_pseudo_string_column_process_correctly(): def fit_predict_cycle_for_testing(idx: int): input_data = get_mixed_data_with_str_and_float_values(idx=idx) - train_data, 
test_data = train_test_data_setup(input_data, split_ratio=0.9) + train_data, test_data = train_test_data_setup(input_data, split_ratio=0.9, stratify=False) pipeline = Pipeline(PipelineNode('dt')) pipeline = correct_preprocessing_params(pipeline) diff --git a/test/unit/tasks/test_classification.py b/test/unit/tasks/test_classification.py index 8bc423afb8..7373f758be 100644 --- a/test/unit/tasks/test_classification.py +++ b/test/unit/tasks/test_classification.py @@ -99,7 +99,7 @@ def get_image_classification_data(composite_flag: bool = True): def test_multiclassification_pipeline_fit_correct(): data = get_iris_data() pipeline = pipeline_simple() - train_data, test_data = train_test_data_setup(data, shuffle_flag=True) + train_data, test_data = train_test_data_setup(data, shuffle=True) pipeline.fit(input_data=train_data) results = pipeline.predict(input_data=test_data) @@ -117,7 +117,7 @@ def test_classification_with_pca_pipeline_fit_correct(): pipeline_pca = pipeline_with_pca() pipeline = pipeline_simple() - train_data, test_data = train_test_data_setup(data, shuffle_flag=True) + train_data, test_data = train_test_data_setup(data, shuffle=True) pipeline.fit(input_data=train_data) pipeline_pca.fit(input_data=train_data) @@ -141,7 +141,7 @@ def test_classification_with_pca_pipeline_fit_correct(): def test_output_mode_labels(): data = get_iris_data() pipeline = pipeline_simple() - train_data, test_data = train_test_data_setup(data, shuffle_flag=True) + train_data, test_data = train_test_data_setup(data, shuffle=True) pipeline.fit(input_data=train_data) results = pipeline.predict(input_data=test_data, output_mode='labels') @@ -156,7 +156,7 @@ def test_output_mode_labels(): def test_output_mode_full_probs(): data = get_binary_classification_data() pipeline = pipeline_simple() - train_data, test_data = train_test_data_setup(data, shuffle_flag=True) + train_data, test_data = train_test_data_setup(data, shuffle=True) pipeline.fit(input_data=train_data) results = 
pipeline.predict(input_data=test_data, output_mode='full_probs') From e57097c63326eb0e00fddc40d201fe2ff636636f Mon Sep 17 00:00:00 2001 From: Sergey Kasyanov Date: Wed, 9 Aug 2023 16:03:46 +0300 Subject: [PATCH 07/14] Fixes --- fedot/core/data/data_split.py | 6 ++++++ .../data_operations/sklearn_transformations.py | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fedot/core/data/data_split.py b/fedot/core/data/data_split.py index dd5a94dce1..ddb642e855 100644 --- a/fedot/core/data/data_split.py +++ b/fedot/core/data/data_split.py @@ -126,6 +126,8 @@ def _are_stratification_allowed(data: Union[InputData, MultiModalData], split_ra # check that there are enough labels for two samples if not all(x > 1 for x in classes[1]): if __debug__: + # tests often use very small datasets that are not suitable for data splitting + # for test stratification is disabled in that case return False else: raise ValueError(("There is the only value for some classes:" @@ -144,6 +146,7 @@ def _are_stratification_allowed(data: Union[InputData, MultiModalData], split_ra def train_test_data_setup(data: Union[InputData, MultiModalData], split_ratio: float = 0.8, shuffle: bool = False, + shuffle_flag: bool = False, stratify: bool = True, random_seed: int = 42, validation_blocks: Optional[int] = None) -> Tuple[Union[InputData, MultiModalData], @@ -153,6 +156,7 @@ def train_test_data_setup(data: Union[InputData, MultiModalData], :param data: InputData object to split :param split_ratio: share of train data between 0 and 1 :param shuffle: is data needed to be shuffled or not + :param shuffle_flag: same is shuffle, use for backward compatibility :param stratify: make stratified sample or not :param random_seed: random_seed for shuffle :param validation_blocks: validation blocks are used for test @@ -160,6 +164,8 @@ def train_test_data_setup(data: Union[InputData, MultiModalData], :return: data for train, data for validation """ + # for backward compatibility + shuffle |= 
shuffle_flag # check that stratification may be done stratify &= _are_stratification_allowed(data, split_ratio) # stratification is allowed only with shuffle diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index 59ab97c6df..b75e70076c 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -362,8 +362,6 @@ def _find_binary_features(self, numerical_features: np.array): # Calculate unique values per column (excluding nans) for column_id, col in enumerate(df): unique_values = df[col].dropna().unique() - # TODO: test data processed without information about train data - # it may lead to errors if len(unique_values) == 2: # Current numerical column has only two values column_info = {column_id: {'min': min(unique_values), From 6cb04df809c09ec000a387fbd54b79a71dcba929 Mon Sep 17 00:00:00 2001 From: Sergey Kasyanov Date: Thu, 10 Aug 2023 09:50:36 +0300 Subject: [PATCH 08/14] Fixes --- examples/advanced/automl/h2o_example.py | 2 +- fedot/core/data/cv_folds.py | 1 + fedot/core/data/data_split.py | 16 ++++++++-------- .../optimisers/objective/data_source_splitter.py | 14 +++++++++----- test/unit/data/test_data_split.py | 14 +++++++------- .../optimizer/test_pipeline_objective_eval.py | 1 - 6 files changed, 26 insertions(+), 22 deletions(-) diff --git a/examples/advanced/automl/h2o_example.py b/examples/advanced/automl/h2o_example.py index b098c223fb..f4a0a9d3f4 100644 --- a/examples/advanced/automl/h2o_example.py +++ b/examples/advanced/automl/h2o_example.py @@ -48,7 +48,6 @@ def export_h2o(pipeline, pipeline_path, test_data): def h2o_classification_pipeline_evaluation(): - pipeline_path = "h2o_class" data = get_iris_data() pipeline = 
pipeline_h2o_class() train_data, test_data = train_test_data_setup(data, shuffle=True) @@ -62,6 +61,7 @@ def h2o_classification_pipeline_evaluation(): multi_class='ovo', average='macro') # H2o has troubles with serialization for now + # pipeline_path = "h2o_class"1 # export_h2o(pipeline, pipeline_path, test_data) print(f"roc auc: {roc_auc_on_test}") diff --git a/fedot/core/data/cv_folds.py b/fedot/core/data/cv_folds.py index 58a77afc81..99f5cdf866 100644 --- a/fedot/core/data/cv_folds.py +++ b/fedot/core/data/cv_folds.py @@ -69,6 +69,7 @@ def cv_generator(data: Union[InputData, MultiModalData], if data.task.task_type is TaskTypesEnum.ts_forecasting: horizon = data.task.task_params.forecast_length * validation_blocks kf = TsInputDataSplit(n_splits=cv_folds, test_size=horizon) + # for multi_ts use first target column as main target retain_first_target = True elif data.task.task_type is TaskTypesEnum.classification and stratify: kf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_seed) diff --git a/fedot/core/data/data_split.py b/fedot/core/data/data_split.py index ddb642e855..0b85c0f864 100644 --- a/fedot/core/data/data_split.py +++ b/fedot/core/data/data_split.py @@ -16,8 +16,9 @@ def _split_input_data_by_indexes(origin_input_data: Union[InputData, MultiModalD """ The function get InputData or MultiModalData and return only data with indexes in index, not in idx f.e. 
index = [0, 1, 2, 3] == input_data.features[[0, 1, 2, 3], :] + :param origin_input_data: data to split :param index: indexes that needed in output data - :param reset_idx: set to True for idx is range (0, len(data)) + :param retain_first_target: set to True for use only first column of target """ if isinstance(origin_input_data, MultiModalData): @@ -61,7 +62,7 @@ def _split_time_series(data: InputData, target_length = len(data.target) train_data = _split_input_data_by_indexes(data, index=np.arange(0, target_length - forecast_length),) - test_data = _split_input_data_by_indexes(data, index=np.arange(target_length - forecast_length, target_length), + test_data = _split_input_data_by_indexes(data, index=np.arange(target_length - forecast_length, target_length), retain_first_target=True) if validation_blocks is None: @@ -97,7 +98,6 @@ def _split_any(data: InputData, random_state=random_seed, stratify=stratify_labels) - # Prepare data to train the operation train_data = _split_input_data_by_indexes(data, index=train_ids) test_data = _split_input_data_by_indexes(data, index=test_ids) @@ -108,8 +108,8 @@ def _are_stratification_allowed(data: Union[InputData, MultiModalData], split_ra """ Check that stratification may be done :param data: data for split :param split_ratio: relation between train data length and all data length - :return: - stratify - stratification is allowed""" + :return bool: stratification is allowed""" + # check task_type if data.task.task_type is not TaskTypesEnum.classification: return False @@ -127,12 +127,12 @@ def _are_stratification_allowed(data: Union[InputData, MultiModalData], split_ra if not all(x > 1 for x in classes[1]): if __debug__: # tests often use very small datasets that are not suitable for data splitting - # for test stratification is disabled in that case + # stratification is disabled for tests return False else: raise ValueError(("There is the only value for some classes:" f" {', '.join(str(val) for val, count in zip(*classes) if 
count == 1)}." - f" Can not do correct data split for {data.task.task_type.name} task.")) + f" Data split can not be done for {data.task.task_type.name} task.")) # check that split ratio allows to set all classes to both samples test_size = round(len(data.target) * (1. - split_ratio)) @@ -197,6 +197,6 @@ def train_test_data_setup(data: Union[InputData, MultiModalData], train_data[node], test_data[node] = train_test_data_setup(data[node], **input_arguments) else: raise ValueError((f'Dataset {type(data)} is not supported. Supported types:' - 'InputData, MultiModalData')) + ' InputData, MultiModalData')) return train_data, test_data diff --git a/fedot/core/optimisers/objective/data_source_splitter.py b/fedot/core/optimisers/objective/data_source_splitter.py index 909a67dcfb..cd36ad2c9c 100644 --- a/fedot/core/optimisers/objective/data_source_splitter.py +++ b/fedot/core/optimisers/objective/data_source_splitter.py @@ -19,11 +19,17 @@ class DataSourceSplitter: Can provide hold-out validation and k-fold validation. :param cv_folds: Number of folds on data for cross-validation. - If provided, then k-fold validation is used. Otherwise, hold-out validation is used. + If provided, then cross validation is used. Otherwise, hold-out validation is used. :param split_ratio: Ratio of data for splitting. Applied only in case of hold-out split. If not provided, then default split ratios will be used. :param shuffle: Is shuffling required for data. + :param stratify: If True then stratification is used for samples + :param validation_blocks: Validation blocks count. + Applied only for time series data. + If not provided, then value will be calculated. + :param random_seed: Random seed for shuffle. + :param log: Log for logging. 
""" def __init__(self, @@ -65,15 +71,13 @@ def build(self, data: Union[InputData, MultiModalData]) -> DataSource: if self.cv_folds is None and not (0 < self.split_ratio < 1): raise ValueError(f'split_ratio is {self.split_ratio} but should be between 0 and 1') - # Forbid stratify for nonclassification tasks - self.stratify &= data.task.task_type is TaskTypesEnum.classification if self.stratify: # check that stratification can be done - # for cross validation split ratio is defined as validation_size / train_size + # for cross validation split ratio is defined as validation_size / all_data_size split_ratio = self.split_ratio if self.cv_folds is None else (1 - 1 / (self.cv_folds + 1)) self.stratify = _are_stratification_allowed(data, split_ratio) if not self.stratify: - self.log.info(f"Stratification data splitting is disabled.") + self.log.info("Stratificated splitting of data is disabled.") # Stratification can not be done without shuffle self.shuffle |= self.stratify diff --git a/test/unit/data/test_data_split.py b/test/unit/data/test_data_split.py index 82e1dc4270..ddcdb7944d 100644 --- a/test/unit/data/test_data_split.py +++ b/test/unit/data/test_data_split.py @@ -239,7 +239,7 @@ def test_multimodal_data_splitting_is_correct(datas_funs, cv_folds, shuffle, str assert np.all(np.diff(np.concatenate(idx, 1), 1) == 0) # dimensity of features should be the same - splitted_data_features_dimensity =[subdata.features.shape[1:] for subdata in sample.values()] + splitted_data_features_dimensity = [subdata.features.shape[1:] for subdata in sample.values()] assert features_dimensity == splitted_data_features_dimensity # shuffle should be done @@ -294,9 +294,9 @@ def test_data_splitting_defines_validation_blocks_correctly(forecast_length, cv_ @pytest.mark.parametrize(('cv_folds', 'shuffle', 'stratify', 'data_classes'), - [(2, True, True, 2), # simple case - (2, False, True, 2), # should work without error - (5, True, True, 4), # more folds and more classes + [(2, True, True, 
2), # simple case + (2, False, True, 2), # should work without error + (5, True, True, 4), # more folds and more classes ]) def test_stratify(cv_folds, shuffle, stratify, data_classes): data = get_tabular_classification_data(length=100, class_count=data_classes) @@ -309,10 +309,10 @@ def test_stratify(cv_folds, shuffle, stratify, data_classes): @pytest.mark.parametrize(('is_shuffle', 'shuffle', 'cv_folds', 'data'), [(True, True, 2, - get_tabular_classification_data(length=100, class_count=4)), # cv_folds classification + get_tabular_classification_data(length=100, class_count=4)), # cv_folds classification (True, True, None, - get_tabular_classification_data(length=100, class_count=4)), # holdout classification - (False, True, 2, get_ts_data_to_forecast(10, 100)), # cv_folds timeseries + get_tabular_classification_data(length=100, class_count=4)), # holdout classification + (False, True, 2, get_ts_data_to_forecast(10, 100)), 1 # cv_folds timeseries (False, True, None, get_ts_data_to_forecast(10, 100)), # holdout timeseries ]) def test_shuffle(is_shuffle, cv_folds, shuffle, data): diff --git a/test/unit/optimizer/test_pipeline_objective_eval.py b/test/unit/optimizer/test_pipeline_objective_eval.py index 052d67d667..b92cef1233 100644 --- a/test/unit/optimizer/test_pipeline_objective_eval.py +++ b/test/unit/optimizer/test_pipeline_objective_eval.py @@ -1,6 +1,5 @@ import datetime from copy import deepcopy -from functools import partial import numpy as np import pytest From 971617d3a7c50f9c1b73d2791302e4969e218c37 Mon Sep 17 00:00:00 2001 From: Sergey Kasyanov Date: Thu, 10 Aug 2023 09:55:27 +0300 Subject: [PATCH 09/14] Fixes --- test/unit/data/test_data_split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/data/test_data_split.py b/test/unit/data/test_data_split.py index ddcdb7944d..712173452a 100644 --- a/test/unit/data/test_data_split.py +++ b/test/unit/data/test_data_split.py @@ -312,7 +312,7 @@ def test_stratify(cv_folds, shuffle, 
stratify, data_classes): get_tabular_classification_data(length=100, class_count=4)), # cv_folds classification (True, True, None, get_tabular_classification_data(length=100, class_count=4)), # holdout classification - (False, True, 2, get_ts_data_to_forecast(10, 100)), 1 # cv_folds timeseries + (False, True, 2, get_ts_data_to_forecast(10, 100)), # cv_folds timeseries (False, True, None, get_ts_data_to_forecast(10, 100)), # holdout timeseries ]) def test_shuffle(is_shuffle, cv_folds, shuffle, data): From 0e3f8758f095740273fbc63884dc45e0b9435c7b Mon Sep 17 00:00:00 2001 From: Sergey Kasyanov Date: Thu, 10 Aug 2023 10:32:48 +0300 Subject: [PATCH 10/14] Fixes --- examples/advanced/automl/h2o_example.py | 2 +- fedot/core/optimisers/objective/data_source_splitter.py | 3 +-- test/unit/data/test_data_split.py | 3 ++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/advanced/automl/h2o_example.py b/examples/advanced/automl/h2o_example.py index f4a0a9d3f4..b16712734a 100644 --- a/examples/advanced/automl/h2o_example.py +++ b/examples/advanced/automl/h2o_example.py @@ -73,7 +73,7 @@ def h2o_regression_pipeline_evaluation(): train_data, test_data = train_test_data_setup(data) pipeline.fit(input_data=train_data) - results = pipeline.predict(input_data=test_data) + _ = pipeline.predict(input_data=test_data) _, rmse_on_test = get_rmse_value(pipeline, train_data, test_data) print(f"RMSE {rmse_on_test}") diff --git a/fedot/core/optimisers/objective/data_source_splitter.py b/fedot/core/optimisers/objective/data_source_splitter.py index cd36ad2c9c..e3cd1160a2 100644 --- a/fedot/core/optimisers/objective/data_source_splitter.py +++ b/fedot/core/optimisers/objective/data_source_splitter.py @@ -36,7 +36,7 @@ def __init__(self, cv_folds: Optional[int] = None, validation_blocks: Optional[int] = None, split_ratio: Optional[float] = None, - shuffle: bool = True, + shuffle: bool = False, stratify: bool = True, random_seed: int = 42): self.cv_folds = cv_folds @@ -152,7 
+152,6 @@ def _propose_cv_folds_and_validation_blocks(self, data): f" ({test_shape}) defined by split ratio." f" Split ratio is changed to {self.split_ratio}.")) test_share = 1 - self.split_ratio - self.split_ratio = self.split_ratio else: test_share = 1 / (self.cv_folds + 1) self.validation_blocks = int(data_shape * test_share // forecast_length) diff --git a/test/unit/data/test_data_split.py b/test/unit/data/test_data_split.py index 712173452a..8415ecfcbf 100644 --- a/test/unit/data/test_data_split.py +++ b/test/unit/data/test_data_split.py @@ -203,7 +203,8 @@ def test_multivariate_time_series_splitting_correct(): @pytest.mark.parametrize(('datas_funs', 'cv_folds', 'shuffle', 'stratify'), - [# classification + stratify + shuffle + cv_folds + [ + # classification + stratify + shuffle + cv_folds ([partial(get_tabular_classification_data, 100, 5)] * 3, 4, True, True), # classification + shuffle + cv_folds ([partial(get_tabular_classification_data, 100, 5)] * 3, 4, True, False), From b0c854fba1cc54e1a78744819586a84225ab9699 Mon Sep 17 00:00:00 2001 From: Sergey Kasyanov Date: Mon, 14 Aug 2023 11:45:54 +0300 Subject: [PATCH 11/14] Fix problem with lagged window failure on data shortage --- fedot/core/optimisers/objective/data_source_splitter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fedot/core/optimisers/objective/data_source_splitter.py b/fedot/core/optimisers/objective/data_source_splitter.py index e3cd1160a2..b6403016c9 100644 --- a/fedot/core/optimisers/objective/data_source_splitter.py +++ b/fedot/core/optimisers/objective/data_source_splitter.py @@ -123,8 +123,10 @@ def _build_holdout_producer(self, data: InputData) -> DataSource: return partial(self._data_producer, train_data, test_data) - def _propose_cv_folds_and_validation_blocks(self, data): + def _propose_cv_folds_and_validation_blocks(self, data, expected_window_size=20): data_shape = data.target.shape[0] + # first expected_window_size points should to be guaranteed for 
prediction at fit stage + data_shape -= expected_window_size forecast_length = data.task.task_params.forecast_length # check that cv folds may be realized if self.cv_folds is not None: From d9002a8d8d54a27c527266dc8edbf51cfc5647e4 Mon Sep 17 00:00:00 2001 From: Sergey Kasyanov Date: Mon, 14 Aug 2023 13:04:35 +0300 Subject: [PATCH 12/14] Fixes --- fedot/core/data/cv_folds.py | 6 +++--- fedot/core/optimisers/objective/data_source_splitter.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fedot/core/data/cv_folds.py b/fedot/core/data/cv_folds.py index 99f5cdf866..77eed03a03 100644 --- a/fedot/core/data/cv_folds.py +++ b/fedot/core/data/cv_folds.py @@ -50,10 +50,10 @@ def cv_generator(data: Union[InputData, MultiModalData], validation_blocks: Optional[int] = None) -> Iterator[Tuple[Union[InputData, MultiModalData], Union[InputData, MultiModalData]]]: """ The function for splitting data into a train and test samples - in the InputData format for cross validation. The function - return a generator of tuples, consisting of a pair of train, test. + for cross validation. The function return a generator of tuples, + consisting of a pair of train, test. - :param data: InputData for train and test splitting + :param data: data for train and test splitting :param shuffle: is data need shuffle :param cv_folds: number of folds :param random_seed: random seed for shuffle diff --git a/fedot/core/optimisers/objective/data_source_splitter.py b/fedot/core/optimisers/objective/data_source_splitter.py index b6403016c9..2c41d57a2c 100644 --- a/fedot/core/optimisers/objective/data_source_splitter.py +++ b/fedot/core/optimisers/objective/data_source_splitter.py @@ -20,14 +20,14 @@ class DataSourceSplitter: :param cv_folds: Number of folds on data for cross-validation. If provided, then cross validation is used. Otherwise, hold-out validation is used. + :param validation_blocks: Validation blocks count. + Applied only for time series data. 
+ If not provided, then value will be calculated. :param split_ratio: Ratio of data for splitting. Applied only in case of hold-out split. If not provided, then default split ratios will be used. :param shuffle: Is shuffling required for data. :param stratify: If True then stratification is used for samples - :param validation_blocks: Validation blocks count. - Applied only for time series data. - If not provided, then value will be calculated. :param random_seed: Random seed for shuffle. :param log: Log for logging. """ From e3da922ee48857e68c42d9ac9082ec559693f550 Mon Sep 17 00:00:00 2001 From: Sergey Kasyanov Date: Mon, 14 Aug 2023 16:05:02 +0300 Subject: [PATCH 13/14] Fixes --- test/unit/data/test_data_split.py | 2 +- test/unit/optimizer/test_pipeline_objective_eval.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/unit/data/test_data_split.py b/test/unit/data/test_data_split.py index 8415ecfcbf..b4f2b85cc7 100644 --- a/test/unit/data/test_data_split.py +++ b/test/unit/data/test_data_split.py @@ -286,7 +286,7 @@ def test_data_splitting_defines_validation_blocks_correctly(forecast_length, cv_ check_cv_folds, check_split_ratio, check_validation_blocks): """ Checks if validation blocks count defines correctly for different data """ - data = get_ts_data_to_forecast(forecast_length) + data = get_ts_data_to_forecast(forecast_length, 120) data_source_splitter = DataSourceSplitter(cv_folds=cv_folds, split_ratio=split_ratio) data_source_splitter.build(data) assert data_source_splitter.cv_folds == check_cv_folds diff --git a/test/unit/optimizer/test_pipeline_objective_eval.py b/test/unit/optimizer/test_pipeline_objective_eval.py index 15be120e24..1c1ce688e7 100644 --- a/test/unit/optimizer/test_pipeline_objective_eval.py +++ b/test/unit/optimizer/test_pipeline_objective_eval.py @@ -90,8 +90,8 @@ def test_pipeline_objective_evaluate_with_different_metrics(classification_datas ) def 
test_pipeline_objective_evaluate_with_different_metrics_with_str_labes(pipeline): for metric in ClassificationMetricsEnum: - one_fold_split = OneFoldInputDataSplit() - data_split = partial(one_fold_split.input_split, input_data=classification_dataset_with_str_labels()) + data_splitter = DataSourceSplitter() + data_split = data_splitter.build(classification_dataset_with_str_labels()) check_pipeline = deepcopy(pipeline) objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), data_split) fitness = objective_eval(pipeline) From d77ecdb3aafaa93aabb618844efd66830ee8d74c Mon Sep 17 00:00:00 2001 From: Sergey Kasyanov Date: Mon, 14 Aug 2023 16:15:10 +0300 Subject: [PATCH 14/14] Fixes --- fedot/core/data/data_split.py | 2 +- fedot/core/optimisers/objective/data_source_splitter.py | 6 +++--- test/integration/api/test_api_utils.py | 2 +- test/unit/validation/test_table_cv.py | 1 - 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fedot/core/data/data_split.py b/fedot/core/data/data_split.py index 0b85c0f864..3f2d389a0b 100644 --- a/fedot/core/data/data_split.py +++ b/fedot/core/data/data_split.py @@ -117,7 +117,7 @@ def _are_stratification_allowed(data: Union[InputData, MultiModalData], split_ra try: # fast way classes = np.unique(data.target, return_counts=True) - except: + except Exception: # slow way from collections import Counter classes = Counter(data.target) diff --git a/fedot/core/optimisers/objective/data_source_splitter.py b/fedot/core/optimisers/objective/data_source_splitter.py index 2c41d57a2c..70e5dd7e30 100644 --- a/fedot/core/optimisers/objective/data_source_splitter.py +++ b/fedot/core/optimisers/objective/data_source_splitter.py @@ -53,10 +53,10 @@ def build(self, data: Union[InputData, MultiModalData]) -> DataSource: # Check cv_folds if self.cv_folds is not None: - if not isinstance(self.cv_folds, int): - if self.cv_folds % 1 != 0: - raise ValueError(f"cv_folds is not integer: {self.cv_folds}") + try: self.cv_folds = 
int(self.cv_folds) + except ValueError: + raise ValueError(f"cv_folds is not integer: {self.cv_folds}") if self.cv_folds < 2: self.cv_folds = None if self.cv_folds > data.target.shape[0] - 1: diff --git a/test/integration/api/test_api_utils.py b/test/integration/api/test_api_utils.py index aa9aa4384a..fd2f7877ce 100644 --- a/test/integration/api/test_api_utils.py +++ b/test/integration/api/test_api_utils.py @@ -32,7 +32,7 @@ def test_output_binary_classification_correct(): task_type = 'classification' data = get_binary_classification_data() - + train_data, test_data = train_test_data_setup(data, shuffle=True) model = Fedot(problem=task_type, seed=1, timeout=0.1) diff --git a/test/unit/validation/test_table_cv.py b/test/unit/validation/test_table_cv.py index 6ebe511366..55be85451f 100644 --- a/test/unit/validation/test_table_cv.py +++ b/test/unit/validation/test_table_cv.py @@ -4,7 +4,6 @@ import pytest from golem.core.tuning.simultaneous import SimultaneousTuner -from sklearn.metrics import roc_auc_score as roc_auc from fedot.api.main import Fedot from fedot.core.data.data import InputData