diff --git a/cases/kc2_sourcecode_defects_classification.py b/cases/kc2_sourcecode_defects_classification.py index e055cd384f..dbe3fbfeb2 100644 --- a/cases/kc2_sourcecode_defects_classification.py +++ b/cases/kc2_sourcecode_defects_classification.py @@ -17,7 +17,7 @@ def get_kc2_data(): encoded = (target == 'yes').astype(int) data.target = encoded - train, test = train_test_data_setup(data, shuffle_flag=True) + train, test = train_test_data_setup(data, shuffle=True) return train, test diff --git a/docs/source/basics/multi_modal_tasks.rst b/docs/source/basics/multi_modal_tasks.rst index b988f78f11..e7792da981 100644 --- a/docs/source/basics/multi_modal_tasks.rst +++ b/docs/source/basics/multi_modal_tasks.rst @@ -15,7 +15,7 @@ FEDOT's API supports multimodal data from the box. The only thing you need is to data = MultiModalData.from_csv(file_path='multimodal_dataset.csv', task='classification', target_columns='target_column', text_columns=['text_col1', 'text_col2'], columns_to_drop=['col_to_drop1', 'col_to_drop2'], index_col=None) - fit_data, predict_data = train_test_data_setup(data, shuffle_flag=True, split_ratio=0.7) + fit_data, predict_data = train_test_data_setup(data, shuffle=True, split_ratio=0.7) Using ``from_csv()`` method, you should define the task type, and target columns. FEDOT can find text columns automatically, but you can set them manually. You can also select columns which will be dropped from the original dataset. By default, FEDOT reads the first column of every dataset as an index column. If there is no index columns in the dataset, you should set ``index_col=None``. Initialize the FEDOT object and define the type of modeling problem. diff --git a/docs/source/basics/ts_forecasting.rst b/docs/source/basics/ts_forecasting.rst index 4a57a5729a..4d006d3fe2 100644 --- a/docs/source/basics/ts_forecasting.rst +++ b/docs/source/basics/ts_forecasting.rst @@ -188,7 +188,7 @@ Train test split ~~~~~~~~~~~~~~~~ To split InputData use ``train_test_data_setup`` method. -``split_ratio`` and ``shuffle_flag`` are ignored for time-series forecasting. +``split_ratio``, ``shuffle``, and ``stratify`` are ignored for time-series forecasting. ..
autofunction:: fedot.core.data.data_split.train_test_data_setup diff --git a/examples/advanced/automl/h2o_example.py b/examples/advanced/automl/h2o_example.py index 4659561728..b16712734a 100644 --- a/examples/advanced/automl/h2o_example.py +++ b/examples/advanced/automl/h2o_example.py @@ -48,10 +48,9 @@ def export_h2o(pipeline, pipeline_path, test_data): def h2o_classification_pipeline_evaluation(): - pipeline_path = "h2o_class" data = get_iris_data() pipeline = pipeline_h2o_class() - train_data, test_data = train_test_data_setup(data, shuffle_flag=True) + train_data, test_data = train_test_data_setup(data, shuffle=True) pipeline.fit(input_data=train_data) results = pipeline.predict(input_data=test_data, output_mode="full_probs") @@ -62,6 +61,7 @@ def h2o_classification_pipeline_evaluation(): multi_class='ovo', average='macro') # H2o has troubles with serialization for now + # pipeline_path = "h2o_class" + # export_h2o(pipeline, pipeline_path, test_data) print(f"roc auc: {roc_auc_on_test}") @@ -73,7 +73,7 @@ def h2o_regression_pipeline_evaluation(): train_data, test_data = train_test_data_setup(data) pipeline.fit(input_data=train_data) - results = pipeline.predict(input_data=test_data) + _ = pipeline.predict(input_data=test_data) _, rmse_on_test = get_rmse_value(pipeline, train_data, test_data) print(f"RMSE {rmse_on_test}") diff --git a/examples/advanced/automl/tpot_example.py b/examples/advanced/automl/tpot_example.py index 425b068482..2574c224a2 100644 --- a/examples/advanced/automl/tpot_example.py +++ b/examples/advanced/automl/tpot_example.py @@ -41,7 +41,7 @@ def tpot_classification_pipeline_evaluation(): pipeline_path = "tpot_class" data = get_iris_data() pipeline = pipeline_tpot_class() - train_data, test_data = train_test_data_setup(data, shuffle_flag=True) + train_data, test_data = train_test_data_setup(data, shuffle=True) pipeline.fit(input_data=train_data) results = pipeline.predict(input_data=test_data, output_mode="full_probs") diff --git a/examples/advanced/multi_modal_pipeline.py b/examples/advanced/multi_modal_pipeline.py index 8780d3fc3f..933e9e05ab 100644 --- a/examples/advanced/multi_modal_pipeline.py +++ b/examples/advanced/multi_modal_pipeline.py @@ -74,7 +74,7 @@ def run_multi_modal_pipeline(files_path: str, visualization=False) -> float: data = prepare_multi_modal_data(files_path, task, images_size) - fit_data, predict_data = train_test_data_setup(data, shuffle_flag=True, split_ratio=0.6) + fit_data, predict_data = train_test_data_setup(data, shuffle=True, split_ratio=0.6) automl_model = Fedot(problem='classification', timeout=15) pipeline = automl_model.fit(features=fit_data, diff --git a/examples/advanced/multimodal_text_num_example.py b/examples/advanced/multimodal_text_num_example.py index 71b4eb5bc8..f59097a845 100644 --- a/examples/advanced/multimodal_text_num_example.py +++ b/examples/advanced/multimodal_text_num_example.py @@ -26,7 +26,7 @@ def run_multi_modal_example(file_path: str, visualization: bool = False, with_tu task = 'classification' path = fedot_project_root().joinpath(file_path) data = MultiModalData.from_csv(file_path=path, task=task, target_columns='variety', index_col=None) - fit_data, predict_data = train_test_data_setup(data, shuffle_flag=True, split_ratio=0.7) + fit_data, predict_data = train_test_data_setup(data, shuffle=True, split_ratio=0.7) automl_model = Fedot(problem=task, timeout=timeout, with_tuning=with_tuning, n_jobs=1) automl_model.fit(features=fit_data, diff --git a/fedot/api/api_utils/params.py b/fedot/api/api_utils/params.py
index 718fabe28d..8cd24dfd10 100644 --- a/fedot/api/api_utils/params.py +++ b/fedot/api/api_utils/params.py @@ -56,9 +56,8 @@ def accept_and_apply_recommendations(self, input_data: Union[InputData, MultiMod input_data: data for preprocessing recommendations: dict with recommendations """ - # TODO fix multimodality + if isinstance(input_data, MultiModalData): - self['cv_folds'] = None # there are no support for multimodal data now for data_source_name, values in input_data.items(): self.accept_and_apply_recommendations(input_data[data_source_name], recommendations[data_source_name]) diff --git a/fedot/core/data/cv_folds.py b/fedot/core/data/cv_folds.py new file mode 100644 index 0000000000..77eed03a03 --- /dev/null +++ b/fedot/core/data/cv_folds.py @@ -0,0 +1,83 @@ +from typing import Iterator, Optional, Tuple, Union + +import numpy as np + +from fedot.core.data.multi_modal import MultiModalData +from fedot.core.repository.tasks import TaskTypesEnum +from sklearn.model_selection import KFold, TimeSeriesSplit +from sklearn.model_selection._split import StratifiedKFold + +from fedot.core.data.data import InputData +from fedot.core.data.data_split import _split_input_data_by_indexes + + +class TsInputDataSplit(TimeSeriesSplit): + """ Perform time series splitting for cross validation on InputData structures. + The difference between TimeSeriesSplit (sklearn) and TsInputDataSplit can be + demonstrated by an example: + The time series [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] must be split into 3 + parts, where the size of each fold for validation will be 2 elements. + TimeSeriesSplit (returns indices) + train - [0, 1, 2, 3] test - [4, 5] + train - [0, 1, 2, 3, 4, 5] test - [6, 7] + train - [0, 1, 2, 3, 4, 5, 6, 7] test - [8, 9] + TsInputDataSplit (returns indices where the test part also covers the train part, for in-sample forecasting) + train - [0, 1, 2, 3] test - [0, 1, 2, 3, 4, 5] + train - [0, 1, 2, 3, 4, 5] test - [0, 1, 2, 3, 4, 5, 6, 7] + train - [0, 1, 2, 3, 4, 5, 6, 7] test - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + """ + + def __init__(self, n_splits: int, test_size: int): + super().__init__(gap=0, n_splits=n_splits, test_size=test_size) + + def split(self, data: np.ndarray, *args) -> Iterator[Tuple[np.ndarray, np.ndarray]]: + """ Define indexes for train and validation using + the "in-sample forecasting" algorithm + + :param data: array with time series values to split + """ + + for train_ids, test_ids in super().split(data): + new_test_ids = np.hstack((train_ids, test_ids)) + yield train_ids, new_test_ids + + +def cv_generator(data: Union[InputData, MultiModalData], + cv_folds: int, + shuffle: bool = False, + random_seed: int = 42, + stratify: bool = True, + validation_blocks: Optional[int] = None) -> Iterator[Tuple[Union[InputData, MultiModalData], + Union[InputData, MultiModalData]]]: + """ The function for splitting data into train and test samples + for cross validation. It returns a generator of tuples, + each consisting of a train/test pair.
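+ A minimal usage sketch (illustrative only; assumes ``input_data`` is an existing tabular classification ``InputData``): + + folds = cv_generator(input_data, cv_folds=5, shuffle=True, stratify=True) + for train_sample, test_sample in folds: + assert len(train_sample.idx) > len(test_sample.idx)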
+ + :param data: data for train and test splitting + :param shuffle: whether the data should be shuffled + :param cv_folds: number of folds + :param random_seed: random seed for shuffle + :param stratify: `True` to make stratified samples for classification task + :param validation_blocks: number of validation blocks for timeseries data + + :return Iterator[Tuple[Union[InputData, MultiModalData], + Union[InputData, MultiModalData]]]: generator of split train/test data + """ + + # Define base class for generating cv folds + retain_first_target = False + if data.task.task_type is TaskTypesEnum.ts_forecasting: + horizon = data.task.task_params.forecast_length * validation_blocks + kf = TsInputDataSplit(n_splits=cv_folds, test_size=horizon) + # for multi_ts use first target column as main target + retain_first_target = True + elif data.task.task_type is TaskTypesEnum.classification and stratify: + kf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_seed) + else: + kf = KFold(n_splits=cv_folds, shuffle=shuffle, random_state=random_seed) + + # Split + for train_ids, test_ids in kf.split(data.target, data.target): + train_data = _split_input_data_by_indexes(data, train_ids) + test_data = _split_input_data_by_indexes(data, test_ids, retain_first_target=retain_first_target) + yield train_data, test_data diff --git a/fedot/core/data/data_split.py b/fedot/core/data/data_split.py index f086cd9519..3f2d389a0b 100644 --- a/fedot/core/data/data_split.py +++ b/fedot/core/data/data_split.py @@ -1,6 +1,7 @@ from copy import deepcopy -from typing import Tuple, Union +from typing import Tuple, Optional, Union +import numpy as np from sklearn.model_selection import train_test_split from fedot.core.data.data import InputData @@ -9,238 +10,193 @@ from fedot.core.repository.tasks import TaskTypesEnum -def _split_time_series(data: InputData, task, *args, **kwargs): - """ Split time series data into train and test parts - - :param data: InputData object to split - :param task: task to solve - """ - - input_features = data.features - input_target = data.target - forecast_length = task.task_params.forecast_length - - if kwargs.get('validation_blocks') is not None: - # It is required to split data for in-sample forecasting - forecast_length = forecast_length * kwargs.get('validation_blocks') - x_train = input_features[:-forecast_length] - x_test = input_features - - y_train = input_target[:-forecast_length] - y_test = input_target[-forecast_length:] +def _split_input_data_by_indexes(origin_input_data: Union[InputData, MultiModalData], + index, + retain_first_target=False): + """ The function takes InputData or MultiModalData and returns + only the rows at the given positional indexes (positions, not values of idx), + e.g.
index = [0, 1, 2, 3] selects input_data.features[[0, 1, 2, 3], :] + :param origin_input_data: data to split + :param index: indexes to keep in the output data + :param retain_first_target: set to True to use only the first column of the target + """ + + if isinstance(origin_input_data, MultiModalData): + data = MultiModalData() + for key in origin_input_data: + data[key] = _split_input_data_by_indexes(origin_input_data[key], + index=index, + retain_first_target=retain_first_target) + return data + elif isinstance(origin_input_data, InputData): + idx = np.take(origin_input_data.idx, index, 0) + target = np.take(origin_input_data.target, index, 0) + features = np.take(origin_input_data.features, index, 0) + + if retain_first_target and len(target.shape) > 1: + target = target[:, 0] + + data = InputData(idx=idx, + features=features, + target=target, + task=deepcopy(origin_input_data.task), + data_type=origin_input_data.data_type, + supplementary_data=origin_input_data.supplementary_data) + return data else: - # Source time series divide into two parts - x_train = input_features[:-forecast_length] - x_test = input_features[:-forecast_length] + raise TypeError(f'Unknown data type {type(origin_input_data)}') - y_train = input_target[:-forecast_length] - y_test = input_target[-forecast_length:] - idx_train = data.idx[:-forecast_length] - idx_test = data.idx[-forecast_length:] - - # Prepare data to train the operation - train_data = InputData(idx=idx_train, features=x_train, target=y_train, - task=task, data_type=DataTypesEnum.ts, - supplementary_data=data.supplementary_data) - - test_data = InputData(idx=idx_test, features=x_test, target=y_test, - task=task, data_type=DataTypesEnum.ts, - supplementary_data=data.supplementary_data) - return train_data, test_data - - -def _split_multi_time_series(data: InputData, task, *args, **kwargs): - """ Split multi_ts time series data into train and test parts +def _split_time_series(data: InputData, + validation_blocks: Optional[int] = None, + **kwargs): + """ Split time series data into train and test parts :param data: InputData object to split - :param task: task to solve + :param validation_blocks: number of validation blocks to include in the test part """ - input_features = data.features - input_target = data.target - forecast_length = task.task_params.forecast_length - - if kwargs.get('validation_blocks') is not None: - # It is required to split data for in-sample forecasting - forecast_length = forecast_length * kwargs.get('validation_blocks') - x_train = input_features[:-forecast_length] - x_test = input_features - - y_train = input_target[:-forecast_length] - y_test = input_target[-forecast_length:, 0] - - else: - # Source time series divide into two parts - x_train = input_features[:-forecast_length] - x_test = input_features[:-forecast_length] - - y_train = input_target[:-forecast_length] - y_test = input_target[-forecast_length:, 0] - - idx_train = data.idx[:-forecast_length] - idx_test = data.idx[-forecast_length:] - - # Prepare data to train the operation - train_data = InputData(idx=idx_train, features=x_train, target=y_train, - task=task, data_type=DataTypesEnum.multi_ts, - supplementary_data=data.supplementary_data) - - test_data = InputData(idx=idx_test, features=x_test, target=y_test, - task=task, data_type=DataTypesEnum.multi_ts, - supplementary_data=data.supplementary_data) - - return train_data, test_data - - -def _split_any(data: InputData, task, data_type, split_ratio, with_shuffle=False, **kwargs): - """ Split any data into train and test parts + forecast_length = 
data.task.task_params.forecast_length + if validation_blocks is not None: + forecast_length *= validation_blocks - :param data: InputData object to split - :param task: task to solve - :param split_ratio: threshold for partitioning - :param data_type type of data to split - :param with_shuffle: is data needed to be shuffled or not - """ + target_length = len(data.target) + train_data = _split_input_data_by_indexes(data, index=np.arange(0, target_length - forecast_length)) + test_data = _split_input_data_by_indexes(data, index=np.arange(target_length - forecast_length, target_length), + retain_first_target=True) - if not 0. < split_ratio < 1.: - raise ValueError('Split ratio must belong to the interval (0; 1)') - random_state = 42 - - # Predictors and target - input_features = data.features - input_target = data.target - idx = data.idx - if task.task_type == TaskTypesEnum.classification and with_shuffle: - stratify = input_target + if validation_blocks is None: + # out-of-sample forecasting: test features stop before the forecast horizon + test_data.features = train_data.features else: - stratify = None - - idx_train, idx_test, x_train, x_test, y_train, y_test = \ - train_test_split(idx, - input_features, - input_target, - test_size=1. - split_ratio, - shuffle=with_shuffle, - random_state=random_state, - stratify=stratify) - - # Prepare data to train the operation - train_data = InputData(idx=idx_train, features=x_train, target=y_train, - task=task, data_type=data_type, - supplementary_data=data.supplementary_data) - - test_data = InputData(idx=idx_test, features=x_test, target=y_test, - task=task, data_type=data_type, - supplementary_data=data.supplementary_data) + # in-sample forecasting: test features cover the whole series + test_data.features = data.features return train_data, test_data -def _split_table(data: InputData, task, split_ratio, with_shuffle=False, **kwargs): - """ Split table data into train and test parts +def _split_any(data: InputData, + split_ratio: float, + shuffle: bool, + stratify: bool, + random_seed: int, + **kwargs): + """ Split any data except timeseries into train and test parts :param data: InputData object to split - :param task: task to solve - :param split_ratio: threshold for partitioning - :param with_shuffle: is data needed to be shuffled or not + :param split_ratio: share of train data between 0 and 1 + :param shuffle: whether the data needs to be shuffled + :param stratify: whether to make a stratified sample + :param random_seed: random seed for shuffling """ - return _split_any(data, task, DataTypesEnum.table, split_ratio, with_shuffle) - -def _split_image(data: InputData, task, split_ratio, with_shuffle=False, **kwargs): - """ Split image data into train and test parts - - :param data: InputData object to split - :param task: task to solve - :param split_ratio: threshold for partitioning - :param with_shuffle: is data needed to be shuffled or not - """ - - return _split_any(data, task, DataTypesEnum.image, split_ratio, with_shuffle) - - -def _split_text(data: InputData, task, split_ratio, with_shuffle=False, **kwargs): - """ Split text data into train and test parts - - :param data: InputData object to split - :param task: task to solve - :param split_ratio: threshold for partitioning - :param with_shuffle: is data needed to be shuffled or not - """ + stratify_labels = data.target if stratify else None - return _split_any(data, task, DataTypesEnum.text, split_ratio, with_shuffle) + train_ids, test_ids = train_test_split(np.arange(0, len(data.target)), + test_size=1. 
- split_ratio, + shuffle=shuffle, + random_state=random_seed, + stratify=stratify_labels) - -def _train_test_single_data_setup(data: InputData, split_ratio=0.8, - shuffle_flag=False, **kwargs) -> Tuple[InputData, InputData]: - """ Function for train and test split - - :param data: InputData for train and test splitting - :param split_ratio: threshold for partitioning - :param shuffle_flag: is data needed to be shuffled or not - - :return train_data: InputData for train - :return test_data: InputData for validation - """ - # Split into train and test - if data is not None: - task = data.task - - split_func_dict = { - DataTypesEnum.multi_ts: _split_multi_time_series, - DataTypesEnum.ts: _split_time_series, - DataTypesEnum.table: _split_table, - DataTypesEnum.image: _split_image, - DataTypesEnum.text: _split_text - } - - split_func = split_func_dict.get(data.data_type, _split_table) - - train_data, test_data = split_func(data, task, split_ratio, - with_shuffle=shuffle_flag, - **kwargs) - else: - raise ValueError('InputData must be not empty') - - # Store additional information - train_data.supplementary_data = deepcopy(data.supplementary_data) - test_data.supplementary_data = deepcopy(data.supplementary_data) - return train_data, test_data - - -def _train_test_multi_modal_data_setup(data: MultiModalData, split_ratio=0.8, - shuffle_flag=False, **kwargs) -> Tuple[MultiModalData, MultiModalData]: - train_data = MultiModalData() - test_data = MultiModalData() - for node in data.keys(): - data_part = data[node] - train_data_part, test_data_part = train_test_data_setup(data_part, split_ratio, shuffle_flag, **kwargs) - train_data[node] = train_data_part - test_data[node] = test_data_part + train_data = _split_input_data_by_indexes(data, index=train_ids) + test_data = _split_input_data_by_indexes(data, index=test_ids) return train_data, test_data -def train_test_data_setup(data: Union[InputData, MultiModalData], split_ratio=0.8, - shuffle_flag=False, **kwargs) -> Tuple[Union[InputData, MultiModalData], - Union[InputData, MultiModalData]]: +def _are_stratification_allowed(data: Union[InputData, MultiModalData], split_ratio: float) -> bool: + """ Check that stratification may be done + :param data: data for split + :param split_ratio: ratio of train data length to the whole data length + :return bool: stratification is allowed""" + + # check task_type + if data.task.task_type is not TaskTypesEnum.classification: + return False + + try: + # fast way + classes = np.unique(data.target, return_counts=True) + except Exception: + # slow way + from collections import Counter + classes = Counter(data.target) + classes = [list(classes), list(classes.values())] + + # check that there are enough labels for two samples + if not all(x > 1 for x in classes[1]): + if __debug__: + # tests often use very small datasets that are not suitable for data splitting + # stratification is disabled for tests + return False + else: + raise ValueError(("There is only one value for some classes:" + f" {', '.join(str(val) for val, count in zip(*classes) if count == 1)}." + f" Data split cannot be done for {data.task.task_type.name} task.")) + + # check that the split ratio allows all classes to get into both samples + test_size = round(len(data.target) * (1. 
- split_ratio)) + labels_count = len(classes[0]) + if test_size < labels_count: + return False + + return True + + +def train_test_data_setup(data: Union[InputData, MultiModalData], + split_ratio: float = 0.8, + shuffle: bool = False, + shuffle_flag: bool = False, + stratify: bool = True, + random_seed: int = 42, + validation_blocks: Optional[int] = None) -> Tuple[Union[InputData, MultiModalData], + Union[InputData, MultiModalData]]: """ Function for train and test split for both InputData and MultiModalData - Args: - data: data for train and test splitting - split_ratio: threshold for partitioning - shuffle_flag: is data needed to be shuffled or not - kwargs: additional optional parameters such as number of validation blocks - - Returns: - data for train, data for validation + :param data: InputData object to split + :param split_ratio: share of train data between 0 and 1 + :param shuffle: whether the data needs to be shuffled + :param shuffle_flag: same as shuffle, kept for backward compatibility + :param stratify: whether to make a stratified sample + :param random_seed: random seed for shuffling + :param validation_blocks: number of validation blocks to include in the test part + + :return: data for train, data for validation """ + + # for backward compatibility + shuffle |= shuffle_flag + # check that stratification may be done + stratify &= _are_stratification_allowed(data, split_ratio) + # stratification is allowed only with shuffle + shuffle |= stratify + # a random seed makes sense only with shuffle and vice versa + random_seed = (random_seed or 42) if shuffle else None + + input_arguments = {'split_ratio': split_ratio, + 'shuffle': shuffle, + 'stratify': stratify, + 'random_seed': random_seed, + 'validation_blocks': validation_blocks} if isinstance(data, InputData): - train_data, test_data = _train_test_single_data_setup(data, split_ratio, shuffle_flag, **kwargs) + split_func_dict = {DataTypesEnum.multi_ts: _split_time_series, + DataTypesEnum.ts: _split_time_series, + DataTypesEnum.table: _split_any, + DataTypesEnum.image: _split_any, + DataTypesEnum.text: _split_any} + + if data.data_type not in split_func_dict: + raise TypeError((f'Unknown data type {data.data_type}. Supported data types:' + f' {", ".join(str(x) for x in split_func_dict)}')) + + split_func = split_func_dict[data.data_type] + train_data, test_data = split_func(data, **input_arguments) elif isinstance(data, MultiModalData): - train_data, test_data = _train_test_multi_modal_data_setup(data, split_ratio, shuffle_flag, **kwargs) + train_data, test_data = MultiModalData(), MultiModalData() + for node in data.keys(): + train_data[node], test_data[node] = train_test_data_setup(data[node], **input_arguments) else: - raise ValueError(f'Dataset {type(data)} is not supported') + raise ValueError((f'Dataset {type(data)} is not supported. 
Supported types:' + ' InputData, MultiModalData')) return train_data, test_data diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index 68db60ce42..b75e70076c 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -362,7 +362,7 @@ def _find_binary_features(self, numerical_features: np.array): # Calculate unique values per column (excluding nans) for column_id, col in enumerate(df): unique_values = df[col].dropna().unique() - if len(unique_values) <= 2: + if len(unique_values) == 2: # Current numerical column has only two values column_info = {column_id: {'min': min(unique_values), 'max': max(unique_values)}} diff --git a/fedot/core/optimisers/objective/data_objective_advisor.py b/fedot/core/optimisers/objective/data_objective_advisor.py deleted file mode 100644 index 48972aaf8a..0000000000 --- a/fedot/core/optimisers/objective/data_objective_advisor.py +++ /dev/null @@ -1,40 +0,0 @@ -import numpy as np -from sklearn.model_selection import StratifiedKFold, KFold -from sklearn.model_selection._split import _BaseKFold -from typing import Type - -from fedot.core.data.data import InputData -from fedot.core.repository.tasks import TaskTypesEnum - - -class DataObjectiveAdvisor: - def __init__(self, threshold: float = 0.5): - """ - Advisor for DataObjectiveBuilder for choice some parameters based on input_data - - :param threshold: threshold level for difference between uniform probabilities and real probabilities - """ - self.threshold = threshold - - def propose_kfold(self, input_data: InputData) -> Type[_BaseKFold]: - """ - Method to choose he most suitable strategy for making folds - - :param input_data: data to analyse - """ - if input_data.task.task_type is TaskTypesEnum.classification and self.check_imbalance(input_data): - return StratifiedKFold - else: - return KFold - - def check_imbalance(self, input_data: InputData) -> bool: - """ - Checks data for imbalance - if probability of any class lower than uniform probability in threshold times it - returns true - :param input_data: data to analyse - - """ - _, counts = np.unique(input_data.target, return_counts=True) - probabilities = counts / input_data.target.shape[0] - uniform_probability = 1 / input_data.num_classes - return np.any(np.abs(uniform_probability - probabilities) / uniform_probability > self.threshold) diff --git a/fedot/core/optimisers/objective/data_source_splitter.py b/fedot/core/optimisers/objective/data_source_splitter.py index 756c2151d8..70e5dd7e30 100644 --- a/fedot/core/optimisers/objective/data_source_splitter.py +++ b/fedot/core/optimisers/objective/data_source_splitter.py @@ -1,17 +1,16 @@ from functools import partial -from typing import Optional +from typing import Optional, Union from golem.core.log import default_log from fedot.core.constants import default_data_split_ratio_by_task from fedot.core.data.data import InputData -from fedot.core.data.data_split import train_test_data_setup +from fedot.core.data.data_split import train_test_data_setup, _are_stratification_allowed from fedot.core.data.multi_modal import MultiModalData -from fedot.core.optimisers.objective.data_objective_advisor import DataObjectiveAdvisor from fedot.core.optimisers.objective.data_objective_eval import 
DataSource from fedot.core.repository.tasks import TaskTypesEnum -from fedot.core.validation.split import tabular_cv_generator, ts_cv_generator from fedot.remote.remote_evaluator import RemoteEvaluator, init_data_for_remote_execution +from fedot.core.data.cv_folds import cv_generator class DataSourceSplitter: @@ -20,43 +19,82 @@ class DataSourceSplitter: Can provide hold-out validation and k-fold validation. :param cv_folds: Number of folds on data for cross-validation. - If provided, then k-fold validation is used. Otherwise, hold-out validation is used. + If provided, then cross validation is used. Otherwise, hold-out validation is used. + :param validation_blocks: Validation blocks count. + Applied only for time series data. + If not provided, then value will be calculated. :param split_ratio: Ratio of data for splitting. Applied only in case of hold-out split. If not provided, then default split ratios will be used. :param shuffle: Is shuffling required for data. + :param stratify: If True, then stratified sampling is used. + :param random_seed: Random seed for shuffle. """ def __init__(self, cv_folds: Optional[int] = None, validation_blocks: Optional[int] = None, split_ratio: Optional[float] = None, - shuffle: bool = False): + shuffle: bool = False, + stratify: bool = True, + random_seed: int = 42): self.cv_folds = cv_folds self.validation_blocks = validation_blocks self.split_ratio = split_ratio self.shuffle = shuffle - self.advisor = DataObjectiveAdvisor() + self.stratify = stratify + self.random_seed = random_seed self.log = default_log(self) - def build(self, data: InputData) -> DataSource: - # Shuffle data - if self.shuffle and data.task.task_type is not TaskTypesEnum.ts_forecasting: - data.shuffle() + def build(self, data: Union[InputData, MultiModalData]) -> DataSource: + # Define split_ratio + self.split_ratio = self.split_ratio or default_data_split_ratio_by_task[data.task.task_type] + + # Check cv_folds + if self.cv_folds is not None: + try: + self.cv_folds = int(self.cv_folds) + except ValueError: + raise ValueError(f"cv_folds is not an integer: {self.cv_folds}") + if self.cv_folds < 2: + self.cv_folds = None + elif self.cv_folds > data.target.shape[0] - 1: + raise ValueError((f"cv_folds ({self.cv_folds}) is greater than" + f" the maximum allowed count {data.target.shape[0] - 1}")) + + # Calculate the number of validation blocks for timeseries forecasting + if data.task.task_type is TaskTypesEnum.ts_forecasting and self.validation_blocks is None: + self._propose_cv_folds_and_validation_blocks(data) # Check split_ratio - split_ratio = self.split_ratio or default_data_split_ratio_by_task[data.task.task_type] - if not (0 < split_ratio < 1): - raise ValueError(f'split_ratio is {split_ratio} but should be between 0 and 1') + if self.cv_folds is None and not (0 < self.split_ratio < 1): + raise ValueError(f'split_ratio is {self.split_ratio} but should be between 0 and 1') + + if self.stratify: + # check that stratification can be done + # for cross validation the train share is defined as 1 - validation_size / all_data_size + split_ratio = self.split_ratio if self.cv_folds is None else (1 - 1 / (self.cv_folds + 1)) + self.stratify = _are_stratification_allowed(data, split_ratio) + if not self.stratify: + self.log.info("Stratified splitting of data is disabled.") - # Calculate the number of validation blocks - if self.validation_blocks is None and data.task.task_type is TaskTypesEnum.ts_forecasting: - self._propose_cv_folds_and_validation_blocks(data, 
split_ratio) + # Stratification cannot be done without shuffle + self.shuffle |= self.stratify + + # Random seed depends on shuffle + self.random_seed = (self.random_seed or 42) if self.shuffle else None # Split data if self.cv_folds is not None: self.log.info("K-folds cross validation is applied.") - data_producer = self._build_kfolds_producer(data) + data_producer = partial(cv_generator, + data=data, + shuffle=self.shuffle, + cv_folds=self.cv_folds, + random_seed=self.random_seed, + stratify=self.stratify, + validation_blocks=self.validation_blocks) else: self.log.info("Hold out validation is applied.") data_producer = self._build_holdout_producer(data) @@ -73,31 +111,22 @@ def _build_holdout_producer(self, data: InputData) -> DataSource: that always returns same data split. Equivalent to 1-fold validation. """ - split_ratio = self.split_ratio or default_data_split_ratio_by_task[data.task.task_type] - train_data, test_data = train_test_data_setup(data, split_ratio, validation_blocks=self.validation_blocks) + train_data, test_data = train_test_data_setup(data, + split_ratio=self.split_ratio, + stratify=self.stratify, + random_seed=self.random_seed, + shuffle=self.shuffle, + validation_blocks=self.validation_blocks) if RemoteEvaluator().is_enabled: init_data_for_remote_execution(train_data) return partial(self._data_producer, train_data, test_data) - def _build_kfolds_producer(self, data: InputData) -> DataSource: - if isinstance(data, MultiModalData): - raise NotImplementedError('Cross-validation is not supported for multi-modal data') - if data.task.task_type is TaskTypesEnum.ts_forecasting: - # Perform time series cross validation - cv_generator = partial(ts_cv_generator, data, - self.cv_folds, - self.validation_blocks, - self.log) - else: - cv_generator = partial(tabular_cv_generator, data, - self.cv_folds, - self.advisor.propose_kfold(data)) - return cv_generator - - def _propose_cv_folds_and_validation_blocks(self, data, split_ratio): + def _propose_cv_folds_and_validation_blocks(self, data, expected_window_size=20): data_shape = data.target.shape[0] + # first expected_window_size points should be guaranteed for prediction at the fit stage + data_shape -= expected_window_size forecast_length = data.task.task_params.forecast_length # check that cv folds may be realized if self.cv_folds is not None: @@ -118,14 +147,13 @@ def _propose_cv_folds_and_validation_blocks(self, data, split_ratio): " Cross validation is switched off.")) if self.cv_folds is None: - test_shape = int(data_shape * (1 - split_ratio)) + test_shape = int(data_shape * (1 - self.split_ratio)) if forecast_length > test_shape: - split_ratio = 1 - forecast_length / data_shape + self.split_ratio = 1 - forecast_length / data_shape self.log.info((f"Forecast length ({forecast_length}) is greater than test length" f" ({test_shape}) defined by split ratio." 
- f" Split ratio is changed to {split_ratio}.")) - test_share = 1 - split_ratio - self.split_ratio = split_ratio + f" Split ratio is changed to {self.split_ratio}.")) + test_share = 1 - self.split_ratio else: test_share = 1 / (self.cv_folds + 1) self.validation_blocks = int(data_shape * test_share // forecast_length) diff --git a/fedot/core/validation/__init__.py b/fedot/core/validation/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/fedot/core/validation/split.py b/fedot/core/validation/split.py deleted file mode 100644 index b5bab02a7a..0000000000 --- a/fedot/core/validation/split.py +++ /dev/null @@ -1,166 +0,0 @@ -from typing import Iterator, Optional, Tuple, Type - -import numpy as np -from golem.core.log import LoggerAdapter, default_log -from sklearn.model_selection import KFold, TimeSeriesSplit -from sklearn.model_selection._split import _BaseKFold - -from fedot.core.data.data import InputData -from fedot.core.data.data_split import train_test_data_setup -from fedot.core.repository.dataset_types import DataTypesEnum - - -class OneFoldInputDataSplit: - """ Perform one fold split (hold out) for InputData structures """ - - def __init__(self): - pass - - @staticmethod - def input_split(input_data: InputData, **kwargs): - # Train test split - train_input, test_input = train_test_data_setup(input_data, **kwargs) - - yield train_input, test_input - - -class TsInputDataSplit(TimeSeriesSplit): - """ Perform time series splitting for cross validation on InputData structures. - The difference between TimeSeriesSplit (sklearn) and TsInputDataSplit can be - demonstrated by an example: - The time series [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] must be splitted into 3 - parts, where the size of each fold for validation will be 2 elements. 
- TimeSeriesSplit (return indices) - train - [0, 1, 2, 3] test - [4, 5] - train - [0, 1, 2, 3, 4, 5] test - [6, 7] - train - [0, 1, 2, 3, 4, 5, 6, 7] test - [8, 9] - TsInputDataSplit (return values of time series) - train - [1, 2, 3, 4] test - [1, 2, 3, 4, 5, 6] - train - [1, 2, 3, 4, 5, 6] test - [1, 2, 3, 4, 5, 6, 7, 8] - train - [1, 2, 3, 4, 5, 6, 7, 8] test - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - """ - - def __init__(self, validation_blocks: int, **params): - super().__init__(**params) - self.validation_blocks = validation_blocks - self.params = params - - def input_split(self, input_data: InputData) -> Iterator[Tuple[InputData, InputData]]: - """ Splitting into datasets for train and validation using - "in-sample forecasting" algorithm - - :param input_data: InputData for splitting - """ - # Transform InputData into numpy array - data_for_split = np.array(input_data.target) - - for train_ids, test_ids in super().split(data_for_split): - if len(train_ids) <= len(test_ids): - raise ValueError("Train size will be too small with selected number of folds and validation blocks") - # Return train part by ids - train_features, train_target = _ts_data_by_index(train_ids, train_ids, input_data) - train_data = InputData(idx=np.arange(0, len(train_target)), - features=train_features, target=train_target, - task=input_data.task, - data_type=input_data.data_type, - supplementary_data=input_data.supplementary_data) - - # Unit all ids for "in-sample validation" - all_ids = np.hstack((train_ids, test_ids)) - # In-sample validation dataset - val_features, val_target = _ts_data_by_index(all_ids, all_ids, input_data) - validation_data = InputData(idx=np.arange(0, len(val_target)), - features=val_features, target=val_target, - task=input_data.task, - data_type=input_data.data_type, - supplementary_data=input_data.supplementary_data) - - yield train_data, validation_data - - -def tabular_cv_generator(data: InputData, - folds: int, - split_method: Type[_BaseKFold] = KFold) -> Iterator[Tuple[InputData, InputData]]: - """ The function for splitting data into a train and test samples - in the InputData format for KFolds cross validation. The function - return a generator of tuples, consisting of a pair of train, test. - - :param data: InputData for train and test splitting - :param folds: number of folds - :param split_method: method to split data (f.e. 
stratify KFold) - - :return Iterator[InputData, InputData]: return split train/test data - """ - kf = split_method(n_splits=folds, shuffle=True, random_state=42) - - for train_idxs, test_idxs in kf.split(data.features, data.target): - train_features, train_target = _table_data_by_index(train_idxs, data) - test_features, test_target = _table_data_by_index(test_idxs, data) - - idx_for_train = np.arange(0, len(train_features)) - idx_for_test = np.arange(0, len(test_features)) - - train_data = InputData(idx=idx_for_train, - features=train_features, - target=train_target, - task=data.task, - data_type=data.data_type, - supplementary_data=data.supplementary_data) - test_data = InputData(idx=idx_for_test, - features=test_features, - target=test_target, - task=data.task, - data_type=data.data_type, - supplementary_data=data.supplementary_data) - - yield train_data, test_data - - -def ts_cv_generator(data: InputData, folds: int, - validation_blocks: int = 1, log: Optional[LoggerAdapter] = None) \ - -> Iterator[Tuple[InputData, InputData]]: - """ Splitting data for time series cross validation - - :param data: source InputData with time series data type - :param folds: number of folds - :param validation_blocks: number of validation block per each fold - :param log: log object - """ - if not log: - log = default_log(prefix=__name__) - validation_blocks = int(validation_blocks) - # Forecast horizon for each fold - horizon = data.task.task_params.forecast_length * validation_blocks - - try: - tscv = TsInputDataSplit(gap=0, validation_blocks=validation_blocks, - n_splits=folds, test_size=horizon) - for train_data, test_data in tscv.input_split(data): - yield train_data, test_data - except ValueError: - log.info(f'Time series length too small for cross validation with {folds} folds. 
Perform one fold validation') - # Perform one fold validation (folds parameter will be ignored) - - one_fold_split = OneFoldInputDataSplit() - for train_data, test_data in one_fold_split.input_split(data, validation_blocks=validation_blocks): - yield train_data, test_data - - -def _table_data_by_index(index, values: InputData): - """ Allow to get tabular data by indexes of elements """ - features = values.features[index, :] - target = np.take(values.target, index) - - return features, target - - -def _ts_data_by_index(train_ids, test_ids, data): - """ Allow to get time series data by indexes of elements """ - features = data.features[train_ids] - target = data.target[test_ids] - - # Use only the first time-series as target for multi_ts - if data.data_type == DataTypesEnum.multi_ts: - target = target[:, 0] - - return features, target diff --git a/test/integration/api/test_api_utils.py b/test/integration/api/test_api_utils.py index b855592b3c..fd2f7877ce 100644 --- a/test/integration/api/test_api_utils.py +++ b/test/integration/api/test_api_utils.py @@ -33,7 +33,7 @@ def test_output_binary_classification_correct(): data = get_binary_classification_data() - train_data, test_data = train_test_data_setup(data, shuffle_flag=True) + train_data, test_data = train_test_data_setup(data, shuffle=True) model = Fedot(problem=task_type, seed=1, timeout=0.1) model.fit(train_data, predefined_model='logit') diff --git a/test/integration/api/test_main_api.py b/test/integration/api/test_main_api.py index 3afc0f5b7f..70c845bbf0 100644 --- a/test/integration/api/test_main_api.py +++ b/test/integration/api/test_main_api.py @@ -75,7 +75,7 @@ def get_dataset(task_type: str, validation_blocks: Optional[int] = None, n_sampl data = get_iris_data() else: data = get_synthetic_classification_data(n_samples=n_samples, n_features=n_features, random_state=42) - train_data, test_data = train_test_data_setup(data, shuffle_flag=True) + train_data, test_data = train_test_data_setup(data, shuffle=True) threshold = 0.95 elif task_type == 'clustering': data = get_synthetic_input_data(n_samples=100) @@ -110,7 +110,7 @@ def load_categorical_unimodal(): dataset_path = 'test/data/classification_with_categorical.csv' full_path = os.path.join(str(fedot_project_root()), dataset_path) data = InputData.from_csv(full_path) - train_data, test_data = train_test_data_setup(data, shuffle_flag=True) + train_data, test_data = train_test_data_setup(data, shuffle=True) return train_data, test_data diff --git a/test/unit/data/test_data_split.py b/test/unit/data/test_data_split.py index 5cf904921e..b4f2b85cc7 100644 --- a/test/unit/data/test_data_split.py +++ b/test/unit/data/test_data_split.py @@ -12,7 +12,7 @@ from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.tasks import Task, TaskTypesEnum, TsForecastingParams from fedot.core.utils import split_data -from fedot.core.validation.split import tabular_cv_generator, ts_cv_generator +from fedot.core.data.cv_folds import cv_generator from test.unit.pipelines.test_decompose_pipelines import get_classification_data from test.unit.tasks.test_forecasting import get_ts_data @@ -22,10 +22,10 @@ IMAGE_SIMPLE = {'train_features_size': (8, 5, 5, 2), 'test_features_size': (2, 5, 5, 2), 'test_idx': (8, 9)} -def get_tabular_classification_data(): +def get_tabular_classification_data(length=10, class_count=2): task = Task(TaskTypesEnum.classification) - features = np.full((10, 5), 1, dtype=float) - target = np.repeat(np.array([1, 2]), 5).reshape((-1, 1)) + features = 
np.full((length, 5), 1, dtype=float) + target = np.repeat(np.array(list(range(1, class_count + 1))), length // class_count).reshape((-1, 1)) input_data = InputData(idx=np.arange(0, len(features)), features=features, target=target, task=task, data_type=DataTypesEnum.table) return input_data @@ -106,6 +106,18 @@ def get_balanced_data_to_test_mismatch(): return input_data +def check_shuffle(sample): + unique = np.unique(np.diff(sample.idx)) + test_result = len(unique) > 1 or np.min(unique) > 1 + return test_result + + +def check_stratify(train, test): + deltas = [np.unique(np.sort(train.target), return_counts=True)[1], + np.unique(np.sort(test.target), return_counts=True)[1]] + return np.allclose(*[delta / sum(delta) for delta in deltas]) + + def test_split_data(): dataframe = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6], @@ -126,7 +138,7 @@ def test_split_data(): def test_default_train_test_simple(data_generator: Callable, expected_output: dict): """ Check if simple splitting perform correctly for all used in FEDOT data types """ input_data = data_generator() - train_data, test_data = train_test_data_setup(input_data) + train_data, test_data = train_test_data_setup(input_data, stratify=False) assert train_data.features.shape == expected_output['train_features_size'] assert test_data.features.shape == expected_output['test_features_size'] @@ -146,11 +158,8 @@ def test_advanced_time_series_splitting(): @pytest.mark.parametrize('data_splitter, data', - # test StratifiedKFold [(DataSourceSplitter(cv_folds=3, shuffle=True), get_imbalanced_data_to_test_mismatch()), - # test KFold - # (DataSourceSplitter(cv_folds=3, shuffle=True), get_balanced_data_to_test_mismatch()), - # test hold-out + (DataSourceSplitter(cv_folds=3, shuffle=True), get_balanced_data_to_test_mismatch()), (DataSourceSplitter(shuffle=True), get_imbalanced_data_to_test_mismatch()), ]) def test_data_splitting_without_shape_mismatch(data_splitter: DataSourceSplitter, data: InputData): @@ -193,18 +202,70 @@ def test_multivariate_time_series_splitting_correct(): assert np.allclose(test_series_data.target, np.array([16, 17, 18, 19])) +@pytest.mark.parametrize(('datas_funs', 'cv_folds', 'shuffle', 'stratify'), + [ + # classification + stratify + shuffle + cv_folds + ([partial(get_tabular_classification_data, 100, 5)] * 3, 4, True, True), + # classification + shuffle + cv_folds + ([partial(get_tabular_classification_data, 100, 5)] * 3, 4, True, False), + # classification + cv_folds + ([partial(get_tabular_classification_data, 100, 5)] * 3, 4, False, False), + # classification + stratify + shuffle + ([partial(get_tabular_classification_data, 100, 5)] * 3, None, True, True), + # classification + shuffle + ([partial(get_tabular_classification_data, 100, 5)] * 3, None, True, False), + # classification + ([partial(get_tabular_classification_data, 100, 5)] * 3, None, False, False), + # timeseries + cv_folds + ([partial(get_ts_data_to_forecast, 10, 100)] * 3, 3, False, False), + # timeseries + ([partial(get_ts_data_to_forecast, 10, 100)] * 3, None, False, False), + ]) +def test_multimodal_data_splitting_is_correct(datas_funs, cv_folds, shuffle, stratify): + mdata = MultiModalData({f'data_{i}': data_fun() for i, data_fun in enumerate(datas_funs)}) + data_splitter = DataSourceSplitter(cv_folds=cv_folds, shuffle=shuffle, stratify=stratify) + data_producer = data_splitter.build(mdata) + keys = tuple(mdata.keys()) + features_dimensity = [subdata.features.shape[1:] for subdata in mdata.values()] + + for samples in data_producer(): + for sample in samples: + 
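# the producer yields (train, test) pairs; each sample must remain a MultiModalData with aligned sources +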
assert isinstance(sample, MultiModalData) + + # keys should be the same + assert set(keys) == set(sample.keys()) + + # idx should be the same + idx = [np.reshape(x.idx, (-1, 1)) for x in sample.values()] + assert np.all(np.diff(np.concatenate(idx, 1), 1) == 0) + + # dimensionality of features should be the same + splitted_data_features_dimensity = [subdata.features.shape[1:] for subdata in sample.values()] + assert features_dimensity == splitted_data_features_dimensity + + # shuffle should be done + if shuffle: + for key in keys: + assert check_shuffle(sample[key]) + + # stratify should be done + if stratify: + for key in keys: + assert check_stratify(samples[0][key], samples[1][key]) + + @pytest.mark.parametrize("cv_generator, data", - [(partial(tabular_cv_generator, folds=5), + [(partial(cv_generator, cv_folds=5), get_classification_data()[0]), - (partial(ts_cv_generator, folds=3, validation_blocks=2), + (partial(cv_generator, cv_folds=3, validation_blocks=2), get_ts_data()[0])]) def test_cv_generator_works_stable(cv_generator, data): """ Test if ts cv generator works stable (always return same folds) """ idx_first = [] idx_second = [] - for row in cv_generator(data=data): + for row in cv_generator(data=data, stratify=False, random_seed=None): idx_first.append(row[1].idx) - for row in cv_generator(data=data): + for row in cv_generator(data=data, stratify=False, random_seed=None): idx_second.append(row[1].idx) for i in range(len(idx_first)): @@ -225,9 +286,40 @@ def test_data_splitting_defines_validation_blocks_correctly(forecast_length, cv_ check_cv_folds, check_split_ratio, check_validation_blocks): """ Checks if validation blocks count defines correctly for different data """ - data = get_ts_data_to_forecast(forecast_length) + data = get_ts_data_to_forecast(forecast_length, 120) data_source_splitter = DataSourceSplitter(cv_folds=cv_folds, split_ratio=split_ratio) data_source_splitter.build(data) assert data_source_splitter.cv_folds == check_cv_folds assert data_source_splitter.split_ratio == check_split_ratio assert data_source_splitter.validation_blocks == check_validation_blocks + + +@pytest.mark.parametrize(('cv_folds', 'shuffle', 'stratify', 'data_classes'), + [(2, True, True, 2), # simple case + (2, False, True, 2), # should work without error + (5, True, True, 4), # more folds and more classes + ]) +def test_stratify(cv_folds, shuffle, stratify, data_classes): + data = get_tabular_classification_data(length=100, class_count=data_classes) + data_splitter = DataSourceSplitter(cv_folds=cv_folds, shuffle=shuffle, stratify=stratify) + data_producer = data_splitter.build(data) + + for train, test in data_producer(): + assert check_stratify(train, test) + + +@pytest.mark.parametrize(('is_shuffle', 'shuffle', 'cv_folds', 'data'), + [(True, True, 2, + get_tabular_classification_data(length=100, class_count=4)), # cv_folds classification + (True, True, None, + get_tabular_classification_data(length=100, class_count=4)), # holdout classification + (False, True, 2, get_ts_data_to_forecast(10, 100)), # cv_folds timeseries + (False, True, None, get_ts_data_to_forecast(10, 100)), # holdout timeseries + ]) +def test_shuffle(is_shuffle, cv_folds, shuffle, data): + data_splitter = DataSourceSplitter(cv_folds=cv_folds, shuffle=shuffle, stratify=False) + data_producer = data_splitter.build(data) + + for samples in data_producer(): + for sample in samples: + assert check_shuffle(sample) == is_shuffle diff --git a/test/unit/optimizer/test_pipeline_objective_eval.py 
b/test/unit/optimizer/test_pipeline_objective_eval.py index 657b7a201e..1c1ce688e7 100644 --- a/test/unit/optimizer/test_pipeline_objective_eval.py +++ b/test/unit/optimizer/test_pipeline_objective_eval.py @@ -1,6 +1,5 @@ import datetime from copy import deepcopy -from functools import partial import numpy as np import pytest @@ -17,7 +16,6 @@ from fedot.core.repository.quality_metrics_repository import ClassificationMetricsEnum, MetricsRepository, \ RegressionMetricsEnum from fedot.core.repository.tasks import Task, TaskTypesEnum -from fedot.core.validation.split import tabular_cv_generator, OneFoldInputDataSplit from test.integration.models.test_model import classification_dataset, classification_dataset_with_str_labels from test.unit.tasks.test_forecasting import get_simple_ts_pipeline from test.unit.validation.test_table_cv import sample_pipeline @@ -75,12 +73,12 @@ def empty_datasource(): ) def test_pipeline_objective_evaluate_with_different_metrics(classification_dataset, pipeline): for metric in ClassificationMetricsEnum: - one_fold_split = OneFoldInputDataSplit() - data_split = partial(one_fold_split.input_split, input_data=classification_dataset) + data_producer = DataSourceSplitter(cv_folds=None).build(classification_dataset) check_pipeline = deepcopy(pipeline) - objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), data_split) + objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), + data_producer=data_producer) fitness = objective_eval(pipeline) - act_fitness = actual_fitness(data_split, check_pipeline, metric) + act_fitness = actual_fitness(data_producer, check_pipeline, metric) assert fitness.valid assert fitness.value is not None assert np.isclose(fitness.value, act_fitness.value, atol=1e-8), metric.name @@ -92,8 +90,8 @@ def test_pipeline_objective_evaluate_with_different_metrics(classification_datas ) def test_pipeline_objective_evaluate_with_different_metrics_with_str_labes(pipeline): for metric in ClassificationMetricsEnum: - one_fold_split = OneFoldInputDataSplit() - data_split = partial(one_fold_split.input_split, input_data=classification_dataset_with_str_labels()) + data_splitter = DataSourceSplitter() + data_split = data_splitter.build(classification_dataset_with_str_labels()) check_pipeline = deepcopy(pipeline) objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), data_split) fitness = objective_eval(pipeline) @@ -105,11 +103,11 @@ def test_pipeline_objective_evaluate_with_different_metrics_with_str_labes(pipel def test_pipeline_objective_evaluate_with_empty_pipeline(classification_dataset): pipeline = empty_pipeline() - - data_split = partial(OneFoldInputDataSplit().input_split, input_data=classification_dataset) + data_producer = DataSourceSplitter(cv_folds=None).build(classification_dataset) metric = ClassificationMetricsEnum.ROCAUC_penalty - objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), data_split) + objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), + data_producer=data_producer) with pytest.raises(AttributeError): objective_eval(pipeline) @@ -117,10 +115,11 @@ def test_pipeline_objective_evaluate_with_empty_pipeline(classification_dataset) def test_pipeline_objective_evaluate_with_cv_fold(classification_dataset): pipeline = sample_pipeline() - cv_fold = partial(tabular_cv_generator, classification_dataset, folds=5) + data_producer = DataSourceSplitter(cv_folds=5).build(classification_dataset) metric = ClassificationMetricsEnum.logloss - objective_eval = 
PipelineObjectiveEvaluate(MetricsObjective(metric), cv_fold) + objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), + data_producer=data_producer) fitness = objective_eval(pipeline) assert fitness.valid assert fitness.value is not None @@ -140,16 +139,20 @@ def test_pipeline_objective_evaluate_with_empty_datasource(classification_datase def test_pipeline_objective_evaluate_with_time_constraint(classification_dataset): pipeline = sample_pipeline() - data_split = partial(OneFoldInputDataSplit().input_split, input_data=classification_dataset) + data_producer = DataSourceSplitter(cv_folds=None).build(classification_dataset) metric = ClassificationMetricsEnum.ROCAUC_penalty time_constraint = datetime.timedelta(seconds=0.0001) - objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), data_split, time_constraint=time_constraint) + objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), + data_producer=data_producer, + time_constraint=time_constraint) fitness = objective_eval(pipeline) assert not fitness.valid time_constraint = datetime.timedelta(seconds=300) - objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), data_split, time_constraint=time_constraint) + objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metric), + data_producer=data_producer, + time_constraint=time_constraint) fitness = objective_eval(pipeline) assert fitness.valid assert fitness.value is not None @@ -164,9 +167,9 @@ def test_pipeline_objective_evaluate_with_invalid_metrics(classification_dataset with pytest.raises(Exception): pipeline = sample_pipeline() - data_split = partial(OneFoldInputDataSplit().input_split, input_data=classification_dataset) - - objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metrics), data_split) + data_producer = DataSourceSplitter(cv_folds=None).build(classification_dataset) + objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metrics), + data_producer=data_producer) objective_eval(pipeline) diff --git a/test/unit/preprocessing/test_pipeline_preprocessing.py b/test/unit/preprocessing/test_pipeline_preprocessing.py index e875017ff4..dcba57ba9e 100644 --- a/test/unit/preprocessing/test_pipeline_preprocessing.py +++ b/test/unit/preprocessing/test_pipeline_preprocessing.py @@ -247,7 +247,7 @@ def test_data_with_mixed_types_per_column_processed_correctly(): processed correctly. 
""" input_data = data_with_mixed_types_in_each_column() - train_data, test_data = train_test_data_setup(input_data, split_ratio=0.9) + train_data, test_data = train_test_data_setup(input_data, split_ratio=0.9, stratify=False) pipeline = Pipeline(PipelineNode('dt')) pipeline = correct_preprocessing_params(pipeline, categorical_max_uniques_th=5) diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py index 35936dc766..038b9f44af 100644 --- a/test/unit/preprocessing/test_preprocessors.py +++ b/test/unit/preprocessing/test_preprocessors.py @@ -114,7 +114,7 @@ def data_with_complicated_types(): input_data = InputData(idx=np.arange(18), features=features, target=target, task=task, data_type=DataTypesEnum.table) - return train_test_data_setup(input_data, split_ratio=0.9) + return train_test_data_setup(input_data, split_ratio=0.9, stratify=False) def test_column_types_converting_correctly(): @@ -146,7 +146,7 @@ def test_column_types_process_correctly(): """ data = data_with_mixed_types_in_each_column() - train_data, test_data = train_test_data_setup(data, split_ratio=0.9) + train_data, test_data = train_test_data_setup(data, split_ratio=0.9, stratify=False) # Remove target from test sample test_data.target = None @@ -223,7 +223,7 @@ def test_binary_pseudo_string_column_process_correctly(): def fit_predict_cycle_for_testing(idx: int): input_data = get_mixed_data_with_str_and_float_values(idx=idx) - train_data, test_data = train_test_data_setup(input_data, split_ratio=0.9) + train_data, test_data = train_test_data_setup(input_data, split_ratio=0.9, stratify=False) pipeline = Pipeline(PipelineNode('dt')) pipeline = correct_preprocessing_params(pipeline) diff --git a/test/unit/tasks/test_classification.py b/test/unit/tasks/test_classification.py index 8bc423afb8..7373f758be 100644 --- a/test/unit/tasks/test_classification.py +++ b/test/unit/tasks/test_classification.py @@ -99,7 +99,7 @@ def get_image_classification_data(composite_flag: bool = True): def test_multiclassification_pipeline_fit_correct(): data = get_iris_data() pipeline = pipeline_simple() - train_data, test_data = train_test_data_setup(data, shuffle_flag=True) + train_data, test_data = train_test_data_setup(data, shuffle=True) pipeline.fit(input_data=train_data) results = pipeline.predict(input_data=test_data) @@ -117,7 +117,7 @@ def test_classification_with_pca_pipeline_fit_correct(): pipeline_pca = pipeline_with_pca() pipeline = pipeline_simple() - train_data, test_data = train_test_data_setup(data, shuffle_flag=True) + train_data, test_data = train_test_data_setup(data, shuffle=True) pipeline.fit(input_data=train_data) pipeline_pca.fit(input_data=train_data) @@ -141,7 +141,7 @@ def test_classification_with_pca_pipeline_fit_correct(): def test_output_mode_labels(): data = get_iris_data() pipeline = pipeline_simple() - train_data, test_data = train_test_data_setup(data, shuffle_flag=True) + train_data, test_data = train_test_data_setup(data, shuffle=True) pipeline.fit(input_data=train_data) results = pipeline.predict(input_data=test_data, output_mode='labels') @@ -156,7 +156,7 @@ def test_output_mode_labels(): def test_output_mode_full_probs(): data = get_binary_classification_data() pipeline = pipeline_simple() - train_data, test_data = train_test_data_setup(data, shuffle_flag=True) + train_data, test_data = train_test_data_setup(data, shuffle=True) pipeline.fit(input_data=train_data) results = pipeline.predict(input_data=test_data, output_mode='full_probs') diff --git 
diff --git a/test/unit/validation/test_table_cv.py b/test/unit/validation/test_table_cv.py
index fe67c0ab34..55be85451f 100644
--- a/test/unit/validation/test_table_cv.py
+++ b/test/unit/validation/test_table_cv.py
@@ -1,16 +1,14 @@
 import logging
 from datetime import timedelta
-from functools import partial
 
 import pytest
+
 from golem.core.tuning.simultaneous import SimultaneousTuner
-from sklearn.model_selection import KFold, StratifiedKFold
 
 from fedot.api.main import Fedot
 from fedot.core.data.data import InputData
 from fedot.core.data.data_split import train_test_data_setup
 from fedot.core.optimisers.objective import PipelineObjectiveEvaluate
-from fedot.core.optimisers.objective.data_objective_advisor import DataObjectiveAdvisor
 from fedot.core.optimisers.objective.metrics_objective import MetricsObjective
 from fedot.core.pipelines.node import PipelineNode
 from fedot.core.pipelines.pipeline import Pipeline
@@ -20,7 +18,7 @@
 from fedot.core.repository.quality_metrics_repository import ClassificationMetricsEnum
 from fedot.core.repository.tasks import Task, TaskTypesEnum
 from fedot.core.utils import fedot_project_root
-from fedot.core.validation.split import tabular_cv_generator
+from fedot.core.optimisers.objective.data_source_splitter import DataSourceSplitter
 from test.integration.models.test_model import classification_dataset
 from test.unit.tasks.test_classification import get_iris_data, pipeline_simple
@@ -42,32 +40,18 @@ def get_classification_data():
 
 def test_cv_multiple_metrics_evaluated_correct(classification_dataset):
     pipeline = sample_pipeline()
-    cv_folds = partial(tabular_cv_generator, classification_dataset, folds=5)
+    data_producer = DataSourceSplitter(cv_folds=5).build(classification_dataset)
     metrics = [ClassificationMetricsEnum.ROCAUC_penalty,
                ClassificationMetricsEnum.accuracy,
                ClassificationMetricsEnum.logloss]
-    objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metrics), cv_folds)
+    objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metrics),
+                                               data_producer=data_producer)
     actual_values = objective_eval(pipeline).values
     all_metrics_correct = all(0 < abs(x) <= 1 for x in actual_values)
 
     assert all_metrics_correct
 
 
-def test_kfold_advisor_works_correct_in_balanced_case():
-    data = get_classification_data()
-    advisor = DataObjectiveAdvisor()
-    split_type = advisor.propose_kfold(data)
-    assert split_type == KFold
-
-
-def test_kfold_advisor_works_correct_in_imbalanced_case():
-    data = get_classification_data()
-    data.target[:-int(len(data.target) * 0.1)] = 0
-    advisor = DataObjectiveAdvisor()
-    split_type = advisor.propose_kfold(data)
-    assert split_type == StratifiedKFold
-
-
 def test_cv_min_kfolds_raise():
     task = Task(task_type=TaskTypesEnum.classification)
     models_repo = OperationTypesRepository()
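The cross-validation variant of the same pattern, in sketch form. The body mirrors `test_cv_multiple_metrics_evaluated_correct` above; `classification_dataset` and `pipeline` are assumed to be prepared elsewhere.

from fedot.core.optimisers.objective import PipelineObjectiveEvaluate
from fedot.core.optimisers.objective.data_source_splitter import DataSourceSplitter
from fedot.core.optimisers.objective.metrics_objective import MetricsObjective
from fedot.core.repository.quality_metrics_repository import ClassificationMetricsEnum

# One producer drives all metrics: MetricsObjective accepts a list, and the
# resulting fitness carries one value per metric in .values.
data_producer = DataSourceSplitter(cv_folds=5).build(classification_dataset)
metrics = [ClassificationMetricsEnum.ROCAUC_penalty,
           ClassificationMetricsEnum.accuracy,
           ClassificationMetricsEnum.logloss]
objective_eval = PipelineObjectiveEvaluate(MetricsObjective(metrics),
                                           data_producer=data_producer)
fitness_values = objective_eval(pipeline).values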
diff --git a/test/unit/validation/test_time_series_cv.py b/test/unit/validation/test_time_series_cv.py
index 0fcba28d53..dac782f062 100644
--- a/test/unit/validation/test_time_series_cv.py
+++ b/test/unit/validation/test_time_series_cv.py
@@ -15,7 +15,7 @@
 from fedot.core.repository.quality_metrics_repository import \
     MetricsRepository, RegressionMetricsEnum
 from fedot.core.repository.tasks import TsForecastingParams
-from fedot.core.validation.split import ts_cv_generator
+from fedot.core.data.cv_folds import cv_generator
 from test.unit.tasks.test_forecasting import get_simple_ts_pipeline, get_ts_data
 
 log = default_log(prefix=__name__)
@@ -50,7 +50,8 @@ def test_ts_cv_generator_correct():
     validation_horizon = validation_elements_per_fold * folds
 
     i = 0
-    for train_data, test_data in ts_cv_generator(time_series, folds, validation_blocks, log):
+    for train_data, test_data in cv_generator(time_series, cv_folds=folds,
+                                              validation_blocks=validation_blocks):
         train_len = len(train_data.idx)
         assert train_len == ts_len - validation_horizon
         validation_horizon -= validation_elements_per_fold
@@ -58,23 +59,6 @@ def test_ts_cv_generator_correct():
         i += 1
     assert i == folds
 
 
-def test_cv_folds_too_large_correct():
-    """ Checks whether cases where the number of folds is too large, causing
-    the number of elements to be validated to be greater than the number of elements
-    in the time series itself, are adequately handled
-
-    In this case a hold-out validation with 1 fold and 3 validation blocks must be performed
-    """
-    folds = 50
-    forecast_len, validation_blocks, time_series = configure_experiment()
-
-    i = 0
-    for train_data, test_data in ts_cv_generator(time_series, folds, validation_blocks, log):
-        i += 1
-    assert len(train_data.idx) == 85
-    assert i == 1
-
-
 def test_tuner_cv_correct():
     """ Checks if the tuner works correctly when using cross validation for
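The time-series splitting exercised by this last file can be sketched as below. Assumptions: `time_series` is an InputData with a forecasting task (as produced by `get_ts_data`), and the fold counts are illustrative.

from fedot.core.data.cv_folds import cv_generator

# cv_generator replaces ts_cv_generator: folds and validation blocks are now
# keyword arguments, and the logger is no longer passed explicitly.
for train_data, test_data in cv_generator(time_series, cv_folds=3,
                                          validation_blocks=2):
    # per the test above, each successive fold widens the training window
    # by one validation horizon
    print(len(train_data.idx), len(test_data.idx))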