Support of multimodal data in DataSourceSplitter + bugfix #1119 (#1138)

* Refactor data splitters logic; fix #1119; add support of MultiModalData in cv_folds; delete code that worked around problems now solved by the new DataSourceSplitter or the new cv_folds * Rename module split.py to cv_folds.py * Move cv_folds.py to fedot/core/data * Add tests * Add tests and fix some tests * Fix problem with lagged window failure on data shortage
aimclub · Aug 14, 2023 · d53dfa1 · d53dfa1
1 parent 04bceac
commit d53dfa1
Show file tree

Hide file tree

Showing 24 changed files with 470 additions and 547 deletions.
diff --git a/cases/kc2_sourcecode_defects_classification.py b/cases/kc2_sourcecode_defects_classification.py
@@ -17,7 +17,7 @@ def get_kc2_data():
     encoded = (target == 'yes').astype(int)
     data.target = encoded
 
-    train, test = train_test_data_setup(data, shuffle_flag=True)
+    train, test = train_test_data_setup(data, shuffle=True)
 
     return train, test
 

diff --git a/docs/source/basics/multi_modal_tasks.rst b/docs/source/basics/multi_modal_tasks.rst
@@ -15,7 +15,7 @@ FEDOT's API supports multimodal data from the box. The only thing you need is to
 
     data = MultiModalData.from_csv(file_path='multimodal_dataset.csv', task='classification', target_columns='target_column',
                                    text_columns=['text_col1', 'text_col2'], columns_to_drop=['col_to_drop1', 'col_to_drop2'], index_col=None)
-    fit_data, predict_data = train_test_data_setup(data, shuffle_flag=True, split_ratio=0.7)
+    fit_data, predict_data = train_test_data_setup(data, shuffle=True, split_ratio=0.7)
 
 Using ``from_csv()`` method, you should define the task type, and target columns. FEDOT can find text columns automatically, but you can set them manually. You can also select columns which will be dropped from the original dataset. By default, FEDOT reads the first column of every dataset as an index column. If there is no index columns in the dataset, you should set ``index_col=None``.
 Initialize the FEDOT object and define the type of modeling problem.

diff --git a/docs/source/basics/ts_forecasting.rst b/docs/source/basics/ts_forecasting.rst
@@ -188,7 +188,7 @@ Train test split
 ~~~~~~~~~~~~~~~~
 
 To split InputData use ``train_test_data_setup`` method.
-``split_ratio`` and ``shuffle_flag`` are ignored for time-series forecasting.
+``split_ratio``, ``shuffle``, and ``stratify`` are ignored for time-series forecasting.
 
 .. autofunction:: fedot.core.data.data_split.train_test_data_setup
 

diff --git a/examples/advanced/automl/h2o_example.py b/examples/advanced/automl/h2o_example.py
@@ -48,10 +48,9 @@ def export_h2o(pipeline, pipeline_path, test_data):
 
 
 def h2o_classification_pipeline_evaluation():
-    pipeline_path = "h2o_class"
     data = get_iris_data()
     pipeline = pipeline_h2o_class()
-    train_data, test_data = train_test_data_setup(data, shuffle_flag=True)
+    train_data, test_data = train_test_data_setup(data, shuffle=True)
 
     pipeline.fit(input_data=train_data)
     results = pipeline.predict(input_data=test_data, output_mode="full_probs")
@@ -62,6 +61,7 @@ def h2o_classification_pipeline_evaluation():
                               multi_class='ovo',
                               average='macro')
     #  H2o has troubles with serialization for now
+    #  pipeline_path = "h2o_class"
     #  export_h2o(pipeline, pipeline_path, test_data)
     print(f"roc auc: {roc_auc_on_test}")
 
@@ -73,7 +73,7 @@ def h2o_regression_pipeline_evaluation():
     train_data, test_data = train_test_data_setup(data)
 
     pipeline.fit(input_data=train_data)
-    results = pipeline.predict(input_data=test_data)
+    _ = pipeline.predict(input_data=test_data)
     _, rmse_on_test = get_rmse_value(pipeline, train_data, test_data)
     print(f"RMSE {rmse_on_test}")
 

diff --git a/examples/advanced/automl/tpot_example.py b/examples/advanced/automl/tpot_example.py
@@ -41,7 +41,7 @@ def tpot_classification_pipeline_evaluation():
     pipeline_path = "tpot_class"
     data = get_iris_data()
     pipeline = pipeline_tpot_class()
-    train_data, test_data = train_test_data_setup(data, shuffle_flag=True)
+    train_data, test_data = train_test_data_setup(data, shuffle=True)
 
     pipeline.fit(input_data=train_data)
     results = pipeline.predict(input_data=test_data, output_mode="full_probs")

diff --git a/examples/advanced/multi_modal_pipeline.py b/examples/advanced/multi_modal_pipeline.py
@@ -74,7 +74,7 @@ def run_multi_modal_pipeline(files_path: str, visualization=False) -> float:
 
     data = prepare_multi_modal_data(files_path, task, images_size)
 
-    fit_data, predict_data = train_test_data_setup(data, shuffle_flag=True, split_ratio=0.6)
+    fit_data, predict_data = train_test_data_setup(data, shuffle=True, split_ratio=0.6)
 
     automl_model = Fedot(problem='classification', timeout=15)
     pipeline = automl_model.fit(features=fit_data,

diff --git a/examples/advanced/multimodal_text_num_example.py b/examples/advanced/multimodal_text_num_example.py
@@ -26,7 +26,7 @@ def run_multi_modal_example(file_path: str, visualization: bool = False, with_tu
     task = 'classification'
     path = fedot_project_root().joinpath(file_path)
     data = MultiModalData.from_csv(file_path=path, task=task, target_columns='variety', index_col=None)
-    fit_data, predict_data = train_test_data_setup(data, shuffle_flag=True, split_ratio=0.7)
+    fit_data, predict_data = train_test_data_setup(data, shuffle=True, split_ratio=0.7)
 
     automl_model = Fedot(problem=task, timeout=timeout, with_tuning=with_tuning, n_jobs=1)
     automl_model.fit(features=fit_data,

diff --git a/fedot/api/api_utils/params.py b/fedot/api/api_utils/params.py
@@ -56,9 +56,8 @@ def accept_and_apply_recommendations(self, input_data: Union[InputData, MultiMod
             input_data: data for preprocessing
             recommendations: dict with recommendations
         """
-        # TODO fix multimodality
+
         if isinstance(input_data, MultiModalData):
-            self['cv_folds'] = None  # there are no support for multimodal data now
             for data_source_name, values in input_data.items():
                 self.accept_and_apply_recommendations(input_data[data_source_name],
                                                       recommendations[data_source_name])

diff --git a/fedot/core/data/cv_folds.py b/fedot/core/data/cv_folds.py
@@ -0,0 +1,83 @@
+from typing import Iterator, Optional, Tuple, Union
+
+import numpy as np
+
+from fedot.core.data.multi_modal import MultiModalData
+from fedot.core.repository.tasks import TaskTypesEnum
+from sklearn.model_selection import KFold, TimeSeriesSplit
+from sklearn.model_selection._split import StratifiedKFold
+
+from fedot.core.data.data import InputData
+from fedot.core.data.data_split import _split_input_data_by_indexes
+
+
+class TsInputDataSplit(TimeSeriesSplit):
+    """ Perform time series splitting for cross validation on InputData structures.
+    The difference between TimeSeriesSplit (sklearn) and TsInputDataSplit can be
+    demonstrated by an example:
+    The time series [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] must be split into 3
+    parts, where the size of each fold for validation will be 2 elements.
+    TimeSeriesSplit (return indices)
+        train - [0, 1, 2, 3] test - [4, 5]
+        train - [0, 1, 2, 3, 4, 5] test - [6, 7]
+        train - [0, 1, 2, 3, 4, 5, 6, 7] test - [8, 9]
+    TsInputDataSplit (return indices; shown as the matching time series values)
+        train - [1, 2, 3, 4] test - [1, 2, 3, 4, 5, 6]
+        train - [1, 2, 3, 4, 5, 6] test - [1, 2, 3, 4, 5, 6, 7, 8]
+        train - [1, 2, 3, 4, 5, 6, 7, 8] test - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+    """
+
+    def __init__(self, n_splits: int, test_size: int):
+        super().__init__(gap=0, n_splits=n_splits, test_size=test_size)
+
+    def split(self, data: np.ndarray, *args) -> Iterator[Tuple[InputData, InputData]]:
+        """ Define indexes for train and validation using
+        "in-sample forecasting" algorithm
+
+        :param data: array to be split into train and validation indexes
+        """
+
+        for train_ids, test_ids in super().split(data):
+            new_test_ids = np.hstack((train_ids, test_ids))
+            yield train_ids, new_test_ids
+
+
+def cv_generator(data: Union[InputData, MultiModalData],
+                 cv_folds: int,
+                 shuffle: bool = False,
+                 random_seed: int = 42,
+                 stratify: bool = True,
+                 validation_blocks: Optional[int] = None) -> Iterator[Tuple[Union[InputData, MultiModalData],
+                                                                            Union[InputData, MultiModalData]]]:
+    """ Split data into train and test samples for cross validation.
+        The function returns a generator of tuples, each consisting
+        of a (train, test) pair.
+
+    :param data: data for train and test splitting
+    :param shuffle: whether to shuffle the data (used only for non-stratified, non-time-series splits)
+    :param cv_folds: number of folds
+    :param random_seed: random seed for shuffle
+    :param stratify: `True` to make stratified samples for classification task
+    :param validation_blocks: number of validation blocks for time series data (fold test size is forecast_length * validation_blocks)
+
+    :return Iterator[Tuple[Union[InputData, MultiModalData],
+                           Union[InputData, MultiModalData]]]: return split train/test data
+    """
+
+    # Define base class for generate cv folds
+    retain_first_target = False
+    if data.task.task_type is TaskTypesEnum.ts_forecasting:
+        horizon = data.task.task_params.forecast_length * validation_blocks
+        kf = TsInputDataSplit(n_splits=cv_folds, test_size=horizon)
+        # for multi_ts use first target column as main target
+        retain_first_target = True
+    elif data.task.task_type is TaskTypesEnum.classification and stratify:
+        kf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_seed)
+    else:
+        kf = KFold(n_splits=cv_folds, shuffle=shuffle, random_state=random_seed)
+
+    # Split
+    for train_ids, test_ids in kf.split(data.target, data.target):
+        train_data = _split_input_data_by_indexes(data, train_ids)
+        test_data = _split_input_data_by_indexes(data, test_ids, retain_first_target=retain_first_target)
+        yield train_data, test_data