Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support of multimodal data in DataSourceSplitter + bugfix #1119 #1138

Merged
merged 15 commits into from
Aug 14, 2023
2 changes: 1 addition & 1 deletion cases/kc2_sourcecode_defects_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def get_kc2_data():
encoded = (target == 'yes').astype(int)
data.target = encoded

train, test = train_test_data_setup(data, shuffle_flag=True)
train, test = train_test_data_setup(data, shuffle=True)

return train, test

Expand Down
2 changes: 1 addition & 1 deletion docs/source/basics/multi_modal_tasks.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ FEDOT's API supports multimodal data from the box. The only thing you need is to

data = MultiModalData.from_csv(file_path='multimodal_dataset.csv', task='classification', target_columns='target_column',
text_columns=['text_col1', 'text_col2'], columns_to_drop=['col_to_drop1', 'col_to_drop2'], index_col=None)
fit_data, predict_data = train_test_data_setup(data, shuffle_flag=True, split_ratio=0.7)
fit_data, predict_data = train_test_data_setup(data, shuffle=True, split_ratio=0.7)

Using ``from_csv()`` method, you should define the task type, and target columns. FEDOT can find text columns automatically, but you can set them manually. You can also select columns which will be dropped from the original dataset. By default, FEDOT reads the first column of every dataset as an index column. If there is no index columns in the dataset, you should set ``index_col=None``.
Initialize the FEDOT object and define the type of modeling problem.
Expand Down
2 changes: 1 addition & 1 deletion docs/source/basics/ts_forecasting.rst
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ Train test split
~~~~~~~~~~~~~~~~

To split InputData use ``train_test_data_setup`` method.
``split_ratio`` and ``shuffle_flag`` are ignored for time-series forecasting.
``split_ratio``, ``shuffle``, and ``stratify`` are ignored for time-series forecasting.

.. autofunction:: fedot.core.data.data_split.train_test_data_setup

Expand Down
6 changes: 3 additions & 3 deletions examples/advanced/automl/h2o_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,9 @@ def export_h2o(pipeline, pipeline_path, test_data):


def h2o_classification_pipeline_evaluation():
pipeline_path = "h2o_class"
data = get_iris_data()
pipeline = pipeline_h2o_class()
train_data, test_data = train_test_data_setup(data, shuffle_flag=True)
train_data, test_data = train_test_data_setup(data, shuffle=True)

pipeline.fit(input_data=train_data)
results = pipeline.predict(input_data=test_data, output_mode="full_probs")
Expand All @@ -62,6 +61,7 @@ def h2o_classification_pipeline_evaluation():
multi_class='ovo',
average='macro')
# H2o has troubles with serialization for now
# pipeline_path = "h2o_class"
# export_h2o(pipeline, pipeline_path, test_data)
print(f"roc auc: {roc_auc_on_test}")

Expand All @@ -73,7 +73,7 @@ def h2o_regression_pipeline_evaluation():
train_data, test_data = train_test_data_setup(data)

pipeline.fit(input_data=train_data)
results = pipeline.predict(input_data=test_data)
_ = pipeline.predict(input_data=test_data)
_, rmse_on_test = get_rmse_value(pipeline, train_data, test_data)
print(f"RMSE {rmse_on_test}")

Expand Down
2 changes: 1 addition & 1 deletion examples/advanced/automl/tpot_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def tpot_classification_pipeline_evaluation():
pipeline_path = "tpot_class"
data = get_iris_data()
pipeline = pipeline_tpot_class()
train_data, test_data = train_test_data_setup(data, shuffle_flag=True)
train_data, test_data = train_test_data_setup(data, shuffle=True)

pipeline.fit(input_data=train_data)
results = pipeline.predict(input_data=test_data, output_mode="full_probs")
Expand Down
2 changes: 1 addition & 1 deletion examples/advanced/multi_modal_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def run_multi_modal_pipeline(files_path: str, visualization=False) -> float:

data = prepare_multi_modal_data(files_path, task, images_size)

fit_data, predict_data = train_test_data_setup(data, shuffle_flag=True, split_ratio=0.6)
fit_data, predict_data = train_test_data_setup(data, shuffle=True, split_ratio=0.6)

automl_model = Fedot(problem='classification', timeout=15)
pipeline = automl_model.fit(features=fit_data,
Expand Down
2 changes: 1 addition & 1 deletion examples/advanced/multimodal_text_num_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def run_multi_modal_example(file_path: str, visualization: bool = False, with_tu
task = 'classification'
path = fedot_project_root().joinpath(file_path)
data = MultiModalData.from_csv(file_path=path, task=task, target_columns='variety', index_col=None)
fit_data, predict_data = train_test_data_setup(data, shuffle_flag=True, split_ratio=0.7)
fit_data, predict_data = train_test_data_setup(data, shuffle=True, split_ratio=0.7)

automl_model = Fedot(problem=task, timeout=timeout, with_tuning=with_tuning, n_jobs=1)
automl_model.fit(features=fit_data,
Expand Down
3 changes: 1 addition & 2 deletions fedot/api/api_utils/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,8 @@ def accept_and_apply_recommendations(self, input_data: Union[InputData, MultiMod
input_data: data for preprocessing
recommendations: dict with recommendations
"""
# TODO fix multimodality

if isinstance(input_data, MultiModalData):
self['cv_folds'] = None # there are no support for multimodal data now
for data_source_name, values in input_data.items():
self.accept_and_apply_recommendations(input_data[data_source_name],
recommendations[data_source_name])
Expand Down
83 changes: 83 additions & 0 deletions fedot/core/data/cv_folds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from typing import Iterator, Optional, Tuple, Union

import numpy as np

from fedot.core.data.multi_modal import MultiModalData
from fedot.core.repository.tasks import TaskTypesEnum
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.model_selection._split import StratifiedKFold

from fedot.core.data.data import InputData
from fedot.core.data.data_split import _split_input_data_by_indexes


class TsInputDataSplit(TimeSeriesSplit):
    """ Perform time series splitting for cross validation on InputData structures.

    Unlike ``TimeSeriesSplit`` (sklearn), the yielded validation index arrays
    also include the training prefix, which supports "in-sample forecasting".
    For a time series of 10 elements split into 3 parts with a validation
    fold size of 2 elements:

    TimeSeriesSplit (indices):
        train - [0, 1, 2, 3] test - [4, 5]
        train - [0, 1, 2, 3, 4, 5] test - [6, 7]
        train - [0, 1, 2, 3, 4, 5, 6, 7] test - [8, 9]
    TsInputDataSplit (indices):
        train - [0, 1, 2, 3] test - [0, 1, 2, 3, 4, 5]
        train - [0, 1, 2, 3, 4, 5] test - [0, 1, 2, 3, 4, 5, 6, 7]
        train - [0, 1, 2, 3, 4, 5, 6, 7] test - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    """

    def __init__(self, n_splits: int, test_size: int):
        # gap=0: each validation fold starts right after its training prefix
        super().__init__(gap=0, n_splits=n_splits, test_size=test_size)

    def split(self, data: np.ndarray, *args) -> Iterator[Tuple[np.ndarray, np.ndarray]]:
        """ Yield ``(train_ids, test_ids)`` index arrays implementing the
        "in-sample forecasting" scheme: the validation indices are the
        training indices followed by the forecast-horizon indices.

        :param data: array-like to split (only its length matters for splitting)
        """

        for train_ids, test_ids in super().split(data):
            # validation span covers both the history and the forecast horizon
            new_test_ids = np.hstack((train_ids, test_ids))
            yield train_ids, new_test_ids


def cv_generator(data: Union[InputData, MultiModalData],
                 cv_folds: int,
                 shuffle: bool = False,
                 random_seed: int = 42,
                 stratify: bool = True,
                 validation_blocks: Optional[int] = None) -> Iterator[Tuple[Union[InputData, MultiModalData],
                                                                            Union[InputData, MultiModalData]]]:
    """ The function for splitting data into train and test samples
    for cross validation. Returns a generator of (train, test) pairs.

    :param data: data for train and test splitting
    :param cv_folds: number of folds
    :param shuffle: whether the data needs shuffling (ignored for time series;
        the stratified classification branch always shuffles)
    :param random_seed: random seed for shuffling
    :param stratify: `True` to make stratified samples for classification task
    :param validation_blocks: number of validation blocks for time series data;
        required when the task is time series forecasting

    :return Iterator[Tuple[Union[InputData, MultiModalData],
                           Union[InputData, MultiModalData]]]: split train/test data
    """

    # Define base class for generate cv folds
    retain_first_target = False
    if data.task.task_type is TaskTypesEnum.ts_forecasting:
        if validation_blocks is None:
            # fail fast with a clear message instead of a TypeError below
            raise ValueError('validation_blocks is required for time series forecasting')
        horizon = data.task.task_params.forecast_length * validation_blocks
        kf = TsInputDataSplit(n_splits=cv_folds, test_size=horizon)
        # for multi_ts use first target column as main target
        retain_first_target = True
    elif data.task.task_type is TaskTypesEnum.classification and stratify:
        # NOTE: stratified splitting always shuffles, regardless of `shuffle`
        kf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_seed)
    else:
        # sklearn raises ValueError if random_state is set while shuffle=False,
        # so only pass the seed when shuffling is actually requested
        kf = KFold(n_splits=cv_folds, shuffle=shuffle,
                   random_state=random_seed if shuffle else None)

    # Split by indexes so both InputData and MultiModalData are supported;
    # target is passed as X too since only its length (and labels, for
    # stratification) are used by the splitters
    for train_ids, test_ids in kf.split(data.target, data.target):
        train_data = _split_input_data_by_indexes(data, train_ids)
        test_data = _split_input_data_by_indexes(data, test_ids, retain_first_target=retain_first_target)
        yield train_data, test_data
Loading
Loading