Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fast topological features #1252

Merged
merged 20 commits into from
Jan 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ FEDOT supports bunch of dimensionality preprocessing operations that can be be a
`one_hot_encoding`,One-Hot Encoder, Feature encoding
`label_encoding`,Label Encoder, Feature encoding
`resample`,Imbalanced binary class transformation in classification, Data transformation
`topological_features`,Calculation of topological features, only for time series,Data transformation
`topological_features`,Calculation of topological features,Time series transformation
`fast_topological_features`,Fast calculation of part of topological features,Time series transformation


.. csv-table:: Feature transformation operations implementations
Expand Down Expand Up @@ -105,7 +106,8 @@ FEDOT supports bunch of dimensionality preprocessing operations that can be be a
`one_hot_encoding`,`sklearn.preprocessing.OneHotEncoder`,
`label_encoding`,`sklearn.preprocessing.LabelEncoder`,`fast_train` `*tree`
`resample`,`FEDOT model using sklearn.utils.resample`,
`topological_features`,FEDOT model,`ts`
`topological_features`,FEDOT model,`ts`
`fast_topological_features`,FEDOT model,`ts`


Models used
Expand Down
3 changes: 2 additions & 1 deletion fedot/api/api_utils/assumptions/task_assumptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ class TSForecastingAssumptions(TaskAssumptions):
def builders(self):
return {
'lagged_ridge':
PipelineBuilder().add_sequence('lagged', 'ridge'),
PipelineBuilder()
.add_sequence('lagged', 'ridge'),
'topological':
PipelineBuilder()
.add_node('lagged')
Expand Down
2 changes: 2 additions & 0 deletions fedot/api/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,8 @@ def setup_pipeline_structure(
- ``diff_filter`` -> Derivative Filter Transformation
- ``cut`` -> Cut Transformation
- ``exog_ts`` -> Exogenous Transformation
- ``topological_features`` -> Topological features
- ``fast_topological_features`` -> Fast implementation of topological features

max_depth: max depth of a pipeline. Defaults to ``6``.

Expand Down
6 changes: 5 additions & 1 deletion fedot/core/operations/evaluation/common_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
from fedot.core.operations.evaluation.operation_implementations.data_operations.sklearn_transformations import \
ImputationImplementation, KernelPCAImplementation, NormalizationImplementation, PCAImplementation, \
PolyFeaturesImplementation, ScalingImplementation, FastICAImplementation
from fedot.core.operations.evaluation.operation_implementations.\
data_operations.topological.fast_topological_extractor import \
FastTopologicalFeaturesImplementation
from fedot.core.operations.evaluation.operation_implementations.data_operations.topological. \
topological_extractor import TopologicalFeaturesImplementation
from fedot.core.operations.operation_parameters import OperationParameters
Expand Down Expand Up @@ -47,7 +50,8 @@ class FedotPreprocessingStrategy(EvaluationStrategy):
'one_hot_encoding': OneHotEncodingImplementation,
'label_encoding': LabelEncodingImplementation,
'fast_ica': FastICAImplementation,
'topological_features': TopologicalFeaturesImplementation
'topological_features': TopologicalFeaturesImplementation,
'fast_topological_features': FastTopologicalFeaturesImplementation,
}

def __init__(self, operation_type: str, params: Optional[OperationParameters] = None):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from itertools import chain
from typing import Optional

import numpy as np
from gph import ripser_parallel as ripser
from joblib import Parallel, delayed

from fedot.core.data.data import InputData, OutputData
from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import \
DataOperationImplementation
from fedot.core.operations.operation_parameters import OperationParameters


class FastTopologicalFeaturesImplementation(DataOperationImplementation):
    """Data operation that turns each row of a 2-D feature table into a short
    vector of persistent-homology statistics.

    Every (strided) row is embedded with a sliding window, persistence
    diagrams are computed with ``gph.ripser_parallel`` and, for each homology
    dimension, the diagram is summarised by a fixed set of quantiles of the
    bar lengths (death - birth). To save time, adjacent rows are processed in
    pairs: the two rows are averaged, features are extracted once and the
    result is assigned to both rows.

    Hyperparameters read from ``params``:
        window_size_as_share: share of the row length used as embedding window
        max_homology_dimension: highest homology dimension passed to ripser
        metric: distance metric passed to ripser
        stride: step used to subsample each row before embedding
        n_jobs: number of joblib workers used in ``transform``
    """

    # Fallbacks used when a hyperparameter is absent (or ``params`` is None);
    # they mirror the ``fast_topological_features`` entry of
    # default_operation_params.json.
    _DEFAULT_PARAMS = {
        'window_size_as_share': 0.66,
        'max_homology_dimension': 1,
        'metric': 'euclidean',
        'stride': 1,
        'n_jobs': 1,
    }

    def __init__(self, params: Optional[OperationParameters] = None):
        super().__init__(params)

        def _read(name):
            # ``params`` is declared Optional, so it must not be dereferenced blindly
            value = params.get(name) if params is not None else None
            return self._DEFAULT_PARAMS[name] if value is None else value

        self.window_size_as_share = _read('window_size_as_share')
        self.max_homology_dimension = _read('max_homology_dimension')
        self.metric = _read('metric')
        self.stride = _read('stride')
        self.n_jobs = _read('n_jobs')
        self.quantiles = (0.1, 0.25, 0.5, 0.75, 0.9)
        # number of features produced per homology dimension
        self._shape = len(self.quantiles)
        self._window_size = None

    def fit(self, input_data: InputData):
        """Derive the embedding window size from the width of ``input_data.features``.

        The window is ``window_size_as_share`` of the row length, kept within
        ``[2, row_length - 2]``. For rows shorter than 4 points that interval is
        empty, so the value is additionally floored at 1 to never produce an
        empty or negative window.

        :param input_data: data whose ``features`` is a 2-D array
        :return: self
        """
        row_length = input_data.features.shape[1]
        window_size = int(row_length * self.window_size_as_share)
        window_size = min(max(window_size, 2), row_length - 2)
        self._window_size = max(window_size, 1)
        return self

    def transform(self, input_data: InputData) -> OutputData:
        """Compute topological features for every row of ``input_data.features``.

        :param input_data: data whose ``features`` is a 2-D array
        :return: array with one row per input row and
            ``len(quantiles) * (max_homology_dimension + 1)`` columns;
            non-finite values are replaced with 0
        """
        # NOTE(review): the annotation says OutputData, but a plain numpy array is
        # returned here - presumably the caller wraps it; confirm against the strategy.
        features = input_data.features
        n_rows = features.shape[0]
        with Parallel(n_jobs=self.n_jobs, prefer='processes') as parallel:
            # one extraction per pair of adjacent rows (the last pair may hold a single row)
            pair_features = parallel(
                delayed(self._extract_features)(np.mean(features[i:i + 2, ::self.stride], axis=0))
                for i in range(0, n_rows, 2))
        # give both rows of a pair the same features; the slice drops the
        # surplus row that appears when the number of rows is odd
        result = np.repeat(np.array(pair_features), 2, axis=0)[:n_rows]
        np.nan_to_num(result, copy=False, nan=0, posinf=0, neginf=0)
        return result

    def _extract_features(self, x):
        """Return quantiles of persistence-bar lengths for one 1-D series ``x``.

        The output is laid out as consecutive groups of ``len(self.quantiles)``
        values, one group per diagram returned by ripser; a group is left as
        zeros when its diagram is empty.
        """
        # ``x`` may already be subsampled by ``stride`` while the window was derived
        # from the full row length, so make sure at least one window fits
        window_size = min(self._window_size, x.shape[0])
        x_sliced = np.array([x[i:window_size + i] for i in range(x.shape[0] - window_size + 1)])
        x_processed = ripser(x_sliced,
                             maxdim=self.max_homology_dimension,
                             coeff=2,
                             metric=self.metric,
                             n_threads=1,
                             collapse_edges=False)["dgms"]
        result = np.zeros(self._shape * (self.max_homology_dimension + 1))
        for i, xp in enumerate(x_processed):
            if xp.shape[0] > 0:
                # bar length = death - birth; the difference is a temporary array,
                # so letting quantile overwrite it is safe
                result[i * self._shape:(i + 1) * self._shape] = np.quantile(xp[:, 1] - xp[:, 0], self.quantiles,
                                                                            overwrite_input=True, method='hazen')
        return result
16 changes: 16 additions & 0 deletions fedot/core/pipelines/tuning/search_space.py
Original file line number Diff line number Diff line change
Expand Up @@ -768,6 +768,22 @@ def get_parameters_dict(self):
'sampling-scope': [0.9, 0.99],
'type': 'continuous'}
},
'fast_topological_features': {
'window_size_as_share': {
'hyperopt-dist': hp.uniform,
'sampling-scope': [0.1, 0.9],
'type': 'continuous'
},
'max_homology_dimension': {
'hyperopt-dist': hp.uniformint,
'sampling-scope': [1, 5],
'type': 'discrete'
},
'metric': {
'hyperopt-dist': hp.choice,
'sampling-scope': [['euclidean', 'manhattan', 'cosine']],
'type': 'categorical'}
}
}

if self.custom_search_space is not None:
Expand Down
14 changes: 14 additions & 0 deletions fedot/core/repository/data/data_operation_repository.json
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,20 @@
"presets": [
"ts"
],
"input_type": "[DataTypesEnum.table]",
"output_type": "[DataTypesEnum.table]",
"tags": [
"non_applicable_for_ts",
"feature_space_transformation"
]
},
"fast_topological_features": {
Lopa10ko marked this conversation as resolved.
Show resolved Hide resolved
"meta": "custom_ts_preprocessing",
"presets": [
"ts"
],
"input_type": "[DataTypesEnum.table]",
"output_type": "[DataTypesEnum.table]",
"tags": [
"non_applicable_for_ts",
"feature_space_transformation"
Expand Down
7 changes: 7 additions & 0 deletions fedot/core/repository/data/default_operation_params.json
Original file line number Diff line number Diff line change
Expand Up @@ -160,5 +160,12 @@
},
"topological_features": {
"n_jobs": -1
},
"fast_topological_features": {
"n_jobs": 1,
"window_size_as_share": 0.66,
"max_homology_dimension": 1,
"metric": "euclidean",
"stride": 1
}
}
14 changes: 9 additions & 5 deletions test/integration/models/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,16 @@ def get_data_for_testing(task_type, data_type, length=100, features_count=1,
return None

if task_type is TaskTypesEnum.ts_forecasting:
task = Task(task_type, TsForecastingParams(max(length // 10, 2)))
forecast_length = max(length // 10, 2)
task = Task(task_type, TsForecastingParams(forecast_length))
if data_type is DataTypesEnum.ts:
features = np.zeros(length) + value
else:
features = np.zeros((length, features_count)) + value
if data_type is DataTypesEnum.table:
target = np.zeros(length) + value
target = np.zeros((length, forecast_length)) + value
else:
target = features

else:
task = Task(task_type)
data_type = DataTypesEnum.table
Expand Down Expand Up @@ -156,11 +156,15 @@ def fit_time_for_operation(operation: OperationMetaInfo,
return perf_counter() - start_time

for task_type in operation.task_type:
for data_type in operation.input_types:
input_types = operation.input_types
if task_type is TaskTypesEnum.ts_forecasting:
if operation.input_types == [DataTypesEnum.table]:
input_types = [DataTypesEnum.ts]
for data_type in input_types:
perfomance_values = []
for length in data_lengths:
data = get_data_for_testing(task_type, data_type,
length=length, features_count=2,
length=length, features_count=10,
random=True)
if data is not None:
min_evaluated_time = min(fit_time_for_operation(operation, data) for _ in range(times))
Expand Down
Loading