Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix pca #1267

Merged
merged 4 commits into from
Mar 18, 2024
Merged

fix pca #1267

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions fedot/core/constants.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,44 @@
"""Default constants and preset names shared across the FEDOT framework."""

from fedot.core.repository.tasks import TaskTypesEnum

# Tuning will not be attempted with less time budget than this.
MINIMAL_SECONDS_FOR_TUNING = 15
"""Minimal seconds for tuning."""

DEFAULT_TUNING_ITERATIONS_NUMBER = 100000
"""Default number of tuning iterations."""

DEFAULT_API_TIMEOUT_MINUTES = 5.0
"""Default API timeout in minutes."""

DEFAULT_FORECAST_LENGTH = 30
"""Default forecast length."""

# Fraction of the time budget spent on composing; the remainder goes to tuning
# (presumably — TODO confirm against the API pipeline builder).
COMPOSING_TUNING_PROPORTION = 0.6
"""Proportion of data used for composing tuning."""

BEST_QUALITY_PRESET_NAME = 'best_quality'
"""Name of the preset for best quality."""

FAST_TRAIN_PRESET_NAME = 'fast_train'
"""Name of the preset for fast training."""

AUTO_PRESET_NAME = 'auto'
"""Name of the preset for auto tuning."""

MINIMAL_PIPELINE_NUMBER_FOR_EVALUATION = 100
"""Minimal number of pipelines for evaluation."""

MIN_NUMBER_OF_GENERATIONS = 3
"""Minimum number of generations."""

FRACTION_OF_UNIQUE_VALUES = 0.95
"""Fraction of unique values."""

# Fraction of samples reserved for training per task type; time series use a
# smaller ratio so that the validation horizon stays long enough to forecast.
default_data_split_ratio_by_task = {
    TaskTypesEnum.classification: 0.8,
    TaskTypesEnum.regression: 0.8,
    TaskTypesEnum.ts_forecasting: 0.5
}
"""Default data split ratio by task."""

# Minimum number of PCA components to retain for time-series data: the PCA
# implementation raises n_components so that
# n_components * n_features >= PCA_MIN_THRESHOLD_TS (see
# ComponentAnalysisImplementation.check_and_correct_params).
PCA_MIN_THRESHOLD_TS = 7
"""Minimum threshold for PCA in TS forecasting."""
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,23 @@
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler

from fedot.core.constants import PCA_MIN_THRESHOLD_TS
from fedot.core.data.data import InputData, OutputData, data_type_is_table
from fedot.core.data.data_preprocessing import convert_into_column, data_has_categorical_features, \
divide_data_categorical_numerical, find_categorical_columns, replace_inf_with_nans
from fedot.core.operations.evaluation.operation_implementations. \
implementation_interfaces import DataOperationImplementation, EncodedInvariantImplementation
from fedot.core.operations.operation_parameters import OperationParameters
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.preprocessing.data_types import TYPE_TO_ID


class ComponentAnalysisImplementation(DataOperationImplementation):
""" Class for applying PCA and kernel PCA models from sklearn
"""
Class for applying PCA and kernel PCA models from sklearn

Args:
params: OpearationParameters with the arguments
params: OperationParameters with the arguments
"""

def __init__(self, params: Optional[OperationParameters]):
Expand All @@ -29,8 +32,9 @@ def __init__(self, params: Optional[OperationParameters]):
self.number_of_features = None
self.number_of_samples = None

def fit(self, input_data: InputData):
"""The method trains the PCA model
def fit(self, input_data: InputData) -> PCA:
"""
The method trains the PCA model

Args:
input_data: data with features, target and ids for PCA training
Expand All @@ -42,13 +46,14 @@ def fit(self, input_data: InputData):
self.number_of_samples, self.number_of_features = np.array(input_data.features).shape

if self.number_of_features > 1:
self.check_and_correct_params()
self.check_and_correct_params(is_ts_data=input_data.data_type is DataTypesEnum.ts)
self.pca.fit(input_data.features)

return self.pca

def transform(self, input_data: InputData) -> OutputData:
"""Method for transformation tabular data using PCA
"""
Method for transformation tabular data using PCA

Args:
input_data: data with features, target and ids for PCA applying
Expand All @@ -63,13 +68,13 @@ def transform(self, input_data: InputData) -> OutputData:
transformed_features = input_data.features

# Update features
output_data = self._convert_to_output(input_data,
transformed_features)
output_data = self._convert_to_output(input_data, transformed_features)
self.update_column_types(output_data)
return output_data

def check_and_correct_params(self):
"""Method check if number of features in data enough for ``n_components``
def check_and_correct_params(self, is_ts_data: bool = False):
"""
Method check if number of features in data enough for ``n_components``
parameter in PCA or not. And if not enough - fixes it
"""
n_components = self.params.get('n_components')
Expand All @@ -80,12 +85,15 @@ def check_and_correct_params(self):
# Check that n_samples correctly map with n_features
if self.number_of_samples < self.number_of_features:
self.params.update(n_components=0.5)
if is_ts_data and (n_components * self.number_of_features) < PCA_MIN_THRESHOLD_TS:
self.params.update(n_components=PCA_MIN_THRESHOLD_TS / self.number_of_features)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Не будет проблем с нецелым числом n_components?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If 0 < n_components < 1 and svd_solver == 'full', select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components.

У нас 0 < n_components < 1


self.pca.set_params(**self.params.to_dict())

@staticmethod
def update_column_types(output_data: OutputData) -> OutputData:
"""Update column types after applying PCA operations
"""
Update column types after applying PCA operations
"""

_, n_cols = output_data.predict.shape
Expand All @@ -94,7 +102,8 @@ def update_column_types(output_data: OutputData) -> OutputData:


class PCAImplementation(ComponentAnalysisImplementation):
"""Class for applying PCA from sklearn
"""
Class for applying PCA from sklearn

Args:
params: OperationParameters with the hyperparameters
Expand All @@ -111,7 +120,8 @@ def __init__(self, params: Optional[OperationParameters] = None):


class KernelPCAImplementation(ComponentAnalysisImplementation):
""" Class for applying kernel PCA from sklearn
"""
Class for applying kernel PCA from sklearn

Args:
params: OperationParameters with the hyperparameters
Expand All @@ -123,7 +133,8 @@ def __init__(self, params: Optional[OperationParameters]):


class FastICAImplementation(ComponentAnalysisImplementation):
""" Class for applying FastICA from sklearn
"""
Class for applying FastICA from sklearn

Args:
params: OperationParameters with the hyperparameters
Expand All @@ -135,7 +146,8 @@ def __init__(self, params: Optional[OperationParameters]):


class PolyFeaturesImplementation(EncodedInvariantImplementation):
""" Class for application of :obj:`PolynomialFeatures` operation on data,
"""
Class for application of :obj:`PolynomialFeatures` operation on data,
where only not encoded features (were not converted from categorical using
``OneHot encoding``) are used

Expand All @@ -158,7 +170,9 @@ def __init__(self, params: Optional[OperationParameters]):
self.columns_to_take = None

def fit(self, input_data: InputData):
""" Method for fit Poly features operation """
"""
Method for fit Poly features operation
"""
# Check the number of columns in source dataset
n_rows, n_cols = input_data.features.shape
if n_cols > self.th_columns:
Expand All @@ -170,7 +184,8 @@ def fit(self, input_data: InputData):
return super().fit(input_data)

def transform(self, input_data: InputData) -> OutputData:
"""Firstly perform filtration of columns
"""
Firstly perform filtration of columns
"""

clipped_input_data = input_data
Expand Down
2 changes: 0 additions & 2 deletions test/integration/real_applications/test_examples.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from datetime import timedelta

import numpy as np
import pytest
from sklearn.metrics import mean_squared_error

from examples.advanced.multimodal_text_num_example import run_multi_modal_example
Expand Down Expand Up @@ -84,7 +83,6 @@ def test_api_classification_example():
assert prediction is not None


@pytest.mark.skip(reason="topo features fail") # TODO resolve
def test_api_ts_forecasting_example():
forecast = run_ts_forecasting_example(dataset='salaries', timeout=2, with_tuning=False)
assert forecast is not None
Expand Down
Loading