Example of custom AutoML system implementation #1236

Open · wants to merge 12 commits into base: master
@@ -0,0 +1,174 @@
import datetime
from pathlib import Path

import tensorflow as tf
from golem.core.optimisers.genetic.gp_params import GPAlgorithmParameters
from golem.core.optimisers.genetic.operators.base_mutations import MutationTypesEnum
from golem.core.optimisers.genetic.operators.inheritance import GeneticSchemeTypesEnum
from golem.core.optimisers.genetic.operators.selection import SelectionTypesEnum
from golem.core.tuning.simultaneous import SimultaneousTuner
from hyperopt import hp
from sklearn.metrics import roc_auc_score as roc_auc

from fedot.core.composer.composer_builder import ComposerBuilder
from fedot.core.composer.gp_composer.specific_operators import parameter_change_mutation
from fedot.core.data.data import InputData, OutputData
from fedot.core.operations.operation_parameters import OperationParameters
from fedot.core.pipelines.node import PipelineNode
from fedot.core.pipelines.pipeline import Pipeline
from fedot.core.pipelines.pipeline_composer_requirements import PipelineComposerRequirements
from fedot.core.pipelines.tuning.search_space import PipelineSearchSpace
from fedot.core.pipelines.tuning.tuner_builder import TunerBuilder
from fedot.core.repository.metrics_repository import ClassificationMetricsEnum, ComplexityMetricsEnum
from fedot.core.repository.operation_types_repository import get_operations_for_task, OperationTypesRepository
from fedot.core.repository.tasks import Task, TaskTypesEnum
from fedot.core.utils import set_random_seed, fedot_project_root

custom_search_space = {
    'filter_1': {
        'r': {
            'hyperopt-dist': hp.uniformint,
            'sampling-scope': [-254, 254],
            'type': 'discrete'}
    },
    'filter_2': {
        'r': {
            'hyperopt-dist': hp.uniformint,
            'sampling-scope': [-254, 254],
            'type': 'discrete'}
    }
}
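# The leaf entries above follow the (hyperopt-dist, sampling-scope, type) structure that
# PipelineSearchSpace expects for tunable hyperparameters: both custom filters expose a
# single integer parameter 'r', sampled uniformly from [-254, 254].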


def calculate_validation_metric(predicted: OutputData, dataset_to_validate: InputData) -> float:
    # quality assessment of the predictions on the validation data
    roc_auc_value = roc_auc(y_true=dataset_to_validate.target,
                            y_score=predicted.predict,
                            multi_class="ovo")
    return roc_auc_value


def cnn_composite_pipeline() -> Pipeline:
    node_first = PipelineNode('filter_1')
    node_second = PipelineNode('cnn_1', nodes_from=[node_first])
    node_final = PipelineNode('rf', nodes_from=[node_second])
    pipeline = Pipeline(node_final)
    return pipeline


def setup_repository():
    repo_folder = Path(fedot_project_root(), 'examples', 'advanced', 'customization',
                       'repositories')
    OperationTypesRepository.__repository_dict__ = {
        'model': {'file': Path(repo_folder, 'my_model_repository.json'),
                  'initialized_repo': None, 'default_tags': []},
        'data_operation': {'file': Path(repo_folder, 'my_data_operation_repository.json'),
                           'initialized_repo': None, 'default_tags': []}
    }

    OperationParameters.custom_default_params_path = Path(repo_folder,
                                                          'my_default_operation_params.json')
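# Note: the custom repository JSON files referenced above are assumed to declare the custom
# operations used in this example ('filter_1', 'filter_2', 'cnn_1'), so that FEDOT can resolve
# them when building and mutating pipelines.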


def run_image_classification_automl(train_dataset: tuple,
                                    test_dataset: tuple):
    task = Task(TaskTypesEnum.classification)

    setup_repository()

    x_train, y_train = train_dataset[0], train_dataset[1]
    x_test, y_test = test_dataset[0], test_dataset[1]

    dataset_to_train = InputData.from_image(images=x_train,
                                            labels=y_train,
                                            task=task)
    dataset_to_validate = InputData.from_image(images=x_test,
                                               labels=y_test,
                                               task=task)

    dataset_to_train = dataset_to_train.subset_range(0, min(100, dataset_to_train.features.shape[0]))

    initial_pipeline = cnn_composite_pipeline()
    initial_pipeline.show()
    initial_pipeline.fit(dataset_to_train)
    predictions = initial_pipeline.predict(dataset_to_validate)
    roc_auc_on_valid = calculate_validation_metric(predictions,
                                                   dataset_to_validate)

    print(f'ROC AUC of the initial pipeline: {roc_auc_on_valid}')

    # the choice of metrics for pipeline quality assessment during composition
    quality_metric = ClassificationMetricsEnum.f1
    complexity_metric = ComplexityMetricsEnum.node_number
    metrics = [quality_metric, complexity_metric]

    # the choice and initialisation of the GP search requirements
    composer_requirements = PipelineComposerRequirements(
        primary=get_operations_for_task(task=task, mode='all'),
        timeout=datetime.timedelta(minutes=3),
        num_of_generations=20, n_jobs=1, cv_folds=None
    )

    pop_size = 5

    # search space for hyper-parametric mutation
    PipelineSearchSpace.pre_defined_custom_search_space = custom_search_space

    params = GPAlgorithmParameters(
        selection_types=[SelectionTypesEnum.spea2],
        genetic_scheme_type=GeneticSchemeTypesEnum.steady_state,
        mutation_types=[MutationTypesEnum.single_change, parameter_change_mutation],
        pop_size=pop_size
    )

    # create the composer with the required parameters
    composer = (
        ComposerBuilder(task=task)
        .with_optimizer_params(params)
        .with_requirements(composer_requirements)
        .with_metrics(metrics)
        .with_initial_pipelines(initial_pipelines=[initial_pipeline] * pop_size)
        .build()
    )

    # the generation of the optimal pipeline by composition - the most time-consuming step
    pipeline_evo_composed = composer.compose_pipeline(data=dataset_to_train)[0]

    pipeline_evo_composed.show()
    print(pipeline_evo_composed.descriptive_id)

    pipeline_evo_composed.fit(input_data=dataset_to_train)

    # tuning of the composed pipeline with the custom search space
    replace_default_search_space = True
    cv_folds = 1
    search_space = PipelineSearchSpace(custom_search_space=custom_search_space,
                                       replace_default_search_space=replace_default_search_space)

    pipeline_tuner = TunerBuilder(dataset_to_train.task) \
        .with_tuner(SimultaneousTuner) \
        .with_metric(ClassificationMetricsEnum.ROCAUC) \
        .with_cv_folds(cv_folds) \
        .with_iterations(50) \
        .with_search_space(search_space).build(dataset_to_train)

    pipeline_tuner.tune(pipeline_evo_composed)

    predictions = pipeline_evo_composed.predict(dataset_to_validate)

    roc_auc_on_valid = calculate_validation_metric(predictions,
                                                   dataset_to_validate)
    return roc_auc_on_valid, dataset_to_train, dataset_to_validate


if __name__ == '__main__':
    set_random_seed(1)

    training_set, testing_set = tf.keras.datasets.mnist.load_data(path='mnist.npz')
    roc_auc_on_valid, dataset_to_train, dataset_to_validate = run_image_classification_automl(
        train_dataset=training_set,
        test_dataset=testing_set)

    print(roc_auc_on_valid)
168 changes: 168 additions & 0 deletions examples/advanced/customization/implementations/cnn_impls.py
@@ -0,0 +1,168 @@
import logging
import os
import random
from typing import Optional

import numpy as np
from golem.utilities.requirements_notificator import warn_requirement

from fedot.core.operations.operation_parameters import OperationParameters

try:
    import tensorflow as tf
except ModuleNotFoundError:
    warn_requirement('tensorflow', 'fedot[extra]')
    tf = None

from fedot.core.data.data import InputData, OutputData
from golem.core.log import LoggerAdapter, default_log
from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import ModelImplementation
from sklearn import preprocessing

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'


def check_input_array(x_train):
    if np.max(x_train) > 1:
        transformed_x_train = x_train.astype("float32") / 255
        transform_flag = True
    else:
        transformed_x_train = x_train
        transform_flag = False

    return transformed_x_train, transform_flag


def create_simple_cnn(input_shape: tuple,
                      num_classes: int):
    model = tf.keras.Sequential(
        [
            tf.keras.layers.InputLayer(input_shape=input_shape),
            tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(num_classes, activation="softmax"),
        ]
    )

    return model
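# A standalone usage sketch for the MNIST data used in this example (assuming single-channel
# 28x28 images and 10 classes):
#   model = create_simple_cnn(input_shape=(28, 28, 1), num_classes=10)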


def fit_cnn(train_data: InputData,
            model,
            epochs: int = 10,
            batch_size: int = 128,
            optimizer_params: dict = None,
            logger: Optional[LoggerAdapter] = None):
    x_train, y_train = train_data.features, train_data.target
    transformed_x_train, transform_flag = check_input_array(x_train)

    if logger is None:
        logger = default_log(prefix=__name__)

    if transform_flag:
        logger.debug('Train data set was not scaled. The data was divided by 255.')

    if len(x_train.shape) == 3:
        # add a channel dimension while keeping the scaling applied above
        transformed_x_train = np.expand_dims(transformed_x_train, -1)

    if len(train_data.target.shape) < 2:
        encoder = preprocessing.OneHotEncoder()
        y_train = encoder.fit_transform(y_train.reshape(-1, 1)).toarray()

    if optimizer_params is None:
        optimizer_params = {'loss': "categorical_crossentropy",
                            'optimizer': "adam",
                            'metrics': ["accuracy"]}

    model.compile(**optimizer_params)
    model.num_classes = train_data.num_classes

    if logger.logging_level > logging.DEBUG:
        verbose = 0
    else:
        verbose = 2

    if epochs is None:
        logger.warning('The number of training epochs was not set. The selected number of epochs is 10.')
        epochs = 10

    model.fit(transformed_x_train, y_train, batch_size=batch_size, epochs=epochs,
              validation_split=0.1, verbose=verbose)

    return model


def predict_cnn(trained_model, predict_data: InputData, output_mode: str = 'labels', logger=None) -> OutputData:
    # placeholder prediction: returns random values instead of calling trained_model.predict
    prediction = np.asarray([[random.random()] for j in range(predict_data.features.shape[0])])
    return prediction
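# A minimal sketch of what a real prediction could look like here (assuming the model was
# trained with fit_cnn above); the random stub keeps the example lightweight:
#
#     x_test, _ = check_input_array(predict_data.features)
#     if len(x_test.shape) == 3:
#         x_test = np.expand_dims(x_test, -1)
#     probs = trained_model.predict(x_test)
#     return probs if output_mode == 'probs' else np.argmax(probs, axis=1)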


cnn_model_dict = {'simplified': create_simple_cnn}


class MyCNNImplementation(ModelImplementation):
    def __init__(self, params: Optional[OperationParameters] = None):
        super().__init__(params)

        default_params = {'log': default_log(prefix=__name__),
                          'epochs': 10,
                          'batch_size': 32,
                          'output_mode': 'labels',
                          'architecture_type': 'simplified',
                          'optimizer_parameters': {'loss': "categorical_crossentropy",
                                                   'optimizer': "adam",
                                                   'metrics': ["accuracy"]}}

        complete_params = {**default_params, **self.params.to_dict()}
        self.params.update(**complete_params)

    def fit(self, train_data):
        """ Fit the model on a dataset

        :param train_data: data to train the model on
        """

        # TODO: add a case for the multiclass multioutput task
        # check for a multioutput target
        if len(train_data.target.shape) < 2:
            self.classes = np.unique(train_data.target)
        else:
            self.classes = np.arange(train_data.target.shape[1])

        self.model = cnn_model_dict[self.params.get('architecture_type')](
            input_shape=train_data.features.shape[1:4],
            num_classes=len(self.classes))

        self.model = fit_cnn(train_data=train_data, model=self.model, epochs=self.params.get('epochs'),
                             batch_size=self.params.get('batch_size'),
                             optimizer_params=self.params.get('optimizer_parameters'),
                             logger=self.params.get('log'))
        return self.model

    def predict(self, input_data):
        """ Make a prediction with class labels for the predict stage

        :param input_data: data with features to process
        """

        return predict_cnn(trained_model=self.model, predict_data=input_data,
                           output_mode='labels', logger=self.params.get('log'))

    def predict_proba(self, input_data):
        """ Make a prediction with class probabilities

        :param input_data: data with features to process
        """

        return predict_cnn(trained_model=self.model, predict_data=input_data, output_mode='probs')

    @property
    def classes_(self):
        return self.classes

    def __deepcopy__(self, memo=None):
        clone_model = tf.keras.models.clone_model(self.model)
        clone_model.compile(optimizer=self.model.optimizer, loss=self.model.loss, metrics=self.model.metrics)
        clone_model.set_weights(self.model.get_weights())
        return clone_model
35 changes: 35 additions & 0 deletions examples/advanced/customization/implementations/preproc_impls.py
@@ -0,0 +1,35 @@
from copy import deepcopy
from typing import Optional

import numpy as np

from fedot.core.data.data import InputData, OutputData
from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import \
DataOperationImplementation
from fedot.core.operations.operation_parameters import OperationParameters
from fedot.core.repository.dataset_types import DataTypesEnum


class GammaFiltImplementation(DataOperationImplementation):
    def __init__(self, params: Optional[OperationParameters]):
        super().__init__(params)
        if not self.params:
            # use default parameters
            pass
        else:
            # check whether the appropriate parameters are used
            pass

    def fit(self, input_data: InputData):
        # the operation is stateless, so there is nothing to fit
        return None

    def transform(self, input_data: InputData) -> OutputData:
        # example of custom data pre-processing for the predict stage
        transformed_features = deepcopy(input_data.features)
        for i in range(transformed_features.shape[0]):
            transformed_features[i, :, :] = transformed_features[i, :, :] + np.random.normal(0, 30)

        output_data = self._convert_to_output(input_data,
                                              transformed_features, data_type=DataTypesEnum.image)
        return output_data