From 6d60801b09c929979eebd9057d1c1428de4b706a Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Wed, 7 Aug 2024 18:47:20 +0300
Subject: [PATCH 01/69] Adding logs & the ability to specify categorical data

---
 fedot/core/data/data.py              |  5 ++-
 fedot/preprocessing/data_types.py    | 52 +++++++++++++++++++++-------
 fedot/preprocessing/preprocessing.py | 12 +++++--
 3 files changed, 53 insertions(+), 16 deletions(-)

diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py
index 0101650ee2..f4ab7491a1 100644
--- a/fedot/core/data/data.py
+++ b/fedot/core/data/data.py
@@ -105,6 +105,7 @@ def from_numpy_time_series(cls,
     def from_dataframe(cls,
                        features_df: Union[pd.DataFrame, pd.Series],
                        target_df: Union[pd.DataFrame, pd.Series],
+                       categorical_idx: np.ndarray = None,
                        task: Union[Task, str] = 'classification',
                        data_type: DataTypesEnum = DataTypesEnum.table) -> InputData:
         """Import data from pandas DataFrame.
@@ -131,9 +132,11 @@ def from_dataframe(cls,
         features_names = features_df.columns.to_numpy()
         df = pd.concat([features_df, target_df], axis=1)
         features, target = process_target_and_features(df, target_columns)
+        categorical_features = features_df.loc[:, categorical_idx].to_numpy()
 
         return InputData(idx=idx, features=features, target=target, task=task, data_type=data_type,
-                         features_names=features_names)
+                         features_names=features_names, categorical_features=categorical_features,
+                         categorical_idx=categorical_idx)
 
     @classmethod
     def from_csv(cls,
diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py
index a81700b964..18a15b08bc 100644
--- a/fedot/preprocessing/data_types.py
+++ b/fedot/preprocessing/data_types.py
@@ -85,10 +85,10 @@ def convert_data_for_fit(self, data: InputData):
         # And in target(s)
         data.target = self.target_types_converting(target=data.target, task=data.task)
-        data.supplementary_data.col_type_ids = self.prepare_column_types_info(predictors=data.features,
-                                                                              target=data.target,
-                                                                              task=data.task)
-
+        column_types_info = self.prepare_column_types_info(predictors=data.features, target=data.target, task=data.task)
+        data.supplementary_data.col_type_ids = column_types_info
+        col_types_info_message = prepare_log_message_with_cols_types(column_types_info, data.features_names)
+        self.log.message(f'The information about types of each feature are {col_types_info_message}')
         self._into_numeric_features_transformation_for_fit(data)
         # Launch conversion float and integer features into categorical
         self._into_categorical_features_transformation_for_fit(data)
@@ -155,7 +155,7 @@ def target_types_converting(self, target: np.ndarray, task: Task) -> np.ndarray:
     def prepare_column_types_info(self, predictors: np.ndarray,
                                   target: np.ndarray = None, task: Task = None) -> dict:
-        """ Prepare information about columns in a form of dictionary
+        """ Prepare information about columns in a form of dictionary.
 
         Dictionary has two keys: 'target' and 'features'
         """
         if self.features_columns_info.empty:
@@ -181,7 +181,7 @@ def _retain_columns_info_without_types_conflicts(self, data: InputData):
         Such columns have no conflicts with types converting.
""" if self.string_columns_transformation_failed: - self.log.warning(f'Columns with indices {self.string_columns_transformation_failed} were ' + self.log.message(f'Columns with indices {self.string_columns_transformation_failed} were ' f'removed during mixed types column converting due to conflicts.') data.features = self.remove_incorrect_features(data.features, self.string_columns_transformation_failed) @@ -287,13 +287,26 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): # reduce dataframe to include only categorical features num_df = num_df.loc[:, (2 < nuniques) & (nuniques < self.categorical_max_uniques_th)] - cat_col_ids = num_df.columns - # Convert into string - data.features[:, cat_col_ids] = num_df.apply(convert_num_column_into_string_array).to_numpy() - # Columns need to be transformed into categorical (string) ones - self.numerical_into_str.extend(cat_col_ids.difference(self.numerical_into_str)) - # Update information about column types (in-place) - feature_type_ids[cat_col_ids] = TYPE_TO_ID[str] + + if data.categorical_idx is not None: + cat_col_ids = data.categorical_idx + else: + cat_col_ids = num_df.columns + + if np.size(cat_col_ids) > 0: + cat_features_names = data.features_names[cat_col_ids] + else: + cat_features_names = [] + + self.log.message(f'Preprocessing define next cols {cat_features_names} as categorical') + + if np.size(cat_col_ids) > 0: + # Convert into string + data.features[:, cat_col_ids] = num_df.apply(convert_num_column_into_string_array).to_numpy() + # Columns need to be transformed into categorical (string) ones + self.numerical_into_str.extend(cat_col_ids.difference(self.numerical_into_str)) + # Update information about column types (in-place) + feature_type_ids[cat_col_ids] = TYPE_TO_ID[str] def _into_categorical_features_transformation_for_predict(self, data: InputData): """ Apply conversion into categorical string column for every signed column """ @@ -499,3 +512,16 @@ def _process_predict_column_values_one_by_one(value, current_type: type): except ValueError: pass return new_value + + +def prepare_log_message_with_cols_types(col_types_info, features_names): + message = '\n' + for type_name, type_id in TYPE_TO_ID.items(): + count_types = np.count_nonzero(col_types_info['features'] == type_id) + features_idx = np.where(col_types_info['features'] == type_id)[0] + names_or_indexes = features_names[features_idx] if features_names is not None else features_idx + message = message + f'TYPE {type_name} - count {count_types} - features {names_or_indexes} \n' \ + + message = message + f'Target: TYPE {_convertable_types[col_types_info["target"][0]]}' + + return message diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index a59f901d1b..634d35e299 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -270,10 +270,13 @@ def _find_features_lacking_nans(self, data: InputData, source_name: str): features = data.features axes_except_cols = (0,) + tuple(range(2, features.ndim)) are_allowed = np.mean(pd.isna(features), axis=axes_except_cols) < ALLOWED_NAN_PERCENT + self.log.message( + f'The number of features with an acceptable nan\'s percent value was taken ' + f'{len(are_allowed)} / {data.features.shape[1]}' + ) self.ids_relevant_features[source_name] = np.flatnonzero(are_allowed) - @staticmethod - def _drop_rows_with_nan_in_target(data: InputData) -> InputData: + def _drop_rows_with_nan_in_target(self, data: InputData) -> InputData: """ Drops rows with nans 
From 057c4d24ad58466c1d5a0d9b00cdc028f800653b Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Fri, 9 Aug 2024 16:06:24 +0300
Subject: [PATCH 02/69] Fixes categorical features

---
 fedot/core/data/data.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py
index f4ab7491a1..dcaefc2d73 100644
--- a/fedot/core/data/data.py
+++ b/fedot/core/data/data.py
@@ -132,7 +132,10 @@ def from_dataframe(cls,
         features_names = features_df.columns.to_numpy()
         df = pd.concat([features_df, target_df], axis=1)
         features, target = process_target_and_features(df, target_columns)
-        categorical_features = features_df.loc[:, categorical_idx].to_numpy()
+
+        categorical_features = None
+        if categorical_idx is not None:
+            categorical_features = features_df.loc[:, categorical_idx].to_numpy()
 
         return InputData(idx=idx, features=features, target=target, task=task, data_type=data_type,
                          features_names=features_names, categorical_features=categorical_features,

From 4b4536af23a77d0bf4034ad5a166cb40af6d6d63 Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Fri, 9 Aug 2024 18:38:31 +0300
Subject: [PATCH 03/69] Changing getsizeof to nbytes

---
 fedot/api/api_utils/api_data.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/fedot/api/api_utils/api_data.py b/fedot/api/api_utils/api_data.py
index 69c9f2a97b..1ba04a7f59 100644
--- a/fedot/api/api_utils/api_data.py
+++ b/fedot/api/api_utils/api_data.py
@@ -1,4 +1,3 @@
-import sys
 from datetime import datetime
 from typing import Dict, Union
 from typing import Optional
@@ -133,7 +132,7 @@ def accept_and_apply_recommendations(self, input_data: Union[InputData, MultiMod
     def fit_transform(self, train_data: InputData) -> InputData:
         start_time = datetime.now()
         self.log.message('Preprocessing data')
-        memory_usage = convert_memory_size(sys.getsizeof(train_data.features))
+        memory_usage = convert_memory_size(train_data.features.nbytes)
         features_shape = train_data.features.shape
         target_shape = train_data.target.shape
         self.log.message(
@@ -144,7 +143,7 @@ def fit_transform(self, train_data: InputData) -> InputData:
         train_data = self.preprocessor.convert_indexes_for_fit(pipeline=Pipeline(), data=train_data)
 
         train_data.supplementary_data.is_auto_preprocessed = True
-        memory_usage = convert_memory_size(sys.getsizeof(train_data.features))
+        memory_usage = convert_memory_size(train_data.features.nbytes)
         features_shape = train_data.features.shape
         target_shape = train_data.target.shape
         self.log.message(
@@ -156,7 +155,7 @@ def transform(self, test_data: InputData, current_pipeline) -> InputData:
         start_time = datetime.now()
         self.log.message('Preprocessing data')
-        memory_usage = convert_memory_size(sys.getsizeof(test_data))
+        memory_usage = convert_memory_size(test_data.features.nbytes)
         features_shape = test_data.features.shape
         target_shape = test_data.target.shape
         self.log.message(
@@ -168,7 +167,7 @@ def transform(self, test_data: InputData, current_pipeline) -> InputData:
         test_data = self.preprocessor.update_indices_for_time_series(test_data)
 
         test_data.supplementary_data.is_auto_preprocessed = True
-        memory_usage = convert_memory_size(sys.getsizeof(test_data))
+        memory_usage = convert_memory_size(test_data.features.nbytes)
         features_shape = test_data.features.shape
         target_shape = test_data.target.shape
         self.log.message(
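Context for PATCH 03: `sys.getsizeof` reports the Python object, not necessarily the data it refers to — on the old code path `sys.getsizeof(test_data)` measured only the tiny `InputData` wrapper, and for array views the underlying buffer is not counted either, while `.nbytes` always reports the logical buffer size. A quick standalone check:

import sys
import numpy as np

arr = np.zeros((10_000, 100))
print(arr.nbytes)          # 8_000_000 bytes: the real buffer size
print(sys.getsizeof(arr))  # similar here, because this array owns its buffer

view = arr[:, :50]
print(view.nbytes)          # 4_000_000: logical size of the slice
print(sys.getsizeof(view))  # ~100 bytes: a view's buffer is not counted
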
From ae6eb42667384039c721010a36e30bdcdda50d89 Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Fri, 9 Aug 2024 18:45:45 +0300
Subject: [PATCH 04/69] Delete _clean_extra_spaces

---
 fedot/preprocessing/preprocessing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py
index 634d35e299..f4d40e27da 100644
--- a/fedot/preprocessing/preprocessing.py
+++ b/fedot/preprocessing/preprocessing.py
@@ -227,7 +227,7 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str,
             # TODO andreygetmanov to new class text preprocessing?
             replace_nans_with_empty_strings(data)
         elif data_type_is_table(data):
-            data = self._clean_extra_spaces(data)
+            # data = self._clean_extra_spaces(data)
             # Process binary categorical features
             if is_fit_stage:
                 data = self.binary_categorical_processors[source_name].fit_transform(data)

From f0df60ceb30a923f4e9bbfe2e8b6adee7b074b83 Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Tue, 13 Aug 2024 17:59:51 +0300
Subject: [PATCH 05/69] Adding more logs, adding OptimisedFeature storage,
 refactoring fitting BinaryCategoricalPreprocessor, fix bugs, adding reduce
 memory size, delete clean_extra_spaces

---
 fedot/api/api_utils/api_data.py      |  27 ++++++-
 fedot/core/data/data.py              |  58 +++++++++++++-
 fedot/preprocessing/categorical.py   |  56 +++++++------
 fedot/preprocessing/data_types.py    |   5 +-
 fedot/preprocessing/preprocessing.py | 115 +++++++++++++++++++--------
 5 files changed, 197 insertions(+), 64 deletions(-)

diff --git a/fedot/api/api_utils/api_data.py b/fedot/api/api_utils/api_data.py
index 1ba04a7f59..9d6d1a33bd 100644
--- a/fedot/api/api_utils/api_data.py
+++ b/fedot/api/api_utils/api_data.py
@@ -6,7 +6,7 @@ from golem.core.log import default_log
 
 from fedot.api.api_utils.data_definition import data_strategy_selector, FeaturesType, TargetType
-from fedot.core.data.data import InputData, OutputData, data_type_is_table
+from fedot.core.data.data import InputData, OutputData, data_type_is_table, OptimisedFeature
 from fedot.core.data.data_preprocessing import convert_into_column
 from fedot.core.data.multi_modal import MultiModalData
 from fedot.core.pipelines.pipeline import Pipeline
@@ -138,12 +138,26 @@ def fit_transform(self, train_data: InputData) -> InputData:
         self.log.message(
             f'Train Data (Original) Memory Usage: {memory_usage} Data Shapes: {features_shape, target_shape}')
 
+        self.log.message('- Obligatory preprocessing started')
         train_data = self.preprocessor.obligatory_prepare_for_fit(data=train_data)
+
+        self.log.message('- Optional preprocessing started')
         train_data = self.preprocessor.optional_prepare_for_fit(pipeline=Pipeline(), data=train_data)
+
+        self.log.message('- Converting indexes for fitting started')
         train_data = self.preprocessor.convert_indexes_for_fit(pipeline=Pipeline(), data=train_data)
 
+        self.log.message('- Reducing memory started')
+        train_data = self.preprocessor.reduce_memory_size(data=train_data)
+
         train_data.supplementary_data.is_auto_preprocessed = True
-        memory_usage = convert_memory_size(train_data.features.nbytes)
+        if isinstance(train_data.features, OptimisedFeature):
+            memory_usage = convert_memory_size(train_data.features.memory_usage)
+
+        else:
+            memory_usage = convert_memory_size(train_data.features.nbytes)
+
         features_shape = train_data.features.shape
         target_shape = train_data.target.shape
         self.log.message(
@@ -167,7 +181,14 @@ def transform(self, test_data: InputData, current_pipeline) -> InputData:
         test_data = self.preprocessor.update_indices_for_time_series(test_data)
 
         test_data.supplementary_data.is_auto_preprocessed = True
-        memory_usage = convert_memory_size(test_data.features.nbytes)
+        test_data = self.preprocessor.reduce_memory_size(data=test_data)
+
+        if isinstance(test_data.features, OptimisedFeature):
+            memory_usage = convert_memory_size(test_data.features.memory_usage)
+
+        else:
+            memory_usage = convert_memory_size(test_data.features.nbytes)
+
         features_shape = test_data.features.shape
         target_shape = test_data.target.shape
         self.log.message(
diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py
index dcaefc2d73..ed3433beca 100644
--- a/fedot/core/data/data.py
+++ b/fedot/core/data/data.py
@@ -42,7 +42,7 @@ class Data:
     idx: np.ndarray
     task: Task
     data_type: DataTypesEnum
-    features: np.ndarray
+    features: Optional[np.ndarray, OptimisedFeature]
     categorical_features: Optional[np.ndarray] = None
     categorical_idx: Optional[np.ndarray] = None
     numerical_idx: Optional[np.ndarray] = None
@@ -671,6 +671,62 @@ class OutputData(Data):
     target: Optional[np.ndarray] = None
     encoded_idx: Optional[np.ndarray] = None
 
+
+@dataclass
+class OptimisedFeature:
+    _columns: list = field(default_factory=list, init=False)
+    _shape: tuple = field(default=(0, 0), init=False)
+    _memory_usage: int = 0
+    ndim: int = 2
+
+    def add_column(self, data: np.ndarray):
+        if not isinstance(data, np.ndarray):
+            raise ValueError("Data should be a NumPy array.")
+
+        if self._shape == (0, 0):
+            self._shape = (data.shape[0], 1)
+        else:
+            if data.shape[0] != self._shape[0]:
+                raise ValueError("All columns must have the same number of rows.")
+
+            self._shape = (self._shape[0], self._shape[1] + 1)
+
+        self._columns.append(data)
+        self._memory_usage += data.nbytes
+
+    def __getitem__(self, key):
+        if isinstance(key, tuple):
+            row_idx, col_idx = key
+            if isinstance(col_idx, int):
+                return self._columns[col_idx][row_idx]
+            else:
+                selected_columns = [self._columns[i] for i in col_idx]
+                return np.column_stack(selected_columns)[row_idx]
+        else:
+            result = np.column_stack(self._columns)[key]
+            return result if result.ndim > 1 else result.ravel()
+
+    def __setitem__(self, key, value):
+        if isinstance(key, tuple):
+            row_idx, col_idx = key
+            if isinstance(col_idx, int):
+                self._columns[col_idx][row_idx] = value
+            else:
+                for i, col in zip(col_idx, value):
+                    self._columns[i][row_idx] = col
+        else:
+            raise NotImplementedError("Setting values by index without specifying a column is not supported.")
+
+    def __len__(self):
+        return self._shape[0] if self._columns else 0
+
+    @property
+    def shape(self):
+        return self._shape
+
+    @property
+    def memory_usage(self):
+        return self._memory_usage
+
 
 def _resize_image(file_path: str, target_size: Tuple[int, int]):
     """Function resizes and rewrites the input image
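The idea behind the columnar `OptimisedFeature` storage above is that a single 2-D `np.ndarray` forces one common dtype on every column, so a single string column promotes the whole table to a wide dtype; keeping each column as its own array lets every column keep its narrowest dtype. A rough standalone comparison of the two layouts (exact sizes vary, the point is the ratio):

import numpy as np

n = 100_000
age = np.random.randint(0, 90, n).astype(np.int8)    # 1 byte per value
price = np.random.rand(n).astype(np.float32)         # 4 bytes per value
city = np.random.choice(['msk', 'spb'], n)           # fixed-width unicode

# A single 2-D array must pick one common dtype for all three columns,
# so every cell is promoted to a wide fixed-width unicode type
matrix = np.column_stack([age, price, city])
print(matrix.dtype, matrix.nbytes)

# Column-wise storage keeps each column at its own narrow dtype
print(sum(col.nbytes for col in (age, price, city)))
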
diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py
index 5cde088d7a..509bb811c0 100644
--- a/fedot/preprocessing/categorical.py
+++ b/fedot/preprocessing/categorical.py
@@ -24,31 +24,39 @@ def fit(self, input_data: InputData):
         Find indices of columns which are contains categorical values.
         Binary features and at the same time has str objects.
         If there are such features - convert it into int
         """
-        feature_type_ids = input_data.supplementary_data.col_type_ids['features']
-        categorical_ids, _ = find_categorical_columns(input_data.features,
-                                                      feature_type_ids)
-
-        binary_ids_to_convert = []
-        for column_id, column in zip(categorical_ids, input_data.features[:, categorical_ids].T):
-            pd_column = pd.Series(column, name=column_id, copy=True)
-            is_nan = pd_column.isna()
-            column_nuniques = pd_column.nunique(dropna=False)
-            if is_nan.sum():
-                # This categorical column has nans
-                pd_column[is_nan] = FEDOT_STR_NAN
-
-            if column_nuniques <= 3:
-                # There is column with binary categories and gaps
-                self.binary_features_with_nans.append(column_id)
+        if np.size(input_data.categorical_idx) != 0:
+            categorical_columns = input_data.features[:, input_data.categorical_idx].T
+            nan_matrix = np.isnan(categorical_columns.astype(float, copy=False))
+            nuniques = np.array([len(np.unique(col[~is_nan])) for col, is_nan in zip(categorical_columns, nan_matrix)])
+
+            binary_ids_to_convert = []
+
+            for i, (column_id, column_nuniques, is_nan) in enumerate(
+                zip(input_data.categorical_idx, nuniques, nan_matrix)
+            ):
+                if is_nan.any():
+                    # This categorical column has nans
+                    categorical_columns[i, is_nan] = FEDOT_STR_NAN
+                    column_nuniques = len(set(categorical_columns[i]))
+
+                if column_nuniques <= 3:
+                    # There is column with binary categories and gaps
+                    self.binary_features_with_nans.append(column_id)
+                    binary_ids_to_convert.append(column_id)
+                    self._train_encoder(pd.Series(categorical_columns[i], name=column_id))
+
+                elif column_nuniques <= 2:
+                    # Column contains binary string feature
                 binary_ids_to_convert.append(column_id)
-                self._train_encoder(pd_column)
-            elif column_nuniques <= 2:
-                # Column contains binary string feature
-                binary_ids_to_convert.append(column_id)
-                # Train encoder for current column
-                self._train_encoder(pd_column)
-
-        self.binary_ids_to_convert = binary_ids_to_convert
+                    # Train encoder for current column
+                    self._train_encoder(pd.Series(categorical_columns[i], name=column_id))
+
+            # Remove binary columns from categorical_idx
+            input_data.categorical_idx = [idx for idx in input_data.categorical_idx if idx not in binary_ids_to_convert]
+            self.binary_ids_to_convert = binary_ids_to_convert
+
+            # TODO: Add log.message with binary ids
+
         return self
 
     def transform(self, input_data: InputData) -> InputData:
diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py
index 18a15b08bc..aa5038c201 100644
--- a/fedot/preprocessing/data_types.py
+++ b/fedot/preprocessing/data_types.py
@@ -88,7 +88,7 @@ def convert_data_for_fit(self, data: InputData):
         column_types_info = self.prepare_column_types_info(predictors=data.features, target=data.target, task=data.task)
         data.supplementary_data.col_type_ids = column_types_info
         col_types_info_message = prepare_log_message_with_cols_types(column_types_info, data.features_names)
-        self.log.message(f'The information about types of each feature are {col_types_info_message}')
+        self.log.message(f'--- The information about types of each feature are {col_types_info_message}')
         self._into_numeric_features_transformation_for_fit(data)
         # Launch conversion float and integer features into categorical
         self._into_categorical_features_transformation_for_fit(data)
@@ -292,13 +292,14 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData):
             cat_col_ids = data.categorical_idx
         else:
             cat_col_ids = num_df.columns
+            data.categorical_idx = cat_col_ids
 
         if np.size(cat_col_ids) > 0:
             cat_features_names = data.features_names[cat_col_ids]
         else:
             cat_features_names = []
 
-        self.log.message(f'Preprocessing define next cols {cat_features_names} as categorical')
+        self.log.message(f'--- Preprocessing define next cols {cat_features_names} as categorical')
 
         if np.size(cat_col_ids) > 0:
             # Convert into string
diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py
index f4d40e27da..a0271086ad 100644
--- a/fedot/preprocessing/preprocessing.py
+++ b/fedot/preprocessing/preprocessing.py
@@ -7,7 +7,7 @@ from golem.core.paths import copy_doc
 from sklearn.preprocessing import LabelEncoder
 
-from fedot.core.data.data import InputData, np_datetime_to_numeric
+from fedot.core.data.data import InputData, np_datetime_to_numeric, OptimisedFeature
 from fedot.core.data.data import OutputData, data_type_is_table, data_type_is_text, data_type_is_ts
 from fedot.core.data.data_preprocessing import (
     data_has_categorical_features,
@@ -29,7 +29,7 @@ from fedot.preprocessing.base_preprocessing import BasePreprocessor
 from fedot.preprocessing.categorical import BinaryCategoricalPreprocessor
 from fedot.preprocessing.data_type_check import exclude_image, exclude_multi_ts, exclude_ts
-from fedot.preprocessing.data_types import TYPE_TO_ID, TableTypesCorrector
+from fedot.preprocessing.data_types import TYPE_TO_ID, TableTypesCorrector, _convertable_types
 from fedot.preprocessing.structure import DEFAULT_SOURCE_NAME, PipelineStructureExplorer
 
 # The allowed percent of empty samples in features.
@@ -192,6 +192,7 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str,
             return data
 
         # Convert datetime data to numerical
+        self.log.message('-- Converting datetime data to numerical')
         data.features = np_datetime_to_numeric(data.features)
         if data.target is not None:
             data.target = np_datetime_to_numeric(data.target)
@@ -200,36 +201,49 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str,
         data.idx = np.asarray(data.idx)
 
         # Fix tables / time series sizes
+        self.log.message('-- Fixing table / time series shapes')
         data = self._correct_shapes(data)
         replace_inf_with_nans(data)
 
         # Find incorrect features which must be removed
         if is_fit_stage:
+            self.log.message('-- Finding incorrect features')
             self._find_features_lacking_nans(data, source_name)
+
+        self.log.message('-- Removing incorrect features')
         self._take_only_correct_features(data, source_name)
 
         if is_fit_stage:
+            self.log.message('-- Dropping rows with nan\'s in target')
             data = self._drop_rows_with_nan_in_target(data)
 
             # Column types processing - launch after correct features selection
+            self.log.message('-- Features types processing')
             self.types_correctors[source_name].convert_data_for_fit(data)
+
             if self.types_correctors[source_name].target_converting_has_errors:
+                self.log.message('-- Dropping rows with nan\'s in target')
                 data = self._drop_rows_with_nan_in_target(data)
+
             # Train Label Encoder for categorical target if necessary and apply it
+            self.log.message('-- Applying the Label Encoder to Target due to the presence of categories')
             if source_name not in self.target_encoders:
                 self._train_target_encoder(data, source_name)
+
             data.target = self._apply_target_encoding(data, source_name)
+
         else:
+            self.log.message('-- Converting data for predict')
             self.types_correctors[source_name].convert_data_for_predict(data)
 
         # TODO andreygetmanov target encoding must be obligatory for all data types
         if data_type_is_text(data):
             # TODO andreygetmanov to new class text preprocessing?
             replace_nans_with_empty_strings(data)
+
         elif data_type_is_table(data):
-            # data = self._clean_extra_spaces(data)
-            # Process binary categorical features
             if is_fit_stage:
+                self.log.message('-- Searching binary categorical features to encode them')
                 data = self.binary_categorical_processors[source_name].fit_transform(data)
             else:
                 data = self.binary_categorical_processors[source_name].transform(data)
@@ -252,10 +266,13 @@ def _prepare_optional(self, pipeline, data: InputData, source_name: str):
             (data_has_missing_values, 'imputation', self._apply_imputation_unidata),
             (data_has_categorical_features, 'encoding', self._apply_categorical_encoding)
         ]:
+            self.log.message(f'-- Deciding to apply {tag_to_check} for data')
             if has_problems(data):
+                self.log.message(f'-- Finding {tag_to_check} is required and trying to apply')
                 # Data contains missing values
                 has_tag = PipelineStructureExplorer.check_structure_by_tag(
                     pipeline, tag_to_check=tag_to_check, source_name=source_name)
+
                 if not has_tag:
                     data = action_if_no_tag(data, source_name)
 
@@ -271,7 +288,7 @@ def _find_features_lacking_nans(self, data: InputData, source_name: str):
         axes_except_cols = (0,) + tuple(range(2, features.ndim))
         are_allowed = np.mean(pd.isna(features), axis=axes_except_cols) < ALLOWED_NAN_PERCENT
         self.log.message(
-            f'The number of features with an acceptable nan\'s percent value was taken '
+            f'--- The number of features with an acceptable nan\'s percent value was taken '
             f'{len(are_allowed)} / {data.features.shape[1]}'
         )
         self.ids_relevant_features[source_name] = np.flatnonzero(are_allowed)
@@ -303,39 +320,12 @@ def _drop_rows_with_nan_in_target(self, data: InputData) -> InputData:
         data.idx = np.array(data.idx)[non_nan_row_ids]
 
         self.log.message(
-            f'The number of rows with an nan\'s in target is '
+            f'--- The number of rows with an nan\'s in target is '
             f'{sum(number_nans_per_rows)} / {data.features.shape[0]}'
         )
 
         return data
 
-    @staticmethod
-    def _clean_extra_spaces(data: InputData) -> InputData:
-        """
-        Removes extra spaces from data.
-        Transforms cells in columns from ' x ' to 'x'
-
-        Args:
-            data: to be stripped
-
-        Returns:
-            cleaned ``data``
-        """
-
-        def strip_all_strs(item: Union[object, str]):
-            try:
-                return item.strip()
-            except AttributeError:
-                # not a str object
-                return item
-
-        features_df = pd.DataFrame(data.features)
-        mixed_or_str = features_df.select_dtypes(object)
-        features_df[mixed_or_str.columns] = mixed_or_str.applymap(strip_all_strs)
-
-        data.features = features_df.to_numpy()
-        return data
-
     @copy_doc(BasePreprocessor.label_encoding_for_fit)
     def label_encoding_for_fit(self, data: InputData, source_name: str = DEFAULT_SOURCE_NAME):
         if data_has_categorical_features(data):
@@ -369,20 +359,26 @@ def _apply_imputation_unidata(self, data: InputData, source_name: str) -> InputD
         Returns:
             imputed ``data``
         """
+        self.log.message('--- Initialising imputer')
         imputer = self.features_imputers.get(source_name)
+
         if not imputer:
             imputer = ImputationImplementation()
+            self.log.message('--- Fitting and transforming imputer for missings')
             output_data = imputer.fit_transform(data)
             self.features_imputers[source_name] = imputer
+
         else:
+            self.log.message('--- Transforming imputer for missings')
            output_data = imputer.transform(data)
+
         data.features = output_data.predict
         return data
 
     def _apply_categorical_encoding(self, data: InputData, source_name: str) -> InputData:
        """ Transforms the data inplace. Uses the same transformations as for the training data if trained already.
-        Otherwise fits appropriate encoder and converts data's categorical features with it.
+        Otherwise, fits appropriate encoder and converts data's categorical features with it.
 
         Args:
             data: data to be transformed
 
         Returns:
             encoded ``data``
         """
+        self.log.message('--- Initialising categorical encoder')
         encoder = self.features_encoders.get(source_name)
+
         if encoder is None:
             encoder = LabelEncodingImplementation() if self.use_label_encoder else OneHotEncodingImplementation()
             encoder.fit(data)
             self.features_encoders[source_name] = encoder
+
+        self.log.message(f'--- {encoder.__class__.__name__} was choose')
+        self.log.message(f'--- Fitting and transforming data')
         output_data = encoder.transform_for_fit(data)
         output_data.predict = output_data.predict.astype(float)
         data.features = output_data.predict
@@ -550,3 +551,49 @@ def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalD
             last_id = len(input_data.idx)
             input_data.idx = np.arange(last_id, last_id + input_data.task.task_params.forecast_length)
         return test_data
+
+    def reduce_memory_size(self, data: InputData) -> InputData:
+        def reduce_mem_usage_np(arr, initial_types):
+            reduced_columns = OptimisedFeature()
+
+
+            for i in range(arr.shape[1]):
+                col = arr[:, i]
+                init_type = _convertable_types[initial_types[i]]
+                col = col.astype(init_type)
+                col_type = col.dtype.name
+
+                if col_type not in ['object']:
+                    c_min = col.max()
+                    c_max = col.max()
+
+                    if np.issubdtype(col.dtype, np.integer):
+                        if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
+                            reduced_columns.add_column(col.astype(np.int8))
+                        elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
+                            reduced_columns.add_column(col.astype(np.int16))
+                        elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
+                            reduced_columns.add_column(col.astype(np.int32))
+                        elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
+                            reduced_columns.add_column(col.astype(np.int64))
+
+                    elif np.issubdtype(col.dtype, np.floating):
+                        if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
+                            reduced_columns.add_column(col.astype(np.float16))
+                        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
+                            reduced_columns.add_column(col.astype(np.float32))
+                        else:
+                            reduced_columns.add_column(col.astype(np.float64))
+                else:
+                    reduced_columns.add_column(col)
+
+            return reduced_columns
+
+        if isinstance(data, InputData):
+            self.log.message('-- Reduce memory in features')
+            data.features = reduce_mem_usage_np(data.features, data.supplementary_data.col_type_ids['features'])
+
+            self.log.message('-- Reduce memory in target')
+            data.target = reduce_mem_usage_np(data.target, data.supplementary_data.col_type_ids['target'])
+
+        return data
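A standalone sketch of the downcasting rule that `reduce_mem_usage_np` above applies, with one caveat: a safe cast has to compare the column minimum against the candidate type's lower bound and the maximum against its upper bound, whereas the hunk above assigns `col.max()` to both `c_min` and `c_max`. The sketch keeps both bounds:

import numpy as np

def downcast_numeric(col: np.ndarray) -> np.ndarray:
    # Pick the narrowest dtype whose range covers the column's values
    c_min, c_max = col.min(), col.max()
    if np.issubdtype(col.dtype, np.integer):
        for candidate in (np.int8, np.int16, np.int32, np.int64):
            info = np.iinfo(candidate)
            if info.min < c_min and c_max < info.max:
                return col.astype(candidate)
    elif np.issubdtype(col.dtype, np.floating):
        for candidate in (np.float16, np.float32):
            info = np.finfo(candidate)
            if info.min < c_min and c_max < info.max:
                return col.astype(candidate)  # note: float16/float32 trade precision for memory
        return col.astype(np.float64)
    return col

print(downcast_numeric(np.arange(1000, dtype=np.int64)).dtype)  # int16: 999 exceeds the int8 range
print(downcast_numeric(np.array([0.5, 1e30])).dtype)            # float32: 1e30 exceeds float16
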
From e4c13f54a0689235646fb0be1d027258d456b24c Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Wed, 14 Aug 2024 17:14:00 +0300
Subject: [PATCH 06/69] @Lopa10ko requested changes

---
 fedot/api/api_utils/api_data.py      |  7 +------
 fedot/core/data/data.py              | 10 +++++-----
 fedot/preprocessing/data_types.py    |  4 ++--
 fedot/preprocessing/preprocessing.py |  6 +++---
 4 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/fedot/api/api_utils/api_data.py b/fedot/api/api_utils/api_data.py
index 9d6d1a33bd..3ac1c9242b 100644
--- a/fedot/api/api_utils/api_data.py
+++ b/fedot/api/api_utils/api_data.py
@@ -183,12 +183,7 @@ def transform(self, test_data: InputData, current_pipeline) -> InputData:
 
         test_data = self.preprocessor.reduce_memory_size(data=test_data)
 
-        if isinstance(test_data.features, OptimisedFeature):
-            memory_usage = convert_memory_size(test_data.features.memory_usage)
-
-        else:
-            memory_usage = convert_memory_size(test_data.features.nbytes)
-
+        memory_usage = convert_memory_size(test_data.features.nbytes)
         features_shape = test_data.features.shape
         target_shape = test_data.target.shape
         self.log.message(
diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py
index ed3433beca..2650d2cedd 100644
--- a/fedot/core/data/data.py
+++ b/fedot/core/data/data.py
@@ -42,7 +42,7 @@ class Data:
     idx: np.ndarray
     task: Task
     data_type: DataTypesEnum
-    features: Optional[np.ndarray, OptimisedFeature]
+    features: Union[np.ndarray, OptimisedFeature]
     categorical_features: Optional[np.ndarray] = None
     categorical_idx: Optional[np.ndarray] = None
     numerical_idx: Optional[np.ndarray] = None
@@ -675,7 +675,7 @@ class OptimisedFeature:
     _columns: list = field(default_factory=list, init=False)
     _shape: tuple = field(default=(0, 0), init=False)
-    _memory_usage: int = 0
+    nbytes: int = 0
     ndim: int = 2
 
     def add_column(self, data: np.ndarray):
@@ -691,7 +691,7 @@ def add_column(self, data: np.ndarray):
             self._shape = (self._shape[0], self._shape[1] + 1)
 
         self._columns.append(data)
-        self._memory_usage += data.nbytes
+        self.nbytes += data.nbytes
 
     def __getitem__(self, key):
@@ -724,8 +724,8 @@ def shape(self):
         return self._shape
 
     @property
-    def memory_usage(self):
-        return self._memory_usage
+    def nbytes(self):
+        return self.nbytes
diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py
index aa5038c201..f8436c4932 100644
--- a/fedot/preprocessing/data_types.py
+++ b/fedot/preprocessing/data_types.py
@@ -521,8 +521,8 @@ def prepare_log_message_with_cols_types(col_types_info, features_names):
         count_types = np.count_nonzero(col_types_info['features'] == type_id)
         features_idx = np.where(col_types_info['features'] == type_id)[0]
         names_or_indexes = features_names[features_idx] if features_names is not None else features_idx
-        message = message + f'TYPE {type_name} - count {count_types} - features {names_or_indexes} \n' \
+        message += f'TYPE {type_name} - count {count_types} - features {names_or_indexes} \n' \
 
-    message = message + f'Target: TYPE {_convertable_types[col_types_info["target"][0]]}'
+    message += f'Target: TYPE {_convertable_types[col_types_info["target"][0]]}'
 
     return message
diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py
index a0271086ad..6128090bc4 100644
--- a/fedot/preprocessing/preprocessing.py
+++ b/fedot/preprocessing/preprocessing.py
@@ -214,7 +214,7 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str,
         self._take_only_correct_features(data, source_name)
 
         if is_fit_stage:
-            self.log.message('-- Dropping rows with nan\'s in target')
+            self.log.message('-- Dropping rows with NaN-values in target')
             data = self._drop_rows_with_nan_in_target(data)
 
             # Column types processing - launch after correct features selection
@@ -222,7 +222,7 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str,
             self.types_correctors[source_name].convert_data_for_fit(data)
 
             if self.types_correctors[source_name].target_converting_has_errors:
-                self.log.message('-- Dropping rows with nan\'s in target')
+                self.log.message('-- Dropping rows with NaN-values in target')
                 data = self._drop_rows_with_nan_in_target(data)
 
             # Train Label Encoder for categorical target if necessary and apply it
@@ -395,7 +395,7 @@ def _apply_categorical_encoding(self, data: InputData, source_name: str) -> Inpu
             encoder.fit(data)
             self.features_encoders[source_name] = encoder
 
-        self.log.message(f'--- {encoder.__class__.__name__} was choose')
+        self.log.message(f'--- {encoder.__class__.__name__} was chosen')
         self.log.message(f'--- Fitting and transforming data')
         output_data = encoder.transform_for_fit(data)

From c0f7ff322eca01c8664c5db0c2e565024e417be8 Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Wed, 14 Aug 2024 18:00:54 +0300
Subject: [PATCH 07/69] Fix bug with nbytes

---
 fedot/core/data/data.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py
index 2650d2cedd..3077f42e92 100644
--- a/fedot/core/data/data.py
+++ b/fedot/core/data/data.py
@@ -675,7 +675,7 @@ class OptimisedFeature:
     _columns: list = field(default_factory=list, init=False)
     _shape: tuple = field(default=(0, 0), init=False)
-    nbytes: int = 0
+    _nbytes: int = 0
     ndim: int = 2
 
     def add_column(self, data: np.ndarray):
@@ -691,7 +691,7 @@ def add_column(self, data: np.ndarray):
             self._shape = (self._shape[0], self._shape[1] + 1)
 
         self._columns.append(data)
-        self.nbytes += data.nbytes
+        self._nbytes += data.nbytes
 
     def __getitem__(self, key):
@@ -725,7 +725,7 @@ def shape(self):
 
     @property
     def nbytes(self):
-        return self.nbytes
+        return self._nbytes

From 6d7bf975c422e13bbc27c182419bc7b85041d2a8 Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Wed, 14 Aug 2024 18:01:52 +0300
Subject: [PATCH 08/69] Fix bug with cat_features_names if there aren't exists
 features_names

---
 fedot/preprocessing/data_types.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py
index f8436c4932..7475a51c56 100644
--- a/fedot/preprocessing/data_types.py
+++ b/fedot/preprocessing/data_types.py
@@ -295,11 +295,13 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData):
         if np.size(cat_col_ids) > 0:
-            cat_features_names = data.features_names[cat_col_ids]
+            if data.features_names is not None:
+                cat_features_names = data.features_names[cat_col_ids]
+                self.log.message(f'--- Preprocessing define next cols {cat_features_names} as categorical')
+            else:
+                self.log.message(f'--- Preprocessing define next cols {cat_col_ids} as categorical')
         else:
-            cat_features_names = []
-
-        self.log.message(f'--- Preprocessing define next cols {cat_features_names} as categorical')
+            self.log.message(f'--- Preprocessing was unable to define the categorical columns')
 
         if np.size(cat_col_ids) > 0:
             # Convert into string

From 705529a851e402c8ca532afe92e004eab9a48805 Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Wed, 14 Aug 2024 18:02:19 +0300
Subject: [PATCH 09/69] Adding reduce_memory_size to pipeline._preprocess

---
 fedot/core/pipelines/pipeline.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/fedot/core/pipelines/pipeline.py b/fedot/core/pipelines/pipeline.py
index 66c62b09e4..76b7c4a7ac 100644
--- a/fedot/core/pipelines/pipeline.py
+++ b/fedot/core/pipelines/pipeline.py
@@ -137,18 +137,17 @@ def _preprocess(self, input_data: Union[InputData, MultiModalData], *, is_fit_st
         if is_fit_stage:
             copied_input_data = self.preprocessor.obligatory_prepare_for_fit(copied_input_data)
             # Make additional preprocessing if it is needed
-            copied_input_data = self.preprocessor.optional_prepare_for_fit(pipeline=self,
-                                                                           data=copied_input_data)
-            copied_input_data = self.preprocessor.convert_indexes_for_fit(pipeline=self,
-                                                                          data=copied_input_data)
+            copied_input_data = self.preprocessor.optional_prepare_for_fit(pipeline=self, data=copied_input_data)
+            copied_input_data = self.preprocessor.convert_indexes_for_fit(pipeline=self, data=copied_input_data)
+            copied_input_data = self.preprocessor.reduce_memory_size(data=copied_input_data)
         else:
             copied_input_data = self.preprocessor.obligatory_prepare_for_predict(copied_input_data)
             # Make additional preprocessing if it is needed
-            copied_input_data = self.preprocessor.optional_prepare_for_predict(pipeline=self,
-                                                                               data=copied_input_data)
-            copied_input_data = self.preprocessor.convert_indexes_for_predict(pipeline=self,
-                                                                              data=copied_input_data)
+            copied_input_data = self.preprocessor.optional_prepare_for_predict(pipeline=self, data=copied_input_data)
+            copied_input_data = self.preprocessor.convert_indexes_for_predict(pipeline=self, data=copied_input_data)
             copied_input_data = self.preprocessor.update_indices_for_time_series(copied_input_data)
+            copied_input_data = self.preprocessor.reduce_memory_size(data=copied_input_data)
+
         return copied_input_data
 
     def _postprocess(self, copied_input_data: Optional[InputData], result: OutputData,

From 4c7d281eefed6092ea6e6057a05929b9d7ff65fe Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Wed, 14 Aug 2024 19:20:13 +0300
Subject: [PATCH 10/69] Return to Pandas for nan_matrix

---
 fedot/preprocessing/categorical.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py
index 509bb811c0..c879a25744 100644
--- a/fedot/preprocessing/categorical.py
+++ b/fedot/preprocessing/categorical.py
@@ -26,8 +26,10 @@ def fit(self, input_data: InputData):
         """
         if np.size(input_data.categorical_idx) != 0:
             categorical_columns = input_data.features[:, input_data.categorical_idx].T
-            nan_matrix = np.isnan(categorical_columns.astype(float, copy=False))
-            nuniques = np.array([len(np.unique(col[~is_nan])) for col, is_nan in zip(categorical_columns, nan_matrix)])
+            nan_matrix = pd.DataFrame(categorical_columns.T, columns=input_data.categorical_idx).isna().values
+            nuniques = np.array([
+                len(np.unique(col[~is_nan])) for col, is_nan in zip(categorical_columns, nan_matrix.T)
+            ])
 
             binary_ids_to_convert = []
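The reason for the switch back to pandas in PATCH 10 is that `np.isnan` only works on float data: categorical columns arrive as `object` (or string) arrays, where casting to float raises for any non-numeric value, while `pd.isna` handles `None`/`np.nan` element-wise on arbitrary objects. A minimal reproduction:

import numpy as np
import pandas as pd

col = np.array(['msk', None, 'spb', float('nan')], dtype=object)

print(pd.isna(col))                # [False  True False  True]

try:
    np.isnan(col.astype(float))    # the approach this commit replaces
except ValueError as err:
    print(err)                     # could not convert string to float: 'msk'
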
From 75901ae96fece246c669ca78e714900a5eb7577b Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Wed, 14 Aug 2024 19:20:47 +0300
Subject: [PATCH 11/69] Change logic of
 _into_categorical_features_transformation_for_fit

---
 fedot/preprocessing/data_types.py | 52 +++++++++++++++----------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py
index 7475a51c56..ce9442d1b3 100644
--- a/fedot/preprocessing/data_types.py
+++ b/fedot/preprocessing/data_types.py
@@ -279,37 +279,37 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData):
         Perform automated categorical features determination.
         If feature column contains int or float values with few unique values (less than 13)
         """
-        feature_type_ids = data.supplementary_data.col_type_ids['features']
-        is_numeric_type = np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]])
-        numeric_type_ids = np.flatnonzero(is_numeric_type)
-        num_df = pd.DataFrame(data.features[:, numeric_type_ids], columns=numeric_type_ids)
-        nuniques = num_df.nunique(dropna=True)
-
-        # reduce dataframe to include only categorical features
-        num_df = num_df.loc[:, (2 < nuniques) & (nuniques < self.categorical_max_uniques_th)]
+        if data.categorical_idx is None:
+            feature_type_ids = data.supplementary_data.col_type_ids['features']
+            is_numeric_type = np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]])
+            numeric_type_ids = np.flatnonzero(is_numeric_type)
+            num_df = pd.DataFrame(data.features[:, numeric_type_ids], columns=numeric_type_ids)
+            nuniques = num_df.nunique(dropna=True)
 
-        if data.categorical_idx is not None:
-            cat_col_ids = data.categorical_idx
-        else:
-            cat_col_ids = num_df.columns
-            data.categorical_idx = cat_col_ids
-
-        if np.size(cat_col_ids) > 0:
-            if data.features_names is not None:
-                cat_features_names = data.features_names[cat_col_ids]
-                self.log.message(f'--- Preprocessing define next cols {cat_features_names} as categorical')
-            else:
-                self.log.message(f'--- Preprocessing define next cols {cat_col_ids} as categorical')
-        else:
-            self.log.message(f'--- Preprocessing was unable to define the categorical columns')
+            # reduce dataframe to include only categorical features
+            num_df = num_df.loc[:, (2 < nuniques) & (nuniques < self.categorical_max_uniques_th)]
+            cat_col_from_heuristic_rule_ids = num_df.columns
 
-        if np.size(cat_col_ids) > 0:
             # Convert into string
-            data.features[:, cat_col_ids] = num_df.apply(convert_num_column_into_string_array).to_numpy()
+            data.features[:, cat_col_from_heuristic_rule_ids] = num_df.apply(
+                convert_num_column_into_string_array).to_numpy()
             # Columns need to be transformed into categorical (string) ones
-            self.numerical_into_str.extend(cat_col_ids.difference(self.numerical_into_str))
+            self.numerical_into_str.extend(cat_col_from_heuristic_rule_ids.difference(self.numerical_into_str))
             # Update information about column types (in-place)
-            feature_type_ids[cat_col_ids] = TYPE_TO_ID[str]
+            feature_type_ids[cat_col_from_heuristic_rule_ids] = TYPE_TO_ID[str]
+
+        is_cat_type = np.isin(feature_type_ids, [TYPE_TO_ID[str]])
+        all_cat_col_ids = np.flatnonzero(is_cat_type)
+        data.categorical_idx = all_cat_col_ids
+
+        if np.size(all_cat_col_ids) > 0:
+            if data.features_names is not None:
+                cat_features_names = data.features_names[all_cat_col_ids]
+                self.log.message(f'--- Preprocessing define next cols {cat_features_names} as categorical')
+            else:
+                self.log.message(f'--- Preprocessing define next cols {all_cat_col_ids} as categorical')
+        else:
+            self.log.message(f'--- Preprocessing was unable to define the categorical columns')
 
     def _into_categorical_features_transformation_for_predict(self, data: InputData):
         """ Apply conversion into categorical string column for every signed column """

From 426dbd9a09b1cd75b2acb1e48d200f0a0ba8e7a6 Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Wed, 14 Aug 2024 19:56:39 +0300
Subject: [PATCH 12/69] Adding convert to np.array

---
 fedot/core/data/data_preprocessing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py
index c8f9fd383a..40afac8d87 100644
--- a/fedot/core/data/data_preprocessing.py
+++ b/fedot/core/data/data_preprocessing.py
@@ -101,8 +101,8 @@ def data_has_categorical_features(data: InputData) -> bool:
     feature_type_ids = data.supplementary_data.col_type_ids['features']
     cat_ids, non_cat_ids = find_categorical_columns(data.features, feature_type_ids)
 
-    data.numerical_idx = non_cat_ids
-    data.categorical_idx = cat_ids
+    data.numerical_idx = np.array(non_cat_ids)
+    data.categorical_idx = np.array(cat_ids)
 
     if len(cat_ids) > 0:
         data.categorical_features = data.subset_features(cat_ids).features

From 9ab9f997d754921f3f3b4796c053b81d7d0dffbb Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Wed, 14 Aug 2024 19:57:31 +0300
Subject: [PATCH 13/69] Update ImputationImplementation

---
 .../sklearn_transformations.py | 25 ++++++++++++-------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py
index 7a70442914..3485586fa4 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py
@@ -281,10 +281,13 @@ def fit(self, input_data: InputData):
         replace_inf_with_nans(input_data)
 
         if data_type_is_table(input_data):
+            categorical_idx = input_data.categorical_idx.tolist()
+            numerical_idx = np.setdiff1d(
+                np.concatenate((input_data.numerical_idx, input_data.encoded_idx)),
+                categorical_idx
+            ).tolist()
             # Tabular data contains categorical features
-            categorical_ids, non_categorical_ids = find_categorical_columns(input_data.features)
-            numerical, categorical = divide_data_categorical_numerical(input_data, categorical_ids,
-                                                                       non_categorical_ids)
+            numerical, categorical = divide_data_categorical_numerical(input_data, categorical_idx, numerical_idx)
 
             if categorical is not None and categorical.features.size > 0:
                 categorical.features = convert_into_column(categorical.features)
@@ -312,12 +315,16 @@ def transform(self, input_data: InputData) -> OutputData:
         replace_inf_with_nans(input_data)
 
-        if data_type_is_table(input_data) and data_has_categorical_features(input_data):
-            feature_type_ids = input_data.supplementary_data.col_type_ids['features']
-            self.categorical_ids, self.non_categorical_ids = find_categorical_columns(input_data.features,
-                                                                                      feature_type_ids)
-            numerical, categorical = divide_data_categorical_numerical(input_data, self.categorical_ids,
-                                                                       self.non_categorical_ids)
+        if data_type_is_table(input_data) and input_data.categorical_idx is not None:
+            self.categorical_ids = input_data.categorical_idx.tolist()
+            self.non_categorical_ids = np.setdiff1d(
+                np.concatenate((input_data.numerical_idx, input_data.encoded_idx)),
+                self.categorical_ids
+            ).tolist()
+
+            numerical, categorical = divide_data_categorical_numerical(
+                input_data, self.categorical_ids, self.non_categorical_ids
+            )
 
             if categorical is not None:
                 categorical_features = convert_into_column(categorical.features)

From b679660089a1339f2eb9d32b8cd6aa2b0758d061 Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Thu, 15 Aug 2024 13:50:03 +0300
Subject: [PATCH 14/69] Fix bug in BinaryCategorical

---
 fedot/preprocessing/categorical.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py
index c879a25744..2a2226d524 100644
--- a/fedot/preprocessing/categorical.py
+++ b/fedot/preprocessing/categorical.py
@@ -26,9 +26,9 @@ def fit(self, input_data: InputData):
         """
         if np.size(input_data.categorical_idx) != 0:
             categorical_columns = input_data.features[:, input_data.categorical_idx].T
-            nan_matrix = pd.DataFrame(categorical_columns.T, columns=input_data.categorical_idx).isna().values
+            nan_matrix = pd.DataFrame(categorical_columns.T, columns=input_data.categorical_idx).isna().values.T
             nuniques = np.array([
-                len(np.unique(col[~is_nan])) for col, is_nan in zip(categorical_columns, nan_matrix.T)
+                len(np.unique(col[~is_nan])) for col, is_nan in zip(categorical_columns, nan_matrix)
             ])
@@ -38,7 +38,7 @@ def fit(self, input_data: InputData):
             ):
                 if is_nan.any():
                     # This categorical column has nans
-                    categorical_columns[i, is_nan] = FEDOT_STR_NAN
+                    categorical_columns[i, np.where(is_nan)[0]] = FEDOT_STR_NAN
                     column_nuniques = len(set(categorical_columns[i]))

From 119bca83e9011c0cdbd02bb896850fa7ddff27b4 Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Thu, 15 Aug 2024 16:14:35 +0300
Subject: [PATCH 15/69] Fix bug with test_data_from_csv_load_correctly

---
 fedot/preprocessing/data_types.py    | 7 +++++++
 fedot/preprocessing/preprocessing.py | 4 ++++
 2 files changed, 11 insertions(+)

diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py
index ce9442d1b3..1628ead891 100644
--- a/fedot/preprocessing/data_types.py
+++ b/fedot/preprocessing/data_types.py
@@ -298,10 +298,16 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData):
             # Update information about column types (in-place)
             feature_type_ids[cat_col_from_heuristic_rule_ids] = TYPE_TO_ID[str]
 
+        # Update cat cols idx in data
         is_cat_type = np.isin(feature_type_ids, [TYPE_TO_ID[str]])
         all_cat_col_ids = np.flatnonzero(is_cat_type)
         data.categorical_idx = all_cat_col_ids
 
+        # Update num cols idx in data
+        is_numeric_type = np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]])
+        all_numeric_type_ids = np.flatnonzero(is_numeric_type)
+        data.numerical_idx = all_numeric_type_ids
+
         if np.size(all_cat_col_ids) > 0:
             if data.features_names is not None:
@@ -359,6 +365,7 @@ def _into_numeric_features_transformation_for_fit(self, data: InputData):
             (self.acceptable_failed_rate_bottom <= failed_ratio) &
             (failed_ratio < self.acceptable_failed_rate_top))
         self.string_columns_transformation_failed.update(dict.fromkeys(is_of_mistakes[is_of_mistakes].index))
+        data.numerical_idx = is_numeric_ids
 
     def _into_numeric_features_transformation_for_predict(self, data: InputData):
         """ Apply conversion into float string column for every signed column """
diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py
index 6128090bc4..9fee4db89f 100644
--- a/fedot/preprocessing/preprocessing.py
+++ b/fedot/preprocessing/preprocessing.py
@@ -236,6 +236,10 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str,
             self.types_correctors[source_name].convert_data_for_predict(data)
 
+        feature_type_ids = data.supplementary_data.col_type_ids['features']
+        data.numerical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]]))
+        data.categorical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[str]]))
+
         # TODO andreygetmanov target encoding must be obligatory for all data types
         if data_type_is_text(data):
             # TODO andreygetmanov to new class text preprocessing?

From 7a3946a95ed633e02e67d0cf0111652eabf93ce0 Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Thu, 15 Aug 2024 16:34:33 +0300
Subject: [PATCH 16/69] Fix bug with
 test_api_fit_predict_with_pseudo_large_dataset_with_label_correct

---
 .../data_operations/sklearn_transformations.py | 10 ++--------
 fedot/preprocessing/categorical.py             |  1 +
 fedot/preprocessing/preprocessing.py           | 13 ++++++++++---
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py
index 3485586fa4..9b6b3d2c8c 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py
@@ -282,10 +282,7 @@ def fit(self, input_data: InputData):
 
         if data_type_is_table(input_data):
             categorical_idx = input_data.categorical_idx.tolist()
-            numerical_idx = np.setdiff1d(
-                np.concatenate((input_data.numerical_idx, input_data.encoded_idx)),
-                categorical_idx
-            ).tolist()
+            numerical_idx = np.setdiff1d(input_data.numerical_idx, categorical_idx).tolist()
             # Tabular data contains categorical features
             numerical, categorical = divide_data_categorical_numerical(input_data, categorical_idx, numerical_idx)
@@ -317,10 +314,7 @@ def transform(self, input_data: InputData) -> OutputData:
             self.categorical_ids = input_data.categorical_idx.tolist()
-            self.non_categorical_ids = np.setdiff1d(
-                np.concatenate((input_data.numerical_idx, input_data.encoded_idx)),
-                self.categorical_ids
-            ).tolist()
+            self.non_categorical_ids = np.setdiff1d(input_data.numerical_idx, self.categorical_ids).tolist()
 
             numerical, categorical = divide_data_categorical_numerical(
                 input_data, self.categorical_ids, self.non_categorical_ids
             )
diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py
index 2a2226d524..07c70de0c9 100644
--- a/fedot/preprocessing/categorical.py
+++ b/fedot/preprocessing/categorical.py
@@ -55,6 +55,7 @@ def fit(self, input_data: InputData):
             # Remove binary columns from categorical_idx
             input_data.categorical_idx = [idx for idx in input_data.categorical_idx if idx not in binary_ids_to_convert]
+            input_data.categorical_idx = np.array(input_data.categorical_idx)
             self.binary_ids_to_convert = binary_ids_to_convert
 
             # TODO: Add log.message with binary ids
diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py
index 9fee4db89f..0752e17b3e 100644
--- a/fedot/preprocessing/preprocessing.py
+++ b/fedot/preprocessing/preprocessing.py
@@ -237,8 +237,7 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str,
         feature_type_ids = data.supplementary_data.col_type_ids['features']
-        data.numerical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]]))
-        data.categorical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[str]]))
+        data.numerical_idx, data.categorical_idx = self._update_num_and_cats_ids(feature_type_ids)
 
         # TODO andreygetmanov target encoding must be obligatory for all data types
         if data_type_is_text(data):
@@ -252,6 +251,9 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str,
             if is_fit_stage:
                 data = self.binary_categorical_processors[source_name].fit_transform(data)
             else:
                 data = self.binary_categorical_processors[source_name].transform(data)
 
+        feature_type_ids = data.supplementary_data.col_type_ids['features']
+        data.numerical_idx, data.categorical_idx = self._update_num_and_cats_ids(feature_type_ids)
+
         return data
 
     def _prepare_optional(self, pipeline, data: InputData, source_name: str):
@@ -562,7 +564,6 @@ def reduce_memory_size(self, data: InputData) -> InputData:
         def reduce_mem_usage_np(arr, initial_types):
             reduced_columns = OptimisedFeature()
 
-
             for i in range(arr.shape[1]):
                 col = arr[:, i]
                 init_type = _convertable_types[initial_types[i]]
@@ -601,3 +602,9 @@ def reduce_memory_size(self, data: InputData) -> InputData:
             data.target = reduce_mem_usage_np(data.target, data.supplementary_data.col_type_ids['target'])
 
         return data
+
+    def _update_num_and_cats_ids(self, feature_type_ids):
+        numerical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]]))
+        categorical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[str]]))
+
+        return numerical_idx, categorical_idx

From 3134fc64da0dbea2d59a672cc53bdc19ae9bb80f Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Thu, 15 Aug 2024 16:42:42 +0300
Subject: [PATCH 17/69] Fix bug with
 test_pipeline_preprocessing_through_api_correctly

---
 fedot/preprocessing/preprocessing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py
index 0752e17b3e..144d865eef 100644
--- a/fedot/preprocessing/preprocessing.py
+++ b/fedot/preprocessing/preprocessing.py
@@ -568,7 +568,7 @@ def reduce_mem_usage_np(arr, initial_types):
                 col = col.astype(init_type)
                 col_type = col.dtype.name
 
-                if col_type not in ['object']:
+                if col_type not in ['object', 'str384']:
                     c_min = col.max()
                     c_max = col.max()
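A note on the `'str384'` guard in PATCH 17: NumPy names fixed-width unicode dtypes by their total bit width (4 bytes per character), so the name changes with the string length, and a check against one literal name only catches that particular width. For example:

import numpy as np

print(np.array(['abc']).dtype)       # <U3
print(np.array(['abc']).dtype.name)  # 'str96': 3 chars * 32 bits each
print(np.dtype('U12').name)          # 'str384': 12 chars * 32 bits
print(np.dtype('U12').kind)          # 'U': a width-independent test
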
else: data = self.binary_categorical_processors[source_name].transform(data) + feature_type_ids = data.supplementary_data.col_type_ids['features'] + data.numerical_idx, data.categorical_idx = self._update_num_and_cats_ids(feature_type_ids) + return data def _prepare_optional(self, pipeline, data: InputData, source_name: str): @@ -560,7 +562,6 @@ def reduce_memory_size(self, data: InputData) -> InputData: def reduce_mem_usage_np(arr, initial_types): reduced_columns = OptimisedFeature() - for i in range(arr.shape[1]): col = arr[:, i] init_type = _convertable_types[initial_types[i]] @@ -601,3 +602,9 @@ def reduce_mem_usage_np(arr, initial_types): data.target = reduce_mem_usage_np(data.target, data.supplementary_data.col_type_ids['target']) return data + + def _update_num_and_cats_ids(self, feature_type_ids): + numerical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]])) + categorical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[str]])) + + return numerical_idx, categorical_idx From 3134fc64da0dbea2d59a672cc53bdc19ae9bb80f Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Thu, 15 Aug 2024 16:42:42 +0300 Subject: [PATCH 17/69] Fix bug with test_pipeline_preprocessing_through_api_correctly --- fedot/preprocessing/preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 0752e17b3e..144d865eef 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -568,7 +568,7 @@ def reduce_mem_usage_np(arr, initial_types): col = col.astype(init_type) col_type = col.dtype.name - if col_type not in ['object']: + if col_type not in ['object', 'str384']: c_min = col.max() c_max = col.max() From e5db54dcf809bf70e64f6e6c2197aea749a8e898 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Thu, 15 Aug 2024 16:59:13 +0300 Subject: [PATCH 18/69] Fix bug with test_default_forecast (add new TODO for ts_forecasting) --- fedot/core/data/data.py | 4 ++-- fedot/preprocessing/preprocessing.py | 13 +++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index 3077f42e92..c8f39e9392 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -613,7 +613,7 @@ def get_not_encoded_data(self): num_features_names, cat_features_names = None, None # Checking numerical data exists - if self.numerical_idx: + if self.numerical_idx.any(): num_features = self.features[:, self.numerical_idx] if self.features_names is not None and np.size(self.features_names): @@ -622,7 +622,7 @@ def get_not_encoded_data(self): num_features_names = np.array([f'num_feature_{i}' for i in range(1, num_features.shape[1] + 1)]) # Checking categorical data exists - if self.categorical_idx: + if self.categorical_idx.any(): cat_features = self.categorical_features if self.features_names is not None and np.size(self.features_names): diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 144d865eef..64e9720c3d 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -595,11 +595,16 @@ def reduce_mem_usage_np(arr, initial_types): return reduced_columns if isinstance(data, InputData): - self.log.message('-- Reduce memory in features') - data.features = reduce_mem_usage_np(data.features, data.supplementary_data.col_type_ids['features']) + if data.task.task_type == TaskTypesEnum.ts_forecasting: + # TODO: TS data has 
col_type_ids['features'] = None.
+            #  It is required to add support for this to reduce memory for TS data as well
+            pass
+        else:
+            self.log.message('-- Reduce memory in features')
+            data.features = reduce_mem_usage_np(data.features, data.supplementary_data.col_type_ids['features'])
 
-        self.log.message('-- Reduce memory in target')
-        data.target = reduce_mem_usage_np(data.target, data.supplementary_data.col_type_ids['target'])
+            self.log.message('-- Reduce memory in target')
+            data.target = reduce_mem_usage_np(data.target, data.supplementary_data.col_type_ids['target'])
 
         return data

From ebab7f2736fa9dc60ec1befdec2c51118220af3e Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Thu, 15 Aug 2024 17:15:38 +0300
Subject: [PATCH 19/69] Fix bug with test_cv_multiple_metrics_evaluated_correct by adding copy method to OptimisedFeature

---
 fedot/core/data/data.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py
index c8f39e9392..eb09a08ea8 100644
--- a/fedot/core/data/data.py
+++ b/fedot/core/data/data.py
@@ -719,6 +719,9 @@ def __setitem__(self, key, value):
     def __len__(self):
         return self._shape[0] if self._columns else 0
 
+    def copy(self):
+        return self._columns.copy()
+
     @property
     def shape(self):
         return self._shape

From c12377921b957f1238d09f79e48733fe1e48dcc4 Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Thu, 15 Aug 2024 17:19:04 +0300
Subject: [PATCH 20/69] Fix bug with test_regression_pipeline_with_data_operation_fit_predict_correct by adding check for target

---
 fedot/preprocessing/preprocessing.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py
index 64e9720c3d..874c308280 100644
--- a/fedot/preprocessing/preprocessing.py
+++ b/fedot/preprocessing/preprocessing.py
@@ -603,8 +603,9 @@ def reduce_mem_usage_np(arr, initial_types):
             self.log.message('-- Reduce memory in features')
             data.features = reduce_mem_usage_np(data.features, data.supplementary_data.col_type_ids['features'])
 
-            self.log.message('-- Reduce memory in target')
-            data.target = reduce_mem_usage_np(data.target, data.supplementary_data.col_type_ids['target'])
+            if data.target is not None:
+                self.log.message('-- Reduce memory in target')
+                data.target = reduce_mem_usage_np(data.target, data.supplementary_data.col_type_ids['target'])
 
         return data

From 2e168dc346570b8bc8867c022137c71af2e9145c Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Fri, 16 Aug 2024 14:10:04 +0300
Subject: [PATCH 21/69] Fix bug in test_default_train_test_simple with nbytes

---
 fedot/api/api_utils/api_data.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/fedot/api/api_utils/api_data.py b/fedot/api/api_utils/api_data.py
index 3ac1c9242b..c9db8f641e 100644
--- a/fedot/api/api_utils/api_data.py
+++ b/fedot/api/api_utils/api_data.py
@@ -152,11 +152,7 @@ def fit_transform(self, train_data: InputData) -> InputData:
 
         train_data.supplementary_data.is_auto_preprocessed = True
 
-        if isinstance(train_data.features, OptimisedFeature):
-            memory_usage = convert_memory_size(train_data.features.memory_usage)
-
-        else:
-            memory_usage = convert_memory_size(train_data.features.nbytes)
+        memory_usage = convert_memory_size(train_data.features.nbytes)
 
         features_shape = train_data.features.shape
         target_shape = train_data.target.shape

From f6d539a850a437a939321e71a00c8da3d014777a Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Fri, 16 Aug 2024 14:17:26 +0300
Subject: [PATCH 22/69] Fix bugs with str* types in features

---
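A note on the memory-reduction logic the last few patches keep touching: conceptually, reduce_mem_usage_np is per-column dtype downcasting. A minimal self-contained sketch of that idea, for reference only (the helper below is illustrative and not the project's actual implementation):

    import numpy as np

    def downcast_numeric(col: np.ndarray) -> np.ndarray:
        # Skip non-numeric (object/str) columns, as the patched code does
        if col.dtype.kind not in 'iuf':
            return col
        c_min, c_max = col.min(), col.max()
        candidates = (np.int8, np.int16, np.int32, np.int64) if col.dtype.kind in 'iu' \
            else (np.float32, np.float64)
        for dtype in candidates:
            info = np.iinfo(dtype) if np.issubdtype(dtype, np.integer) else np.finfo(dtype)
            if info.min <= c_min and c_max <= info.max:
                return col.astype(dtype)
        return col

    features = np.array([[0, 100], [1, 30000]], dtype=np.int64)
    reduced = [downcast_numeric(features[:, i]) for i in range(features.shape[1])]
    print([c.dtype for c in reduced])  # [dtype('int8'), dtype('int16')]

Keeping each column as its own array is what lets every column carry a different dtype; a single 2-D ndarray would force a common dtype and undo most of the savings.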
fedot/preprocessing/preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 874c308280..bc0661495c 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -568,7 +568,7 @@ def reduce_mem_usage_np(arr, initial_types): col = col.astype(init_type) col_type = col.dtype.name - if col_type not in ['object', 'str384']: + if col_type not in ['object', 'str32', 'str96', 'str128', 'str160', 'str384']: c_min = col.max() c_max = col.max() From 9290d8266f76032737937773b33ee2d83946f6f3 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Fri, 16 Aug 2024 14:20:58 +0300 Subject: [PATCH 23/69] Fix bug with test_inf_and_nan_absence_after_imputation_implementation_fit_transform by adding cat and num idx in get_dataset func --- .../test_data_operations_implementations.py | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/test/unit/data_operations/test_data_operations_implementations.py b/test/unit/data_operations/test_data_operations_implementations.py index b5832b1bc1..3f49b1b21e 100644 --- a/test/unit/data_operations/test_data_operations_implementations.py +++ b/test/unit/data_operations/test_data_operations_implementations.py @@ -130,15 +130,21 @@ def get_multivariate_time_series(mutli_ts=False): def get_nan_inf_data(): supp_data = SupplementaryData(col_type_ids={'features': np.array([TYPE_TO_ID[float]] * 4)}) - train_input = InputData(idx=[0, 1, 2, 3], - features=np.array([[1, 2, 3, 4], - [2, np.nan, 4, 5], - [3, 4, 5, np.inf], - [-np.inf, 5, 6, 7]]), - target=np.array([1, 2, 3, 4]), - task=Task(TaskTypesEnum.regression), - data_type=DataTypesEnum.table, - supplementary_data=supp_data) + train_input = InputData( + idx=np.array([0, 1, 2, 3]), + features=np.array([ + [1, 2, 3, 4], + [2, np.nan, 4, 5], + [3, 4, 5, np.inf], + [-np.inf, 5, 6, 7] + ]), + target=np.array([1, 2, 3, 4]), + numerical_idx=np.array([0, 1, 2, 4]), + categorical_idx=np.array([]), + task=Task(TaskTypesEnum.regression), + data_type=DataTypesEnum.table, + supplementary_data=supp_data + ) return train_input From 2f5946636b21ca925dacfad3e89111a153173407 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Fri, 16 Aug 2024 14:35:21 +0300 Subject: [PATCH 24/69] Fix bug with test_pipeline_objective_evaluate_with_different_metrics by switching Xgboost to Catboost, due to "Experimental support for categorical data is not implemented for current tree method yet." 
for XgBoost and checking feat ids with size --- fedot/core/data/data.py | 4 ++-- test/unit/optimizer/test_pipeline_objective_eval.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index eb09a08ea8..e1031cb4e0 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -613,7 +613,7 @@ def get_not_encoded_data(self): num_features_names, cat_features_names = None, None # Checking numerical data exists - if self.numerical_idx.any(): + if self.numerical_idx.size != 0: num_features = self.features[:, self.numerical_idx] if self.features_names is not None and np.size(self.features_names): @@ -622,7 +622,7 @@ def get_not_encoded_data(self): num_features_names = np.array([f'num_feature_{i}' for i in range(1, num_features.shape[1] + 1)]) # Checking categorical data exists - if self.categorical_idx.any(): + if self.categorical_idx.size != 0: cat_features = self.categorical_features if self.features_names is not None and np.size(self.features_names): diff --git a/test/unit/optimizer/test_pipeline_objective_eval.py b/test/unit/optimizer/test_pipeline_objective_eval.py index 145a28d3db..1a82f86548 100644 --- a/test/unit/optimizer/test_pipeline_objective_eval.py +++ b/test/unit/optimizer/test_pipeline_objective_eval.py @@ -35,7 +35,7 @@ def pipeline_second_test(): def pipeline_third_test(): - pipeline = PipelineBuilder().add_node('xgboost').build() + pipeline = PipelineBuilder().add_node('catboost').build() return pipeline From 1be317f32a5a3af0ca185f840c7af232bdada3b1 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Fri, 16 Aug 2024 14:56:13 +0300 Subject: [PATCH 25/69] Fix bug with test_order_by_data_flow_len_correct --- test/unit/pipelines/test_decompose_pipelines.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/unit/pipelines/test_decompose_pipelines.py b/test/unit/pipelines/test_decompose_pipelines.py index a3fdc50a30..fb86ca9646 100644 --- a/test/unit/pipelines/test_decompose_pipelines.py +++ b/test/unit/pipelines/test_decompose_pipelines.py @@ -145,14 +145,14 @@ def test_order_by_data_flow_len_correct(): counters can allow for decompose implementation to determine how the nodes in the graph are located """ - input_data = get_iris_data() - input_data = DataPreprocessor().obligatory_prepare_for_fit(input_data) - data_operations = ['scaling', 'normalization', 'pca', 'poly_features'] model_operations = ['lda', 'knn', 'logit'] list_with_operations = list(product(data_operations, model_operations)) for data_operation, model_operation in list_with_operations: + input_data = get_iris_data() + input_data = DataPreprocessor().obligatory_prepare_for_fit(input_data) + # Generate pipeline with different operations in the nodes with decomposition pipeline = generate_pipeline_with_decomposition(data_operation, model_operation) From 16285df716dd371b717da8ba48f65de5397683d2 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Fri, 16 Aug 2024 15:44:28 +0300 Subject: [PATCH 26/69] Fix bug with test_pipeline_with_imputer (finally) --- fedot/core/data/data.py | 5 +++++ fedot/core/data/merge/data_merger.py | 5 +++++ .../data_operations/sklearn_transformations.py | 10 +++++----- .../implementation_interfaces.py | 5 +++++ 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index e1031cb4e0..83056e37a1 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -545,7 +545,12 @@ def subset_features(self, feature_ids: list) -> 
Optional[InputData]: subsample_input = InputData(features=subsample_features, data_type=self.data_type, target=self.target, task=self.task, + categorical_features=self.categorical_features, idx=self.idx, + numerical_idx=self.numerical_idx, + categorical_idx=self.categorical_idx, + encoded_idx=self.encoded_idx, + features_names=self.features_names, supplementary_data=self.supplementary_data) return subsample_input diff --git a/fedot/core/data/merge/data_merger.py b/fedot/core/data/merge/data_merger.py index b7f8ac1a5f..a1dc312f0b 100644 --- a/fedot/core/data/merge/data_merger.py +++ b/fedot/core/data/merge/data_merger.py @@ -78,6 +78,11 @@ def merge(self) -> 'InputData': return InputData(idx=common_idx, features=merged_features, target=filtered_main_target, task=self.main_output.task, data_type=self.data_type, + numerical_idx=self.main_output.numerical_idx, + categorical_idx=self.main_output.categorical_idx, + encoded_idx=self.main_output.encoded_idx, + categorical_features=self.main_output.categorical_features, + features_names=self.main_output.features_names, supplementary_data=updated_metadata) def merge_targets(self) -> np.array: diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index 9b6b3d2c8c..4090b6002b 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -281,10 +281,10 @@ def fit(self, input_data: InputData): replace_inf_with_nans(input_data) if data_type_is_table(input_data): - categorical_idx = input_data.categorical_idx.tolist() - numerical_idx = np.setdiff1d(input_data.numerical_idx, categorical_idx).tolist() + encoded_idx = input_data.encoded_idx.tolist() + numerical_idx = np.setdiff1d(input_data.numerical_idx, encoded_idx).tolist() # Tabular data contains categorical features - numerical, categorical = divide_data_categorical_numerical(input_data, categorical_idx, numerical_idx) + numerical, categorical = divide_data_categorical_numerical(input_data, encoded_idx, numerical_idx) if categorical is not None and categorical.features.size > 0: categorical.features = convert_into_column(categorical.features) @@ -312,8 +312,8 @@ def transform(self, input_data: InputData) -> OutputData: replace_inf_with_nans(input_data) - if data_type_is_table(input_data) and input_data.categorical_idx is not None: - self.categorical_ids = input_data.categorical_idx.tolist() + if data_type_is_table(input_data) and input_data.encoded_idx is not None: + self.categorical_ids = input_data.encoded_idx.tolist() self.non_categorical_ids = np.setdiff1d(input_data.numerical_idx, self.categorical_ids).tolist() numerical, categorical = divide_data_categorical_numerical( diff --git a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py index ed952ecf81..c4f60d1bbb 100644 --- a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py +++ b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py @@ -236,6 +236,11 @@ def _convert_to_output_function(input_data: InputData, transformed_features: np. 
task=input_data.task, target=input_data.target, data_type=data_type, + numerical_idx=input_data.numerical_idx, + categorical_idx=input_data.categorical_idx, + encoded_idx=input_data.encoded_idx, + categorical_features=input_data.categorical_features, + features_names=input_data.features_names, supplementary_data=input_data.supplementary_data) return converted From 36f994c73d095fb71add51975d265bd82d6fa480 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Fri, 16 Aug 2024 16:22:23 +0300 Subject: [PATCH 27/69] Fix bug with test_correct_api_dataset_with_text_preprocessing by update col_type regex rule for str* --- fedot/preprocessing/preprocessing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index bc0661495c..1444e0287f 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -1,3 +1,4 @@ +import re from copy import copy from typing import Optional, Union @@ -568,7 +569,7 @@ def reduce_mem_usage_np(arr, initial_types): col = col.astype(init_type) col_type = col.dtype.name - if col_type not in ['object', 'str32', 'str96', 'str128', 'str160', 'str384']: + if col_type not in ['object'] and not bool(re.match(r'str\d*$', col_type)): c_min = col.max() c_max = col.max() From 0b8c41c5b3db9206e3e6cbdfe429c85821db7d3a Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Mon, 19 Aug 2024 18:14:21 +0300 Subject: [PATCH 28/69] Update for OneHotImplementation --- .../data_operations/categorical_encoders.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index dce9296c12..03b1ac70bb 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -4,7 +4,7 @@ import numpy as np from sklearn.preprocessing import LabelEncoder, OneHotEncoder -from fedot.core.data.data import InputData, OutputData +from fedot.core.data.data import InputData, OutputData, OptimisedFeature from fedot.core.data.data_preprocessing import find_categorical_columns from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import ( DataOperationImplementation @@ -35,10 +35,10 @@ def fit(self, input_data: InputData): """ features = input_data.features feature_type_ids = input_data.supplementary_data.col_type_ids['features'] - self.categorical_ids, self.non_categorical_ids = find_categorical_columns(features, feature_type_ids) + self.categorical_ids, self.non_categorical_ids = input_data.categorical_idx, input_data.numerical_idx # If there are categorical features - process it - if self.categorical_ids: + if self.categorical_ids.size > 0: updated_cat_features = features[:, self.categorical_ids].astype(str) self.encoder.fit(updated_cat_features) @@ -55,7 +55,7 @@ def transform(self, input_data: InputData) -> OutputData: copied_data = deepcopy(input_data) transformed_features = copied_data.features - if self.categorical_ids: + if self.categorical_ids.size > 0: # If categorical features exist transformed_features = self._apply_one_hot_encoding(transformed_features) @@ -67,7 +67,7 @@ def transform(self, input_data: InputData) -> OutputData: def _update_column_types(self, 
output_data: OutputData): """ Update column types after encoding. Categorical columns becomes integer with extension """ - if self.categorical_ids: + if self.categorical_ids.size > 0: # There are categorical features in the table feature_type_ids = output_data.supplementary_data.col_type_ids['features'] numerical_columns = feature_type_ids[feature_type_ids != TYPE_TO_ID[str]] @@ -108,9 +108,7 @@ def __init__(self, params: Optional[OperationParameters] = None): self.non_categorical_ids: List[int] = [] def fit(self, input_data: InputData): - feature_type_ids = input_data.supplementary_data.col_type_ids['features'] - self.categorical_ids, self.non_categorical_ids = find_categorical_columns(input_data.features, - feature_type_ids) + self.categorical_ids, self.non_categorical_ids = input_data.categorical_idx, input_data.numerical_idx # For every existing categorical feature - perform encoding self._fit_label_encoders(input_data.features) @@ -161,7 +159,12 @@ def _apply_label_encoder(self, data: np.ndarray): # Store np.nan values transformed_column = transformed_column.astype(object) transformed_column[nan_idxs] = np.nan - data[:, column_id] = transformed_column + + if isinstance(data, np.ndarray): + data[:, column_id] = transformed_column + + elif isinstance(data, OptimisedFeature): + data._columns[column_id] = transformed_column def get_params(self) -> OperationParameters: """ Due to LabelEncoder has no parameters - return empty set """ From c3a80696179a72dfe861927706ede6548dd44662 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Mon, 19 Aug 2024 18:15:02 +0300 Subject: [PATCH 29/69] Update for subset_features and post_init --- fedot/core/data/data.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index 83056e37a1..d8799ec555 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -481,9 +481,12 @@ class InputData(Data): def __post_init__(self): if self.numerical_idx is None: if self.features is not None and isinstance(self.features, np.ndarray) and self.features.ndim > 1: - self.numerical_idx = list(range(self.features.shape[1])) + if self.categorical_idx is None: + self.numerical_idx = np.arange(0, self.features.shape[1]) + else: + self.numerical_idx = np.setdiff1d(np.arange(0, self.features.shape[1]), self.categorical_idx) else: - self.numerical_idx = [0] + self.numerical_idx = np.array([0]) @property def num_classes(self) -> Optional[int]: @@ -534,24 +537,26 @@ def subset_indices(self, selected_idx: List): target=self.target[row_nums], task=self.task, data_type=self.data_type) - def subset_features(self, feature_ids: list) -> Optional[InputData]: + def subset_features(self, feature_ids: np.array) -> Optional[InputData]: """ Return new :obj:`InputData` with subset of features based on non-empty ``features_ids`` list or `None` otherwise """ - if not feature_ids: + if feature_ids.size == 0: return None subsample_features = self.features[:, feature_ids] - subsample_input = InputData(features=subsample_features, - data_type=self.data_type, - target=self.target, task=self.task, - categorical_features=self.categorical_features, - idx=self.idx, - numerical_idx=self.numerical_idx, - categorical_idx=self.categorical_idx, - encoded_idx=self.encoded_idx, - features_names=self.features_names, - supplementary_data=self.supplementary_data) + subsample_input = InputData( + features=subsample_features, + data_type=self.data_type, + target=self.target, task=self.task, + idx=self.idx, + 
categorical_idx=np.setdiff1d(self.categorical_idx, feature_ids), + numerical_idx=np.setdiff1d(self.numerical_idx, feature_ids), + encoded_idx=np.setdiff1d(self.encoded_idx, feature_ids), + categorical_features=self.categorical_features, + features_names=self.features_names, + supplementary_data=self.supplementary_data + ) return subsample_input From 1d5ecfe6f1fe6111a1b16dd145cde8e880300746 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Mon, 19 Aug 2024 18:15:38 +0300 Subject: [PATCH 30/69] Update data_has_categorical_features --- fedot/core/data/data_preprocessing.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 40afac8d87..4323020da3 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -33,8 +33,8 @@ def convert_into_column(array: np.ndarray) -> np.ndarray: return array -def divide_data_categorical_numerical(input_data: InputData, categorical_ids: list, - non_categorical_ids: list) -> Tuple[Optional[InputData], Optional[InputData]]: +def divide_data_categorical_numerical(input_data: InputData, categorical_ids: np.ndarray, + non_categorical_ids: np.ndarray) -> Tuple[Optional[InputData], Optional[InputData]]: """ Split tabular InputData into two parts: with numerical and categorical features using list with ids of categorical and numerical features. @@ -98,16 +98,12 @@ def data_has_categorical_features(data: InputData) -> bool: if data.data_type is not DataTypesEnum.table: return False - feature_type_ids = data.supplementary_data.col_type_ids['features'] - cat_ids, non_cat_ids = find_categorical_columns(data.features, feature_type_ids) - - data.numerical_idx = np.array(non_cat_ids) - data.categorical_idx = np.array(cat_ids) + cat_ids, non_cat_ids = data.categorical_idx, data.numerical_idx if len(cat_ids) > 0: - data.categorical_features = data.subset_features(cat_ids).features + data.categorical_features = data.features[:, cat_ids] - return bool(cat_ids) + return bool(cat_ids.tolist()) def data_has_text_features(data: InputData) -> bool: From eb14784f540fe90b1f4a8c8f1efbc5d51bf57791 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Mon, 19 Aug 2024 18:15:57 +0300 Subject: [PATCH 31/69] Adding bool to numerical --- fedot/preprocessing/preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 1444e0287f..d98c32da1a 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -611,7 +611,7 @@ def reduce_mem_usage_np(arr, initial_types): return data def _update_num_and_cats_ids(self, feature_type_ids): - numerical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]])) + numerical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float], TYPE_TO_ID[bool]])) categorical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[str]])) return numerical_idx, categorical_idx From af00955c247fc4c9cdcd33b43774cbe52e279292 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Mon, 19 Aug 2024 18:16:14 +0300 Subject: [PATCH 32/69] Update for ImputationImplementation --- .../sklearn_transformations.py | 32 ++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py 
b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index 4090b6002b..3e127d09c6 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -264,7 +264,7 @@ def __init__(self, params: Optional[OperationParameters] = None): default_params_categorical = {'strategy': 'most_frequent'} self.params_cat = {**self.params.to_dict(), **default_params_categorical} self.params_num = self.params.to_dict() - self.categorical_ids = None + self.categorical_or_encoded_ids = None self.non_categorical_ids = None self.ids_binary_integer_features = {} @@ -281,10 +281,20 @@ def fit(self, input_data: InputData): replace_inf_with_nans(input_data) if data_type_is_table(input_data): - encoded_idx = input_data.encoded_idx.tolist() - numerical_idx = np.setdiff1d(input_data.numerical_idx, encoded_idx).tolist() + self.non_categorical_ids = input_data.numerical_idx + + # The data may have arrived here before categorical data encoding was called. + if input_data.categorical_idx is not None and input_data.encoded_idx is None: + self.categorical_or_encoded_ids = input_data.categorical_idx + + # Otherwise, it may have arrived here after categorical data encoding + elif input_data.encoded_idx is not None: + self.categorical_or_encoded_ids = input_data.encoded_idx + # Tabular data contains categorical features - numerical, categorical = divide_data_categorical_numerical(input_data, encoded_idx, numerical_idx) + numerical, categorical = divide_data_categorical_numerical( + input_data, self.categorical_or_encoded_ids, self.non_categorical_ids + ) if categorical is not None and categorical.features.size > 0: categorical.features = convert_into_column(categorical.features) @@ -312,12 +322,11 @@ def transform(self, input_data: InputData) -> OutputData: replace_inf_with_nans(input_data) - if data_type_is_table(input_data) and input_data.encoded_idx is not None: - self.categorical_ids = input_data.encoded_idx.tolist() - self.non_categorical_ids = np.setdiff1d(input_data.numerical_idx, self.categorical_ids).tolist() + categorical_features, numerical_features = None, None + if data_type_is_table(input_data): numerical, categorical = divide_data_categorical_numerical( - input_data, self.categorical_ids, self.non_categorical_ids + input_data, self.categorical_or_encoded_ids, self.non_categorical_ids ) if categorical is not None: @@ -332,13 +341,14 @@ def transform(self, input_data: InputData) -> OutputData: numerical_features = self.imputer_num.transform(numerical_features) numerical_features = self._correct_binary_ids_features(numerical_features) - if categorical is not None and numerical is not None: + if categorical_features is not None and numerical_features is not None: # Stack both categorical and numerical features transformed_features = self._categorical_numerical_union(categorical_features, numerical_features) - elif categorical is not None and numerical is None: + elif categorical_features is not None and numerical_features is None: # Dataset contain only categorical features transformed_features = categorical_features + elif categorical is None and numerical is not None: # Dataset contain only numerical features transformed_features = numerical_features @@ -368,7 +378,7 @@ def _categorical_numerical_union(self, categorical_features: np.array, numerical """Merge numerical and categorical features in right order 
(as it was in source table) """ - categorical_df = pd.DataFrame(categorical_features, columns=self.categorical_ids) + categorical_df = pd.DataFrame(categorical_features, columns=self.categorical_or_encoded_ids) numerical_df = pd.DataFrame(numerical_features, columns=self.non_categorical_ids) all_features_df = pd.concat([numerical_df, categorical_df], axis=1) From 600d12c4592cebb7efae2385e30ba8ac83ec9772 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Mon, 19 Aug 2024 18:17:43 +0300 Subject: [PATCH 33/69] Fix data for tests --- .../test_data_operations_implementations.py | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/test/unit/data_operations/test_data_operations_implementations.py b/test/unit/data_operations/test_data_operations_implementations.py index 3f49b1b21e..9529b33316 100644 --- a/test/unit/data_operations/test_data_operations_implementations.py +++ b/test/unit/data_operations/test_data_operations_implementations.py @@ -139,7 +139,7 @@ def get_nan_inf_data(): [-np.inf, 5, 6, 7] ]), target=np.array([1, 2, 3, 4]), - numerical_idx=np.array([0, 1, 2, 4]), + numerical_idx=np.array([0, 1, 2, 3]), categorical_idx=np.array([]), task=Task(TaskTypesEnum.regression), data_type=DataTypesEnum.table, @@ -216,10 +216,14 @@ def get_nan_binary_data(task=None): [1, '1', 1], [5, '1', 1]], dtype=object) - input_data = InputData(idx=[0, 1, 2, 3], features=features, - target=np.array([[0], [0], [1], [1]]), - task=task, data_type=DataTypesEnum.table, - supplementary_data=supp_data) + input_data = InputData( + idx=np.array([0, 1, 2, 3]), + features=features, + target=np.array([[0], [0], [1], [1]]), + categorical_idx=np.array([1]), + task=task, data_type=DataTypesEnum.table, + supplementary_data=supp_data + ) return input_data @@ -266,9 +270,19 @@ def data_with_binary_int_features_and_equal_categories(): [np.nan, np.nan], [0, 0]]) target = np.array([['not-nan'], ['nan'], ['nan'], ['not-nan']]) - train_input = InputData(idx=[0, 1, 2, 3], features=features, target=target, - task=task, data_type=DataTypesEnum.table, - supplementary_data=supp_data) + train_input = InputData( + idx=np.array([0, 1, 2, 3]), + features=features, + target=target, + numerical_idx=np.array([0, 1]), + categorical_idx=np.array([]), + encoded_idx=np.array([]), + categorical_features=None, + features_names=None, + task=task, + data_type=DataTypesEnum.table, + supplementary_data=supp_data + ) return train_input From 91c24a481727161b463645793e594a1af6c0cd61 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Mon, 19 Aug 2024 18:18:04 +0300 Subject: [PATCH 34/69] Fix test with adding new types --- test/integration/api/test_main_api.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/integration/api/test_main_api.py b/test/integration/api/test_main_api.py index 01700842e1..351dc6e24a 100644 --- a/test/integration/api/test_main_api.py +++ b/test/integration/api/test_main_api.py @@ -224,8 +224,14 @@ def test_categorical_preprocessing_unidata_predefined_linear(): pipeline.fit(train_data) prediction = pipeline.predict(test_data) + types_encountered = ( + int, float, + np.int8, np.int16, np.int32, np.int64, + np.float16, np.float32, np.float64, + ) + for i in range(prediction.features.shape[1]): - assert all(list(map(lambda x: isinstance(x, (int, float)), prediction.features[:, i]))) + assert all(list(map(lambda x: isinstance(x, types_encountered), prediction.features[:, i]))) def test_fill_nan_without_categorical(): From 313ad8a583b2b5fc6c0e0ef923d95f4d512a43d0 Mon Sep 
17 00:00:00 2001 From: Andrey Stebenkov Date: Mon, 19 Aug 2024 18:34:41 +0300 Subject: [PATCH 35/69] Update test with deleting extra spaces --- test/unit/preprocessing/test_preprocessors.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py index 856f59f40d..6eda74361d 100644 --- a/test/unit/preprocessing/test_preprocessors.py +++ b/test/unit/preprocessing/test_preprocessors.py @@ -91,24 +91,24 @@ def data_with_complicated_types(): """ task = Task(TaskTypesEnum.classification) - features = np.array([[0, np.nan, 1, 1, 1, 'monday', 'a ', 'true', 1, '0', 'a'], + features = np.array([[0, np.nan, 1, 1, 1, 'monday', 'a', 'true', 1, '0', 'a'], [np.nan, 5, 2, 2, 0, 'tuesday', 'b', np.nan, 0, '1', np.inf], [2, np.nan, 3, 3, np.nan, 3, 'c', 'false', 1, '?', 'c'], - [3, np.nan, 4, 4, 3.0, 4, ' a ', 'true', 0, 'error', 'd'], - [4, np.nan, 5, 5.0, 0, 5, ' b ', np.nan, 0, '3', 'e'], - [5, np.nan, 6, 6, 0, 6, ' c ', 'false', 0, '4', 'f'], - [6, np.inf, 7, 7, 0, 7, ' a ', 'true', 1, '5', 'g'], - [7, np.inf, 8, 8, 1.0, 1, ' b ', np.nan, 0, '6', 'h'], + [3, np.nan, 4, 4, 3.0, 4, 'a', 'true', 0, 'error', 'd'], + [4, np.nan, 5, 5.0, 0, 5, 'b', np.nan, 0, '3', 'e'], + [5, np.nan, 6, 6, 0, 6, 'c', 'false', 0, '4', 'f'], + [6, np.inf, 7, 7, 0, 7, 'a', 'true', 1, '5', 'g'], + [7, np.inf, 8, 8, 1.0, 1, 'b', np.nan, 0, '6', 'h'], [np.inf, np.inf, '9', '9', 2, 2, np.nan, 'true', 1, '7', 'i'], - [9, np.inf, '10', '10', 2, 3, ' c ', 'false', 0, '8', 'j'], - [10, np.nan, 11.0, 11.0, 0, 4, 'c ', 'false', 0, '9', 'k'], + [9, np.inf, '10', '10', 2, 3, 'c', 'false', 0, '8', 'j'], + [10, np.nan, 11.0, 11.0, 0, 4, 'c', 'false', 0, '9', 'k'], [11, np.nan, 12, 12, 2.0, 5, np.nan, 'false', 1, '10', 'l'], - [12, np.nan, 1, 1.0, 1.0, 6, ' b ', 'false', 0, '11', 'm'], - [13, np.nan, 2, 2, 1, 7, ' c ', 'true', np.nan, '12', 'n'], + [12, np.nan, 1, 1.0, 1.0, 6, 'b', 'false', 0, '11', 'm'], + [13, np.nan, 2, 2, 1, 7, 'c', 'true', np.nan, '12', 'n'], [14, np.nan, 3, 3, 2.0, 1, 'a', 'false', np.nan, 'error', 'o'], - [15, np.nan, 4, 4, 1, 2, 'a ', 'false', np.nan, '13', 'p'], - [16, 2, 5, 12, 0, 3, ' d ', 'true', 1, '?', 'r'], - [17, 3, 6, 13, 0, 4, ' d ', 'false', 0, '17', 's']], + [15, np.nan, 4, 4, 1, 2, 'a', 'false', np.nan, '13', 'p'], + [16, 2, 5, 12, 0, 3, 'd', 'true', 1, '?', 'r'], + [17, 3, 6, 13, 0, 4, 'd', 'false', 0, '17', 's']], dtype=object) target = np.array([['no'], ['yes'], ['yes'], ['yes'], ['no'], ['no'], ['no'], ['no'], ['no'], ['yes'], ['yes'], ['yes'], ['yes'], ['yes'], ['no'], ['no'], ['yes'], ['no']]) From fa11d8bbbc747b3e2985e6d077a552d0bb88c3ea Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Mon, 19 Aug 2024 18:39:27 +0300 Subject: [PATCH 36/69] Update test with adding extra types_encountered --- test/unit/preprocessing/test_preprocessors.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py index 6eda74361d..d4d52c4884 100644 --- a/test/unit/preprocessing/test_preprocessors.py +++ b/test/unit/preprocessing/test_preprocessors.py @@ -216,8 +216,14 @@ def test_binary_pseudo_string_column_process_correctly(): pipeline = correct_preprocessing_params(pipeline) train_predicted = pipeline.fit(train_data) + types_encountered = ( + int, float, + np.int8, np.int16, np.int32, np.int64, + np.float16, np.float32, np.float64, + ) + assert 
train_predicted.features.shape[1] == 1 - assert all(isinstance(el[0], float) for el in train_predicted.features) + assert all(isinstance(el[0], types_encountered) for el in train_predicted.features) def fit_predict_cycle_for_testing(idx: int): @@ -240,8 +246,15 @@ def test_mixed_column_with_str_and_float_values(): # column with index 1 must be converted to float and the gaps must be filled train_predicted = fit_predict_cycle_for_testing(idx=1) + + types_encountered = ( + int, float, + np.int8, np.int16, np.int32, np.int64, + np.float16, np.float32, np.float64, + ) + assert train_predicted.features.shape[1] == 1 - assert all(isinstance(el[0], float) for el in train_predicted.features) + assert all(isinstance(el[0], types_encountered) for el in train_predicted.features) # column with index 2 must be removed due to unclear type of data try: From e76cd93ad52fc15dd0878c9873fad7b2211ae5e6 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Mon, 19 Aug 2024 20:30:36 +0300 Subject: [PATCH 37/69] Fixes different tests --- fedot/core/data/data.py | 2 +- .../data_operations/sklearn_transformations.py | 2 +- test/unit/multimodal/data_generators.py | 8 ++++---- .../preprocessing/test_preprocessing_through_api.py | 13 +++++++++---- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index d8799ec555..41373743f5 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -541,7 +541,7 @@ def subset_features(self, feature_ids: np.array) -> Optional[InputData]: """ Return new :obj:`InputData` with subset of features based on non-empty ``features_ids`` list or `None` otherwise """ - if feature_ids.size == 0: + if feature_ids is None or feature_ids.size == 0: return None subsample_features = self.features[:, feature_ids] diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index 3e127d09c6..ca6e59a8e9 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -186,7 +186,7 @@ def fit(self, input_data: InputData): if n_cols > self.th_columns: # Randomly choose subsample of features columns - 10 features column_indices = np.arange(n_cols) - self.columns_to_take = random.sample(list(column_indices), self.th_columns) + self.columns_to_take = np.array(random.sample(list(column_indices), self.th_columns)) input_data = input_data.subset_features(self.columns_to_take) return super().fit(input_data) diff --git a/test/unit/multimodal/data_generators.py b/test/unit/multimodal/data_generators.py index e5a390d0b5..f47732f758 100644 --- a/test/unit/multimodal/data_generators.py +++ b/test/unit/multimodal/data_generators.py @@ -27,10 +27,10 @@ def get_single_task_multimodal_tabular_data(): task = Task(TaskTypesEnum.classification) # Create features table - features_first = np.array([[0, ' a'], [1, ' a '], [2, ' b'], [3, np.nan], [4, ' a'], - [5, ' b'], [6, 'b '], [7, ' c'], [8, ' c ']], dtype=object) - features_second = np.array([[10, ' a'], [11, ' a '], [12, ' b'], [13, ' a '], [14, ' a'], - [15, ' b'], [16, 'b '], [17, ' c'], [18, ' c ']], dtype=object) + features_first = np.array([[0, 'a'], [1, 'a'], [2, 'b'], [3, np.nan], [4, 'a'], + [5, 'b'], [6, 'b'], [7, 'c'], [8, 'c']], dtype=object) + features_second = 
np.array([[10, 'a'], [11, 'a'], [12, 'b'], [13, 'a'], [14, 'a'], + [15, 'b'], [16, 'b'], [17, 'c'], [18, 'c']], dtype=object) target = np.array(['true', 'false', 'true', 'false', 'false', 'false', 'false', 'true', 'true'], dtype=str) diff --git a/test/unit/preprocessing/test_preprocessing_through_api.py b/test/unit/preprocessing/test_preprocessing_through_api.py index 6e42ee0975..cb2d2479b2 100644 --- a/test/unit/preprocessing/test_preprocessing_through_api.py +++ b/test/unit/preprocessing/test_preprocessing_through_api.py @@ -16,10 +16,15 @@ def data_with_only_categorical_features(): features = np.array([["'a'", "0", "1"], ["'b'", "1", "0"], ["'c'", "1", "0"]], dtype=object) - input_data = InputData(idx=np.array([0, 1, 2]), features=features, - target=np.array([0, 1, 2]), - task=task, data_type=DataTypesEnum.table, - supplementary_data=supp_data) + input_data = InputData( + idx=np.array([0, 1, 2]), + features=features, + target=np.array([0, 1, 2]), + categorical_idx=np.array([0, 1, 2]), + numerical_idx=np.array([]), + task=task, data_type=DataTypesEnum.table, + supplementary_data=supp_data + ) return input_data From 4085f5501eb3b9b7874f556940191652bda7fbfe Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Tue, 20 Aug 2024 15:09:07 +0300 Subject: [PATCH 38/69] Update expected_values for test_metrics test --- test/data/expected_metric_values.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/data/expected_metric_values.json b/test/data/expected_metric_values.json index 8a293325c6..4b81051a1d 100644 --- a/test/data/expected_metric_values.json +++ b/test/data/expected_metric_values.json @@ -13,11 +13,11 @@ "accuracy": -0.95 }, "multiclass": { - "roc_auc": -0.9832500832500832, + "roc_auc": -0.9881784881784883, "precision": -0.9777777777777779, "f1": -0.9719701552732407, "neg_log_loss": 0.17094588819131074, - "roc_auc_pen": -0.9789893328893329, + "roc_auc_pen": -0.9838963813963815, "accuracy": -0.9722222222222222 }, "regression": { From f9f8acfa75b0308075688545eb08396fc3211f29 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Tue, 20 Aug 2024 15:37:10 +0300 Subject: [PATCH 39/69] pep8 fixes --- fedot/api/api_utils/api_data.py | 2 +- fedot/core/data/data.py | 1 + fedot/core/data/data_preprocessing.py | 5 +++-- .../data_operations/categorical_encoders.py | 2 -- .../data_operations/sklearn_transformations.py | 4 ++-- fedot/preprocessing/categorical.py | 4 +--- fedot/preprocessing/data_types.py | 2 +- fedot/preprocessing/preprocessing.py | 6 ++++-- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fedot/api/api_utils/api_data.py b/fedot/api/api_utils/api_data.py index c9db8f641e..7d54a50745 100644 --- a/fedot/api/api_utils/api_data.py +++ b/fedot/api/api_utils/api_data.py @@ -6,7 +6,7 @@ from golem.core.log import default_log from fedot.api.api_utils.data_definition import data_strategy_selector, FeaturesType, TargetType -from fedot.core.data.data import InputData, OutputData, data_type_is_table, OptimisedFeature +from fedot.core.data.data import InputData, OutputData, data_type_is_table from fedot.core.data.data_preprocessing import convert_into_column from fedot.core.data.multi_modal import MultiModalData from fedot.core.pipelines.pipeline import Pipeline diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index 41373743f5..eda9440313 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -681,6 +681,7 @@ class OutputData(Data): target: Optional[np.ndarray] = None encoded_idx: Optional[np.ndarray] = None + @dataclass 
class OptimisedFeature: _columns: list = field(default_factory=list, init=False) diff --git a/fedot/core/data/data_preprocessing.py b/fedot/core/data/data_preprocessing.py index 4323020da3..ff767bfde6 100644 --- a/fedot/core/data/data_preprocessing.py +++ b/fedot/core/data/data_preprocessing.py @@ -34,7 +34,8 @@ def convert_into_column(array: np.ndarray) -> np.ndarray: def divide_data_categorical_numerical(input_data: InputData, categorical_ids: np.ndarray, - non_categorical_ids: np.ndarray) -> Tuple[Optional[InputData], Optional[InputData]]: + non_categorical_ids: np.ndarray) -> \ + Tuple[Optional[InputData], Optional[InputData]]: """ Split tabular InputData into two parts: with numerical and categorical features using list with ids of categorical and numerical features. @@ -98,7 +99,7 @@ def data_has_categorical_features(data: InputData) -> bool: if data.data_type is not DataTypesEnum.table: return False - cat_ids, non_cat_ids = data.categorical_idx, data.numerical_idx + cat_ids, _ = data.categorical_idx, data.numerical_idx if len(cat_ids) > 0: data.categorical_features = data.features[:, cat_ids] diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 03b1ac70bb..c2ffac8e93 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -5,7 +5,6 @@ from sklearn.preprocessing import LabelEncoder, OneHotEncoder from fedot.core.data.data import InputData, OutputData, OptimisedFeature -from fedot.core.data.data_preprocessing import find_categorical_columns from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import ( DataOperationImplementation ) @@ -34,7 +33,6 @@ def fit(self, input_data: InputData): :return encoder: trained encoder (optional output) """ features = input_data.features - feature_type_ids = input_data.supplementary_data.col_type_ids['features'] self.categorical_ids, self.non_categorical_ids = input_data.categorical_idx, input_data.numerical_idx # If there are categorical features - process it diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py index ca6e59a8e9..8367007e0c 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py @@ -9,8 +9,8 @@ from fedot.core.constants import PCA_MIN_THRESHOLD_TS from fedot.core.data.data import InputData, OutputData, data_type_is_table -from fedot.core.data.data_preprocessing import convert_into_column, data_has_categorical_features, \ - divide_data_categorical_numerical, find_categorical_columns, replace_inf_with_nans +from fedot.core.data.data_preprocessing import convert_into_column, divide_data_categorical_numerical, \ + replace_inf_with_nans from fedot.core.operations.evaluation.operation_implementations. 
\ implementation_interfaces import DataOperationImplementation, EncodedInvariantImplementation from fedot.core.operations.operation_parameters import OperationParameters diff --git a/fedot/preprocessing/categorical.py b/fedot/preprocessing/categorical.py index 07c70de0c9..9994c98167 100644 --- a/fedot/preprocessing/categorical.py +++ b/fedot/preprocessing/categorical.py @@ -5,7 +5,6 @@ from sklearn.preprocessing import LabelEncoder from fedot.core.data.data import InputData -from fedot.core.data.data_preprocessing import find_categorical_columns from fedot.preprocessing.data_types import FEDOT_STR_NAN, TYPE_TO_ID @@ -24,6 +23,7 @@ def fit(self, input_data: InputData): Find indices of columns which are contains categorical values. Binary features and at the same time has str objects. If there are such features - convert it into int """ + # TODO: Add log.message with binary ids if np.size(input_data.categorical_idx) != 0: categorical_columns = input_data.features[:, input_data.categorical_idx].T nan_matrix = pd.DataFrame(categorical_columns.T, columns=input_data.categorical_idx).isna().values.T @@ -57,8 +57,6 @@ def fit(self, input_data: InputData): input_data.categorical_idx = [idx for idx in input_data.categorical_idx if idx not in binary_ids_to_convert] input_data.categorical_idx = np.array(input_data.categorical_idx) self.binary_ids_to_convert = binary_ids_to_convert - - # TODO: Add log.message with binary ids return self diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 1628ead891..6005cff5a5 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -315,7 +315,7 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): else: self.log.message(f'--- Preprocessing define next cols {all_cat_col_ids} as categorical') else: - self.log.message(f'--- Preprocessing was unable to define the categorical columns') + self.log.message('--- Preprocessing was unable to define the categorical columns') def _into_categorical_features_transformation_for_predict(self, data: InputData): """ Apply conversion into categorical string column for every signed column """ diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index d98c32da1a..434da34928 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -403,7 +403,7 @@ def _apply_categorical_encoding(self, data: InputData, source_name: str) -> Inpu self.features_encoders[source_name] = encoder self.log.message(f'--- {encoder.__class__.__name__} was chosen') - self.log.message(f'--- Fitting and transforming data') + self.log.message('--- Fitting and transforming data') output_data = encoder.transform_for_fit(data) output_data.predict = output_data.predict.astype(float) data.features = output_data.predict @@ -611,7 +611,9 @@ def reduce_mem_usage_np(arr, initial_types): return data def _update_num_and_cats_ids(self, feature_type_ids): - numerical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float], TYPE_TO_ID[bool]])) + numerical_idx = np.flatnonzero( + np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float], TYPE_TO_ID[bool]]) + ) categorical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[str]])) return numerical_idx, categorical_idx From fca7ef6e3c7e1c996c1ee8cd4cc5d70c31b7ce16 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Tue, 20 Aug 2024 19:28:09 +0300 Subject: [PATCH 40/69] Adding preprocessing copying to predefined models --- 
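For reviewers following the id bookkeeping: the split performed by _update_num_and_cats_ids (reformatted in the pep8 patch above) boils down to masking the per-column type ids. A tiny runnable sketch, with placeholder id values rather than the real TYPE_TO_ID mapping from fedot.preprocessing.data_types:

    import numpy as np

    # Placeholder ids for illustration only
    TYPE_TO_ID = {int: 0, float: 1, bool: 2, str: 3}

    def split_feature_ids(feature_type_ids: np.ndarray):
        # bool is treated as numerical, as PATCH 31 established
        numerical_idx = np.flatnonzero(
            np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float], TYPE_TO_ID[bool]])
        )
        categorical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[str]]))
        return numerical_idx, categorical_idx

    type_ids = np.array([TYPE_TO_ID[float], TYPE_TO_ID[str], TYPE_TO_ID[int]])
    print(split_feature_ids(type_ids))  # (array([0, 2]), array([1]))

Downstream consumers such as divide_data_categorical_numerical then slice the feature table with these two index arrays.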
fedot/api/api_utils/api_data.py | 11 ++++++++--- fedot/api/api_utils/predefined_model.py | 15 ++++++++++++--- fedot/api/main.py | 8 +++++--- .../implementation_interfaces.py | 2 +- 4 files changed, 26 insertions(+), 10 deletions(-) diff --git a/fedot/api/api_utils/api_data.py b/fedot/api/api_utils/api_data.py index 7d54a50745..9607ad40aa 100644 --- a/fedot/api/api_utils/api_data.py +++ b/fedot/api/api_utils/api_data.py @@ -33,14 +33,19 @@ def __init__(self, task: Task, use_input_preprocessing: bool = True): self.task = task self._recommendations = {} - self.preprocessor = DummyPreprocessor() + if use_input_preprocessing: self.preprocessor = DataPreprocessor() # Dictionary with recommendations (e.g. 'cut' for cutting dataset, 'label_encoded' # to encode features using label encoder). Parameters for transformation provided also - self._recommendations = {'cut': self.preprocessor.cut_dataset, - 'label_encoded': self.preprocessor.label_encoding_for_fit} + self._recommendations = { + 'cut': self.preprocessor.cut_dataset, + 'label_encoded': self.preprocessor.label_encoding_for_fit + } + + else: + self.preprocessor = DummyPreprocessor() self.log = default_log(self) diff --git a/fedot/api/api_utils/predefined_model.py b/fedot/api/api_utils/predefined_model.py index 1b50bd8d90..7ba9c288e7 100644 --- a/fedot/api/api_utils/predefined_model.py +++ b/fedot/api/api_utils/predefined_model.py @@ -8,26 +8,35 @@ from fedot.core.pipelines.node import PipelineNode from fedot.core.pipelines.pipeline import Pipeline from fedot.core.pipelines.verification import verify_pipeline +from fedot.preprocessing.base_preprocessing import BasePreprocessor class PredefinedModel: def __init__(self, predefined_model: Union[str, Pipeline], data: InputData, log: LoggerAdapter, - use_input_preprocessing: bool = True): + use_input_preprocessing: bool = True, api_preprocessor: BasePreprocessor = None): self.predefined_model = predefined_model self.data = data self.log = log - self.pipeline = self._get_pipeline(use_input_preprocessing) + self.pipeline = self._get_pipeline(use_input_preprocessing, api_preprocessor) - def _get_pipeline(self, use_input_preprocessing: bool = True) -> Pipeline: + def _get_pipeline(self, use_input_preprocessing: bool = True, api_preprocessor: BasePreprocessor = None) -> Pipeline: if isinstance(self.predefined_model, Pipeline): pipelines = self.predefined_model elif self.predefined_model == 'auto': # Generate initial assumption automatically pipelines = AssumptionsBuilder.get(self.data).from_operations().build( use_input_preprocessing=use_input_preprocessing)[0] + + if use_input_preprocessing and api_preprocessor is not None: + pipelines.preprocessor = api_preprocessor + elif isinstance(self.predefined_model, str): model = PipelineNode(self.predefined_model) pipelines = Pipeline(model, use_input_preprocessing=use_input_preprocessing) + + if use_input_preprocessing and api_preprocessor is not None: + pipelines.preprocessor = api_preprocessor + else: raise ValueError(f'{type(self.predefined_model)} is not supported as Fedot model') diff --git a/fedot/api/main.py b/fedot/api/main.py index f389489acc..56f19c3b5c 100644 --- a/fedot/api/main.py +++ b/fedot/api/main.py @@ -169,9 +169,11 @@ def fit(self, with fedot_composer_timer.launch_fitting(): if predefined_model is not None: # Fit predefined model and return it without composing - self.current_pipeline = PredefinedModel(predefined_model, self.train_data, self.log, - use_input_preprocessing=self.params.get( - 'use_input_preprocessing')).fit() + 
self.current_pipeline = PredefinedModel( + predefined_model, self.train_data, self.log, + use_input_preprocessing=self.params.get('use_input_preprocessing'), + api_preprocessor=self.data_processor.preprocessor, + ).fit() else: self.current_pipeline, self.best_models, self.history = self.api_composer.obtain_model(self.train_data) diff --git a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py index c4f60d1bbb..5a007bf33e 100644 --- a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py +++ b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py @@ -162,7 +162,7 @@ def _reasonability_check(features): # For every column in table make check for column_id in range(0, columns_amount): column = features[:, column_id] if columns_amount > 1 else features.copy() - if len(np.unique(column)) > 2: + if len(set(column)) > 2: non_bool_ids.append(column_id) else: bool_ids.append(column_id) From 5a7cd7aa9a1236ffa213ec2ef5f44807d6d8ace2 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Tue, 20 Aug 2024 19:33:09 +0300 Subject: [PATCH 41/69] Adding docstring to reduce memory and optimisedfeatures --- fedot/core/data/data.py | 7 +++++-- .../data_operations/categorical_encoders.py | 4 ++-- fedot/preprocessing/base_preprocessing.py | 14 ++++++++++++++ fedot/preprocessing/dummy_preprocessing.py | 3 +++ fedot/preprocessing/preprocessing.py | 6 +++--- 5 files changed, 27 insertions(+), 7 deletions(-) diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index eda9440313..8e475bc629 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -42,7 +42,7 @@ class Data: idx: np.ndarray task: Task data_type: DataTypesEnum - features: Union[np.ndarray, OptimisedFeature] + features: Union[np.ndarray, OptimisedFeatures] categorical_features: Optional[np.ndarray] = None categorical_idx: Optional[np.ndarray] = None numerical_idx: Optional[np.ndarray] = None @@ -683,7 +683,10 @@ class OutputData(Data): @dataclass -class OptimisedFeature: +class OptimisedFeatures: + """``Data`` type for optimised storage data. 
+    It is based on numpy ndarray, but the features are stored as a list of np.ndarray columns, each with its own optimal dtype
+    """
     _columns: list = field(default_factory=list, init=False)
     _shape: tuple = field(default=(0, 0), init=False)
     _nbytes: int = 0
diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
index c2ffac8e93..057702c6ba 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
@@ -4,7 +4,7 @@
 import numpy as np
 from sklearn.preprocessing import LabelEncoder, OneHotEncoder
 
-from fedot.core.data.data import InputData, OutputData, OptimisedFeature
+from fedot.core.data.data import InputData, OutputData, OptimisedFeatures
 from fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import (
     DataOperationImplementation
 )
@@ -161,7 +161,7 @@ def _apply_label_encoder(self, data: np.ndarray):
         if isinstance(data, np.ndarray):
             data[:, column_id] = transformed_column
 
-        elif isinstance(data, OptimisedFeature):
+        elif isinstance(data, OptimisedFeatures):
             data._columns[column_id] = transformed_column
 
     def get_params(self) -> OperationParameters:
diff --git a/fedot/preprocessing/base_preprocessing.py b/fedot/preprocessing/base_preprocessing.py
index 7871af8fc4..56c238ffb9 100644
--- a/fedot/preprocessing/base_preprocessing.py
+++ b/fedot/preprocessing/base_preprocessing.py
@@ -192,6 +192,20 @@ def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalD
         """
         raise AbstractMethodNotImplementError
 
+    @abstractmethod
+    def reduce_memory_size(self, data: InputData) -> InputData:
+        """
+        Method that reduces the memory consumption of InputData.
+
+        It works as follows:
+        - Getting the type defined for each feature during preprocessing (e.g. int);
+        - Finding the minimum and maximum values in this feature;
+        - Finding a suitable type and converting the column to it
+        (e.g.: Feature has unique values 0 and 1, the suitable type would be np.bool.
+ Feature has all values between 0 and 100, the suitable type would be np.int8); + """ + raise AbstractMethodNotImplementError + @staticmethod def mark_as_preprocessed(data: Union[InputData, MultiModalData], *, is_obligatory: bool = True): """ diff --git a/fedot/preprocessing/dummy_preprocessing.py b/fedot/preprocessing/dummy_preprocessing.py index d3c4206e34..4d0d1cd456 100644 --- a/fedot/preprocessing/dummy_preprocessing.py +++ b/fedot/preprocessing/dummy_preprocessing.py @@ -63,3 +63,6 @@ def restore_index(self, input_data: InputData, result: OutputData) -> OutputData def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalData] ) -> Union[InputData, MultiModalData]: return test_data + + def reduce_memory_size(self, data: InputData) -> InputData: + return data diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 434da34928..1f85e4e824 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -8,7 +8,7 @@ from golem.core.paths import copy_doc from sklearn.preprocessing import LabelEncoder -from fedot.core.data.data import InputData, np_datetime_to_numeric, OptimisedFeature +from fedot.core.data.data import InputData, np_datetime_to_numeric, OptimisedFeatures from fedot.core.data.data import OutputData, data_type_is_table, data_type_is_text, data_type_is_ts from fedot.core.data.data_preprocessing import ( data_has_categorical_features, @@ -558,10 +558,10 @@ def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalD last_id = len(input_data.idx) input_data.idx = np.arange(last_id, last_id + input_data.task.task_params.forecast_length) return test_data - + @copy_doc(BasePreprocessor.reduce_memory_size) def reduce_memory_size(self, data: InputData) -> InputData: def reduce_mem_usage_np(arr, initial_types): - reduced_columns = OptimisedFeature() + reduced_columns = OptimisedFeatures() for i in range(arr.shape[1]): col = arr[:, i] From 25cbe7a135c1d0976c089b1310f23f9194ee337d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 20 Aug 2024 16:37:18 +0000 Subject: [PATCH 42/69] Automated autopep8 fixes --- fedot/api/api_utils/predefined_model.py | 3 ++- fedot/preprocessing/preprocessing.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fedot/api/api_utils/predefined_model.py b/fedot/api/api_utils/predefined_model.py index 7ba9c288e7..09ace672bf 100644 --- a/fedot/api/api_utils/predefined_model.py +++ b/fedot/api/api_utils/predefined_model.py @@ -19,7 +19,8 @@ def __init__(self, predefined_model: Union[str, Pipeline], data: InputData, log: self.log = log self.pipeline = self._get_pipeline(use_input_preprocessing, api_preprocessor) - def _get_pipeline(self, use_input_preprocessing: bool = True, api_preprocessor: BasePreprocessor = None) -> Pipeline: + def _get_pipeline(self, use_input_preprocessing: bool = True, + api_preprocessor: BasePreprocessor = None) -> Pipeline: if isinstance(self.predefined_model, Pipeline): pipelines = self.predefined_model elif self.predefined_model == 'auto': diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 1f85e4e824..f4d95a36e0 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -558,6 +558,7 @@ def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalD last_id = len(input_data.idx) input_data.idx = np.arange(last_id, last_id + input_data.task.task_params.forecast_length) return test_data + 
From 25cbe7a135c1d0976c089b1310f23f9194ee337d Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Tue, 20 Aug 2024 16:37:18 +0000
Subject: [PATCH 42/69] Automated autopep8 fixes

---
 fedot/api/api_utils/predefined_model.py | 3 ++-
 fedot/preprocessing/preprocessing.py    | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/fedot/api/api_utils/predefined_model.py b/fedot/api/api_utils/predefined_model.py
index 7ba9c288e7..09ace672bf 100644
--- a/fedot/api/api_utils/predefined_model.py
+++ b/fedot/api/api_utils/predefined_model.py
@@ -19,7 +19,8 @@ def __init__(self, predefined_model: Union[str, Pipeline], data: InputData, log:
         self.log = log
         self.pipeline = self._get_pipeline(use_input_preprocessing, api_preprocessor)
 
-    def _get_pipeline(self, use_input_preprocessing: bool = True, api_preprocessor: BasePreprocessor = None) -> Pipeline:
+    def _get_pipeline(self, use_input_preprocessing: bool = True,
+                      api_preprocessor: BasePreprocessor = None) -> Pipeline:
         if isinstance(self.predefined_model, Pipeline):
             pipelines = self.predefined_model
         elif self.predefined_model == 'auto':
diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py
index 1f85e4e824..f4d95a36e0 100644
--- a/fedot/preprocessing/preprocessing.py
+++ b/fedot/preprocessing/preprocessing.py
@@ -558,6 +558,7 @@ def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalD
         last_id = len(input_data.idx)
         input_data.idx = np.arange(last_id, last_id + input_data.task.task_params.forecast_length)
         return test_data
+
     @copy_doc(BasePreprocessor.reduce_memory_size)
     def reduce_memory_size(self, data: InputData) -> InputData:
         def reduce_mem_usage_np(arr, initial_types):

From 9053f9f546967f8b924e3f51a6a05cd9cd59b3e2 Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Wed, 21 Aug 2024 16:48:44 +0300
Subject: [PATCH 43/69] Fix bug with unhashable np

---
 .../operation_implementations/implementation_interfaces.py   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py
index 5a007bf33e..8822b8e436 100644
--- a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py
+++ b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py
@@ -160,8 +160,8 @@ def 
_reasonability_check(features): non_bool_ids = [] # For every column in table make check - for column_id, column in enumerate(features._columns): + if isinstance(features, OptimisedFeatures): + features = features._columns + + for column_id, column in enumerate(features): # column = features[:, column_id] if columns_amount > 1 else features.copy() if len(set(column)) > 2: non_bool_ids.append(column_id) From 35446365fe944aa86f399ec37d9c0603b04e9955 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Wed, 21 Aug 2024 17:50:08 +0300 Subject: [PATCH 46/69] Fix test_regression_data_operations with inf data after poly_features --- .../operation_implementations/implementation_interfaces.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py index cb04120612..4a1d2902cc 100644 --- a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py +++ b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py @@ -107,6 +107,8 @@ def transform(self, input_data: InputData) -> OutputData: else: transformed_features = features + transformed_features = np.nan_to_num(transformed_features, copy=False, nan=0, posinf=0, neginf=0) + # Update features and column types output_data = self._convert_to_output(input_data, transformed_features) self._update_column_types(source_features_shape, output_data) From 40aabd755b575f9374f214e107845b7463f15883 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Wed, 21 Aug 2024 17:57:58 +0300 Subject: [PATCH 47/69] Fix bug in tests with IndexError --- .../operation_implementations/implementation_interfaces.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py index 4a1d2902cc..a08c9a9f12 100644 --- a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py +++ b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py @@ -164,6 +164,8 @@ def _reasonability_check(features): # For every column in table make check if isinstance(features, OptimisedFeatures): features = features._columns + elif isinstance(features, np.ndarray): + features = features.T for column_id, column in enumerate(features): # column = features[:, column_id] if columns_amount > 1 else features.copy() From 74293811eb6b5adb908a6af437c0e736477e2fe9 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Wed, 21 Aug 2024 20:35:49 +0300 Subject: [PATCH 48/69] Adding take by indecies method and to_numpy() in OptimisedFeatures --- fedot/core/data/data.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index 8e475bc629..d56bddc044 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -733,9 +733,28 @@ def __setitem__(self, key, value): def __len__(self): return self._shape[0] if self._columns else 0 + def take(self, indices, axis=0): + output = OptimisedFeatures() + + if axis == 0: + # Takes rows + for col in self._columns: + output.add_column(np.take(col, indices, axis)) + elif axis == 1: + # Takes columns + for i in indices: + output.add_column(self._columns[i]) + else: + raise ValueError("Axis must be 0 (rows) or 1 (columns)") + + return output + def 
copy(self): return self._columns.copy() + def to_numpy(self): + return np.transpose(np.array(self._columns)) + @property def shape(self): return self._shape From b58993f30c3165397dd5b00b8e1fb2a308aa2f3f Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Wed, 21 Aug 2024 20:36:21 +0300 Subject: [PATCH 49/69] Update train_test_split for OptimisedFeatures --- fedot/core/data/data_split.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fedot/core/data/data_split.py b/fedot/core/data/data_split.py index 73b4f21da2..1c2f34e60a 100644 --- a/fedot/core/data/data_split.py +++ b/fedot/core/data/data_split.py @@ -4,7 +4,7 @@ import numpy as np from sklearn.model_selection import train_test_split -from fedot.core.data.data import InputData +from fedot.core.data.data import InputData, OptimisedFeatures from fedot.core.data.multi_modal import MultiModalData from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.tasks import TaskTypesEnum @@ -30,8 +30,13 @@ def _split_input_data_by_indexes(origin_input_data: Union[InputData, MultiModalD return data elif isinstance(origin_input_data, InputData): idx = np.take(origin_input_data.idx, index, 0) - target = np.take(origin_input_data.target, index, 0) - features = np.take(origin_input_data.features, index, 0) + if isinstance(origin_input_data.features, OptimisedFeatures): + features = origin_input_data.features.take(index) + target = origin_input_data.target.take(index) + + else: + features = np.take(origin_input_data.features, index, 0) + target = np.take(origin_input_data.target, index, 0) if origin_input_data.categorical_features is not None: categorical_features = np.take(origin_input_data.categorical_features, index, 0) From 936635cb1b02b7ada4264a9f1ca7d0dddc52208c Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Wed, 21 Aug 2024 20:37:02 +0300 Subject: [PATCH 50/69] Transform target to numpy array during memory_reduce --- fedot/preprocessing/preprocessing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index f4d95a36e0..ca03a8eb29 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -608,6 +608,7 @@ def reduce_mem_usage_np(arr, initial_types): if data.target is not None: self.log.message('-- Reduce memory in target') data.target = reduce_mem_usage_np(data.target, data.supplementary_data.col_type_ids['target']) + data.target = data.target.to_numpy() return data From 47f214c2330b64f4edc7e4d309b9721a233981e9 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Thu, 22 Aug 2024 18:54:08 +0300 Subject: [PATCH 51/69] PR#1318 migration --- fedot/core/data/data.py | 163 +++- fedot/preprocessing/data_types.py | 27 +- test/data/melb_data.csv | 1001 +++++++++++++++++++++++ test/unit/data/test_data_categorical.py | 205 +++++ 4 files changed, 1352 insertions(+), 44 deletions(-) create mode 100644 test/data/melb_data.csv create mode 100644 test/unit/data/test_data_categorical.py diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index d56bddc044..b382eb6839 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -57,24 +57,29 @@ class Data: def from_numpy(cls, features_array: np.ndarray, target_array: np.ndarray, + features_names: np.ndarray[str] = None, + categorical_idx: Union[list[int, str], np.ndarray[int, str]] = None, idx: Optional[np.ndarray] = None, task: Union[Task, str] = 'classification', data_type: Optional[DataTypesEnum] = 
DataTypesEnum.table) -> InputData:
         """Import data from numpy array.
 
-        Args:
-            features_array: numpy array with features.
-            target_array: numpy array with target.
-            idx: indices of arrays.
-            task: the :obj:`Task` to solve with the data.
-            data_type: the type of the data. Possible values are listed at :class:`DataTypesEnum`.
+            Args:
+                features_array: numpy array with features.
+                target_array: numpy array with target.
+                features_names: numpy array with the names of the features.
+                categorical_idx: a list or numpy array with indexes or names of the categorical features
+                    (names can be used only if features_names is provided).
+                idx: indices of arrays.
+                task: the :obj:`Task` to solve with the data.
+                data_type: the type of the data. Possible values are listed at :class:`DataTypesEnum`.
 
-        Returns:
-            data
-        """
+            Returns:
+                data: :InputData: representation of data in an internal data structure.
+            """
         if isinstance(task, str):
             task = Task(TaskTypesEnum(task))
-        return array_to_input_data(features_array, target_array, idx, task, data_type)
+        return array_to_input_data(features_array, target_array, features_names, categorical_idx, idx, task, data_type)
 
     @classmethod
     def from_numpy_time_series(cls,
@@ -85,16 +90,16 @@ def from_numpy_time_series(cls,
                                data_type: Optional[DataTypesEnum] = DataTypesEnum.ts) -> InputData:
         """Import time series from numpy array.
 
-        Args:
-            features_array: numpy array with features time series.
-            target_array: numpy array with target time series (if None same as features).
-            idx: indices of arrays.
-            task: the :obj:`Task` to solve with the data.
-            data_type: the type of the data. Possible values are listed at :class:`DataTypesEnum`.
+            Args:
+                features_array: numpy array with features time series.
+                target_array: numpy array with target time series (if None, same as features).
+                idx: indices of arrays.
+                task: the :obj:`Task` to solve with the data.
+                data_type: the type of the data. Possible values are listed at :class:`DataTypesEnum`.
 
-        Returns:
-            data
-        """
+            Returns:
+                data: :InputData: representation of data in an internal data structure.
+            """
         if isinstance(task, str):
             task = Task(TaskTypesEnum(task))
         if target_array is None:
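The new categorical_idx argument accepts either column indexes or column names, and names are resolved through features_names. A short usage sketch on synthetic data (the values and column names are made up; it assumes InputData inherits these classmethods from Data, as elsewhere in FEDOT):

import numpy as np

from fedot.core.data.data import InputData

features = np.array([[25, 0, 50000.0],
                     [32, 1, 64000.0],
                     [47, 1, 58000.0]])
target = np.array([0, 1, 1])

# Mark the second column as categorical by index ...
train_data = InputData.from_numpy(features, target, categorical_idx=[1])

# ... or by name, which requires features_names to be passed as well.
names = np.array(['age', 'gender', 'income'])
train_data = InputData.from_numpy(features, target,
                                  features_names=names,
                                  categorical_idx=['gender'])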
+ """ if isinstance(task, str): task = Task(TaskTypesEnum(task)) @@ -135,11 +142,34 @@ def from_dataframe(cls, categorical_features = None if categorical_idx is not None: - categorical_features = features_df.loc[:, categorical_idx].to_numpy() + if isinstance(categorical_idx, list): + categorical_idx = np.array(categorical_idx) + + if categorical_idx != np.array([]) and isinstance(categorical_idx[0], str) and features_names is None: + raise ValueError( + 'Impossible to specify categorical features by name when the features_names are not specified' + ) + + if categorical_idx != np.array([]) and isinstance(categorical_idx[0], str): + categorical_idx = np.array( + [idx for idx, column in enumerate(features_names) if column in set(categorical_idx)] + ) + + if categorical_idx.size != 0: + categorical_features = features[:, categorical_idx] + + data = InputData( + idx=idx, + features=features, + target=target, + task=task, + data_type=data_type, + features_names=features_names, + categorical_idx=categorical_idx, + categorical_features=categorical_features + ) - return InputData(idx=idx, features=features, target=target, task=task, data_type=data_type, - features_names=features_names, categorical_features=categorical_features, - categorical_idx=categorical_idx) + return data @classmethod def from_csv(cls, @@ -149,6 +179,7 @@ def from_csv(cls, data_type: DataTypesEnum = DataTypesEnum.table, columns_to_drop: Optional[List[Union[str, int]]] = None, target_columns: Union[str, List[Union[str, int]]] = '', + categorical_idx: Union[list[int, str], np.ndarray[int, str]] = None, index_col: Optional[Union[str, int]] = None, possible_idx_keywords: Optional[List[str]] = None) -> InputData: """Import data from ``csv``. @@ -160,6 +191,8 @@ def from_csv(cls, task: the :obj:`Task` to solve with the data. data_type: the type of the data. Possible values are listed at :class:`DataTypesEnum`. target_columns: name of the target column (the last column if empty and no target if ``None``). + categorical_idx: a list or numpy array with indexes or names of features that indicate that + the feature is categorical. 
index_col: name or index of the column to use as the :obj:`Data.idx`.\n If ``None``, then check the first column's name and use it as index if succeeded (see the param ``possible_idx_keywords``).\n @@ -184,8 +217,36 @@ def from_csv(cls, features, target = process_target_and_features(df, target_columns) - return InputData(idx=idx, features=features, target=target, task=task, data_type=data_type, - features_names=features_names) + categorical_features = None + if categorical_idx is not None: + if isinstance(categorical_idx, list): + categorical_idx = np.array(categorical_idx) + + if categorical_idx != np.array([]) and isinstance(categorical_idx[0], str) and features_names is None: + raise ValueError( + 'Impossible to specify categorical features by name when the features_names are not specified' + ) + + if categorical_idx != np.array([]) and isinstance(categorical_idx[0], str): + categorical_idx = np.array( + [idx for idx, column in enumerate(features_names) if column in set(categorical_idx)] + ) + + if categorical_idx.size != 0: + categorical_features = features[:, categorical_idx] + + data = InputData( + idx=idx, + features=features, + target=target, + task=task, + data_type=data_type, + features_names=features_names, + categorical_idx=categorical_idx, + categorical_features=categorical_features + ) + + return data @classmethod def from_csv_time_series(cls, @@ -852,6 +913,8 @@ def np_datetime_to_numeric(data: np.ndarray) -> np.ndarray: def array_to_input_data(features_array: np.ndarray, target_array: np.ndarray, + features_names: np.ndarray[str] = None, + categorical_idx: Union[list[int, str], np.ndarray[int, str]] = None, idx: Optional[np.ndarray] = None, task: Task = Task(TaskTypesEnum.classification), data_type: Optional[DataTypesEnum] = None) -> InputData: @@ -859,7 +922,37 @@ def array_to_input_data(features_array: np.ndarray, idx = np.arange(len(features_array)) if data_type is None: data_type = autodetect_data_type(task) - return InputData(idx=idx, features=features_array, target=target_array, task=task, data_type=data_type) + + categorical_features = None + if categorical_idx is not None: + if isinstance(categorical_idx, list): + categorical_idx = np.array(categorical_idx) + + if categorical_idx != np.array([]) and isinstance(categorical_idx[0], str) and features_names is None: + raise ValueError( + 'Impossible to specify categorical features by name when the features_names are not specified' + ) + + if categorical_idx != np.array([]) and isinstance(categorical_idx[0], str): + categorical_idx = np.array( + [idx for idx, column in enumerate(features_names) if column in set(categorical_idx)] + ) + + if categorical_idx.size != 0: + categorical_features = features_array[:, categorical_idx] + + data = InputData( + idx=idx, + features=features_array, + target=target_array, + features_names=features_names, + categorical_idx=categorical_idx, + categorical_features=categorical_features, + task=task, + data_type=data_type + ) + + return data def autodetect_data_type(task: Task) -> DataTypesEnum: diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 6005cff5a5..566cdafbde 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -286,17 +286,26 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): num_df = pd.DataFrame(data.features[:, numeric_type_ids], columns=numeric_type_ids) nuniques = num_df.nunique(dropna=True) + # TODO: Improve the naive approach (with categorical_max_uniques_th) of 
identifying categorical data + # to a smarter approach (eg. numeric, features naming with llm) # reduce dataframe to include only categorical features num_df = num_df.loc[:, (2 < nuniques) & (nuniques < self.categorical_max_uniques_th)] - cat_col_from_heuristic_rule_ids = num_df.columns - - # Convert into string - data.features[:, cat_col_from_heuristic_rule_ids] = num_df.apply( - convert_num_column_into_string_array).to_numpy() - # Columns need to be transformed into categorical (string) ones - self.numerical_into_str.extend(cat_col_from_heuristic_rule_ids.difference(self.numerical_into_str)) - # Update information about column types (in-place) - feature_type_ids[cat_col_from_heuristic_rule_ids] = TYPE_TO_ID[str] + + if data.categorical_idx is not None: + # If cats features were defined take it + cat_col_ids = data.categorical_idx + else: + # Else cats features are selected by heuristic rule + cat_col_ids = num_df.columns + + if np.size(cat_col_ids) > 0: + # Convert into string + data.features[:, cat_col_ids] = num_df.apply( + convert_num_column_into_string_array).to_numpy() + # Columns need to be transformed into categorical (string) ones + self.numerical_into_str.extend(cat_col_ids.difference(self.numerical_into_str)) + # Update information about column types (in-place) + feature_type_ids[cat_col_ids] = TYPE_TO_ID[str] # Update cat cols idx in data is_cat_type = np.isin(feature_type_ids, [TYPE_TO_ID[str]]) diff --git a/test/data/melb_data.csv b/test/data/melb_data.csv new file mode 100644 index 0000000000..53d430c2c1 --- /dev/null +++ b/test/data/melb_data.csv @@ -0,0 +1,1001 @@ +Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Price +t,VB,Western Metropolitan,2,11.7,3033.0,2.0,2.0,62.0,-37.73893,144.87661,5629.0,520000.0 +h,SP,Southern Metropolitan,3,11.4,3204.0,3.0,1.0,670.0,-37.91855,145.02628,6795.0,1573000.0 +t,S,Western Metropolitan,4,7.5,3040.0,4.0,3.0,205.0,-37.74588,144.92700000000005,9264.0,1185000.0 +u,S,Southern Metropolitan,1,4.6,3122.0,2.0,1.0,0.0,-37.8264,145.02700000000004,11308.0,485000.0 +h,S,Eastern Metropolitan,3,23.0,3136.0,3.0,1.0,1082.0,-37.80618,145.27755,11925.0,1005000.0 +t,S,Southern Metropolitan,3,10.7,3187.0,3.0,2.0,119.0,-37.9084,145.0118,6938.0,972000.0 +h,VB,Eastern Metropolitan,4,10.3,3084.0,4.0,2.0,707.0,-37.762,145.0645,1651.0,1750000.0 +h,S,Southern Metropolitan,2,13.0,3204.0,2.0,1.0,730.0,-37.9207,145.0479,6795.0,1550000.0 +h,VB,Northern Metropolitan,3,3.5,3068.0,3.0,2.0,240.0,-37.782,144.9834,6244.0,1700000.0 +u,PI,Southern Metropolitan,2,7.7,3184.0,2.0,1.0,0.0,-37.8753,144.9902,8989.0,680000.0 +t,S,Northern Metropolitan,2,11.2,3073.0,2.0,1.0,159.0,-37.7059,145.0115,21650.0,416000.0 +h,S,Eastern Metropolitan,3,21.3,3135.0,3.0,2.0,941.0,-37.81289,145.24213,4407.0,1400000.0 +u,SA,Southern Metropolitan,1,4.6,3142.0,1.0,1.0,0.0,-37.8421,145.0104,7217.0,247500.0 +h,S,South-Eastern Metropolitan,4,35.4,3198.0,4.0,2.0,542.0,-38.11161,145.15011,8077.0,768000.0 +u,S,Southern Metropolitan,2,2.7,3141.0,2.0,1.0,17200.0,-37.83613,144.99661,14887.0,762500.0 +h,PI,Northern Metropolitan,3,5.2,3056.0,3.0,1.0,193.0,-37.76418,144.95715,11918.0,1100000.0 +u,S,Southern Metropolitan,2,4.6,3122.0,2.0,1.0,0.0,-37.8166,145.0163,11308.0,663000.0 +h,S,Northern Metropolitan,3,15.3,3074.0,3.0,2.0,545.0,-37.68403,144.99246000000005,7955.0,740000.0 +h,S,Northern Metropolitan,3,6.5,3071.0,3.0,1.0,0.0,-37.7539,144.989,8870.0,1190000.0 +h,VB,Southern 
Metropolitan,3,5.9,3144.0,3.0,4.0,950.0,-37.85905,145.03229,4675.0,4900000.0 +h,S,Northern Metropolitan,3,5.2,3055.0,3.0,1.0,613.0,-37.76883,144.94592,7082.0,1445000.0 +h,S,Western Metropolitan,3,12.8,3033.0,3.0,1.0,713.0,-37.7301,144.8671,5629.0,900000.0 +h,S,Western Metropolitan,3,14.8,3023.0,3.0,2.0,461.0,-37.74518,144.74708,1607.0,580000.0 +h,PI,Western Metropolitan,3,11.1,3025.0,3.0,1.0,540.0,-37.8294,144.8378,5132.0,710000.0 +h,S,Eastern Metropolitan,4,13.9,3108.0,4.0,4.0,1157.0,-37.7779,145.127,9028.0,1924500.0 +h,S,Northern Victoria,3,26.1,3099.0,3.0,2.0,785.0,-37.637,145.20166,1345.0,600000.0 +h,S,Southern Metropolitan,4,4.6,3122.0,4.0,1.0,383.0,-37.8237,145.0311,11308.0,2100000.0 +u,S,Southern Metropolitan,2,2.7,3141.0,2.0,1.0,1272.0,-37.84283,145.00015,14887.0,771000.0 +u,S,Southern Metropolitan,2,7.7,3184.0,2.0,1.0,0.0,-37.8842,144.9829,8989.0,645000.0 +h,S,Western Metropolitan,3,7.0,3013.0,3.0,1.0,464.0,-37.8151,144.8638,6543.0,796000.0 +h,S,Northern Metropolitan,2,11.2,3073.0,2.0,1.0,310.0,-37.7288,145.0224,21650.0,498000.0 +h,S,Western Metropolitan,2,6.9,3039.0,2.0,1.0,292.0,-37.7642,144.9195,6232.0,1055000.0 +t,S,Northern Metropolitan,2,5.2,3056.0,2.0,2.0,177.0,-37.7625,144.9653,11918.0,754000.0 +h,S,Southern Metropolitan,4,10.4,3125.0,4.0,2.0,514.0,-37.84248,145.10181,5678.0,1750000.0 +h,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,650.0,-37.699,144.9421,8870.0,690000.0 +h,S,Northern Metropolitan,2,17.9,3082.0,2.0,1.0,257.0,-37.65636,145.03996999999995,10529.0,421000.0 +h,S,Southern Metropolitan,3,13.9,3165.0,3.0,2.0,710.0,-37.9348,145.0634,10969.0,1085000.0 +h,S,Northern Metropolitan,4,12.1,3083.0,4.0,3.0,331.0,-37.67998,145.07345,10175.0,758000.0 +u,S,Southern Metropolitan,2,11.4,3163.0,2.0,1.0,134.0,-37.8987,145.0557,7822.0,765000.0 +u,S,Northern Metropolitan,3,1.6,3066.0,3.0,2.0,0.0,-37.8032,144.9842,4553.0,1326000.0 +t,PI,Southern Metropolitan,3,14.6,3189.0,3.0,2.0,164.0,-37.9378,145.0385,2555.0,750000.0 +h,S,Northern Metropolitan,3,14.0,3047.0,3.0,1.0,622.0,-37.68908,144.9116,851.0,626000.0 +h,S,Northern Metropolitan,4,3.6,3068.0,4.0,2.0,191.0,-37.79274,144.99863,2954.0,1662500.0 +h,S,Southern Metropolitan,4,11.8,3204.0,4.0,2.0,705.0,-37.9035,145.028,3578.0,1715000.0 +h,S,Northern Metropolitan,5,5.5,3070.0,4.0,3.0,490.0,-37.7726,145.0048,11364.0,2700000.0 +t,VB,Northern Metropolitan,2,1.9,3003.0,2.0,2.0,54.0,-37.8094,144.9479,2230.0,800000.0 +h,S,Southern Metropolitan,3,7.3,3146.0,3.0,2.0,0.0,-37.85139,145.05835,10412.0,1440000.0 +t,S,Western Metropolitan,3,7.0,3013.0,3.0,2.0,257.0,-37.813,144.8703,6543.0,1070000.0 +h,SP,Western Metropolitan,4,8.7,3032.0,4.0,2.0,215.0,-37.7817,144.8916,4918.0,770000.0 +h,PI,Southern Metropolitan,4,4.6,3142.0,4.0,2.0,237.0,-37.8507,145.0298,7217.0,2025000.0 +h,S,Eastern Metropolitan,5,12.4,3108.0,5.0,2.0,726.0,-37.78133,145.10833,9028.0,1540500.0 +h,S,Eastern Metropolitan,4,13.9,3108.0,4.0,3.0,657.0,-37.7954,145.1379,9028.0,1520000.0 +h,S,Southern Metropolitan,3,7.4,3144.0,3.0,2.0,258.0,-37.8644,145.0302,4675.0,1895000.0 +h,S,Southern Metropolitan,2,5.6,3101.0,2.0,1.0,667.0,-37.8007,145.0327,10331.0,1507000.0 +h,S,Eastern Metropolitan,5,13.8,3084.0,5.0,3.0,531.0,-37.7378,145.0955,2698.0,1025000.0 +h,PI,Southern Metropolitan,3,4.6,3181.0,3.0,2.0,362.0,-37.85327,144.99947,4380.0,1970000.0 +h,S,Southern Metropolitan,4,11.0,3147.0,4.0,2.0,696.0,-37.8711,145.0746,3052.0,1860000.0 +h,S,Western Metropolitan,5,7.5,3040.0,5.0,3.0,590.0,-37.75511,144.90935,9264.0,2210000.0 +h,S,Western 
Metropolitan,4,10.8,3019.0,4.0,2.0,599.0,-37.7896,144.8559,3589.0,856500.0 +u,VB,Southern Metropolitan,1,2.1,3205.0,1.0,1.0,0.0,-37.8341,144.9713,5943.0,320000.0 +h,SA,Western Metropolitan,4,5.1,3011.0,4.0,2.0,180.0,-37.79686,144.908,7570.0,1000000.0 +u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85705,144.98699,13240.0,451000.0 +h,PI,Southern Metropolitan,2,13.9,3165.0,2.0,1.0,591.0,-37.9179,145.071,10969.0,702000.0 +h,S,Northern Metropolitan,3,5.9,3055.0,3.0,1.0,349.0,-37.7589,144.9368,7082.0,810000.0 +h,SP,Northern Metropolitan,3,2.6,3052.0,3.0,2.0,173.0,-37.7795,144.9413,2309.0,965000.0 +h,SP,Northern Metropolitan,4,9.2,3058.0,4.0,2.0,302.0,-37.7271,144.9842,3445.0,735000.0 +h,S,Southern Metropolitan,4,3.3,3206.0,4.0,1.0,306.0,-37.8459,144.9574,3280.0,2950000.0 +h,S,Southern Metropolitan,3,11.2,3127.0,0.0,2.0,335.0,-37.8165,145.0981,5457.0,1560000.0 +h,S,Northern Metropolitan,2,3.2,3054.0,3.0,1.0,100.0,-37.7879,144.9759,3106.0,910000.0 +h,S,Western Metropolitan,3,5.9,3032.0,3.0,1.0,263.0,-37.7753,144.9116,6567.0,955000.0 +u,SP,Southern Metropolitan,2,7.5,3123.0,2.0,1.0,710.0,-37.8259,145.0483,6482.0,500000.0 +h,S,Northern Metropolitan,4,9.9,3044.0,4.0,2.0,708.0,-37.7257,144.9418,7485.0,937000.0 +h,PI,Northern Metropolitan,4,20.6,3064.0,4.0,2.0,756.0,-37.58885,144.90135,15510.0,665000.0 +h,S,Western Metropolitan,3,18.0,3037.0,3.0,2.0,666.0,-37.68381,144.73331000000005,5556.0,660000.0 +h,SP,Western Metropolitan,2,8.0,3016.0,2.0,1.0,278.0,-37.857,144.8906,6380.0,850000.0 +h,S,Western Metropolitan,3,5.9,3032.0,3.0,1.0,452.0,-37.7762,144.9174,6567.0,1215000.0 +h,S,Northern Metropolitan,4,5.2,3056.0,4.0,1.0,363.0,-37.7621,144.9506,11918.0,1217000.0 +u,S,Northern Metropolitan,2,2.6,3121.0,2.0,1.0,0.0,-37.8333,144.998,14949.0,695000.0 +h,S,Northern Metropolitan,3,17.9,3082.0,3.0,1.0,345.0,-37.67121,145.06246000000004,10529.0,665000.0 +u,S,Western Metropolitan,2,6.9,3039.0,2.0,1.0,166.0,-37.7624,144.9365,6232.0,600000.0 +h,S,Western Metropolitan,3,6.4,3011.0,3.0,1.0,210.0,-37.7947,144.8871,7570.0,831000.0 +t,S,Southern Metropolitan,3,8.4,3126.0,3.0,2.0,230.0,-37.81653,145.05971,3265.0,1381000.0 +u,SP,Western Metropolitan,2,12.8,3033.0,2.0,1.0,220.0,-37.7346,144.8584,5629.0,490000.0 +t,VB,Southern Metropolitan,3,3.3,3141.0,3.0,2.0,163.0,-37.8425,144.9877,14887.0,2010000.0 +h,S,Southern Metropolitan,3,9.7,3103.0,3.0,1.0,281.0,-37.8013,145.0652,5682.0,905000.0 +h,S,Northern Metropolitan,2,11.2,3073.0,2.0,1.0,452.0,-37.7271,145.0009,21650.0,623500.0 +h,S,Southern Metropolitan,3,12.3,3166.0,3.0,1.0,501.0,-37.90805,145.10683999999995,768.0,1093800.0 +h,S,Western Metropolitan,4,8.0,3040.0,4.0,2.0,639.0,-37.7471,144.9157,9264.0,1720000.0 +u,S,Western Metropolitan,2,13.5,3020.0,2.0,1.0,235.0,-37.7847,144.8146,6763.0,350000.0 +h,S,Western Metropolitan,3,7.0,3013.0,3.0,2.0,473.0,-37.8217,144.8842,6543.0,1320000.0 +h,S,Western Metropolitan,3,12.8,3033.0,3.0,1.0,662.0,-37.738,144.869,5629.0,935000.0 +u,S,Southern Metropolitan,2,10.1,3163.0,2.0,1.0,0.0,-37.88368,145.0515,7822.0,750000.0 +h,PI,Southern Metropolitan,3,14.6,3189.0,3.0,2.0,374.0,-37.939,145.0533,2555.0,725000.0 +u,S,Southern Metropolitan,2,11.4,3163.0,2.0,1.0,174.0,-37.8978,145.062,7822.0,650000.0 +u,S,Southern Metropolitan,2,2.1,3205.0,2.0,1.0,0.0,-37.8341,144.9713,5943.0,490000.0 +t,PI,Northern Metropolitan,3,7.8,3058.0,3.0,2.0,531.0,-37.7424,144.9571,11204.0,720000.0 +h,S,Northern Metropolitan,4,20.6,3064.0,4.0,2.0,504.0,-37.61419,144.93448,5833.0,570000.0 +h,VB,Southern 
Metropolitan,5,9.7,3103.0,5.0,4.0,1437.0,-37.8058,145.0882,5682.0,4000000.0 +h,S,Northern Metropolitan,2,5.2,3056.0,2.0,1.0,152.0,-37.7611,144.966,11918.0,770000.0 +h,S,Northern Metropolitan,2,12.1,3046.0,2.0,1.0,394.0,-37.7153,144.9507,2606.0,610000.0 +h,S,Southern Metropolitan,4,4.5,3181.0,4.0,1.0,305.0,-37.8493,144.9873,7717.0,1820000.0 +t,S,Northern Metropolitan,2,3.4,3031.0,2.0,1.0,105.0,-37.79244,144.92036000000004,5263.0,841000.0 +h,S,Northern Metropolitan,3,4.2,3031.0,3.0,1.0,459.0,-37.7917,144.9251,5263.0,1335000.0 +u,S,Southern Metropolitan,3,5.4,3101.0,3.0,1.0,1096.0,-37.81207,145.0371,10331.0,660000.0 +h,PI,Southern Metropolitan,4,13.7,3188.0,4.0,3.0,684.0,-37.9436,145.0169,5454.0,2500000.0 +h,S,Eastern Metropolitan,5,10.5,3081.0,5.0,2.0,596.0,-37.7487,145.0522,2947.0,890000.0 +u,SA,Western Metropolitan,1,5.1,3011.0,1.0,1.0,1015.0,-37.78778,144.89037,7570.0,240000.0 +u,SP,Northern Metropolitan,1,2.0,3066.0,1.0,1.0,0.0,-37.79597,144.99108,4553.0,365000.0 +t,S,Eastern Metropolitan,2,10.6,3084.0,2.0,1.0,86.0,-37.7586,145.0629,2890.0,630000.0 +h,SA,Northern Metropolitan,4,11.2,3046.0,4.0,2.0,697.0,-37.72001,144.91683,2651.0,1138000.0 +u,S,Southern Metropolitan,2,2.7,3141.0,2.0,1.0,0.0,-37.83613,144.99661,14887.0,666000.0 +h,SP,Western Metropolitan,3,31.7,3429.0,3.0,1.0,639.0,-37.56291,144.72848,14092.0,455000.0 +h,PI,Southern Metropolitan,2,9.7,3103.0,2.0,0.0,1611.0,-37.8092,145.1016,5682.0,1010000.0 +u,VB,Southern Metropolitan,2,3.8,3207.0,2.0,2.0,2166.0,-37.8415,144.9412,8648.0,820000.0 +h,S,Northern Metropolitan,2,5.2,3056.0,2.0,1.0,319.0,-37.7572,144.9686,11918.0,830000.0 +h,VB,Southern Metropolitan,3,3.3,3141.0,3.0,3.0,191.0,-37.836,144.9824,14887.0,3000000.0 +t,S,Southern Metropolitan,4,1.9,3008.0,4.0,2.0,0.0,-37.8141,144.9387,4707.0,1370000.0 +u,SP,Western Metropolitan,2,5.9,3032.0,2.0,1.0,301.0,-37.7791,144.914,6567.0,610000.0 +h,SP,Northern Metropolitan,3,12.4,3060.0,3.0,1.0,589.0,-37.7006,144.9697,5070.0,650000.0 +t,VB,Western Metropolitan,3,6.4,3011.0,3.0,1.0,219.0,-37.796,144.8819,7570.0,760000.0 +u,S,Southern Metropolitan,2,10.1,3163.0,2.0,1.0,109.0,-37.89578,145.06899,4442.0,660000.0 +h,S,Southern Metropolitan,3,3.8,3207.0,3.0,1.0,108.0,-37.8332,144.945,8648.0,1402000.0 +h,S,Northern Metropolitan,3,5.2,3056.0,3.0,1.0,318.0,-37.7648,144.9633,11918.0,1200000.0 +h,S,South-Eastern Metropolitan,4,18.8,3170.0,4.0,2.0,492.0,-37.92299000000001,145.19156,7113.0,1030000.0 +h,S,Northern Metropolitan,4,3.4,3068.0,4.0,2.0,162.0,-37.7885,144.9994,2954.0,1506000.0 +h,SP,Eastern Metropolitan,2,10.9,3128.0,2.0,1.0,701.0,-37.82756,145.11897,4605.0,1820000.0 +u,PI,Western Metropolitan,1,6.4,3011.0,1.0,1.0,0.0,-37.7911,144.89,7570.0,85000.0 +h,S,Southern Metropolitan,3,11.4,3163.0,3.0,2.0,603.0,-37.9023,145.0568,7822.0,1430000.0 +h,S,Northern Metropolitan,2,4.5,3057.0,2.0,1.0,263.0,-37.7661,144.9742,5533.0,1283000.0 +u,S,Northern Metropolitan,2,2.8,3000.0,2.0,2.0,1136.0,-37.8211,144.9559,17496.0,683000.0 +h,VB,Western Metropolitan,3,6.9,3039.0,3.0,1.0,572.0,-37.7683,144.9325,6232.0,1100000.0 +u,S,Northern Metropolitan,2,3.5,3068.0,2.0,2.0,4296.0,-37.7846,144.9785,6244.0,720000.0 +h,PI,Western Metropolitan,4,6.4,3011.0,4.0,2.0,369.0,-37.7914,144.8957,7570.0,815000.0 +h,S,Southern Metropolitan,4,7.5,3123.0,4.0,3.0,726.0,-37.8239,145.0553,6482.0,2920000.0 +u,VB,Southern Metropolitan,2,7.5,3123.0,2.0,1.0,3084.0,-37.8414,145.0505,6482.0,500000.0 +u,S,Northern Metropolitan,2,1.6,3065.0,2.0,1.0,0.0,-37.7967,144.9836,5825.0,790000.0 +h,S,Southern 
Metropolitan,3,4.6,3122.0,3.0,2.0,264.0,-37.8163,145.0301,11308.0,1510000.0 +u,S,Southern Metropolitan,3,7.8,3124.0,3.0,1.0,112.0,-37.8378,145.0949,8920.0,835000.0 +h,S,Northern Metropolitan,2,11.2,3073.0,2.0,1.0,268.0,-37.7113,145.0224,21650.0,510000.0 +h,S,Western Metropolitan,3,8.0,3040.0,3.0,1.0,175.0,-37.7393,144.894,9264.0,801500.0 +u,PI,Southern Metropolitan,1,3.3,3141.0,1.0,1.0,14500.0,-37.8372,144.9963,14887.0,315000.0 +h,S,Northern Metropolitan,3,2.3,3051.0,2.0,1.0,517.0,-37.7967,144.9472,6821.0,1635000.0 +h,S,Northern Metropolitan,3,8.8,3072.0,3.0,1.0,504.0,-37.7476,144.9842,14577.0,1123000.0 +h,S,Western Metropolitan,3,8.0,3016.0,3.0,1.0,477.0,-37.8516,144.8949,6380.0,1360000.0 +u,S,Northern Metropolitan,1,2.6,3121.0,1.0,1.0,1332.0,-37.8181,144.9901,14949.0,360000.0 +h,S,Northern Metropolitan,3,7.8,3058.0,3.0,1.0,309.0,-37.7434,144.9697,11204.0,985000.0 +h,S,Southern Metropolitan,3,7.8,3124.0,3.0,1.0,970.0,-37.8344,145.0818,8920.0,1700000.0 +h,S,Southern Metropolitan,5,11.7,3125.0,5.0,2.0,544.0,-37.8525,145.1154,5678.0,1100000.0 +u,S,Southern Metropolitan,2,3.3,3141.0,2.0,1.0,595.0,-37.8422,144.9855,14887.0,642000.0 +h,S,Northern Metropolitan,3,11.2,3046.0,3.0,1.0,650.0,-37.70768,144.92586,8870.0,820000.0 +h,S,Northern Metropolitan,3,4.4,3031.0,3.0,2.0,167.0,-37.7869,144.9228,3593.0,870000.0 +h,S,Northern Metropolitan,2,4.2,3031.0,2.0,1.0,150.0,-37.7963,144.935,5263.0,775000.0 +u,S,Southern Metropolitan,2,9.2,3104.0,2.0,1.0,0.0,-37.7961,145.0808,7809.0,560000.0 +u,S,Western Metropolitan,2,6.2,3015.0,2.0,1.0,163.0,-37.84608,144.86525,5498.0,600000.0 +h,SP,Northern Metropolitan,1,8.8,3072.0,1.0,1.0,397.0,-37.7366,145.0115,14577.0,830000.0 +u,VB,Southern Metropolitan,2,5.1,3181.0,2.0,1.0,0.0,-37.8555,145.0018,4380.0,380000.0 +h,S,Western Metropolitan,3,14.8,3023.0,3.0,3.0,585.0,-37.75907,144.75923999999995,6388.0,572000.0 +h,S,Northern Metropolitan,3,9.2,3058.0,3.0,1.0,565.0,-37.7283,144.969,3445.0,864000.0 +u,S,Northern Metropolitan,2,4.5,3057.0,2.0,1.0,0.0,-37.7786,144.9753,5533.0,665000.0 +h,S,Northern Metropolitan,3,19.6,3076.0,3.0,2.0,606.0,-37.62858,145.03735,10926.0,465000.0 +u,S,Southern Metropolitan,2,7.2,3184.0,2.0,1.0,837.0,-37.87307,144.98635,8989.0,930000.0 +h,S,Western Metropolitan,2,13.9,3020.0,2.0,1.0,497.0,-37.7806,144.8159,2185.0,541000.0 +h,S,Northern Metropolitan,2,6.5,3071.0,2.0,1.0,258.0,-37.7584,144.9971,8870.0,925000.0 +h,S,Eastern Metropolitan,3,21.3,3135.0,3.0,1.0,756.0,-37.82909,145.233,3794.0,1040000.0 +h,SP,Northern Metropolitan,3,16.5,3049.0,3.0,1.0,532.0,-37.67949,144.88349,2474.0,540000.0 +h,S,Southern Metropolitan,5,9.2,3146.0,5.0,2.0,1339.0,-37.8649,145.0547,10412.0,3365000.0 +h,S,Southern Metropolitan,3,7.4,3144.0,3.0,2.0,246.0,-37.871,145.0364,4675.0,1970000.0 +h,S,Northern Metropolitan,3,14.0,3047.0,3.0,1.0,584.0,-37.67159,144.94522,2246.0,408500.0 +h,VB,Southern Metropolitan,4,11.2,3127.0,4.0,2.0,783.0,-37.8181,145.0913,5457.0,2500000.0 +u,S,Western Metropolitan,1,7.5,3040.0,1.0,1.0,0.0,-37.75121,144.91326,9264.0,291000.0 +t,VB,Northern Metropolitan,3,7.0,3071.0,3.0,2.0,120.0,-37.76343,145.02096,8870.0,900000.0 +h,S,Western Metropolitan,3,6.4,3011.0,3.0,1.0,292.0,-37.797,144.9051,7570.0,1003000.0 +h,S,Eastern Victoria,3,26.5,3138.0,3.0,1.0,864.0,-37.76983,145.31687,8280.0,760000.0 +h,S,Northern Metropolitan,3,6.5,3071.0,3.0,1.0,253.0,-37.7566,144.9965,8870.0,1280000.0 +h,PI,Western Metropolitan,4,13.8,3018.0,4.0,1.0,655.0,-37.868,144.8154,5301.0,780000.0 +h,S,Northern 
Metropolitan,3,12.0,3073.0,3.0,1.0,606.0,-37.72057,145.02615,21650.0,760000.0 +h,S,South-Eastern Metropolitan,3,15.5,3167.0,3.0,1.0,640.0,-37.93646,145.08728,3692.0,945000.0 +h,PI,South-Eastern Metropolitan,3,15.5,3167.0,3.0,1.0,601.0,-37.93869,145.08441000000005,3692.0,840000.0 +h,S,Northern Metropolitan,3,9.9,3044.0,3.0,1.0,321.0,-37.7242,144.9424,7485.0,708000.0 +h,S,Western Metropolitan,3,4.3,3032.0,3.0,1.0,196.0,-37.77552,144.92022,6567.0,990000.0 +u,S,Western Metropolitan,2,10.5,3034.0,2.0,1.0,263.0,-37.7697,144.8657,4502.0,440000.0 +h,PI,Northern Metropolitan,4,11.2,3046.0,4.0,1.0,587.0,-37.69897,144.90998000000005,8870.0,651000.0 +h,S,Southern Metropolitan,4,11.8,3204.0,4.0,2.0,618.0,-37.9051,145.0473,3578.0,1486000.0 +h,S,Western Metropolitan,3,14.7,3030.0,3.0,1.0,530.0,-37.90111,144.63302,16166.0,390000.0 +h,S,Eastern Metropolitan,3,13.4,3130.0,3.0,1.0,448.0,-37.82956,145.13868,5713.0,1070000.0 +t,S,Western Metropolitan,3,6.4,3011.0,3.0,2.0,102.0,-37.7987,144.8807,7570.0,757000.0 +h,S,Eastern Metropolitan,3,14.3,3109.0,3.0,1.0,696.0,-37.77973,145.16473,10999.0,1369000.0 +h,S,Western Metropolitan,3,14.0,3021.0,3.0,1.0,539.0,-37.74845,144.8144,14042.0,645000.0 +h,S,Northern Metropolitan,3,3.6,3068.0,3.0,1.0,192.0,-37.78992,144.99845,2954.0,1460000.0 +h,SP,South-Eastern Metropolitan,4,21.5,3195.0,4.0,2.0,597.0,-37.99232,145.08469,5087.0,1300000.0 +h,S,Northern Metropolitan,3,3.4,3068.0,3.0,2.0,201.0,-37.7884,145.0,2954.0,1680000.0 +u,PI,Southern Metropolitan,2,6.1,3182.0,2.0,1.0,0.0,-37.8619,144.976,13240.0,400000.0 +h,PI,Northern Metropolitan,3,12.4,3060.0,3.0,1.0,1069.0,-37.6931,144.9596,5070.0,1000000.0 +h,S,Southern Metropolitan,3,9.2,3146.0,3.0,1.0,652.0,-37.8552,145.0785,10412.0,1825000.0 +u,SP,Western Metropolitan,2,8.7,3032.0,2.0,1.0,5661.0,-37.773,144.8806,4918.0,420000.0 +h,S,Eastern Victoria,3,26.5,3138.0,3.0,1.0,484.0,-37.78454,145.33073000000005,8280.0,581000.0 +h,SP,Western Metropolitan,3,8.0,3040.0,3.0,1.0,578.0,-37.7526,144.9089,9264.0,1400000.0 +h,S,Northern Metropolitan,3,5.5,3070.0,3.0,1.0,279.0,-37.77,145.0049,11364.0,980000.0 +h,VB,Northern Metropolitan,4,12.4,3060.0,4.0,2.0,254.0,-37.7082,144.9725,5070.0,540000.0 +h,SP,Western Metropolitan,3,7.7,3015.0,3.0,1.0,320.0,-37.8277,144.8841,1223.0,1016000.0 +t,SP,Eastern Metropolitan,3,14.2,3149.0,3.0,2.0,236.0,-37.88678,145.12748,13366.0,1035000.0 +u,S,Southern Metropolitan,2,11.2,3127.0,2.0,1.0,180.0,-37.8351,145.10299999999995,5457.0,825000.0 +h,S,Northern Metropolitan,4,9.9,3044.0,4.0,2.0,559.0,-37.7236,144.9347,7485.0,1196000.0 +h,VB,Southern Metropolitan,4,7.7,3184.0,3.0,3.0,235.0,-37.8775,144.9808,8989.0,1875000.0 +t,VB,Northern Metropolitan,2,6.7,3058.0,2.0,2.0,143.0,-37.72037,144.97023000000004,3445.0,525000.0 +u,VB,Southern Metropolitan,2,13.9,3165.0,2.0,1.0,0.0,-37.9096,145.0843,10969.0,340000.0 +u,S,Southern Metropolitan,3,10.5,3186.0,3.0,2.0,257.0,-37.89886,144.99462,10579.0,1410000.0 +h,VB,Southern Metropolitan,4,7.4,3144.0,4.0,2.0,586.0,-37.866,145.039,4675.0,2400000.0 +u,S,Southern Metropolitan,1,4.6,3181.0,1.0,1.0,0.0,-37.85279,145.00811000000004,7717.0,399000.0 +h,S,Southern Metropolitan,4,11.4,3204.0,4.0,2.0,673.0,-37.92837,145.03821000000005,6795.0,1700000.0 +h,S,Western Metropolitan,3,6.4,3011.0,3.0,2.0,497.0,-37.7958,144.9024,7570.0,1362000.0 +t,S,Southern Metropolitan,3,3.3,3141.0,3.0,2.0,201.0,-37.8428,144.9877,14887.0,2030000.0 +h,S,Northern Metropolitan,2,6.5,3071.0,2.0,1.0,317.0,-37.7606,144.9995,8870.0,1260000.0 +u,S,Northern 
Metropolitan,2,9.9,3044.0,3.0,1.0,219.0,-37.7283,144.9331,7485.0,488000.0 +u,PI,Northern Metropolitan,2,4.2,3031.0,2.0,1.0,0.0,-37.7899,144.9231,5263.0,395000.0 +h,S,Northern Metropolitan,4,20.6,3064.0,4.0,2.0,383.0,-37.61132,144.92075,5833.0,500000.0 +h,S,Southern Metropolitan,3,4.6,3122.0,3.0,2.0,254.0,-37.8287,145.0419,11308.0,1985000.0 +h,SP,Southern Metropolitan,3,5.6,3101.0,3.0,2.0,392.0,-37.8081,145.0263,10331.0,1878000.0 +h,PI,Northern Metropolitan,4,3.6,3068.0,4.0,3.0,193.0,-37.79395,144.98899,2954.0,1010000.0 +h,SP,Northern Metropolitan,2,8.8,3072.0,2.0,1.0,529.0,-37.7427,144.9868,14577.0,900000.0 +h,VB,Western Metropolitan,3,6.6,3011.0,3.0,2.0,309.0,-37.807,144.898,2417.0,920000.0 +h,S,Northern Metropolitan,3,12.4,3060.0,3.0,1.0,285.0,-37.7161,144.9662,5070.0,400000.0 +h,S,Eastern Metropolitan,5,16.7,3150.0,5.0,2.0,651.0,-37.8656,145.15034,15321.0,1285000.0 +h,S,Western Metropolitan,3,6.6,3011.0,3.0,2.0,229.0,-37.8032,144.8892,2417.0,1310000.0 +h,S,Western Metropolitan,3,8.0,3016.0,3.0,2.0,292.0,-37.8575,144.8922,6380.0,1245000.0 +h,PI,Western Metropolitan,4,8.2,3012.0,4.0,1.0,528.0,-37.7969,144.8647,5058.0,970000.0 +h,S,South-Eastern Metropolitan,3,14.7,3167.0,3.0,2.0,727.0,-37.9187,145.1098,3692.0,980500.0 +h,PI,Southern Metropolitan,3,4.5,3181.0,2.0,1.0,111.0,-37.8525,145.0071,7717.0,1100000.0 +u,S,Southern Metropolitan,1,7.7,3184.0,1.0,1.0,0.0,-37.8895,144.9902,8989.0,451000.0 +u,S,Northern Metropolitan,2,5.8,3078.0,2.0,1.0,1658.0,-37.7781,145.0157,2970.0,510000.0 +h,SP,Eastern Metropolitan,4,13.9,3108.0,4.0,2.0,840.0,-37.78,145.1145,9028.0,1460000.0 +h,VB,Southern Metropolitan,4,7.3,3146.0,4.0,1.0,1110.0,-37.85723,145.0547,10412.0,2250000.0 +u,S,Southern Metropolitan,3,13.6,3148.0,2.0,2.0,224.0,-37.8738,145.1054,3582.0,700000.0 +u,PI,Southern Metropolitan,3,13.9,3165.0,3.0,2.0,258.0,-37.9244,145.0547,10969.0,740000.0 +t,S,Western Metropolitan,3,13.5,3020.0,3.0,3.0,134.0,-37.79,144.7886,6763.0,521000.0 +h,S,Eastern Metropolitan,5,13.8,3084.0,5.0,3.0,648.0,-37.7346,145.093,2698.0,895000.0 +h,S,Western Metropolitan,4,12.8,3033.0,4.0,2.0,659.0,-37.7467,144.8683,5629.0,970000.0 +t,S,Southern Metropolitan,3,7.3,3146.0,3.0,2.0,203.0,-37.86248,145.06682,10412.0,1160000.0 +h,PI,Western Metropolitan,1,9.1,3040.0,3.0,2.0,676.0,-37.7632,144.898,1543.0,1720000.0 +u,VB,Western Metropolitan,2,8.2,3012.0,2.0,1.0,781.0,-37.8079,144.8684,5058.0,420000.0 +u,S,Southern Metropolitan,3,6.3,3143.0,2.0,1.0,0.0,-37.853,145.0264,4836.0,869000.0 +h,S,South-Eastern Metropolitan,3,17.5,3169.0,3.0,1.0,602.0,-37.94146,145.11121,4734.0,800000.0 +u,PI,Southern Metropolitan,4,11.7,3125.0,4.0,2.0,0.0,-37.8507,145.109,5678.0,800000.0 +h,S,Western Metropolitan,3,11.1,3025.0,3.0,1.0,740.0,-37.8269,144.8455,5132.0,923000.0 +h,PI,Eastern Metropolitan,3,13.4,3130.0,3.0,2.0,567.0,-37.81684,145.14992,5713.0,1200000.0 +h,S,Northern Metropolitan,3,5.5,3070.0,3.0,1.0,445.0,-37.7652,145.0123,11364.0,1190000.0 +h,VB,South-Eastern Metropolitan,4,38.0,3199.0,4.0,2.0,767.0,-38.16147,145.14285,17055.0,680000.0 +u,S,Northern Metropolitan,2,3.1,3003.0,2.0,1.0,17.0,-37.8118,144.95259,2230.0,670000.0 +h,S,South-Eastern Metropolitan,3,38.0,3199.0,3.0,1.0,578.0,-38.13743,145.16702,17055.0,506000.0 +h,S,Southern Metropolitan,3,9.3,3162.0,3.0,1.0,359.0,-37.8898,145.0196,5051.0,1170000.0 +u,SP,Southern Metropolitan,1,11.2,3145.0,1.0,1.0,0.0,-37.8728,145.0417,8801.0,373000.0 +u,S,Southern Metropolitan,2,9.2,3146.0,2.0,1.0,0.0,-37.8497,145.0466,10412.0,390000.0 +u,S,Northern 
Metropolitan,2,5.8,3078.0,2.0,1.0,0.0,-37.7847,145.0109,2970.0,676000.0 +u,S,Northern Metropolitan,2,4.4,3031.0,2.0,1.0,0.0,-37.7825,144.9239,3593.0,391000.0 +u,SP,Southern Metropolitan,3,6.1,3182.0,3.0,1.0,0.0,-37.8612,144.985,13240.0,720000.0 +h,S,Northern Metropolitan,3,11.2,3073.0,3.0,1.0,570.0,-37.718,144.9999,21650.0,800000.0 +h,SP,South-Eastern Metropolitan,2,24.7,3175.0,2.0,1.0,414.0,-37.99173,145.22308999999996,10894.0,488000.0 +u,S,Southern Metropolitan,1,3.3,3141.0,1.0,1.0,1369.0,-37.8405,145.0025,14887.0,411000.0 +u,S,Eastern Metropolitan,2,8.9,3084.0,2.0,1.0,2033.0,-37.75715,145.06463,2890.0,534000.0 +h,S,Western Metropolitan,4,31.7,3429.0,4.0,2.0,738.0,-37.58457,144.70005,14092.0,605000.0 +h,S,Southern Metropolitan,3,11.7,3125.0,3.0,1.0,742.0,-37.8529,145.0962,5678.0,1460000.0 +u,S,Southern Metropolitan,1,3.3,3141.0,1.0,1.0,0.0,-37.8355,144.9884,14887.0,400000.0 +h,S,Northern Metropolitan,3,2.6,3121.0,3.0,2.0,115.0,-37.8163,144.9984,14949.0,1600000.0 +h,S,Northern Metropolitan,4,12.1,3046.0,4.0,2.0,672.0,-37.7083,144.9495,2606.0,815000.0 +h,S,Southern Metropolitan,2,9.2,3146.0,2.0,1.0,553.0,-37.8464,145.0511,10412.0,1723000.0 +h,S,Western Metropolitan,3,5.9,3032.0,3.0,2.0,416.0,-37.7727,144.9055,6567.0,1381500.0 +h,S,Southern Metropolitan,2,5.1,3181.0,2.0,1.0,153.0,-37.8535,144.9952,4380.0,1325000.0 +t,PI,Southern Metropolitan,3,13.0,3204.0,3.0,2.0,418.0,-37.9172,145.0421,6795.0,900000.0 +u,PI,Southern Metropolitan,3,13.6,3148.0,3.0,2.0,235.0,-37.8856,145.0934,3582.0,701000.0 +u,PI,Southern Metropolitan,1,13.9,3165.0,1.0,1.0,0.0,-37.9205,145.0528,10969.0,370000.0 +h,PI,Eastern Metropolitan,3,11.8,3105.0,3.0,2.0,728.0,-37.7765,145.1031,4480.0,1050000.0 +u,S,Southern Metropolitan,1,6.1,3182.0,1.0,1.0,1659.0,-37.8699,144.9764,13240.0,354000.0 +h,S,Western Metropolitan,2,12.8,3033.0,2.0,1.0,116.0,-37.74800000000001,144.8696,5629.0,380000.0 +u,VB,Southern Metropolitan,2,2.1,3205.0,2.0,2.0,0.0,-37.8361,144.9682,5943.0,690000.0 +u,VB,Southern Metropolitan,2,9.3,3162.0,2.0,1.0,90.0,-37.8996,145.0169,5051.0,500000.0 +h,S,Western Metropolitan,3,6.2,3039.0,3.0,1.0,619.0,-37.76996,144.93466,6232.0,1162000.0 +u,SP,Western Metropolitan,4,8.2,3012.0,4.0,2.0,0.0,-37.809,144.8686,5058.0,530000.0 +h,S,Western Metropolitan,3,9.2,3012.0,3.0,1.0,576.0,-37.7808,144.8678,3873.0,715000.0 +u,SP,Northern Metropolitan,2,12.4,3060.0,2.0,1.0,97.0,-37.7118,144.9689,5070.0,333000.0 +u,SP,Southern Metropolitan,2,5.6,3101.0,2.0,1.0,0.0,-37.7977,145.0333,10331.0,700000.0 +u,PI,Northern Metropolitan,3,1.9,3003.0,3.0,2.0,0.0,-37.8118,144.9526,2230.0,660000.0 +h,S,Southern Metropolitan,3,14.6,3189.0,3.0,1.0,653.0,-37.9392,145.0481,2555.0,920000.0 +h,VB,Southern Metropolitan,4,11.2,3145.0,4.0,3.0,222.0,-37.878,145.0666,8801.0,1350000.0 +h,S,Northern Metropolitan,2,2.6,3121.0,2.0,1.0,178.0,-37.8226,145.0064,14949.0,1210000.0 +h,S,Southern Metropolitan,3,7.7,3184.0,3.0,1.0,345.0,-37.8865,144.9891,8989.0,1675000.0 +h,SP,Western Metropolitan,3,5.9,3032.0,3.0,1.0,536.0,-37.7728,144.9076,6567.0,1050000.0 +u,VB,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.8614,144.97406999999995,13240.0,350000.0 +h,S,Northern Metropolitan,4,12.1,3083.0,4.0,2.0,525.0,-37.70765,145.05556,10175.0,815000.0 +h,S,Western Victoria,4,29.8,3338.0,4.0,2.0,587.0,-37.69392,144.57468,4718.0,347500.0 +u,S,Southern Metropolitan,1,11.4,3163.0,1.0,1.0,0.0,-37.8968,145.0609,7822.0,345000.0 +h,S,Southern Metropolitan,3,9.2,3104.0,3.0,1.0,888.0,-37.8002,145.0949,7809.0,2251000.0 +h,SP,Western 
Metropolitan,4,8.0,3040.0,4.0,1.0,638.0,-37.7523,144.9052,9264.0,1535000.0 +h,S,Northern Metropolitan,2,5.5,3070.0,2.0,1.0,120.0,-37.7715,145.0075,11364.0,695000.0 +u,S,Northern Metropolitan,2,2.5,3121.0,2.0,1.0,0.0,-37.8282,144.9897,1123.0,546000.0 +h,S,Southern Metropolitan,4,10.7,3187.0,4.0,2.0,674.0,-37.9022,145.0125,6938.0,2504000.0 +h,S,Northern Metropolitan,3,7.8,3058.0,3.0,1.0,414.0,-37.7432,144.9573,11204.0,990000.0 +t,S,Northern Metropolitan,3,1.8,3052.0,3.0,1.0,2429.0,-37.78033,144.95949,2309.0,850000.0 +u,S,Eastern Metropolitan,2,13.1,3128.0,2.0,1.0,151.0,-37.8234,145.1235,4605.0,636000.0 +t,PI,Western Metropolitan,3,10.5,3020.0,3.0,2.0,175.0,-37.77856,144.82443,2185.0,580000.0 +h,VB,South-Eastern Metropolitan,3,38.0,3199.0,3.0,2.0,539.0,-38.14406,145.16352,17055.0,550000.0 +u,S,Southern Metropolitan,2,13.0,3204.0,2.0,1.0,118.0,-37.9245,145.0336,6795.0,635000.0 +h,SP,Northern Metropolitan,3,3.4,3068.0,3.0,3.0,142.0,-37.7928,145.0021,2954.0,1180000.0 +t,S,Western Metropolitan,4,12.8,3033.0,4.0,3.0,322.0,-37.7293,144.8659,5629.0,886000.0 +h,S,Southern Metropolitan,2,5.6,3101.0,2.0,1.0,381.0,-37.8033,145.036,10331.0,1275000.0 +u,S,Southern Metropolitan,2,9.3,3162.0,2.0,1.0,831.0,-37.8974,145.0294,5051.0,525500.0 +u,S,Southern Metropolitan,2,5.6,3101.0,2.0,2.0,0.0,-37.8099,145.0606,10331.0,562500.0 +h,S,Southern Metropolitan,4,13.0,3204.0,4.0,1.0,786.0,-37.9305,145.0449,6795.0,1420000.0 +h,SP,Northern Metropolitan,3,5.2,3056.0,3.0,1.0,308.0,-37.7649,144.9641,11918.0,1020000.0 +h,S,Southern Metropolitan,5,7.4,3144.0,5.0,3.0,580.0,-37.8653,145.0304,4675.0,4240000.0 +h,VB,Southern Metropolitan,1,4.6,3122.0,1.0,1.0,0.0,-37.8143,145.0319,11308.0,300000.0 +h,S,Western Metropolitan,4,14.8,3023.0,4.0,2.0,709.0,-37.76336,144.7725,6388.0,650000.0 +h,S,Northern Metropolitan,3,1.6,3066.0,3.0,1.0,168.0,-37.7975,144.9924,4553.0,1309000.0 +u,S,Southern Metropolitan,2,1.2,3006.0,2.0,1.0,0.0,-37.8235,144.9655,8400.0,590000.0 +u,S,Southern Metropolitan,2,11.4,3163.0,2.0,1.0,165.0,-37.8927,145.0539,7822.0,737000.0 +h,S,Southern Metropolitan,2,11.2,3145.0,2.0,1.0,312.0,-37.8654,145.0412,8801.0,1400000.0 +h,S,Western Metropolitan,4,15.0,3021.0,4.0,3.0,654.0,-37.7392,144.8299,1202.0,1095000.0 +u,S,Southern Metropolitan,2,5.1,3181.0,2.0,1.0,0.0,-37.8577,145.0002,4380.0,800000.0 +h,S,Western Metropolitan,5,9.7,3041.0,5.0,3.0,755.0,-37.7417,144.9133,3284.0,2650000.0 +h,S,Southern Metropolitan,4,13.7,3188.0,4.0,3.0,414.0,-37.9401,145.0083,5454.0,1740000.0 +h,S,Southern Metropolitan,3,4.5,3181.0,3.0,2.0,272.0,-37.8553,145.007,7717.0,1720000.0 +h,S,Southern Metropolitan,3,11.4,3163.0,2.0,1.0,461.0,-37.8988,145.0516,7822.0,1170000.0 +u,SP,Western Metropolitan,2,8.0,3016.0,2.0,1.0,0.0,-37.8504,144.8924,6380.0,375000.0 +h,S,Southern Metropolitan,4,9.2,3146.0,4.0,3.0,603.0,-37.8521,145.0657,10412.0,2725000.0 +h,S,Southern Metropolitan,3,7.3,3102.0,3.0,2.0,753.0,-37.79017,145.06381000000005,2671.0,2260000.0 +h,VB,Southern Metropolitan,3,8.4,3145.0,3.0,1.0,1128.0,-37.87923,145.08955,8801.0,2500000.0 +h,S,Southern Metropolitan,4,9.2,3104.0,4.0,2.0,647.0,-37.7917,145.0868,7809.0,1905000.0 +h,S,Western Metropolitan,3,14.7,3030.0,3.0,1.0,239.0,-37.88831,144.63942,16166.0,372000.0 +h,S,Western Metropolitan,4,18.4,3029.0,4.0,2.0,546.0,-37.85146,144.67109,13830.0,472000.0 +u,SP,Southern Metropolitan,1,4.6,3142.0,1.0,1.0,0.0,-37.8428,145.0081,7217.0,611000.0 +h,S,Southern Metropolitan,3,9.2,3104.0,3.0,1.0,713.0,-37.7982,145.0816,7809.0,1903000.0 +t,S,Southern 
Metropolitan,3,4.6,3122.0,3.0,2.0,299.0,-37.8299,145.0387,11308.0,1400000.0 +h,S,Eastern Metropolitan,4,7.9,3079.0,4.0,2.0,631.0,-37.7771,145.0448,5549.0,1720000.0 +u,VB,Southern Metropolitan,3,1.2,3006.0,3.0,2.0,0.0,-37.828,144.9683,8400.0,740000.0 +h,S,Southern Metropolitan,3,7.5,3123.0,3.0,1.0,780.0,-37.8198,145.0465,6482.0,2151000.0 +u,S,Eastern Metropolitan,3,16.1,3111.0,3.0,2.0,294.0,-37.79937,145.18164,4790.0,775000.0 +h,SP,Western Metropolitan,2,5.1,3011.0,2.0,1.0,126.0,-37.80473,144.89095,2417.0,945000.0 +h,S,Northern Metropolitan,3,9.9,3044.0,3.0,1.0,506.0,-37.7187,144.9433,7485.0,800000.0 +h,S,Western Metropolitan,3,18.4,3029.0,3.0,1.0,592.0,-37.88157,144.69426,13830.0,575000.0 +h,S,Eastern Metropolitan,4,11.8,3127.0,3.0,2.0,626.0,-37.8197,145.1106,2079.0,2000000.0 +u,S,Western Metropolitan,2,6.4,3011.0,2.0,1.0,0.0,-37.8006,144.881,7570.0,490000.0 +h,S,Northern Metropolitan,3,14.5,3087.0,3.0,2.0,447.0,-37.71627,145.08526,2329.0,835000.0 +h,VB,Eastern Metropolitan,3,13.1,3128.0,3.0,1.0,763.0,-37.8166,145.1215,4605.0,4000000.0 +h,VB,Southern Metropolitan,5,13.0,3204.0,5.0,3.0,772.0,-37.9168,145.04,6795.0,1600000.0 +h,PI,Southern Metropolitan,5,9.2,3146.0,5.0,2.0,400.0,-37.8572,145.0555,10412.0,1650000.0 +u,S,Southern Metropolitan,2,4.6,3122.0,2.0,1.0,0.0,-37.8142,145.0308,11308.0,715000.0 +h,S,Eastern Metropolitan,3,14.2,3149.0,3.0,1.0,810.0,-37.86838,145.14664,13366.0,1530000.0 +h,VB,Northern Metropolitan,3,3.2,3054.0,3.0,1.0,203.0,-37.7824,144.9733,3106.0,1280000.0 +h,PI,Southern Metropolitan,4,9.2,3104.0,4.0,2.0,715.0,-37.7943,145.083,7809.0,1702000.0 +h,S,Northern Metropolitan,3,5.2,3056.0,3.0,2.0,280.0,-37.7719,144.9661,11918.0,1210000.0 +t,S,Southern Metropolitan,3,9.7,3103.0,3.0,2.0,306.0,-37.8106,145.0848,5682.0,1472000.0 +u,VB,Southern Metropolitan,2,8.1,3161.0,2.0,1.0,0.0,-37.861,145.0136,6923.0,420000.0 +h,SP,Eastern Metropolitan,4,25.0,3155.0,4.0,1.0,730.0,-37.87377,145.28688,9704.0,783000.0 +h,S,Northern Metropolitan,2,5.5,3070.0,2.0,1.0,453.0,-37.7666,145.0132,11364.0,1170000.0 +h,S,Southern Metropolitan,4,6.3,3143.0,4.0,3.0,421.0,-37.8593,145.0275,4836.0,3660000.0 +u,S,Southern Metropolitan,2,3.5,3207.0,2.0,2.0,0.0,-37.84158,144.93809,8648.0,760000.0 +h,S,Western Metropolitan,2,6.4,3012.0,2.0,1.0,369.0,-37.79221,144.86408,5058.0,749000.0 +h,S,Southern Metropolitan,3,11.2,3186.0,3.0,1.0,340.0,-37.9205,145.0007,10579.0,1390000.0 +u,SP,Northern Metropolitan,3,12.1,3046.0,3.0,1.0,225.0,-37.7068,144.9467,2606.0,465000.0 +h,S,Southern Metropolitan,2,7.8,3124.0,2.0,1.0,633.0,-37.8427,145.0824,8920.0,1900000.0 +u,VB,Southern Metropolitan,2,6.1,3182.0,2.0,1.0,0.0,-37.8562,144.9844,13240.0,470000.0 +h,S,Southern Metropolitan,5,15.2,3191.0,5.0,3.0,545.0,-37.94953,145.00607,4497.0,2220000.0 +h,S,Southern Metropolitan,2,12.2,3147.0,2.0,1.0,583.0,-37.8693,145.1082,2894.0,995000.0 +h,S,Western Metropolitan,3,13.8,3018.0,3.0,1.0,604.0,-37.8631,144.8195,5301.0,740000.0 +u,S,Northern Metropolitan,2,1.5,3002.0,0.0,0.0,0.0,-37.8154,144.9851,3040.0,872000.0 +h,S,Southern Metropolitan,3,14.0,3166.0,3.0,1.0,548.0,-37.8951,145.101,3224.0,1033000.0 +h,S,Northern Metropolitan,3,12.0,3073.0,3.0,1.0,529.0,-37.72017,144.99873,21650.0,935000.0 +h,VB,Southern Metropolitan,3,11.2,3186.0,3.0,2.0,266.0,-37.9241,145.0028,10579.0,1600000.0 +h,S,Northern Metropolitan,2,9.9,3044.0,2.0,1.0,629.0,-37.7312,144.9399,7485.0,915000.0 +t,VB,Northern Metropolitan,2,8.5,3044.0,2.0,2.0,74.0,-37.72184,144.92969,7485.0,500000.0 +t,S,Western Metropolitan,4,8.0,3040.0,4.0,2.0,230.0,-37.7547,144.9239,9264.0,1162500.0 
+h,PI,Southern Metropolitan,4,5.4,3101.0,4.0,2.0,399.0,-37.80526,145.04294,10331.0,2450000.0 +h,S,Western Metropolitan,3,6.9,3039.0,3.0,2.0,253.0,-37.7708,144.9234,6232.0,1260000.0 +h,S,Northern Metropolitan,4,9.9,3044.0,4.0,3.0,590.0,-37.7302,144.9357,7485.0,1395000.0 +h,SP,Western Metropolitan,3,14.7,3030.0,3.0,2.0,312.0,-37.89273,144.72558999999995,15542.0,520000.0 +h,S,Northern Metropolitan,3,12.4,3060.0,2.0,1.0,583.0,-37.6949,144.9619,5070.0,550000.0 +u,S,Western Metropolitan,1,4.3,3032.0,1.0,1.0,887.0,-37.76878,144.89197,4918.0,301000.0 +u,S,Southern Metropolitan,2,0.7,3006.0,2.0,1.0,0.0,-37.8281,144.96627,8400.0,600000.0 +h,PI,Western Metropolitan,4,8.4,3015.0,3.0,3.0,217.0,-37.8344,144.8764,5498.0,860000.0 +u,SP,Western Metropolitan,2,8.7,3032.0,2.0,1.0,3967.0,-37.7706,144.8805,4918.0,400000.0 +h,S,Southern Metropolitan,4,9.7,3104.0,4.0,2.0,605.0,-37.79466,145.06564,7809.0,1755000.0 +h,S,Northern Metropolitan,4,6.4,3078.0,4.0,2.0,606.0,-37.7743,145.0316,2211.0,1830000.0 +h,S,Southern Metropolitan,3,17.9,3192.0,3.0,1.0,584.0,-37.96911,145.07271,9758.0,1145000.0 +h,S,Northern Metropolitan,3,11.2,3046.0,3.0,1.0,600.0,-37.70067,144.92689,8870.0,721000.0 +h,S,Southern Metropolitan,4,11.2,3127.0,4.0,2.0,734.0,-37.8286,145.092,5457.0,2200000.0 +h,S,Eastern Metropolitan,3,13.1,3128.0,3.0,1.0,662.0,-37.8246,145.1269,4605.0,1351000.0 +h,S,Eastern Metropolitan,2,9.4,3081.0,2.0,1.0,650.0,-37.7446,145.0404,2674.0,686000.0 +h,S,Southern Metropolitan,4,13.9,3165.0,4.0,2.0,592.0,-37.9351,145.0572,10969.0,1472500.0 +t,S,Northern Metropolitan,3,4.0,3057.0,3.0,2.0,138.0,-37.76292,144.97975,5533.0,959000.0 +h,S,Southern Metropolitan,3,17.9,3192.0,3.0,1.0,570.0,-37.96258,145.08038,9758.0,948000.0 +h,S,Northern Metropolitan,2,2.5,3067.0,2.0,2.0,98.0,-37.8005,144.9952,4019.0,1135000.0 +h,S,Northern Metropolitan,3,7.8,3058.0,3.0,1.0,233.0,-37.7549,144.9611,11204.0,1370000.0 +h,S,Northern Metropolitan,4,20.6,3064.0,4.0,2.0,560.0,-37.59943,144.91439,15510.0,600000.0 +h,S,Southern Metropolitan,5,7.2,3185.0,5.0,2.0,538.0,-37.89989,145.00883000000005,534.0,1880000.0 +u,SP,Northern Metropolitan,2,5.5,3070.0,2.0,1.0,1882.0,-37.768,144.9895,11364.0,418000.0 +h,S,Northern Metropolitan,3,15.3,3074.0,3.0,1.0,573.0,-37.67939000000001,145.00143,7955.0,720000.0 +h,S,Southern Metropolitan,3,2.1,3205.0,3.0,1.0,276.0,-37.838,144.9489,5943.0,2633000.0 +h,S,Northern Metropolitan,2,2.4,3121.0,2.0,1.0,135.0,-37.82241,145.00235,14949.0,1341000.0 +h,S,Eastern Metropolitan,2,7.8,3079.0,2.0,1.0,697.0,-37.77345,145.06098,1554.0,1601000.0 +h,S,Southern Metropolitan,2,3.8,3207.0,2.0,1.0,111.0,-37.835,144.9373,8648.0,825000.0 +h,S,Northern Metropolitan,4,11.2,3073.0,4.0,1.0,855.0,-37.6997,145.0043,21650.0,770000.0 +h,S,Northern Metropolitan,4,5.9,3055.0,4.0,2.0,299.0,-37.764,144.9454,7082.0,930000.0 +u,S,Southern Metropolitan,2,4.6,3122.0,2.0,1.0,2679.0,-37.8261,145.0269,11308.0,635000.0 +u,PI,Southern Metropolitan,2,7.7,3184.0,2.0,1.0,0.0,-37.8817,144.984,8989.0,630000.0 +h,S,Northern Metropolitan,2,4.0,3057.0,2.0,1.0,71.0,-37.77169,144.97557,5533.0,501000.0 +h,S,Northern Metropolitan,3,8.8,3072.0,3.0,2.0,227.0,-37.7445,144.9917,14577.0,1000000.0 +h,S,Southern Metropolitan,3,10.2,3127.0,3.0,2.0,385.0,-37.82886,145.10093,5457.0,1385000.0 +h,S,Eastern Metropolitan,3,13.4,3130.0,3.0,2.0,887.0,-37.8429,145.14895,4387.0,905000.0 +h,S,Western Victoria,4,31.7,3337.0,4.0,2.0,643.0,-37.68834,144.56803,3600.0,400000.0 +h,S,Southern Metropolitan,3,11.2,3186.0,3.0,3.0,482.0,-37.912,144.9994,10579.0,2450000.0 +h,S,South-Eastern 
Metropolitan,4,18.8,3170.0,4.0,2.0,790.0,-37.91375,145.16438,7113.0,992000.0 +h,PI,Northern Metropolitan,3,8.8,3072.0,3.0,2.0,490.0,-37.7413,145.0203,14577.0,1100000.0 +u,PI,Southern Metropolitan,2,7.8,3124.0,2.0,2.0,188.0,-37.8442,145.0668,8920.0,805000.0 +t,S,Southern Metropolitan,3,4.6,3181.0,3.0,2.0,15.0,-37.84969,145.00056999999995,7717.0,1463000.0 +h,S,Southern Metropolitan,4,16.7,3168.0,4.0,3.0,727.0,-37.90779000000001,145.15042,902.0,1211000.0 +h,S,Western Metropolitan,3,13.5,3042.0,3.0,1.0,0.0,-37.7217,144.8783,3464.0,805000.0 +h,S,Northern Metropolitan,1,3.2,3054.0,1.0,1.0,93.0,-37.7888,144.9698,3106.0,885000.0 +h,S,Northern Metropolitan,3,4.5,3057.0,3.0,1.0,104.0,-37.7723,144.9761,5533.0,998000.0 +h,S,Northern Metropolitan,3,12.1,3046.0,3.0,1.0,570.0,-37.7106,144.9491,2606.0,730000.0 +h,PI,Southern Metropolitan,5,9.7,3103.0,5.0,5.0,651.0,-37.8071,145.0908,5682.0,3250000.0 +u,SP,Southern Metropolitan,2,7.4,3144.0,2.0,1.0,0.0,-37.8633,145.0338,4675.0,566000.0 +t,S,Western Metropolitan,3,6.4,3012.0,3.0,2.0,209.0,-37.78707,144.87608,3873.0,810000.0 +h,S,Western Metropolitan,4,9.7,3041.0,4.0,2.0,607.0,-37.7376,144.9154,3284.0,1430000.0 +h,SP,Northern Metropolitan,3,2.5,3067.0,3.0,2.0,134.0,-37.8093,144.9944,4019.0,1465000.0 +u,S,Southern Metropolitan,2,4.6,3181.0,2.0,2.0,0.0,-37.85184,145.01049,7717.0,660000.0 +h,S,South-Eastern Metropolitan,3,24.7,3175.0,3.0,1.0,630.0,-37.96543,145.20338999999996,8322.0,560000.0 +h,S,South-Eastern Metropolitan,3,38.0,3199.0,3.0,1.0,713.0,-38.16483,145.16383000000005,17055.0,565000.0 +h,S,Southern Metropolitan,4,11.4,3204.0,4.0,2.0,603.0,-37.91655,145.02448,6795.0,1479000.0 +h,S,Southern Metropolitan,3,6.6,3183.0,3.0,1.0,178.0,-37.8663,144.9948,2952.0,1193000.0 +u,S,Southern Metropolitan,2,6.1,3182.0,2.0,1.0,0.0,-37.857,144.9867,13240.0,482000.0 +t,S,Northern Metropolitan,3,8.8,3072.0,3.0,2.0,242.0,-37.7506,145.0185,14577.0,880000.0 +u,SP,Southern Metropolitan,2,4.6,3181.0,2.0,1.0,0.0,-37.85924,145.00563,7717.0,500000.0 +u,S,Northern Metropolitan,2,5.9,3055.0,2.0,1.0,564.0,-37.7665,144.9425,7082.0,471000.0 +h,S,Southern Metropolitan,2,14.0,3166.0,2.0,1.0,553.0,-37.9001,145.0981,3224.0,1010000.0 +h,SP,Eastern Metropolitan,4,11.8,3105.0,4.0,3.0,604.0,-37.7621,145.086,4480.0,1300000.0 +t,PI,Western Metropolitan,4,8.4,3015.0,4.0,3.0,278.0,-37.8468,144.874,5498.0,930000.0 +u,PI,Southern Metropolitan,3,4.6,3122.0,3.0,3.0,0.0,-37.8144,145.0153,11308.0,1000000.0 +u,VB,Southern Metropolitan,1,1.2,3006.0,1.0,1.0,546.0,-37.8274,144.9587,8400.0,370000.0 +u,S,Southern Metropolitan,2,1.2,3006.0,2.0,1.0,0.0,-37.828,144.9683,8400.0,623000.0 +h,S,Western Metropolitan,3,9.2,3012.0,3.0,2.0,260.0,-37.7825,144.8833,3873.0,725000.0 +u,S,Southern Metropolitan,2,7.2,3184.0,2.0,1.0,797.0,-37.87449,144.99059,8989.0,720000.0 +h,SP,Northern Metropolitan,3,11.2,3073.0,3.0,1.0,551.0,-37.7194,145.0015,21650.0,801000.0 +h,S,Northern Metropolitan,3,12.0,3073.0,3.0,1.0,694.0,-37.72089,145.0153,21650.0,900000.0 +h,S,Western Metropolitan,3,6.8,3016.0,3.0,2.0,297.0,-37.85537,144.87578,802.0,1190000.0 +h,S,Western Metropolitan,4,8.0,3016.0,4.0,2.0,897.0,-37.8679,144.899,6380.0,3130000.0 +h,S,Northern Metropolitan,4,7.8,3058.0,4.0,1.0,531.0,-37.7473,144.9528,11204.0,1075000.0 +h,S,Western Metropolitan,3,4.3,3032.0,3.0,2.0,359.0,-37.78606,144.88459,4918.0,983000.0 +u,S,Southern Metropolitan,2,1.2,3006.0,2.0,2.0,0.0,-37.828,144.9683,8400.0,600000.0 +h,S,Northern Metropolitan,3,4.4,3031.0,2.0,1.0,313.0,-37.7853,144.9235,3593.0,1315000.0 +u,S,Southern 
Metropolitan,2,16.0,3190.0,2.0,1.0,158.0,-37.95147,145.04904,4794.0,680500.0 +h,SP,Western Metropolitan,3,8.4,3015.0,3.0,2.0,360.0,-37.835,144.8814,5498.0,900000.0 +u,S,Southern Metropolitan,2,4.6,3122.0,2.0,1.0,0.0,-37.8268,145.0257,11308.0,605000.0 +h,SA,Eastern Victoria,4,35.2,3806.0,4.0,4.0,603.0,-38.06788,145.33966999999996,17093.0,950000.0 +h,S,South-Eastern Metropolitan,5,20.0,3194.0,5.0,2.0,720.0,-37.98439000000001,145.06812,6162.0,1485000.0 +h,VB,Southern Metropolitan,4,9.2,3104.0,3.0,2.0,754.0,-37.786,145.0886,7809.0,1700000.0 +u,SP,Southern Metropolitan,2,2.7,3141.0,0.0,1.0,0.0,-37.84468,145.00368,14887.0,535000.0 +h,S,Northern Metropolitan,3,7.8,3058.0,2.0,1.0,537.0,-37.7443,144.9494,11204.0,1194500.0 +h,PI,Western Metropolitan,3,5.9,3032.0,3.0,1.0,555.0,-37.7767,144.9125,6567.0,1060000.0 +h,S,Northern Metropolitan,3,11.2,3073.0,3.0,1.0,460.0,-37.6917,144.9755,21650.0,601000.0 +h,VB,Northern Metropolitan,3,5.8,3078.0,3.0,1.0,553.0,-37.7728,145.0214,2970.0,1350000.0 +h,S,Northern Metropolitan,3,7.8,3058.0,3.0,1.0,527.0,-37.7359,144.9502,11204.0,1061000.0 +h,S,Northern Metropolitan,4,2.6,3121.0,4.0,1.0,434.0,-37.8185,145.0061,14949.0,1775000.0 +h,PI,Southern Metropolitan,2,2.1,3205.0,2.0,2.0,141.0,-37.8389,144.9612,5943.0,2000000.0 +u,S,Southern Metropolitan,2,3.8,3207.0,2.0,2.0,0.0,-37.8444,144.9421,8648.0,2250000.0 +h,S,Eastern Victoria,2,36.9,3782.0,2.0,1.0,1000.0,-37.93064,145.4453,2259.0,655000.0 +h,S,Northern Metropolitan,3,11.2,3073.0,3.0,1.0,4679.0,-37.7093,145.0012,21650.0,800000.0 +h,S,Eastern Metropolitan,3,13.8,3107.0,3.0,1.0,799.0,-37.7698,145.1028,5420.0,1002000.0 +h,SP,Western Victoria,4,31.7,3337.0,4.0,2.0,547.0,-37.69026,144.57866,3600.0,320000.0 +h,S,Southern Metropolitan,2,3.3,3206.0,2.0,1.0,149.0,-37.8443,144.9481,3280.0,1322500.0 +h,S,Southern Metropolitan,3,13.0,3204.0,3.0,1.0,700.0,-37.9228,145.0476,6795.0,1327500.0 +h,S,Eastern Metropolitan,4,19.9,3134.0,4.0,2.0,1016.0,-37.78372,145.25311000000005,7785.0,1725000.0 +h,SP,Western Metropolitan,3,15.5,3038.0,3.0,2.0,749.0,-37.72321,144.80882,3656.0,735000.0 +u,S,Southern Metropolitan,2,13.8,3165.0,2.0,2.0,212.0,-37.91786,145.08243000000004,10969.0,690000.0 +t,S,Southern Metropolitan,3,14.6,3189.0,3.0,2.0,472.0,-37.9387,145.0461,2555.0,917000.0 +h,S,Northern Metropolitan,3,12.0,3073.0,3.0,1.0,494.0,-37.71477,144.98235,21650.0,685000.0 +h,S,Northern Metropolitan,3,2.6,3121.0,3.0,1.0,209.0,-37.8206,144.9913,14949.0,1345000.0 +h,S,Northern Metropolitan,3,4.5,3057.0,3.0,1.0,812.0,-37.7673,144.9782,5533.0,1600000.0 +u,VB,Southern Metropolitan,2,6.3,3143.0,2.0,1.0,861.0,-37.8539,145.0284,4836.0,550000.0 +h,SP,Southern Metropolitan,3,10.2,3147.0,3.0,1.0,648.0,-37.8614,145.08436,3052.0,1840000.0 +h,S,Western Metropolitan,3,12.8,3033.0,3.0,2.0,630.0,-37.7378,144.8705,5629.0,825000.0 +h,PI,Northern Metropolitan,2,2.6,3121.0,2.0,1.0,95.0,-37.8193,144.9976,14949.0,900000.0 +h,S,Eastern Metropolitan,2,10.5,3081.0,2.0,1.0,620.0,-37.7377,145.0541,2947.0,650000.0 +h,PI,Southern Metropolitan,5,9.2,3146.0,5.0,2.0,654.0,-37.8515,145.0922,10412.0,2410000.0 +h,PI,Northern Metropolitan,3,11.2,3073.0,3.0,1.0,543.0,-37.6946,144.9962,21650.0,510000.0 +h,S,Eastern Metropolitan,4,13.8,3084.0,4.0,2.0,648.0,-37.7363,145.0851,2698.0,1042500.0 +h,S,Southern Metropolitan,3,11.2,3186.0,3.0,2.0,275.0,-37.9153,144.9924,10579.0,1875000.0 +h,S,Western Metropolitan,4,7.0,3013.0,4.0,1.0,766.0,-37.8126,144.8905,6543.0,1870000.0 +u,SP,Southern Metropolitan,1,10.4,3163.0,1.0,1.0,1162.0,-37.8911,145.0451,2403.0,260000.0 +u,SP,Southern 
Metropolitan,2,11.4,3163.0,2.0,1.0,0.0,-37.8893,145.0589,7822.0,525000.0 +u,S,Eastern Metropolitan,3,9.0,3079.0,3.0,2.0,356.0,-37.7749,145.0568,1554.0,1260000.0 +h,PI,Northern Metropolitan,3,5.2,3056.0,3.0,1.0,302.0,-37.758,144.9724,11918.0,835000.0 +u,S,Southern Metropolitan,3,5.6,3101.0,3.0,2.0,0.0,-37.8064,145.0182,10331.0,882000.0 +u,S,Northern Metropolitan,2,5.9,3055.0,2.0,1.0,0.0,-37.7752,144.9463,7082.0,400000.0 +h,SP,Western Metropolitan,3,9.2,3012.0,3.0,1.0,300.0,-37.7896,144.8752,3873.0,832000.0 +h,S,Southern Metropolitan,2,7.7,3184.0,2.0,1.0,249.0,-37.8778,144.9866,8989.0,1180000.0 +h,VB,Southern Metropolitan,3,9.2,3104.0,3.0,2.0,763.0,-37.7935,145.0865,7809.0,1700000.0 +u,S,Northern Metropolitan,2,11.2,3046.0,2.0,1.0,304.0,-37.70733,144.94041,2606.0,500000.0 +t,PI,Western Metropolitan,5,8.7,3032.0,5.0,3.0,98.0,-37.7843,144.8939,4918.0,1000000.0 +h,S,Western Metropolitan,4,18.4,3029.0,4.0,2.0,872.0,-37.87217,144.68746000000004,13830.0,723000.0 +h,S,Southern Metropolitan,4,13.7,3188.0,3.0,1.0,1504.0,-37.944,145.0015,5454.0,2600000.0 +h,S,Western Metropolitan,3,7.5,3040.0,3.0,3.0,226.0,-37.75485,144.9119,9264.0,1315000.0 +h,S,Western Metropolitan,4,8.0,3040.0,4.0,2.0,858.0,-37.7442,144.8934,9264.0,1500000.0 +h,S,Northern Metropolitan,3,12.0,3073.0,3.0,2.0,461.0,-37.72499000000001,145.00038,21650.0,1040000.0 +u,S,Southern Metropolitan,2,8.5,3185.0,2.0,1.0,99.0,-37.8951,145.0085,4898.0,700000.0 +h,S,Southern Metropolitan,4,13.0,3166.0,3.0,2.0,678.0,-37.898,145.0815,3145.0,1167500.0 +h,S,Southern Metropolitan,4,11.0,3147.0,4.0,3.0,480.0,-37.8726,145.0716,3052.0,2220000.0 +t,S,Northern Metropolitan,4,17.9,3082.0,4.0,1.0,199.0,-37.677,145.05658,10529.0,492000.0 +h,VB,Western Metropolitan,3,12.9,3043.0,3.0,1.0,529.0,-37.69485,144.89092,3285.0,650000.0 +u,S,Southern Metropolitan,3,11.2,3186.0,3.0,1.0,119.0,-37.8958,144.9994,10579.0,826000.0 +u,S,Northern Metropolitan,2,9.9,3044.0,2.0,1.0,139.0,-37.7315,144.9301,7485.0,485000.0 +h,S,Southern Metropolitan,3,13.9,3165.0,4.0,1.0,701.0,-37.9178,145.0815,10969.0,1140000.0 +h,S,Southern Metropolitan,4,5.1,3181.0,4.0,1.0,230.0,-37.8549,144.9948,4380.0,1605000.0 +h,S,Northern Metropolitan,3,2.3,3051.0,3.0,2.0,230.0,-37.7987,144.9434,6821.0,2161000.0 +h,S,Northern Metropolitan,3,12.4,3060.0,3.0,1.0,587.0,-37.7007,144.9632,5070.0,670000.0 +h,S,Northern Metropolitan,4,16.1,3088.0,4.0,3.0,807.0,-37.70077,145.12743,8524.0,900000.0 +h,S,Southern Metropolitan,4,11.8,3204.0,3.0,1.0,805.0,-37.9066,145.0354,3578.0,1920000.0 +t,S,Northern Metropolitan,2,11.2,3073.0,2.0,1.0,120.0,-37.7239,145.0083,21650.0,568000.0 +h,S,Northern Metropolitan,2,1.6,3066.0,3.0,1.0,282.0,-37.7985,144.9883,4553.0,1265000.0 +h,S,Southern Metropolitan,4,11.4,3204.0,4.0,2.0,567.0,-37.90998,145.03082,2397.0,1830000.0 +u,VB,Southern Metropolitan,1,8.1,3161.0,1.0,1.0,952.0,-37.8744,145.0371,6923.0,290000.0 +h,PI,Western Metropolitan,3,13.3,3020.0,3.0,1.0,541.0,-37.7669,144.8308,4217.0,490000.0 +h,S,Southern Metropolitan,3,13.7,3188.0,3.0,2.0,766.0,-37.9346,145.0049,5454.0,2513000.0 +h,S,Southern Metropolitan,5,13.0,3204.0,5.0,2.0,664.0,-37.9241,145.048,6795.0,1385000.0 +h,S,Northern Metropolitan,2,2.5,3067.0,3.0,1.0,220.0,-37.801,144.9989,4019.0,1097000.0 +h,PI,Northern Metropolitan,2,8.8,3072.0,2.0,1.0,319.0,-37.7442,144.9958,14577.0,630000.0 +t,S,Southern Metropolitan,3,2.1,3205.0,3.0,2.0,146.0,-37.8331,144.9665,5943.0,1450000.0 +h,VB,Northern Metropolitan,3,5.5,3070.0,3.0,1.0,440.0,-37.7663,145.0118,11364.0,1000000.0 +u,S,South-Eastern 
Metropolitan,2,21.5,3195.0,2.0,1.0,0.0,-38.00765,145.0935,3650.0,600000.0 +h,S,Northern Metropolitan,3,20.6,3064.0,3.0,2.0,255.0,-37.63382,144.92774,5833.0,425000.0 +h,S,Eastern Metropolitan,3,23.0,3136.0,3.0,1.0,850.0,-37.78297,145.28774,11925.0,840500.0 +h,VB,Southern Metropolitan,3,7.2,3184.0,3.0,2.0,225.0,-37.87614,144.99081,8989.0,1400000.0 +h,VB,Western Metropolitan,4,12.9,3043.0,4.0,2.0,620.0,-37.70377,144.90216999999996,1071.0,820000.0 +h,SP,Western Metropolitan,4,14.7,3030.0,4.0,2.0,531.0,-37.89556,144.64268,16166.0,510000.0 +h,PI,Southern Metropolitan,3,3.3,3141.0,3.0,2.0,358.0,-37.8425,145.0031,14887.0,2200000.0 +h,S,Western Metropolitan,2,8.0,3040.0,2.0,1.0,414.0,-37.7466,144.8941,9264.0,953000.0 +h,PI,Northern Metropolitan,2,5.5,3070.0,2.0,1.0,326.0,-37.7681,145.0119,11364.0,970000.0 +h,S,Southern Metropolitan,3,10.1,3163.0,3.0,2.0,602.0,-37.90248,145.05943,7822.0,1570000.0 +h,S,Eastern Metropolitan,3,7.9,3079.0,3.0,1.0,630.0,-37.7635,145.039,5549.0,1465000.0 +h,S,Northern Metropolitan,2,11.2,3046.0,2.0,1.0,378.0,-37.70175,144.93578,8870.0,605000.0 +h,S,Northern Metropolitan,3,4.4,3031.0,3.0,1.0,320.0,-37.7844,144.9359,3593.0,940000.0 +h,S,Southern Metropolitan,3,13.9,3165.0,3.0,1.0,576.0,-37.9362,145.0691,10969.0,1000000.0 +h,PI,Northern Metropolitan,3,4.2,3031.0,3.0,1.0,234.0,-37.7923,144.9332,5263.0,1010000.0 +u,SP,Northern Metropolitan,1,3.5,3068.0,1.0,1.0,0.0,-37.7925,144.9786,6244.0,358500.0 +h,S,Western Metropolitan,3,6.2,3015.0,3.0,1.0,343.0,-37.84502,144.88612,5498.0,1250000.0 +h,S,South-Eastern Metropolitan,4,16.7,3150.0,4.0,2.0,650.0,-37.91026,145.19298,7392.0,1200000.0 +h,VB,Northern Metropolitan,3,11.5,3046.0,3.0,1.0,730.0,-37.7167,144.9216,2651.0,1400000.0 +h,S,Eastern Metropolitan,3,13.1,3128.0,3.0,1.0,702.0,-37.827,145.1209,4605.0,1325000.0 +u,SP,Western Metropolitan,2,14.0,3021.0,2.0,1.0,218.0,-37.74839,144.7735,1899.0,415000.0 +u,SP,Southern Metropolitan,2,7.7,3184.0,2.0,2.0,0.0,-37.8744,144.9888,8989.0,1122000.0 +h,SP,South-Eastern Metropolitan,4,34.9,3201.0,3.0,1.0,646.0,-38.10346,145.18159,8060.0,546000.0 +t,S,Western Metropolitan,3,4.3,3032.0,3.0,2.0,231.0,-37.77373,144.9311,6567.0,1130000.0 +t,S,Western Metropolitan,3,12.8,3033.0,3.0,2.0,224.0,-37.7426,144.868,5629.0,745000.0 +h,S,Southern Metropolitan,3,4.6,3122.0,3.0,1.0,199.0,-37.8172,145.0243,11308.0,1551000.0 +h,S,Western Metropolitan,3,8.4,3015.0,2.0,2.0,281.0,-37.8431,144.8845,5498.0,950000.0 +u,S,Southern Metropolitan,2,3.3,3141.0,2.0,1.0,0.0,-37.8452,145.0001,14887.0,499000.0 +h,PI,Western Metropolitan,3,6.4,3011.0,3.0,1.0,413.0,-37.7931,144.893,7570.0,930000.0 +t,S,Western Metropolitan,3,6.4,3012.0,3.0,2.0,262.0,-37.78284,144.88098,3873.0,895000.0 +t,S,Western Metropolitan,3,10.4,3042.0,3.0,1.0,242.0,-37.72893,144.88859,3464.0,730000.0 +h,S,Northern Metropolitan,3,8.8,3072.0,3.0,1.0,594.0,-37.7512,145.017,14577.0,690000.0 +h,VB,Eastern Metropolitan,3,23.0,3136.0,3.0,2.0,815.0,-37.7969,145.29273999999995,11925.0,720000.0 +h,S,Northern Metropolitan,2,3.4,3031.0,2.0,1.0,193.0,-37.78749000000001,144.93203,3593.0,1100000.0 +h,SP,Eastern Metropolitan,3,13.8,3084.0,3.0,1.0,698.0,-37.7343,145.0837,2698.0,870000.0 +h,S,Western Metropolitan,3,11.7,3033.0,3.0,3.0,326.0,-37.74009,144.87787,5629.0,1295000.0 +u,S,Southern Metropolitan,2,7.7,3184.0,2.0,1.0,0.0,-37.8758,144.9874,8989.0,717000.0 +h,PI,Western Metropolitan,3,8.0,3016.0,3.0,2.0,289.0,-37.8593,144.8881,6380.0,975000.0 +u,S,Northern Metropolitan,2,2.6,3121.0,2.0,1.0,0.0,-37.823,144.9982,14949.0,712000.0 +h,S,Southern 
Metropolitan,3,9.2,3104.0,3.0,1.0,464.0,-37.7945,145.0667,7809.0,1300000.0 +t,SP,Western Metropolitan,3,8.9,3016.0,3.0,2.0,146.0,-37.857,144.8846,802.0,720000.0 +h,S,Southern Metropolitan,4,11.7,3125.0,4.0,2.0,438.0,-37.8599,145.1101,5678.0,1255000.0 +h,S,Northern Metropolitan,3,8.8,3072.0,3.0,1.0,520.0,-37.7362,145.0232,14577.0,810000.0 +h,S,Eastern Metropolitan,2,8.9,3084.0,2.0,1.0,1313.0,-37.74694,145.07048,3540.0,1310000.0 +h,S,Eastern Metropolitan,5,8.9,3084.0,5.0,3.0,694.0,-37.73968,145.07973,3540.0,1170000.0 +h,PI,Western Metropolitan,3,8.0,3040.0,3.0,1.0,477.0,-37.7499,144.9127,9264.0,825000.0 +h,PI,Northern Metropolitan,4,5.2,3056.0,4.0,1.0,678.0,-37.75967,144.97214,11918.0,1400000.0 +h,PI,Eastern Metropolitan,4,13.8,3084.0,4.0,2.0,780.0,-37.7308,145.0932,2698.0,1155000.0 +h,S,Southern Metropolitan,6,6.3,3143.0,5.0,3.0,1491.0,-37.8602,145.013,4836.0,5525000.0 +h,S,Eastern Metropolitan,3,16.7,3150.0,3.0,2.0,648.0,-37.88255,145.14727,15321.0,1550000.0 +h,PI,Northern Metropolitan,5,13.0,3046.0,5.0,3.0,700.0,-37.7095,144.9253,8870.0,1150000.0 +t,S,Southern Metropolitan,3,5.6,3101.0,3.0,1.0,111.0,-37.7941,145.0238,10331.0,690000.0 +h,VB,Western Metropolitan,5,7.5,3040.0,5.0,3.0,607.0,-37.75148,144.88519,588.0,2200000.0 +h,S,Southern Metropolitan,3,8.5,3185.0,3.0,2.0,492.0,-37.8789,145.0048,4898.0,1800000.0 +h,S,South-Eastern Metropolitan,4,15.5,3167.0,4.0,2.0,564.0,-37.93543,145.08408,3692.0,1120000.0 +u,S,Southern Metropolitan,2,4.6,3142.0,2.0,1.0,1119.0,-37.8498,145.0173,7217.0,620000.0 +u,S,Eastern Metropolitan,2,8.8,3081.0,2.0,1.0,94.0,-37.74432,145.04721,2674.0,444000.0 +h,S,Northern Metropolitan,4,5.5,3070.0,4.0,2.0,440.0,-37.7852,144.9975,11364.0,2270000.0 +h,S,Northern Metropolitan,3,16.3,3075.0,3.0,1.0,852.0,-37.67601,145.02955,8279.0,801000.0 +h,S,Southern Metropolitan,2,13.7,3188.0,2.0,1.0,650.0,-37.9338,145.0143,5454.0,1789000.0 +h,S,Western Metropolitan,4,6.8,3016.0,4.0,3.0,650.0,-37.85925,144.88761,6380.0,2200000.0 +h,S,Southern Metropolitan,5,5.6,3101.0,5.0,3.0,853.0,-37.8151,145.0555,10331.0,4350000.0 +u,S,Northern Metropolitan,3,4.2,3031.0,3.0,1.0,4440.0,-37.7898,144.9233,5263.0,551000.0 +h,S,Southern Metropolitan,3,6.2,3123.0,3.0,2.0,304.0,-37.83277,145.04373,6482.0,1768000.0 +h,S,South-Eastern Metropolitan,3,31.2,3197.0,3.0,2.0,355.0,-38.07196,145.13036,1989.0,825000.0 +h,S,Southern Metropolitan,4,10.2,3147.0,4.0,1.0,778.0,-37.86301,145.11158999999995,2894.0,1425000.0 +u,S,Southern Metropolitan,2,1.2,3006.0,2.0,2.0,2955.0,-37.8299,144.9679,8400.0,622500.0 +t,S,Western Metropolitan,3,8.7,3032.0,3.0,2.0,225.0,-37.7867,144.8906,4918.0,1011000.0 +h,S,Northern Metropolitan,3,5.5,3070.0,3.0,1.0,501.0,-37.7771,145.0116,11364.0,1610000.0 +u,PI,Northern Metropolitan,2,5.8,3078.0,2.0,1.0,0.0,-37.7755,145.0178,2970.0,505000.0 +t,S,Northern Metropolitan,4,3.0,3067.0,4.0,2.0,102.0,-37.80116,145.00066,4019.0,1525000.0 +h,PI,Southern Metropolitan,5,9.2,3104.0,3.0,4.0,886.0,-37.804,145.0951,7809.0,3250000.0 +u,PI,Southern Metropolitan,3,13.9,3165.0,3.0,2.0,252.0,-37.9085,145.0683,10969.0,800000.0 +u,VB,Southern Metropolitan,2,4.5,3181.0,2.0,1.0,0.0,-37.8588,145.0024,7717.0,570000.0 +h,VB,Northern Metropolitan,2,5.9,3055.0,2.0,1.0,0.0,-37.7711,144.9487,7082.0,545000.0 +h,SP,Northern Metropolitan,4,11.2,3073.0,4.0,1.0,605.0,-37.7051,145.0331,21650.0,800000.0 +h,VB,Western Metropolitan,6,8.0,3040.0,7.0,2.0,870.0,-37.7517,144.9018,9264.0,1800000.0 +t,S,Western Metropolitan,3,8.2,3012.0,3.0,1.0,203.0,-37.7928,144.8811,5058.0,815000.0 +t,S,Northern 
Metropolitan,5,13.0,3046.0,5.0,1.0,227.0,-37.6979,144.941,8870.0,645000.0 +h,SP,Northern Metropolitan,3,5.2,3056.0,3.0,2.0,401.0,-37.76491,144.95253,11918.0,1405000.0 +h,S,Western Metropolitan,3,13.5,3042.0,3.0,1.0,614.0,-37.7194,144.8827,3464.0,790000.0 +h,SP,Northern Metropolitan,3,12.0,3073.0,3.0,2.0,247.0,-37.70952,145.02823999999995,21650.0,635000.0 +h,PI,South-Eastern Metropolitan,3,24.7,3175.0,3.0,1.0,597.0,-37.98265,145.22696000000005,10894.0,620000.0 +h,S,Western Metropolitan,4,6.2,3039.0,4.0,2.0,465.0,-37.76306,144.92851000000005,6232.0,1450000.0 +h,S,Southern Metropolitan,3,11.7,3125.0,3.0,1.0,931.0,-37.854,145.1195,5678.0,1755000.0 +h,SP,Southern Metropolitan,5,9.7,3103.0,5.0,3.0,678.0,-37.8063,145.0705,5682.0,3600000.0 +h,S,Western Metropolitan,2,6.9,3039.0,2.0,1.0,428.0,-37.773,144.9332,6232.0,940000.0 +h,S,Western Metropolitan,3,8.4,3015.0,3.0,1.0,588.0,-37.8485,144.8909,5498.0,1530000.0 +h,S,South-Eastern Metropolitan,3,38.0,3199.0,3.0,2.0,595.0,-38.16135,145.13374,17055.0,690000.0 +h,PI,Northern Metropolitan,5,13.0,3046.0,5.0,3.0,487.0,-37.7088,144.92600000000004,8870.0,850000.0 +h,PI,Northern Metropolitan,4,7.0,3071.0,4.0,2.0,348.0,-37.75582,144.98951,8870.0,1605000.0 +h,SP,Northern Metropolitan,3,12.1,3083.0,3.0,1.0,541.0,-37.70603,145.05423000000005,10175.0,715000.0 +t,SP,Southern Metropolitan,3,12.1,3163.0,3.0,2.0,0.0,-37.8939,145.0715,4442.0,860000.0 +h,S,Northern Metropolitan,3,5.2,3056.0,3.0,1.0,277.0,-37.7592,144.97321000000005,11918.0,905000.0 +t,S,Western Metropolitan,4,7.7,3015.0,3.0,3.0,278.0,-37.8261,144.8718,1223.0,840000.0 +u,S,Western Metropolitan,3,10.5,3020.0,3.0,1.0,284.0,-37.77933,144.81679,2185.0,675000.0 +u,VB,Southern Metropolitan,2,4.6,3122.0,2.0,1.0,747.0,-37.8287,145.035,11308.0,750000.0 +h,S,Western Metropolitan,3,7.8,3012.0,2.0,1.0,259.0,-37.8046,144.8831,1808.0,817000.0 +h,S,Eastern Metropolitan,4,10.5,3081.0,4.0,1.0,722.0,-37.7477,145.0565,2947.0,856000.0 +h,S,Southern Metropolitan,2,13.0,3204.0,2.0,1.0,274.0,-37.9178,145.0413,6795.0,910000.0 +t,VB,Southern Metropolitan,2,9.2,3146.0,2.0,1.0,0.0,-37.8506,145.0465,10412.0,600000.0 +h,S,Southern Metropolitan,4,2.1,3205.0,4.0,3.0,577.0,-37.8357,144.9502,5943.0,1615000.0 +h,S,Western Metropolitan,6,18.0,3037.0,6.0,4.0,694.0,-37.68178,144.73779,5556.0,935000.0 +u,S,Southern Metropolitan,2,7.2,3185.0,2.0,1.0,538.0,-37.88295,145.00083,4898.0,834500.0 +h,S,Northern Metropolitan,4,8.8,3072.0,4.0,2.0,530.0,-37.7416,145.016,14577.0,900000.0 +h,S,Southern Metropolitan,2,3.3,3141.0,2.0,1.0,367.0,-37.842,144.9873,14887.0,2430000.0 +u,VB,Western Metropolitan,2,12.8,3033.0,2.0,1.0,218.0,-37.7337,144.8617,5629.0,470000.0 +h,PI,Northern Metropolitan,4,8.8,3072.0,9.0,8.0,1254.0,-37.7367,144.9895,14577.0,760000.0 +h,S,Western Metropolitan,1,14.0,3021.0,1.0,1.0,617.0,-37.73385,144.80535,14042.0,565000.0 +h,VB,Eastern Metropolitan,4,23.0,3136.0,4.0,2.0,655.0,-37.79725,145.29651,11925.0,730000.0 +h,S,Northern Metropolitan,2,1.6,3065.0,3.0,1.0,95.0,-37.8032,144.9825,5825.0,1440000.0 +t,S,Western Metropolitan,3,13.8,3018.0,3.0,2.0,197.0,-37.8579,144.8181,5301.0,705000.0 +u,S,Southern Metropolitan,2,6.1,3182.0,2.0,1.0,0.0,-37.8679,144.9813,13240.0,509000.0 +u,S,Southern Metropolitan,2,4.6,3122.0,2.0,1.0,0.0,-37.8268,145.0344,11308.0,560000.0 +h,SP,Southern Metropolitan,3,2.1,3205.0,3.0,2.0,88.0,-37.8363,144.9628,5943.0,1600000.0 +u,PI,Northern Metropolitan,2,5.2,3056.0,2.0,1.0,0.0,-37.7738,144.9585,11918.0,1180000.0 +h,S,Southern Metropolitan,4,10.2,3147.0,4.0,2.0,668.0,-37.86096,145.10026000000005,2894.0,1540000.0 
+h,S,Southern Metropolitan,3,9.2,3146.0,3.0,2.0,668.0,-37.8489,145.0711,10412.0,1880000.0 +h,S,Western Metropolitan,3,8.4,3015.0,3.0,1.0,752.0,-37.8481,144.8721,5498.0,1370000.0 +h,S,Northern Metropolitan,3,7.0,3071.0,3.0,1.0,612.0,-37.76063,145.02139,8870.0,1031000.0 +h,S,Western Metropolitan,4,5.1,3011.0,4.0,2.0,457.0,-37.80803,144.89601000000005,2417.0,1870000.0 +u,S,Western Metropolitan,1,6.4,3011.0,1.0,1.0,0.0,-37.7907,144.8924,7570.0,227000.0 +h,S,Northern Metropolitan,4,3.4,3068.0,4.0,1.0,306.0,-37.7888,144.9929,2954.0,1825000.0 +h,SP,Northern Metropolitan,3,5.3,3070.0,3.0,1.0,375.0,-37.76397,144.99481,11364.0,1305000.0 +h,S,Western Metropolitan,3,13.5,3042.0,3.0,1.0,623.0,-37.7184,144.882,3464.0,805000.0 +h,SP,Southern Metropolitan,3,2.1,3205.0,3.0,2.0,171.0,-37.8391,144.9501,5943.0,1595000.0 +h,S,Northern Metropolitan,4,4.5,3057.0,4.0,2.0,227.0,-37.7776,144.9726,5533.0,1900000.0 +h,S,Western Metropolitan,2,8.4,3015.0,2.0,1.0,255.0,-37.8469,144.8766,5498.0,760000.0 +u,S,Southern Metropolitan,2,8.5,3185.0,2.0,1.0,0.0,-37.8817,145.0032,4898.0,632500.0 +h,S,Northern Metropolitan,5,20.5,3752.0,5.0,5.0,700.0,-37.6236,145.10629,7969.0,905000.0 +h,PI,Southern Metropolitan,3,4.6,3142.0,3.0,2.0,224.0,-37.8445,145.0124,7217.0,2400000.0 +u,SP,Northern Metropolitan,2,8.8,3072.0,2.0,1.0,0.0,-37.7499,145.0031,14577.0,413000.0 +u,S,Southern Metropolitan,2,5.1,3181.0,2.0,1.0,598.0,-37.8564,144.9971,4380.0,611000.0 +h,S,Western Metropolitan,4,6.4,3011.0,4.0,2.0,154.0,-37.7983,144.8911,7570.0,977000.0 +h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,535.0,-37.757,144.8313,4217.0,520000.0 +h,S,Northern Metropolitan,3,12.4,3060.0,3.0,2.0,297.0,-37.6994,144.9686,5070.0,631000.0 +t,S,Eastern Metropolitan,3,24.8,3156.0,3.0,2.0,239.0,-37.886,145.28646,10788.0,670000.0 +u,SP,Northern Metropolitan,1,1.5,3002.0,2.0,1.0,0.0,-37.8136,144.9892,3040.0,750000.0 +h,VB,Northern Metropolitan,2,2.4,3121.0,2.0,2.0,189.0,-37.81684,145.00606000000005,14949.0,1200000.0 +h,S,South-Eastern Metropolitan,4,20.0,3194.0,4.0,2.0,408.0,-37.98707,145.0609,6162.0,2025000.0 +h,S,Northern Metropolitan,2,1.6,3066.0,2.0,1.0,118.0,-37.7976,144.993,4553.0,670000.0 +h,S,Eastern Metropolitan,3,9.4,3081.0,2.0,1.0,413.0,-37.7399,145.0369,2674.0,465000.0 +h,S,South-Eastern Metropolitan,3,14.7,3167.0,3.0,1.0,774.0,-37.9178,145.0872,3692.0,1260500.0 +h,S,Western Metropolitan,4,7.0,3013.0,4.0,2.0,431.0,-37.8186,144.8764,6543.0,1028000.0 +u,SP,Southern Metropolitan,2,5.3,3122.0,2.0,1.0,279.0,-37.82676,145.04443999999995,11308.0,525000.0 +h,PI,Northern Metropolitan,4,2.6,3121.0,4.0,2.0,0.0,-37.8206,145.0099,14949.0,1510000.0 +h,S,Southern Metropolitan,4,4.1,3206.0,4.0,2.0,182.0,-37.8518,144.9663,2019.0,3120000.0 +h,S,Southern Metropolitan,3,14.0,3166.0,3.0,1.0,602.0,-37.8923,145.1013,3224.0,1045000.0 +h,S,Northern Metropolitan,3,5.5,3070.0,3.0,1.0,407.0,-37.7799,144.9937,11364.0,1715000.0 +h,PI,Southern Metropolitan,3,7.5,3123.0,3.0,1.0,477.0,-37.8257,145.0514,6482.0,1890000.0 +h,S,Eastern Metropolitan,3,13.9,3108.0,3.0,2.0,648.0,-37.7798,145.1368,9028.0,1152500.0 +u,SP,Eastern Metropolitan,1,7.9,3079.0,1.0,1.0,0.0,-37.7638,145.0458,5549.0,385000.0 +h,SA,Southern Metropolitan,4,14.6,3189.0,4.0,2.0,318.0,-37.9443,145.0448,2555.0,1140000.0 +h,SP,Western Metropolitan,4,12.8,3033.0,4.0,2.0,745.0,-37.7479,144.8805,5629.0,920000.0 +u,S,Southern Metropolitan,2,7.5,3123.0,2.0,1.0,172.0,-37.8437,145.0536,6482.0,706000.0 +h,SP,Northern Metropolitan,4,7.8,3058.0,4.0,2.0,398.0,-37.7364,144.9787,11204.0,901000.0 +u,S,Northern 
Metropolitan,3,1.9,3003.0,3.0,2.0,0.0,-37.8117,144.9518,2230.0,650000.0 +h,S,Northern Metropolitan,3,17.9,3082.0,3.0,1.0,290.0,-37.6669,145.04234,10529.0,470000.0 +t,SP,Western Metropolitan,3,8.0,3016.0,3.0,2.0,233.0,-37.8573,144.8911,6380.0,935000.0 +u,S,Southern Metropolitan,2,15.2,3191.0,2.0,1.0,0.0,-37.95142,145.01977,4497.0,760000.0 +u,SP,Southern Metropolitan,2,5.1,3181.0,2.0,2.0,0.0,-37.8563,144.9916,4380.0,1200000.0 +u,S,Southern Metropolitan,1,5.4,3101.0,1.0,1.0,0.0,-37.80468,145.03614,10331.0,515000.0 +h,S,Northern Metropolitan,3,20.6,3064.0,3.0,2.0,654.0,-37.6038,144.9226,15510.0,607500.0 +u,SP,Southern Metropolitan,1,4.6,3181.0,1.0,1.0,0.0,-37.85012,144.99225,7717.0,440000.0 +h,SP,Eastern Metropolitan,3,24.8,3156.0,3.0,1.0,968.0,-37.88523,145.28553,10788.0,740000.0 +h,S,Northern Metropolitan,3,5.9,3055.0,3.0,1.0,275.0,-37.76300000000001,144.9439,7082.0,793000.0 +h,S,Southern Metropolitan,3,4.5,3181.0,3.0,1.0,231.0,-37.8498,145.0039,7717.0,1530000.0 +h,S,Southern Metropolitan,4,10.7,3187.0,5.0,2.0,999.0,-37.9135,145.0189,6938.0,2840000.0 +u,S,Southern Metropolitan,2,8.1,3161.0,2.0,1.0,0.0,-37.8625,145.0103,6923.0,570000.0 +u,S,Northern Metropolitan,3,9.9,3044.0,3.0,1.0,204.0,-37.7234,144.9497,7485.0,600000.0 +h,S,Western Metropolitan,2,8.0,3016.0,2.0,1.0,265.0,-37.853,144.8928,6380.0,769000.0 +h,S,Southern Metropolitan,4,9.0,3126.0,4.0,2.0,796.0,-37.8266,145.072,3265.0,3145000.0 +u,S,Western Metropolitan,1,6.4,3011.0,1.0,1.0,0.0,-37.8006,144.881,7570.0,375000.0 +h,PI,Northern Metropolitan,3,4.5,3057.0,3.0,2.0,138.0,-37.7658,144.9823,5533.0,930000.0 +h,S,Southern Metropolitan,4,10.1,3163.0,4.0,2.0,632.0,-37.89465,145.06984,4442.0,1820000.0 +h,SP,Western Metropolitan,5,10.5,3034.0,5.0,2.0,574.0,-37.7639,144.8615,4502.0,850000.0 +u,S,Eastern Metropolitan,2,23.2,3153.0,2.0,1.0,418.0,-37.83777,145.26345,5030.0,520000.0 +h,S,Eastern Metropolitan,4,21.3,3135.0,4.0,2.0,842.0,-37.81109,145.25738,4407.0,1175000.0 +h,S,Southern Metropolitan,3,17.9,3192.0,3.0,2.0,638.0,-37.95569,145.06588,9758.0,1201000.0 +h,S,Western Metropolitan,3,14.5,3036.0,3.0,2.0,847.0,-37.72286,144.83252,2339.0,930000.0 +h,SA,Southern Metropolitan,4,10.7,3187.0,5.0,3.0,598.0,-37.9075,145.0248,6938.0,1650000.0 +h,S,Northern Metropolitan,3,12.0,3073.0,3.0,2.0,301.0,-37.72839000000001,145.00105,21650.0,700000.0 +u,PI,Southern Metropolitan,2,9.2,3104.0,2.0,2.0,196.0,-37.785,145.0961,7809.0,800000.0 +h,S,Southern Metropolitan,4,7.3,3146.0,4.0,1.0,632.0,-37.86024000000001,145.04674,10412.0,2305000.0 +u,SP,Southern Metropolitan,1,8.7,3162.0,1.0,1.0,811.0,-37.89614,145.01323,5051.0,295000.0 +h,S,Northern Metropolitan,3,11.2,3073.0,3.0,1.0,437.0,-37.7291,145.02100000000004,21650.0,705000.0 +u,S,Southern Metropolitan,1,7.7,3184.0,1.0,1.0,0.0,-37.8839,144.9903,8989.0,347000.0 +h,PI,Southern Metropolitan,6,7.9,3103.0,6.0,3.0,845.0,-37.80708,145.09698999999995,5682.0,2100000.0 +h,SP,Western Metropolitan,3,9.2,3012.0,3.0,1.0,285.0,-37.7813,144.8678,3873.0,673000.0 +h,PI,Southern Metropolitan,4,10.7,3187.0,4.0,2.0,0.0,-37.9292,145.0252,6938.0,1755000.0 +h,S,Eastern Metropolitan,2,10.5,3081.0,2.0,1.0,586.0,-37.7435,145.0486,2947.0,590000.0 +u,SP,Northern Metropolitan,2,11.2,3073.0,2.0,1.0,183.0,-37.7078,145.033,21650.0,440000.0 +h,S,Western Metropolitan,4,15.5,3038.0,4.0,2.0,660.0,-37.72086,144.79399999999995,3656.0,720000.0 +h,S,Northern Metropolitan,4,3.4,3031.0,4.0,2.0,616.0,-37.78474,144.9322,3593.0,2268000.0 +h,S,Western Metropolitan,3,6.4,3011.0,3.0,1.0,0.0,-37.7962,144.8851,7570.0,853000.0 +h,PI,Southern 
Metropolitan,5,7.9,3103.0,5.0,3.0,646.0,-37.81369,145.09486,5682.0,2180000.0 +h,S,Northern Metropolitan,4,9.9,3044.0,4.0,1.0,733.0,-37.7194,144.9307,7485.0,975000.0 +u,S,Northern Metropolitan,3,1.8,3053.0,3.0,1.0,0.0,-37.8052,144.9604,6786.0,875000.0 +u,SP,Southern Metropolitan,2,3.3,3141.0,2.0,1.0,0.0,-37.8394,144.9886,14887.0,775000.0 +u,S,Southern Metropolitan,2,4.6,3142.0,2.0,1.0,1087.0,-37.8373,145.0106,7217.0,867000.0 +h,SP,Southern Metropolitan,2,13.0,3204.0,2.0,1.0,296.0,-37.9243,145.0376,6795.0,855000.0 +h,S,Southern Metropolitan,4,13.9,3165.0,4.0,2.0,633.0,-37.9272,145.0644,10969.0,1100000.0 +h,S,Northern Metropolitan,3,6.5,3071.0,3.0,1.0,447.0,-37.7622,145.0074,8870.0,1200000.0 +h,S,Northern Metropolitan,3,6.4,3078.0,3.0,1.0,592.0,-37.7755,145.0288,2211.0,1540000.0 +h,S,Southern Metropolitan,3,13.8,3188.0,3.0,1.0,625.0,-37.94027,145.03593,2356.0,1335000.0 +h,S,Northern Metropolitan,3,12.0,3073.0,3.0,1.0,601.0,-37.72943,145.01904,21650.0,911000.0 +h,VB,Southern Metropolitan,4,9.2,3104.0,4.0,2.0,642.0,-37.7988,145.0726,7809.0,1900000.0 +u,S,Northern Metropolitan,2,16.3,3075.0,2.0,1.0,143.0,-37.67222,145.0231,8279.0,410000.0 +h,S,Eastern Metropolitan,3,13.9,3108.0,3.0,1.0,814.0,-37.7909,145.1309,9028.0,1355000.0 +h,S,Western Metropolitan,3,7.0,3013.0,3.0,2.0,428.0,-37.8167,144.8965,6543.0,871000.0 +t,VB,Southern Metropolitan,4,10.2,3147.0,4.0,2.0,338.0,-37.86058,145.08542,3052.0,1650000.0 +u,SP,Southern Metropolitan,2,9.8,3185.0,2.0,1.0,0.0,-37.8971,145.0093,534.0,393000.0 +h,SP,Southern Metropolitan,2,11.4,3204.0,2.0,1.0,368.0,-37.91304,145.0373,2397.0,1250000.0 +h,S,Northern Metropolitan,4,5.9,3055.0,4.0,1.0,384.0,-37.7733,144.9491,7082.0,988000.0 +h,S,Southern Metropolitan,3,3.8,3207.0,3.0,2.0,214.0,-37.8341,144.9458,8648.0,3705000.0 +h,S,Western Metropolitan,4,7.5,3040.0,4.0,2.0,951.0,-37.74997000000001,144.90752,9264.0,2165000.0 +h,SP,Western Metropolitan,3,13.3,3020.0,3.0,3.0,221.0,-37.7708,144.8401,4217.0,620000.0 +h,S,Northern Metropolitan,4,7.8,3058.0,4.0,1.0,856.0,-37.7432,144.9481,11204.0,1400000.0 +u,S,Southern Metropolitan,3,6.3,3143.0,3.0,2.0,1803.0,-37.8544,145.0164,4836.0,1635000.0 +h,S,Northern Metropolitan,3,9.2,3058.0,3.0,1.0,576.0,-37.7269,144.9654,3445.0,782000.0 +h,PI,Southern Metropolitan,4,13.0,3204.0,4.0,2.0,793.0,-37.9232,145.0502,6795.0,1260000.0 +h,S,Southern Metropolitan,3,5.4,3101.0,3.0,2.0,460.0,-37.80646,145.04651,10331.0,1950000.0 +h,S,Eastern Metropolitan,3,11.4,3084.0,3.0,1.0,583.0,-37.7346,145.0715,3540.0,860000.0 +u,S,Southern Metropolitan,1,7.7,3184.0,1.0,1.0,0.0,-37.8761,144.9871,8989.0,520000.0 +h,S,Northern Metropolitan,3,11.2,3073.0,3.0,1.0,563.0,-37.7258,145.0129,21650.0,787000.0 +h,VB,Southern Metropolitan,3,13.8,3188.0,3.0,1.0,400.0,-37.93434,145.00816,5454.0,1325000.0 +h,S,Northern Metropolitan,2,6.5,3071.0,2.0,1.0,333.0,-37.7642,145.0192,8870.0,874000.0 +h,PI,South-Eastern Metropolitan,4,21.5,3195.0,4.0,2.0,530.0,-38.00429000000001,145.10286000000005,3650.0,1400000.0 +u,S,Southern Metropolitan,2,11.2,3145.0,2.0,1.0,92.0,-37.8706,145.0578,8801.0,720000.0 +h,S,Western Metropolitan,3,18.4,3029.0,3.0,2.0,913.0,-37.85152,144.70802,13830.0,650000.0 +t,S,Western Metropolitan,2,8.7,3032.0,2.0,2.0,0.0,-37.7692,144.8905,4918.0,470000.0 +h,S,Northern Metropolitan,4,12.0,3073.0,4.0,1.0,870.0,-37.70433,145.02068,21650.0,905000.0 +h,S,Southern Metropolitan,3,7.5,3123.0,3.0,2.0,224.0,-37.8368,145.0508,6482.0,1890000.0 +h,VB,Western Metropolitan,4,6.3,3013.0,4.0,2.0,277.0,-37.8215,144.89614,6543.0,800000.0 +h,S,Southern 
Metropolitan,3,11.2,3145.0,3.0,1.0,615.0,-37.8846,145.0861,8801.0,1782500.0 +h,S,Eastern Metropolitan,4,11.8,3127.0,4.0,2.0,609.0,-37.8149,145.11,2079.0,1865000.0 +h,S,Western Metropolitan,3,7.0,3013.0,3.0,3.0,197.0,-37.8197,144.8875,6543.0,910000.0 +h,S,Southern Metropolitan,3,13.8,3188.0,3.0,1.0,1469.0,-37.93433,145.02983999999995,2356.0,1140000.0 +h,S,Southern Metropolitan,4,12.1,3163.0,4.0,2.0,660.0,-37.8932,145.0681,4442.0,1610000.0 +h,S,Southern Metropolitan,3,11.2,3186.0,3.0,1.0,366.0,-37.9038,145.0001,10579.0,1635000.0 +h,S,Southern Metropolitan,2,3.3,3141.0,2.0,1.0,141.0,-37.8408,144.9977,14887.0,1500000.0 +h,S,Western Metropolitan,3,9.1,3040.0,3.0,2.0,311.0,-37.7603,144.8921,1543.0,1860000.0 +h,SP,Northern Metropolitan,3,5.5,3070.0,3.0,1.0,472.0,-37.7765,145.0027,11364.0,1655000.0 +h,S,Northern Metropolitan,4,11.5,3046.0,4.0,1.0,780.0,-37.7182,144.9246,2651.0,1005000.0 +u,S,Western Metropolitan,3,9.1,3015.0,3.0,1.0,144.0,-37.8268,144.8675,984.0,508000.0 +u,S,Southern Metropolitan,2,4.1,3142.0,2.0,2.0,586.0,-37.84266,145.02355,7217.0,1140000.0 +h,S,Western Metropolitan,4,12.6,3020.0,4.0,2.0,504.0,-37.7952,144.8325,3755.0,930000.0 +h,S,Southern Metropolitan,5,9.0,3126.0,5.0,3.0,645.0,-37.8317,145.0738,3265.0,2930000.0 +u,PI,Southern Metropolitan,2,13.7,3188.0,2.0,1.0,0.0,-37.9436,145.0027,5454.0,500000.0 +h,SP,Western Metropolitan,4,11.1,3025.0,3.0,1.0,559.0,-37.8448,144.8529,5132.0,875000.0 +t,PI,Northern Metropolitan,3,6.4,3078.0,3.0,2.0,528.0,-37.7817,145.0319,2211.0,1285000.0 +h,S,Eastern Metropolitan,3,15.4,3131.0,3.0,1.0,590.0,-37.84308,145.168,4385.0,1011000.0 +h,S,Eastern Metropolitan,4,22.2,3179.0,4.0,2.0,960.0,-37.88992,145.22123,2206.0,1120000.0 +u,VB,Southern Metropolitan,1,4.6,3122.0,1.0,1.0,0.0,-37.8216,145.0343,11308.0,310000.0 +t,S,Southern Metropolitan,3,11.2,3186.0,3.0,2.0,352.0,-37.9139,145.0035,10579.0,1750000.0 +h,S,South-Eastern Metropolitan,4,38.0,3199.0,4.0,2.0,2716.0,-38.17488,145.1234,7566.0,1350000.0 +u,S,Western Metropolitan,2,8.0,3040.0,2.0,1.0,0.0,-37.7494,144.921,9264.0,453000.0 +h,S,Southern Metropolitan,3,11.4,3204.0,3.0,1.0,585.0,-37.91722,145.04836,6795.0,1640000.0 +u,S,Eastern Metropolitan,4,10.6,3084.0,4.0,2.0,1180.0,-37.7551,145.0646,2890.0,830000.0 +u,S,Northern Metropolitan,2,11.2,3073.0,2.0,1.0,0.0,-37.7112,144.9989,21650.0,335000.0 +h,S,Northern Metropolitan,4,5.7,3078.0,4.0,3.0,720.0,-37.77928,145.02993,2211.0,1680000.0 +u,S,Southern Metropolitan,1,5.3,3122.0,1.0,1.0,0.0,-37.81823,145.03972,11308.0,455000.0 +h,S,Northern Metropolitan,3,25.9,3754.0,3.0,2.0,400.0,-37.60697,145.09208,5812.0,517000.0 +u,S,Eastern Metropolitan,2,16.7,3150.0,2.0,1.0,345.0,-37.89919,145.14856,15321.0,656000.0 +u,S,Western Metropolitan,2,7.5,3040.0,2.0,1.0,0.0,-37.74444,144.92084,9264.0,420000.0 +u,S,Southern Metropolitan,2,3.3,3141.0,2.0,2.0,742.0,-37.8357,144.9867,14887.0,863000.0 +h,SP,Northern Metropolitan,3,12.4,3060.0,4.0,2.0,620.0,-37.7034,144.9663,5070.0,662000.0 +u,VB,Southern Metropolitan,2,10.1,3163.0,2.0,1.0,830.0,-37.88414,145.0528,7822.0,480000.0 +h,S,South-Eastern Metropolitan,3,28.8,3177.0,3.0,1.0,680.0,-37.99581,145.24098,3533.0,508000.0 +h,S,Northern Metropolitan,2,14.9,3087.0,2.0,1.0,605.0,-37.7118,145.088,2329.0,662000.0 +t,PI,Northern Metropolitan,2,11.2,3073.0,2.0,1.0,83.0,-37.7171,145.0019,21650.0,380000.0 +u,S,Western Metropolitan,3,11.2,3042.0,3.0,2.0,203.0,-37.7465,144.8867,2291.0,815000.0 +h,S,Western Metropolitan,3,6.3,3013.0,3.0,1.0,433.0,-37.81183,144.88568999999995,6543.0,1230000.0 +h,PI,Northern 
Metropolitan,4,19.6,3076.0,4.0,2.0,571.0,-37.63762,145.03538999999995,10926.0,658000.0 +h,PI,Southern Metropolitan,3,5.9,3144.0,3.0,2.0,312.0,-37.86162,145.03065,4675.0,1700000.0 +h,S,Northern Metropolitan,3,3.5,3068.0,3.0,2.0,177.0,-37.7851,144.9934,6244.0,2170000.0 +h,SP,Western Metropolitan,3,13.6,3043.0,3.0,2.0,691.0,-37.7089,144.8927,1071.0,700000.0 +h,S,Southern Metropolitan,3,12.2,3147.0,3.0,1.0,723.0,-37.8642,145.1109,2894.0,1120000.0 +u,S,Southern Metropolitan,3,13.8,3188.0,3.0,2.0,0.0,-37.93538,145.00224,5454.0,970000.0 +h,S,Northern Metropolitan,3,2.4,3121.0,3.0,2.0,93.0,-37.82921,145.00826,438.0,1837500.0 +u,S,Northern Metropolitan,2,4.2,3031.0,2.0,1.0,1111.0,-37.7896,144.9321,5263.0,666000.0 +h,PI,Northern Metropolitan,4,5.2,3056.0,4.0,1.0,264.0,-37.7611,144.9677,11918.0,1115000.0 +u,S,Southern Metropolitan,2,13.7,3188.0,2.0,1.0,0.0,-37.9419,145.0019,5454.0,587000.0 +h,S,Western Metropolitan,3,8.0,3040.0,3.0,1.0,687.0,-37.7585,144.9316,9264.0,1330000.0 +h,SP,Northern Metropolitan,5,5.5,3070.0,4.0,4.0,569.0,-37.7763,144.9964,11364.0,4300000.0 +t,SP,Northern Metropolitan,3,4.5,3057.0,3.0,2.0,116.0,-37.7667,144.9821,5533.0,899000.0 +h,SP,Eastern Victoria,3,36.9,3200.0,3.0,1.0,624.0,-38.1187,145.14973,2500.0,452000.0 +h,S,Western Metropolitan,2,10.5,3020.0,2.0,1.0,630.0,-37.76701,144.84444,4217.0,732500.0 +h,S,Northern Metropolitan,3,5.2,3056.0,3.0,2.0,470.0,-37.7788,144.9688,11918.0,1691500.0 +h,S,Northern Metropolitan,4,11.2,3073.0,3.0,1.0,771.0,-37.705,145.0035,21650.0,801000.0 +h,S,Western Metropolitan,2,8.2,3012.0,2.0,1.0,0.0,-37.7956,144.8762,5058.0,450000.0 +u,S,Western Metropolitan,2,8.0,3040.0,2.0,1.0,0.0,-37.7412,144.8974,9264.0,736000.0 +h,S,Northern Metropolitan,3,8.8,3072.0,3.0,1.0,495.0,-37.7521,145.0114,14577.0,890000.0 +u,S,Eastern Metropolitan,3,13.8,3107.0,3.0,1.0,349.0,-37.7694,145.1082,5420.0,641000.0 +h,S,Southern Metropolitan,2,4.6,3122.0,2.0,1.0,243.0,-37.8198,145.0245,11308.0,1535000.0 +h,VB,Southern Metropolitan,3,11.2,3127.0,3.0,1.0,763.0,-37.8148,145.0965,5457.0,1700000.0 +h,S,Northern Metropolitan,4,12.0,3073.0,4.0,2.0,650.0,-37.71556,145.01129,21650.0,1190000.0 +h,S,Western Metropolitan,6,8.0,3040.0,6.0,4.0,860.0,-37.7484,144.9025,9264.0,2620000.0 +h,S,Southern Metropolitan,6,5.6,3101.0,6.0,6.0,1334.0,-37.8029,145.0267,10331.0,6500000.0 +h,SP,South-Eastern Metropolitan,5,27.0,3196.0,5.0,6.0,900.0,-38.04105,145.1216,2546.0,1260000.0 +h,S,Northern Metropolitan,3,9.9,3044.0,3.0,1.0,735.0,-37.7282,144.9304,7485.0,1305000.0 +h,S,South-Eastern Metropolitan,3,24.7,3175.0,3.0,1.0,713.0,-37.98997,145.2254,10894.0,669000.0 +h,S,Southern Metropolitan,3,9.2,3104.0,3.0,1.0,635.0,-37.7982,145.0839,7809.0,1320000.0 +h,VB,Southern Metropolitan,3,3.3,3206.0,3.0,3.0,0.0,-37.8399,144.9577,3280.0,2800000.0 +h,PI,Southern Metropolitan,4,7.7,3184.0,4.0,2.0,560.0,-37.8822,144.9913,8989.0,2400000.0 +u,SP,Northern Metropolitan,1,2.5,3067.0,1.0,1.0,0.0,-37.8016,144.9988,4019.0,426000.0 +h,S,Northern Metropolitan,2,2.5,3067.0,2.0,1.0,195.0,-37.8084,144.9973,4019.0,1172500.0 +h,VB,Eastern Metropolitan,4,7.8,3079.0,4.0,2.0,585.0,-37.75722,145.04399999999995,5549.0,1200000.0 +t,S,Southern Metropolitan,3,5.4,3101.0,3.0,2.0,325.0,-37.81614,145.05056000000005,10331.0,1405000.0 +h,S,Western Metropolitan,4,12.9,3043.0,4.0,2.0,462.0,-37.69953,144.89941000000005,1071.0,931000.0 +u,S,Northern Metropolitan,3,2.6,3052.0,3.0,2.0,0.0,-37.7818,144.9576,2309.0,1360000.0 +u,S,Northern Metropolitan,1,2.6,3121.0,1.0,1.0,0.0,-37.8127,145.0094,14949.0,327000.0 +h,S,Western 
Metropolitan,3,12.6,3020.0,3.0,2.0,286.0,-37.7896,144.8369,3755.0,590000.0 +u,S,Southern Metropolitan,2,10.4,3163.0,2.0,2.0,0.0,-37.8878,145.0407,2403.0,695000.0 +u,SP,Western Metropolitan,2,6.4,3011.0,2.0,1.0,0.0,-37.8017,144.8957,7570.0,433000.0 +h,PI,Southern Metropolitan,4,9.2,3104.0,4.0,2.0,627.0,-37.7848,145.0788,7809.0,2000000.0 +h,S,Southern Metropolitan,5,13.8,3165.0,5.0,2.0,611.0,-37.93228,145.07026000000005,10969.0,1220000.0 +t,S,Southern Metropolitan,3,12.1,3163.0,1.0,2.0,242.0,-37.9017,145.0739,4442.0,1100000.0 +h,SP,Eastern Metropolitan,4,18.0,3095.0,4.0,2.0,816.0,-37.72308,145.14011000000005,6990.0,1065000.0 +u,SP,Northern Metropolitan,1,4.4,3031.0,1.0,1.0,2077.0,-37.7852,144.9216,3593.0,380000.0 +h,S,Eastern Metropolitan,4,17.2,3132.0,4.0,1.0,842.0,-37.81235,145.19756,6871.0,1302000.0 +u,VB,Northern Metropolitan,2,11.2,3073.0,2.0,1.0,151.0,-37.7126,144.99,21650.0,340000.0 +h,S,Western Metropolitan,3,13.5,3042.0,3.0,1.0,594.0,-37.7203,144.8757,3464.0,872500.0 +u,S,Northern Metropolitan,2,4.4,3031.0,2.0,1.0,0.0,-37.783,144.93200000000004,3593.0,476000.0 +h,S,Eastern Metropolitan,4,11.8,3105.0,4.0,2.0,723.0,-37.7724,145.1033,4480.0,1225000.0 +h,S,Eastern Metropolitan,4,13.8,3107.0,4.0,2.0,650.0,-37.7694,145.1222,5420.0,1421000.0 +h,SP,Eastern Metropolitan,4,14.3,3109.0,4.0,3.0,654.0,-37.79739,145.14671,10999.0,1320000.0 +u,VB,Southern Metropolitan,2,7.3,3146.0,2.0,1.0,200.0,-37.85743,145.0468,10412.0,580000.0 +h,S,Northern Metropolitan,3,5.7,3078.0,3.0,2.0,292.0,-37.77788,145.01963999999995,2970.0,1492000.0 +u,SP,Northern Metropolitan,2,5.9,3055.0,2.0,1.0,501.0,-37.7741,144.9449,7082.0,537000.0 +t,S,Northern Metropolitan,3,12.4,3060.0,3.0,3.0,247.0,-37.7133,144.975,5070.0,543500.0 +h,S,Southern Metropolitan,3,4.5,3181.0,3.0,1.0,199.0,-37.8494,145.005,7717.0,1442000.0 +u,S,Northern Metropolitan,2,6.4,3078.0,2.0,1.0,0.0,-37.7773,145.0314,2211.0,465000.0 +h,SP,Northern Metropolitan,4,3.4,3031.0,4.0,2.0,302.0,-37.7845,144.93582,3593.0,1340000.0 +t,S,Northern Metropolitan,2,13.0,3046.0,2.0,1.0,125.0,-37.7042,144.9211,8870.0,434500.0 +h,VB,Southern Metropolitan,5,11.2,3145.0,5.0,3.0,488.0,-37.8733,145.0507,8801.0,2500000.0 +h,SP,Northern Metropolitan,5,2.6,3121.0,5.0,3.0,618.0,-37.8157,145.0073,14949.0,3200000.0 +u,VB,Southern Metropolitan,2,6.4,3183.0,2.0,1.0,2283.0,-37.87078,144.99898000000005,2952.0,600000.0 +h,S,Western Metropolitan,3,8.0,3040.0,3.0,1.0,551.0,-37.7571,144.9335,9264.0,1134000.0 +h,S,Northern Metropolitan,4,12.4,3060.0,4.0,2.0,647.0,-37.7089,144.9695,5070.0,771000.0 +u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,187.0,-37.7098,144.9163,8870.0,470000.0 +u,S,Western Metropolitan,2,7.5,3040.0,2.0,1.0,0.0,-37.75465,144.9107,9264.0,468000.0 +u,PI,Southern Metropolitan,2,13.9,3165.0,3.0,2.0,0.0,-37.9212,145.0674,10969.0,380000.0 +h,S,Western Metropolitan,3,12.8,3033.0,3.0,1.0,412.0,-37.7459,144.8643,5629.0,583000.0 +h,SP,Western Metropolitan,4,8.0,3040.0,4.0,3.0,519.0,-37.7517,144.9086,9264.0,1600000.0 +u,SP,Northern Metropolitan,2,2.3,3051.0,2.0,1.0,0.0,-37.7988,144.9475,6821.0,528500.0 +t,S,Southern Metropolitan,3,13.9,3165.0,3.0,2.0,343.0,-37.9112,145.0738,10969.0,920000.0 +h,S,Northern Metropolitan,3,5.8,3078.0,3.0,1.0,235.0,-37.7698,145.0183,2970.0,1064000.0 +t,S,Southern Metropolitan,3,7.2,3184.0,3.0,2.0,797.0,-37.8867,144.99141,8989.0,1360000.0 +u,SP,Southern Metropolitan,2,9.2,3146.0,2.0,1.0,0.0,-37.85,145.0461,10412.0,411000.0 +h,S,Eastern Metropolitan,4,11.8,3127.0,4.0,2.0,813.0,-37.8118,145.1065,2079.0,2335000.0 +h,S,Northern 
Metropolitan,4,3.5,3068.0,4.0,2.0,233.0,-37.7786,144.9818,6244.0,1950000.0 +h,S,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,697.0,-37.7097,144.9256,8870.0,895000.0 +t,S,Eastern Metropolitan,3,7.9,3079.0,3.0,0.0,219.0,-37.7679,145.0466,5549.0,1060000.0 +u,S,Southern Metropolitan,3,11.8,3204.0,3.0,2.0,199.0,-37.9034,145.0409,3578.0,960000.0 +h,PI,Eastern Metropolitan,3,16.7,3150.0,4.0,2.0,1590.0,-37.86957,145.17543999999995,15321.0,3056000.0 +h,VB,Southern Metropolitan,4,9.2,3104.0,4.0,2.0,610.0,-37.7843,145.0891,7809.0,1740000.0 +h,S,Northern Metropolitan,3,20.6,3064.0,3.0,1.0,368.0,-37.63693,144.92581,5833.0,400000.0 +t,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,258.0,-37.7047,144.9087,8870.0,595000.0 +h,PI,Southern Metropolitan,3,13.0,3204.0,3.0,1.0,694.0,-37.9315,145.0445,6795.0,1165000.0 +h,S,Southern Metropolitan,2,4.6,3181.0,2.0,1.0,136.0,-37.85542,144.99571,4380.0,957500.0 +h,SP,Western Metropolitan,3,6.4,3012.0,3.0,1.0,377.0,-37.79285,144.86969,5058.0,879000.0 +h,SA,Northern Metropolitan,3,16.3,3075.0,3.0,1.0,535.0,-37.67324,145.03513,8279.0,690000.0 +u,S,Northern Metropolitan,2,11.2,3073.0,2.0,1.0,141.0,-37.7149,145.0009,21650.0,424000.0 +h,PI,Southern Metropolitan,3,11.2,3145.0,3.0,2.0,700.0,-37.8807,145.0547,8801.0,1915000.0 +u,S,Western Metropolitan,2,14.7,3030.0,2.0,1.0,471.0,-37.88275,144.66661000000005,16166.0,302500.0 +h,SP,Western Metropolitan,4,8.0,3016.0,4.0,3.0,338.0,-37.8633,144.8963,6380.0,2910000.0 +u,SP,Northern Metropolitan,1,3.4,3031.0,1.0,1.0,0.0,-37.78993,144.92306000000005,5263.0,392000.0 +u,S,Western Metropolitan,2,8.0,3040.0,2.0,1.0,0.0,-37.7598,144.9128,9264.0,606000.0 +h,S,Southern Metropolitan,4,13.8,3188.0,4.0,2.0,743.0,-37.94176,145.00931,5454.0,3150000.0 +h,SP,Northern Metropolitan,3,5.5,3070.0,3.0,1.0,318.0,-37.7733,145.0078,11364.0,1090000.0 +h,S,Western Metropolitan,4,18.4,3029.0,4.0,2.0,572.0,-37.87373,144.69131000000004,13830.0,537000.0 +u,S,Northern Metropolitan,2,5.8,3078.0,2.0,1.0,0.0,-37.7762,145.0213,2970.0,438000.0 +h,S,Southern Metropolitan,3,14.6,3189.0,2.0,1.0,576.0,-37.9363,145.0495,2555.0,975000.0 +u,SP,Western Metropolitan,3,4.3,3032.0,3.0,2.0,3215.0,-37.78344,144.92183,6567.0,870000.0 +h,S,Southern Metropolitan,3,13.9,3165.0,3.0,1.0,721.0,-37.9244,145.0665,10969.0,1200000.0 +u,S,Southern Metropolitan,1,10.1,3163.0,1.0,1.0,0.0,-37.89166,145.06718,4442.0,330000.0 +h,S,Eastern Metropolitan,3,13.9,3108.0,3.0,1.0,651.0,-37.7827,145.1051,9028.0,1300000.0 +u,SP,Southern Metropolitan,2,13.0,3204.0,2.0,1.0,116.0,-37.9245,145.0336,6795.0,650000.0 +t,S,Southern Metropolitan,3,13.0,3204.0,3.0,3.0,338.0,-37.9166,145.0418,6795.0,1375000.0 +h,S,Northern Metropolitan,3,11.2,3073.0,3.0,2.0,527.0,-37.7238,144.9964,21650.0,921500.0 +h,S,Southern Metropolitan,3,5.6,3101.0,3.0,1.0,633.0,-37.7916,145.0352,10331.0,1211000.0 +h,S,Northern Metropolitan,2,7.8,3058.0,2.0,1.0,457.0,-37.7505,144.9725,11204.0,1008000.0 +h,SP,Northern Metropolitan,5,20.4,3059.0,5.0,4.0,602.0,-37.65039,144.89948,4864.0,830000.0 +h,VB,Southern Metropolitan,3,2.1,3205.0,3.0,2.0,204.0,-37.8357,144.9531,5943.0,1650000.0 +h,S,Southern Metropolitan,3,7.8,3124.0,3.0,3.0,417.0,-37.831,145.0621,8920.0,3300000.0 +h,VB,Eastern Metropolitan,2,8.8,3081.0,2.0,1.0,588.0,-37.73764,145.05323,2947.0,780000.0 +h,S,Southern Metropolitan,3,11.7,3125.0,3.0,1.0,695.0,-37.8551,145.1121,5678.0,1151000.0 +u,S,Southern Metropolitan,2,16.0,3190.0,2.0,1.0,130.0,-37.94874,145.02631,4794.0,677500.0 +h,S,Southern Metropolitan,3,13.0,3166.0,2.0,1.0,715.0,-37.9072,145.0762,3145.0,1316000.0 +h,SP,Northern 
Metropolitan,2,12.1,3046.0,2.0,1.0,591.0,-37.7128,144.9471,2606.0,545000.0 +h,S,Western Metropolitan,3,7.0,3013.0,3.0,2.0,230.0,-37.8124,144.8875,6543.0,1414000.0 +h,PI,Western Metropolitan,4,12.6,3020.0,4.0,1.0,603.0,-37.7945,144.8267,3755.0,670000.0 +h,S,Southern Metropolitan,3,2.1,3205.0,3.0,2.0,197.0,-37.8395,144.9489,5943.0,2240000.0 +u,SP,Northern Metropolitan,3,6.5,3071.0,3.0,1.0,242.0,-37.7639,145.0135,8870.0,880000.0 +t,S,Southern Metropolitan,3,12.1,3163.0,3.0,2.0,0.0,-37.8969,145.0654,4442.0,785000.0 +h,S,Southern Metropolitan,2,6.3,3143.0,2.0,1.0,181.0,-37.8542,145.01506,4836.0,1460000.0 +h,S,South-Eastern Metropolitan,3,33.3,3976.0,3.0,2.0,646.0,-38.03378,145.2621,8256.0,561000.0 +h,S,Western Metropolitan,4,7.0,3013.0,4.0,1.0,445.0,-37.8228,144.8769,6543.0,985500.0 +h,S,Southern Metropolitan,3,8.5,3185.0,3.0,2.0,439.0,-37.892,145.0103,4898.0,1940000.0 +u,S,Southern Metropolitan,2,11.2,3186.0,2.0,1.0,122.0,-37.9188,144.9942,10579.0,905500.0 +h,S,Southern Metropolitan,3,13.7,3188.0,3.0,1.0,495.0,-37.9428,145.0092,5454.0,1403000.0 +u,PI,Northern Metropolitan,1,4.2,3031.0,1.0,1.0,0.0,-37.7896,144.9321,5263.0,370000.0 +t,S,Southern Metropolitan,4,10.7,3187.0,4.0,3.0,898.0,-37.9176,145.0087,6938.0,2570000.0 +t,VB,Eastern Metropolitan,3,9.0,3079.0,3.0,2.0,180.0,-37.772,145.0538,1554.0,1050000.0 +h,S,South-Eastern Metropolitan,3,22.2,3172.0,3.0,2.0,533.0,-37.986,145.12035,3940.0,816000.0 +h,S,Northern Metropolitan,2,11.2,3046.0,2.0,1.0,716.0,-37.71589,144.92176,2651.0,1006000.0 +h,S,Western Metropolitan,3,6.6,3011.0,3.0,3.0,168.0,-37.8066,144.887,2417.0,1330000.0 +h,S,Western Metropolitan,3,13.5,3020.0,3.0,1.0,700.0,-37.7845,144.8131,6763.0,660000.0 +h,S,Northern Metropolitan,4,20.6,3064.0,4.0,2.0,484.0,-37.58012,144.91998,15510.0,523000.0 +h,PI,Northern Metropolitan,3,20.4,3059.0,3.0,2.0,775.0,-37.64385,144.89221,4864.0,770000.0 +u,S,Southern Metropolitan,2,3.3,3141.0,2.0,1.0,0.0,-37.8406,145.0035,14887.0,550000.0 +h,S,Eastern Metropolitan,4,16.7,3150.0,4.0,2.0,768.0,-37.89965,145.17135,15321.0,1292000.0 +h,S,Western Metropolitan,3,9.7,3041.0,3.0,2.0,371.0,-37.7229,144.9055,3284.0,1021000.0 +h,PI,Northern Metropolitan,3,8.8,3072.0,3.0,1.0,244.0,-37.7444,145.0202,14577.0,485000.0 +h,PI,Western Metropolitan,3,9.2,3012.0,3.0,1.0,584.0,-37.7858,144.8757,3873.0,760000.0 +h,S,Western Metropolitan,3,8.0,3016.0,3.0,1.0,470.0,-37.8587,144.8871,6380.0,1270000.0 +h,SP,Western Metropolitan,3,6.4,3012.0,3.0,1.0,275.0,-37.81167,144.88346,1808.0,900000.0 +h,S,Western Metropolitan,3,10.5,3020.0,3.0,1.0,631.0,-37.78956,144.84526,3755.0,805000.0 +h,S,Southern Metropolitan,2,6.3,3143.0,2.0,1.0,249.0,-37.8543,145.016,4836.0,1210000.0 +u,S,Southern Metropolitan,2,11.4,3163.0,2.0,1.0,0.0,-37.8981,145.0619,7822.0,635000.0 +h,S,Eastern Metropolitan,3,17.2,3132.0,3.0,1.0,684.0,-37.82771,145.20763,6871.0,1071000.0 +h,S,Southern Metropolitan,4,11.2,3127.0,4.0,2.0,486.0,-37.8258,145.1116,5457.0,1530000.0 +h,S,Western Metropolitan,4,14.7,3030.0,4.0,2.0,612.0,-37.88177,144.74221,15542.0,765000.0 +h,SP,Southern Metropolitan,3,11.4,3204.0,3.0,1.0,401.0,-37.92999,145.04932,6795.0,1045000.0 +h,S,South-Eastern Metropolitan,4,21.5,3195.0,4.0,2.0,695.0,-38.00477,145.10078,3650.0,1440000.0 +h,PI,Northern Metropolitan,2,11.2,3073.0,2.0,1.0,101.0,-37.7091,145.0259,21650.0,375000.0 +h,S,Eastern Metropolitan,3,18.0,3095.0,3.0,2.0,795.0,-37.72503,145.15142,6990.0,915000.0 +h,S,Southern Metropolitan,3,7.5,3123.0,3.0,2.0,185.0,-37.8255,145.0481,6482.0,1405000.0 +u,VB,Southern 
Metropolitan,1,11.4,3163.0,1.0,1.0,0.0,-37.8983,145.0627,7822.0,260000.0 +h,S,Western Metropolitan,4,5.9,3032.0,4.0,2.0,418.0,-37.7766,144.9187,6567.0,1430000.0 +h,S,Southern Metropolitan,3,9.2,3146.0,3.0,2.0,217.0,-37.8639,145.0641,10412.0,1425000.0 +u,PI,Western Metropolitan,2,5.1,3011.0,2.0,1.0,0.0,-37.78895,144.89014,7570.0,270000.0 +h,S,Northern Metropolitan,3,11.2,3046.0,3.0,1.0,655.0,-37.70608,144.92541,8870.0,830000.0 +h,S,Western Metropolitan,4,8.0,3040.0,3.0,2.0,725.0,-37.7551,144.9047,9264.0,1775000.0 +t,SP,Southern Metropolitan,3,11.2,3127.0,3.0,2.0,302.0,-37.8289,145.1005,5457.0,1325000.0 +h,S,Western Metropolitan,3,27.2,3024.0,3.0,2.0,300.0,-37.88162,144.62082,5262.0,475000.0 +h,PI,Northern Metropolitan,4,5.5,3070.0,4.0,2.0,282.0,-37.7709,145.0057,11364.0,1310000.0 +h,S,Northern Metropolitan,3,12.4,3060.0,3.0,1.0,531.0,-37.7022,144.9669,5070.0,605000.0 +u,VB,Eastern Metropolitan,3,23.2,3153.0,3.0,2.0,535.0,-37.82481,145.26519,3598.0,700000.0 +h,S,Southern Metropolitan,2,13.0,3204.0,2.0,1.0,599.0,-37.925,145.046,6795.0,1275000.0 +h,S,Northern Metropolitan,3,3.1,3003.0,3.0,1.0,121.0,-37.80973,144.9472,2230.0,1370000.0 +h,S,Northern Metropolitan,4,4.5,3057.0,4.0,2.0,470.0,-37.7735,144.9833,5533.0,1550000.0 +u,PI,Northern Metropolitan,1,2.6,3121.0,1.0,1.0,0.0,-37.8127,145.0094,14949.0,380000.0 +h,S,Western Metropolitan,3,31.7,3429.0,3.0,1.0,582.0,-37.58025,144.71759,14092.0,487000.0 +h,SP,Northern Metropolitan,3,6.5,3071.0,2.0,2.0,417.0,-37.7595,145.0017,8870.0,1400000.0 +h,S,Eastern Metropolitan,5,15.4,3131.0,5.0,3.0,477.0,-37.84252,145.17258,4385.0,1205000.0 +u,VB,Eastern Metropolitan,2,8.9,3084.0,2.0,1.0,2020.0,-37.75692,145.06426000000005,2890.0,450000.0 +h,PI,Northern Metropolitan,3,2.6,3121.0,3.0,2.0,345.0,-37.8159,144.994,14949.0,2180000.0 +t,VB,Southern Metropolitan,3,11.7,3125.0,3.0,2.0,267.0,-37.8463,145.1071,5678.0,975000.0 +h,S,Western Metropolitan,4,8.0,3040.0,4.0,2.0,291.0,-37.7544,144.9325,9264.0,1670000.0 +h,SP,Western Metropolitan,3,6.4,3011.0,3.0,1.0,242.0,-37.7956,144.8848,7570.0,775000.0 +h,SP,South-Eastern Metropolitan,4,14.7,3167.0,4.0,2.0,539.0,-37.9413,145.0957,3692.0,937500.0 +h,PI,Western Metropolitan,5,12.6,3020.0,4.0,2.0,690.0,-37.7933,144.8408,3755.0,895000.0 +h,S,Southern Metropolitan,4,10.7,3187.0,2.0,2.0,676.0,-37.9198,145.0137,6938.0,1720000.0 +u,S,Southern Metropolitan,3,14.3,3189.0,3.0,2.0,301.0,-37.94511,145.04426999999995,2555.0,828000.0 +u,S,Southern Metropolitan,2,7.8,3124.0,2.0,1.0,0.0,-37.8357,145.0595,8920.0,810000.0 +h,PI,Eastern Metropolitan,4,13.8,3107.0,4.0,2.0,654.0,-37.7639,145.1145,5420.0,1100000.0 +h,PI,Eastern Metropolitan,4,14.3,3109.0,4.0,2.0,775.0,-37.80126,145.1665,10999.0,1470000.0 +h,SP,Northern Metropolitan,2,4.2,3031.0,2.0,1.0,183.0,-37.7959,144.9342,5263.0,1211000.0 +t,S,Eastern Metropolitan,4,13.9,3108.0,4.0,2.0,234.0,-37.7932,145.1275,9028.0,973500.0 +h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,568.0,-37.7727,144.8417,4217.0,790000.0 +u,PI,Western Metropolitan,1,13.9,3020.0,2.0,1.0,36.0,-37.7833,144.8266,2185.0,145000.0 +h,PI,Eastern Metropolitan,4,13.1,3128.0,3.0,2.0,729.0,-37.8233,145.1267,4605.0,1560000.0 +u,S,Northern Metropolitan,2,3.6,3068.0,2.0,1.0,0.0,-37.78745,145.00061000000005,2954.0,556000.0 +h,S,Northern Metropolitan,3,14.0,3047.0,3.0,1.0,341.0,-37.6886,144.92281,4294.0,386000.0 +h,S,Eastern Metropolitan,4,14.7,3151.0,4.0,2.0,592.0,-37.84839,145.14299,4048.0,1600000.0 +u,VB,Northern Metropolitan,2,1.8,3053.0,2.0,1.0,0.0,-37.7939,144.9663,6786.0,480000.0 +u,S,Eastern 
Metropolitan,2,17.2,3132.0,2.0,1.0,300.0,-37.80574,145.18832,6871.0,695000.0 +h,S,Southern Metropolitan,3,5.4,3101.0,3.0,2.0,650.0,-37.80359,145.06002,10331.0,2200000.0 +h,S,Northern Metropolitan,3,5.3,3070.0,3.0,1.0,507.0,-37.77124,145.002,11364.0,1605000.0 +h,S,Northern Metropolitan,2,5.2,3055.0,2.0,1.0,539.0,-37.77279,144.94069,7082.0,1150000.0 +h,S,South-Eastern Metropolitan,3,18.8,3170.0,3.0,1.0,656.0,-37.93271,145.17792,7113.0,840000.0 +u,S,Northern Metropolitan,3,2.8,3000.0,2.0,2.0,0.0,-37.8095,144.9691,17496.0,760000.0 diff --git a/test/unit/data/test_data_categorical.py b/test/unit/data/test_data_categorical.py new file mode 100644 index 0000000000..c7742b954c --- /dev/null +++ b/test/unit/data/test_data_categorical.py @@ -0,0 +1,205 @@ +import os + +import numpy as np +import pandas as pd +import pytest + +from fedot.api.api_utils.api_data import ApiDataProcessor +from fedot.core.data.data import InputData +from fedot.core.repository.tasks import Task, TaskTypesEnum +from fedot.core.utils import fedot_project_root + + +def get_dataset_with_cats(output_mode: str = None): + path_to_csv = fedot_project_root().joinpath('test/data/melb_data.csv') + df = pd.read_csv(path_to_csv) + + if output_mode == 'path': + return path_to_csv, 'Price' + + elif output_mode == 'dataframe': + return df.drop(['Price'], axis=1), df['Price'] + + elif output_mode == 'numpy': + return df.drop(['Price'], axis=1).to_numpy(), df.Price.to_numpy(), df.columns.values + + +def get_dataset_without_cats(output_mode: str = None): + path_to_csv = fedot_project_root().joinpath('test/data/scoring/scoring_train.csv') + df = pd.read_csv(path_to_csv) + df = df.drop(['ID'], axis=1) + + if output_mode == 'path': + return path_to_csv, 'target' + + elif output_mode == 'dataframe': + return df.drop(['target'], axis=1), df['target'] + + elif output_mode == 'numpy': + return df.drop(['target'], axis=1).to_numpy(), df.target.to_numpy(), df.columns.values + + +@pytest.mark.parametrize('categorical_idx, expected_idx_after_opening, expected_idx_after_preprocessing', [ + (None, None, np.array([0, 1, 2, 3, 6, 7])), + ([], np.array([]), np.array([0, 1, 2])), + (np.array([]), np.array([]), np.array([0, 1, 2])), + (['Type', 'Method', 'Regionname'], np.array([0, 1, 2]), np.array([0, 1, 2])), + (np.array(['Type', 'Method', 'Regionname']), np.array([0, 1, 2]), np.array([0, 1, 2])), + ([0, 1, 2], np.array([0, 1, 2]), np.array([0, 1, 2])), + (np.array([0, 1, 2]), np.array([0, 1, 2]), np.array([0, 1, 2])) +]) +def test_from_numpy_with_cats(categorical_idx, expected_idx_after_opening, expected_idx_after_preprocessing): + X, y, features_names = get_dataset_with_cats(output_mode='numpy') + + input_data = InputData.from_numpy( + features_array=X, + target_array=y, + features_names=features_names, + categorical_idx=categorical_idx, + task='regression' + ) + + if isinstance(input_data.categorical_idx, np.ndarray): + assert (input_data.categorical_idx == expected_idx_after_opening).all() + else: + assert input_data.categorical_idx == expected_idx_after_opening + + data_preprocessor = ApiDataProcessor(task=Task(TaskTypesEnum.classification)) + preprocessed_input_data = data_preprocessor.fit_transform(input_data) + + assert (preprocessed_input_data.categorical_idx == expected_idx_after_preprocessing).all() + + +@pytest.mark.parametrize('categorical_idx, expected_idx_after_opening, expected_idx_after_preprocessing', [ + (None, None, np.array([0, 1, 2, 3, 6, 7])), + ([], np.array([]), np.array([0, 1, 2])), + (np.array([]), np.array([]), np.array([0, 1, 
2])), + (['Type', 'Method', 'Regionname'], np.array([0, 1, 2]), np.array([0, 1, 2])), + (np.array(['Type', 'Method', 'Regionname']), np.array([0, 1, 2]), np.array([0, 1, 2])), + ([0, 1, 2], np.array([0, 1, 2]), np.array([0, 1, 2])), + (np.array([0, 1, 2]), np.array([0, 1, 2]), np.array([0, 1, 2])) +]) +def test_from_dataframe_with_cats(categorical_idx, expected_idx_after_opening, expected_idx_after_preprocessing): + X_df, y_df = get_dataset_with_cats(output_mode='dataframe') + + input_data = InputData.from_dataframe( + features_df=X_df, + target_df=y_df, + categorical_idx=categorical_idx, + ) + + if isinstance(input_data.categorical_idx, np.ndarray): + assert (input_data.categorical_idx == expected_idx_after_opening).all() + else: + assert input_data.categorical_idx == expected_idx_after_opening + + data_preprocessor = ApiDataProcessor(task=Task(TaskTypesEnum.classification)) + preprocessed_input_data = data_preprocessor.fit_transform(input_data) + + assert (preprocessed_input_data.categorical_idx == expected_idx_after_preprocessing).all() + + +@pytest.mark.parametrize('categorical_idx, expected_idx_after_opening, expected_idx_after_preprocessing', [ + (None, None, np.array([0, 1, 2, 3, 6, 7])), + ([], np.array([]), np.array([0, 1, 2])), + (np.array([]), np.array([]), np.array([0, 1, 2])), + (['Type', 'Method', 'Regionname'], np.array([0, 1, 2]), np.array([0, 1, 2])), + (np.array(['Type', 'Method', 'Regionname']), np.array([0, 1, 2]), np.array([0, 1, 2])), + ([0, 1, 2], np.array([0, 1, 2]), np.array([0, 1, 2])), + (np.array([0, 1, 2]), np.array([0, 1, 2]), np.array([0, 1, 2])) +]) +def test_from_csv_with_cats(categorical_idx, expected_idx_after_opening, expected_idx_after_preprocessing): + path, target_columns = get_dataset_with_cats(output_mode='path') + + input_data = InputData.from_csv( + file_path=path, + target_columns=target_columns, + categorical_idx=categorical_idx + ) + + if isinstance(input_data.categorical_idx, np.ndarray): + assert (input_data.categorical_idx == expected_idx_after_opening).all() + else: + assert input_data.categorical_idx == expected_idx_after_opening + + data_preprocessor = ApiDataProcessor(task=Task(TaskTypesEnum.classification)) + preprocessed_input_data = data_preprocessor.fit_transform(input_data) + + assert (preprocessed_input_data.categorical_idx == expected_idx_after_preprocessing).all() + + +@pytest.mark.parametrize('categorical_idx, expected_idx_after_opening, expected_idx_after_preprocessing', [ + (None, None, np.array([2, 6, 7, 9])), + ([], np.array([]), np.array([])), + (np.array([]), np.array([]), np.array([])), +]) +def test_from_numpy_without_cats(categorical_idx, expected_idx_after_opening, expected_idx_after_preprocessing): + X, y, features_names = get_dataset_without_cats(output_mode='numpy') + + input_data = InputData.from_numpy( + features_array=X, + target_array=y, + features_names=features_names, + categorical_idx=categorical_idx, + task='regression' + ) + + if isinstance(input_data.categorical_idx, np.ndarray): + assert (input_data.categorical_idx == expected_idx_after_opening).all() + else: + assert input_data.categorical_idx == expected_idx_after_opening + + data_preprocessor = ApiDataProcessor(task=Task(TaskTypesEnum.classification)) + preprocessed_input_data = data_preprocessor.fit_transform(input_data) + + assert (preprocessed_input_data.categorical_idx == expected_idx_after_preprocessing).all() + + +@pytest.mark.parametrize('categorical_idx, expected_idx_after_opening, expected_idx_after_preprocessing', [ + (None, None, 
np.array([2, 6, 7, 9])), + ([], np.array([]), np.array([])), + (np.array([]), np.array([]), np.array([])), +]) +def test_from_dataframe_without_cats(categorical_idx, expected_idx_after_opening, expected_idx_after_preprocessing): + X_df, y_df = get_dataset_without_cats(output_mode='dataframe') + + input_data = InputData.from_dataframe( + features_df=X_df, + target_df=y_df, + categorical_idx=categorical_idx, + ) + + if isinstance(input_data.categorical_idx, np.ndarray): + assert (input_data.categorical_idx == expected_idx_after_opening).all() + else: + assert input_data.categorical_idx == expected_idx_after_opening + + data_preprocessor = ApiDataProcessor(task=Task(TaskTypesEnum.classification)) + preprocessed_input_data = data_preprocessor.fit_transform(input_data) + + assert (preprocessed_input_data.categorical_idx == expected_idx_after_preprocessing).all() + + +@pytest.mark.parametrize('categorical_idx, expected_idx_after_opening, expected_idx_after_preprocessing', [ + (None, None, np.array([2, 6, 7, 9])), + ([], np.array([]), np.array([])), + (np.array([]), np.array([]), np.array([])), +]) +def test_from_csv_without_cats(categorical_idx, expected_idx_after_opening, expected_idx_after_preprocessing): + path, target_columns = get_dataset_without_cats(output_mode='path') + + input_data = InputData.from_csv( + file_path=path, + target_columns=target_columns, + categorical_idx=categorical_idx + ) + + if isinstance(input_data.categorical_idx, np.ndarray): + assert (input_data.categorical_idx == expected_idx_after_opening).all() + else: + assert input_data.categorical_idx == expected_idx_after_opening + + data_preprocessor = ApiDataProcessor(task=Task(TaskTypesEnum.classification)) + preprocessed_input_data = data_preprocessor.fit_transform(input_data) + + assert (preprocessed_input_data.categorical_idx == expected_idx_after_preprocessing).all() From b1cfadc4c81c2a10f76845a4b6e4383e716cdd13 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Thu, 22 Aug 2024 19:08:57 +0300 Subject: [PATCH 52/69] Fixing for test_metrics with py3.10 --- test/data/expected_metric_values.json | 12 ++++++------ test/unit/composer/test_metrics.py | 12 +++++++++++- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/test/data/expected_metric_values.json b/test/data/expected_metric_values.json index 4b81051a1d..5018aa8d3f 100644 --- a/test/data/expected_metric_values.json +++ b/test/data/expected_metric_values.json @@ -31,14 +31,14 @@ "rmse_pen": 52.64510049434378 }, "multitarget": { - "rmse": 15.753366859480218, - "mse": 377.5025166058113, + "rmse": [15.753366859480218, 15.715344581042293], + "mse": [377.5025166058113, 375.07708740234375], "neg_mean_squared_log_error": 0.030627538521796293, "mape": 0.15337090733886807, - "smape": 14.144394353302935, - "mae": 13.50645038033778, - "r2": -2.9713973901034954, - "rmse_pen": 15.784873593199178 + "smape": [14.144394353302935, 14.117428843762253], + "mae": [13.50645038033778, 13.459635416666666], + "r2": [-2.9713973901034954, -2.960510176151834], + "rmse_pen": [15.784873593199178, 15.746775270204378] }, "ts": { "mase": 0.6080909603204148, diff --git a/test/unit/composer/test_metrics.py b/test/unit/composer/test_metrics.py index b8b868a9e7..d16bb40aac 100644 --- a/test/unit/composer/test_metrics.py +++ b/test/unit/composer/test_metrics.py @@ -134,7 +134,17 @@ def test_metrics(metric: ClassificationMetricsEnum, pipeline_func: Callable[[], if not update_expected_values: expected_value = expected_values[task_type][str(metric)] - assert np.isclose(metric_value, 
expected_value, rtol=0.001, atol=0.001) + + if isinstance(expected_value, list): + expression_expected_value = [] + + for value in expected_value: + expression_expected_value.append(np.isclose(metric_value, value, rtol=0.001, atol=0.001)) + assert any(expression_expected_value) + + else: + assert np.isclose(metric_value, expected_value, rtol=0.001, atol=0.001) + assert not np.isclose(metric_value, metric_class.default_value, rtol=0.01, atol=0.01) else: with open(fedot_project_root() / 'test/data/expected_metric_values.json', 'w') as f: From 888f484250ae9699bf86b2eeaf4f749894a9a9da Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Fri, 23 Aug 2024 14:25:10 +0300 Subject: [PATCH 53/69] Fix test_from_ ... with broadcast --- fedot/core/data/data.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index b382eb6839..fa684e7a3b 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -145,12 +145,12 @@ def from_dataframe(cls, if isinstance(categorical_idx, list): categorical_idx = np.array(categorical_idx) - if categorical_idx != np.array([]) and isinstance(categorical_idx[0], str) and features_names is None: + if categorical_idx.size != 0 and isinstance(categorical_idx[0], str) and features_names is None: raise ValueError( 'Impossible to specify categorical features by name when the features_names are not specified' ) - if categorical_idx != np.array([]) and isinstance(categorical_idx[0], str): + if categorical_idx.size != 0 and isinstance(categorical_idx[0], str): categorical_idx = np.array( [idx for idx, column in enumerate(features_names) if column in set(categorical_idx)] ) @@ -222,12 +222,12 @@ def from_csv(cls, if isinstance(categorical_idx, list): categorical_idx = np.array(categorical_idx) - if categorical_idx != np.array([]) and isinstance(categorical_idx[0], str) and features_names is None: + if categorical_idx.size != 0 and isinstance(categorical_idx[0], str) and features_names is None: raise ValueError( 'Impossible to specify categorical features by name when the features_names are not specified' ) - if categorical_idx != np.array([]) and isinstance(categorical_idx[0], str): + if categorical_idx.size != 0 and isinstance(categorical_idx[0], str): categorical_idx = np.array( [idx for idx, column in enumerate(features_names) if column in set(categorical_idx)] ) @@ -928,12 +928,12 @@ def array_to_input_data(features_array: np.ndarray, if isinstance(categorical_idx, list): categorical_idx = np.array(categorical_idx) - if categorical_idx != np.array([]) and isinstance(categorical_idx[0], str) and features_names is None: + if categorical_idx.size != 0 and isinstance(categorical_idx[0], str) and features_names is None: raise ValueError( 'Impossible to specify categorical features by name when the features_names are not specified' ) - if categorical_idx != np.array([]) and isinstance(categorical_idx[0], str): + if categorical_idx.size != 0 and isinstance(categorical_idx[0], str): categorical_idx = np.array( [idx for idx, column in enumerate(features_names) if column in set(categorical_idx)] ) From f963d099af55fa59e904237692718cd633885eaf Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Fri, 23 Aug 2024 17:42:24 +0300 Subject: [PATCH 54/69] Hide preprocessing messages under debug logging (2) --- fedot/api/api_utils/api_data.py | 8 +++--- fedot/core/data/data.py | 9 ++++--- fedot/preprocessing/data_types.py | 17 +++++++----- fedot/preprocessing/preprocessing.py | 40 ++++++++++++++-------------- 
4 files changed, 40 insertions(+), 34 deletions(-) diff --git a/fedot/api/api_utils/api_data.py b/fedot/api/api_utils/api_data.py index 9607ad40aa..7ecd150249 100644 --- a/fedot/api/api_utils/api_data.py +++ b/fedot/api/api_utils/api_data.py @@ -143,16 +143,16 @@ def fit_transform(self, train_data: InputData) -> InputData: self.log.message( f'Train Data (Original) Memory Usage: {memory_usage} Data Shapes: {features_shape, target_shape}') - self.log.message('- Obligatory preprocessing started') + self.log.debug('- Obligatory preprocessing started') train_data = self.preprocessor.obligatory_prepare_for_fit(data=train_data) - self.log.message('- Optional preprocessing started') + self.log.debug('- Optional preprocessing started') train_data = self.preprocessor.optional_prepare_for_fit(pipeline=Pipeline(), data=train_data) - self.log.message('- Converting indexes for fitting started') + self.log.debug('- Converting indexes for fitting started') train_data = self.preprocessor.convert_indexes_for_fit(pipeline=Pipeline(), data=train_data) - self.log.message('- Reducing memory started') + self.log.debug('- Reducing memory started') train_data = self.preprocessor.reduce_memory_size(data=train_data) train_data.supplementary_data.is_auto_preprocessed = True diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index fa684e7a3b..43160ac2eb 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -178,7 +178,7 @@ def from_csv(cls, task: Union[Task, str] = 'classification', data_type: DataTypesEnum = DataTypesEnum.table, columns_to_drop: Optional[List[Union[str, int]]] = None, - target_columns: Union[str, List[Union[str, int]]] = '', + target_columns: Union[str, List[Union[str, int]], None] = '', categorical_idx: Union[list[int, str], np.ndarray[int, str]] = None, index_col: Optional[Union[str, int]] = None, possible_idx_keywords: Optional[List[str]] = None) -> InputData: @@ -210,11 +210,12 @@ def from_csv(cls, df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_drop=columns_to_drop) idx = df.index.to_numpy() - if not target_columns: - features_names = df.columns.to_numpy()[:-1] - else: + if target_columns: features_names = df.drop(target_columns, axis=1).columns.to_numpy() + else: + features_names = df.columns.to_numpy() + features, target = process_target_and_features(df, target_columns) categorical_features = None diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 566cdafbde..52ed706648 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -88,7 +88,7 @@ def convert_data_for_fit(self, data: InputData): column_types_info = self.prepare_column_types_info(predictors=data.features, target=data.target, task=data.task) data.supplementary_data.col_type_ids = column_types_info col_types_info_message = prepare_log_message_with_cols_types(column_types_info, data.features_names) - self.log.message(f'--- The information about types of each feature are {col_types_info_message}') + self.log.message(f'The detected types of data are as follows: {col_types_info_message}') self._into_numeric_features_transformation_for_fit(data) # Launch conversion float and integer features into categorical self._into_categorical_features_transformation_for_fit(data) @@ -320,11 +320,15 @@ def _into_categorical_features_transformation_for_fit(self, data: InputData): if np.size(all_cat_col_ids) > 0: if data.features_names is not None: cat_features_names = data.features_names[all_cat_col_ids] - 
self.log.message(f'--- Preprocessing define next cols {cat_features_names} as categorical') + self.log.message( + f'Preprocessing defines the following columns as categorical: {cat_features_names}' + ) else: - self.log.message(f'--- Preprocessing define next cols {all_cat_col_ids} as categorical') + self.log.message( + f'Preprocessing defines the following columns as categorical: {all_cat_col_ids}' + ) else: - self.log.message('--- Preprocessing was unable to define the categorical columns') + self.log.message('Preprocessing was unable to define the categorical columns') def _into_categorical_features_transformation_for_predict(self, data: InputData): """ Apply conversion into categorical string column for every signed column """ @@ -534,13 +538,14 @@ def _process_predict_column_values_one_by_one(value, current_type: type): def prepare_log_message_with_cols_types(col_types_info, features_names): - message = '\n' + message = '\n' + 'Features\n' for type_name, type_id in TYPE_TO_ID.items(): count_types = np.count_nonzero(col_types_info['features'] == type_id) features_idx = np.where(col_types_info['features'] == type_id)[0] names_or_indexes = features_names[features_idx] if features_names is not None else features_idx - message += f'TYPE {type_name} - count {count_types} - features {names_or_indexes} \n' \ + message += f'\tTYPE {type_name} - count {count_types} - features {names_or_indexes} \n' \ + message += '-' * 10 + '\n' message += f'Target: TYPE {_convertable_types[col_types_info["target"][0]]}' return message diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index ca03a8eb29..a9303f903b 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -193,7 +193,7 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str, return data # Convert datetime data to numerical - self.log.message('-- Converting datetime data to numerical') + self.log.debug('-- Converting datetime data to numerical') data.features = np_datetime_to_numeric(data.features) if data.target is not None: data.target = np_datetime_to_numeric(data.target) @@ -202,39 +202,39 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str, data.idx = np.asarray(data.idx) # Fix tables / time series sizes - self.log.message('-- Fixing table / time series shapes') + self.log.debug('-- Fixing table / time series shapes') data = self._correct_shapes(data) replace_inf_with_nans(data) # Find incorrect features which must be removed if is_fit_stage: - self.log.message('-- Finding incorrect features') + self.log.debug('-- Finding incorrect features') self._find_features_lacking_nans(data, source_name) - self.log.message('-- Removing incorrect features') + self.log.debug('-- Removing incorrect features') self._take_only_correct_features(data, source_name) if is_fit_stage: - self.log.message('-- Dropping rows with NaN-values in target') + self.log.debug('-- Dropping rows with NaN-values in target') data = self._drop_rows_with_nan_in_target(data) # Column types processing - launch after correct features selection - self.log.message('-- Features types processing') + self.log.debug('-- Features types processing') self.types_correctors[source_name].convert_data_for_fit(data) if self.types_correctors[source_name].target_converting_has_errors: - self.log.message('-- Dropping rows with NaN-values in target') + self.log.debug('-- Dropping rows with NaN-values in target') data = self._drop_rows_with_nan_in_target(data) # Train Label Encoder 
for categorical target if necessary and apply it - self.log.message('-- Applying the Label Encoder to Target due to the presence of categories') + self.log.debug('-- Applying the Label Encoder to Target due to the presence of categories') if source_name not in self.target_encoders: self._train_target_encoder(data, source_name) data.target = self._apply_target_encoding(data, source_name) else: - self.log.message('-- Converting data for predict') + self.log.debug('-- Converting data for predict') self.types_correctors[source_name].convert_data_for_predict(data) feature_type_ids = data.supplementary_data.col_type_ids['features'] @@ -247,7 +247,7 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str, elif data_type_is_table(data): if is_fit_stage: - self.log.message('-- Searching binary categorical features to encode them') + self.log.debug('-- Searching binary categorical features to encode them') data = self.binary_categorical_processors[source_name].fit_transform(data) else: data = self.binary_categorical_processors[source_name].transform(data) @@ -273,9 +273,9 @@ def _prepare_optional(self, pipeline, data: InputData, source_name: str): (data_has_missing_values, 'imputation', self._apply_imputation_unidata), (data_has_categorical_features, 'encoding', self._apply_categorical_encoding) ]: - self.log.message(f'-- Deciding to apply {tag_to_check} for data') + self.log.debug(f'Deciding to apply {tag_to_check} for data') if has_problems(data): - self.log.message(f'-- Finding {tag_to_check} is required and trying to apply') + self.log.debug(f'Finding {tag_to_check} is required and trying to apply') # Data contains missing values has_tag = PipelineStructureExplorer.check_structure_by_tag( pipeline, tag_to_check=tag_to_check, source_name=source_name) @@ -366,17 +366,17 @@ def _apply_imputation_unidata(self, data: InputData, source_name: str) -> InputD Returns: imputed ``data`` """ - self.log.message('--- Initialising imputer') + self.log.debug('--- Initialising imputer') imputer = self.features_imputers.get(source_name) if not imputer: imputer = ImputationImplementation() - self.log.message('--- Fitting and transforming imputer for missings') + self.log.debug('--- Fitting and transforming imputer for missings') output_data = imputer.fit_transform(data) self.features_imputers[source_name] = imputer else: - self.log.message('--- Transforming imputer for missings') + self.log.debug('--- Transforming imputer for missings') output_data = imputer.transform(data) data.features = output_data.predict @@ -394,7 +394,7 @@ def _apply_categorical_encoding(self, data: InputData, source_name: str) -> Inpu Returns: encoded ``data`` """ - self.log.message('--- Initialising categorical encoder') + self.log.debug('--- Initialising categorical encoder') encoder = self.features_encoders.get(source_name) if encoder is None: @@ -402,8 +402,8 @@ def _apply_categorical_encoding(self, data: InputData, source_name: str) -> Inpu encoder.fit(data) self.features_encoders[source_name] = encoder - self.log.message(f'--- {encoder.__class__.__name__} was chosen') - self.log.message('--- Fitting and transforming data') + self.log.debug(f'--- {encoder.__class__.__name__} was chosen as categorical encoder') + self.log.debug('--- Fitting and transforming data') output_data = encoder.transform_for_fit(data) output_data.predict = output_data.predict.astype(float) data.features = output_data.predict @@ -602,11 +602,11 @@ def reduce_mem_usage_np(arr, initial_types): # It required to add this to reduce memory for them 
pass else: - self.log.message('-- Reduce memory in features') + self.log.debug('-- Reduce memory in features') data.features = reduce_mem_usage_np(data.features, data.supplementary_data.col_type_ids['features']) if data.target is not None: - self.log.message('-- Reduce memory in target') + self.log.debug('-- Reduce memory in target') data.target = reduce_mem_usage_np(data.target, data.supplementary_data.col_type_ids['target']) data.target = data.target.to_numpy() From a542088a8307769fc826e6e3fd646d539330be00 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Sun, 25 Aug 2024 22:12:31 +0300 Subject: [PATCH 55/69] Fix TypeError with float16, rejection from this type --- fedot/core/data/data.py | 9 +++++++++ fedot/preprocessing/preprocessing.py | 4 +--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index 43160ac2eb..3780207894 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -792,6 +792,15 @@ def __setitem__(self, key, value): else: raise NotImplementedError("Setting values by index without specifying a column is not supported.") + def __get__(self): + output = np.empty(self._shape, dtype=np.object_) + + for i in range(self._shape[0]): + for j, col in enumerate(self._columns): + output[i, j] = col[i] + + return output + def __len__(self): return self._shape[0] if self._columns else 0 diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index a9303f903b..b04b6fbc85 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -585,9 +585,7 @@ def reduce_mem_usage_np(arr, initial_types): reduced_columns.add_column(col.astype(np.int64)) elif np.issubdtype(col.dtype, np.floating): - if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max: - reduced_columns.add_column(col.astype(np.float16)) - elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: + if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: reduced_columns.add_column(col.astype(np.float32)) else: reduced_columns.add_column(col.astype(np.float64)) From 776d7f5a5d5e0c3cbc4283bdbc8a32fd5cc0f27b Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Wed, 4 Sep 2024 01:27:00 +0300 Subject: [PATCH 56/69] Refactoring OptimisedFeatures - _columns: np.ndarray -> _columns: pd.DataFrame --- fedot/core/data/data.py | 110 ++++++++++++++++++++-------------------- 1 file changed, 54 insertions(+), 56 deletions(-) diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index 3780207894..d68ed664ed 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -528,6 +528,17 @@ def from_json_files(files_path: str, return InputData(idx=idx, features=features, target=target, task=task, data_type=data_type) + @property + def features(self): + if isinstance(self._features, OptimisedFeatures): + return self._features.items + + return self._features + + @features.setter + def features(self, value): + self._features = value + def to_csv(self, path_to_save): dataframe = pd.DataFrame(data=self.features, index=self.idx) if self.target is not None: @@ -539,6 +550,9 @@ def to_csv(self, path_to_save): class InputData(Data): """Data class for input data for the nodes """ + def __init__(self, features, *args, **kwargs): + super().__init__(*args, **kwargs) + self._features = features def __post_init__(self): if self.numerical_idx is None: @@ -749,90 +763,74 @@ class OptimisedFeatures: """``Data`` type for optimised storage data. 
It based on numpy ndarray, but the features storages in list of np.ndarray with own optimal dtype """ - _columns: list = field(default_factory=list, init=False) - _shape: tuple = field(default=(0, 0), init=False) - _nbytes: int = 0 + _columns: pd.DataFrame = field(default_factory=pd.DataFrame, init=False) + _cols_names: list = field(default_factory=list, init=False) ndim: int = 2 - def add_column(self, data: np.ndarray): - if not isinstance(data, np.ndarray): - raise ValueError("Data should be a NumPy array.") + def set_data(self, data: pd.DataFrame): + if isinstance(data, pd.DataFrame): + self._columns = data.copy(deep=True) + self._cols_names = list(range(0, len(self._columns.columns))) - if self._shape == (0, 0): - self._shape = (data.shape[0], 1) else: - if data.shape[0] != self._shape[0]: - raise ValueError("All columns must have the same number of rows.") - - self._shape = (self._shape[0], self._shape[1] + 1) + raise ValueError("data in set_data should be a pandas DataFrame.") - self._columns.append(data) - self._nbytes += data.nbytes + def add_column(self, arr: np.ndarray): + if isinstance(arr, np.ndarray): + if self._columns.empty: + self._cols_names = [0] + self._columns = pd.DataFrame(arr, columns=self._cols_names) - def __getitem__(self, key): - if isinstance(key, tuple): - row_idx, col_idx = key - if isinstance(col_idx, int): - return self._columns[col_idx][row_idx] else: - selected_columns = [self._columns[i] for i in col_idx] - return np.column_stack(selected_columns)[row_idx] + self._cols_names.append(self._cols_names[-1] + 1) + self._columns.insert(self._cols_names[-1], self._cols_names[-1], arr) else: - result = np.column_stack(self._columns)[key] - return result if result.ndim > 1 else result.ravel() + raise ValueError("arr in add_column should be a NumPy array.") - def __setitem__(self, key, value): + def __getitem__(self, key: Union[tuple[int, int], int]) -> Union[pd.DataFrame, pd.Series]: if isinstance(key, tuple): row_idx, col_idx = key - if isinstance(col_idx, int): - self._columns[col_idx][row_idx] = value - else: - for i, col in zip(col_idx, value): - self._columns[i][row_idx] = col - else: - raise NotImplementedError("Setting values by index without specifying a column is not supported.") + return self._columns.iloc[row_idx, col_idx] - def __get__(self): - output = np.empty(self._shape, dtype=np.object_) - - for i in range(self._shape[0]): - for j, col in enumerate(self._columns): - output[i, j] = col[i] - - return output + else: + return self._columns.iloc[key] - def __len__(self): - return self._shape[0] if self._columns else 0 + def __len__(self) -> int: + return self._columns.shape[0] if self._columns else 0 - def take(self, indices, axis=0): + def take(self, indices: np.ndarray[int], axis: int = 0) -> OptimisedFeatures: output = OptimisedFeatures() + # Takes rows if axis == 0: - # Takes rows - for col in self._columns: - output.add_column(np.take(col, indices, axis)) + output.set_data(self._columns.iloc[indices, :]) + + # Takes columns elif axis == 1: - # Takes columns - for i in indices: - output.add_column(self._columns[i]) + output.set_data(self._columns.iloc[:, indices]) + else: raise ValueError("Axis must be 0 (rows) or 1 (columns)") return output - def copy(self): - return self._columns.copy() + def copy(self) -> pd.DataFrame: + return self._columns.copy(deep=True) - def to_numpy(self): - return np.transpose(np.array(self._columns)) + def to_numpy(self) -> np.ndarray: + return self._columns.to_numpy() + + @property + def items(self): + return 
self._columns @property - def shape(self): - return self._shape + def shape(self) -> tuple[int, int]: + return self._columns.shape @property - def nbytes(self): - return self._nbytes + def nbytes(self) -> int: + return self._columns.memory_usage(index=True, deep=True).sum() def _resize_image(file_path: str, target_size: Tuple[int, int]): From 4cc8a3ddbdc6ea4a4a373ad62e4f7264810c50e3 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Wed, 4 Sep 2024 22:38:59 +0300 Subject: [PATCH 57/69] Revert changes with features property --- fedot/core/data/data.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index d68ed664ed..141bb9b2c8 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -528,17 +528,6 @@ def from_json_files(files_path: str, return InputData(idx=idx, features=features, target=target, task=task, data_type=data_type) - @property - def features(self): - if isinstance(self._features, OptimisedFeatures): - return self._features.items - - return self._features - - @features.setter - def features(self, value): - self._features = value - def to_csv(self, path_to_save): dataframe = pd.DataFrame(data=self.features, index=self.idx) if self.target is not None: @@ -550,10 +539,6 @@ def to_csv(self, path_to_save): class InputData(Data): """Data class for input data for the nodes """ - def __init__(self, features, *args, **kwargs): - super().__init__(*args, **kwargs) - self._features = features - def __post_init__(self): if self.numerical_idx is None: if self.features is not None and isinstance(self.features, np.ndarray) and self.features.ndim > 1: From 762f89225b9ac73a856e94674d0d6a08fc02b2f1 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Wed, 4 Sep 2024 22:57:28 +0300 Subject: [PATCH 58/69] Fixes various tests --- fedot/core/data/data.py | 2 +- .../operations/evaluation/classification.py | 16 +++++++++++++--- .../evaluation/evaluation_interfaces.py | 8 ++++++-- .../implementation_interfaces.py | 18 +++++++++++------- 4 files changed, 31 insertions(+), 13 deletions(-) diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index 141bb9b2c8..a56a3dcbc6 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -781,7 +781,7 @@ def __getitem__(self, key: Union[tuple[int, int], int]) -> Union[pd.DataFrame, p return self._columns.iloc[key] def __len__(self) -> int: - return self._columns.shape[0] if self._columns else 0 + return self._columns.shape[0] if self._columns.size > 0 else 0 def take(self, indices: np.ndarray[int], axis: int = 0) -> OptimisedFeatures: output = OptimisedFeatures() diff --git a/fedot/core/operations/evaluation/classification.py b/fedot/core/operations/evaluation/classification.py index a6bdf15069..8c3b14a05c 100644 --- a/fedot/core/operations/evaluation/classification.py +++ b/fedot/core/operations/evaluation/classification.py @@ -1,7 +1,7 @@ import warnings from typing import Optional -from fedot.core.data.data import InputData, OutputData +from fedot.core.data.data import InputData, OutputData, OptimisedFeatures from fedot.core.operations.evaluation.evaluation_interfaces import EvaluationStrategy, SkLearnEvaluationStrategy from fedot.core.operations.evaluation.operation_implementations.data_operations.decompose \ import DecomposerClassImplementation @@ -35,8 +35,18 @@ def predict(self, trained_operation, predict_data: InputData) -> OutputData: :return: prediction target """ - prediction = self._sklearn_compatible_prediction(trained_operation=trained_operation, - 
features=predict_data.features) + if isinstance(predict_data.features, OptimisedFeatures): + prediction = self._sklearn_compatible_prediction( + trained_operation=trained_operation, + features=predict_data.features.items + ) + + else: + prediction = self._sklearn_compatible_prediction( + trained_operation=trained_operation, + features=predict_data.features + ) + converted = self._convert_to_output(prediction, predict_data) return converted diff --git a/fedot/core/operations/evaluation/evaluation_interfaces.py b/fedot/core/operations/evaluation/evaluation_interfaces.py index 5849ab3f17..ab11190aee 100644 --- a/fedot/core/operations/evaluation/evaluation_interfaces.py +++ b/fedot/core/operations/evaluation/evaluation_interfaces.py @@ -27,7 +27,7 @@ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from xgboost import XGBClassifier, XGBRegressor -from fedot.core.data.data import InputData, OutputData +from fedot.core.data.data import InputData, OutputData, OptimisedFeatures from fedot.core.operations.operation_parameters import OperationParameters from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.operation_types_repository import OperationTypesRepository, get_operation_type_from_id @@ -228,7 +228,11 @@ def fit(self, train_data: InputData): operation_implementation = convert_to_multivariate_model(operation_implementation, train_data) else: - operation_implementation.fit(train_data.features, train_data.target) + if isinstance(train_data.features, OptimisedFeatures): + operation_implementation.fit(train_data.features.items, train_data.target) + + else: + operation_implementation.fit(train_data.features, train_data.target) return operation_implementation @abstractmethod diff --git a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py index a08c9a9f12..b92397eddb 100644 --- a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py +++ b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py @@ -163,16 +163,20 @@ def _reasonability_check(features): # For every column in table make check if isinstance(features, OptimisedFeatures): - features = features._columns + features = features.items + + bool_ids = features.select_dtypes('bool').columns.values.tolist() + non_bool_ids = [col_idx for col_idx in features.columns.values.tolist() if col_idx not in bool_ids] + elif isinstance(features, np.ndarray): features = features.T - for column_id, column in enumerate(features): - # column = features[:, column_id] if columns_amount > 1 else features.copy() - if len(set(column)) > 2: - non_bool_ids.append(column_id) - else: - bool_ids.append(column_id) + for column_id, column in enumerate(features): + # column = features[:, column_id] if columns_amount > 1 else features.copy() + if len(set(column)) > 2: + non_bool_ids.append(column_id) + else: + bool_ids.append(column_id) return bool_ids, non_bool_ids From 4efdad54d60b85636ea86ed78b94a50e4431f120 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Sun, 8 Sep 2024 17:34:59 +0300 Subject: [PATCH 59/69] Global refactoring - Rejection from separate class --- fedot/api/api_utils/api_data.py | 8 +- fedot/core/data/data.py | 96 +++---------------- fedot/core/data/data_split.py | 10 +- .../operations/evaluation/classification.py | 16 +--- .../evaluation/evaluation_interfaces.py | 12 +-- 
.../data_operations/categorical_encoders.py | 26 +++-- .../implementation_interfaces.py | 25 ++--- .../models/boostings_implementations.py | 7 +- fedot/preprocessing/data_types.py | 1 + fedot/preprocessing/preprocessing.py | 53 +++++----- 10 files changed, 80 insertions(+), 174 deletions(-) diff --git a/fedot/api/api_utils/api_data.py b/fedot/api/api_utils/api_data.py index 7ecd150249..3a5aaf5e31 100644 --- a/fedot/api/api_utils/api_data.py +++ b/fedot/api/api_utils/api_data.py @@ -137,7 +137,7 @@ def accept_and_apply_recommendations(self, input_data: Union[InputData, MultiMod def fit_transform(self, train_data: InputData) -> InputData: start_time = datetime.now() self.log.message('Preprocessing data') - memory_usage = convert_memory_size(train_data.features.nbytes) + memory_usage = convert_memory_size(train_data.features.memory_usage) features_shape = train_data.features.shape target_shape = train_data.target.shape self.log.message( @@ -157,7 +157,7 @@ def fit_transform(self, train_data: InputData) -> InputData: train_data.supplementary_data.is_auto_preprocessed = True - memory_usage = convert_memory_size(train_data.features.nbytes) + memory_usage = convert_memory_size(train_data.features.memory_usage) features_shape = train_data.features.shape target_shape = train_data.target.shape @@ -170,7 +170,7 @@ def fit_transform(self, train_data: InputData) -> InputData: def transform(self, test_data: InputData, current_pipeline) -> InputData: start_time = datetime.now() self.log.message('Preprocessing data') - memory_usage = convert_memory_size(test_data.features.nbytes) + memory_usage = convert_memory_size(test_data.features.memory_usage) features_shape = test_data.features.shape target_shape = test_data.target.shape self.log.message( @@ -184,7 +184,7 @@ def transform(self, test_data: InputData, current_pipeline) -> InputData: test_data = self.preprocessor.reduce_memory_size(data=test_data) - memory_usage = convert_memory_size(test_data.features.nbytes) + memory_usage = convert_memory_size(test_data.features.memory_usages) features_shape = test_data.features.shape target_shape = test_data.target.shape self.log.message( diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index a56a3dcbc6..f1ba556bf4 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -42,7 +42,7 @@ class Data: idx: np.ndarray task: Task data_type: DataTypesEnum - features: Union[np.ndarray, OptimisedFeatures] + features: Union[np.ndarray, pd.DataFrame] categorical_features: Optional[np.ndarray] = None categorical_idx: Optional[np.ndarray] = None numerical_idx: Optional[np.ndarray] = None @@ -439,7 +439,7 @@ def from_text_meta_file(meta_file_path: str = None, features = np.array(messages) target = np.array(df_text[label]).reshape(-1, 1) - idx = [index for index in range(len(target))] + idx = np.array([index for index in range(len(target))]) return InputData(idx=idx, features=features, target=target, task=task, data_type=data_type) @@ -457,7 +457,7 @@ def from_text_files(files_path: str, features = np.array(df_text['text']) target = np.array(df_text[label]).reshape(-1, 1) - idx = [index for index in range(len(target))] + idx = np.array([index for index in range(len(target))]) return InputData(idx=idx, features=features, target=target, task=task, data_type=data_type) @@ -523,7 +523,7 @@ def from_json_files(files_path: str, else: target = np.array(df_data[label]) - idx = [index for index in range(len(target))] + idx = np.array([index for index in range(len(target))]) return InputData(idx=idx, 
features=features, target=target, task=task, data_type=data_type) @@ -534,6 +534,13 @@ def to_csv(self, path_to_save): dataframe['target'] = self.target dataframe.to_csv(path_to_save) + @property + def memory_usage(self): + if isinstance(self.features, np.ndarray): + return self.features.nbytes + else: + return self.features.memory_usage().sum() + @dataclass class InputData(Data): @@ -642,7 +649,7 @@ def convert_non_int_indexes_for_fit(self, pipeline): copied_data = deepcopy(self) is_timestamp = isinstance(copied_data.idx[0], pd._libs.tslibs.timestamps.Timestamp) is_numpy_datetime = isinstance(copied_data.idx[0], np.datetime64) - # if fit stage- just creating range of integers + # if fit stage-just creating range of integers if is_timestamp or is_numpy_datetime: copied_data.supplementary_data.non_int_idx = copy(copied_data.idx) copied_data.idx = np.array(range(len(copied_data.idx))) @@ -684,7 +691,7 @@ def get_not_encoded_data(self): num_features_names, cat_features_names = None, None # Checking numerical data exists - if self.numerical_idx.size != 0: + if self.numerical_idx is not None and self.numerical_idx.size != 0: num_features = self.features[:, self.numerical_idx] if self.features_names is not None and np.size(self.features_names): @@ -693,7 +700,7 @@ def get_not_encoded_data(self): num_features_names = np.array([f'num_feature_{i}' for i in range(1, num_features.shape[1] + 1)]) # Checking categorical data exists - if self.categorical_idx.size != 0: + if self.categorical_idx is not None and self.categorical_idx.size != 0: cat_features = self.categorical_features if self.features_names is not None and np.size(self.features_names): @@ -743,81 +750,6 @@ class OutputData(Data): encoded_idx: Optional[np.ndarray] = None -@dataclass -class OptimisedFeatures: - """``Data`` type for optimised storage data. 
- It based on numpy ndarray, but the features storages in list of np.ndarray with own optimal dtype - """ - _columns: pd.DataFrame = field(default_factory=pd.DataFrame, init=False) - _cols_names: list = field(default_factory=list, init=False) - ndim: int = 2 - - def set_data(self, data: pd.DataFrame): - if isinstance(data, pd.DataFrame): - self._columns = data.copy(deep=True) - self._cols_names = list(range(0, len(self._columns.columns))) - - else: - raise ValueError("data in set_data should be a pandas DataFrame.") - - def add_column(self, arr: np.ndarray): - if isinstance(arr, np.ndarray): - if self._columns.empty: - self._cols_names = [0] - self._columns = pd.DataFrame(arr, columns=self._cols_names) - - else: - self._cols_names.append(self._cols_names[-1] + 1) - self._columns.insert(self._cols_names[-1], self._cols_names[-1], arr) - else: - raise ValueError("arr in add_column should be a NumPy array.") - - def __getitem__(self, key: Union[tuple[int, int], int]) -> Union[pd.DataFrame, pd.Series]: - if isinstance(key, tuple): - row_idx, col_idx = key - return self._columns.iloc[row_idx, col_idx] - - else: - return self._columns.iloc[key] - - def __len__(self) -> int: - return self._columns.shape[0] if self._columns.size > 0 else 0 - - def take(self, indices: np.ndarray[int], axis: int = 0) -> OptimisedFeatures: - output = OptimisedFeatures() - - # Takes rows - if axis == 0: - output.set_data(self._columns.iloc[indices, :]) - - # Takes columns - elif axis == 1: - output.set_data(self._columns.iloc[:, indices]) - - else: - raise ValueError("Axis must be 0 (rows) or 1 (columns)") - - return output - - def copy(self) -> pd.DataFrame: - return self._columns.copy(deep=True) - - def to_numpy(self) -> np.ndarray: - return self._columns.to_numpy() - - @property - def items(self): - return self._columns - - @property - def shape(self) -> tuple[int, int]: - return self._columns.shape - - @property - def nbytes(self) -> int: - return self._columns.memory_usage(index=True, deep=True).sum() - - def _resize_image(file_path: str, target_size: Tuple[int, int]): """Function resizes and rewrites the input image """ diff --git a/fedot/core/data/data_split.py b/fedot/core/data/data_split.py index 1c2f34e60a..a000c6e46b 100644 --- a/fedot/core/data/data_split.py +++ b/fedot/core/data/data_split.py @@ -4,7 +4,7 @@ import numpy as np from sklearn.model_selection import train_test_split -from fedot.core.data.data import InputData, OptimisedFeatures +from fedot.core.data.data import InputData from fedot.core.data.multi_modal import MultiModalData from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.tasks import TaskTypesEnum @@ -30,13 +30,9 @@ def _split_input_data_by_indexes(origin_input_data: Union[InputData, MultiModalD return data elif isinstance(origin_input_data, InputData): idx = np.take(origin_input_data.idx, index, 0) - if isinstance(origin_input_data.features, OptimisedFeatures): - features = origin_input_data.features.take(index) - target = origin_input_data.target.take(index) - else: - features = np.take(origin_input_data.features, index, 0) - target = np.take(origin_input_data.target, index, 0) + features = np.take(origin_input_data.features, index, 0) + target = np.take(origin_input_data.target, index, 0) if origin_input_data.categorical_features is not None: categorical_features = np.take(origin_input_data.categorical_features, index, 0) diff --git a/fedot/core/operations/evaluation/classification.py b/fedot/core/operations/evaluation/classification.py index 
8c3b14a05c..04cc061c69 100644 --- a/fedot/core/operations/evaluation/classification.py +++ b/fedot/core/operations/evaluation/classification.py @@ -1,7 +1,7 @@ import warnings from typing import Optional -from fedot.core.data.data import InputData, OutputData, OptimisedFeatures +from fedot.core.data.data import InputData, OutputData from fedot.core.operations.evaluation.evaluation_interfaces import EvaluationStrategy, SkLearnEvaluationStrategy from fedot.core.operations.evaluation.operation_implementations.data_operations.decompose \ import DecomposerClassImplementation @@ -35,17 +35,11 @@ def predict(self, trained_operation, predict_data: InputData) -> OutputData: :return: prediction target """ - if isinstance(predict_data.features, OptimisedFeatures): - prediction = self._sklearn_compatible_prediction( - trained_operation=trained_operation, - features=predict_data.features.items - ) - else: - prediction = self._sklearn_compatible_prediction( - trained_operation=trained_operation, - features=predict_data.features - ) + prediction = self._sklearn_compatible_prediction( + trained_operation=trained_operation, + features=predict_data.features + ) converted = self._convert_to_output(prediction, predict_data) return converted diff --git a/fedot/core/operations/evaluation/evaluation_interfaces.py b/fedot/core/operations/evaluation/evaluation_interfaces.py index ab11190aee..60e00f297c 100644 --- a/fedot/core/operations/evaluation/evaluation_interfaces.py +++ b/fedot/core/operations/evaluation/evaluation_interfaces.py @@ -27,7 +27,7 @@ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from xgboost import XGBClassifier, XGBRegressor -from fedot.core.data.data import InputData, OutputData, OptimisedFeatures +from fedot.core.data.data import InputData, OutputData from fedot.core.operations.operation_parameters import OperationParameters from fedot.core.repository.dataset_types import DataTypesEnum from fedot.core.repository.operation_types_repository import OperationTypesRepository, get_operation_type_from_id @@ -225,14 +225,10 @@ def fit(self, train_data: InputData): with ImplementationRandomStateHandler(implementation=operation_implementation): if is_model_not_support_multi and is_multi_target: # Manually wrap the regressor into multi-output model - operation_implementation = convert_to_multivariate_model(operation_implementation, - train_data) - else: - if isinstance(train_data.features, OptimisedFeatures): - operation_implementation.fit(train_data.features.items, train_data.target) + operation_implementation = convert_to_multivariate_model(operation_implementation, train_data) + + operation_implementation.fit(train_data.features, train_data.target) - else: - operation_implementation.fit(train_data.features, train_data.target) return operation_implementation @abstractmethod diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 057702c6ba..182fd346aa 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -4,7 +4,7 @@ import numpy as np from sklearn.preprocessing import LabelEncoder, OneHotEncoder -from fedot.core.data.data import InputData, OutputData, OptimisedFeatures +from fedot.core.data.data import InputData, OutputData from 
fedot.core.operations.evaluation.operation_implementations.implementation_interfaces import ( DataOperationImplementation ) @@ -21,16 +21,16 @@ def __init__(self, params: Optional[OperationParameters] = None): 'handle_unknown': 'ignore' } self.encoder = OneHotEncoder(**{**default_params, **self.params.to_dict()}) - self.categorical_ids: List[int] = [] - self.non_categorical_ids: List[int] = [] - self.encoded_ids: List[int] = [] - self.new_numerical_idx: List[int] = [] + self.categorical_ids: np.ndarray = np.array([]) + self.non_categorical_ids: np.ndarray = np.array([]) + self.encoded_ids: np.ndarray = np.array([]) + self.new_numerical_idx: np.ndarray = np.array([]) def fit(self, input_data: InputData): """ Method for fit encoder with automatic determination of categorical features - :param input_data: data with features, target and ids for encoder training - :return encoder: trained encoder (optional output) + :param input_data: data with features, target and ids for encoder fitting + :return encoder: encoder (optional output) """ features = input_data.features self.categorical_ids, self.non_categorical_ids = input_data.categorical_idx, input_data.numerical_idx @@ -152,17 +152,13 @@ def _apply_label_encoder(self, data: np.ndarray): column_encoder.classes_ = np.unique(np.concatenate((column_encoder.classes_, column))) transformed_column = column_encoder.transform(column) - nan_idxs = np.flatnonzero(column == 'nan') - if len(nan_idxs): + nan_indices = np.flatnonzero(column == 'nan') + if len(nan_indices): # Store np.nan values transformed_column = transformed_column.astype(object) - transformed_column[nan_idxs] = np.nan - - if isinstance(data, np.ndarray): - data[:, column_id] = transformed_column + transformed_column[nan_indices] = np.nan - elif isinstance(data, OptimisedFeatures): - data._columns[column_id] = transformed_column + data[:, column_id] = transformed_column def get_params(self) -> OperationParameters: """ Due to LabelEncoder has no parameters - return empty set """ diff --git a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py index b92397eddb..ed27670f92 100644 --- a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py +++ b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py @@ -5,7 +5,7 @@ import numpy as np from golem.core.log import default_log -from fedot.core.data.data import InputData, OutputData, OptimisedFeatures +from fedot.core.data.data import InputData, OutputData from fedot.core.operations.operation_parameters import OperationParameters from fedot.core.repository.dataset_types import DataTypesEnum from fedot.utilities.custom_errors import AbstractMethodNotImplementError @@ -162,21 +162,14 @@ def _reasonability_check(features): non_bool_ids = [] # For every column in table make check - if isinstance(features, OptimisedFeatures): - features = features.items - - bool_ids = features.select_dtypes('bool').columns.values.tolist() - non_bool_ids = [col_idx for col_idx in features.columns.values.tolist() if col_idx not in bool_ids] - - elif isinstance(features, np.ndarray): - features = features.T - - for column_id, column in enumerate(features): - # column = features[:, column_id] if columns_amount > 1 else features.copy() - if len(set(column)) > 2: - non_bool_ids.append(column_id) - else: - bool_ids.append(column_id) + features = features.T + + for column_id, column in 
enumerate(features): + # column = features[:, column_id] if columns_amount > 1 else features.copy() + if len(set(column)) > 2: + non_bool_ids.append(column_id) + else: + bool_ids.append(column_id) return bool_ids, non_bool_ids diff --git a/fedot/core/operations/evaluation/operation_implementations/models/boostings_implementations.py b/fedot/core/operations/evaluation/operation_implementations/models/boostings_implementations.py index cfd6a37cbd..143b686fa0 100644 --- a/fedot/core/operations/evaluation/operation_implementations/models/boostings_implementations.py +++ b/fedot/core/operations/evaluation/operation_implementations/models/boostings_implementations.py @@ -80,6 +80,9 @@ def check_and_update_params(self): if booster == 'gblinear' and enable_categorical: self.params.update(enable_categorical=False) + if booster == 'gbtree' and enable_categorical: + self.params.update(enable_categorical=False) + def get_feature_importance(self) -> list: return self.model.features_importances_ @@ -91,7 +94,7 @@ def plot_feature_importance(self, importance_type='weight'): @staticmethod def convert_to_dataframe(data: Optional[InputData], identify_cats: bool): dataframe = pd.DataFrame(data=data.features) - if data.target is not None: + if data.target is not None and data.target.size > 0: dataframe['target'] = np.ravel(data.target) else: # TODO: temp workaround in case data.target is set to None intentionally @@ -236,7 +239,7 @@ def set_eval_metric(n_classes): @staticmethod def convert_to_dataframe(data: Optional[InputData], identify_cats: bool): dataframe = pd.DataFrame(data=data.features, columns=data.features_names) - if data.target is not None: + if data.target is not None and data.target.size > 0: dataframe['target'] = np.ravel(data.target) else: # TODO: temp workaround in case data.target is set to None intentionally diff --git a/fedot/preprocessing/data_types.py b/fedot/preprocessing/data_types.py index 52ed706648..6c8e52e8cf 100644 --- a/fedot/preprocessing/data_types.py +++ b/fedot/preprocessing/data_types.py @@ -16,6 +16,7 @@ _type_ids = range(len(_convertable_types)) TYPE_TO_ID = dict(zip(_convertable_types, _type_ids)) +ID_TO_TYPE = dict(zip(_type_ids, _convertable_types)) _TYPES = 'types' _FLOAT_NUMBER = 'float_number' diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index b04b6fbc85..c6056eee49 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -8,7 +8,7 @@ from golem.core.paths import copy_doc from sklearn.preprocessing import LabelEncoder -from fedot.core.data.data import InputData, np_datetime_to_numeric, OptimisedFeatures +from fedot.core.data.data import InputData, np_datetime_to_numeric from fedot.core.data.data import OutputData, data_type_is_table, data_type_is_text, data_type_is_ts from fedot.core.data.data_preprocessing import ( data_has_categorical_features, @@ -30,7 +30,7 @@ from fedot.preprocessing.base_preprocessing import BasePreprocessor from fedot.preprocessing.categorical import BinaryCategoricalPreprocessor from fedot.preprocessing.data_type_check import exclude_image, exclude_multi_ts, exclude_ts -from fedot.preprocessing.data_types import TYPE_TO_ID, TableTypesCorrector, _convertable_types +from fedot.preprocessing.data_types import TYPE_TO_ID, TableTypesCorrector, _convertable_types, ID_TO_TYPE from fedot.preprocessing.structure import DEFAULT_SOURCE_NAME, PipelineStructureExplorer # The allowed percent of empty samples in features. 
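The reduce_memory_size hunk below restores each column to its detected type and then downcasts it to the narrowest dtype that can hold its value range. A minimal standalone sketch of that downcasting idea, with illustrative names rather than code from the patch (float16 is skipped on purpose, as PATCH 55 above established):

import numpy as np
import pandas as pd


def downcast_numeric(df: pd.DataFrame) -> pd.DataFrame:
    # Cast every numeric column to the narrowest dtype that holds its range
    for col in df.columns:
        if not np.issubdtype(df[col].dtype, np.number):
            continue
        c_min, c_max = df[col].min(), df[col].max()
        if np.issubdtype(df[col].dtype, np.integer):
            for int_type in (np.int8, np.int16, np.int32, np.int64):
                if np.iinfo(int_type).min < c_min and c_max < np.iinfo(int_type).max:
                    df[col] = df[col].astype(int_type)
                    break
        elif np.finfo(np.float32).min < c_min and c_max < np.finfo(np.float32).max:
            # float16 is deliberately avoided: it costs precision downstream
            df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype(np.float64)
    return df


frame = pd.DataFrame({'small_int': [1, 2, 3], 'ratio': [0.25, 0.5, 0.75]})
print(downcast_numeric(frame).dtypes)  # small_int -> int8, ratio -> float32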
@@ -561,38 +561,33 @@ def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalD @copy_doc(BasePreprocessor.reduce_memory_size) def reduce_memory_size(self, data: InputData) -> InputData: - def reduce_mem_usage_np(arr, initial_types): - reduced_columns = OptimisedFeatures() - - for i in range(arr.shape[1]): - col = arr[:, i] - init_type = _convertable_types[initial_types[i]] - col = col.astype(init_type) - col_type = col.dtype.name - - if col_type not in ['object'] and not bool(re.match(r'str\d*$', col_type)): - c_min = col.max() - c_max = col.max() - - if np.issubdtype(col.dtype, np.integer): + def reduce_mem_usage(features, initial_types): + df = pd.DataFrame(features) + types_array = [ID_TO_TYPE[_type] for _type in initial_types] + + for index, col in enumerate(df.columns): + df[col] = df[col].astype(types_array[index]) + col_type = df[col].dtype.name + + if col_type not in ['object', 'category', 'datetime64[ns, UTC]']: + c_min = df[col].min() + c_max = df[col].max() + if str(col_type)[:3] == 'int': if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max: - reduced_columns.add_column(col.astype(np.int8)) + df[col] = df[col].astype(np.int8) elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: - reduced_columns.add_column(col.astype(np.int16)) + df[col] = df[col].astype(np.int16) elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: - reduced_columns.add_column(col.astype(np.int32)) + df[col] = df[col].astype(np.int32) elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max: - reduced_columns.add_column(col.astype(np.int64)) - - elif np.issubdtype(col.dtype, np.floating): + df[col] = df[col].astype(np.int64) + else: if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max: - reduced_columns.add_column(col.astype(np.float32)) + df[col] = df[col].astype(np.float32) else: - reduced_columns.add_column(col.astype(np.float64)) - else: - reduced_columns.add_column(col) + df[col] = df[col].astype(np.float64) - return reduced_columns + return df if isinstance(data, InputData): if data.task.task_type == TaskTypesEnum.ts_forecasting: @@ -601,11 +596,11 @@ def reduce_mem_usage_np(arr, initial_types): pass else: self.log.debug('-- Reduce memory in features') - data.features = reduce_mem_usage_np(data.features, data.supplementary_data.col_type_ids['features']) + data.features = reduce_mem_usage(data.features, data.supplementary_data.col_type_ids['features']) if data.target is not None: self.log.debug('-- Reduce memory in target') - data.target = reduce_mem_usage_np(data.target, data.supplementary_data.col_type_ids['target']) + data.target = reduce_mem_usage(data.target, data.supplementary_data.col_type_ids['target']) data.target = data.target.to_numpy() return data From bfe617d1535feef33bb814878f1fe769553576c7 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Sun, 8 Sep 2024 17:55:51 +0300 Subject: [PATCH 60/69] Fix pep8, wrong code correction & test --- fedot/core/operations/evaluation/classification.py | 1 - fedot/core/operations/evaluation/evaluation_interfaces.py | 4 ++-- .../operation_implementations/implementation_interfaces.py | 6 +++++- fedot/preprocessing/preprocessing.py | 4 ++-- test/unit/data/test_data_categorical.py | 2 -- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fedot/core/operations/evaluation/classification.py b/fedot/core/operations/evaluation/classification.py index 04cc061c69..2765f21b3f 100644 --- a/fedot/core/operations/evaluation/classification.py +++ 
b/fedot/core/operations/evaluation/classification.py @@ -35,7 +35,6 @@ def predict(self, trained_operation, predict_data: InputData) -> OutputData: :return: prediction target """ - prediction = self._sklearn_compatible_prediction( trained_operation=trained_operation, features=predict_data.features diff --git a/fedot/core/operations/evaluation/evaluation_interfaces.py b/fedot/core/operations/evaluation/evaluation_interfaces.py index 60e00f297c..e0b21e0c1a 100644 --- a/fedot/core/operations/evaluation/evaluation_interfaces.py +++ b/fedot/core/operations/evaluation/evaluation_interfaces.py @@ -226,8 +226,8 @@ def fit(self, train_data: InputData): if is_model_not_support_multi and is_multi_target: # Manually wrap the regressor into multi-output model operation_implementation = convert_to_multivariate_model(operation_implementation, train_data) - - operation_implementation.fit(train_data.features, train_data.target) + else: + operation_implementation.fit(train_data.features, train_data.target) return operation_implementation diff --git a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py index ed27670f92..86b7b7dd3c 100644 --- a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py +++ b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py @@ -165,7 +165,11 @@ def _reasonability_check(features): features = features.T for column_id, column in enumerate(features): - # column = features[:, column_id] if columns_amount > 1 else features.copy() + if isinstance(features, np.ndarray): + column = features[:, column_id] if columns_amount > 1 else features.copy() + else: + column = features.iloc[:, column_id] if columns_amount > 1 else features.copy() + if len(set(column)) > 2: non_bool_ids.append(column_id) else: diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index c6056eee49..650b98a171 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -1,4 +1,4 @@ -import re +from copy import copy from copy import copy from typing import Optional, Union @@ -30,7 +30,7 @@ from fedot.preprocessing.base_preprocessing import BasePreprocessor from fedot.preprocessing.categorical import BinaryCategoricalPreprocessor from fedot.preprocessing.data_type_check import exclude_image, exclude_multi_ts, exclude_ts -from fedot.preprocessing.data_types import TYPE_TO_ID, TableTypesCorrector, _convertable_types, ID_TO_TYPE +from fedot.preprocessing.data_types import TYPE_TO_ID, TableTypesCorrector, ID_TO_TYPE from fedot.preprocessing.structure import DEFAULT_SOURCE_NAME, PipelineStructureExplorer # The allowed percent of empty samples in features. 
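The _reasonability_check fix above rests on one convention: positional column access is features[:, i] for a NumPy array but features.iloc[:, i] for a pandas DataFrame. A self-contained sketch of that dual access pattern; the helper name is hypothetical and not part of the patches:

import numpy as np
import pandas as pd


def split_bool_columns(features):
    # Columns with at most two unique values are treated as boolean-like
    bool_ids, non_bool_ids = [], []
    for column_id in range(features.shape[1]):
        if isinstance(features, np.ndarray):
            column = features[:, column_id]
        else:
            # positional indexing on a DataFrame must go through .iloc
            column = features.iloc[:, column_id]
        if len(set(column)) > 2:
            non_bool_ids.append(column_id)
        else:
            bool_ids.append(column_id)
    return bool_ids, non_bool_ids


df = pd.DataFrame({'flag': [0, 1, 0, 1], 'value': [0.1, 2.3, 4.5, 6.7]})
assert split_bool_columns(df) == split_bool_columns(df.to_numpy())  # ([0], [1])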
diff --git a/test/unit/data/test_data_categorical.py b/test/unit/data/test_data_categorical.py index c7742b954c..01cd66245a 100644 --- a/test/unit/data/test_data_categorical.py +++ b/test/unit/data/test_data_categorical.py @@ -1,5 +1,3 @@ -import os - import numpy as np import pandas as pd import pytest From 68e7610123aa0f3122d85443a0dcb6e64aea0d69 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Sun, 8 Sep 2024 18:04:58 +0300 Subject: [PATCH 61/69] Fixes bug with memory_usage & test --- fedot/api/api_utils/api_data.py | 8 ++++---- fedot/core/data/data.py | 14 ++++++++++++-- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fedot/api/api_utils/api_data.py b/fedot/api/api_utils/api_data.py index 3a5aaf5e31..7776c32aad 100644 --- a/fedot/api/api_utils/api_data.py +++ b/fedot/api/api_utils/api_data.py @@ -137,7 +137,7 @@ def accept_and_apply_recommendations(self, input_data: Union[InputData, MultiMod def fit_transform(self, train_data: InputData) -> InputData: start_time = datetime.now() self.log.message('Preprocessing data') - memory_usage = convert_memory_size(train_data.features.memory_usage) + memory_usage = convert_memory_size(train_data.memory_usage) features_shape = train_data.features.shape target_shape = train_data.target.shape self.log.message( @@ -157,7 +157,7 @@ def fit_transform(self, train_data: InputData) -> InputData: train_data.supplementary_data.is_auto_preprocessed = True - memory_usage = convert_memory_size(train_data.features.memory_usage) + memory_usage = convert_memory_size(train_data.memory_usage) features_shape = train_data.features.shape target_shape = train_data.target.shape @@ -170,7 +170,7 @@ def fit_transform(self, train_data: InputData) -> InputData: def transform(self, test_data: InputData, current_pipeline) -> InputData: start_time = datetime.now() self.log.message('Preprocessing data') - memory_usage = convert_memory_size(test_data.features.memory_usage) + memory_usage = convert_memory_size(test_data.memory_usage) features_shape = test_data.features.shape target_shape = test_data.target.shape self.log.message( @@ -184,7 +184,7 @@ def transform(self, test_data: InputData, current_pipeline) -> InputData: test_data = self.preprocessor.reduce_memory_size(data=test_data) - memory_usage = convert_memory_size(test_data.features.memory_usages) + memory_usage = convert_memory_size(test_data.memory_usages) features_shape = test_data.features.shape target_shape = test_data.target.shape self.log.message( diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index f1ba556bf4..d68a5b9702 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -692,7 +692,10 @@ def get_not_encoded_data(self): # Checking numerical data exists if self.numerical_idx is not None and self.numerical_idx.size != 0: - num_features = self.features[:, self.numerical_idx] + if isinstance(self.features, np.ndarray): + num_features = self.features[:, self.numerical_idx] + else: + num_features = self.features.iloc[:, self.numerical_idx] if self.features_names is not None and np.size(self.features_names): num_features_names = self.features_names[self.numerical_idx] @@ -709,7 +712,11 @@ def get_not_encoded_data(self): cat_features_names = np.array([f'cat_feature_{i}' for i in range(1, cat_features.shape[1] + 1)]) if num_features is not None and cat_features is not None: - new_features = np.hstack((num_features, cat_features)) + if isinstance(self.features, np.ndarray): + new_features = np.hstack((num_features, cat_features)) + else: + new_features = 
pd.concat([num_features, cat_features], axis=1)
+
                 new_features_names = np.hstack((num_features_names, cat_features_names))
                 new_features_idx = np.array(range(new_features.shape[1]))
                 new_num_idx = new_features_idx[:num_features.shape[1]]
@@ -727,6 +734,9 @@ def get_not_encoded_data(self):
         else:
             raise ValueError('There is no features')
 
+        if isinstance(new_features, pd.DataFrame):
+            new_features.columns = new_features_names
+
         return InputData(idx=self.idx, features=new_features, features_names=new_features_names,
                          numerical_idx=new_num_idx, categorical_idx=new_cat_idx, target=self.target,
                          task=self.task, data_type=self.data_type)

From bef6bf2bad3079012e8742fd785adf475394d6cc Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Sun, 8 Sep 2024 18:13:30 +0300
Subject: [PATCH 62/69] Fixes bug with invalid slice

---
 .../operation_implementations/implementation_interfaces.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py
index 86b7b7dd3c..9b62113f7c 100644
--- a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py
+++ b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py
@@ -89,7 +89,10 @@ def fit(self, input_data: InputData):
         self.ids_to_process = ids_to_process
         self.bool_ids = bool_ids
         if len(ids_to_process) > 0:
-            features_to_process = np.array(features[:, ids_to_process]) if features.ndim > 1 else features
+            if isinstance(features, np.ndarray):
+                features_to_process = np.array(features[:, ids_to_process]) if features.ndim > 1 else features
+            else:
+                features_to_process = np.array(features.iloc[:, ids_to_process]) if features.ndim > 1 else features
         self.operation.fit(features_to_process)
         return self.operation

From bc1681ded755bcccdc14f113aaf7e617b1212220 Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Sun, 8 Sep 2024 18:13:45 +0300
Subject: [PATCH 63/69] pep8 fix

---
 fedot/preprocessing/preprocessing.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py
index 650b98a171..299c697c92 100644
--- a/fedot/preprocessing/preprocessing.py
+++ b/fedot/preprocessing/preprocessing.py
@@ -1,5 +1,4 @@
 from copy import copy
-from copy import copy
 from typing import Optional, Union
 
 import numpy as np

From 4843f7b24a6a454b88bc278f2d9f50fddf0ac88e Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Sun, 8 Sep 2024 18:33:03 +0300
Subject: [PATCH 64/69] test fixes

---
 .../implementation_interfaces.py              | 18 ++++++++++++++----
 test/data/expected_metric_values.json         |  6 +++---
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py
index 9b62113f7c..9156da89b6 100644
--- a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py
+++ b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py
@@ -3,6 +3,7 @@
 from typing import Optional
 
 import numpy as np
+import pandas as pd
 from golem.core.log import default_log
 
 from fedot.core.data.data import InputData, OutputData
@@ -92,7 +93,7 @@ def fit(self, input_data: InputData):
             if isinstance(features, np.ndarray):
                 features_to_process = np.array(features[:, ids_to_process]) if features.ndim > 1 
else features else: - features_to_process = np.array(features.iloc[:, ids_to_process]) if features.ndim > 1 else features + features_to_process = np.array(features[ids_to_process]) if features.ndim > 1 else features self.operation.fit(features_to_process) return self.operation @@ -125,7 +126,11 @@ def _make_new_table(self, features): :param features: tabular data for processing :return transformed_features: transformed features table """ - features_to_process = np.array(features[:, self.ids_to_process]) if features.ndim > 1 else features.copy() + if isinstance(features, np.ndarray): + features_to_process = np.array(features[:, self.ids_to_process]) if features.ndim > 1 else features.copy() + else: + features_to_process = np.array(features[self.ids_to_process]) if features.ndim > 1 else features.copy() + transformed_part = self.operation.transform(features_to_process) # If there are no binary features in the dataset @@ -133,7 +138,11 @@ def _make_new_table(self, features): transformed_features = transformed_part else: # Stack transformed features and bool features - bool_features = np.array(features[:, self.bool_ids]) + if isinstance(features, np.ndarray): + bool_features = np.array(features[:, self.bool_ids]) + else: + bool_features = np.array(features[self.bool_ids]) + frames = (bool_features, transformed_part) transformed_features = np.hstack(frames) @@ -165,7 +174,8 @@ def _reasonability_check(features): non_bool_ids = [] # For every column in table make check - features = features.T + if isinstance(features, np.ndarray): + features = features.T for column_id, column in enumerate(features): if isinstance(features, np.ndarray): diff --git a/test/data/expected_metric_values.json b/test/data/expected_metric_values.json index 5018aa8d3f..102c0ca31b 100644 --- a/test/data/expected_metric_values.json +++ b/test/data/expected_metric_values.json @@ -13,11 +13,11 @@ "accuracy": -0.95 }, "multiclass": { - "roc_auc": -0.9881784881784883, + "roc_auc": [-0.9881784881784883, -0.9832500832500832], "precision": -0.9777777777777779, "f1": -0.9719701552732407, - "neg_log_loss": 0.17094588819131074, - "roc_auc_pen": -0.9838963813963815, + "neg_log_loss": [0.17094588819131074, 0.1732861818492787], + "roc_auc_pen": [-0.9838963813963815, -0.9789893328893329], "accuracy": -0.9722222222222222 }, "regression": { From a066f31875e84d696e13e17f565718770cfa7a03 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Sun, 8 Sep 2024 18:41:24 +0300 Subject: [PATCH 65/69] pep8 fix --- .../operation_implementations/implementation_interfaces.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py index 9156da89b6..389b9e8b90 100644 --- a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py +++ b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py @@ -3,7 +3,6 @@ from typing import Optional import numpy as np -import pandas as pd from golem.core.log import default_log from fedot.core.data.data import InputData, OutputData From 8aac969d89fbf1c7dc1b80560c22565ac27bd816 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Sun, 8 Sep 2024 18:41:48 +0300 Subject: [PATCH 66/69] fix bug with memory_usage --- fedot/api/api_utils/api_data.py | 2 +- fedot/core/data/data.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fedot/api/api_utils/api_data.py 
b/fedot/api/api_utils/api_data.py index 7776c32aad..5a421397eb 100644 --- a/fedot/api/api_utils/api_data.py +++ b/fedot/api/api_utils/api_data.py @@ -184,7 +184,7 @@ def transform(self, test_data: InputData, current_pipeline) -> InputData: test_data = self.preprocessor.reduce_memory_size(data=test_data) - memory_usage = convert_memory_size(test_data.memory_usages) + memory_usage = convert_memory_size(test_data.memory_usage) features_shape = test_data.features.shape target_shape = test_data.target.shape self.log.message( diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index d68a5b9702..6153e44af5 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -537,7 +537,7 @@ def to_csv(self, path_to_save): @property def memory_usage(self): if isinstance(self.features, np.ndarray): - return self.features.nbytes + return sum([feature.nbytes for feature in self.features.T]) else: return self.features.memory_usage().sum() From 1039392f2cab77db4123da3fbe661a01cf4d7846 Mon Sep 17 00:00:00 2001 From: Andrey Stebenkov Date: Sun, 8 Sep 2024 19:45:19 +0300 Subject: [PATCH 67/69] reduce_memory_usage in utils, fix test with operations --- .../data_operations/categorical_encoders.py | 33 ++++++++++++++----- .../sklearn_imbalanced_class.py | 33 ++++++++++++++++--- .../implementation_interfaces.py | 17 ++++++---- fedot/preprocessing/preprocessing.py | 29 +--------------- fedot/utilities/memory.py | 33 +++++++++++++++++++ 5 files changed, 99 insertions(+), 46 deletions(-) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py index 182fd346aa..62328b9f99 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py @@ -1,7 +1,8 @@ from copy import deepcopy -from typing import List, Optional +from typing import List, Optional, Union import numpy as np +import pandas as pd from sklearn.preprocessing import LabelEncoder, OneHotEncoder from fedot.core.data.data import InputData, OutputData @@ -10,6 +11,7 @@ ) from fedot.core.operations.operation_parameters import OperationParameters from fedot.preprocessing.data_types import TYPE_TO_ID +from fedot.utilities.memory import reduce_mem_usage class OneHotEncodingImplementation(DataOperationImplementation): @@ -37,7 +39,11 @@ def fit(self, input_data: InputData): # If there are categorical features - process it if self.categorical_ids.size > 0: - updated_cat_features = features[:, self.categorical_ids].astype(str) + if isinstance(features, np.ndarray): + updated_cat_features = features[:, self.categorical_ids].astype(str) + else: + updated_cat_features = features.iloc[:, self.categorical_ids].astype(str) + self.encoder.fit(updated_cat_features) return self.encoder @@ -58,9 +64,15 @@ def transform(self, input_data: InputData) -> OutputData: transformed_features = self._apply_one_hot_encoding(transformed_features) # Update features - output_data = self._convert_to_output(copied_data, - transformed_features) + output_data = self._convert_to_output(copied_data, transformed_features) self._update_column_types(output_data) + + if isinstance(output_data.features, pd.DataFrame): + output_data.predict = reduce_mem_usage( + transformed_features, + output_data.supplementary_data.col_type_ids['features'] + ) + return output_data def 
_update_column_types(self, output_data: OutputData): @@ -77,17 +89,22 @@ def _update_column_types(self, output_data: OutputData): output_data.encoded_idx = self.encoded_ids output_data.supplementary_data.col_type_ids['features'] = numerical_columns - def _apply_one_hot_encoding(self, features: np.ndarray) -> np.ndarray: + def _apply_one_hot_encoding(self, features: Union[np.ndarray, pd.DataFrame]) -> np.ndarray: """ The method creates a table based on categorical and real features after One Hot Encoding transformation :param features: tabular data for processing :return transformed_features: transformed features table """ - transformed_categorical = self.encoder.transform(features[:, self.categorical_ids]).toarray() + if isinstance(features, np.ndarray): + transformed_categorical = self.encoder.transform(features[:, self.categorical_ids]).toarray() + # Stack transformed categorical and non-categorical data, ignore if none + non_categorical_features = features[:, self.non_categorical_ids] + + else: + transformed_categorical = self.encoder.transform(features.iloc[:, self.categorical_ids]).toarray() + non_categorical_features = features.iloc[:, self.non_categorical_ids].to_numpy() - # Stack transformed categorical and non-categorical data, ignore if none - non_categorical_features = features[:, self.non_categorical_ids] frames = (non_categorical_features, transformed_categorical) transformed_features = np.hstack(frames) self.encoded_ids = np.array(range(non_categorical_features.shape[1], transformed_features.shape[1])) diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_imbalanced_class.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_imbalanced_class.py index 641996dd7b..3f838cd4e5 100644 --- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_imbalanced_class.py +++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_imbalanced_class.py @@ -1,7 +1,8 @@ from copy import copy -from typing import Optional +from typing import Optional, Union import numpy as np +import pandas as pd from golem.core.log import default_log from sklearn.utils import resample @@ -10,6 +11,7 @@ DataOperationImplementation ) from fedot.core.operations.operation_parameters import OperationParameters +from fedot.utilities.memory import reduce_mem_usage GLOBAL_PREFIX = 'sklearn_imbalanced_class:' @@ -93,6 +95,12 @@ def transform_for_fit(self, input_data: InputData) -> OutputData: # If number of elements of each class are equal that transformation is not required return self._convert_to_output(input_data, input_data.features) + if isinstance(copied_data.features, pd.DataFrame): + copied_data.features = copied_data.features.to_numpy() + + if isinstance(copied_data.target, pd.DataFrame): + copied_data.target = copied_data.target.to_numpy() + min_data, maj_data = self._get_data_by_target(copied_data.features, copied_data.target, unique_class, number_of_elements) @@ -116,18 +124,35 @@ def transform_for_fit(self, input_data: InputData) -> OutputData: transformed_data = np.concatenate((min_data, maj_data), axis=0).transpose() + if isinstance(input_data.features, pd.DataFrame): + predict = reduce_mem_usage( + transformed_data[:-1].transpose(), + input_data.supplementary_data.col_type_ids['features'] + ) + + target = reduce_mem_usage( + transformed_data[-1], + input_data.supplementary_data.col_type_ids['target'] + ) + + else: + predict = transformed_data[:-1].transpose() + target = 
transformed_data[-1] + output_data = OutputData( idx=np.arange(transformed_data.shape[1]), features=input_data.features, - predict=transformed_data[:-1].transpose(), + predict=predict, task=input_data.task, - target=transformed_data[-1], + target=target, data_type=input_data.data_type, supplementary_data=input_data.supplementary_data) + return output_data @staticmethod - def _get_data_by_target(features: np.array, target: np.array, unique: np.array, + def _get_data_by_target(features: Union[np.array, pd.DataFrame], target: Union[np.array, pd.DataFrame], + unique: np.array, number_of_elements: np.array) -> np.array: """Unify features and target in one array and split into classes """ diff --git a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py index 389b9e8b90..5ecb41b0a9 100644 --- a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py +++ b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py @@ -3,6 +3,7 @@ from typing import Optional import numpy as np +import pandas as pd from golem.core.log import default_log from fedot.core.data.data import InputData, OutputData @@ -82,7 +83,10 @@ def fit(self, input_data: InputData): :return operation: trained transformer (optional output) """ - features = input_data.features + if input_data.task.task_type.name == 'ts_forecasting' and input_data.features.ndim == 2: + features = input_data.features.ravel() + else: + features = input_data.features # Find boolean columns in features table bool_ids, ids_to_process = self._reasonability_check(features) @@ -90,6 +94,9 @@ def fit(self, input_data: InputData): self.bool_ids = bool_ids if len(ids_to_process) > 0: if isinstance(features, np.ndarray): + if input_data.task.task_type.name == 'ts_forecasting' and input_data.features.ndim == 2: + features = features.reshape(-1, 1) + features_to_process = np.array(features[:, ids_to_process]) if features.ndim > 1 else features else: features_to_process = np.array(features[ids_to_process]) if features.ndim > 1 else features @@ -173,16 +180,14 @@ def _reasonability_check(features): non_bool_ids = [] # For every column in table make check - if isinstance(features, np.ndarray): - features = features.T - - for column_id, column in enumerate(features): + for column_id in range(columns_amount): if isinstance(features, np.ndarray): column = features[:, column_id] if columns_amount > 1 else features.copy() else: column = features.iloc[:, column_id] if columns_amount > 1 else features.copy() - if len(set(column)) > 2: + if (isinstance(column, pd.DataFrame) and len(set(column)) > 2) or \ + (isinstance(column, np.ndarray) and len(np.unique(column)) > 2): non_bool_ids.append(column_id) else: bool_ids.append(column_id) diff --git a/fedot/preprocessing/preprocessing.py b/fedot/preprocessing/preprocessing.py index 299c697c92..5b0c03d7a2 100644 --- a/fedot/preprocessing/preprocessing.py +++ b/fedot/preprocessing/preprocessing.py @@ -31,6 +31,7 @@ from fedot.preprocessing.data_type_check import exclude_image, exclude_multi_ts, exclude_ts from fedot.preprocessing.data_types import TYPE_TO_ID, TableTypesCorrector, ID_TO_TYPE from fedot.preprocessing.structure import DEFAULT_SOURCE_NAME, PipelineStructureExplorer +from fedot.utilities.memory import reduce_mem_usage # The allowed percent of empty samples in features. 
# Example: 90% objects in features are 'nan', then drop this feature from data.
@@ -560,34 +561,6 @@ def update_indices_for_time_series(self, test_data: Union[InputData, MultiModalD
 
     @copy_doc(BasePreprocessor.reduce_memory_size)
     def reduce_memory_size(self, data: InputData) -> InputData:
-        def reduce_mem_usage(features, initial_types):
-            df = pd.DataFrame(features)
-            types_array = [ID_TO_TYPE[_type] for _type in initial_types]
-
-            for index, col in enumerate(df.columns):
-                df[col] = df[col].astype(types_array[index])
-                col_type = df[col].dtype.name
-
-                if col_type not in ['object', 'category', 'datetime64[ns, UTC]']:
-                    c_min = df[col].min()
-                    c_max = df[col].max()
-                    if str(col_type)[:3] == 'int':
-                        if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
-                            df[col] = df[col].astype(np.int8)
-                        elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
-                            df[col] = df[col].astype(np.int16)
-                        elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
-                            df[col] = df[col].astype(np.int32)
-                        elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
-                            df[col] = df[col].astype(np.int64)
-                    else:
-                        if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
-                            df[col] = df[col].astype(np.float32)
-                        else:
-                            df[col] = df[col].astype(np.float64)
-
-            return df
-
         if isinstance(data, InputData):
             if data.task.task_type == TaskTypesEnum.ts_forecasting:
                 # TODO: TS data has col_type_ids['features'] = None.
diff --git a/fedot/utilities/memory.py b/fedot/utilities/memory.py
index b25eb9d757..53091fd02e 100644
--- a/fedot/utilities/memory.py
+++ b/fedot/utilities/memory.py
@@ -2,8 +2,12 @@ import logging
 import tracemalloc
 from typing import Optional
 
+import numpy as np
+import pandas as pd
 from golem.core.log import default_log
 
+from fedot.preprocessing.data_types import ID_TO_TYPE
+
 
 class MemoryAnalytics:
     is_active = False
@@ -55,3 +59,32 @@ def log(cls, logger: Optional[logging.LoggerAdapter] = None,
             logger = default_log(prefix=cls.__name__)
         logger.log(logging_level, message)
         return message
+
+
+def reduce_mem_usage(features, initial_types):
+    df = pd.DataFrame(features)
+    types_array = [ID_TO_TYPE[_type] for _type in initial_types]
+
+    for index, col in enumerate(df.columns):
+        df[col] = df[col].astype(types_array[index])
+        col_type = df[col].dtype.name
+
+        if col_type not in ['object', 'category', 'datetime64[ns, UTC]']:
+            c_min = df[col].min()
+            c_max = df[col].max()
+            if str(col_type)[:3] == 'int':
+                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
+                    df[col] = df[col].astype(np.int8)
+                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
+                    df[col] = df[col].astype(np.int16)
+                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
+                    df[col] = df[col].astype(np.int32)
+                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
+                    df[col] = df[col].astype(np.int64)
+            else:
+                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
+                    df[col] = df[col].astype(np.float32)
+                else:
+                    df[col] = df[col].astype(np.float64)
+
+    return df
\ No newline at end of file
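
With `reduce_mem_usage` now exposed from `fedot/utilities/memory.py` by the patch above, its downcasting behaviour can be exercised directly. A hedged usage sketch follows; it assumes `TYPE_TO_ID` in `fedot.preprocessing.data_types` maps plain `int` and `float` to the ids the helper expects (the `ID_TO_TYPE[_type]` lookup in its body suggests the two tables are inverses):

    import numpy as np
    from fedot.preprocessing.data_types import TYPE_TO_ID
    from fedot.utilities.memory import reduce_mem_usage

    # Two float64 columns: the first holds small integers, the second real floats
    features = np.array([[1.0, 2.5],
                         [120.0, 3.5],
                         [7.0, 9.0]])
    initial_types = [TYPE_TO_ID[int], TYPE_TO_ID[float]]

    df = reduce_mem_usage(features, initial_types)
    print(df.dtypes)  # expected: int8 for column 0, float32 for column 1

Downcasting right after the original column types are restored is what produces the memory savings reported around `reduce_memory_size` in the `fit_transform` and `transform` log messages.
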
From 9a0ccabac0f67a814d2f03b6551fbd2fe43286b0 Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Sun, 8 Sep 2024 20:53:12 +0300
Subject: [PATCH 68/69] fix tests

---
 fedot/core/data/data.py                       |  5 +-
 .../data_operations/categorical_encoders.py   | 68 +++++++++++++------
 .../data_operations/sklearn_selectors.py      |  6 +-
 .../implementation_interfaces.py              |  8 ++-
 test/unit/preprocessing/test_preprocessors.py |  6 +-
 5 files changed, 66 insertions(+), 27 deletions(-)

diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py
index 6153e44af5..e8b16cd953 100644
--- a/fedot/core/data/data.py
+++ b/fedot/core/data/data.py
@@ -611,8 +611,11 @@ def subset_features(self, feature_ids: np.array) -> Optional[InputData]:
         """
         if feature_ids is None or feature_ids.size == 0:
             return None
+        if isinstance(self.features, np.ndarray):
+            subsample_features = self.features[:, feature_ids]
+        else:
+            subsample_features = self.features.iloc[:, feature_ids]
 
-        subsample_features = self.features[:, feature_ids]
         subsample_input = InputData(
             features=subsample_features,
             data_type=self.data_type,
diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
index 62328b9f99..4f9c759c24 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/categorical_encoders.py
@@ -148,34 +148,62 @@ def _update_column_types(self, output_data: OutputData):
         feature_type_ids = output_data.supplementary_data.col_type_ids['features']
         feature_type_ids[self.categorical_ids] = TYPE_TO_ID[int]
 
-    def _fit_label_encoders(self, data: np.ndarray):
+    def _fit_label_encoders(self, data: Union[np.ndarray, pd.DataFrame]):
         """ Fit LabelEncoder for every categorical column in the dataset """
-        categorical_columns = data[:, self.categorical_ids].astype(str)
-        for column_id, column in zip(self.categorical_ids, categorical_columns.T):
-            le = LabelEncoder()
-            le.fit(column)
-            self.encoders[column_id] = le
+        if isinstance(data, np.ndarray):
+            categorical_columns = data[:, self.categorical_ids].astype(str)
 
-    def _apply_label_encoder(self, data: np.ndarray):
+            for column_id, column in zip(self.categorical_ids, categorical_columns.T):
+                le = LabelEncoder()
+                le.fit(column)
+                self.encoders[column_id] = le
+
+        else:
+            categorical_columns = data.iloc[:, self.categorical_ids].astype(str)
+
+            for column_id in self.categorical_ids:
+                le = LabelEncoder()
+                le.fit(categorical_columns[column_id])
+                self.encoders[column_id] = le
+
+    def _apply_label_encoder(self, data: Union[np.ndarray, pd.DataFrame]):
         """ Applies fitted LabelEncoder for all categorical features inplace
 
         Args:
             data: numpy array with all features
         """
-        categorical_columns = data[:, self.categorical_ids].astype(str)
-        for column_id, column in zip(self.categorical_ids, categorical_columns.T):
-            column_encoder = self.encoders[column_id]
-            column_encoder.classes_ = np.unique(np.concatenate((column_encoder.classes_, column)))
-
-            transformed_column = column_encoder.transform(column)
-            nan_indices = np.flatnonzero(column == 'nan')
-            if len(nan_indices):
-                # Store np.nan values
-                transformed_column = transformed_column.astype(object)
-                transformed_column[nan_indices] = np.nan
-
-            data[:, column_id] = transformed_column
+        if isinstance(data, np.ndarray):
+            categorical_columns = data[:, self.categorical_ids].astype(str)
+
+            for column_id, column in zip(self.categorical_ids, categorical_columns.T):
+                column_encoder = self.encoders[column_id]
+                column_encoder.classes_ = np.unique(np.concatenate((column_encoder.classes_, column)))
+
+                transformed_column = column_encoder.transform(column)
+                nan_indices = np.flatnonzero(column == 'nan')
+                if len(nan_indices):
+                    # Store np.nan values
+                    transformed_column = transformed_column.astype(object)
+                    transformed_column[nan_indices] = np.nan
+
+                data[:, column_id] = transformed_column
+        else:
+            categorical_columns = data.iloc[:, self.categorical_ids].astype(str)
+
+            for column_id in self.categorical_ids:
+                column_encoder = self.encoders[column_id]
+                column = categorical_columns[column_id]
+                column_encoder.classes_ = np.unique(np.concatenate((column_encoder.classes_, column)))
+
+                transformed_column = column_encoder.transform(column)
+                nan_indices = np.flatnonzero(column == 'nan')
+                if len(nan_indices):
+                    # Store np.nan values
+                    transformed_column = transformed_column.astype(object)
+                    transformed_column[nan_indices] = np.nan
+
+                data.iloc[:, column_id] = transformed_column
 
     def get_params(self) -> OperationParameters:
         """ Due to LabelEncoder has no parameters - return empty set """
diff --git a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py
index fa880ae7fd..51cf3a28ff 100644
--- a/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py
+++ b/fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_selectors.py
@@ -97,7 +97,11 @@ def _make_new_table(self, features):
 
         # Bool vector - mask for columns
         self.remain_features_mask = self.operation.support_
-        transformed_features = features[:, self.remain_features_mask]
+        if isinstance(features, np.ndarray):
+            transformed_features = features[:, self.remain_features_mask]
+        else:
+            transformed_features = features.iloc[:, self.remain_features_mask]
+
         return transformed_features
 
     @staticmethod
diff --git a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py
index 5ecb41b0a9..0573139643 100644
--- a/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py
+++ b/fedot/core/operations/evaluation/operation_implementations/implementation_interfaces.py
@@ -99,7 +99,7 @@ def fit(self, input_data: InputData):
 
                 features_to_process = np.array(features[:, ids_to_process]) if features.ndim > 1 else features
             else:
-                features_to_process = np.array(features[ids_to_process]) if features.ndim > 1 else features
+                features_to_process = np.array(features.iloc[:, ids_to_process]) if features.ndim > 1 else features
         self.operation.fit(features_to_process)
         return self.operation
 
@@ -135,7 +135,9 @@ def _make_new_table(self, features):
         if isinstance(features, np.ndarray):
             features_to_process = np.array(features[:, self.ids_to_process]) if features.ndim > 1 else features.copy()
         else:
-            features_to_process = np.array(features[self.ids_to_process]) if features.ndim > 1 else features.copy()
+            features_to_process = np.array(
+                features.iloc[:, self.ids_to_process]
+            ) if features.ndim > 1 else features.copy()
 
         transformed_part = self.operation.transform(features_to_process)
 
@@ -186,7 +188,7 @@ def _reasonability_check(features):
             else:
                 column = features.iloc[:, column_id] if columns_amount > 1 else features.copy()
 
-            if (isinstance(column, pd.DataFrame) and len(set(column)) > 2) or \
+            if (isinstance(column, pd.Series) and len(set(column)) > 2) or \
                 (isinstance(column, np.ndarray) and len(np.unique(column)) > 2):
                 non_bool_ids.append(column_id)
             else:
diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py
index d4d52c4884..f436ff008f 100644
--- a/test/unit/preprocessing/test_preprocessors.py
+++ b/test/unit/preprocessing/test_preprocessors.py
@@ -242,7 +242,8 @@ def test_mixed_column_with_str_and_float_values():
     # column with index 0 must be converted to string and encoded with OHE
     train_predicted = fit_predict_cycle_for_testing(idx=0)
     assert train_predicted.features.shape[1] == 5
-    assert all(isinstance(el, np.ndarray) for el in train_predicted.features)
+    assert isinstance(train_predicted.features, pd.DataFrame) or \
+        all(isinstance(el, np.ndarray) for el in train_predicted.features)
 
     # column with index 1 must be converted to float and the gaps must be filled
     train_predicted = fit_predict_cycle_for_testing(idx=1)
@@ -254,7 +255,8 @@
     )
 
     assert train_predicted.features.shape[1] == 1
-    assert all(isinstance(el[0], types_encountered) for el in train_predicted.features)
+    features_array = np.asarray(train_predicted.features)
+    assert all(isinstance(el[0], types_encountered) for el in features_array)
 
     # column with index 2 must be removed due to unclear type of data
     try:
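
Patch 68 repeats the same `isinstance(features, np.ndarray)` guard around positional column selection in categorical_encoders.py, sklearn_selectors.py and implementation_interfaces.py. The pattern could be captured once; a hypothetical helper (not part of the patch series) for illustration:

    import numpy as np
    import pandas as pd

    def take_columns(features, column_ids):
        """Positional column selection for either a 2D ndarray or a DataFrame."""
        if isinstance(features, np.ndarray):
            return features[:, column_ids]
        return features.iloc[:, column_ids]

    X = np.arange(12).reshape(3, 4)
    assert np.array_equal(take_columns(X, [0, 2]),
                          take_columns(pd.DataFrame(X), [0, 2]).to_numpy())

Centralising the guard would shrink the duplicated branches that patches 62, 64 and 68 each had to correct separately.
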
From 0d8796d9a81de2bb31a3720c8f037b9563d0295b Mon Sep 17 00:00:00 2001
From: Andrey Stebenkov
Date: Sun, 8 Sep 2024 22:41:25 +0300
Subject: [PATCH 69/69] fix tests in main api

---
 fedot/core/data/data.py                       | 2 +-
 test/integration/api/test_main_api.py         | 3 ++-
 test/unit/preprocessing/test_preprocessors.py | 3 ++-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py
index e8b16cd953..93dc628bc8 100644
--- a/fedot/core/data/data.py
+++ b/fedot/core/data/data.py
@@ -757,7 +757,7 @@ class OutputData(Data):
     """``Data`` type for data prediction in the node
     """
 
-    features: Optional[np.ndarray] = None
+    features: Optional[Union[np.ndarray, pd.DataFrame]] = None
     predict: Optional[np.ndarray] = None
     target: Optional[np.ndarray] = None
     encoded_idx: Optional[np.ndarray] = None
diff --git a/test/integration/api/test_main_api.py b/test/integration/api/test_main_api.py
index 351dc6e24a..7b57c0b240 100644
--- a/test/integration/api/test_main_api.py
+++ b/test/integration/api/test_main_api.py
@@ -231,7 +231,8 @@ def test_categorical_preprocessing_unidata_predefined_linear():
     )
 
     for i in range(prediction.features.shape[1]):
-        assert all(list(map(lambda x: isinstance(x, types_encountered), prediction.features[:, i])))
+        assert all(list(map(lambda x: isinstance(x, types_encountered),
+                            np.asarray(prediction.features)[:, i])))
 
 
 def test_fill_nan_without_categorical():
diff --git a/test/unit/preprocessing/test_preprocessors.py b/test/unit/preprocessing/test_preprocessors.py
index f436ff008f..4b0b9ed41e 100644
--- a/test/unit/preprocessing/test_preprocessors.py
+++ b/test/unit/preprocessing/test_preprocessors.py
@@ -223,7 +223,8 @@ def test_binary_pseudo_string_column_process_correctly():
     )
 
     assert train_predicted.features.shape[1] == 1
-    assert all(isinstance(el[0], types_encountered) for el in train_predicted.features)
+    features_array = np.asarray(train_predicted.features)
+    assert all(isinstance(el[0], types_encountered) for el in features_array)
 
 
 def fit_predict_cycle_for_testing(idx: int):
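
A closing note on the annotation widened in the last patch: `Optional` accepts exactly one type parameter, so the field must be spelled `Optional[Union[np.ndarray, pd.DataFrame]]`. A standalone check (not part of the patches) makes the failure mode visible:

    from typing import Optional, Union

    import numpy as np
    import pandas as pd

    # Valid: Optional takes a single parameter, Union bundles the two table types
    FeatureTable = Optional[Union[np.ndarray, pd.DataFrame]]

    try:
        BadFeatureTable = Optional[np.ndarray, pd.DataFrame]
    except TypeError as exc:
        print(exc)  # CPython reports roughly: "Optional requires a single type"

Since `OutputData` fields are evaluated when `fedot.core.data.data` is imported, the malformed spelling would fail at import time unless postponed annotation evaluation is enabled.
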