Skip to content

Commit

Permalink
Fix bug with test_api_fit_predict_with_pseudo_large_dataset_with_label_correct
Browse files Browse the repository at this point in the history
  • Loading branch information
aPovidlo committed Aug 15, 2024
1 parent 119bca8 commit 7a3946a
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -282,10 +282,7 @@ def fit(self, input_data: InputData):

if data_type_is_table(input_data):
categorical_idx = input_data.categorical_idx.tolist()
numerical_idx = np.setdiff1d(
np.concatenate((input_data.numerical_idx, input_data.encoded_idx)),
categorical_idx
).tolist()
numerical_idx = np.setdiff1d(input_data.numerical_idx, categorical_idx).tolist()
# Tabular data contains categorical features
numerical, categorical = divide_data_categorical_numerical(input_data, categorical_idx, numerical_idx)

Expand Down Expand Up @@ -317,10 +314,7 @@ def transform(self, input_data: InputData) -> OutputData:

if data_type_is_table(input_data) and input_data.categorical_idx is not None:
self.categorical_ids = input_data.categorical_idx.tolist()
self.non_categorical_ids = np.setdiff1d(
np.concatenate((input_data.numerical_idx, input_data.encoded_idx)),
self.categorical_ids
).tolist()
self.non_categorical_ids = np.setdiff1d(input_data.numerical_idx, self.categorical_ids).tolist()

numerical, categorical = divide_data_categorical_numerical(
input_data, self.categorical_ids, self.non_categorical_ids
Expand Down
1 change: 1 addition & 0 deletions fedot/preprocessing/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def fit(self, input_data: InputData):

# Remove binary columns from categorical_idx
input_data.categorical_idx = [idx for idx in input_data.categorical_idx if idx not in binary_ids_to_convert]
input_data.categorical_idx = np.array(input_data.categorical_idx)
self.binary_ids_to_convert = binary_ids_to_convert

# TODO: Add log.message with binary ids
Expand Down
13 changes: 10 additions & 3 deletions fedot/preprocessing/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,8 +237,7 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str,
self.types_correctors[source_name].convert_data_for_predict(data)

feature_type_ids = data.supplementary_data.col_type_ids['features']
data.numerical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[int], TYPE_TO_ID[float]]))
data.categorical_idx = np.flatnonzero(np.isin(feature_type_ids, [TYPE_TO_ID[str]]))
data.numerical_idx, data.categorical_idx = self._update_num_and_cats_ids(feature_type_ids)

# TODO andreygetmanov target encoding must be obligatory for all data types
if data_type_is_text(data):
Expand All @@ -252,6 +251,9 @@ def _prepare_obligatory_unimodal(self, data: InputData, source_name: str,
else:
data = self.binary_categorical_processors[source_name].transform(data)

feature_type_ids = data.supplementary_data.col_type_ids['features']
data.numerical_idx, data.categorical_idx = self._update_num_and_cats_ids(feature_type_ids)

return data

def _prepare_optional(self, pipeline, data: InputData, source_name: str):
Expand Down Expand Up @@ -560,7 +562,6 @@ def reduce_memory_size(self, data: InputData) -> InputData:
def reduce_mem_usage_np(arr, initial_types):
reduced_columns = OptimisedFeature()


for i in range(arr.shape[1]):
col = arr[:, i]
init_type = _convertable_types[initial_types[i]]
Expand Down Expand Up @@ -601,3 +602,9 @@ def reduce_mem_usage_np(arr, initial_types):
data.target = reduce_mem_usage_np(data.target, data.supplementary_data.col_type_ids['target'])

return data

def _update_num_and_cats_ids(self, feature_type_ids):
    """Split feature columns into numerical and categorical index groups.

    Args:
        feature_type_ids: per-column type identifiers (values from
            ``TYPE_TO_ID``), as produced in
            ``supplementary_data.col_type_ids['features']``.

    Returns:
        Tuple ``(numerical_idx, categorical_idx)`` — flat arrays with the
        positions of int/float columns and of str columns, respectively.
    """
    numeric_type_ids = [TYPE_TO_ID[int], TYPE_TO_ID[float]]
    categorical_type_ids = [TYPE_TO_ID[str]]

    is_numeric = np.isin(feature_type_ids, numeric_type_ids)
    is_categorical = np.isin(feature_type_ids, categorical_type_ids)

    return np.flatnonzero(is_numeric), np.flatnonzero(is_categorical)

0 comments on commit 7a3946a

Please sign in to comment.