
Commit

update
FBurkhardt committed Jun 26, 2024
1 parent c734eca commit 9e3a4ba
Showing 10 changed files with 191 additions and 150 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,10 @@
Changelog
=========

Version 0.88.2
--------------
* changed combine speaker results to show speakers not samples

Version 0.88.1
--------------
* added obligatory scatter plot for regression
2 changes: 1 addition & 1 deletion nkululeko/constants.py
@@ -1,2 +1,2 @@
VERSION="0.88.1"
VERSION="0.88.2"
SAMPLING_RATE = 16000
61 changes: 24 additions & 37 deletions nkululeko/experiment.py
@@ -107,8 +107,7 @@ def load_datasets(self):
# print keys/column
dbs = ",".join(list(self.datasets.keys()))
labels = self.util.config_val("DATA", "labels", False)
auto_labels = list(
next(iter(self.datasets.values())).df[self.target].unique())
auto_labels = list(next(iter(self.datasets.values())).df[self.target].unique())
if labels:
self.labels = ast.literal_eval(labels)
self.util.debug(f"Using target labels (from config): {labels}")
@@ -158,8 +157,7 @@ def fill_tests(self):
data.split()
data.prepare_labels()
self.df_test = pd.concat(
[self.df_test, self.util.make_segmented_index(
data.df_test)]
[self.df_test, self.util.make_segmented_index(data.df_test)]
)
self.df_test.is_labeled = data.is_labeled
self.df_test.got_gender = self.got_gender
@@ -260,17 +258,15 @@ def fill_train_and_tests(self):
test_cats = self.df_test[self.target].unique()
else:
# if there is no target, copy a dummy label
self.df_test = self._add_random_target(
self.df_test).astype("str")
self.df_test = self._add_random_target(self.df_test).astype("str")
train_cats = self.df_train[self.target].unique()
# print(f"df_train: {pd.DataFrame(self.df_train[self.target])}")
# print(f"train_cats with target {self.target}: {train_cats}")
if self.df_test.is_labeled:
if type(test_cats) == np.ndarray:
self.util.debug(f"Categories test (nd.array): {test_cats}")
else:
self.util.debug(
f"Categories test (list): {list(test_cats)}")
self.util.debug(f"Categories test (list): {list(test_cats)}")
if type(train_cats) == np.ndarray:
self.util.debug(f"Categories train (nd.array): {train_cats}")
else:
@@ -293,8 +289,7 @@ def fill_train_and_tests(self):

target_factor = self.util.config_val("DATA", "target_divide_by", False)
if target_factor:
self.df_test[self.target] = self.df_test[self.target] / \
float(target_factor)
self.df_test[self.target] = self.df_test[self.target] / float(target_factor)
self.df_train[self.target] = self.df_train[self.target] / float(
target_factor
)
@@ -317,16 +312,14 @@ def _add_random_target(self, df):
def plot_distribution(self, df_labels):
"""Plot the distribution of samples and speaker per target class and biological sex"""
plot = Plots()
sample_selection = self.util.config_val(
"EXPL", "sample_selection", "all")
sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
plot.plot_distributions(df_labels)
if self.got_speaker:
plot.plot_distributions_speaker(df_labels)

def extract_test_feats(self):
self.feats_test = pd.DataFrame()
feats_name = "_".join(ast.literal_eval(
glob_conf.config["DATA"]["tests"]))
feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["tests"]))
feats_types = self.util.config_val_list("FEATS", "type", ["os"])
self.feature_extractor = FeatureExtractor(
self.df_test, feats_types, feats_name, "test"
@@ -343,8 +336,7 @@ def extract_feats(self):
"""
df_train, df_test = self.df_train, self.df_test
feats_name = "_".join(ast.literal_eval(
glob_conf.config["DATA"]["databases"]))
feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["databases"]))
self.feats_test, self.feats_train = pd.DataFrame(), pd.DataFrame()
feats_types = self.util.config_val("FEATS", "type", "os")
# Ensure feats_types is always a list of strings
@@ -385,8 +377,7 @@ def extract_feats(self):
f"test feats ({self.feats_test.shape[0]}) != test labels"
f" ({self.df_test.shape[0]})"
)
self.df_test = self.df_test[self.df_test.index.isin(
self.feats_test.index)]
self.df_test = self.df_test[self.df_test.index.isin(self.feats_test.index)]
self.util.warn(f"new test labels shape: {self.df_test.shape[0]}")

self._check_scale()
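
The extract_feats hunk above (a formatting change) ends on the pattern that keeps labels and features aligned: if feature extraction yields fewer rows than there are test labels, the test labels are filtered down to the indices that actually have features. A toy pandas illustration of that index-alignment pattern follows; the data and column names are made up for the example:

import pandas as pd

# stand-ins for the test labels and the extracted test features
df_test = pd.DataFrame({"emotion": ["hap", "sad", "ang"]}, index=["f1", "f2", "f3"])
feats_test = pd.DataFrame({"mfcc1": [0.1, 0.3]}, index=["f1", "f3"])  # "f2" yielded no features

# same pattern as in the diff: keep only labels whose index has features
df_test = df_test[df_test.index.isin(feats_test.index)]
print(df_test.shape[0])  # 2
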
@@ -401,8 +392,7 @@ def augment(self):
"""Augment the selected samples."""
from nkululeko.augmenting.augmenter import Augmenter

sample_selection = self.util.config_val(
"AUGMENT", "sample_selection", "all")
sample_selection = self.util.config_val("AUGMENT", "sample_selection", "all")
if sample_selection == "all":
df = pd.concat([self.df_train, self.df_test])
elif sample_selection == "train":
@@ -497,8 +487,7 @@ def random_splice(self):
"""
from nkululeko.augmenting.randomsplicer import Randomsplicer

sample_selection = self.util.config_val(
"AUGMENT", "sample_selection", "all")
sample_selection = self.util.config_val("AUGMENT", "sample_selection", "all")
if sample_selection == "all":
df = pd.concat([self.df_train, self.df_test])
elif sample_selection == "train":
@@ -519,8 +508,7 @@ def analyse_features(self, needs_feats):
plot_feats = eval(
self.util.config_val("EXPL", "feature_distributions", "False")
)
sample_selection = self.util.config_val(
"EXPL", "sample_selection", "all")
sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
# get the data labels
if sample_selection == "all":
df_labels = pd.concat([self.df_train, self.df_test])
@@ -583,8 +571,7 @@ def analyse_features(self, needs_feats):
for scat_target in scat_targets:
if self.util.is_categorical(df_labels[scat_target]):
for scatter in scatters:
plots.scatter_plot(
df_feats, df_labels, scat_target, scatter)
plots.scatter_plot(df_feats, df_labels, scat_target, scatter)
else:
self.util.debug(
f"{self.name}: binning continuous variable to categories"
@@ -669,15 +656,15 @@ def plot_confmat_per_speaker(self, function):
)
return
best = self.get_best_report(self.reports)
# if not best.is_classification:
# best.continuous_to_categorical()
truths = best.truths
preds = best.preds
if best.is_classification:
truths = best.truths
preds = best.preds
else:
truths = best.truths_cont
preds = best.preds_cont
speakers = self.df_test.speaker.values
print(f"{len(truths)} {len(preds)} {len(speakers) }")
df = pd.DataFrame(
data={"truth": truths, "pred": preds, "speaker": speakers})
plot_name = "result_combined_per_speaker"
df = pd.DataFrame(data={"truths": truths, "preds": preds, "speakers": speakers})
plot_name = f"{self.util.get_exp_name()}_speakercombined_{function}"
self.util.debug(
f"plotting speaker combination ({function}) confusion matrix to"
f" {plot_name}"
@@ -692,13 +679,13 @@ def print_best_model(self):

def demo(self, file, is_list, outfile):
model = self.runmgr.get_best_model()
labelEncoder = None
lab_enc = None
try:
labelEncoder = self.label_encoder
lab_enc = self.label_encoder
except AttributeError:
pass
demo = Demo_predictor(
model, file, is_list, self.feature_extractor, labelEncoder, outfile
model, file, is_list, self.feature_extractor, lab_enc, outfile
)
demo.run_demo()
