Skip to content

Commit

Permalink
Merge branch 'felixbur:main' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
bagustris authored May 29, 2024
2 parents 4dc252b + 60ae357 commit a1258a4
Show file tree
Hide file tree
Showing 8 changed files with 216 additions and 174 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
Changelog
=========

Version 0.86.0
--------------
* added regression to finetuning
* added other transformer models to finetuning
* added output of the train/dev feature sets actually used by the model

Version 0.85.2
--------------
* added data, and automatic task label detection
Expand Down
2 changes: 1 addition & 1 deletion nkululeko/constants.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
VERSION="0.85.2"
VERSION="0.86.0"
SAMPLING_RATE = 16000
70 changes: 30 additions & 40 deletions nkululeko/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,14 @@


class Experiment:
"""Main class specifying an experiment"""
"""Main class specifying an experiment."""

def __init__(self, config_obj):
"""
Parameters
----------
config_obj : a config parser object that sets the experiment parameters and being set as a global object.
"""
"""Constructor.
Args:
- config_obj : a config parser object that sets the experiment parameters and being set as a global object.
"""
self.set_globals(config_obj)
self.name = glob_conf.config["EXP"]["name"]
self.root = os.path.join(glob_conf.config["EXP"]["root"], "")
Expand Down Expand Up @@ -110,15 +109,13 @@ def load_datasets(self):
# print keys/column
dbs = ",".join(list(self.datasets.keys()))
labels = self.util.config_val("DATA", "labels", False)
auto_labels = list(
next(iter(self.datasets.values())).df[self.target].unique()
)
auto_labels = list(next(iter(self.datasets.values())).df[self.target].unique())
if labels:
self.labels = ast.literal_eval(labels)
self.util.debug(f"Target labels (from config): {labels}")
else:
self.labels = auto_labels
self.util.debug(f"Target labels (from database): {auto_labels}")
self.util.debug(f"Target labels (from database): {auto_labels}")
glob_conf.set_labels(self.labels)
self.util.debug(f"loaded databases {dbs}")

Expand Down Expand Up @@ -161,8 +158,7 @@ def fill_tests(self):
data.split()
data.prepare_labels()
self.df_test = pd.concat(
[self.df_test, self.util.make_segmented_index(
data.df_test)]
[self.df_test, self.util.make_segmented_index(data.df_test)]
)
self.df_test.is_labeled = data.is_labeled
self.df_test.got_gender = self.got_gender
Expand Down Expand Up @@ -263,17 +259,15 @@ def fill_train_and_tests(self):
test_cats = self.df_test[self.target].unique()
else:
# if there is no target, copy a dummy label
self.df_test = self._add_random_target(
self.df_test).astype("str")
self.df_test = self._add_random_target(self.df_test).astype("str")
train_cats = self.df_train[self.target].unique()
# print(f"df_train: {pd.DataFrame(self.df_train[self.target])}")
# print(f"train_cats with target {self.target}: {train_cats}")
if self.df_test.is_labeled:
if type(test_cats) == np.ndarray:
self.util.debug(f"Categories test (nd.array): {test_cats}")
else:
self.util.debug(
f"Categories test (list): {list(test_cats)}")
self.util.debug(f"Categories test (list): {list(test_cats)}")
if type(train_cats) == np.ndarray:
self.util.debug(f"Categories train (nd.array): {train_cats}")
else:
Expand All @@ -296,8 +290,7 @@ def fill_train_and_tests(self):

target_factor = self.util.config_val("DATA", "target_divide_by", False)
if target_factor:
self.df_test[self.target] = self.df_test[self.target] / \
float(target_factor)
self.df_test[self.target] = self.df_test[self.target] / float(target_factor)
self.df_train[self.target] = self.df_train[self.target] / float(
target_factor
)
Expand All @@ -320,16 +313,14 @@ def _add_random_target(self, df):
def plot_distribution(self, df_labels):
"""Plot the distribution of samples and speaker per target class and biological sex"""
plot = Plots()
sample_selection = self.util.config_val(
"EXPL", "sample_selection", "all")
sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
plot.plot_distributions(df_labels)
if self.got_speaker:
plot.plot_distributions_speaker(df_labels)

def extract_test_feats(self):
self.feats_test = pd.DataFrame()
feats_name = "_".join(ast.literal_eval(
glob_conf.config["DATA"]["tests"]))
feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["tests"]))
feats_types = self.util.config_val_list("FEATS", "type", ["os"])
self.feature_extractor = FeatureExtractor(
self.df_test, feats_types, feats_name, "test"
Expand All @@ -346,8 +337,7 @@ def extract_feats(self):
"""
df_train, df_test = self.df_train, self.df_test
feats_name = "_".join(ast.literal_eval(
glob_conf.config["DATA"]["databases"]))
feats_name = "_".join(ast.literal_eval(glob_conf.config["DATA"]["databases"]))
self.feats_test, self.feats_train = pd.DataFrame(), pd.DataFrame()
feats_types = self.util.config_val_list("FEATS", "type", [])
# for some models no features are needed
Expand Down Expand Up @@ -381,20 +371,22 @@ def extract_feats(self):
f"test feats ({self.feats_test.shape[0]}) != test labels"
f" ({self.df_test.shape[0]})"
)
self.df_test = self.df_test[self.df_test.index.isin(
self.feats_test.index)]
self.util.warn(f"mew test labels shape: {self.df_test.shape[0]}")
self.df_test = self.df_test[self.df_test.index.isin(self.feats_test.index)]
self.util.warn(f"new test labels shape: {self.df_test.shape[0]}")

self._check_scale()
# store = self.util.get_path("store")
# store_format = self.util.config_val("FEATS", "store_format", "pkl")
# storage = f"{store}test_feats.{store_format}"
# self.util.write_store(self.feats_test, storage, store_format)
# storage = f"{store}train_feats.{store_format}"
# self.util.write_store(self.feats_train, storage, store_format)

def augment(self):
"""
Augment the selected samples
"""
"""Augment the selected samples."""
from nkululeko.augmenting.augmenter import Augmenter

sample_selection = self.util.config_val(
"AUGMENT", "sample_selection", "all")
sample_selection = self.util.config_val("AUGMENT", "sample_selection", "all")
if sample_selection == "all":
df = pd.concat([self.df_train, self.df_test])
elif sample_selection == "train":
Expand Down Expand Up @@ -489,8 +481,7 @@ def random_splice(self):
"""
from nkululeko.augmenting.randomsplicer import Randomsplicer

sample_selection = self.util.config_val(
"AUGMENT", "sample_selection", "all")
sample_selection = self.util.config_val("AUGMENT", "sample_selection", "all")
if sample_selection == "all":
df = pd.concat([self.df_train, self.df_test])
elif sample_selection == "train":
Expand All @@ -511,8 +502,7 @@ def analyse_features(self, needs_feats):
plot_feats = eval(
self.util.config_val("EXPL", "feature_distributions", "False")
)
sample_selection = self.util.config_val(
"EXPL", "sample_selection", "all")
sample_selection = self.util.config_val("EXPL", "sample_selection", "all")
# get the data labels
if sample_selection == "all":
df_labels = pd.concat([self.df_train, self.df_test])
Expand Down Expand Up @@ -575,8 +565,7 @@ def analyse_features(self, needs_feats):
for scat_target in scat_targets:
if self.util.is_categorical(df_labels[scat_target]):
for scatter in scatters:
plots.scatter_plot(
df_feats, df_labels, scat_target, scatter)
plots.scatter_plot(df_feats, df_labels, scat_target, scatter)
else:
self.util.debug(
f"{self.name}: binning continuous variable to categories"
Expand All @@ -591,6 +580,8 @@ def analyse_features(self, needs_feats):
)

def _check_scale(self):
self.util.save_to_store(self.feats_train, "feats_train")
self.util.save_to_store(self.feats_test, "feats_test")
scale_feats = self.util.config_val("FEATS", "scale", False)
# print the scale
self.util.debug(f"scaler: {scale_feats}")
Expand Down Expand Up @@ -665,8 +656,7 @@ def plot_confmat_per_speaker(self, function):
preds = best.preds
speakers = self.df_test.speaker.values
print(f"{len(truths)} {len(preds)} {len(speakers) }")
df = pd.DataFrame(
data={"truth": truths, "pred": preds, "speaker": speakers})
df = pd.DataFrame(data={"truth": truths, "pred": preds, "speaker": speakers})
plot_name = "result_combined_per_speaker"
self.util.debug(
f"plotting speaker combination ({function}) confusion matrix to"
Expand Down
50 changes: 25 additions & 25 deletions nkululeko/feat_extract/feats_opensmile.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,28 +65,28 @@ def extract_sample(self, signal, sr):
feats = smile.process_signal(signal, sr)
return feats.to_numpy()

def filter(self):
    """Restrict the extracted features to the target samples and, optionally, to a configured subset of opensmile features.

    Keeps only the rows of ``self.df`` whose index appears in
    ``self.data_df``. If the config declares ``[FEATS] os.features``,
    additionally keeps only those named columns (silently skipping any
    name not present in the extracted feature set).
    """
    # restrict rows to the samples indexed in the target dataframe
    keep_rows = self.df.index.isin(self.data_df.index)
    self.df = self.df[keep_rows]
    try:
        # optional column selection from the config; absent key → KeyError → keep all
        selected_features = ast.literal_eval(
            glob_conf.config["FEATS"]["os.features"]
        )
        self.util.debug(f"selecting features from opensmile: {selected_features}")
        subset = pd.DataFrame()
        found_any = False
        for name in selected_features:
            if name in self.df:
                subset[name] = self.df[name]
                found_any = True
        if found_any:
            # only replace when at least one requested column exists
            self.df = subset
            self.util.debug(
                f"new feats shape after selecting opensmile features: {self.df.shape}"
            )
    except KeyError:
        # no "os.features" entry configured — keep the full feature set
        pass
# def filter(self):
# # use only the features that are indexed in the target dataframes
# self.df = self.df[self.df.index.isin(self.data_df.index)]
# try:
# # use only some features
# selected_features = ast.literal_eval(
# glob_conf.config["FEATS"]["os.features"]
# )
# self.util.debug(f"selecting features from opensmile: {selected_features}")
# sel_feats_df = pd.DataFrame()
# hit = False
# for feat in selected_features:
# try:
# sel_feats_df[feat] = self.df[feat]
# hit = True
# except KeyError:
# pass
# if hit:
# self.df = sel_feats_df
# self.util.debug(
# "new feats shape after selecting opensmile features:"
# f" {self.df.shape}"
# )
# except KeyError:
# pass
8 changes: 4 additions & 4 deletions nkululeko/feat_extract/featureset.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def __init__(self, name, data_df, feats_type):
self.name = name
self.data_df = data_df
self.util = Util("featureset")
self.feats_types = feats_type
self.feats_type = feats_type

def extract(self):
pass
Expand All @@ -25,8 +25,7 @@ def filter(self):
self.df = self.df[self.df.index.isin(self.data_df.index)]
try:
# use only some features
selected_features = ast.literal_eval(
glob_conf.config["FEATS"]["features"])
selected_features = ast.literal_eval(glob_conf.config["FEATS"]["features"])
self.util.debug(f"selecting features: {selected_features}")
sel_feats_df = pd.DataFrame()
hit = False
Expand All @@ -35,11 +34,12 @@ def filter(self):
sel_feats_df[feat] = self.df[feat]
hit = True
except KeyError:
self.util.warn(f"non existent feature in {self.feats_type}: {feat}")
pass
if hit:
self.df = sel_feats_df
self.util.debug(
f"new feats shape after selecting features: {self.df.shape}"
f"new feats shape after selecting features for {self.feats_type}: {self.df.shape}"
)
except KeyError:
pass
Loading

0 comments on commit a1258a4

Please sign in to comment.