Commit: update

FBurkhardt committed Aug 1, 2024
1 parent a9f75f9 commit 3605bcf
Showing 19 changed files with 86 additions and 70 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,11 @@
 Changelog
 =========
 
+Version 0.88.12
+--------------
+* added n_jobs for sklearn processing
+* renamed num_workers to n_jobs
+
 Version 0.88.11
 --------------
 * removed hack in Praat script
4 changes: 2 additions & 2 deletions ini_file.md
@@ -317,12 +317,12 @@
 * drop = .5
 * **batch_size**: Size of the batch before backpropagation for neural nets
   * batch_size = 8
-* **num_workers**: Number of parallel processes for neural nets
-  * num_workers = 5
 * **device**: For torch/huggingface models: select your GPU number if you have one. Values are either "cpu" or GPU ids (e.g., 0, 1, or both "0,1"). By default, the GPU/CUDA is used if available, otherwise the CPU is used.
   * device = 0
 * **patience**: Number of epochs to wait for an improvement before early stopping
   * patience = 5
+* **n_jobs**: Set the number of processors to use for model training
+  * n_jobs = 8
 
 ### EXPL

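For illustration, a minimal `[MODEL]` section using the renamed option could look as follows; a sketch with placeholder values, mirroring the test config changed later in this commit:

```ini
[MODEL]
type = mlp
# number of processors for model training (formerly num_workers)
n_jobs = 8
batch_size = 8
patience = 5
device = 0
```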
2 changes: 1 addition & 1 deletion nkululeko/constants.py
@@ -1,2 +1,2 @@
VERSION="0.88.11"
VERSION="0.88.12"
SAMPLING_RATE = 16000
7 changes: 4 additions & 3 deletions nkululeko/feat_extract/feats_agender.py
@@ -2,6 +2,7 @@
 
 from nkululeko.feat_extract.featureset import Featureset
 import os
+
 # import pandas as pd
 import audeer
 import nkululeko.glob_conf as glob_conf
@@ -10,6 +11,7 @@
 import audinterface
 import torch
 
+
 class AgenderSet(Featureset):
     """
     Embeddings from the wav2vec2-based model finetuned on agender data, described in the paper
@@ -30,8 +32,7 @@ def _load_model(self):
         if not os.path.isdir(model_root):
             cache_root = audeer.mkdir("cache")
             model_root = audeer.mkdir(model_root)
-            archive_path = audeer.download_url(
-                model_url, cache_root, verbose=True)
+            archive_path = audeer.download_url(model_url, cache_root, verbose=True)
             audeer.extract_archive(archive_path, model_root)
         cuda = "cuda" if torch.cuda.is_available() else "cpu"
         device = self.util.config_val("MODEL", "device", cuda)
@@ -62,7 +63,7 @@ def extract(self):
             },
             sampling_rate=16000,
             resample=True,
-            num_workers=5,
+            num_workers=self.n_jobs,
             verbose=True,
         )
         self.df = hidden_states.process_index(self.data_df.index)
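For context, the worker count set above is handed to `audinterface`, which parallelises feature extraction over files. A minimal sketch of that pattern, with a stand-in embedding function instead of the actual finetuned model:

```python
import audinterface
import numpy as np

def embed(signal, sampling_rate):
    # stand-in for the real model call; one vector per input signal
    return np.zeros(8)

interface = audinterface.Feature(
    feature_names=[f"e{i}" for i in range(8)],
    process_func=embed,
    sampling_rate=16000,
    resample=True,
    num_workers=4,  # this is what MODEL.n_jobs now controls
    verbose=True,
)
# features = interface.process_index(df.index)  # expects an audformat index
```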
5 changes: 2 additions & 3 deletions nkululeko/feat_extract/feats_auddim.py
@@ -32,8 +32,7 @@ def _load_model(self):
         if not os.path.isdir(model_root):
             cache_root = audeer.mkdir("cache")
             model_root = audeer.mkdir(model_root)
-            archive_path = audeer.download_url(
-                model_url, cache_root, verbose=True)
+            archive_path = audeer.download_url(model_url, cache_root, verbose=True)
             audeer.extract_archive(archive_path, model_root)
         cuda = "cuda" if torch.cuda.is_available() else "cpu"
         device = self.util.config_val("MODEL", "device", cuda)
@@ -63,7 +62,7 @@ def extract(self):
             },
             sampling_rate=16000,
             resample=True,
-            num_workers=5,
+            num_workers=self.n_jobs,
             verbose=True,
         )
         self.df = logits.process_index(self.data_df.index)
5 changes: 2 additions & 3 deletions nkululeko/feat_extract/feats_audmodel.py
@@ -30,8 +30,7 @@ def _load_model(self):
         if not os.path.isdir(model_root):
             cache_root = audeer.mkdir("cache")
             model_root = audeer.mkdir(model_root)
-            archive_path = audeer.download_url(
-                model_url, cache_root, verbose=True)
+            archive_path = audeer.download_url(model_url, cache_root, verbose=True)
             audeer.extract_archive(archive_path, model_root)
         cuda = "cuda" if torch.cuda.is_available() else "cpu"
         device = self.util.config_val("MODEL", "device", cuda)
@@ -61,7 +60,7 @@ def extract(self):
             },
             sampling_rate=16000,
             resample=True,
-            num_workers=5,
+            num_workers=self.n_jobs,
             verbose=True,
         )
         self.df = hidden_states.process_index(self.data_df.index)
2 changes: 1 addition & 1 deletion nkululeko/feat_extract/feats_opensmile.py
@@ -38,7 +38,7 @@ def extract(self):
         smile = opensmile.Smile(
             feature_set=self.feature_set,
             feature_level=self.feature_level,
-            num_workers=5,
+            num_workers=self.n_jobs,
             verbose=True,
         )
         if isinstance(self.data_df.index, pd.MultiIndex):
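For reference, a stand-alone sketch of the openSMILE call being changed here; the audio path is a placeholder:

```python
import opensmile

smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals,
    num_workers=4,  # parallel extraction processes, now taken from MODEL.n_jobs
    verbose=True,
)
features = smile.process_file("speech.wav")  # placeholder path; one row of functionals
```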
16 changes: 6 additions & 10 deletions nkululeko/feat_extract/feats_oxbow.py
@@ -22,17 +22,15 @@ def extract(self):
         self.feature_set = eval(f"opensmile.FeatureSet.{self.featset}")
         store = self.util.get_path("store")
         storage = f"{store}{self.name}_{self.featset}.pkl"
-        extract = self.util.config_val(
-            "FEATS", "needs_feature_extraction", False)
+        extract = self.util.config_val("FEATS", "needs_feature_extraction", False)
         no_reuse = eval(self.util.config_val("FEATS", "no_reuse", "False"))
         if extract or no_reuse or not os.path.isfile(storage):
             # extract smile features first
-            self.util.debug(
-                "extracting openSmile features, this might take a while...")
+            self.util.debug("extracting openSmile features, this might take a while...")
             smile = opensmile.Smile(
                 feature_set=self.feature_set,
                 feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
-                num_workers=5,
+                num_workers=self.n_jobs,
             )
             if isinstance(self.data_df.index, pd.MultiIndex):
                 is_multi_index = True
@@ -51,13 +49,11 @@ def extract(self):
             # save the smile features
             smile_df.to_csv(lld_name, sep=";", header=False)
             # get the path of the xbow java jar file
-            xbow_path = self.util.config_val(
-                "FEATS", "xbow.model", "openXBOW")
+            xbow_path = self.util.config_val("FEATS", "xbow.model", "openXBOW")
             # check if the JAR file exists
             if not os.path.isfile(f"{xbow_path}/openXBOW.jar"):
                 # if missing, clone the openXBOW repository so the JAR lands in xbow_path
-                os.system(
-                    f"git clone https://github.com/openXBOW/openXBOW")
+                os.system(f"git clone https://github.com/openXBOW/openXBOW")
             # get the size of the codebook
             size = self.util.config_val("FEATS", "size", 500)
             # get the number of assignments
@@ -87,7 +83,7 @@ def extract(self):
             smile = opensmile.Smile(
                 feature_set=opensmile.FeatureSet.eGeMAPSv02,  # always use eGeMAPS for this
                 feature_level=opensmile.FeatureLevel.Functionals,
-                num_workers=5,
+                num_workers=self.n_jobs,
             )
             if isinstance(self.data_df.index, pd.MultiIndex):
                 is_multi_index = True
1 change: 1 addition & 0 deletions nkululeko/feat_extract/featureset.py
@@ -16,6 +16,7 @@ def __init__(self, name, data_df, feats_type):
         self.data_df = data_df
         self.util = Util("featureset")
         self.feats_type = feats_type
+        self.n_jobs = int(self.util.config_val("MODEL", "n_jobs", "8"))
 
     def extract(self):
         pass
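The added line follows the project's read-config-with-default idiom. Assuming `config_val` behaves like configparser's `get` with a fallback, it is roughly equivalent to this sketch:

```python
import configparser

config = configparser.ConfigParser()
config.read("exp.ini")  # placeholder experiment file
# use 8 parallel jobs when [MODEL] n_jobs is absent
n_jobs = int(config.get("MODEL", "n_jobs", fallback="8"))
```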
77 changes: 42 additions & 35 deletions nkululeko/models/model.py
@@ -3,6 +3,7 @@
 import pickle
 import random
 
+from joblib import parallel_backend
 import numpy as np
 import pandas as pd
 from sklearn.model_selection import GridSearchCV
@@ -34,6 +35,7 @@ def __init__(self, df_train, df_test, feats_train, feats_test):
         self.epoch = 0
         self.logo = self.util.config_val("MODEL", "logo", False)
         self.xfoldx = self.util.config_val("MODEL", "k_fold_cross", False)
+        self.n_jobs = int(self.util.config_val("MODEL", "n_jobs", "8"))
 
     def set_model_type(self, type):
         self.model_type = type
@@ -75,7 +77,8 @@ def _x_fold_cross(self):
         ):
             train_x = feats.iloc[train_index].to_numpy()
             train_y = targets[train_index]
-            self.clf.fit(train_x, train_y)
+            with parallel_backend("threading", n_jobs=self.n_jobs):
+                self.clf.fit(train_x, train_y)
             truth_x = feats.iloc[test_index].to_numpy()
             truth_y = targets[test_index]
             predict_y = self.clf.predict(truth_x)
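A self-contained sketch of the joblib idiom introduced above (placeholder data and classifier): `parallel_backend` pins the backend and worker count for any joblib-parallelised scikit-learn work inside the block.

```python
from joblib import parallel_backend
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
clf = RandomForestClassifier(n_estimators=50, random_state=0)

# all joblib parallelism inside the block runs on 8 threads
with parallel_backend("threading", n_jobs=8):
    clf.fit(X, y)
```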
@@ -141,7 +144,8 @@ def _do_logo(self):
         ):
             train_x = feats.iloc[train_index].to_numpy()
             train_y = targets.iloc[train_index]
-            self.clf.fit(train_x, train_y)
+            with parallel_backend("threading", n_jobs=self.n_jobs):
+                self.clf.fit(train_x, train_y)
 
             truth_x = feats.iloc[test_index].to_numpy()
             truth_y = targets.iloc[test_index]
@@ -171,7 +175,7 @@ def _do_logo(self):
         )
 
     def train(self):
-        """Train the model"""
+        """Train the model."""
         # # first check if the model already has been trained
         # if os.path.isfile(self.store_path):
         #     self.load(self.run, self.epoch)
@@ -204,22 +208,39 @@ def train(self):
         )
 
         tuning_params = self.util.config_val("MODEL", "tuning_params", False)
-        if tuning_params:
-            # tune the model meta parameters
-            tuning_params = ast.literal_eval(tuning_params)
-            tuned_params = {}
-            try:
-                scoring = glob_conf.config["MODEL"]["scoring"]
-            except KeyError:
-                self.util.error("got tuning params but no scoring")
-            for param in tuning_params:
-                values = ast.literal_eval(glob_conf.config["MODEL"][param])
-                tuned_params[param] = values
-            self.util.debug(f"tuning on {tuned_params}")
-            self.clf = GridSearchCV(
-                self.clf, tuned_params, refit=True, verbose=3, scoring=scoring
-            )
-            try:
-                class_weight = self.util.config_val("MODEL", "class_weight", False)
-                if class_weight:
-                    self.util.debug("using class weight")
+        with parallel_backend("threading", n_jobs=self.n_jobs):
+            if tuning_params:
+                # tune the model meta parameters
+                tuning_params = ast.literal_eval(tuning_params)
+                tuned_params = {}
+                try:
+                    scoring = glob_conf.config["MODEL"]["scoring"]
+                except KeyError:
+                    self.util.error("got tuning params but no scoring")
+                for param in tuning_params:
+                    values = ast.literal_eval(glob_conf.config["MODEL"][param])
+                    tuned_params[param] = values
+                self.util.debug(f"tuning on {tuned_params}")
+                self.clf = GridSearchCV(
+                    self.clf, tuned_params, refit=True, verbose=3, scoring=scoring
+                )
+                try:
+                    class_weight = eval(
+                        self.util.config_val("MODEL", "class_weight", "False")
+                    )
+                    if class_weight:
+                        self.util.debug("using class weight")
+                        self.clf.fit(
+                            feats,
+                            self.df_train[self.target],
+                            sample_weight=self.classes_weights,
+                        )
+                    else:
+                        self.clf.fit(feats, self.df_train[self.target])
+                except KeyError:
+                    self.clf.fit(feats, self.df_train[self.target])
+                self.util.debug(f"winner parameters: {self.clf.best_params_}")
+            else:
+                class_weight = self.util.config_val("MODEL", "class_weight", False)
+                if class_weight:
+                    self.util.debug("using class weight")
@@ -229,22 +250,8 @@ def train(self):
-                        sample_weight=self.classes_weights,
-                    )
-                else:
-                    self.clf.fit(feats, self.df_train[self.target])
-            except KeyError:
-                self.clf.fit(feats, self.df_train[self.target])
-            self.util.debug(f"winner parameters: {self.clf.best_params_}")
-        else:
-            class_weight = self.util.config_val("MODEL", "class_weight", False)
-            if class_weight:
-                self.util.debug("using class weight")
-                self.clf.fit(
-                    feats,
-                    self.df_train[self.target],
-                    sample_weight=self.classes_weights,
-                )
-            else:
-                labels = self.df_train[self.target]
-                self.clf.fit(feats, labels)
+                        sample_weight=self.classes_weights,
+                    )
+                else:
+                    labels = self.df_train[self.target]
+                    self.clf.fit(feats, labels)
 
     def get_predictions(self):
         # predictions = self.clf.predict(self.feats_test.to_numpy())
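As a compact illustration of the grid-search path in `train()` (placeholder estimator, grid, and scoring; the real scoring name comes from the config):

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, n_features=5, random_state=0)
tuned_params = {"C": [0.1, 1.0, 10.0]}

clf = GridSearchCV(SVC(), tuned_params, refit=True, scoring="recall_macro")
clf.fit(X, y)
print(clf.best_params_)  # the "winner parameters" the debug line reports
```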
8 changes: 4 additions & 4 deletions nkululeko/models/model_cnn.py
@@ -80,7 +80,7 @@ def __init__(self, df_train, df_test, feats_train, feats_test):
         # batch size
         self.batch_size = int(self.util.config_val("MODEL", "batch_size", 8))
         # number of parallel processes
-        self.num_workers = int(self.util.config_val("MODEL", "num_workers", 5))
+        self.num_workers = self.n_jobs
 
         # set up the data_loaders
 
@@ -100,13 +100,13 @@ def __init__(self, df_train, df_test, feats_train, feats_test):
             train_set,
             batch_size=self.batch_size,
             shuffle=True,
-            num_workers=self.num_workers,
+            num_workers=self.n_jobs,
         )
         self.testloader = torch.utils.data.DataLoader(
             test_set,
             batch_size=self.batch_size,
             shuffle=False,
-            num_workers=self.num_workers,
+            num_workers=self.n_jobs,
         )
 
     class Dataset_image(Dataset):
@@ -136,7 +136,7 @@ def set_testdata(self, data_df, feats_df):
             test_set,
             batch_size=self.batch_size,
             shuffle=False,
-            num_workers=self.num_workers,
+            num_workers=self.n_jobs,
         )
 
     def reset_test(self, df_test, feats_test):
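A minimal stand-alone sketch of the `DataLoader` pattern these hunks configure, with random tensors standing in for the real features; `num_workers` is the number of subprocesses that pre-fetch batches:

```python
import torch

def main():
    # toy dataset: 64 samples, 10 features, dummy targets
    data = torch.utils.data.TensorDataset(torch.randn(64, 10), torch.zeros(64))
    loader = torch.utils.data.DataLoader(
        data,
        batch_size=8,
        shuffle=True,
        num_workers=2,  # now driven by MODEL.n_jobs rather than num_workers
    )
    for features, targets in loader:
        pass  # one training step per batch would go here

if __name__ == "__main__":  # guard needed for worker subprocesses on spawn platforms
    main()
```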
2 changes: 1 addition & 1 deletion nkululeko/models/model_mlp.py
@@ -71,7 +71,7 @@ def __init__(self, df_train, df_test, feats_train, feats_test):
         # batch size
         self.batch_size = int(self.util.config_val("MODEL", "batch_size", 8))
         # number of parallel processes
-        self.num_workers = int(self.util.config_val("MODEL", "num_workers", 5))
+        self.num_workers = self.n_jobs
         if feats_train.isna().to_numpy().any():
             self.util.debug(
                 f"Model, train: replacing {feats_train.isna().sum().sum()} NANs"
4 changes: 2 additions & 2 deletions nkululeko/models/model_mlp_regression.py
@@ -64,7 +64,7 @@ def __init__(self, df_train, df_test, feats_train, feats_test):
         # batch size
         self.batch_size = int(self.util.config_val("MODEL", "batch_size", 8))
         # number of parallel processes
-        self.num_workers = int(self.util.config_val("MODEL", "num_workers", 5))
+        self.num_workers = self.n_jobs
         # set up the data_loaders
         if feats_train.isna().to_numpy().any():
             self.util.debug(
@@ -117,7 +117,7 @@ def get_loader(self, df_x, df_y, shuffle):
             dataset=data_set,
             batch_size=self.batch_size,
             shuffle=shuffle,
-            num_workers=self.num_workers,
+            num_workers=self.n_jobs,
         )
         return loader

7 changes: 4 additions & 3 deletions nkululeko/utils/util.py
@@ -27,6 +27,7 @@ class Util:
         "pkl",
         "eGeMAPSv02",
         "functionals",
+        "n_jobs",
     ]
 
     def __init__(self, caller=None, has_config=True):
@@ -150,7 +151,7 @@ def set_config(self, config):
         # self.logged_configs.clear()
 
     def get_save_name(self):
-        """Return a relative path to a name to save the experiment"""
+        """Return a relative path to a name to save the experiment."""
         store = self.get_path("store")
         return f"{store}/{self.get_exp_name()}.pkl"
 
@@ -161,7 +162,7 @@ def get_pred_name(self):
         return f"{store}/pred_{target}_{pred_name}.csv"
 
     def is_categorical(self, pd_series):
-        """Check if a dataframe column is categorical"""
+        """Check if a dataframe column is categorical."""
         return pd_series.dtype.name == "object" or isinstance(
             pd_series.dtype, pd.CategoricalDtype
         )
@@ -307,7 +308,7 @@ def set_config_val(self, section, key, value):
         self.config[section][key] = str(value)
 
     def check_df(self, i, df):
-        """Check a dataframe"""
+        """Check a dataframe."""
         print(f"check {i}: {df.shape}")
         print(df.head(1))

1 change: 1 addition & 0 deletions tests/exp_emodb_audmodel_mlp.ini
@@ -19,6 +19,7 @@ type = ['audmodel']
 scale = standard
 [MODEL]
 type = mlp
+n_jobs = 8
 layers = {'l1':14, 'l2':8}
 drop = .4
 save = True