diff --git a/data/ravdess/exp_ravdess_os_xgb.ini b/data/ravdess/exp_ravdess_os_xgb.ini index 48d93b98..f77caf93 100644 --- a/data/ravdess/exp_ravdess_os_xgb.ini +++ b/data/ravdess/exp_ravdess_os_xgb.ini @@ -1,11 +1,11 @@ [EXP] -root = ./ -name = results/exp_ravdess +root = ./results +name = exp_ravdess_3 runs = 1 epochs = 1 save = True [DATA] -databases = ['train', 'test', 'dev'] +databases = ['train', 'dev'] train = ./data/ravdess/ravdess_train.csv train.type = csv train.absolute_path = False @@ -13,7 +13,7 @@ train.split_strategy = train dev = ./data/ravdess/ravdess_dev.csv dev.type = csv dev.absolute_path = False -dev.split_strategy = train +dev.split_strategy = test test = ./data/ravdess/ravdess_test.csv test.type = csv test.absolute_path = False @@ -24,4 +24,5 @@ labels = ['angry', 'happy', 'neutral', 'sad'] type = ['os'] scale = standard [MODEL] -type = xgb \ No newline at end of file +type = xgb +save = True \ No newline at end of file diff --git a/data/turev/README.md b/data/turev/README.md new file mode 100644 index 00000000..c3b0f0fd --- /dev/null +++ b/data/turev/README.md @@ -0,0 +1,9 @@ +# Nkululeko pre-processing for TurEV-DB + +```bash +git clone https://github.com/Xeonen/TurEV-DB.git +python3 process_database.py +cd ../.. 
"""Pre-process TurEV-DB into train/dev/test CSV files for nkululeko.

Typical use, from ``data/turev`` after cloning the database::

    python3 process_database.py --data_dir ./TurEV-DB --output_dir .
"""

import argparse
from pathlib import Path

import pandas as pd

# Speakers held out of the training set by default (one for dev, one for test).
DEV_SPEAKER = "1358"
TEST_SPEAKER = "6783"


def process_database(
    data_dir,
    output_dir,
    *,
    dev_speaker=DEV_SPEAKER,
    test_speaker=TEST_SPEAKER,
):
    """Index the TurEV-DB wav files and write speaker-disjoint split CSVs.

    Every wav file found below ``<data_dir>/Sound Source`` becomes one row
    with three columns:

    * ``file``    - the path as returned by ``find_files(..., relative=True)``
      (NOTE(review): presumably relative to the "Sound Source" directory;
      confirm against ``nkululeko.utils.files.find_files``),
    * ``emotion`` - lower-cased name of the directory containing the file,
    * ``speaker`` - the part of the file stem before the first underscore.

    Rows of ``dev_speaker`` go to ``turev_dev.csv``, rows of ``test_speaker``
    go to ``turev_test.csv`` and all remaining rows go to ``turev_train.csv``.

    Args:
        data_dir: Root directory of the TurEV-DB checkout.
        output_dir: Directory receiving the CSV files; created if missing.
        dev_speaker: Speaker id used for the dev split.
        test_speaker: Speaker id used for the test split.

    Raises:
        ValueError: If ``data_dir`` does not exist, or if the dev and test
            speakers are the same (the splits would overlap).
    """
    # Validate the input first so a bad path fails before anything is created.
    if not Path(data_dir).exists():
        raise ValueError(f"data_dir {data_dir} does not exist")

    dev_speaker, test_speaker = str(dev_speaker), str(test_speaker)
    if dev_speaker == test_speaker:
        raise ValueError("dev_speaker and test_speaker must be different")

    # Imported here rather than at module level so that argument validation
    # and ``--help`` work even where nkululeko is not installed.
    from nkululeko.utils.files import find_files

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    sound_source_dir = Path(data_dir) / "Sound Source"
    wavs = find_files(sound_source_dir, ext=["wav"], relative=True)
    print(f"Found {len(wavs)} files in {data_dir}")

    wav_paths = [Path(wav) for wav in wavs]
    df = pd.DataFrame(wav_paths, columns=["file"])
    # The emotion label is the name of the directory holding the file.
    df["emotion"] = [path.parent.stem.lower() for path in wav_paths]
    # File stems start with the speaker id, followed by an underscore.
    df["speaker"] = [path.stem.split("_")[0] for path in wav_paths]

    # Speaker-disjoint splits: one speaker each for dev and test, rest train.
    is_dev = df["speaker"] == dev_speaker
    is_test = df["speaker"] == test_speaker
    splits = {
        "train": df[~(is_dev | is_test)],
        "dev": df[is_dev],
        "test": df[is_test],
    }

    for split, df_split in splits.items():
        df_split.to_csv(out_dir / f"turev_{split}.csv", index=False)
        print(f"Saved {split} to {output_dir}/turev_{split}.csv with shape {df_split.shape}")


def main():
    """Parse the command line and run :func:`process_database`."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, default="./TurEV-DB", help="Path to data directory")
    parser.add_argument("--output_dir", type=str, default=".", help="Path to output directory")
    parser.add_argument("--dev_speaker", type=str, default=DEV_SPEAKER, help="Speaker id used for the dev split")
    parser.add_argument("--test_speaker", type=str, default=TEST_SPEAKER, help="Speaker id used for the test split")
    args = parser.parse_args()
    process_database(
        args.data_dir,
        args.output_dir,
        dev_speaker=args.dev_speaker,
        test_speaker=args.test_speaker,
    )


if __name__ == "__main__":
    main()