Skip to content

Commit

Permalink
add kbes dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
bagustris committed May 14, 2024
1 parent b73f4e7 commit f4f60e2
Show file tree
Hide file tree
Showing 5 changed files with 150 additions and 6 deletions.
13 changes: 13 additions & 0 deletions data/kbes/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Nkululeko pre-processing for KBES dataset

Download the dataset from [1], place it in this directory or somewhere else
and make a soft link here (`ln -sf`).

```bash
# unzip the dataset
unzip "KUET Bangla Emotional Speech (KBES) Dataset.zip"

```

Reference:
[1] <https://data.mendeley.com/datasets/vsn37ps3rx/4>
29 changes: 29 additions & 0 deletions data/kbes/exp.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
[EXP]
root = /tmp/results/
name = exp_kbes_hubert_all
[DATA]
databases = ['train', 'dev', 'test']
train = ./data/kbes/kbes_train.csv
train.type = csv
train.absolute_path = False
train.split_strategy = train
dev = ./data/kbes/kbes_dev.csv
dev.type = csv
dev.absolute_path = False
dev.split_strategy = train
test = ./data/kbes/kbes_test.csv
test.type = csv
test.absolute_path = False
test.split_strategy = test
target = emotion
; labels = ['anger', 'fear', 'sad', 'happy']
; get the number of classes from the target column automatically
[FEATS]
type = ['hubert-xlarge-ll60k']
; no_reuse = False
scale = standard
[MODEL]
type = knn
; save = True
[RESAMPLE]
replace = True
99 changes: 99 additions & 0 deletions data/kbes/process_database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# process_database.py for KBES dataset

import pandas as pd
import argparse
from nkululeko.utils.files import find_files
import os
from sklearn.model_selection import train_test_split


def process_database(data_dir, output_dir):
    """Convert the KBES dataset into Nkululeko train/dev/test CSV files.

    Walks ``data_dir`` for wav files, decodes emotion/intensity/gender from
    each filename (hyphen-separated integer codes: emotion-intensity-gender-...),
    and writes stratified 70/15/15 splits as ``kbes_train.csv``,
    ``kbes_dev.csv`` and ``kbes_test.csv`` into ``output_dir``.

    Args:
        data_dir: Directory containing the extracted KBES wav files.
        output_dir: Directory where the output CSV files are written
            (created if missing).

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist.
        ValueError: If a wav filename does not follow the expected
            ``<emotion>-<intensity>-<gender>-...`` coding scheme.
    """
    if not os.path.exists(data_dir):
        raise FileNotFoundError(f"Directory {data_dir} not found.")

    # Create the output directory if it doesn't exist.
    os.makedirs(output_dir, exist_ok=True)

    # Collect all wav files relative to data_dir.
    wavs = find_files(data_dir, relative=True, ext=["wav"])
    print(f"Found {len(wavs)} wav files.")

    # Filename code books.
    # emotion: 1 = Neutral, 2 = Happy, 3 = Sad, 4 = Angry, 5 = Disgust
    emotion_mapping = {
        1: "neutral",
        2: "happy",
        3: "sad",
        4: "angry",
        5: "disgust",
    }
    # intensity: 1 = low, 2 = high
    intensity_mapping = {
        1: "low",
        2: "high",
    }
    # gender: 0 = female, 1 = male
    gender_mapping = {
        0: "female",
        1: "male",
    }

    data = []
    for wav in wavs:
        # Split the basename once; fields are integer codes separated by "-".
        parts = os.path.basename(wav).split("-")
        try:
            emotion = emotion_mapping[int(parts[0])]
            intensity = intensity_mapping[int(parts[1])]
            gender = gender_mapping[int(parts[2])]
        except (IndexError, ValueError, KeyError) as err:
            # Fail loudly on unexpected filenames instead of a cryptic KeyError.
            raise ValueError(
                f"Unexpected KBES filename format: {wav}") from err
        data.append({
            "file": wav,
            "emotion": emotion,
            "gender": gender,
            "intensity": intensity,
            "language": "bangla",
        })

    df = pd.DataFrame(data)
    # Stratify by emotion: 70% train, then split the remaining 30% in half
    # for dev and test (15% each). Fixed seed keeps splits reproducible.
    train_df, temp_df = train_test_split(
        df, test_size=0.3, stratify=df["emotion"], random_state=42)
    dev_df, test_df = train_test_split(
        temp_df, test_size=0.5, stratify=temp_df["emotion"], random_state=42)

    # Write the three splits next to each other in output_dir.
    train_df.to_csv(os.path.join(output_dir, "kbes_train.csv"), index=False)
    dev_df.to_csv(os.path.join(output_dir, "kbes_dev.csv"), index=False)
    test_df.to_csv(os.path.join(output_dir, "kbes_test.csv"), index=False)

    print(f"Number of train samples: {len(train_df)}")
    print(f"Number of dev samples: {len(dev_df)}")
    print(f"Number of test samples: {len(test_df)}")
    print("Database processing completed.")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Process KBES dataset")
parser.add_argument(
"--data_dir",
type=str,
default="KUET Bangla Emotional Speech (KBES) Dataset",
help="Directory containing the KBES data",
)
parser.add_argument(
"--output_dir",
type=str,
default="./",
help="Directory to store the output CSV files",
)
args = parser.parse_args()

process_database(args.data_dir, args.output_dir)
4 changes: 2 additions & 2 deletions data/nemo/exp.ini
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[EXP]
root = /tmp/results/
name = exp_nemo_audmodel_svm_big4
name = exp_nemo_audmodel_knn
[DATA]
databases = ['train', 'dev', 'test']
train = ./data/nemo/nemo_train.csv
Expand All @@ -19,7 +19,7 @@ test.absolute_path = False
test.split_strategy = test
test.audio_path = ./nEMO/samples
target = emotion
labels = ['anger', 'neutral', 'sadness', 'happiness']
; labels = ['anger', 'neutral', 'sadness', 'happiness']
; get the number of classes from the target column automatically
[FEATS]
type = ['audmodel']
Expand Down
11 changes: 7 additions & 4 deletions nkululeko/data/dataset_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def load(self):
# exp_root = self.util.config_val("EXP", "root", "")
# data_file = os.path.join(exp_root, data_file)
root = os.path.dirname(data_file)
audio_path = self.util.config_val_data(self.name, "audio_path", "")
audio_path = self.util.config_val_data(self.name, "audio_path", "./")
df = pd.read_csv(data_file)
# special treatment for segmented dataframes with only one column:
if "start" in df.columns and len(df.columns) == 4:
Expand Down Expand Up @@ -49,7 +49,8 @@ def load(self):
.map(lambda x: root + "/" + audio_path + "/" + x)
.values
)
df = df.set_index(df.index.set_levels(file_index, level="file"))
df = df.set_index(df.index.set_levels(
file_index, level="file"))
else:
if not isinstance(df, pd.DataFrame):
df = pd.DataFrame(df)
Expand All @@ -63,7 +64,8 @@ def load(self):
self.db = None
self.got_target = True
self.is_labeled = self.got_target
self.start_fresh = eval(self.util.config_val("DATA", "no_reuse", "False"))
self.start_fresh = eval(
self.util.config_val("DATA", "no_reuse", "False"))
is_index = False
try:
if self.is_labeled and not "class_label" in self.df.columns:
Expand All @@ -90,7 +92,8 @@ def load(self):
f" {self.got_gender}, got age: {self.got_age}"
)
self.util.debug(r_string)
glob_conf.report.add_item(ReportItem("Data", "Loaded report", r_string))
glob_conf.report.add_item(ReportItem(
"Data", "Loaded report", r_string))

def prepare(self):
super().prepare()

0 comments on commit f4f60e2

Please sign in to comment.