From 77931db9c411200f864db79df5fcf7da1f060ad4 Mon Sep 17 00:00:00 2001 From: SuperKogito Date: Tue, 28 Jan 2020 16:49:18 +0100 Subject: [PATCH] Update Framing and add silence removing functions (VAD) This covers: - Update framing func - Add silence remove func (= VAD) - Add muti plots func - Add error and handle bugs caused by the changes --- spafe/features/spfeats.py | 32 +++--- spafe/frequencies/dominant_frequencies.py | 1 - spafe/utils/exceptions.py | 3 +- spafe/utils/preprocessing.py | 126 ++++++++++++++-------- spafe/utils/vis.py | 30 ++++++ 5 files changed, 130 insertions(+), 62 deletions(-) diff --git a/spafe/features/spfeats.py b/spafe/features/spfeats.py index c60c70d..106848f 100644 --- a/spafe/features/spfeats.py +++ b/spafe/features/spfeats.py @@ -63,8 +63,8 @@ def compute_dom_freqs_and_mod_index(sig, lower_cutoff=50, upper_cutoff=3000, nfft=512, - win_len=3, - win_hop=1, + win_len=0.03, + win_hop=0.015, win_type='hamming', debug=False): """ @@ -239,22 +239,22 @@ def extract_feats(sig, fs, nfft=512): """ # init features dictionary feats = {} - + # compute the fft fourrier_transform = rfft(sig, nfft) # compute magnitude spectrum magnitude_spectrum = (1/nfft) * np.abs(fourrier_transform) power_spectrum = (1/nfft)**2 * magnitude_spectrum**2 - - # get all frequncies and only keep positive frequencies + + # get all frequncies and only keep positive frequencies frequencies = np.fft.fftfreq(len(power_spectrum), 1 / fs) frequencies = frequencies[np.where(frequencies >= 0)] // 2 + 1 - + # keep only half of the spectra magnitude_spectrum = magnitude_spectrum[:len(frequencies)] power_spectrum = power_spectrum[:len(frequencies)] - + # define amplitudes and spectrum spectrum = power_spectrum amplitudes = power_spectrum @@ -263,7 +263,7 @@ def extract_feats(sig, fs, nfft=512): # general stats feats["duration"] = len(sig) / float(fs) feats["spectrum"] = spectrum - + # spectral stats I feats["mean_frequency"] = frequencies.sum() feats["peak_frequency"] = frequencies[np.argmax(amplitudes)] @@ -289,7 +289,7 @@ def extract_feats(sig, fs, nfft=512): # compute energy feats["energy"] = magnitude_spectrum - + # compute root-mean-square (RMS). feats["rms"] = root_mean_square(sig=sig, fs=fs) @@ -309,14 +309,14 @@ def extract_feats(sig, fs, nfft=512): feats["maxfun"] = fund_freqs.max() # assign dominant frequencies stats - dom_freqs, mod_idx = compute_dom_freqs_and_mod_index(sig=sig, + dom_freqs, mod_idx = compute_dom_freqs_and_mod_index(sig=sig, fs=fs, - lower_cutoff = 50, - upper_cutoff = 3000, - nfft = 512, - win_len = 3, - win_hop = 1, - win_type = 'hamming', + lower_cutoff = 50, + upper_cutoff = 3000, + nfft = 512, + win_len = 0.03, + win_hop = 0.015, + win_type = 'hamming', debug = False) feats["meandom"] = dom_freqs.mean() feats["mindom"] = dom_freqs.min() diff --git a/spafe/frequencies/dominant_frequencies.py b/spafe/frequencies/dominant_frequencies.py index 7ba2d9f..f45cf80 100644 --- a/spafe/frequencies/dominant_frequencies.py +++ b/spafe/frequencies/dominant_frequencies.py @@ -53,7 +53,6 @@ def get_dominant_frequencies(sig, w, h = scipy.signal.freqs(b, a, len(sig)) sig = scipy.signal.lfilter(b, a, sig) - # -> framing frames, frame_length = framing(sig=sig, fs=fs, diff --git a/spafe/utils/exceptions.py b/spafe/utils/exceptions.py index c0974ca..3061668 100644 --- a/spafe/utils/exceptions.py +++ b/spafe/utils/exceptions.py @@ -7,7 +7,8 @@ "high_freq": "maximum frequency cannot be greater than half sampling frequency.", "nfft": "size of the FFT must be an integer.", - "nfilts": "number of filters must be bigger than number of cepstrums" + "nfilts": "number of filters must be bigger than number of cepstrums", + "win_len_win_hop_comparison": "window's length has to be larger than the window's hop" } diff --git a/spafe/utils/preprocessing.py b/spafe/utils/preprocessing.py index 5de0fb2..5bc7a5c 100644 --- a/spafe/utils/preprocessing.py +++ b/spafe/utils/preprocessing.py @@ -1,4 +1,7 @@ import numpy as np +import scipy.ndimage +from spafe.utils.spectral import rfft +from .exceptions import ParameterError, ErrorMsgs def zero_handling(x): @@ -29,54 +32,61 @@ def pre_emphasis(sig, pre_emph_coeff=0.97): return np.append(sig[0], sig[1:] - pre_emph_coeff * sig[:-1]) +def stride_trick(a, stride_length, stride_step): + """ + apply framing using the stride trick from numpy. + + Args: + a (array) : signal array. + stride_length (int) : length of the stride. + stride_step (int) : stride step. + + Returns: + blocked/framed array. + """ + nrows = ((a.size - stride_length) // stride_step) + 1 + n = a.strides[0] + return np.lib.stride_tricks.as_strided(a, + shape=(nrows, stride_length), + strides=(stride_step*n, n)) + + def framing(sig, fs=16000, win_len=0.025, win_hop=0.01): """ - transform a signal into a series of overlapping frames. + transform a signal into a series of overlapping frames (=Frame blocking). Args: - sig (array) : a mono audio signal (Nx1) from which to compute features. - fs (int) : the sampling frequency of the signal we are working with. - Default is 16000. - win_len (float) : window length in sec. - Default is 0.025. - win_hop (float) : step between successive windows in sec. - Default is 0.01. + sig (array) : a mono audio signal (Nx1) from which to compute features. + fs (int) : the sampling frequency of the signal we are working with. + Default is 16000. + win_len (float) : window length in sec. + Default is 0.025. + win_hop (float) : step between successive windows in sec. + Default is 0.01. Returns: array of frames. frame length. + + Notes: + ------ + Uses the stride trick to accelerate the processing. """ + # run checks and assertions + if win_len < win_hop: + raise ParameterError(ErrorMsgs["win_len_win_hop_comparison"]) + # compute frame length and frame step (convert from seconds to samples) frame_length = win_len * fs frame_step = win_hop * fs signal_length = len(sig) frames_overlap = frame_length - frame_step - - # Make sure that we have at least 1 frame+ - num_frames = np.abs(signal_length - frames_overlap) // np.abs(frame_length - frames_overlap) - rest_samples = np.abs(signal_length - frames_overlap) % np.abs(frame_length - frames_overlap) - - # Pad Signal to make sure that all frames have equal number of samples - # without truncating any samples from the original signal - if rest_samples != 0: - pad_signal_length = int(frame_step - rest_samples) - z = np.zeros((pad_signal_length)) - pad_signal = np.append(sig, z) - num_frames += 1 - else: - pad_signal = sig - + # make sure to use integers as indices - frame_length = int(frame_length) - frame_step = int(frame_step) - num_frames = int(num_frames) - - # compute indices - idx1 = np.tile(np.arange(0, frame_length), (num_frames, 1)) - idx2 = np.tile(np.arange(0, num_frames * frame_step, frame_step), - (frame_length, 1)).T - indices = idx1 + idx2 - frames = pad_signal[indices.astype(np.int32, copy=False)] + frames = stride_trick(sig, int(frame_length), int(frame_step)) + if len(frames[-1]) < frame_length: + frames[-1] = np.append(frames[-1], np.array([0]*(frame_length - len(frames[0])))) + return frames, frame_length @@ -93,14 +103,42 @@ def windowing(frames, frame_len, win_type="hamming", beta=14): Returns: windowed frames. """ - if win_type == "hamming": - frames *= np.hamming(frame_len) - if win_type == "hanning": - frames *= np.hanning(frame_len) - if win_type == "bartlet": - frames *= np.bartlett(frame_len) - if win_type == "kaiser": - frames *= np.kaiser(frame_len, beta) - if win_type == "blackman": - frames *= np.blackman(frame_len) - return frames + if win_type == "hamming" : windows = np.hamming(frame_len) + elif win_type == "hanning" : windows = np.hanning(frame_len) + elif win_type == "bartlet" : windows = np.bartlett(frame_len) + elif win_type == "kaiser" : windows = np.kaiser(frame_len, beta) + elif win_type == "blackman": windows = np.blackman(frame_len) + windowed_frames = frames * windows + return windowed_frames + + +def remove_silence(sig, fs, win_len=0.25, win_hop=0.25, threshold=-35): + """ + generate and apply a window function to avoid spectral leakage. + + Args: + frames (array) : array including the overlapping frames. + frame_len (int) : frame length. + win_type (str) : type of window to use. + Default is "hamming" + + Returns: + windowed frames. + """ + # framing + frames, frames_len = framing(sig=sig, fs=fs, win_len=win_len, win_hop=win_hop) + + # compute short time energies to get voiced frames + amplitudes = np.abs(rfft(frames, len(frames))) + energy = np.sum(amplitudes, axis=-1) / len(frames)**2 + energy = 10 * np.log10(zero_handling(energy)) + + # normalize energy to 0 dB then filter and format + energy = energy - energy.max() + energy = scipy.ndimage.filters.median_filter(energy, 5) + energy = np.repeat(energy, frames_len) + + # compute vad and get speech frames + vad = np.array(energy > threshold, dtype=sig.dtype) + vframes = np.array(frames.flatten()[np.where(vad==1)], dtype=sig.dtype) + return energy, vad, np.array(vframes, dtype=np.float64) diff --git a/spafe/utils/vis.py b/spafe/utils/vis.py index 927febf..59c5e19 100644 --- a/spafe/utils/vis.py +++ b/spafe/utils/vis.py @@ -1,3 +1,4 @@ +import matplotlib import matplotlib.pyplot as plt @@ -72,3 +73,32 @@ def spectogram(sig, fs): plt.xlabel("Time (s)") plt.show(block=False) plt.close() + + +def multi_plots(data, fs, plot_rows, step, + colors=["b", "r", "m", "g", "b", "y", "r-."]): + """ + Generate multiple plots related to same signal in one figure. + + Args: + data (array) : array of arrays to plot. + fs (int) : the sampling frequency of the signal we are working with. + plot_rows (int) : number of rows to plot. + step (int) : array reading step. + colors (list) : list of colors for the plots. + """ + + font = {'family' : 'normal', + 'size' : 7} + matplotlib.rc('font', **font) + + fig = plt.figure(figsize=(50, 25)) + plt.subplots(plot_rows, 1) + plt.subplots_adjust(left=0.125, right=0.9, bottom=0.1, top=0.9, + wspace=0.25, hspace=0.75) + + for i in range(plot_rows): + plt.subplot(plot_rows, 1, i+1) + y = data[i] + plt.plot([i/fs for i in range(0, len(y), step)], y, colors[i]) + plt.show()