Update Framing and add silence removing functions (VAD)

This covers: - Update framing func - Add silence remove func (= VAD) - Add muti plots func - Add error and handle bugs caused by the changes
SuperKogito · Jan 28, 2020 · 77931db · 77931db
1 parent 8ea6aff
commit 77931db
Show file tree

Hide file tree

Showing 5 changed files with 130 additions and 62 deletions.
diff --git a/spafe/features/spfeats.py b/spafe/features/spfeats.py
@@ -63,8 +63,8 @@ def compute_dom_freqs_and_mod_index(sig,
                                     lower_cutoff=50,
                                     upper_cutoff=3000,
                                     nfft=512,
-                                    win_len=3,
-                                    win_hop=1,
+                                    win_len=0.03,
+                                    win_hop=0.015,
                                     win_type='hamming',
                                     debug=False):
     """
@@ -239,22 +239,22 @@ def extract_feats(sig, fs, nfft=512):
     """
     # init features dictionary
     feats = {}
-    
+
     # compute the fft
     fourrier_transform = rfft(sig, nfft)
 
     # compute magnitude spectrum
     magnitude_spectrum = (1/nfft) * np.abs(fourrier_transform)
     power_spectrum = (1/nfft)**2 * magnitude_spectrum**2
-    
-    # get all frequncies and  only keep positive frequencies 
+
+    # get all frequncies and  only keep positive frequencies
     frequencies = np.fft.fftfreq(len(power_spectrum), 1 / fs)
     frequencies = frequencies[np.where(frequencies >= 0)] // 2 + 1
-    
+
     # keep only half of the spectra
     magnitude_spectrum = magnitude_spectrum[:len(frequencies)]
     power_spectrum = power_spectrum[:len(frequencies)]
-        
+
     # define amplitudes and spectrum
     spectrum = power_spectrum
     amplitudes = power_spectrum
@@ -263,7 +263,7 @@ def extract_feats(sig, fs, nfft=512):
     # general stats
     feats["duration"] = len(sig) / float(fs)
     feats["spectrum"] = spectrum
-    
+
     # spectral stats I
     feats["mean_frequency"] = frequencies.sum()
     feats["peak_frequency"] = frequencies[np.argmax(amplitudes)]
@@ -289,7 +289,7 @@ def extract_feats(sig, fs, nfft=512):
 
     # compute energy
     feats["energy"] = magnitude_spectrum
-    
+
     # compute root-mean-square (RMS).
     feats["rms"] = root_mean_square(sig=sig, fs=fs)
 
@@ -309,14 +309,14 @@ def extract_feats(sig, fs, nfft=512):
     feats["maxfun"] = fund_freqs.max()
 
     # assign dominant frequencies stats
-    dom_freqs, mod_idx = compute_dom_freqs_and_mod_index(sig=sig, 
+    dom_freqs, mod_idx = compute_dom_freqs_and_mod_index(sig=sig,
                                                          fs=fs,
-                                                         lower_cutoff = 50, 
-                                                         upper_cutoff = 3000, 
-                                                         nfft = 512, 
-                                                         win_len = 3, 
-                                                         win_hop = 1, 
-                                                         win_type = 'hamming', 
+                                                         lower_cutoff = 50,
+                                                         upper_cutoff = 3000,
+                                                         nfft = 512,
+                                                         win_len = 0.03,
+                                                         win_hop = 0.015,
+                                                         win_type = 'hamming',
                                                          debug = False)
     feats["meandom"] = dom_freqs.mean()
     feats["mindom"] = dom_freqs.min()

diff --git a/spafe/frequencies/dominant_frequencies.py b/spafe/frequencies/dominant_frequencies.py
@@ -53,7 +53,6 @@ def get_dominant_frequencies(sig,
         w, h = scipy.signal.freqs(b, a, len(sig))
         sig = scipy.signal.lfilter(b, a, sig)
 
-
     # -> framing
     frames, frame_length = framing(sig=sig,
                                    fs=fs,

diff --git a/spafe/utils/exceptions.py b/spafe/utils/exceptions.py
@@ -7,7 +7,8 @@
     "high_freq":
     "maximum frequency cannot be greater than half sampling frequency.",
     "nfft": "size of the FFT must be an integer.",
-    "nfilts": "number of filters must be bigger than number of cepstrums"
+    "nfilts": "number of filters must be bigger than number of cepstrums",
+    "win_len_win_hop_comparison": "window's length has to be larger than the window's hop"
 }
 
 

diff --git a/spafe/utils/preprocessing.py b/spafe/utils/preprocessing.py
@@ -1,4 +1,7 @@
 import numpy as np
+import scipy.ndimage
+from spafe.utils.spectral import rfft
+from .exceptions import ParameterError, ErrorMsgs
 
 
 def zero_handling(x):
@@ -29,54 +32,61 @@ def pre_emphasis(sig, pre_emph_coeff=0.97):
     return np.append(sig[0], sig[1:] - pre_emph_coeff * sig[:-1])
 
 
+def stride_trick(a, stride_length, stride_step):
+    """
+    apply framing using the stride trick from numpy.
+
+    Args:
+        a (array) : signal array.
+        stride_length (int) : length of the stride.
+        stride_step (int) : stride step.
+
+    Returns:
+        blocked/framed array.
+    """
+    nrows = ((a.size - stride_length) // stride_step) + 1
+    n = a.strides[0]
+    return np.lib.stride_tricks.as_strided(a,
+                                           shape=(nrows, stride_length),
+                                           strides=(stride_step*n, n))
+
+
 def framing(sig, fs=16000, win_len=0.025, win_hop=0.01):
     """
-    transform a signal into a series of overlapping frames.
+    transform a signal into a series of overlapping frames (=Frame blocking).
 
     Args:
-        sig            (array) : a mono audio signal (Nx1) from which to compute features.
-        fs               (int) : the sampling frequency of the signal we are working with.
-                                  Default is 16000.
-        win_len        (float) : window length in sec.
-                                  Default is 0.025.
-        win_hop        (float) : step between successive windows in sec.
-                                  Default is 0.01.
+        sig     (array) : a mono audio signal (Nx1) from which to compute features.
+        fs        (int) : the sampling frequency of the signal we are working with.
+                          Default is 16000.
+        win_len (float) : window length in sec.
+                          Default is 0.025.
+        win_hop (float) : step between successive windows in sec.
+                          Default is 0.01.
 
     Returns:
         array of frames.
         frame length.
+
+    Notes:
+    ------
+        Uses the stride trick to accelerate the processing.
     """
+    # run checks and assertions
+    if win_len < win_hop:
+        raise ParameterError(ErrorMsgs["win_len_win_hop_comparison"])
+
     # compute frame length and frame step (convert from seconds to samples)
     frame_length = win_len * fs
     frame_step = win_hop * fs
     signal_length = len(sig)
     frames_overlap = frame_length - frame_step
-
-    # Make sure that we have at least 1 frame+
-    num_frames = np.abs(signal_length - frames_overlap) // np.abs(frame_length - frames_overlap)
-    rest_samples = np.abs(signal_length - frames_overlap) % np.abs(frame_length - frames_overlap) 
-
-    # Pad Signal to make sure that all frames have equal number of samples
-    # without truncating any samples from the original signal
-    if rest_samples != 0:
-        pad_signal_length = int(frame_step - rest_samples)
-        z = np.zeros((pad_signal_length))
-        pad_signal = np.append(sig, z)
-        num_frames += 1 
-    else: 
-        pad_signal = sig
-
+
     # make sure to use integers as indices
-    frame_length = int(frame_length)
-    frame_step = int(frame_step)
-    num_frames = int(num_frames)
-
-    # compute indices
-    idx1 = np.tile(np.arange(0, frame_length), (num_frames, 1))
-    idx2 = np.tile(np.arange(0, num_frames * frame_step, frame_step),
-                    (frame_length, 1)).T
-    indices = idx1 + idx2
-    frames = pad_signal[indices.astype(np.int32, copy=False)]
+    frames = stride_trick(sig, int(frame_length), int(frame_step))
+    if len(frames[-1]) < frame_length:
+        frames[-1] = np.append(frames[-1], np.array([0]*(frame_length - len(frames[0]))))
+
     return frames, frame_length
 
 
@@ -93,14 +103,42 @@ def windowing(frames, frame_len, win_type="hamming", beta=14):
     Returns:
         windowed frames.
     """
-    if win_type == "hamming":
-        frames *= np.hamming(frame_len)
-    if win_type == "hanning":
-        frames *= np.hanning(frame_len)
-    if win_type == "bartlet":
-        frames *= np.bartlett(frame_len)
-    if win_type == "kaiser":
-        frames *= np.kaiser(frame_len, beta)
-    if win_type == "blackman":
-        frames *= np.blackman(frame_len)
-    return frames
+    if   win_type == "hamming" : windows = np.hamming(frame_len)
+    elif win_type == "hanning" : windows = np.hanning(frame_len)
+    elif win_type == "bartlet" : windows = np.bartlett(frame_len)
+    elif win_type == "kaiser"  : windows = np.kaiser(frame_len, beta)
+    elif win_type == "blackman": windows = np.blackman(frame_len)
+    windowed_frames = frames * windows
+    return windowed_frames
+
+
+def remove_silence(sig, fs, win_len=0.25, win_hop=0.25, threshold=-35):
+    """
+    generate and apply a window function to avoid spectral leakage.
+
+    Args:
+        frames  (array) : array including the overlapping frames.
+        frame_len (int) : frame length.
+        win_type  (str) : type of window to use.
+                          Default is "hamming"
+
+    Returns:
+        windowed frames.
+    """
+    # framing
+    frames, frames_len = framing(sig=sig, fs=fs, win_len=win_len, win_hop=win_hop)
+
+    # compute short time energies to get voiced frames
+    amplitudes = np.abs(rfft(frames, len(frames)))
+    energy =  np.sum(amplitudes, axis=-1) / len(frames)**2
+    energy =  10 * np.log10(zero_handling(energy))
+
+    # normalize energy to 0 dB then filter and format
+    energy = energy - energy.max()
+    energy = scipy.ndimage.filters.median_filter(energy, 5)
+    energy = np.repeat(energy, frames_len)
+
+    # compute vad and get speech frames
+    vad = np.array(energy > threshold, dtype=sig.dtype)
+    vframes = np.array(frames.flatten()[np.where(vad==1)], dtype=sig.dtype)
+    return energy, vad, np.array(vframes, dtype=np.float64)
diff --git a/spafe/utils/vis.py b/spafe/utils/vis.py
@@ -1,3 +1,4 @@
+import matplotlib
 import matplotlib.pyplot as plt
 
 
@@ -72,3 +73,32 @@ def spectogram(sig, fs):
     plt.xlabel("Time (s)")
     plt.show(block=False)
     plt.close()
+
+
+def multi_plots(data, fs, plot_rows, step,
+                colors=["b", "r", "m", "g", "b", "y", "r-."]):
+    """
+    Generate multiple plots related to same signal in one figure.
+
+    Args:
+        data    (array) : array of arrays to plot.
+        fs        (int) : the sampling frequency of the signal we are working with.
+        plot_rows (int) : number of rows to plot.
+        step      (int) : array reading step.
+        colors   (list) : list of colors for the plots.
+    """
+
+    font = {'family' : 'normal',
+            'size'   : 7}
+    matplotlib.rc('font', **font)
+
+    fig = plt.figure(figsize=(50, 25))
+    plt.subplots(plot_rows, 1)
+    plt.subplots_adjust(left=0.125, right=0.9, bottom=0.1, top=0.9,
+                        wspace=0.25, hspace=0.75)
+
+    for i in range(plot_rows):
+        plt.subplot(plot_rows, 1, i+1)
+        y = data[i]
+        plt.plot([i/fs for i in range(0, len(y), step)], y, colors[i])
+    plt.show()