Skip to content

Commit

Permalink
Update Framing and add silence removing functions (VAD)
Browse files Browse the repository at this point in the history
This covers:
- Update framing func
- Add silence remove func (= VAD)
- Add muti plots func
- Add error and handle bugs caused by the changes
  • Loading branch information
SuperKogito committed Jan 28, 2020
1 parent 8ea6aff commit 77931db
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 62 deletions.
32 changes: 16 additions & 16 deletions spafe/features/spfeats.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ def compute_dom_freqs_and_mod_index(sig,
lower_cutoff=50,
upper_cutoff=3000,
nfft=512,
win_len=3,
win_hop=1,
win_len=0.03,
win_hop=0.015,
win_type='hamming',
debug=False):
"""
Expand Down Expand Up @@ -239,22 +239,22 @@ def extract_feats(sig, fs, nfft=512):
"""
# init features dictionary
feats = {}

# compute the fft
fourrier_transform = rfft(sig, nfft)

# compute magnitude spectrum
magnitude_spectrum = (1/nfft) * np.abs(fourrier_transform)
power_spectrum = (1/nfft)**2 * magnitude_spectrum**2
# get all frequncies and only keep positive frequencies

# get all frequncies and only keep positive frequencies
frequencies = np.fft.fftfreq(len(power_spectrum), 1 / fs)
frequencies = frequencies[np.where(frequencies >= 0)] // 2 + 1

# keep only half of the spectra
magnitude_spectrum = magnitude_spectrum[:len(frequencies)]
power_spectrum = power_spectrum[:len(frequencies)]

# define amplitudes and spectrum
spectrum = power_spectrum
amplitudes = power_spectrum
Expand All @@ -263,7 +263,7 @@ def extract_feats(sig, fs, nfft=512):
# general stats
feats["duration"] = len(sig) / float(fs)
feats["spectrum"] = spectrum

# spectral stats I
feats["mean_frequency"] = frequencies.sum()
feats["peak_frequency"] = frequencies[np.argmax(amplitudes)]
Expand All @@ -289,7 +289,7 @@ def extract_feats(sig, fs, nfft=512):

# compute energy
feats["energy"] = magnitude_spectrum

# compute root-mean-square (RMS).
feats["rms"] = root_mean_square(sig=sig, fs=fs)

Expand All @@ -309,14 +309,14 @@ def extract_feats(sig, fs, nfft=512):
feats["maxfun"] = fund_freqs.max()

# assign dominant frequencies stats
dom_freqs, mod_idx = compute_dom_freqs_and_mod_index(sig=sig,
dom_freqs, mod_idx = compute_dom_freqs_and_mod_index(sig=sig,
fs=fs,
lower_cutoff = 50,
upper_cutoff = 3000,
nfft = 512,
win_len = 3,
win_hop = 1,
win_type = 'hamming',
lower_cutoff = 50,
upper_cutoff = 3000,
nfft = 512,
win_len = 0.03,
win_hop = 0.015,
win_type = 'hamming',
debug = False)
feats["meandom"] = dom_freqs.mean()
feats["mindom"] = dom_freqs.min()
Expand Down
1 change: 0 additions & 1 deletion spafe/frequencies/dominant_frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ def get_dominant_frequencies(sig,
w, h = scipy.signal.freqs(b, a, len(sig))
sig = scipy.signal.lfilter(b, a, sig)


# -> framing
frames, frame_length = framing(sig=sig,
fs=fs,
Expand Down
3 changes: 2 additions & 1 deletion spafe/utils/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
"high_freq":
"maximum frequency cannot be greater than half sampling frequency.",
"nfft": "size of the FFT must be an integer.",
"nfilts": "number of filters must be bigger than number of cepstrums"
"nfilts": "number of filters must be bigger than number of cepstrums",
"win_len_win_hop_comparison": "window's length has to be larger than the window's hop"
}


Expand Down
126 changes: 82 additions & 44 deletions spafe/utils/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import numpy as np
import scipy.ndimage
from spafe.utils.spectral import rfft
from .exceptions import ParameterError, ErrorMsgs


def zero_handling(x):
Expand Down Expand Up @@ -29,54 +32,61 @@ def pre_emphasis(sig, pre_emph_coeff=0.97):
return np.append(sig[0], sig[1:] - pre_emph_coeff * sig[:-1])


def stride_trick(a, stride_length, stride_step):
"""
apply framing using the stride trick from numpy.
Args:
a (array) : signal array.
stride_length (int) : length of the stride.
stride_step (int) : stride step.
Returns:
blocked/framed array.
"""
nrows = ((a.size - stride_length) // stride_step) + 1
n = a.strides[0]
return np.lib.stride_tricks.as_strided(a,
shape=(nrows, stride_length),
strides=(stride_step*n, n))


def framing(sig, fs=16000, win_len=0.025, win_hop=0.01):
"""
transform a signal into a series of overlapping frames.
transform a signal into a series of overlapping frames (=Frame blocking).
Args:
sig (array) : a mono audio signal (Nx1) from which to compute features.
fs (int) : the sampling frequency of the signal we are working with.
Default is 16000.
win_len (float) : window length in sec.
Default is 0.025.
win_hop (float) : step between successive windows in sec.
Default is 0.01.
sig (array) : a mono audio signal (Nx1) from which to compute features.
fs (int) : the sampling frequency of the signal we are working with.
Default is 16000.
win_len (float) : window length in sec.
Default is 0.025.
win_hop (float) : step between successive windows in sec.
Default is 0.01.
Returns:
array of frames.
frame length.
Notes:
------
Uses the stride trick to accelerate the processing.
"""
# run checks and assertions
if win_len < win_hop:
raise ParameterError(ErrorMsgs["win_len_win_hop_comparison"])

# compute frame length and frame step (convert from seconds to samples)
frame_length = win_len * fs
frame_step = win_hop * fs
signal_length = len(sig)
frames_overlap = frame_length - frame_step

# Make sure that we have at least 1 frame+
num_frames = np.abs(signal_length - frames_overlap) // np.abs(frame_length - frames_overlap)
rest_samples = np.abs(signal_length - frames_overlap) % np.abs(frame_length - frames_overlap)

# Pad Signal to make sure that all frames have equal number of samples
# without truncating any samples from the original signal
if rest_samples != 0:
pad_signal_length = int(frame_step - rest_samples)
z = np.zeros((pad_signal_length))
pad_signal = np.append(sig, z)
num_frames += 1
else:
pad_signal = sig


# make sure to use integers as indices
frame_length = int(frame_length)
frame_step = int(frame_step)
num_frames = int(num_frames)

# compute indices
idx1 = np.tile(np.arange(0, frame_length), (num_frames, 1))
idx2 = np.tile(np.arange(0, num_frames * frame_step, frame_step),
(frame_length, 1)).T
indices = idx1 + idx2
frames = pad_signal[indices.astype(np.int32, copy=False)]
frames = stride_trick(sig, int(frame_length), int(frame_step))
if len(frames[-1]) < frame_length:
frames[-1] = np.append(frames[-1], np.array([0]*(frame_length - len(frames[0]))))

return frames, frame_length


Expand All @@ -93,14 +103,42 @@ def windowing(frames, frame_len, win_type="hamming", beta=14):
Returns:
windowed frames.
"""
if win_type == "hamming":
frames *= np.hamming(frame_len)
if win_type == "hanning":
frames *= np.hanning(frame_len)
if win_type == "bartlet":
frames *= np.bartlett(frame_len)
if win_type == "kaiser":
frames *= np.kaiser(frame_len, beta)
if win_type == "blackman":
frames *= np.blackman(frame_len)
return frames
if win_type == "hamming" : windows = np.hamming(frame_len)
elif win_type == "hanning" : windows = np.hanning(frame_len)
elif win_type == "bartlet" : windows = np.bartlett(frame_len)
elif win_type == "kaiser" : windows = np.kaiser(frame_len, beta)
elif win_type == "blackman": windows = np.blackman(frame_len)
windowed_frames = frames * windows
return windowed_frames


def remove_silence(sig, fs, win_len=0.25, win_hop=0.25, threshold=-35):
"""
generate and apply a window function to avoid spectral leakage.
Args:
frames (array) : array including the overlapping frames.
frame_len (int) : frame length.
win_type (str) : type of window to use.
Default is "hamming"
Returns:
windowed frames.
"""
# framing
frames, frames_len = framing(sig=sig, fs=fs, win_len=win_len, win_hop=win_hop)

# compute short time energies to get voiced frames
amplitudes = np.abs(rfft(frames, len(frames)))
energy = np.sum(amplitudes, axis=-1) / len(frames)**2
energy = 10 * np.log10(zero_handling(energy))

# normalize energy to 0 dB then filter and format
energy = energy - energy.max()
energy = scipy.ndimage.filters.median_filter(energy, 5)
energy = np.repeat(energy, frames_len)

# compute vad and get speech frames
vad = np.array(energy > threshold, dtype=sig.dtype)
vframes = np.array(frames.flatten()[np.where(vad==1)], dtype=sig.dtype)
return energy, vad, np.array(vframes, dtype=np.float64)
30 changes: 30 additions & 0 deletions spafe/utils/vis.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import matplotlib
import matplotlib.pyplot as plt


Expand Down Expand Up @@ -72,3 +73,32 @@ def spectogram(sig, fs):
plt.xlabel("Time (s)")
plt.show(block=False)
plt.close()


def multi_plots(data, fs, plot_rows, step,
colors=["b", "r", "m", "g", "b", "y", "r-."]):
"""
Generate multiple plots related to same signal in one figure.
Args:
data (array) : array of arrays to plot.
fs (int) : the sampling frequency of the signal we are working with.
plot_rows (int) : number of rows to plot.
step (int) : array reading step.
colors (list) : list of colors for the plots.
"""

font = {'family' : 'normal',
'size' : 7}
matplotlib.rc('font', **font)

fig = plt.figure(figsize=(50, 25))
plt.subplots(plot_rows, 1)
plt.subplots_adjust(left=0.125, right=0.9, bottom=0.1, top=0.9,
wspace=0.25, hspace=0.75)

for i in range(plot_rows):
plt.subplot(plot_rows, 1, i+1)
y = data[i]
plt.plot([i/fs for i in range(0, len(y), step)], y, colors[i])
plt.show()

0 comments on commit 77931db

Please sign in to comment.