From dc1c88c7594e9f20f40af5950e87d61f55772b42 Mon Sep 17 00:00:00 2001 From: PasoStudio73 Date: Fri, 15 Dec 2023 23:26:06 +0100 Subject: [PATCH] adapted to audio feature extraction --- src/featuresExtractor.jl | 168 +++++++++++----- src/fft.jl | 136 +++++++------ src/mel.jl | 388 ++++++++++++++++++++----------------- src/signalDataStructure.jl | 93 +++++---- src/spectral.jl | 87 ++++++--- 5 files changed, 513 insertions(+), 359 deletions(-) diff --git a/src/featuresExtractor.jl b/src/featuresExtractor.jl index 460a371..d96042c 100644 --- a/src/featuresExtractor.jl +++ b/src/featuresExtractor.jl @@ -6,75 +6,143 @@ include("spectral.jl") function audioFeaturesExtraction( x::AbstractArray{T}, sr::Int64; - #### define audio objects #### - # user defined options - frequency_range::Vector{Int64}=[0, Int(round(sr / 2))], - numCoeffs::Int=13, - melStyle::Symbol=:htk, - numBands::Int=32, - spectrum_type::Symbol=:power, - filterbank_design_domain::Symbol=:linear, - filterBankNormalization::Symbol=:bandwidth, + # default options + # fft window_type::Symbol=:hann, window_length::Int=Int(round(0.03 * sr)), overlap_length::Int=Int(round(0.02 * sr)), + + # mel + num_bands::Int=32, + mel_style::Symbol=:slaney, # :htk, :slaney + frequency_range::Vector{Int64}=[0, Int(round(sr / 2))], + filterbank_normalization::Symbol=:bandwidth, + spectrum_type::Symbol=:power, + + # mfcc + num_coeffs::Int=13, rectification::Symbol=:log, - logEnergyPos::Symbol=:append, - deltaWindowLength::Int=9 + log_energy_pos::Symbol=:append, + delta_window_length::Int=9, + + # filterbank_design_domain::Symbol=:linear, # settato, ma si usa? + # windowNormalization::Bool=true, # settato, ma si usa? + # oneSided::Bool=true # default, non viene parametrizzato ) where {T<:AbstractFloat} - # options and data structures definition - options = signalSetup( + # setup and data structures definition + setup = signal_setup( sr=sr, - frequency_range=Float64.(frequency_range), - numCoeffs=numCoeffs, - melStyle=melStyle, - numBands=numBands, - spectrum_type=spectrum_type, - filterbank_design_domain=filterbank_design_domain, - filterBankNormalization=filterBankNormalization, + + # fft window_type=window_type, window_length=window_length, overlap_length=overlap_length, + + # mel + num_bands=num_bands, + mel_style=mel_style, + frequency_range=Float64.(frequency_range), + filterbank_normalization=filterbank_normalization, + spectrum_type=spectrum_type, + + # mfcc + num_coeffs=num_coeffs, rectification=rectification, - logEnergyPos=logEnergyPos, - deltaWindowLength=deltaWindowLength + log_energy_pos=log_energy_pos, + delta_window_length=delta_window_length, + + # filterbank_design_domain=filterbank_design_domain, # settato, ma si usa? + # windowNormalization=windowNormalization, # settato, ma si usa? + # oneSided=oneSided # default, non viene parametrizzato ) - # normalize signal + # normalize signal ??? x = Float64.(x) x = x ./ maximum(abs.(x)) - data = signalData( + data = signal_data( x=x ) - takeFFT(data, options) - melSpectrogram(data, options) - mfcc(data, options) - spectral_features(data, options) - - hcat( - data.mel_spectrogram', - data.coeffs, - data.delta, - data.deltaDelta, - data.spectral_centroid, - data.spectral_crest, - data.spectral_flatness, - data.spectral_flux, - data.spectral_decrease, - data.spectral_kurtosis, - data.spectral_rolloff, - data.spectral_skewness, - data.spectral_slope, - data.spectral_spread + takeFFT(data, setup) + melSpectrogram(data, setup) + mfcc(data, setup) + spectral_features(data, setup) + + vcat( + data.mel_spectrogram, + data.mfcc_coeffs', + data.mfcc_delta', + data.mfcc_deltadelta', + data.spectral_centroid', + data.spectral_crest', + data.spectral_entropy', + data.spectral_flatness', + data.spectral_flux', + data.spectral_decrease', + data.spectral_kurtosis', + data.spectral_rolloff', + data.spectral_skewness', + data.spectral_slope', + data.spectral_spread' ) end -# debug -using PyCall -librosa = pyimport("librosa") -sr_src = 8000 -x, sr = librosa.load("/home/riccardopasini/Documents/Aclai/Julia_additional_files/test.wav", sr=sr_src, mono=true) +# # debug +# using PyCall +# librosa = pyimport("librosa") +# sr_src = 8000 +# x, sr = librosa.load("/home/riccardopasini/Documents/Aclai/Julia_additional_files/test.wav", sr=sr_src, mono=true) + +# # fft +# window_type = :hann +# window_length = Int(round(0.03 * sr)) +# overlap_length = Int(round(0.02 * sr)) + +# # mel +# num_bands = 32 +# mel_style = :slaney +# frequency_range = [0, Int(round(sr / 2))] +# filterbank_normalization = :bandwidth +# spectrum_type = :power + +# #mfcc +# num_coeffs = 13 +# rectification = :log +# log_energy_pos = :append +# delta_window_length = 9 + +# # filterbank_design_domain = :linear +# # windowNormalization = true +# # oneSided = true + +# # options and data structures definition +# setup = signal_setup( +# sr=sr, + +# # fft +# window_type=window_type, +# window_length=window_length, +# overlap_length=overlap_length, + +# # mel +# num_bands=num_bands, +# mel_style=mel_style, +# frequency_range=Float64.(frequency_range), +# filterbank_normalization=filterbank_normalization, +# spectrum_type=spectrum_type, + +# # mfcc +# num_coeffs=num_coeffs, +# rectification=rectification, +# log_energy_pos=log_energy_pos, +# delta_window_length=delta_window_length, + +# # filterbank_design_domain=filterbank_design_domain, +# # windowNormalization=windowNormalization, +# # oneSided=oneSided +# ) -audioFeaturesExtraction(x, sr) \ No newline at end of file +# data = signal_data( +# x=Float64.(x) +# ) \ No newline at end of file diff --git a/src/fft.jl b/src/fft.jl index dd26714..b20ea1c 100644 --- a/src/fft.jl +++ b/src/fft.jl @@ -12,103 +12,117 @@ function getFFTLength( # switch case da ricordare!!! return 1024 end # getFFTLength -function getOnesidedFFTRange(FFT_length::Int64) - - if mod(FFT_length, 2) == 0 - return collect(1:Int(FFT_length / 2 + 1)) # EVEN +function get_onesided_fft_range(fft_length::Int64) + if mod(fft_length, 2) == 0 + return collect(1:Int(fft_length / 2 + 1)) # EVEN else - return collect(1:Int((FFT_length + 1) / 2)) # ODD + return collect(1:Int((fft_length + 1) / 2)) # ODD end -end # getOnesidedFFTRange +end # get_onesided_fft_range function takeFFT( x::AbstractArray{Float64}, sr::Int64; - FFT_length::Int64=256, - windowLength::Int64, - overlap_length::Int64 + fft_length::Int64=256, + window_type::Symbol=:hann, + window_length::Int64=Int(round(0.03 * sr)), + overlap_length::Int64=Int(round(0.02 * sr)), + # windowNormalization::Bool=true, + # oneSided::Bool=true ) - # options and data structures definition - options = signalSetup( + # setup and data structures definition + setup = signal_Setup( sr=sr, - oneSided=false, - FFT_length=FFT_length, - windowLength=windowLength, - overlap_length=overlap_length + fft_length=fft_length, + window_type=window_type, + window_length=window_length, + overlap_length=overlap_length, + # windowNormalization=windowNormalization, + # oneSided=oneSided ) - data = signalData( + data = signal_data( x=Float64.(x) ) - takeFFT(data, options) + takeFFT(data, setup) end # takeFFT(kwarg...) function takeFFT( - data::signalData, - options::signalSetup + data::signal_data, + setup::signal_setup ) - options.FFT_length = options.window_length - data.window, unused = gencoswin(options.window_type, options.window_length, :symmetric) + setup.fft_length = setup.window_length # definisce la fft pari alla finestra + hop_length = setup.window_length - setup.overlap_length + data.fft_window, unused = gencoswin(setup.window_type, setup.window_length, :symmetric) - hopLength = options.window_length - options.overlap_length + # da audio feature extracion forse da inserire + ossb = get_onesided_fft_range(setup.fft_length) + logical_ossb = falses(setup.fft_length) + logical_ossb[ossb] .= true - # if (options.windowNormalization) - # options.spectrum_type == :power ? options.scale_factor = 1 / (sum(data.window)^2) : options.scale_factor = 0.5 * sum(data.window) - # data.window = data.window * options.scale_factor - # end + y = buffer(data.x, setup.window_length, hop_length) + yw = y .* data.fft_window - # if (options.windowNormalization) - # options.spectrum_type == :power ? options.scale_factor = sqrt(0.5 * sum(data.window)^2) : options.scale_factor = 0.5 * sum(data.window) - # data.window = data.window / options.scale_factor - # end + # apply window + # data.fft = fft(y .* data.window, (1,)) + Z = fft(y .* data.fft_window, (1,)) + Z = Z[logical_ossb, :] # one sided - y = buffer(data.x, options.window_length, hopLength) + setup.spectrum_type == :power ? data.fft = real(Z.*conj(Z)) : data.fft = abs.(Z) # log energy E = sum(eachrow(y .^ 2)) # somma per righe E[E.==0] .= floatmin(Float64) # il minimo float al posto di zero - data.logEnergy = log.(E) - # apply window - data.fft = (fft(y .* data.window, (1,))) + data.log_energy = log.(E) + + # if (setup.windowNormalization) + # setup.spectrum_type == :power ? setup.scale_factor = 1 / (sum(data.fft_window)^2) : setup.scale_factor = 0.5 * sum(data.fft_window) + # data.fft_window = data.fft_window * setup.scale_factor + # end - # Convert to one-sided FFT - # if (options.oneSided) - # binHigh = Int(floor(options.FFT_length / 2 + 1)) + # if (setup.windowNormalization) + # setup.spectrum_type == :power ? setup.scale_factor = sqrt(0.5 * sum(data.fft_window)^2) : setup.scale_factor = 0.5 * sum(data.fft_window) + # data.fft_window = data.fft_window / setup.scale_factor + # end + + # # Convert to one-sided FFT + # if (setup.oneSided) + # binHigh = Int(floor(setup.fft_length / 2 + 1)) # X = data.fft[1:binHigh, :] - # if (options.spectrum_type == :power) + # if (setup.spectrum_type == :power) # data.fft = (X .* conj(X)) # else # data.fft = abs.(X) # end # end - # trim to desired range - binLow = Int(ceil(options.frequency_range[1] * options.FFT_length / options.sr + 1)) - binHigh = Int(floor(options.frequency_range[2] * options.FFT_length / options.sr + 1)) + # # trim to desired range + binLow = Int(ceil(setup.frequency_range[1] * setup.fft_length / setup.sr + 1)) + binHigh = Int(floor(setup.frequency_range[2] * setup.fft_length / setup.sr + 1)) bins = binLow:binHigh - data.fft = data.fft[bins, :] - # convert to half-sided magnitude or power spectrum - if (options.spectrum_type == :power) - data.fft = data.fft .* conj(data.fft) ./ (0.5 * sum(data.window)^2) - else # Magnitude - data.fft = abs.(data.fft) ./ (0.5 * sum(data.window)) - end - # if the first bin is DC, halve it. - if (binLow == 1) - data.fft[1, :] = 0.5 * data.fft[1, :] - end + # data.fft = data.fft[bins, :] + # # convert to half-sided magnitude or power spectrum + # if (setup.spectrum_type == :power) + # data.fft = data.fft .* conj(data.fft) ./ (0.5 * sum(data.window)^2) + # else # Magnitude + # data.fft = abs.(data.fft) ./ (0.5 * sum(data.window)) + # end + # # if the first bin is DC, halve it. + # if (binLow == 1) + # data.fft[1, :] = 0.5 * data.fft[1, :] + # end - # if the final bin is Nyquist, and FFTLength is even, halve it. - if (binHigh == floor(options.FFT_length / 2 + 1) && rem(options.FFT_length, 2) == 0) - data.fft[end, :] = 0.5 * data.fft[end, :] - end + # # if the final bin is Nyquist, and FFTLength is even, halve it. + # if (binHigh == floor(setup.fft_length / 2 + 1) && rem(setup.fft_length, 2) == 0) + # data.fft[end, :] = 0.5 * data.fft[end, :] + # end # create frequency vector - w = ((options.sr / options.FFT_length) .* (collect(bins) .- 1)) + w = ((setup.sr / setup.fft_length) .* (collect(bins) .- 1)) # shift final bin if fftLength is odd and the final range is full to fs/2. - if (rem(options.FFT_length, 2) == 1 && binHigh == floor(options.FFT_length / 2 + 1)) - w[end] = options.sr * (options.FFT_length - 1) / (2 * options.FFT_length) + if (rem(setup.fft_length, 2) == 1 && binHigh == floor(setup.fft_length / 2 + 1)) + w[end] = setup.sr * (setup.fft_length - 1) / (2 * setup.fft_length) end - data.frequency_vector = w[:] -end # takeFFT(data, options) \ No newline at end of file + data.fft_frequencies = w[:] +end # takeFFT(data, setup) \ No newline at end of file diff --git a/src/mel.jl b/src/mel.jl index 9d964e7..55e6649 100644 --- a/src/mel.jl +++ b/src/mel.jl @@ -3,25 +3,25 @@ using DSP include("signalDataStructure.jl") include("fft.jl") -function getDefaultBandEdges() - # default band edges - factor = 133.33333333333333 - bE = zeros(1, 42) - for i in 1:13 - bE[i] = factor + (factor / 2) * (i - 1) - end - for i in 14:42 - bE[i] = bE[i-1] * 1.0711703 - end - - return vec(bE) -end # getDefaultBandEdges +# function getDefaultBandEdges() +# # default band edges +# factor = 133.33333333333333 +# bE = zeros(1, 42) +# for i in 1:13 +# bE[i] = factor + (factor / 2) * (i - 1) +# end +# for i in 14:42 +# bE[i] = bE[i-1] * 1.0711703 +# end + +# return vec(bE) +# end # getDefaultBandEdges function hz2mel( hz::Vector{Float64}, - melStyle::Symbol=:htk # :htk, :slaney + mel_style::Symbol=:htk # :htk, :slaney ) - if (melStyle == :htk) + if (mel_style == :htk) mel = 2595 * log10.(1 .+ hz / 700) else # slaney linStep = 200 / 3 @@ -38,9 +38,9 @@ end # hz2mel function mel2hz( mel::LinRange{Float64,Int64}, - melStyle::Symbol=:htk # :htk, :slaney + mel_style::Symbol=:htk # :htk, :slaney ) - if (melStyle == :htk) + if (mel_style == :htk) hz = 700 * (exp10.(mel / 2595) .- 1) else linStep = 200 / 3 @@ -55,77 +55,62 @@ function mel2hz( return hz end # mel2hz -function designMelFilterBank( - sr::Int64; - frequencyScale::Symbol=:mel, - FFT_length::Int64=256, - numBands::Int64=32, - frequency_range::Vector{Int64}=[0, Int(round(sr / 2))], - filterBankNormalization::Symbol=:bandwidth, - filterbank_design_domain::Symbol=:linear, - melStyle::Symbol=:htk, -) - - # options and data structures definition - options = signalSetup( - sr=sr, - frequencyScale=frequencyScale, - FFT_length=FFT_length, - numBands=numBands, - frequency_range=Float64.(frequency_range), - melStyle=melStyle, - filterbank_design_domain=filterbank_design_domain, - filterBankNormalization=filterBankNormalization, - ) - - data = signalData() - - designMelFilterBank(data, options) +function normalization_factor(spectrum_type::Symbol, fft_window::Vector{Float64}) + if spectrum_type == :power + return 1 / (sum(fft_window)^2) + else + return 1 / sum(fft_window) + end end -function designMelFilterBank(data::signalData, options::signalSetup) +### da generalizzare per +### frequency_scale :mel, :bark, :erb +### filterbanl_design_domain :linear, :warped (da verificare se serve) +function designMelFilterBank(data::signal_data, setup::signal_setup) + # set the design domain ### da implementare in futuro + # if (setup.filterbank_design_domain == :linear) + # design_domain = :linear + # else + # design_domain = setup.frequency_scale + # end + design_domain = :linear + + # set the default number of bands + # da inserire il caso :erb e :bark + num_bands = setup.num_bands # compute band edges - numBands = options.numBands - if (options.melStyle == :slaney133) - options.bandEdges = getDefaultBandEdges() - numValidEdges = sum(options.bandEdges .<= options.sr / 2) - if (numValidEdges < length(options.bandEdges)) - edges = options.bandEdges[1:numValidEdges+1] - edges[end] = options.sr / 2 - options.bandEdges = edges - end - else # :htk, :slaney - melRange = hz2mel(options.frequency_range, options.melStyle) - options.bandEdges = mel2hz(LinRange(melRange[1], melRange[end], options.numBands + 2), options.melStyle) - end + # da inserire il caso :erb e :bark + melRange = hz2mel(setup.frequency_range, setup.mel_style) + setup.band_edges = mel2hz(LinRange(melRange[1], melRange[end], setup.num_bands + 2), setup.mel_style) - # Determine the number of bands - numEdges = length(options.bandEdges) - numBands = numEdges - 2 + ### parte esclusiva per mel filterbank si passa a file designmelfilterbank.m + # determine the number of bands + num_edges = length(setup.band_edges) + num_bands = num_edges - 2 - # Determine the number of valid bands - validNumEdges = sum((options.bandEdges .- (options.sr / 2)) .< sqrt(eps(Float64))) - validNumBands = validNumEdges - 2 + # determine the number of valid bands + valid_num_edges = sum((setup.band_edges .- (setup.sr / 2)) .< sqrt(eps(Float64))) + valid_num_bands = valid_num_edges - 2 - # Preallocate the filter bank - data.filterBank = zeros(Float64, options.FFT_length, Int(numBands)) - data.centerFrequencies = options.bandEdges[2:end-1] + # preallocate the filter bank + data.mel_filterbank = zeros(Float64, setup.fft_length, num_bands) + data.mel_frequencies = setup.band_edges[2:end-1] # Set this flag to true if the number of FFT length is insufficient to # compute the specified number of mel bands FFTLengthTooSmall = false - linFq = collect(0:options.FFT_length-1) / options.FFT_length * options.sr + linFq = collect(0:setup.fft_length-1) / setup.fft_length * setup.sr # Determine inflection points - @assert(validNumEdges <= numEdges) - p = zeros(Float64, validNumEdges, 1) + @assert(valid_num_edges <= num_edges) + p = zeros(Float64, valid_num_edges, 1) - for edgeNumber in 1:validNumEdges + for edge_n in 1:valid_num_edges for index in eachindex(linFq) - if linFq[index] > options.bandEdges[edgeNumber] - p[edgeNumber] = index + if linFq[index] > setup.band_edges[edge_n] + p[edge_n] = index break end end @@ -134,16 +119,16 @@ function designMelFilterBank(data::signalData, options::signalSetup) FqMod = linFq # Create triangular filters for each band - bw = diff(options.bandEdges) + bw = diff(setup.band_edges) - for k in 1:Int(validNumBands) + for k in 1:Int(valid_num_bands) # Rising side of triangle for j in Int(p[k]):Int(p[k+1])-1 - data.filterBank[j, k] = (FqMod[j] - options.bandEdges[k]) / bw[k] + data.mel_filterbank[j, k] = (FqMod[j] - setup.band_edges[k]) / bw[k] end # Falling side of triangle for j = Int(p[k+1]):Int(p[k+2])-1 - data.filterBank[j, k] = (options.bandEdges[k+2] - FqMod[j]) / bw[k+1] + data.mel_filterbank[j, k] = (setup.band_edges[k+2] - FqMod[j]) / bw[k+1] end emptyRange1 = p[k] .> p[k+1] - 1 emptyRange2 = p[k+1] .> p[k+2] - 1 @@ -151,28 +136,56 @@ function designMelFilterBank(data::signalData, options::signalSetup) FFTLengthTooSmall = true end end - data.filterBank = data.filterBank' - filterBandWidth = options.bandEdges[3:end] - options.bandEdges[1:end-2] - - # Apply normalization - weightPerBand = ones(Float64, 1, Int(numBands)) - if (options.filterBankNormalization == :area) - # da fare - elseif (options.filterBankNormalization == :bandwidth) - # Weight by bandwidth - weightPerBand = filterBandWidth / 2 + # apply normalization ### non viene fatta qui! + # weight_per_band = ones(Float64, 1, Int(num_bands)) + # if (setup.filterbank_normalization == :area) + # # da fare + # elseif (setup.filterbank_normalization == :bandwidth) + # # weight by bandwidth + # filter_bandwidth = setup.band_edges[3:end] - setup.band_edges[1:end-2] + # weight_per_band = filter_bandwidth / 2 + # end + # for i = 1:Int(num_bands) + # if (weightPerBand[i] != 0) + # data.mel_filterbank[i, :] = data.mel_filterbank[i, :] / weightPerBand[i] + # end + # end + + # take one side ### messo di default! + range = get_onesided_fft_range(setup.fft_length) + range = range[2:end] + data.mel_filterbank[end:-1:end-length(range)+1, :] = data.mel_filterbank[range, :] + + data.mel_filterbank = data.mel_filterbank' + + # normalizzazione + BW = setup.band_edges[3:end] - setup.band_edges[1:end-2] + + if (setup.filterbank_normalization == :area) + # DA FARE + elseif (setup.filterbank_normalization == :bandwidth) + weight_per_band = BW / 2 + else + weight_per_band = ones(1, num_bands) end - for i = 1:Int(numBands) - if (weightPerBand[i] != 0) - data.filterBank[i, :] = data.filterBank[i, :] / weightPerBand[i] + for i = 1:num_bands + if (weight_per_band[i] != 0) + data.mel_filterbank[i, :] = data.mel_filterbank[i, :] ./ weight_per_band[i] end end - # take one side - select = getOnesidedFFTRange(options.FFT_length) - data.filterBank = data.filterBank[:, select] + # if onesided, ma comunque lo รจ di default + range = get_onesided_fft_range(setup.fft_length) + data.mel_filterbank = data.mel_filterbank[:,range] + # manca la parte relativa a :erb e :bark + + # setta fattore di normalizzazione + win_norm_factor = normalization_factor(setup.spectrum_type, data.fft_window) + if (setup.spectrum_type == :power) + data.mel_filterbank = data.mel_filterbank * win_norm_factor + end end # function designMelFilterBank function createDCTmatrix( @@ -230,11 +243,11 @@ end # function cepstralCoefficients function audioDelta( x::AbstractMatrix{T}, - windowLength::Int64 + window_length::Int64 ) where {T<:AbstractFloat} DT = eltype(x) - M = convert(DT, floor(windowLength / 2)) + M = convert(DT, floor(window_length / 2)) b = collect(M:-1:-M) ./ sum((1:M) .^ 2) return delta = filt(b, 1, x) @@ -243,128 +256,164 @@ end function melSpectrogram( x::AbstractArray{T}, sr::Int64; + # default setup + # fft + window_type::Symbol=:hann, + window_length::Int=Int(round(0.03 * sr)), + overlap_length::Int=Int(round(0.02 * sr)), + + # mel + num_bands::Int=32, + mel_style::Symbol=:slaney, # :htk, :slaney frequency_range::Vector{Int64}=[0, Int(round(sr / 2))], - melStyle::Symbol=:htk, - numBands::Int64=32, + filterbank_normalization::Symbol=:bandwidth, spectrum_type::Symbol=:power, - filterbank_design_domain::Symbol=:linear, - filterBankNormalization::Symbol=:bandwidth, - window_type::Symbol=:hann, - window_length::Int64=Int(round(0.03 * sr)), - overlap_length::Int64=Int(round(0.02 * sr)), + + # filterbank_design_domain::Symbol=:linear, # settato, ma si usa? + # windowNormalization::Bool=true, # settato, ma si usa? + # oneSided::Bool=true # default, non viene parametrizzato ) where {T<:AbstractFloat} - # options and data structures definition - options = signalSetup( + # setup and data structures definition + setup = signal_setup( sr=sr, - frequency_range=Float64.(frequency_range), - melStyle=melStyle, - numBands=numBands, - spectrum_type=spectrum_type, - filterbank_design_domain=filterbank_design_domain, - filterBankNormalization=filterBankNormalization, + + # fft window_type=window_type, window_length=window_length, overlap_length=overlap_length, + + # mel + num_bands=num_bands, + mel_style=mel_style, + frequency_range=Float64.(frequency_range), + filterbank_normalization=filterbank_normalization, + spectrum_type=spectrum_type, + + # filterbank_design_domain=filterbank_design_domain, # settato, ma si usa? + # windowNormalization=windowNormalization, # settato, ma si usa? + # oneSided=oneSided # default, non viene parametrizzato ) - data = signalData( + data = signal_data( x=Float64.(x) ) - takeFFT(data, options) - melSpectrogram(data, options) + takeFFT(data, setup) + melSpectrogram(data, setup) end function melSpectrogram( - data::signalData, - options::signalSetup + data::signal_data, + setup::signal_setup ) - designMelFilterBank(data, options) + designMelFilterBank(data, setup) - # data.filterBank = data.filterBank * options.scale_factor + hop_length = setup.window_length - setup.overlap_length + num_hops = Int(floor((size(data.x, 1) - setup.window_length) / hop_length) + 1) # apply filterbank - data.mel_spectrogram = data.filterBank * abs.(data.fft) + if (setup.spectrum_type == :power) + data.mel_spectrogram = reshape(data.mel_filterbank * data.fft, setup.num_bands, num_hops) + else + # magnitude, da fare + end - window_length = size(data.window, 1) - hopLength = window_length - options.overlap_length - numHops = Int(floor((size(data.x, 1) - window_length) / hopLength) + 1) - t = (collect(0:(numHops-1)) * hopLength .+ size(data.window, 1) / 2) / sr + # t = (collect(0:(num_hops-1)) * hop_length .+ size(data.fft_window, 1) / 2) / sr - return data.mel_spectrogram, data.centerFrequencies, t + # return data.mel_spectrogram, data.centerFrequencies, t end # melSpectrogram function mfcc( x::AbstractArray{T}, sr::Int64; - numCoeffs::Int64=13, - frequency_range::Vector{Int64}=[0, Int(round(sr / 2))], - melStyle::Symbol=:slaney133, # :htk, :slaney, :slaney133 - numBands::Int64=32, - spectrum_type::Symbol=:power, # :power, :magnitude - filterbank_design_domain::Symbol=:linear, - filterBankNormalization::Symbol=:bandwidth, # :bandwidth, :area, :none + + # default setup + # fft window_type::Symbol=:hann, - window_length::Int64=Int(round(0.03 * sr)), - overlap_length::Int64=Int(round(0.02 * sr)), - rectification::Symbol=:log, # :log, :cubic - logEnergyPos::Symbol=:append, - deltaWindowLength::Int64=9 + window_length::Int=Int(round(0.03 * sr)), + overlap_length::Int=Int(round(0.02 * sr)), + + # mel + num_bands::Int=32, + mel_style::Symbol=:slaney, # :htk, :slaney + frequency_range::Vector{Int64}=[0, Int(round(sr / 2))], + filterbank_normalization::Symbol=:bandwidth, + spectrum_type::Symbol=:power, + + # mfcc + num_coeffs::Int=13, + rectification::Symbol=:log, + log_energy_pos::Symbol=:append, + delta_window_length::Int=9, + + # filterbank_design_domain::Symbol=:linear, # settato, ma si usa? + # windowNormalization::Bool=true, # settato, ma si usa? + # oneSided::Bool=true # default, non viene parametrizzato ) where {T<:AbstractFloat} - # options and data structures definition - options = signalSetup( + # setup and data structures definition + setup = signal_setup( sr=sr, - frequency_range=Float64.(frequency_range), - numCoeffs=numCoeffs, - melStyle=melStyle, - numBands=numBands, - spectrum_type=spectrum_type, - filterbank_design_domain=filterbank_design_domain, - filterBankNormalization=filterBankNormalization, + + # fft window_type=window_type, window_length=window_length, overlap_length=overlap_length, + + # mel + num_bands=num_bands, + mel_style=mel_style, + frequency_range=Float64.(frequency_range), + filterbank_normalization=filterbank_normalization, + spectrum_type=spectrum_type, + + # mfcc + num_coeffs=num_coeffs, rectification=rectification, - logEnergyPos=logEnergyPos, - deltaWindowLength=deltaWindowLength + log_energy_pos=log_energy_pos, + delta_window_length=delta_window_length, + + # filterbank_design_domain=filterbank_design_domain, # settato, ma si usa? + # windowNormalization=windowNormalization, # settato, ma si usa? + # oneSided=oneSided # default, non viene parametrizzato ) - data = signalData( + data = signal_data( x=Float64.(x) ) - takeFFT(data, options) - melSpectrogram(data, options) - mfcc(data, options) + takeFFT(data, setup) + melSpectrogram(data, setup) + mfcc(data, setup) end # mfcc function mfcc( - data::signalData, - options::signalSetup + data::signal_data, + setup::signal_setup ) + # Calculate cepstral coefficients - data.coeffs = cepstralCoefficients(data.mel_spectrogram, options.numCoeffs, options.rectification) + data.mfcc_coeffs = cepstralCoefficients(data.mel_spectrogram, setup.num_coeffs, setup.rectification) # place log energy - if (options.logEnergyPos == :append) - data.coeffs = hcat(data.coeffs, data.logEnergy) - elseif (options.logEnergyPos == :replace) + if (setup.log_energy_pos == :append) + data.mfcc_coeffs = hcat(data.mfcc_coeffs, data.log_energy) + elseif (setup.log_energy_pos == :replace) # data.coeffs = [logE.',data.coeffs(:,2:end)]; # da fare end # METTERE IL CASO CHE le delta vengono calcolate solo se necessario - data.delta = audioDelta(data.coeffs, options.deltaWindowLength) - data.deltaDelta = audioDelta(data.delta, options.deltaWindowLength) + data.mfcc_delta = audioDelta(data.mfcc_coeffs, setup.delta_window_length) + data.mfcc_deltadelta = audioDelta(data.mfcc_delta, setup.delta_window_length) - window_length = size(data.window, 1) - hopLength = window_length - options.overlap_length - numHops = Int(floor((size(data.x, 1) - window_length) / hopLength) + 1) - bandUpLimit = collect(0:(numHops-1)) * hopLength .+ window_length + # window_length = size(data.window, 1) + # hop_length = window_length - setup.overlap_length + # numHops = Int(floor((size(data.x, 1) - window_length) / hop_length) + 1) + # bandUpLimit = collect(0:(numHops-1)) * hop_length .+ window_length - return data.coeffs, data.delta, data.deltaDelta, bandUpLimit + # return data.mfcc_coeffs, data.mfcc_delta, data.mfcc_deltadelta, bandUpLimit end # # debug @@ -376,17 +425,4 @@ end # # mel spectrogram verificato # S, cF, t = melSpectrogram(x, sr, window_type=:hamming) # # mfcc verificata -# coeffs, delta, deltaDelta, loc = mfcc(x, sr, window_type=:hamming) - -# # feature audio extraction -# # l'audio viene normalizzato -# x = Float64.(x) -# x = x ./ maximum(abs.(x)) -# coeffs, delta, deltaDelta, loc = mfcc( -# x, -# sr, -# melStyle=:htk, -# window_length=320, -# overlap_length=240, -# logEnergyPos=:none -# ) \ No newline at end of file +# coeffs, delta, deltadelta, loc = mfcc(x, sr, window_type=:hamming) \ No newline at end of file diff --git a/src/signalDataStructure.jl b/src/signalDataStructure.jl index 1adcdd7..e9ee738 100644 --- a/src/signalDataStructure.jl +++ b/src/signalDataStructure.jl @@ -1,52 +1,63 @@ using Parameters -@with_kw mutable struct signalSetup +@with_kw mutable struct signal_setup sr::Int64 # fft - FFT_length::Int64=0 - window_type::Symbol=:hann - window_length::Int64=0 - overlap_length::Int64=0 - normalizationFactor::Float64=1.0 + fft_length::Int64 = 0 + window_type::Symbol = :hann + window_length::Int64 = 0 + overlap_length::Int64 = 0 + # window_normalization::Bool=true + # onesided::Bool=true + # normalizationFactor::Float64=1.0 + # mel - frequency_range::Vector{Float64}=[] - numCoeffs::Int64=13 - melStyle::Symbol=:htk # :htk, :slaney, :slaney133 - numBands::Int64=32 - spectrum_type::Symbol=:power # :power, :magnitude - scale_factor::Float64=1.0 - frequencyScale::Symbol=:mel - filterbank_design_domain::Symbol=:linear - filterBankNormalization::Symbol=:bandwidth # :bandwidth, :area, :none - rectification::Symbol=:log - logEnergyPos::Symbol=:append #:apend, :replace, :none - deltaWindowLength::Int64=9 - bandEdges::AbstractVector{AbstractFloat}=[] + num_bands::Int64 = 32 + mel_style::Symbol = :htk # :htk, :slaney, :slaney133 + frequency_range::Vector{Float64} = [] + band_edges::AbstractVector{AbstractFloat} = [] + filterbank_normalization::Symbol = :bandwidth # :bandwidth, :area, :none + spectrum_type::Symbol = :power # :power, :magnitude + # filterbank_design_domain::Symbol=:linear + # scale_factor::Float64=1.0 + # frequencyScale::Symbol=:mel + + # mfcc + num_coeffs::Int64 = 13 + rectification::Symbol = :log + log_energy_pos::Symbol = :append #:append, :replace, :none + delta_window_length::Int64 = 9 end -@with_kw mutable struct signalData - x::AbstractArray{Float64}=[] +@with_kw mutable struct signal_data + x::AbstractArray{Float64} = [] + # fft - fft::AbstractArray{Complex{Float64}}=[] - frequency_vector::Vector{Float64}=[] - window::Vector{Float64}=[] + fft::AbstractArray{Float64} = [] + fft_window::Vector{Float64} = [] + fft_frequencies::Vector{Float64} = [] + # mel - filterBank::AbstractArray{Float64}=[] - centerFrequencies::Vector{Float64}=[] - mel_spectrogram::AbstractArray{Float64}=[] - coeffs::AbstractArray{Float64}=[] - delta::AbstractArray{Float64}=[] - deltaDelta::AbstractArray{Float64}=[] - logEnergy::Vector{Float64}=[] + mel_filterbank::AbstractArray{Float64} = [] + mel_frequencies::Vector{Float64} = [] + mel_spectrogram::AbstractArray{Float64} = [] + + # mfcc + mfcc_coeffs::AbstractArray{Float64} = [] + mfcc_delta::AbstractArray{Float64} = [] + mfcc_deltadelta::AbstractArray{Float64} = [] + log_energy::Vector{Float64} = [] + # spectral - spectral_centroid::Vector{Float64}=[] - spectral_crest::Vector{Float64}=[] - spectral_flatness::Vector{Float64}=[] - spectral_flux::Vector{Float64}=[] - spectral_decrease::Vector{Float64}=[] - spectral_kurtosis::Vector{Float64}=[] - spectral_rolloff::Vector{Float64}=[] - spectral_skewness::Vector{Float64}=[] - spectral_slope::Vector{Float64}=[] - spectral_spread::Vector{Float64}=[] + spectral_centroid::Vector{Float64} = [] + spectral_crest::Vector{Float64} = [] + spectral_decrease::Vector{Float64} = [] + spectral_entropy::Vector{Float64} = [] + spectral_flatness::Vector{Float64} = [] + spectral_flux::Vector{Float64} = [] + spectral_kurtosis::Vector{Float64} = [] + spectral_rolloff::Vector{Float64} = [] + spectral_skewness::Vector{Float64} = [] + spectral_slope::Vector{Float64} = [] + spectral_spread::Vector{Float64} = [] end \ No newline at end of file diff --git a/src/spectral.jl b/src/spectral.jl index 73a3164..4cabddf 100644 --- a/src/spectral.jl +++ b/src/spectral.jl @@ -9,20 +9,24 @@ function spectral_features( window_length::Int64=Int(round(0.03 * sr)), overlap_length::Int64=Int(round(0.02 * sr)), frequency_range::Vector{Int64}=[0, Int(round(sr / 2))], - window_type::Symbol=:hann + window_type::Symbol=:hann, + # windowNormalization::Bool=true, + # oneSided::Bool=true ) where {T<:AbstractFloat} # options and data structures definition - options = signalSetup( + options = signal_setup( sr=sr, window_length=window_length, overlap_length=overlap_length, frequency_range=Float64.(frequency_range), window_type=window_type, spectrum_type=:power, + # windowNormalization=windowNormalization, + # oneSided=oneSided ) - data = signalData( + data = signal_data( x=Float64.(x) ) @@ -30,9 +34,10 @@ function spectral_features( spectral_features(data, options) end -function spectral_features(data::signalData, options::signalSetup) +function spectral_features(data::signal_data, options::signal_setup) spectral_centroid(data) spectral_crest(data) + spectral_entropy(data) spectral_flatness(data) spectral_flux(data) spectral_kurtosis(data) @@ -45,12 +50,12 @@ function spectral_features(data::signalData, options::signalSetup) spectral_slope(data) end -function spectral_centroid(data::signalData) +function spectral_centroid(data::signal_data) # calculate centroid - data.spectral_centroid = vec(sum(data.fft .* data.frequency_vector, dims=1) ./ sum(data.fft, dims=1)) + data.spectral_centroid = vec(sum(data.fft .* data.fft_frequencies, dims=1) ./ sum(data.fft, dims=1)) end -function spectral_crest(data::signalData) +function spectral_crest(data::signal_data) # calculate spectral mean m = sum(real(data.fft), dims=1) ./ size(data.fft, 1) # calculate spectral peak @@ -59,14 +64,21 @@ function spectral_crest(data::signalData) data.spectral_crest = vec(p ./ m) end -function spectral_flatness(data::signalData) +function spectral_entropy(data::signal_data) + # calculate entropy + X = data.fft ./ repeat(sum(data.fft, dims=1),size(data.fft, 1),1) + t = replace!(-sum(X.*log2.(X), dims=1), NaN=>0) + data.spectral_entropy = vec(real(t./log2(size(data.fft, 1)))) +end + +function spectral_flatness(data::signal_data) # calculate geometric mean, arithmetic mean, and flatness g = exp.(sum(log.(data.fft .+ eps(Float64)), dims=1) / size(data.fft, 1)) a = sum(data.fft, dims=1) ./ size(data.fft, 1) data.spectral_flatness = vec(real(g ./ a)) end -function spectral_flux(data::signalData) +function spectral_flux(data::signal_data) norm_type = 2 initial_condition = data.fft[:, 1] # calculate flux @@ -78,17 +90,17 @@ function spectral_flux(data::signalData) data.spectral_flux = fl end -function spectral_kurtosis(data::signalData) +function spectral_kurtosis(data::signal_data) # calculate centroid - centroid = vec(sum(data.fft .* data.frequency_vector, dims=1) ./ sum(data.fft, dims=1)) + centroid = vec(sum(data.fft .* data.fft_frequencies, dims=1) ./ sum(data.fft, dims=1)) # calculate spread - temp = (data.frequency_vector .- centroid') .^ 2 + temp = (data.fft_frequencies .- centroid') .^ 2 spread = sqrt.(sum((temp) .* data.fft, dims=1) ./ sum(data.fft, dims=1)) # calculate kurtosis data.spectral_kurtosis = vec(real(sum(((temp .^ 2) .* data.fft), dims=1) ./ ((spread .^ 4) .* sum(data.fft, dims=1)))) end -function spectral_rolloff(data::signalData) +function spectral_rolloff(data::signal_data) # calculate rolloff point threshold = 0.95 c = cumsum(data.fft, dims=1) @@ -97,43 +109,56 @@ function spectral_rolloff(data::signalData) for i in eachindex(d) idx[i] = findfirst(real(c[:, i]) .>= real(d[i])) end - data.spectral_rolloff = data.frequency_vector[Int.(idx)] + data.spectral_rolloff = data.fft_frequencies[Int.(idx)] end -function spectral_skewness(data::signalData) +function spectral_skewness(data::signal_data) # calculate centroid - centroid = vec(sum(data.fft .* data.frequency_vector, dims=1) ./ sum(data.fft, dims=1)) + centroid = vec(sum(data.fft .* data.fft_frequencies, dims=1) ./ sum(data.fft, dims=1)) # calculate spread - temp = (data.frequency_vector .- centroid') - spread = sqrt.(sum((temp) .* data.fft, dims=1) ./ sum(data.fft, dims=1)) + temp = (data.fft_frequencies .- centroid') + spread = sqrt.(sum((temp.^2) .* data.fft, dims=1) ./ sum(data.fft, dims=1)) # calculate skewness data.spectral_skewness = vec(real(sum((temp .^ 3) .* data.fft, dims=1) ./ ((spread .^ 3) .* sum(data.fft, dims=1)))) end -function spectral_spread(data::signalData) +function spectral_spread(data::signal_data) # calculate centroid - centroid = vec(sum(data.fft .* data.frequency_vector, dims=1) ./ sum(data.fft, dims=1)) - temp = (data.frequency_vector .- centroid') .^ 2 + centroid = vec(sum(data.fft .* data.fft_frequencies, dims=1) ./ sum(data.fft, dims=1)) + temp = (data.fft_frequencies .- centroid') .^ 2 spread = sqrt.(sum((temp) .* data.fft, dims=1) ./ sum(data.fft, dims=1)) data.spectral_spread = vec(real(spread)) end -function spectral_decrease(data::signalData) +function spectral_decrease(data::signal_data) # calculate decrease data.spectral_decrease = vec(real(sum((data.fft[2:end, :] .- data.fft[1, :]') ./ (1:size(data.fft, 1)-1), dims=1) ./ sum(data.fft[2:end, :], dims=1))) end -function spectral_slope(data::signalData) +function spectral_slope(data::signal_data) # calculate slope - f_minus_mu_f = data.frequency_vector .- sum(data.frequency_vector, dims=1) ./ size(data.fft, 1) + f_minus_mu_f = data.fft_frequencies .- sum(data.fft_frequencies, dims=1) ./ size(data.fft, 1) X_minus_mu_X = data.fft .- sum(data.fft, dims=1) ./ size(data.fft, 1) data.spectral_slope = vec(real(sum(X_minus_mu_X .* f_minus_mu_f, dims=1) ./ sum(f_minus_mu_f .^ 2))) end -# # debug -# using PyCall -# librosa = pyimport("librosa") -# sr_src = 8000 -# x, sr = librosa.load("/home/riccardopasini/Documents/Aclai/Julia_additional_files/test.wav", sr=sr_src, mono=true) - -# spectral_features(x, sr) \ No newline at end of file +# debug +using PyCall +librosa = pyimport("librosa") +sr_src = 8000 +x, sr = librosa.load("/home/riccardopasini/Documents/Aclai/Julia_additional_files/test.wav", sr=sr_src, mono=true) + +# options = signalSetup( +# sr=sr, +# window_length=Int(round(0.03 * sr)), +# overlap_length=Int(round(0.02 * sr)), +# frequency_range=Float64.([0, Int(round(sr / 2))]), +# window_type=:hann, +# spectrum_type=:power, +# ) +# data = signalData( +# x=Float64.(x) +# ) +# takeFFT(data, options) + +spectral_features(x, sr) \ No newline at end of file