added melSpectrogram

aclai-lab · Nov 30, 2023 · 82c7fd0 · 82c7fd0
1 parent ad2954c
commit 82c7fd0
Show file tree

Hide file tree

Showing 5 changed files with 358 additions and 69 deletions.
diff --git a/src/AbsoluteDistortion.jl b/src/AbsoluteDistortion.jl
@@ -0,0 +1,11 @@
+## main
+using PyCall
+librosa = pyimport("librosa")
+soundfile = pyimport("soundfile")
+
+sr_src = 48000
+x, sr = librosa.load("/media/paso/Paso Stick/Trombone Live_01.wav", sr=sr_src, mono=true)
+
+absX = abs.(x)
+
+soundfile.write("test1.wav",absX,samplerate=sr,subtype="PCM_24")
diff --git a/src/melSpectrogram.jl b/src/melSpectrogram.jl
@@ -0,0 +1,244 @@
+using FFTW
+using DSP
+
+include("mfccDataStructure.jl")
+include("windowing.jl")
+include("mfcc.jl")
+
+function getDefaultBandEdges()
+    # default band edges
+    factor = 133.33333333333333
+    bE = zeros(1, 42)
+    for i in 1:13
+        bE[i] = factor + (factor / 2) * (i - 1)
+    end
+    for i in 14:42
+        bE[i] = bE[i-1] * 1.0711703
+    end
+
+    return vec(bE)
+end # getDefaultBandEdges
+
+function getFFTLength( # switch case da ricordare!!!
+    sr::Int64,
+    FFTLength::Int64
+)
+    if (FFTLength == 0)
+        sr <= 4000 && return 128
+        sr <= 8000 && return 256
+        sr <= 16000 && return 512
+        return 1024
+    else
+        return FFTLength
+    end
+end # getFFTLength
+
+function hz2mel(
+    fRange::Vector{Float64},
+    melStyle::Symbol=:oshaughnessy # :oshaughnessy, :slaney
+)
+    if (melStyle == :oshaughnessy)
+        mel = 2595 * log10.(1 .+ fRange / 700)
+    else
+        # f0 = 0.0         slaney, DA FARE!
+        # fsp = 200 / 3
+        # brkfrq = 1000.0
+        # brkpt = (brkfrq - f0) / fsp
+        # logstep = log(6.4) / 27
+        # linpt = f < brkfrq
+        # z = linpt ? f / brkfrq / logstep : brkpt + log(f / brkfrq) / logstep
+    end
+    return mel
+end # hz2mel
+
+function mel2hz(
+    mel::LinRange{Float64,Int64},
+    melStyle::Symbol=:oshaughnessy # :oshaughnessy, :slaney
+)
+    if (melStyle == :oshaughnessy)
+        hz = 700 * (exp10.(mel / 2595) .- 1)
+    else
+        # f0 = 0.0         slaney, DA FARE!
+        # fsp = 200 / 3
+        # brkfrq = 1000.0
+        # brkpt = (brkfrq - f0) / fsp
+        # logstep = log(6.4) / 27
+        # linpt = z < brkpt
+        # f = linpt ? f0 + fsp * z : brkfrq * exp(logstep * (z - brkpt))
+    end
+    return hz
+end # mel2hz
+
+function designAuditoryFilterBank(
+    sr::Int64,
+    options::mfccSetup
+)
+    # Set the design domain
+    if (options.filterBankDesignDomain == :linear)
+        designDomain = :linear
+    else
+        designDomain = options.frequencyScale
+    end
+
+    # Set the default number of bands
+    if (options.frequencyScale == :erb)
+        # da impostare in caso di :erb (hz2erb)
+    else
+        numBands = options.numBands # da eliminare e usare solo options?
+    end
+
+    fRange = Float64.(options.frequencyRange)
+    sampleRate = Float64(sr)
+
+    if (options.frequencyScale == :erb)
+        # erb TODO
+    else
+        if (options.frequencyScale == :mel)
+            lRange = hz2mel(fRange, options.melStyle)
+            bandEdges = mel2hz(LinRange(lRange[1], lRange[end], numBands + 2), options.melStyle)
+        else
+            # bark TODO
+        end
+        ### punto di contatto con mfcc.jl ###
+        fbank, fC = designMelFilterBank(
+            sr,
+            bandEdges,
+            options.FFTLength,
+            1, # sumExponent
+            :none, # normalization
+            :hz, # designDomain
+            Float64,
+            true, # keepTwoSided
+            designDomain, # filterBankDesignDomain
+            options.melStyle
+        )
+        bank = fbank'
+        BW = bandEdges[3:end] - bandEdges[1:end-2]
+    end
+
+    # normalization (differente da quella di designMelFilterBank, da valutarne il perchè)
+    if (options.filterBankNormalization == :area)
+        # da fare
+    elseif (options.filterBankNormalization == :bandwidth)
+        # Weight by bandwidth
+        weightPerBand = BW / 2
+    else
+        # weightPerBand = ones(1,numBands)
+    end
+    for i = 1:Int(numBands)
+        if (weightPerBand[i] != 0)
+            bank[i, :] = bank[i, :] / weightPerBand[i]
+        end
+    end
+
+    if (options.oneSided)
+        select = getOnesidedFFTRange(options.FFTLength)
+        filterBank = bank[:,select]
+        if (options.frequencyScale==:erb)
+            # # Scale down DC and Nyquist
+            # # If the first bin is DC, halve it.
+            # filterBank(:,1) = 0.5*filterBank(:,1);
+            # # If the final bin is Nyquist, halve it.
+            # if rem(options.FFTLength,2) == 0
+            #     filterBank(:,end) = 0.5*filterBank(:,end);
+            # end
+            # % Scale up one-sided filter bank
+            # filterBank = 2*filterBank;
+        end
+    else
+        # scaleFactor = double(~strcmp(options.FrequencyScale,'erb')) + 1;
+        # filterBank = bank/scaleFactor;
+    end
+
+    return filterBank, fC, BW
+end # designAuditoryFilterBank
+
+function melSpectrogram(
+    x::AbstractArray{T},
+    sr::Int64; FFTLength::Int64=0, # deve essere 0, così viene automaticamente calcolata
+    numCoeffs::Int64=13, frequencyRange::Vector{Int64}=[0, Int(round(sr / 2))],
+    numBands::Int64=32,
+    filterBankNormalization::Symbol=:bandwidth,
+    oneSided::Bool=true,
+    filterBankDesignDomain::Symbol=:linear,
+    frequencyScale=:mel,
+    spectrumType::Symbol=:power,
+    melStyle::Symbol=:oshaughnessy, deltaWindowLength::Int64=9,
+    rectification::Symbol=:log,
+    logEnergy::Symbol=:append
+) where {T<:AbstractFloat}
+
+    x = convert.(Float64, x) # mantiene compatibilità con Matlab che scricchiola con Float32, da verificare se è il caso.
+
+    options = mfccSetup(
+        numCoeffs=numCoeffs,
+        bandEdges=getDefaultBandEdges(), frequencyRange=frequencyRange,
+        numBands=numBands,
+        filterBankNormalization=filterBankNormalization,
+        frequencyScale=frequencyScale,
+        oneSided=oneSided,
+        filterBankDesignDomain=filterBankDesignDomain,
+        spectrumType=spectrumType,
+        melStyle=melStyle, deltaWindowLength=deltaWindowLength,
+        rectification=rectification,
+        logEnergy=logEnergy
+    )
+
+    # setup ufficiale
+    # options.FFTLength = getFFTLength(sr, FFTLength)
+    # options.window, options.overlapLength = gencoswin(:hann, options.FFTLength, :symmetric)
+    # options.windowLength = size(options.window, 1)
+
+    # setup di debug
+    options.overlapLength = Int(round(0.02 * sr))
+    options.window, oll = gencoswin(:hamming, Int(round(0.03 * sr)), :periodic)
+    options.windowLength = length(options.window)
+    options.FFTLength = size(options.window, 1) # da verificare se è meglio della mia idea
+
+    filterBank, F, BW = designAuditoryFilterBank(sr, options)
+
+    if (options.windowNormalization)
+        if(options.spectrumType==:power)
+            scaleFactor = sqrt(0.5*sum(options.window)^2)
+        else
+            scaleFactor = 0.5*sum(options.window)
+        end
+        win = options.window / scaleFactor
+    end
+
+    N = size(win, 1)
+    hopLength = N - options.overlapLength
+    # Buffer into frames.
+    y = buffer(x,N,hopLength)
+    # apply window
+    Y = fft(y .* win, (1,))
+    # Convert to one-sided FFT
+    binHigh = Int(floor(options.FFTLength/2 + 1))
+    X = Y[1:binHigh,:]
+    if (options.spectrumType==:power)
+        Z = real((X .* conj(X)))
+    else
+        # Z = abs.(X)
+    end
+    # apply filterbank
+    P = filterBank * Z
+    # S = reshape(P, size(P,1), Int(round(size(P,2) / size(x,2))), size(x,2))
+
+    nRow = size(x,1)
+    numHops = Int(floor((nRow-N)/hopLength) + 1)
+    t = (collect(0:(numHops-1)) * hopLength .+ N/2) / sr
+
+    return P, F, t
+
+end # melSpectrogram
+
+# Main debug
+# if (!@isdefined(x)) # carica solo se non è definita mono, serve giusto per debug
+#     # cosi ogni volta che lancio non ricarica tutto
+#     using PyCall
+#     librosa = pyimport("librosa")
+#     sr_src = 8000
+#     x, sr = librosa.load("/home/riccardopasini/Documents/Aclai/Julia_additional_files/test.wav", sr=sr_src, mono=true)
+# end
+
+# S, cF, t = melSpectrogram(x, sr)