Skip to content

Commit

Permalink
added melSpectrogram
Browse files Browse the repository at this point in the history
  • Loading branch information
PasoStudio73 committed Nov 30, 2023
1 parent ad2954c commit 82c7fd0
Show file tree
Hide file tree
Showing 5 changed files with 358 additions and 69 deletions.
11 changes: 11 additions & 0 deletions src/AbsoluteDistortion.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
## main
using PyCall
librosa = pyimport("librosa")
soundfile = pyimport("soundfile")

sr_src = 48000
x, sr = librosa.load("/media/paso/Paso Stick/Trombone Live_01.wav", sr=sr_src, mono=true)

absX = abs.(x)

soundfile.write("test1.wav",absX,samplerate=sr,subtype="PCM_24")
244 changes: 244 additions & 0 deletions src/melSpectrogram.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
using FFTW
using DSP

include("mfccDataStructure.jl")
include("windowing.jl")
include("mfcc.jl")

function getDefaultBandEdges()
# default band edges
factor = 133.33333333333333
bE = zeros(1, 42)
for i in 1:13
bE[i] = factor + (factor / 2) * (i - 1)
end
for i in 14:42
bE[i] = bE[i-1] * 1.0711703
end

return vec(bE)
end # getDefaultBandEdges

function getFFTLength( # switch case da ricordare!!!
sr::Int64,
FFTLength::Int64
)
if (FFTLength == 0)
sr <= 4000 && return 128
sr <= 8000 && return 256
sr <= 16000 && return 512
return 1024
else
return FFTLength
end
end # getFFTLength

function hz2mel(
fRange::Vector{Float64},
melStyle::Symbol=:oshaughnessy # :oshaughnessy, :slaney
)
if (melStyle == :oshaughnessy)
mel = 2595 * log10.(1 .+ fRange / 700)
else
# f0 = 0.0 slaney, DA FARE!
# fsp = 200 / 3
# brkfrq = 1000.0
# brkpt = (brkfrq - f0) / fsp
# logstep = log(6.4) / 27
# linpt = f < brkfrq
# z = linpt ? f / brkfrq / logstep : brkpt + log(f / brkfrq) / logstep
end
return mel
end # hz2mel

function mel2hz(
mel::LinRange{Float64,Int64},
melStyle::Symbol=:oshaughnessy # :oshaughnessy, :slaney
)
if (melStyle == :oshaughnessy)
hz = 700 * (exp10.(mel / 2595) .- 1)
else
# f0 = 0.0 slaney, DA FARE!
# fsp = 200 / 3
# brkfrq = 1000.0
# brkpt = (brkfrq - f0) / fsp
# logstep = log(6.4) / 27
# linpt = z < brkpt
# f = linpt ? f0 + fsp * z : brkfrq * exp(logstep * (z - brkpt))
end
return hz
end # mel2hz

function designAuditoryFilterBank(
sr::Int64,
options::mfccSetup
)
# Set the design domain
if (options.filterBankDesignDomain == :linear)
designDomain = :linear
else
designDomain = options.frequencyScale
end

# Set the default number of bands
if (options.frequencyScale == :erb)
# da impostare in caso di :erb (hz2erb)
else
numBands = options.numBands # da eliminare e usare solo options?
end

fRange = Float64.(options.frequencyRange)
sampleRate = Float64(sr)

if (options.frequencyScale == :erb)
# erb TODO
else
if (options.frequencyScale == :mel)
lRange = hz2mel(fRange, options.melStyle)
bandEdges = mel2hz(LinRange(lRange[1], lRange[end], numBands + 2), options.melStyle)
else
# bark TODO
end
### punto di contatto con mfcc.jl ###
fbank, fC = designMelFilterBank(
sr,
bandEdges,
options.FFTLength,
1, # sumExponent
:none, # normalization
:hz, # designDomain
Float64,
true, # keepTwoSided
designDomain, # filterBankDesignDomain
options.melStyle
)
bank = fbank'
BW = bandEdges[3:end] - bandEdges[1:end-2]
end

# normalization (differente da quella di designMelFilterBank, da valutarne il perchè)
if (options.filterBankNormalization == :area)
# da fare
elseif (options.filterBankNormalization == :bandwidth)
# Weight by bandwidth
weightPerBand = BW / 2
else
# weightPerBand = ones(1,numBands)
end
for i = 1:Int(numBands)
if (weightPerBand[i] != 0)
bank[i, :] = bank[i, :] / weightPerBand[i]
end
end

if (options.oneSided)
select = getOnesidedFFTRange(options.FFTLength)
filterBank = bank[:,select]
if (options.frequencyScale==:erb)
# # Scale down DC and Nyquist
# # If the first bin is DC, halve it.
# filterBank(:,1) = 0.5*filterBank(:,1);
# # If the final bin is Nyquist, halve it.
# if rem(options.FFTLength,2) == 0
# filterBank(:,end) = 0.5*filterBank(:,end);
# end
# % Scale up one-sided filter bank
# filterBank = 2*filterBank;
end
else
# scaleFactor = double(~strcmp(options.FrequencyScale,'erb')) + 1;
# filterBank = bank/scaleFactor;
end

return filterBank, fC, BW
end # designAuditoryFilterBank

function melSpectrogram(
x::AbstractArray{T},
sr::Int64; FFTLength::Int64=0, # deve essere 0, così viene automaticamente calcolata
numCoeffs::Int64=13, frequencyRange::Vector{Int64}=[0, Int(round(sr / 2))],
numBands::Int64=32,
filterBankNormalization::Symbol=:bandwidth,
oneSided::Bool=true,
filterBankDesignDomain::Symbol=:linear,
frequencyScale=:mel,
spectrumType::Symbol=:power,
melStyle::Symbol=:oshaughnessy, deltaWindowLength::Int64=9,
rectification::Symbol=:log,
logEnergy::Symbol=:append
) where {T<:AbstractFloat}

x = convert.(Float64, x) # mantiene compatibilità con Matlab che scricchiola con Float32, da verificare se è il caso.

options = mfccSetup(
numCoeffs=numCoeffs,
bandEdges=getDefaultBandEdges(), frequencyRange=frequencyRange,
numBands=numBands,
filterBankNormalization=filterBankNormalization,
frequencyScale=frequencyScale,
oneSided=oneSided,
filterBankDesignDomain=filterBankDesignDomain,
spectrumType=spectrumType,
melStyle=melStyle, deltaWindowLength=deltaWindowLength,
rectification=rectification,
logEnergy=logEnergy
)

# setup ufficiale
# options.FFTLength = getFFTLength(sr, FFTLength)
# options.window, options.overlapLength = gencoswin(:hann, options.FFTLength, :symmetric)
# options.windowLength = size(options.window, 1)

# setup di debug
options.overlapLength = Int(round(0.02 * sr))
options.window, oll = gencoswin(:hamming, Int(round(0.03 * sr)), :periodic)
options.windowLength = length(options.window)
options.FFTLength = size(options.window, 1) # da verificare se è meglio della mia idea

filterBank, F, BW = designAuditoryFilterBank(sr, options)

if (options.windowNormalization)
if(options.spectrumType==:power)
scaleFactor = sqrt(0.5*sum(options.window)^2)
else
scaleFactor = 0.5*sum(options.window)
end
win = options.window / scaleFactor
end

N = size(win, 1)
hopLength = N - options.overlapLength
# Buffer into frames.
y = buffer(x,N,hopLength)
# apply window
Y = fft(y .* win, (1,))
# Convert to one-sided FFT
binHigh = Int(floor(options.FFTLength/2 + 1))
X = Y[1:binHigh,:]
if (options.spectrumType==:power)
Z = real((X .* conj(X)))
else
# Z = abs.(X)
end
# apply filterbank
P = filterBank * Z
# S = reshape(P, size(P,1), Int(round(size(P,2) / size(x,2))), size(x,2))

nRow = size(x,1)
numHops = Int(floor((nRow-N)/hopLength) + 1)
t = (collect(0:(numHops-1)) * hopLength .+ N/2) / sr

return P, F, t

end # melSpectrogram

# Main debug
# if (!@isdefined(x)) # carica solo se non è definita mono, serve giusto per debug
# # cosi ogni volta che lancio non ricarica tutto
# using PyCall
# librosa = pyimport("librosa")
# sr_src = 8000
# x, sr = librosa.load("/home/riccardopasini/Documents/Aclai/Julia_additional_files/test.wav", sr=sr_src, mono=true)
# end

# S, cF, t = melSpectrogram(x, sr)
Loading

0 comments on commit 82c7fd0

Please sign in to comment.