From ca58c530463a9f659a3b0ee0a53774d3872dc1c6 Mon Sep 17 00:00:00 2001 From: BB zhang Date: Mon, 1 Oct 2018 11:45:27 +0800 Subject: [PATCH 1/2] export powerspec API --- python_speech_features/base.py | 37 ++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/python_speech_features/base.py b/python_speech_features/base.py index 4161899..460849d 100644 --- a/python_speech_features/base.py +++ b/python_speech_features/base.py @@ -32,6 +32,28 @@ def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13, if appendEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy return feat +def powerspec(signal,samplerate=16000,winlen=0.025,winstep=0.01, + nfft=512, lowfreq=0,highfreq=None, preemph=0.97, + winfunc=lambda x:numpy.ones((x,))): + """Compute power spectrogram features from an audio signal. + + :param signal: the audio signal from which to compute features. Should be an N*1 array + :param samplerate: the samplerate of the signal we are working with. + :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) + :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) + :param nfft: the FFT size. Default is 512. + :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. + :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 + :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. + :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming + :returns: a numpy array of size (NUMFRAMES by nfft/2+1) containing the power spectrum of each frame. 
+ """ + highfreq= highfreq or samplerate/2 + signal = sigproc.preemphasis(signal, preemph) + frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc) + pspec = sigproc.powspec(frames,nfft) + return pspec + def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, winfunc=lambda x:numpy.ones((x,))): @@ -50,10 +72,7 @@ def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The second return value is the energy in each frame (total energy, unwindowed) """ - highfreq= highfreq or samplerate/2 - signal = sigproc.preemphasis(signal,preemph) - frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc) - pspec = sigproc.powspec(frames,nfft) + pspec = powerspec(signal, samplerate, winlen, winstep, nfft, lowfreq, highfreq, preemph, winfunc) energy = numpy.sum(pspec,1) # this stores the total energy in each frame energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log @@ -83,6 +102,16 @@ def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph,winfunc) return numpy.log(feat) +def logfbank_from_powspec(pspec, samplerate=16000, nfilt=26, nfft=512, lowfreq=0,highfreq=None): + energy = numpy.sum(pspec,1) # this stores the total energy in each frame + energy = numpy.where(energy == 0, numpy.finfo(float).eps, energy) # if energy is zero, we get problems with log + + fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq) + feat = numpy.dot(pspec,fb.T) # compute the filterbank energies + feat = numpy.where(feat == 0, numpy.finfo(float).eps, feat) # if feat is zero, we get problems with log + + return numpy.log(feat) + def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01, 
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, winfunc=lambda x:numpy.ones((x,))): From 22ae31eae1a0516d2596ce17b91de738e612720a Mon Sep 17 00:00:00 2001 From: gaoyonghu Date: Mon, 1 Oct 2018 16:42:36 +0800 Subject: [PATCH 2/2] add librosa examples --- example.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/example.py b/example.py index 4441acf..5bf8107 100644 --- a/example.py +++ b/example.py @@ -1,5 +1,6 @@ #!/usr/bin/env python +import librosa from python_speech_features import mfcc from python_speech_features import delta from python_speech_features import logfbank @@ -11,3 +12,18 @@ fbank_feat = logfbank(sig,rate) print(fbank_feat[1:3,:]) + + +# stride = 10 ms +# sample_rate = 8K +# hop_length = stride * sample_rate +assert librosa.samples_to_frames(1200000, hop_length=80) == 15000 +assert librosa.samples_to_frames(1280000, hop_length=80) == 16000 +assert librosa.frames_to_samples(16000, hop_length=80) == 1280000 + +assert librosa.time_to_frames(10, sr=8000, hop_length=80) == 1000 +assert librosa.time_to_frames(300.29, sr=8000, hop_length=80) == 30029 +assert librosa.frames_to_time(30029, hop_length=80, sr=8000) ==300.29 + +assert librosa.samples_to_times(80000, sr=8000) == 10.0 +assert librosa.time_to_samples(10.0, sr=8000) == 80000