-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmelspec.py
More file actions
138 lines (105 loc) · 4.18 KB
/
melspec.py
File metadata and controls
138 lines (105 loc) · 4.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# TODO Build melspecs of the songs here
import librosa
import numpy as np
from math import floor
import os.path
def compute_melgram(audio_path):
    '''Compute a log-scaled mel-spectrogram of one fixed-length excerpt.

    The audio is loaded at 12 kHz and forced to ``int(29.12 * 12000)``
    samples: shorter signals are zero-padded at the end, longer signals are
    cropped around their centre. The result is returned with two leading
    singleton axes, i.e. a shape of (1, 1, 96, 1366) where 96 == #mel-bins
    and 1366 == #time frames.

    parameters
    ----------
    audio_path: path for the audio file.
        Any format supported by audioread will work.
        More info: http://librosa.github.io/librosa/generated/librosa.core.load.html#librosa.core.load

    returns
    -------
    4-D numpy array holding the dB-scaled mel-spectrogram.

    raises
    ------
    RuntimeError: if ``audio_path`` is not an existing file.
    '''
    if not os.path.isfile(audio_path):
        raise RuntimeError('Invalid Path')

    # mel-spectrogram parameters
    SR = 12000
    N_FFT = 512
    N_MELS = 96
    HOP_LEN = 256
    DURA = 29.12  # to make it 1366 frame..

    src, _ = librosa.load(audio_path, sr=SR)  # whole signal
    n_sample = src.shape[0]
    n_sample_fit = int(DURA * SR)

    if n_sample < n_sample_fit:  # if too short: zero-pad the tail
        src = np.hstack((src, np.zeros((n_sample_fit - n_sample,))))
    elif n_sample > n_sample_fit:  # if too long: keep the central excerpt
        # Floor division keeps the slice bounds integral on Python 3 as well
        # (true division would produce float indices and raise TypeError).
        start = (n_sample - n_sample_fit) // 2
        src = src[start:start + n_sample_fit]

    # NOTE(review): the melspectrogram output is squared again here; this is
    # kept unchanged so the feature scale stays what downstream code expects.
    power = librosa.feature.melspectrogram(y=src, sr=SR, hop_length=HOP_LEN,
                                           n_fft=N_FFT, n_mels=N_MELS) ** 2

    # `logamplitude` was renamed to `power_to_db` in newer librosa releases;
    # prefer the new name and fall back to the old one when it is absent.
    if hasattr(librosa, 'power_to_db'):
        ret = librosa.power_to_db(power, ref=1.0)
    else:
        ret = librosa.logamplitude(power, ref_power=1.0)

    # Prepend two singleton axes in front of the (mel, time) matrix.
    return ret[np.newaxis, np.newaxis, :]
def compute_melgram_multiframe(audio_path, trim_song=False):
    '''Compute log-scaled mel-spectrograms for consecutive excerpts of a song.

    The audio is loaded at 12 kHz, optionally stripped of a margin at both
    ends, and cut into non-overlapping excerpts of ``int(29.12 * 12000)``
    samples. A trailing remainder shorter than one excerpt is dropped. If the
    (trimmed) signal is shorter than a single excerpt it is zero-padded and
    returned as one frame. The result has a shape of (N, 1, 96, 1366), where
    96 == #mel-bins, 1366 == #time frames, and N == #excerpts.

    parameters
    ----------
    audio_path: path for the audio file.
        Any format supported by audioread will work.
        More info: http://librosa.github.io/librosa/generated/librosa.core.load.html#librosa.core.load
    trim_song: when True nothing is discarded; when False (the default)
        20 seconds are discarded from the beginning and from the end.
        NOTE(review): this reads inverted relative to the parameter name;
        the behaviour is kept as-is because callers may depend on it.

    returns
    -------
    4-D numpy array with one dB-scaled mel-spectrogram per excerpt.

    raises
    ------
    RuntimeError: if ``audio_path`` is not an existing file.
    '''
    if not os.path.isfile(audio_path):
        raise RuntimeError('Invalid Path')

    # mel-spectrogram parameters
    SR = 12000
    N_FFT = 512
    N_MELS = 96
    HOP_LEN = 256
    DURA = 29.12  # to make it 1366 frame..
    DURA_TRASH = 0 if trim_song else 20

    def _log_melgram(chunk):
        # dB-scaled mel-spectrogram of one excerpt, shaped (1, 1, mel, time).
        # NOTE(review): the melspectrogram output is squared again here; kept
        # unchanged so the feature scale stays what downstream code expects.
        power = librosa.feature.melspectrogram(y=chunk, sr=SR,
                                               hop_length=HOP_LEN,
                                               n_fft=N_FFT,
                                               n_mels=N_MELS) ** 2
        # `logamplitude` was renamed to `power_to_db` in newer librosa
        # releases; prefer the new name, fall back to the old one.
        if hasattr(librosa, 'power_to_db'):
            db = librosa.power_to_db(power, ref=1.0)
        else:
            db = librosa.logamplitude(power, ref_power=1.0)
        return db[np.newaxis, np.newaxis, :]

    src, _ = librosa.load(audio_path, sr=SR)  # whole signal
    n_sample_fit = int(DURA * SR)
    n_sample_trash = int(DURA_TRASH * SR)

    # Remove the trash at the beginning and at the end. The stop index is
    # clamped so that a song shorter than both margins yields an empty signal
    # instead of a negative length, and the remaining length is measured from
    # the array itself so the padding below always targets n_sample_fit.
    stop = max(src.shape[0] - n_sample_trash, n_sample_trash)
    src = src[n_sample_trash:stop]
    n_sample = src.shape[0]

    if n_sample < n_sample_fit:  # if too short: one zero-padded frame
        padded = np.hstack((src, np.zeros((n_sample_fit - n_sample,))))
        ret = _log_melgram(padded)
    else:  # one or more full excerpts (an exact fit gives exactly one)
        n_frames = n_sample // n_sample_fit
        frames = [
            _log_melgram(src[i * n_sample_fit:(i + 1) * n_sample_fit])
            for i in range(n_frames)
        ]
        # The empty float32 seed keeps the original dtype promotion while the
        # excerpts are joined in a single concatenate call.
        seed = np.zeros((0, 1, N_MELS, 1366), dtype=np.float32)
        ret = np.concatenate([seed] + frames, axis=0)

    print(ret.shape)
    return ret
# Melgram computation
def extract_melgrams(song_folder_path, MULTIFRAMES, trim_song):
    '''Compute and stack the mel-spectrograms of several songs.

    parameters
    ----------
    song_folder_path: iterable of audio file paths (despite the name, it is
        iterated directly, one path per song).
    MULTIFRAMES: when truthy each song is processed with
        ``compute_melgram_multiframe`` and may contribute several frames;
        otherwise ``compute_melgram`` contributes exactly one frame per song.
    trim_song: forwarded to ``compute_melgram_multiframe``; unused when
        ``MULTIFRAMES`` is falsy.

    returns
    -------
    (melgrams, num_frames_total): ``melgrams`` stacks every frame along
        axis 0, starting from an empty (0, 1, 96, 1366) float32 array;
        ``num_frames_total`` lists the frame count of each song and is only
        filled in when ``MULTIFRAMES`` is truthy.
    '''
    # The empty seed fixes the expected per-frame shape and is the result
    # when no song is given.
    chunks = [np.zeros((0, 1, 96, 1366), dtype=np.float32)]
    num_frames_total = list()

    for song_path in song_folder_path:
        print(song_path)
        if MULTIFRAMES:
            melgram = compute_melgram_multiframe(song_path, trim_song)
            num_frames = melgram.shape[0]
            num_frames_total.append(num_frames)
            # Single pre-formatted string: prints the same text on
            # Python 2 and Python 3.
            print('num frames: %s' % num_frames)
        else:
            melgram = compute_melgram(song_path)
        chunks.append(melgram)

    # Join once at the end instead of re-copying the growing array per song.
    melgrams = np.concatenate(chunks, axis=0)
    return melgrams, num_frames_total