pasquale90 · bmascaro · Jan 17, 2026 · Jan 20, 2026 · Jan 20, 2026 · Jan 20, 2026
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+onnx/out
+onnx/.vs/
+onnx/libs/
+onnx/test/.vs/
+onnx/test/out/
+onnx/test/Audiofile/
+src/BeatNet/__pycache__/
diff --git a/onnx/BeatNet.cpp b/onnx/BeatNet.cpp
@@ -70,9 +70,8 @@ BeatNet::BeatNet(
     env(nullptr), session(nullptr), session_options(nullptr),
     memory_info(nullptr), allocator(nullptr), run_options(nullptr),
     input_name(nullptr), output_name(nullptr),
-    signal_processor(FRAME_LENGTH, HOP_SIZE),
-    fft_processor(FRAME_LENGTH, FFT_SIZE, FRAME_SIZE_POW2),
-    filterbank_processor(BANKS_PER_OCTAVE, FFT_SIZE, SR_BEATNET, 30.0f, 11025.0f, true, true),
+    fft_processor(FRAME_LENGTH, FFT_SIZE, FRAME_LENGTH),
+    filterbank_processor(BANKS_PER_OCTAVE, FFT_SIZE, SR_BEATNET, 30.0f, 17000.0f, true, true),
     SR(0),bufferSize(0)
 {
 
@@ -136,18 +135,38 @@ void BeatNet::setup(double sampleRate, int samplesPerBlock) {
 
 bool BeatNet::preprocess(const std::vector<float>& raw_input, std::vector<float>& preprocessed_input) {
 
-    std::vector<float> resampled = resampler.resample(raw_input);
-    std::vector<float> frame;
-    bool valid_frame = signal_processor.process(resampled,frame);
-    if (!valid_frame) {
-        // std::cout<<"invalid frame and will be invalid for the first ~"<<FRAME_LENGTH/resampled.size()-1<<" frames"<<std::endl;
-        return false;
-    }
-
-    spectrum = fft_processor.compute_fft(frame);
-    filters = filterbank_processor.apply(spectrum);
-    log_fb = log_compress(filters);
-    diff = spectral_diff(log_fb, prev_log_fb);
+    // Run the incoming block through the resampler before any framing.
+    const std::vector<float> resampledSignal = resampler.resample(raw_input);
+
+    // Cut the block into frames. Four frames are built, but only the last two
+    // feed the feature computation below.
+    const int nFrames = 4;
+    FramedSignal framedSignal{ resampledSignal, nFrames, FRAME_LENGTH, HOP_SIZE };
+
+    // One frame -> FFT -> filterbank -> log compression.
+    auto logFilterbank = [this, &framedSignal](int frameIndex) {
+        auto frame = framedSignal[frameIndex];
+        auto frameSpectrum = fft_processor.compute_fft(frame);
+        auto frameBands = filterbank_processor.apply(frameSpectrum);
+        return log_compress(frameBands);
+    };
+
+    // Newest frame first, then the frame preceding it, so the processors are
+    // called in the same order as before.
+    log_fb = logFilterbank(nFrames - 1);
+    prev_log_fb = logFilterbank(nFrames - 2);
+
+    // Spectral difference log_fb - prev_log_fb with negative values replaced by
+    // zero, computed in a single pass.
+    diff.assign(log_fb.size(), 0.0f);
+    std::transform(log_fb.begin(), log_fb.end(), prev_log_fb.begin(), diff.begin(),
+                   [](float current, float previous) {
+                       const float delta = current - previous;
+                       return delta < 0.0f ? 0.0f : delta;
+                   });
+
+    // Stack the log spectrum and the spectral difference into the output
+    // (hstack is handed log_fb first, then diff).
     hstack(log_fb, diff, preprocessed_input);
     return true;
 }
@@ -191,7 +210,7 @@ void BeatNet::inference(std::vector<float>& output) {
         output[i] = output_data[i];
     }
 
-    printOutputShape(output_tensor);
+    // printOutputShape(output_tensor);
 
     ReleaseValue(input_tensor);
     ReleaseValue(output_tensor);
@@ -214,4 +233,4 @@ void BeatNet::printOutputShape(OrtValue* output_tensor) {
     std::cout << "]" << std::endl;
 
     ReleaseTensorTypeAndShapeInfo(shape_info);
-}
+}
diff --git a/onnx/BeatNet.h b/onnx/BeatNet.h
@@ -5,7 +5,7 @@
 #include <string>
 #include "onnxruntime_c_api.h"
 #include "resampler.h"
-#include "frameprocessor.h"
+#include "framedSignal.h"
 #include "fftprocessor.h"
 #include "filterbankprocessor.h"
 #include "logspecutils.h"
@@ -17,10 +17,10 @@ constexpr double MS_FR_GITHUB {0.064};
 constexpr double MS_HOP_GITHUB {0.020};
 constexpr int FRAME_LENGTH {static_cast<int>(SR_BEATNET*MS_FR_GITHUB)}; // 1411
 constexpr int HOP_SIZE {static_cast<int>(SR_BEATNET*MS_HOP_GITHUB)}; // 441
-constexpr int FFT_SIZE { FRAME_LENGTH / 2 + 1}; // 706
+constexpr int FFT_SIZE {FRAME_LENGTH / 2}; // 705
 constexpr int FRAME_SIZE_POW2 {2048}; // this is the minumum higher than FRAME_LENGTH (1411) that is a power-of-two value.
 constexpr int FBANK_SIZE {272};
-constexpr int BANKS_PER_OCTAVE {16}; // {24};;
+constexpr int BANKS_PER_OCTAVE {24};
 
 using OrtGetApiBaseFn = const OrtApiBase* (*)();
 using OrtCreateTensorWithDataAsOrtValueFn = OrtStatus* (*)
@@ -125,16 +125,17 @@ class BeatNet{
 
     // Preprocessing
     Resampler resampler;
-    FramedSignalProcessor signal_processor;
+
     FFTProcessor fft_processor;
     FilterBankProcessor filterbank_processor;
     std::vector<float> preprocessed_input;
     std::vector<int64_t> input_shape;
     std::vector<float> spectrum;
     std::vector<float> filters;
     std::vector<float> log_fb;
-    std::vector<float> diff;
     std::vector<float> prev_log_fb;
+    std::vector<float> diff;
+
 
     // helper functions - preprocess for feature extraction and inference for model utilization
     bool preprocess(const std::vector<float>& raw_input, std::vector<float>& preprocessed_input);
@@ -143,4 +144,4 @@ class BeatNet{
 
 };
 
-#endif
+#endif
diff --git a/onnx/CMakeLists.txt b/onnx/CMakeLists.txt
@@ -56,7 +56,7 @@ endif()
 set(LIB_SOURCE_FILES  
     BeatNet.cpp 
     resampler.cpp
-    frameprocessor.cpp
+    framedSignal.cpp
     fftprocessor.cpp
     filterbankprocessor.cpp
     logspecutils.cpp

diff --git a/onnx/filterbankprocessor.cpp b/onnx/filterbankprocessor.cpp
@@ -1,4 +1,5 @@
 #include "filterbankprocessor.h"
+#include <algorithm>
 
 FilterBankProcessor::FilterBankProcessor(
     int bands_per_octave, 
@@ -22,24 +23,76 @@ FilterBankProcessor::FilterBankProcessor(
 void FilterBankProcessor::buildFilters() {
     filters.clear();
     float num_octaves = std::log2(fmax / fmin);
-    int num_filters = static_cast<int>(std::floor(num_octaves * bands_per_octave));
-    std::vector<float> centers(num_filters + 2);
+    // number of center frequencies to generate (219 according to the original author's note)
+    int num_filters = static_cast<int>(std::floor(num_octaves * bands_per_octave)); // (219)
 
-    for (int i = 0; i < centers.size(); ++i) {
-        centers[i] = fmin * std::pow(2.0, (float)i / (float)bands_per_octave);
-    }
+    /*
+    # get the range
+    left = np.floor(np.log2(float(fmin) / fref) * bands_per_octave)
+    right = np.ceil(np.log2(float(fmax) / fref) * bands_per_octave)
+    # generate frequencies
+    frequencies = fref * 2. ** (np.arange(left, right) /
+                                float(bands_per_octave))
+    # filter frequencies
+    # needed, because range might be bigger because of the use of floor/ceil
+    frequencies = frequencies[np.searchsorted(frequencies, fmin):]
+    frequencies = frequencies[:np.searchsorted(frequencies, fmax, 'right')]
+
+    */
+    const float fref = 440.0f; // 440Hz reference value in madmom python code
+    float left = std::floor(std::log2(fmin / fref) * bands_per_octave);
+    float right = std::ceil(std::log2(fmax / fref) * bands_per_octave);
+
+    // centers: fref * 2^(k / bands_per_octave) for k = left + 1, left + 2, ...
+    std::vector<float> centers(num_filters);
+    float val = left + 1.0f; // left + 1 to skip the first value which is < fmin
+    std::generate(centers.begin(), centers.end(),
+                    [&val, fref, this]()
+                    {
+                        return fref * std::pow(2.0f, val++ / static_cast<float>(bands_per_octave));
+                    });
+
+    // bins: FFT bin indices returned by centersHzToBins for these centers
+    std::vector<int> bins = centersHzToBins(centers);
+    for (int i = 1; i + 1 < static_cast<int>(bins.size()); ++i) {
+        // one triangular filter per interior bin: rises from l to c, falls from c to r
+        std::vector<float> filt(fft_size, 0.0f);
+
+        int l = bins[i - 1];  // left edge
+        int c = bins[i];      // peak
+        int r = bins[i + 1];  // right edge (exclusive)
+
+        int start = l;
+        int center = c - l; // relative to start
+        int stop = r - l;  // relative to start
+
+        /*
+        data = np.zeros(stop)
+        # rising edge (without the center)
+        data[:center] = np.linspace(0, 1, center, endpoint=False)
+        # falling edge (including the center, but without the last bin)
+        data[center:] = np.linspace(1, 0, stop - center, endpoint=False)
+        */
+
+        int n = stop;
+        std::vector<float> data(n, 0.0f);
+
+        float dx = 1.0f / center; // step of the rising edge only
 
-    for (int i = 1; i < centers.size() - 1; ++i) {
-        std::vector<float> filt(fft_size / 2 + 1, 0.0);
-        float l = hzToBin(centers[i - 1]);
-        float c = hzToBin(centers[i]);
-        float r = hzToBin(centers[i + 1]);
+        // rising edge (without the center): 0, dx, 2*dx, ...
+        for (int k = 0; k < center; ++k) {
+            data[k] = k * dx;
+        }
+
+        // falling edge (including the center, but without the last bin); it spans
+        // stop - center bins, so its step is 1 / (stop - center), not the rising dx
+        const int fall_len = stop - center;
+        for (int k = center; k < stop; ++k) {
+            data[k] = 1.0f - static_cast<float>(k - center) / static_cast<float>(fall_len);
+        }

-        for (int j = (int)std::ceil(l); j < (int)std::ceil(c) && j < filt.size(); ++j)
-            filt[j] = (j - l) / (c - l);
+        std::copy(data.begin(), data.end(), filt.begin() + start);

-        for (int j = (int)std::ceil(c); j < (int)std::ceil(r) && j < filt.size(); ++j)
-            filt[j] = (r - j) / (r - c);

         if (norm_filters) {
             float sum = std::accumulate(filt.begin(), filt.end(), 0.0);
@@ -66,6 +119,26 @@ int FilterBankProcessor::numBands() const
     return (int)filters.size();
 }
 
-float FilterBankProcessor::hzToBin(float f) const {
-    return (f / (float)sample_rate) * fft_size;
+std::vector<int> FilterBankProcessor::centersHzToBins(const std::vector<float>& centers) const {
+    // Round each center to a bin index, scaled so that sample_rate / 2 corresponds to index fft_size.
+    std::vector<int> bins;
+    bins.reserve(centers.size() + 1);
+    for (const float hz : centers)
+    {
+        const float nearest = std::round(hz / (static_cast<float>(sample_rate) / 2.0f) * fft_size);
+        bins.push_back(static_cast<int>(nearest));
+    }
+
+    // Collapse runs of equal neighbouring bins into a single entry.
+    bins.erase(std::unique(bins.begin(), bins.end()), bins.end());
+
+    // Drop every bin beyond the last valid index (fft_size - 1).
+    const int last_bin = fft_size - 1;
+    bins.erase(std::remove_if(bins.begin(), bins.end(),
+                              [last_bin](int bin) { return bin > last_bin; }),
+               bins.end());
+
+    // Always terminate the list with the last valid index.
+    bins.push_back(last_bin);
+    return bins;
 }
diff --git a/onnx/filterbankprocessor.h b/onnx/filterbankprocessor.h
@@ -29,7 +29,7 @@ class FilterBankProcessor {
     bool unique_filters;
     std::vector<std::vector<float>> filters;
 
-    float hzToBin(float f) const;
+    std::vector<int> centersHzToBins(const std::vector<float>& centers) const;
 
 };
 

diff --git a/onnx/framedSignal.cpp b/onnx/framedSignal.cpp
@@ -0,0 +1,64 @@
+#include "framedSignal.h"
+#include <algorithm>
+#include <stdexcept>
+#include <iostream>
+
+// Builds nFrames frames of frameSize samples whose starts are hopSize samples apart.
+// The input is copied into a zero-initialised buffer of
+// (nFrames - 1) * hopSize + frameSize samples, starting at offset frameSize / 2;
+// input samples that do not fit are dropped and unfilled positions stay zero.
+// Throws std::invalid_argument if nFrames, frameSize or hopSize is negative.
+FramedSignal::FramedSignal(const std::vector<float>& inputSignal, int nFrames, int frameSize, int hopSize)
+	: original_signal(inputSignal),
+	  nFrames(nFrames),
+	  frameSize(frameSize),
+	  hopSize(hopSize)
+{
+	if (nFrames < 0 || frameSize < 0 || hopSize < 0)
+	{
+		throw std::invalid_argument("FramedSignal: nFrames, frameSize and hopSize must not be negative");
+	}
+	if (nFrames == 0)
+	{
+		return; // no frames requested, nothing to slice
+	}
+
+	const std::size_t frameLen = static_cast<std::size_t>(frameSize);
+	const std::size_t hop = static_cast<std::size_t>(hopSize);
+	const std::size_t frameCount = static_cast<std::size_t>(nFrames);
+	const std::size_t paddedLength = (frameCount - 1) * hop + frameLen;
+	padded_signal.assign(paddedLength, 0.0f);
+
+	// Place the input after frameSize / 2 leading zeros and cut it where the buffer ends.
+	const std::size_t offset = frameLen / 2;
+	const std::size_t copied = std::min(original_signal.size(), paddedLength - offset);
+	std::copy_n(original_signal.begin(), copied, padded_signal.begin() + offset);
+
+	frames.reserve(frameCount);
+	for (std::size_t iFrame = 0; iFrame < frameCount; ++iFrame)
+	{
+		const auto first = padded_signal.begin() + iFrame * hop;
+		frames.emplace_back(first, first + frameLen);
+	}
+}
+
+// Nothing to release manually: every member cleans up after itself.
+FramedSignal::~FramedSignal() = default;
+
+// Returns a copy of frame i; std::vector::at throws std::out_of_range for an invalid index.
+std::vector<float> FramedSignal::operator[](int i)
+{
+	return frames.at(i);
+}
+
+// Returns a copy of the signal that was passed to the constructor.
+std::vector<float> FramedSignal::getOriginalSignal()
+{
+	return original_signal;
+}
+
+// Returns the frame count that was passed to the constructor.
+int FramedSignal::get_nFrames()
+{
+	return nFrames;
+}
diff --git a/onnx/framedSignal.h b/onnx/framedSignal.h
@@ -0,0 +1,26 @@
+#ifndef FRAMEDSIGNAL_H
+#define FRAMEDSIGNAL_H
+
+#include <vector>
+// Cuts a signal into a fixed number of equally sized frames taken from a zero-padded copy.
+class FramedSignal {
+public:
+    // Frame k holds padded samples [k*hopSize, k*hopSize + frameSize); the input starts at padded index frameSize/2.
+    FramedSignal(const std::vector<float>& inputSignal, int nFrames, int frameSize, int hopSize);
+    ~FramedSignal();
+    // All three accessors return by value; operator[] reads through std::vector::at (range-checked).
+    std::vector<float> operator[](int i);
+    std::vector<float> getOriginalSignal();
+    int get_nFrames();
+
+private:
+    std::vector<float> original_signal; // copy of the constructor's inputSignal
+    int nFrames;                        // frame count passed to the constructor
+    int frameSize;                      // samples per frame
+    int hopSize;                        // distance in samples between consecutive frame starts
+    // padded_signal is the zero-filled working buffer the frames are cut from.
+    std::vector<float> padded_signal;
+    std::vector<std::vector<float>> frames;
+};
+
+#endif