|
| 1 | +//! AudioFrame: 48-byte codec for one frame of audio. |
| 2 | +//! |
| 3 | +//! The complete encode/decode pipeline: |
| 4 | +//! encode: PCM → MDCT → band energies (gain) + PVQ (shape) → AudioFrame |
| 5 | +//! decode: AudioFrame → band energies × PVQ shape → iMDCT → PCM |
| 6 | +//! |
| 7 | +//! One AudioFrame = one graph node in lance-graph. 48 bytes = CAM-compatible. |
| 8 | +
|
| 9 | +use super::mdct; |
| 10 | +use super::bands; |
| 11 | +use super::pvq; |
| 12 | + |
| 13 | +/// One audio frame: 42 bytes gain + 6 bytes shape = 48 bytes. |
| 14 | +/// |
| 15 | +/// Maps to SPO: |
| 16 | +/// Subject = spectral (WHAT frequencies) → band energies |
| 17 | +/// Predicate = temporal (WHEN they happen) → PVQ summary bytes 2-3 |
| 18 | +/// Object = harmonic (HOW they ring) → PVQ summary bytes 4-5 |
| 19 | +#[derive(Clone, Copy, Debug, PartialEq, Eq)] |
| 20 | +pub struct AudioFrame { |
| 21 | + /// 21 band energies as BF16 (42 bytes). The gain component. |
| 22 | + pub band_energies: [u16; bands::N_BANDS], |
| 23 | + /// PVQ shape fingerprint (6 bytes). HEEL/HIP/TWIG levels. |
| 24 | + pub pvq_summary: [u8; 6], |
| 25 | +} |
| 26 | + |
| 27 | +impl AudioFrame { |
| 28 | + /// Total byte size: 42 (energies) + 6 (pvq) = 48. |
| 29 | + pub const BYTE_SIZE: usize = bands::N_BANDS * 2 + 6; |
| 30 | + |
| 31 | + /// Encode one frame of PCM audio. |
| 32 | + /// |
| 33 | + /// `pcm`: mono f32 samples (padded to power of 2 internally). |
| 34 | + /// `pvq_k`: PVQ pulse budget per band (higher = better quality, more bits). |
| 35 | + pub fn encode(pcm: &[f32], pvq_k: u32) -> Self { |
| 36 | + // MDCT: time → frequency |
| 37 | + let coeffs = mdct::mdct_forward(pcm); |
| 38 | + |
| 39 | + // Band energies (gain) |
| 40 | + let energies = bands::band_energies(&coeffs); |
| 41 | + let bf16_energies = bands::energies_to_bf16(&energies); |
| 42 | + |
| 43 | + // Normalize bands (remove gain, keep shape) |
| 44 | + let shape = bands::normalize_bands(&coeffs, &energies); |
| 45 | + |
| 46 | + // PVQ encode the shape of the first (most important) band |
| 47 | + // For production: encode all 21 bands. For the POC: just first band's summary. |
| 48 | + let first_band_end = bands::CELT_BANDS_48K[1].min(shape.len()); |
| 49 | + let pulses = pvq::pvq_encode(&shape[..first_band_end], pvq_k); |
| 50 | + let summary = pvq::pvq_summary(&pulses); |
| 51 | + |
| 52 | + AudioFrame { |
| 53 | + band_energies: bf16_energies, |
| 54 | + pvq_summary: summary, |
| 55 | + } |
| 56 | + } |
| 57 | + |
| 58 | + /// Decode: reconstruct PCM from AudioFrame + optional full PVQ data. |
| 59 | + /// |
| 60 | + /// Without PVQ data: uses band energies only (coarse reconstruction). |
| 61 | + /// The PVQ summary gives the HHTL routing info, not the full shape. |
| 62 | + /// For full quality: pass the per-band PVQ pulse vectors. |
| 63 | + pub fn decode_coarse(&self) -> Vec<f32> { |
| 64 | + let energies = bands::bf16_to_energies(&self.band_energies); |
| 65 | + |
| 66 | + // Synthesize a simple spectral envelope from band energies |
| 67 | + // Each band gets a flat spectrum at its energy level |
| 68 | + let n2 = bands::CELT_BANDS_48K[bands::N_BANDS].min(480); |
| 69 | + let mut coeffs = vec![0.0f32; n2]; |
| 70 | + for band in 0..bands::N_BANDS { |
| 71 | + let lo = bands::CELT_BANDS_48K[band]; |
| 72 | + let hi = bands::CELT_BANDS_48K[band + 1].min(n2); |
| 73 | + let n_bins = (hi - lo).max(1); |
| 74 | + let per_bin = energies[band] / (n_bins as f32).sqrt(); |
| 75 | + for i in lo..hi { |
| 76 | + // Alternate signs for a more natural-sounding shape |
| 77 | + let sign = if (i - lo) % 2 == 0 { 1.0 } else { -1.0 }; |
| 78 | + coeffs[i] = per_bin * sign; |
| 79 | + } |
| 80 | + } |
| 81 | + |
| 82 | + // iMDCT: frequency → time |
| 83 | + mdct::mdct_backward(&coeffs) |
| 84 | + } |
| 85 | + |
| 86 | + /// Serialize to 48 bytes. |
| 87 | + pub fn to_bytes(&self) -> [u8; Self::BYTE_SIZE] { |
| 88 | + let mut bytes = [0u8; Self::BYTE_SIZE]; |
| 89 | + for i in 0..bands::N_BANDS { |
| 90 | + let b = self.band_energies[i].to_le_bytes(); |
| 91 | + bytes[i * 2] = b[0]; |
| 92 | + bytes[i * 2 + 1] = b[1]; |
| 93 | + } |
| 94 | + bytes[42..48].copy_from_slice(&self.pvq_summary); |
| 95 | + bytes |
| 96 | + } |
| 97 | + |
| 98 | + /// Deserialize from 48 bytes. |
| 99 | + pub fn from_bytes(bytes: &[u8; Self::BYTE_SIZE]) -> Self { |
| 100 | + let mut band_energies = [0u16; bands::N_BANDS]; |
| 101 | + for i in 0..bands::N_BANDS { |
| 102 | + band_energies[i] = u16::from_le_bytes([bytes[i * 2], bytes[i * 2 + 1]]); |
| 103 | + } |
| 104 | + let mut pvq_summary = [0u8; 6]; |
| 105 | + pvq_summary.copy_from_slice(&bytes[42..48]); |
| 106 | + AudioFrame { band_energies, pvq_summary } |
| 107 | + } |
| 108 | +} |
| 109 | + |
#[cfg(test)]
mod tests {
    use super::*;
    use core::f32::consts::PI;

    /// The serialized frame size is pinned at 48 bytes.
    #[test]
    fn frame_48_bytes() {
        assert_eq!(AudioFrame::BYTE_SIZE, 48);
    }

    /// Encoding a pure tone yields nonzero band energy, and the coarse decode
    /// of that frame yields a non-empty, non-silent signal.
    #[test]
    fn encode_decode_nonzero() {
        // 1024 samples of a 440 Hz sine sampled at 48 kHz.
        let mut pcm: Vec<f32> = Vec::with_capacity(1024);
        for i in 0..1024 {
            pcm.push((2.0 * PI * 440.0 * i as f32 / 48000.0).sin());
        }

        let frame = AudioFrame::encode(&pcm, 8);

        // Widen each BF16 bit pattern to f32 and accumulate; at least the band
        // containing 440 Hz should contribute.
        let mut total_energy = 0.0f32;
        for &bits in frame.band_energies.iter() {
            total_energy += f32::from_bits((bits as u32) << 16);
        }
        assert!(total_energy > 0.01, "Encoded frame has no energy: {}", total_energy);

        // Decode and check the output carries signal.
        let decoded = frame.decode_coarse();
        assert!(!decoded.is_empty());
        let mut decoded_energy = 0.0f32;
        for sample in decoded.iter() {
            decoded_energy += sample * sample;
        }
        assert!(decoded_energy > 1e-10, "Decoded has no energy: {}", decoded_energy);
    }

    /// `to_bytes` followed by `from_bytes` reproduces the original frame.
    #[test]
    fn serialize_roundtrip() {
        let original = AudioFrame {
            band_energies: [
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
            ],
            pvq_summary: [0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF],
        };
        let recovered = AudioFrame::from_bytes(&original.to_bytes());
        assert_eq!(original, recovered);
    }
}
0 commit comments