11// neural-codec.cpp: neural audio codec (Oobleck VAE)
22//
33// 2 modes:
4- // encode: WAV -> latent file (f32 or Q8 )
4+ // encode: WAV -> latent file (f32, Q8, or Q4)
55// decode: latent file -> WAV (48kHz stereo)
66//
7- // Two latent formats, decode auto-detects:
7+ // Three latent formats, decode auto-detects:
88//
99// f32 (default): flat [T, 64] f32, no header.
1010// T = file_size / 256. 25Hz, ~6.4 KB/s, ~51 kbit/s.
1111//
1212// Q8 (--q8): symmetric per-frame int8 quantization.
13- // header: "NCA8 " magic (4B) + uint32 T_latent (4B)
13+ // header: "NAC8" magic (4B) + uint32 T_latent (4B)
1414// frame: f16 scale (2B) + int8[64] (64B) = 66B
1515// 25Hz, ~1.6 KB/s, ~13 kbit/s.
1616//
17+ // Q4 (--q4): symmetric per-frame 4-bit quantization.
18+ // header: "NAC4" magic (4B) + uint32 T_latent (4B)
19+ // frame: f16 scale (2B) + nibbles[32] (32B) = 34B
20+ // 25Hz, ~850 B/s, ~6.8 kbit/s.
21+ //
1722// Usage:
1823// neural-codec --vae model.gguf --encode -i song.wav -o song.latent
19- // neural-codec --vae model.gguf --encode --q8 -i song.wav -o song.nca8
20- // neural-codec --vae model.gguf --decode -i song.nca8 -o song.wav
24+ // neural-codec --vae model.gguf --encode --q8 -i song.wav -o song.nac8
25+ // neural-codec --vae model.gguf --encode --q4 -i song.wav -o song.nac4
26+ // neural-codec --vae model.gguf --decode -i song.nac4 -o song.wav
2127
2228#include " vae.h"
2329#include " vae-enc.h"
@@ -149,16 +155,16 @@ static bool write_wav(const char * path, const float * audio, int T_audio, int s
149155}
150156
151157// Q8 format constants
// File layout: 8-byte header (4B magic + 4B T_latent) followed by one
// 66-byte frame (2B f16 scale + 64B int8) per latent frame.
152- static const char NCA8_MAGIC [4 ] = {' N' , ' C ' , ' A ' , ' 8' };
153- static const int NCA8_HEADER = 8 ; // 4B magic + 4B T_latent
154- static const int NCA8_FRAME = 66 ; // 2B f16 scale + 64B int8
158+ static const char NAC8_MAGIC [4 ] = {' N' , ' A ' , ' C ' , ' 8' };
159+ static const int NAC8_HEADER = 8 ; // 4B magic + 4B T_latent
160+ static const int NAC8_FRAME = 66 ; // 2B f16 scale + 64B int8
155161
156162// Write Q8 quantized latent
157163static bool write_latent_q8 (const char * path, const float * data, int T_latent) {
158164 FILE * f = fopen (path, " wb" );
159165 if (!f) return false ;
160166
161- fwrite (NCA8_MAGIC , 1 , 4 , f);
167+ fwrite (NAC8_MAGIC , 1 , 4 , f);
162168 uint32_t t = (uint32_t )T_latent;
163169 fwrite (&t, 4 , 1 , f);
164170
@@ -186,14 +192,65 @@ static bool write_latent_q8(const char * path, const float * data, int T_latent)
186192 }
187193 fclose (f);
188194
189- size_t bytes = NCA8_HEADER + (size_t )T_latent * NCA8_FRAME ;
195+ size_t bytes = NAC8_HEADER + (size_t )T_latent * NAC8_FRAME ;
190196 float duration = (float )T_latent * 1920 .0f / 48000 .0f ;
191197 float kbps = (float )bytes * 8 .0f / (duration * 1000 .0f );
192198 fprintf (stderr, " [Latent] Wrote %s: Q8, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n " ,
193199 path, T_latent, duration, (float )bytes / 1024 .0f , kbps);
194200 return true ;
195201}
196202
// Q4 format constants
// File layout: NAC4_HEADER bytes of header, then NAC4_FRAME bytes per latent frame.
static const char NAC4_MAGIC[4] = { 'N', 'A', 'C', '4' };  // not NUL-terminated, compare with memcmp
static const int  NAC4_HEADER   = 8;                       // 4B magic + 4B T_latent
static const int  NAC4_FRAME    = 34;                      // 2B f16 scale + 32B packed nibbles

// Keep the on-disk sizes honest if someone edits the constants above.
static_assert(sizeof(NAC4_MAGIC) == 4, "NAC4 magic must be exactly 4 bytes");
static_assert(NAC4_HEADER == (int) sizeof(NAC4_MAGIC) + 4, "NAC4 header = magic + uint32 frame count");
static_assert(NAC4_FRAME == 2 + 64 / 2, "NAC4 frame = f16 scale + 64 nibbles");
207+
208+ // Write Q4 quantized latent
209+ // Symmetric 4-bit: range [-7, 7], scale = amax / 7.0
210+ // Packing: byte = (low & 0x0F) | (high << 4), two signed nibbles per byte
211+ static bool write_latent_q4 (const char * path, const float * data, int T_latent) {
212+ FILE * f = fopen (path, " wb" );
213+ if (!f) return false ;
214+
215+ fwrite (NAC4_MAGIC, 1 , 4 , f);
216+ uint32_t t = (uint32_t )T_latent;
217+ fwrite (&t, 4 , 1 , f);
218+
219+ for (int i = 0 ; i < T_latent; i++) {
220+ const float * frame = data + i * 64 ;
221+
222+ // find max abs for symmetric quant
223+ float amax = 0 .0f ;
224+ for (int j = 0 ; j < 64 ; j++) {
225+ float a = fabsf (frame[j]);
226+ if (a > amax) amax = a;
227+ }
228+ float scale = amax / 7 .0f ;
229+ ggml_fp16_t scale_f16 = ggml_fp32_to_fp16 (scale);
230+ fwrite (&scale_f16, 2 , 1 , f);
231+
232+ // quantize and pack pairs into bytes
233+ float inv = (scale > 0 .0f ) ? 7 .0f / amax : 0 .0f ;
234+ uint8_t packed[32 ];
235+ for (int j = 0 ; j < 32 ; j++) {
236+ int lo = (int )roundf (frame[j * 2 + 0 ] * inv);
237+ int hi = (int )roundf (frame[j * 2 + 1 ] * inv);
238+ lo = lo < -7 ? -7 : (lo > 7 ? 7 : lo);
239+ hi = hi < -7 ? -7 : (hi > 7 ? 7 : hi);
240+ packed[j] = (uint8_t )((lo & 0x0F ) | (hi << 4 ));
241+ }
242+ fwrite (packed, 1 , 32 , f);
243+ }
244+ fclose (f);
245+
246+ size_t bytes = NAC4_HEADER + (size_t )T_latent * NAC4_FRAME;
247+ float duration = (float )T_latent * 1920 .0f / 48000 .0f ;
248+ float kbps = (float )bytes * 8 .0f / (duration * 1000 .0f );
249+ fprintf (stderr, " [Latent] Wrote %s: Q4, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n " ,
250+ path, T_latent, duration, (float )bytes / 1024 .0f , kbps);
251+ return true ;
252+ }
253+
197254// Write f32 raw latent (no header)
198255static bool write_latent_f32 (const char * path, const float * data, int T_latent) {
199256 FILE * f = fopen (path, " wb" );
@@ -208,8 +265,8 @@ static bool write_latent_f32(const char * path, const float * data, int T_latent
208265 return true ;
209266}
210267
211- // Read latent, auto-detect format (NCA8 magic -> Q8, else f32).
212- // Returns [T_latent, 64] f32 (dequantized if Q8 ). Caller frees.
268+ // Read latent, auto-detect format (NAC8 -> Q8, NAC4 -> Q4, else f32).
269+ // Returns [T_latent, 64] f32 (dequantized if quantized). Caller frees.
213270static float * read_latent (const char * path, int * T_latent) {
214271 FILE * f = fopen (path, " rb" );
215272 if (!f) { fprintf (stderr, " [Latent] Cannot open %s\n " , path); return NULL ; }
@@ -221,13 +278,13 @@ static float * read_latent(const char * path, int * T_latent) {
221278 char magic[4 ] = {};
222279 if (fsize >= 8 ) fread (magic, 1 , 4 , f);
223280
224- if (memcmp (magic, NCA8_MAGIC , 4 ) == 0 ) {
281+ if (memcmp (magic, NAC8_MAGIC , 4 ) == 0 ) {
225282 // Q8 format
226283 uint32_t t;
227284 fread (&t, 4 , 1 , f);
228285 *T_latent = (int )t;
229286
230- long expected = NCA8_HEADER + (long )t * NCA8_FRAME ;
287+ long expected = NAC8_HEADER + (long )t * NAC8_FRAME ;
231288 if (fsize != expected) {
232289 fprintf (stderr, " [Latent] Q8 size mismatch: expected %ld, got %ld\n " , expected, fsize);
233290 fclose (f); return NULL ;
@@ -255,6 +312,47 @@ static float * read_latent(const char * path, int * T_latent) {
255312 return data;
256313 }
257314
315+ if (memcmp (magic, NAC4_MAGIC, 4 ) == 0 ) {
316+ // Q4 format
317+ uint32_t t;
318+ fread (&t, 4 , 1 , f);
319+ *T_latent = (int )t;
320+
321+ long expected = NAC4_HEADER + (long )t * NAC4_FRAME;
322+ if (fsize != expected) {
323+ fprintf (stderr, " [Latent] Q4 size mismatch: expected %ld, got %ld\n " , expected, fsize);
324+ fclose (f); return NULL ;
325+ }
326+
327+ float * data = (float *)malloc ((size_t )t * 64 * sizeof (float ));
328+ for (int i = 0 ; i < (int )t; i++) {
329+ ggml_fp16_t scale_f16;
330+ fread (&scale_f16, 2 , 1 , f);
331+ float scale = ggml_fp16_to_fp32 (scale_f16);
332+
333+ uint8_t packed[32 ];
334+ fread (packed, 1 , 32 , f);
335+
336+ // unpack signed nibbles
337+ float * frame = data + i * 64 ;
338+ for (int j = 0 ; j < 32 ; j++) {
339+ int lo = (int )(packed[j] & 0x0F );
340+ int hi = (int )(packed[j] >> 4 );
341+ if (lo >= 8 ) lo -= 16 ;
342+ if (hi >= 8 ) hi -= 16 ;
343+ frame[j * 2 + 0 ] = (float )lo * scale;
344+ frame[j * 2 + 1 ] = (float )hi * scale;
345+ }
346+ }
347+ fclose (f);
348+
349+ float duration = (float )(*T_latent) * 1920 .0f / 48000 .0f ;
350+ float kbps = (float )fsize * 8 .0f / (duration * 1000 .0f );
351+ fprintf (stderr, " [Latent] Read %s: Q4, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n " ,
352+ path, *T_latent, duration, (float )fsize / 1024 .0f , kbps);
353+ return data;
354+ }
355+
258356 // f32 format (no header, rewind)
259357 fseek (f, 0 , SEEK_SET);
260358 if (fsize % (64 * (int )sizeof (float )) != 0 ) {
@@ -277,22 +375,24 @@ static float * read_latent(const char * path, int * T_latent) {
277375
// Print the command-line help text to stderr.
// prog: program name substituted into the "Usage:" line (main passes argv[0]).
278376static void print_usage (const char * prog) {
279377 fprintf (stderr,
280- " Usage: %s --vae <gguf> --encode|--decode -i <input> [-o <output>] [--q8]\n\n "
378+ " Usage: %s --vae <gguf> --encode|--decode -i <input> [-o <output>] [--q8|--q4 ]\n\n "
281379 " Required:\n "
282380 " --vae <path> VAE GGUF file\n "
283381 " --encode | --decode Encode WAV to latent, or decode latent to WAV\n "
284382 " -i <path> Input (WAV for encode, latent for decode)\n\n "
285383 " Output:\n "
286384 " -o <path> Output file (auto-named if omitted)\n "
287- " --q8 Quantize latent to int8 (~13 kbit/s vs ~51 kbit/s f32)\n\n "
288- " Output naming: song.wav -> song.latent (f32) or song.nca8 (Q8)\n "
385+ " --q8 Quantize latent to int8 (~13 kbit/s)\n "
386+ " --q4 Quantize latent to int4 (~6.8 kbit/s)\n\n "
387+ " Output naming: song.wav -> song.latent (f32) or song.nac8 (Q8) or song.nac4 (Q4)\n "
289388 " song.latent -> song.wav\n\n "
290389 " VAE tiling (memory control):\n "
291390 " --vae-chunk <N> Latent frames per tile (default: 256)\n "
292391 " --vae-overlap <N> Overlap frames per side (default: 64)\n\n "
293392 " Latent formats (decode auto-detects):\n "
294393 " f32: flat [T, 64] f32, no header. ~51 kbit/s.\n "
295- " NCA8: header + per-frame Q8. ~13 kbit/s.\n " ,
394+ " NAC8: header + per-frame Q8. ~13 kbit/s.\n "
395+ " NAC4: header + per-frame Q4. ~6.8 kbit/s.\n " ,
296396 prog);
297397}
298398
@@ -311,7 +411,7 @@ int main(int argc, char ** argv) {
311411 int chunk_size = 256 ;
312412 int overlap = 64 ;
313413 int mode = -1 ; // 0 = encode, 1 = decode
314- bool use_q8 = false ;
414+ int quant = 0 ; // 0 = f32, 8 = q8, 4 = q4
315415
316416 for (int i = 1 ; i < argc; i++) {
317417 if (strcmp (argv[i], " --vae" ) == 0 && i + 1 < argc) vae_path = argv[++i];
@@ -323,7 +423,8 @@ int main(int argc, char ** argv) {
323423 else if (strcmp (argv[i], " --vae-overlap" ) == 0 && i + 1 < argc) overlap = atoi (argv[++i]);
324424 else if (strcmp (argv[i], " --encode" ) == 0 ) mode = 0 ;
325425 else if (strcmp (argv[i], " --decode" ) == 0 ) mode = 1 ;
326- else if (strcmp (argv[i], " --q8" ) == 0 ) use_q8 = true ;
426+ else if (strcmp (argv[i], " --q8" ) == 0 ) quant = 8 ;
427+ else if (strcmp (argv[i], " --q4" ) == 0 ) quant = 4 ;
327428 else if (strcmp (argv[i], " -h" ) == 0 || strcmp (argv[i], " --help" ) == 0 ) {
328429 print_usage (argv[0 ]); return 0 ;
329430 } else {
@@ -339,15 +440,21 @@ int main(int argc, char ** argv) {
339440 // Auto output names
340441 std::string out_str;
341442 if (!output_path) {
342- if (mode == 0 )
343- out_str = auto_output (input_path, use_q8 ? " .nca8" : " .latent" );
344- else
443+ if (mode == 0 ) {
444+ const char * ext = " .latent" ;
445+ if (quant == 8 ) ext = " .nac8" ;
446+ if (quant == 4 ) ext = " .nac4" ;
447+ out_str = auto_output (input_path, ext);
448+ } else {
345449 out_str = auto_output (input_path, " .wav" );
450+ }
346451 output_path = out_str.c_str ();
347452 }
348453
349- fprintf (stderr, " \n [VAE] Mode: %s%s\n " , mode == 0 ? " encode" : " decode" ,
350- (mode == 0 && use_q8) ? " (Q8)" : " " );
454+ const char * quant_str = " " ;
455+ if (mode == 0 && quant == 8 ) quant_str = " (Q8)" ;
456+ if (mode == 0 && quant == 4 ) quant_str = " (Q4)" ;
457+ fprintf (stderr, " \n [VAE] Mode: %s%s\n " , mode == 0 ? " encode" : " decode" , quant_str);
351458 fprintf (stderr, " [VAE] Input: %s\n " , input_path);
352459 fprintf (stderr, " [VAE] Output: %s\n\n " , output_path);
353460
@@ -372,8 +479,10 @@ int main(int argc, char ** argv) {
372479 free (audio);
373480 if (T_latent < 0 ) { vae_enc_free (&enc); return 1 ; }
374481
375- if (use_q8 )
482+ if (quant == 8 )
376483 write_latent_q8 (output_path, latent.data (), T_latent);
484+ else if (quant == 4 )
485+ write_latent_q4 (output_path, latent.data (), T_latent);
377486 else
378487 write_latent_f32 (output_path, latent.data (), T_latent);
379488
@@ -382,7 +491,7 @@ int main(int argc, char ** argv) {
382491 return 0 ;
383492 }
384493
385- // DECODE (auto-detects f32 vs Q8 from file content)
494+ // DECODE (auto-detects f32 vs Q8 vs Q4 from file content)
386495 {
387496 int T_latent = 0 ;
388497 float * latent = read_latent (input_path, &T_latent);
0 commit comments