Skip to content

Commit aeff0b8

Browse files
neural-codec: add Q4 format (6.8 kbit/s), fix NCA -> NAC (Neural Audio Codec)
1 parent 37347ac commit aeff0b8

2 files changed

Lines changed: 164 additions & 40 deletions

File tree

README.md

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -354,11 +354,11 @@ Models are loaded once and reused across all requests.
354354

355355
GGML-native neural audio codec based on the Oobleck VAE encoder and decoder.
356356
Serves two purposes: validating the precision of the full VAE chain (encode +
357-
decode roundtrip), and compressing music at 1.6 KB/s with CD quality and no
358-
perceptible difference from the original.
357+
decode roundtrip), and compressing music at ~850 B/s with no perceptible
358+
difference from the original.
359359

360360
```
361-
Usage: neural-codec --vae <gguf> --encode|--decode -i <input> [-o <o>] [--q8]
361+
Usage: neural-codec --vae <gguf> --encode|--decode -i <input> [-o <o>] [--q8|--q4]
362362
363363
Required:
364364
--vae <path> VAE GGUF file
@@ -367,9 +367,10 @@ Required:
367367
368368
Output:
369369
-o <path> Output file (auto-named if omitted)
370-
--q8 Quantize latent to int8 (~13 kbit/s vs ~51 kbit/s f32)
370+
--q8 Quantize latent to int8 (~13 kbit/s)
371+
--q4 Quantize latent to int4 (~6.8 kbit/s)
371372
372-
Output naming: song.wav -> song.latent (f32) or song.nca8 (Q8)
373+
Output naming: song.wav -> song.latent (f32) or song.nac8 (Q8) or song.nac4 (Q4)
373374
song.latent -> song.wav
374375
375376
VAE tiling (memory control):
@@ -378,24 +379,38 @@ VAE tiling (memory control):
378379
379380
Latent formats (decode auto-detects):
380381
f32: flat [T, 64] f32, no header. ~51 kbit/s.
381-
NCA8: header + per-frame Q8. ~13 kbit/s.
382+
NAC8: header + per-frame Q8. ~13 kbit/s.
383+
NAC4: header + per-frame Q4. ~6.8 kbit/s.
382384
```
383385

384386
The encoder is the symmetric mirror of the decoder: same snake activations,
385387
same residual units, strided conv1d for downsampling instead of transposed
386388
conv1d for upsampling. No new GGML ops. Downsample 2x4x4x6x10 = 1920x.
387389

388390
48kHz stereo audio is compressed to 64-dimensional latent frames at 25 Hz.
389-
With Q8 quantization, each frame is 66 bytes (2B scale + 64B int8), giving
390-
~13 kbit/s. The quantization error is 39 dB below the VAE reconstruction
391-
error, meaning the Q8 step is perceptually free.
391+
Three output formats, decode auto-detects from file content:
392+
393+
| Format | Frame size | Bitrate | 3 min song | vs f32 (cossim) |
394+
|--------|-----------|---------|------------|-----------------|
395+
| f32 | 256B | 51 kbit/s | 1.1 MB | baseline |
396+
| NAC8 | 66B | 13 kbit/s | 290 KB | 0.9999 |
397+
| NAC4 | 34B | 6.8 kbit/s | 150 KB | 0.989 |
398+
399+
NAC = Neural Audio Codec. NAC8 and NAC4 files start with an 8-byte header:
400+
a 4-byte magic (`NAC8` or `NAC4`) followed by a uint32 frame count.
401+
Q8 quantization error is 39 dB below the VAE reconstruction error (perceptually free).
402+
Q4 quantization error is 16 dB below the VAE reconstruction error (inaudible
403+
on most material).
392404

393405
```bash
394-
# encode
395-
neural-codec --vae models/vae-BF16.gguf --encode --q8 -i song.wav -o song.nca8
406+
# encode (Q4: 6.8 kbit/s, ~150 KB for 3 minutes)
407+
neural-codec --vae models/vae-BF16.gguf --encode --q4 -i song.wav -o song.nac4
408+
409+
# encode (Q8: 13 kbit/s, ~290 KB for 3 minutes)
410+
neural-codec --vae models/vae-BF16.gguf --encode --q8 -i song.wav -o song.nac8
396411

397-
# decode
398-
neural-codec --vae models/vae-BF16.gguf --decode -i song.nca8 -o song_decoded.wav
412+
# decode (auto-detects format)
413+
neural-codec --vae models/vae-BF16.gguf --decode -i song.nac4 -o song_decoded.wav
399414

400415
# roundtrip validation: compare song.wav and song_decoded.wav with your ears
401416
```

tools/neural-codec.cpp

Lines changed: 136 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,29 @@
11
// neural-codec.cpp: neural audio codec (Oobleck VAE)
22
//
33
// 2 modes:
4-
// encode: WAV -> latent file (f32 or Q8)
4+
// encode: WAV -> latent file (f32, Q8, or Q4)
55
// decode: latent file -> WAV (48kHz stereo)
66
//
7-
// Two latent formats, decode auto-detects:
7+
// Three latent formats, decode auto-detects:
88
//
99
// f32 (default): flat [T, 64] f32, no header.
1010
// T = file_size / 256. 25Hz, ~6.4 KB/s, ~51 kbit/s.
1111
//
1212
// Q8 (--q8): symmetric per-frame int8 quantization.
13-
// header: "NCA8" magic (4B) + uint32 T_latent (4B)
13+
// header: "NAC8" magic (4B) + uint32 T_latent (4B)
1414
// frame: f16 scale (2B) + int8[64] (64B) = 66B
1515
// 25Hz, ~1.6 KB/s, ~13 kbit/s.
1616
//
17+
// Q4 (--q4): symmetric per-frame 4-bit quantization.
18+
// header: "NAC4" magic (4B) + uint32 T_latent (4B)
19+
// frame: f16 scale (2B) + nibbles[32] (32B) = 34B
20+
// 25Hz, ~850 B/s, ~6.8 kbit/s.
21+
//
1722
// Usage:
1823
// neural-codec --vae model.gguf --encode -i song.wav -o song.latent
19-
// neural-codec --vae model.gguf --encode --q8 -i song.wav -o song.nca8
20-
// neural-codec --vae model.gguf --decode -i song.nca8 -o song.wav
24+
// neural-codec --vae model.gguf --encode --q8 -i song.wav -o song.nac8
25+
// neural-codec --vae model.gguf --encode --q4 -i song.wav -o song.nac4
26+
// neural-codec --vae model.gguf --decode -i song.nac4 -o song.wav
2127

2228
#include "vae.h"
2329
#include "vae-enc.h"
@@ -149,16 +155,16 @@ static bool write_wav(const char * path, const float * audio, int T_audio, int s
149155
}
150156

151157
// Q8 format constants
152-
static const char NCA8_MAGIC[4] = {'N', 'C', 'A', '8'};
153-
static const int NCA8_HEADER = 8; // 4B magic + 4B T_latent
154-
static const int NCA8_FRAME = 66; // 2B f16 scale + 64B int8
158+
static const char NAC8_MAGIC[4] = {'N', 'A', 'C', '8'};
159+
static const int NAC8_HEADER = 8; // 4B magic + 4B T_latent
160+
static const int NAC8_FRAME = 66; // 2B f16 scale + 64B int8
155161

156162
// Write Q8 quantized latent
157163
static bool write_latent_q8(const char * path, const float * data, int T_latent) {
158164
FILE * f = fopen(path, "wb");
159165
if (!f) return false;
160166

161-
fwrite(NCA8_MAGIC, 1, 4, f);
167+
fwrite(NAC8_MAGIC, 1, 4, f);
162168
uint32_t t = (uint32_t)T_latent;
163169
fwrite(&t, 4, 1, f);
164170

@@ -186,14 +192,65 @@ static bool write_latent_q8(const char * path, const float * data, int T_latent)
186192
}
187193
fclose(f);
188194

189-
size_t bytes = NCA8_HEADER + (size_t)T_latent * NCA8_FRAME;
195+
size_t bytes = NAC8_HEADER + (size_t)T_latent * NAC8_FRAME;
190196
float duration = (float)T_latent * 1920.0f / 48000.0f;
191197
float kbps = (float)bytes * 8.0f / (duration * 1000.0f);
192198
fprintf(stderr, "[Latent] Wrote %s: Q8, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n",
193199
path, T_latent, duration, (float)bytes / 1024.0f, kbps);
194200
return true;
195201
}
196202

203+
// Q4 format constants
204+
static const char NAC4_MAGIC[4] = {'N', 'A', 'C', '4'};
205+
static const int NAC4_HEADER = 8; // 4B magic + 4B T_latent
206+
static const int NAC4_FRAME = 34; // 2B f16 scale + 32B packed nibbles
207+
208+
// Write Q4 quantized latent
209+
// Symmetric 4-bit: range [-7, 7], scale = amax / 7.0
210+
// Packing: byte = (low & 0x0F) | (high << 4), two signed nibbles per byte
211+
static bool write_latent_q4(const char * path, const float * data, int T_latent) {
212+
FILE * f = fopen(path, "wb");
213+
if (!f) return false;
214+
215+
fwrite(NAC4_MAGIC, 1, 4, f);
216+
uint32_t t = (uint32_t)T_latent;
217+
fwrite(&t, 4, 1, f);
218+
219+
for (int i = 0; i < T_latent; i++) {
220+
const float * frame = data + i * 64;
221+
222+
// find max abs for symmetric quant
223+
float amax = 0.0f;
224+
for (int j = 0; j < 64; j++) {
225+
float a = fabsf(frame[j]);
226+
if (a > amax) amax = a;
227+
}
228+
float scale = amax / 7.0f;
229+
ggml_fp16_t scale_f16 = ggml_fp32_to_fp16(scale);
230+
fwrite(&scale_f16, 2, 1, f);
231+
232+
// quantize and pack pairs into bytes
233+
float inv = (scale > 0.0f) ? 7.0f / amax : 0.0f;
234+
uint8_t packed[32];
235+
for (int j = 0; j < 32; j++) {
236+
int lo = (int)roundf(frame[j * 2 + 0] * inv);
237+
int hi = (int)roundf(frame[j * 2 + 1] * inv);
238+
lo = lo < -7 ? -7 : (lo > 7 ? 7 : lo);
239+
hi = hi < -7 ? -7 : (hi > 7 ? 7 : hi);
240+
packed[j] = (uint8_t)((lo & 0x0F) | (hi << 4));
241+
}
242+
fwrite(packed, 1, 32, f);
243+
}
244+
fclose(f);
245+
246+
size_t bytes = NAC4_HEADER + (size_t)T_latent * NAC4_FRAME;
247+
float duration = (float)T_latent * 1920.0f / 48000.0f;
248+
float kbps = (float)bytes * 8.0f / (duration * 1000.0f);
249+
fprintf(stderr, "[Latent] Wrote %s: Q4, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n",
250+
path, T_latent, duration, (float)bytes / 1024.0f, kbps);
251+
return true;
252+
}
253+
197254
// Write f32 raw latent (no header)
198255
static bool write_latent_f32(const char * path, const float * data, int T_latent) {
199256
FILE * f = fopen(path, "wb");
@@ -208,8 +265,8 @@ static bool write_latent_f32(const char * path, const float * data, int T_latent
208265
return true;
209266
}
210267

211-
// Read latent, auto-detect format (NCA8 magic -> Q8, else f32).
212-
// Returns [T_latent, 64] f32 (dequantized if Q8). Caller frees.
268+
// Read latent, auto-detect format (NAC8 -> Q8, NAC4 -> Q4, else f32).
269+
// Returns [T_latent, 64] f32 (dequantized if quantized). Caller frees.
213270
static float * read_latent(const char * path, int * T_latent) {
214271
FILE * f = fopen(path, "rb");
215272
if (!f) { fprintf(stderr, "[Latent] Cannot open %s\n", path); return NULL; }
@@ -221,13 +278,13 @@ static float * read_latent(const char * path, int * T_latent) {
221278
char magic[4] = {};
222279
if (fsize >= 8) fread(magic, 1, 4, f);
223280

224-
if (memcmp(magic, NCA8_MAGIC, 4) == 0) {
281+
if (memcmp(magic, NAC8_MAGIC, 4) == 0) {
225282
// Q8 format
226283
uint32_t t;
227284
fread(&t, 4, 1, f);
228285
*T_latent = (int)t;
229286

230-
long expected = NCA8_HEADER + (long)t * NCA8_FRAME;
287+
long expected = NAC8_HEADER + (long)t * NAC8_FRAME;
231288
if (fsize != expected) {
232289
fprintf(stderr, "[Latent] Q8 size mismatch: expected %ld, got %ld\n", expected, fsize);
233290
fclose(f); return NULL;
@@ -255,6 +312,47 @@ static float * read_latent(const char * path, int * T_latent) {
255312
return data;
256313
}
257314

315+
if (memcmp(magic, NAC4_MAGIC, 4) == 0) {
316+
// Q4 format
317+
uint32_t t;
318+
fread(&t, 4, 1, f);
319+
*T_latent = (int)t;
320+
321+
long expected = NAC4_HEADER + (long)t * NAC4_FRAME;
322+
if (fsize != expected) {
323+
fprintf(stderr, "[Latent] Q4 size mismatch: expected %ld, got %ld\n", expected, fsize);
324+
fclose(f); return NULL;
325+
}
326+
327+
float * data = (float *)malloc((size_t)t * 64 * sizeof(float));
328+
for (int i = 0; i < (int)t; i++) {
329+
ggml_fp16_t scale_f16;
330+
fread(&scale_f16, 2, 1, f);
331+
float scale = ggml_fp16_to_fp32(scale_f16);
332+
333+
uint8_t packed[32];
334+
fread(packed, 1, 32, f);
335+
336+
// unpack signed nibbles
337+
float * frame = data + i * 64;
338+
for (int j = 0; j < 32; j++) {
339+
int lo = (int)(packed[j] & 0x0F);
340+
int hi = (int)(packed[j] >> 4);
341+
if (lo >= 8) lo -= 16;
342+
if (hi >= 8) hi -= 16;
343+
frame[j * 2 + 0] = (float)lo * scale;
344+
frame[j * 2 + 1] = (float)hi * scale;
345+
}
346+
}
347+
fclose(f);
348+
349+
float duration = (float)(*T_latent) * 1920.0f / 48000.0f;
350+
float kbps = (float)fsize * 8.0f / (duration * 1000.0f);
351+
fprintf(stderr, "[Latent] Read %s: Q4, %d frames (%.2fs, %.1f KB, %.1f kbit/s)\n",
352+
path, *T_latent, duration, (float)fsize / 1024.0f, kbps);
353+
return data;
354+
}
355+
258356
// f32 format (no header, rewind)
259357
fseek(f, 0, SEEK_SET);
260358
if (fsize % (64 * (int)sizeof(float)) != 0) {
@@ -277,22 +375,24 @@ static float * read_latent(const char * path, int * T_latent) {
277375

278376
static void print_usage(const char * prog) {
279377
fprintf(stderr,
280-
"Usage: %s --vae <gguf> --encode|--decode -i <input> [-o <output>] [--q8]\n\n"
378+
"Usage: %s --vae <gguf> --encode|--decode -i <input> [-o <output>] [--q8|--q4]\n\n"
281379
"Required:\n"
282380
" --vae <path> VAE GGUF file\n"
283381
" --encode | --decode Encode WAV to latent, or decode latent to WAV\n"
284382
" -i <path> Input (WAV for encode, latent for decode)\n\n"
285383
"Output:\n"
286384
" -o <path> Output file (auto-named if omitted)\n"
287-
" --q8 Quantize latent to int8 (~13 kbit/s vs ~51 kbit/s f32)\n\n"
288-
"Output naming: song.wav -> song.latent (f32) or song.nca8 (Q8)\n"
385+
" --q8 Quantize latent to int8 (~13 kbit/s)\n"
386+
" --q4 Quantize latent to int4 (~6.8 kbit/s)\n\n"
387+
"Output naming: song.wav -> song.latent (f32) or song.nac8 (Q8) or song.nac4 (Q4)\n"
289388
" song.latent -> song.wav\n\n"
290389
"VAE tiling (memory control):\n"
291390
" --vae-chunk <N> Latent frames per tile (default: 256)\n"
292391
" --vae-overlap <N> Overlap frames per side (default: 64)\n\n"
293392
"Latent formats (decode auto-detects):\n"
294393
" f32: flat [T, 64] f32, no header. ~51 kbit/s.\n"
295-
" NCA8: header + per-frame Q8. ~13 kbit/s.\n",
394+
" NAC8: header + per-frame Q8. ~13 kbit/s.\n"
395+
" NAC4: header + per-frame Q4. ~6.8 kbit/s.\n",
296396
prog);
297397
}
298398

@@ -311,7 +411,7 @@ int main(int argc, char ** argv) {
311411
int chunk_size = 256;
312412
int overlap = 64;
313413
int mode = -1; // 0 = encode, 1 = decode
314-
bool use_q8 = false;
414+
int quant = 0; // 0 = f32, 8 = q8, 4 = q4
315415

316416
for (int i = 1; i < argc; i++) {
317417
if (strcmp(argv[i], "--vae") == 0 && i + 1 < argc) vae_path = argv[++i];
@@ -323,7 +423,8 @@ int main(int argc, char ** argv) {
323423
else if (strcmp(argv[i], "--vae-overlap") == 0 && i + 1 < argc) overlap = atoi(argv[++i]);
324424
else if (strcmp(argv[i], "--encode") == 0) mode = 0;
325425
else if (strcmp(argv[i], "--decode") == 0) mode = 1;
326-
else if (strcmp(argv[i], "--q8") == 0) use_q8 = true;
426+
else if (strcmp(argv[i], "--q8") == 0) quant = 8;
427+
else if (strcmp(argv[i], "--q4") == 0) quant = 4;
327428
else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
328429
print_usage(argv[0]); return 0;
329430
} else {
@@ -339,15 +440,21 @@ int main(int argc, char ** argv) {
339440
// Auto output names
340441
std::string out_str;
341442
if (!output_path) {
342-
if (mode == 0)
343-
out_str = auto_output(input_path, use_q8 ? ".nca8" : ".latent");
344-
else
443+
if (mode == 0) {
444+
const char * ext = ".latent";
445+
if (quant == 8) ext = ".nac8";
446+
if (quant == 4) ext = ".nac4";
447+
out_str = auto_output(input_path, ext);
448+
} else {
345449
out_str = auto_output(input_path, ".wav");
450+
}
346451
output_path = out_str.c_str();
347452
}
348453

349-
fprintf(stderr, "\n[VAE] Mode: %s%s\n", mode == 0 ? "encode" : "decode",
350-
(mode == 0 && use_q8) ? " (Q8)" : "");
454+
const char * quant_str = "";
455+
if (mode == 0 && quant == 8) quant_str = " (Q8)";
456+
if (mode == 0 && quant == 4) quant_str = " (Q4)";
457+
fprintf(stderr, "\n[VAE] Mode: %s%s\n", mode == 0 ? "encode" : "decode", quant_str);
351458
fprintf(stderr, "[VAE] Input: %s\n", input_path);
352459
fprintf(stderr, "[VAE] Output: %s\n\n", output_path);
353460

@@ -372,8 +479,10 @@ int main(int argc, char ** argv) {
372479
free(audio);
373480
if (T_latent < 0) { vae_enc_free(&enc); return 1; }
374481

375-
if (use_q8)
482+
if (quant == 8)
376483
write_latent_q8(output_path, latent.data(), T_latent);
484+
else if (quant == 4)
485+
write_latent_q4(output_path, latent.data(), T_latent);
377486
else
378487
write_latent_f32(output_path, latent.data(), T_latent);
379488

@@ -382,7 +491,7 @@ int main(int argc, char ** argv) {
382491
return 0;
383492
}
384493

385-
// DECODE (auto-detects f32 vs Q8 from file content)
494+
// DECODE (auto-detects f32 vs Q8 vs Q4 from file content)
386495
{
387496
int T_latent = 0;
388497
float * latent = read_latent(input_path, &T_latent);

0 commit comments

Comments
 (0)