diff --git a/README.md b/README.md index 702cdd3..78c5c80 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Rust-implemented fast JSON decoder exposed to LuaJIT via FFI. Optimized for the ## Status -Initial implementation complete: scalar + AVX2/PCLMUL + ARM64 NEON/PMULL structural scanner (runtime-dispatched), root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson and lua-resty-simdjson. +Initial implementation complete: scalar, AVX2/PCLMUL, and ARM64 NEON/PMULL structural scanners (runtime-dispatched); root-path and cursor APIs; escape-decoded strings; integer/float/bool/typeof/len accessors; FFI panic barrier; and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson and lua-resty-simdjson. ARM64 NEON/PMULL is correctness-tested via the scanner cross-check suite; published benchmarks cover the full suite on x86_64 and parse + access vs. lua-cjson on ARM64 NEON (see `docs/benchmarks.md`). ## Building diff --git a/benches/arm_bench.lua b/benches/arm_bench.lua new file mode 100644 index 0000000..5dbe523 --- /dev/null +++ b/benches/arm_bench.lua @@ -0,0 +1,149 @@ +-- ARM64 NEON benchmark: qjson vs lua-cjson (parse + access only) +-- Run from worktree root: +-- DYLD_LIBRARY_PATH=./target/release LUA_CPATH='./vendor/lua-cjson/?.so;./target/release/lib?.so' \ +-- luajit benches/arm_bench.lua + +package.cpath = "./vendor/lua-cjson/?.so;./target/release/lib?.so;" ..
package.cpath + +local qjson = require("qjson") +local cjson = require("cjson") +local function make_b64_block() -- 64 KiB of deterministic pseudo-random base64 characters (fixed-seed LCG) + local b64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" + local rng = 12345 + local t = {} + for i = 1, 64 * 1024 do + rng = (rng * 48271) % 2147483647 + local idx = (rng % 64) + 1 + t[i] = b64_chars:sub(idx, idx) + end + return table.concat(t) +end + +local B64_BLOCK = make_b64_block() -- built once; local (not global) and defined before make_b64 captures it +local B64_BLOCK_LEN = #B64_BLOCK + +local function make_b64(size) -- first `size` characters of B64_BLOCK, repeated as often as needed + if size <= B64_BLOCK_LEN then + return B64_BLOCK:sub(1, size) + end + local reps = math.ceil(size / B64_BLOCK_LEN) + return string.rep(B64_BLOCK, reps):sub(1, size) +end + +local function make_payload(target_bytes) -- JSON chat request of roughly target_bytes; one message per MiB, at least one + local message_count = math.max(1, math.ceil(target_bytes / (1024 * 1024))) + local envelope = '{"model":"gpt-4-vision","temperature":0.7,"messages":[]}' + local text = string.rep("Q", 256) + local text_part = '{"type":"text","text":"' .. text .. '"}' + local image_prefix = '{"type":"image_url","image_url":{"url":"data:image/jpeg;base64,' + local image_suffix = '"}}' + local message_overhead = #('{"role":"user","content":[,]}') + #text_part + + #image_prefix + #image_suffix + local remaining = target_bytes - #envelope - (message_count * message_overhead) + local image_size = math.max(1024, math.floor(remaining / message_count)) + + local messages = {} + for i = 1, message_count do + local role = i % 2 == 1 and "user" or "assistant" + local b64 = make_b64(image_size) + local image_part = image_prefix .. b64 .. image_suffix + messages[i] = '{"role":"' .. role .. '","content":[' + .. text_part .. "," .. image_part .. ']}' + end + + return '{"model":"gpt-4-vision","temperature":0.7,"messages":[' + .. table.concat(messages, ",") ..
']}' +end + +local ROUNDS = 5 + +local function bench(name, iters, fn) -- median ops/s over ROUNDS timed rounds after a warm-up; `name` is accepted but unused + local warmup = math.max(50, math.floor(iters / 5)) + for _ = 1, warmup do fn() end + + collectgarbage("collect") + + local ops = {} + for r = 1, ROUNDS do + local t0 = os.clock() + for _ = 1, iters do fn() end + local t1 = os.clock() + ops[r] = iters / (t1 - t0) + end + + table.sort(ops) + return ops[math.ceil(ROUNDS / 2)] +end + +local content_paths_cache = {} + +local function content_paths(n) -- memoized list of "messages[i].content" paths for i = 0..n-1 + local paths = content_paths_cache[n] + if paths then return paths end + paths = {} + for i = 0, n - 1 do + paths[i + 1] = "messages[" .. i .. "].content" + end + content_paths_cache[n] = paths + return paths +end + +local scenarios = { + {name = "small", target = 2 * 1024, iters = 5000}, + {name = "medium", target = 60 * 1024, iters = 500}, + {name = "100k", target = 100 * 1024, iters = 200}, + {name = "1m", target = 1024 * 1024, iters = 50}, + {name = "10m", target = 10 * 1024 * 1024, iters = 5}, +} + +io.write("Generating payloads...") +io.flush() +local payloads = {} +for _, s in ipairs(scenarios) do + payloads[s.name] = make_payload(s.target) + io.write(" " .. s.name) + io.flush() +end +print(" done.") +print("") + +local header_fmt = "%-10s %-10s %-12s %-12s %-10s" +print(string.format(header_fmt, "Scenario", "Size", "cjson", "qjson.parse", "speedup")) +print(string.rep("-", 58)) + +for _, s in ipairs(scenarios) do + local payload = payloads[s.name] + local size_kb = #payload / 1024 + local size_label + if size_kb >= 1024 then + size_label = string.format("%.1f MB", size_kb / 1024) + else + size_label = string.format("%.0f KB", size_kb) + end + + local cjson_ops = bench("cjson " .. s.name, s.iters, function() + local obj = cjson.decode(payload) + local _ = obj.model + local _ = obj.temperature + if obj.messages then + for _, msg in ipairs(obj.messages) do + local _ = msg.content + end + end + end) + + local qjson_ops = bench("qjson " ..
s.name, s.iters, function() + local doc = qjson.parse(payload) + local _ = doc:get_str("model") + local _ = doc:get_f64("temperature") + local n = doc:len("messages") or 0 + local paths = content_paths(n) + for i = 1, n do + local _ = doc:typeof(paths[i]) + end + end) + + local speedup = qjson_ops / cjson_ops + print(string.format("%-10s %-10s %-12.0f %-12.0f %-10.1fx", + s.name, size_label, cjson_ops, qjson_ops, speedup)) +end diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 4083ceb..bed244b 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -22,6 +22,10 @@ Lua-table baselines. | `lua-cjson` | vendored `openresty/lua-cjson` | | `lua-resty-simdjson` | `Kong/lua-resty-simdjson` commit `77322db640927c14968f1314a9fb1bb2bc084015`, installed under OpenResty lualib | +> **Platform scope:** x86_64 benchmarks include simdjson and modify+encode +> scenarios. ARM64 NEON benchmarks cover parse + access (cjson comparison +> only); simdjson is not available on macOS ARM64. + ## Methodology The harness lives at `benches/lua_bench.lua`. For each scenario: @@ -87,7 +91,7 @@ harness prints a skip message and omits the simdjson rows. Numbers below come from one such run. -## Results — throughput (median ops/s) +## Results — throughput (x86_64, median ops/s) Each row is "parse + access request fields" on the named payload. @@ -105,6 +109,29 @@ Each row is "parse + access request fields" on the named payload. | 10m | 10.00 MB | 51 | 363 | 1,830 | 1,783 | 1,749 | | interleaved (100k/200k/500k/1m, cycled) | — | 1,125 | 9,701 | 34,173 | 36,278 | 36,456 | +## Results — throughput (ARM64 NEON, median ops/s) + +Each row is "parse + access request fields" on the named payload. The same +workload as the x86_64 table above. `simdjson` is omitted (no OpenResty on +macOS ARM64). Numbers below come from a single run on Apple M4. + +| Scenario | Size | cjson | `qjson.parse` | speedup vs.
cjson | +|---|---:|---:|---:|---:| +| small | 2 KB | 493,827 | 906,618 | 1.8× | +| medium | 60 KB | 24,847 | 215,146 | 8.7× | +| 100k | 100 KB | 15,475 | 146,413 | 9.5× | +| 1m | 1.0 MB | 1,468 | 20,251 | 13.8× | +| 10m | 10.0 MB | 150 | 2,058 | 13.8× | + +> **Environment:** Apple M4 (ARM64), 16 GB, macOS 15.x. LuaJIT 2.1.1774896198 +> (Homebrew). `qjson` release build, NEON + PMULL scanner active. +> `lua-cjson` from vendored `openresty/lua-cjson`. Reproduce with: +> ```sh +> cargo build --release +> LUA_PATH='./lua/?.lua;;' DYLD_LIBRARY_PATH=./target/release \ +> luajit benches/arm_bench.lua +> ``` + ### Modify + encode throughput (PR #54) One-shot modify-then-encode benchmarks. Exercises the decode → mutate → @@ -128,7 +155,7 @@ fresh-process run on x86_64 Linux (AMD EPYC Rome, Zen 2). For a before/after comparison against the pre-#54 baseline, see the [PR #54 benchmark comment](https://github.com/api7/lua-qjson/pull/54#issuecomment-4525477361). -### Speed-up vs. baselines +### Speed-up vs. baselines (x86_64) | Scenario | `qjson.parse` / cjson | `qjson.parse` / simdjson | `qjson.decode + access content` / cjson | `qjson.decode + access content` / simdjson | |---|---:|---:|---:|---:| @@ -204,10 +231,17 @@ key into the Lua table heap. redundant tree walks and array/object re-scans inside the encoder. Large payloads (≥5 MB) are dominated by the root-container materialization cost, which copies all fields into a plain table. -8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache - interference between payload sizes. Each size now runs in its own - `resty` process, eliminating the systemic cross-scenario variance - observed in earlier benchmark runs. + 8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache + interference between payload sizes. Each size now runs in its own + `resty` process, eliminating the systemic cross-scenario variance + observed in earlier benchmark runs. + 9.
**ARM64 NEON delivers 1.8–13.8× over cjson** on the same multimodal + workload (Apple M4, LuaJIT 2.1 Homebrew). The speedup is lower than + x86_64 at equivalent sizes (~9.5× vs ~30.3× at 100 KB) primarily + because cjson runs faster on ARM64 hardware (its scalar C decoder + benefits from wider out-of-order execution on M4). The absolute + `qjson.parse` throughput is competitive: ~146k ops/s at 100 KB vs + ~84k on the x86_64 Zen 2. ## When to pick which