Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Rust-implemented fast JSON decoder exposed to LuaJIT via FFI. Optimized for the

## Status

Initial implementation complete: scalar + AVX2/PCLMUL + ARM64 NEON/PMULL structural scanner (runtime-dispatched), root-path and cursor APIs, escape-decoded strings, integer/float/bool/typeof/len, FFI panic barrier, and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson and lua-resty-simdjson.
Initial implementation complete: scalar, AVX2/PCLMUL, and ARM64 NEON/PMULL structural scanners (runtime-dispatched); root-path and cursor APIs; escape-decoded strings; integer/float/bool/typeof/len accessors; FFI panic barrier; and a LuaJIT wrapper. Rust unit/integration tests and Lua busted tests run in CI. The benchmark harness compares against lua-cjson and lua-resty-simdjson. ARM64 NEON/PMULL is correctness-tested via the scanner cross-check suite; published benchmarks are x86_64 only.

## Building

Expand Down
149 changes: 149 additions & 0 deletions benches/arm_bench.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
-- ARM64 NEON benchmark: qjson vs lua-cjson (parse + access only)
-- Run from worktree root:
-- DYLD_LIBRARY_PATH=./target/release LUA_CPATH='./vendor/lua-cjson/?.so;./target/release/lib?.so' \
-- luajit benches/arm_bench.lua

Comment on lines +4 to +5
package.cpath = "./vendor/lua-cjson/?.so;./target/release/lib?.so;" .. package.cpath
Comment on lines +3 to +6

local qjson = require("qjson")
local cjson = require("cjson")
--- Build the 64 KiB pseudo-random base64 block used as image filler.
-- Uses a fixed-seed multiplicative generator (multiplier 48271,
-- modulus 2^31 - 1), so every run produces exactly the same bytes and the
-- generated payloads are reproducible.
-- @treturn string 65536 characters drawn from the base64 alphabet
local function make_b64_block()
    local b64_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
    local rng = 12345
    local t = {}
    for i = 1, 64 * 1024 do
        -- The product stays below 2^53, so it is exact in a double.
        rng = (rng * 48271) % 2147483647
        local idx = (rng % 64) + 1
        t[i] = b64_chars:sub(idx, idx)
    end
    return table.concat(t)
end

-- File-local filler block and its length. They are declared before make_b64
-- so the function captures them as upvalues instead of resolving undeclared
-- globals at call time. Any later plain assignment to these names in this
-- chunk binds to these locals rather than creating globals.
local B64_BLOCK = make_b64_block()
local B64_BLOCK_LEN = #B64_BLOCK

--- Return `size` characters of deterministic base64 filler.
-- Requests up to one block are served by slicing the cached block; larger
-- requests repeat the block and trim the result to the exact length.
-- @tparam number size requested length in characters
-- @treturn string exactly `size` characters ("" when `size` <= 0)
local function make_b64(size)
    -- string.sub treats a negative end index as "from the end", so guard
    -- non-positive sizes explicitly instead of returning most of the block.
    if size <= 0 then
        return ""
    end
    if size <= B64_BLOCK_LEN then
        return B64_BLOCK:sub(1, size)
    end
    local reps = math.ceil(size / B64_BLOCK_LEN)
    return string.rep(B64_BLOCK, reps):sub(1, size)
end

--- Build a synthetic multimodal chat-request JSON document whose size is
-- approximately `target_bytes`.
-- The document carries one message per started MiB of target size (at least
-- one). Each message holds a fixed 256-character text part and one base64
-- image part; the image length is chosen so the total lands near the target.
-- The size estimate uses the "user" role length for every message and does
-- not count the commas between messages, so the result is approximate.
-- @tparam number target_bytes desired document size in bytes
-- @treturn string the JSON text
local function make_payload(target_bytes)
    local count = math.max(1, math.ceil(target_bytes / (1024 * 1024)))

    -- Fixed fragments shared by every message.
    local text_part = '{"type":"text","text":"' .. string.rep("Q", 256) .. '"}'
    local image_prefix = '{"type":"image_url","image_url":{"url":"data:image/jpeg;base64,'
    local image_suffix = '"}}'

    -- Byte budget: whatever is left after the envelope and the per-message
    -- scaffolding is split evenly across the images (never below 1024).
    local envelope = '{"model":"gpt-4-vision","temperature":0.7,"messages":[]}'
    local per_message = #('{"role":"user","content":[,]}') + #text_part
        + #image_prefix + #image_suffix
    local budget = target_bytes - #envelope - (count * per_message)
    local image_size = math.max(1024, math.floor(budget / count))

    local messages = {}
    for i = 1, count do
        -- Odd positions are user turns, even positions assistant turns.
        local role
        if i % 2 == 1 then
            role = "user"
        else
            role = "assistant"
        end
        messages[i] = table.concat({
            '{"role":"', role, '","content":[',
            text_part, ",",
            image_prefix, make_b64(image_size), image_suffix,
            ']}',
        })
    end

    return '{"model":"gpt-4-vision","temperature":0.7,"messages":['
        .. table.concat(messages, ",") .. ']}'
end

-- Number of timed rounds per measurement; the median round is reported.
local ROUNDS = 5

--- Measure the throughput of `fn` in operations per second.
-- Runs a warm-up of max(50, iters / 5) calls, forces a full garbage
-- collection, then times ROUNDS rounds of `iters` calls each with os.clock
-- and returns the median round.
-- @tparam string name label supplied by the caller; not used by the function
-- @tparam number iters calls per timed round
-- @tparam function fn zero-argument workload
-- @treturn number median operations per second across the rounds
local function bench(name, iters, fn)
    local warmup_calls = math.max(50, math.floor(iters / 5))
    for _ = 1, warmup_calls do fn() end

    -- Start the timed rounds from a freshly collected heap.
    collectgarbage("collect")

    local samples = {}
    for round = 1, ROUNDS do
        local started = os.clock()
        for _ = 1, iters do fn() end
        local elapsed = os.clock() - started
        samples[round] = iters / elapsed
    end

    -- Median: middle element of the ascending samples.
    table.sort(samples)
    local middle = math.ceil(ROUNDS / 2)
    return samples[middle]
end

-- Memoised path lists, keyed by message count.
local content_paths_cache = {}

--- Return the list of "messages[i].content" path strings for `n` messages.
-- Paths use zero-based indices (messages[0] .. messages[n-1]) and are stored
-- at positions 1..n of the returned table. The table is built once per
-- distinct `n` and the same table is returned on later calls, so the string
-- concatenation stays out of the timed benchmark loop.
-- @tparam number n number of messages
-- @treturn table sequence of path strings
local function content_paths(n)
    local cached = content_paths_cache[n]
    if not cached then
        cached = {}
        for slot = 1, n do
            cached[slot] = "messages[" .. (slot - 1) .. "].content"
        end
        content_paths_cache[n] = cached
    end
    return cached
end

-- Benchmark scenarios: `target` is the approximate payload size in bytes and
-- `iters` the number of calls per timed round (fewer for larger payloads).
local KIB = 1024
local MIB = 1024 * KIB
local scenarios = {
    { name = "small",  target = 2 * KIB,   iters = 5000 },
    { name = "medium", target = 60 * KIB,  iters = 500 },
    { name = "100k",   target = 100 * KIB, iters = 200 },
    { name = "1m",     target = 1 * MIB,   iters = 50 },
    { name = "10m",    target = 10 * MIB,  iters = 5 },
}

-- Build the base64 filler block once, before any payload is generated, and
-- record its length; make_b64 reads both names when make_payload runs.
-- These are plain assignments (no `local`), so they bind to whichever
-- B64_BLOCK / B64_BLOCK_LEN is visible here: globals unless a local
-- declaration of the same names precedes this point in the chunk.
do
    local block = make_b64_block()
    B64_BLOCK, B64_BLOCK_LEN = block, #block
end
Comment on lines +96 to +97

-- Pre-build one payload per scenario, keyed by scenario name, so payload
-- construction stays out of the timed section. Progress is written without
-- a newline and flushed after each scenario so it shows up immediately.
local payloads = {}
io.write("Generating payloads...")
io.flush()
for idx = 1, #scenarios do
    local scenario = scenarios[idx]
    payloads[scenario.name] = make_payload(scenario.target)
    io.write(" " .. scenario.name)
    io.flush()
end
print(" done.")
print("")

-- Results table header; the rule is 58 characters wide, matching the summed
-- column widths (10 + 10 + 12 + 12 + 10) plus the four separating spaces.
local header_fmt = "%-10s %-10s %-12s %-12s %-10s"
print(header_fmt:format("Scenario", "Size", "cjson", "qjson.parse", "speedup"))
print(("-"):rep(58))

-- Run every scenario against both decoders and print one result row each:
-- scenario name, payload size, median ops/s for each decoder, and the
-- qjson / cjson throughput ratio.
for _, s in ipairs(scenarios) do
    local payload = payloads[s.name]

    -- Human-readable size: MB with one decimal at >= 1024 KB, else whole KB.
    local size_kb = #payload / 1024
    local size_label
    if size_kb >= 1024 then
        size_label = string.format("%.1f MB", size_kb / 1024)
    else
        size_label = string.format("%.0f KB", size_kb)
    end

    -- cjson workload: decode the whole document into Lua tables, then read
    -- the top-level fields and each message's `content`.
    local cjson_ops = bench("cjson " .. s.name, s.iters, function()
        local obj = cjson.decode(payload)
        local _ = obj.model
        local _ = obj.temperature
        if obj.messages then
            for _, msg in ipairs(obj.messages) do
                local _ = msg.content
            end
        end
    end)

    -- qjson workload: parse, read the same top-level fields through the
    -- path accessors, then query the type of each message's `content` using
    -- the pre-built (cached) path strings.
    local qjson_ops = bench("qjson " .. s.name, s.iters, function()
        local doc = qjson.parse(payload)
        local _ = doc:get_str("model")
        local _ = doc:get_f64("temperature")
        local n = doc:len("messages") or 0
        local paths = content_paths(n)
        for i = 1, n do
            local _ = doc:typeof(paths[i])
        end
    end)

    local speedup = qjson_ops / cjson_ops
    -- Format the ratio together with its "x" suffix first. Putting the
    -- suffix after a padded "%-10.1f" would print it detached from the
    -- number (e.g. "13.8      x").
    local speedup_label = string.format("%.1fx", speedup)
    print(string.format("%-10s %-10s %-12.0f %-12.0f %s",
        s.name, size_label, cjson_ops, qjson_ops, speedup_label))
end
46 changes: 40 additions & 6 deletions docs/benchmarks.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ Lua-table baselines.
| `lua-cjson` | vendored `openresty/lua-cjson` |
| `lua-resty-simdjson` | `Kong/lua-resty-simdjson` commit `77322db640927c14968f1314a9fb1bb2bc084015`, installed under OpenResty lualib |

> **Platform scope:** x86_64 benchmarks include simdjson and modify+encode
> scenarios. ARM64 NEON benchmarks cover parse + access (cjson comparison
> only); simdjson is omitted there (no OpenResty on the macOS ARM64 host).

## Methodology

The harness lives at `benches/lua_bench.lua`. For each scenario:
Expand Down Expand Up @@ -87,7 +91,7 @@ harness prints a skip message and omits the simdjson rows.

Numbers below come from one such run.

## Results — throughput (median ops/s)
## Results — throughput (x86_64, median ops/s)

Each row is "parse + access request fields" on the named payload.

Expand All @@ -105,6 +109,29 @@ Each row is "parse + access request fields" on the named payload.
| 10m | 10.00 MB | 51 | 363 | 1,830 | 1,783 | 1,749 |
| interleaved (100k/200k/500k/1m, cycled) | — | 1,125 | 9,701 | 34,173 | 36,278 | 36,456 |

## Results — throughput (ARM64 NEON, median ops/s)

Each row is "parse + access request fields" on the named payload. The same
workload as the x86_64 table above. `simdjson` is omitted (no OpenResty on
macOS ARM64). Numbers below come from a single run on Apple M4.

| Scenario | Size | cjson | `qjson.parse` | speedup vs. cjson |
|---|---:|---:|---:|---:|
| small | 2 KB | 493,827 | 906,618 | 1.8× |
| medium | 60 KB | 24,847 | 215,146 | 8.7× |
| 100k | 100 KB | 15,475 | 146,413 | 9.5× |
| 1m | 1.0 MB | 1,468 | 20,251 | 13.8× |
| 10m | 10.0 MB | 150 | 2,058 | 13.8× |

> **Environment:** Apple M4 (ARM64), 16 GB, macOS 15.x. LuaJIT 2.1.1774896198
> (Homebrew). `qjson` release build, NEON + PMULL scanner active.
> `lua-cjson` from vendored `openresty/lua-cjson`. Reproduce with:
> ```sh
> cargo build --release
> LUA_PATH='./lua/?.lua;;' DYLD_LIBRARY_PATH=./target/release \
> luajit benches/arm_bench.lua
> ```

### Modify + encode throughput (PR #54)

One-shot modify-then-encode benchmarks. Exercises the decode → mutate →
Expand All @@ -128,7 +155,7 @@ fresh-process run on x86_64 Linux (AMD EPYC Rome, Zen 2).
For a before/after comparison against the pre-#54 baseline, see the
[PR #54 benchmark comment](https://github.com/api7/lua-qjson/pull/54#issuecomment-4525477361).

### Speed-up vs. baselines
### Speed-up vs. baselines (x86_64)

| Scenario | `qjson.parse` / cjson | `qjson.parse` / simdjson | `qjson.decode + access content` / cjson | `qjson.decode + access content` / simdjson |
|---|---:|---:|---:|---:|
Expand Down Expand Up @@ -204,10 +231,17 @@ key into the Lua table heap.
redundant tree walks and array/object re-scans inside the encoder.
Large payloads (≥5 MB) are dominated by the root-container
materialization cost, which copies all fields into a plain table.
8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache
interference between payload sizes. Each size now runs in its own
`resty` process, eliminating the systemic cross-scenario variance
observed in earlier benchmark runs.
8. **Fresh-process isolation** removes accumulated GC and JIT trace-cache
interference between payload sizes. Each size now runs in its own
`resty` process, eliminating the systemic cross-scenario variance
observed in earlier benchmark runs.
9. **ARM64 NEON delivers 1.8–13.8× over cjson** on the same multimodal
workload (Apple M4, LuaJIT 2.1 Homebrew). The speedup is lower than
x86_64 at equivalent sizes (~9.5× vs ~30.3× at 100 KB) primarily
because cjson itself runs faster on ARM64 hardware (its scalar C parser
benefits from the wider out-of-order execution of the M4). The absolute
`qjson.parse` throughput is competitive: ~146k ops/s at 100 KB vs
~84k on the x86_64 Zen 2.

## When to pick which

Expand Down
Loading