From fac7961e2db3bed92ecb69427d161526daabc0aa Mon Sep 17 00:00:00 2001 From: Asger F Date: Tue, 19 Nov 2024 11:23:02 +0100 Subject: [PATCH 1/7] Add streaming jsonl parser --- .../ql-vscode/src/common/jsonl-reader.ts | 54 ++++++++++++++----- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/extensions/ql-vscode/src/common/jsonl-reader.ts b/extensions/ql-vscode/src/common/jsonl-reader.ts index a20488c48d2..b1d4932f8ce 100644 --- a/extensions/ql-vscode/src/common/jsonl-reader.ts +++ b/extensions/ql-vscode/src/common/jsonl-reader.ts @@ -1,26 +1,54 @@ -import { readFile } from "fs-extra"; +import { stat } from "fs/promises"; +import { createReadStream } from "fs-extra"; + +const doubleLineBreakRegexp = /\n\r?\n/; /** * Read a file consisting of multiple JSON objects. Each object is separated from the previous one * by a double newline sequence. This is basically a more human-readable form of JSONL. * - * The current implementation reads the entire text of the document into memory, but in the future - * it will stream the document to improve the performance with large documents. - * * @param path The path to the file. * @param handler Callback to be invoked for each top-level JSON object in order. */ export async function readJsonlFile( path: string, handler: (value: T) => Promise, + logger?: { log: (message: string) => void }, ): Promise { - const logSummary = await readFile(path, "utf-8"); - - // Remove newline delimiters because summary is in .jsonl format. - const jsonSummaryObjects: string[] = logSummary.split(/\r?\n\r?\n/g); - - for (const obj of jsonSummaryObjects) { - const jsonObj = JSON.parse(obj) as T; - await handler(jsonObj); - } + void logger?.log( + `Parsing ${path} (${(await stat(path)).size / 1024 / 1024} MB)...`, + ); + return new Promise((resolve, reject) => { + const stream = createReadStream(path, { encoding: "utf8" }); + let buffer = ""; + stream.on("data", async (chunk: string) => { + const parts = (buffer + chunk).split(doubleLineBreakRegexp); + buffer = parts.pop()!; + if (parts.length > 0) { + try { + stream.pause(); + for (const part of parts) { + await handler(JSON.parse(part)); + } + stream.resume(); + } catch (e) { + stream.destroy(); + reject(e); + } + } + }); + stream.on("end", async () => { + if (buffer.trim().length > 0) { + try { + await handler(JSON.parse(buffer)); + } catch (e) { + reject(e); + return; + } + } + void logger?.log(`Finishing parsing ${path}`); + resolve(); + }); + stream.on("error", reject); + }); } From 2cde3b9c2f8210014861338d75ebf6523dc3587b Mon Sep 17 00:00:00 2001 From: Asger F Date: Tue, 19 Nov 2024 11:23:46 +0100 Subject: [PATCH 2/7] Add benchmark script The current build setup doesn't seem to have a concept for benchmark scripts, so for now you'll have to run it with something like ts-node. --- .../test/benchmarks/jsonl-reader.bench.ts | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 extensions/ql-vscode/test/benchmarks/jsonl-reader.bench.ts diff --git a/extensions/ql-vscode/test/benchmarks/jsonl-reader.bench.ts b/extensions/ql-vscode/test/benchmarks/jsonl-reader.bench.ts new file mode 100644 index 00000000000..0d61e9d7197 --- /dev/null +++ b/extensions/ql-vscode/test/benchmarks/jsonl-reader.bench.ts @@ -0,0 +1,73 @@ +import { readFile } from "fs-extra"; +import { readJsonlFile } from "../../src/common/jsonl-reader"; +import { performance } from "perf_hooks"; +import { join } from "path"; + +/** An "obviously correct" implementation to test against. */ +async function readJsonlReferenceImpl( + path: string, + handler: (value: T) => Promise, +): Promise { + const logSummary = await readFile(path, "utf-8"); + + // Remove newline delimiters because summary is in .jsonl format. + const jsonSummaryObjects: string[] = logSummary.split(/\r?\n\r?\n/g); + + for (const obj of jsonSummaryObjects) { + const jsonObj = JSON.parse(obj) as T; + await handler(jsonObj); + } +} + +type ParserFn = ( + text: string, + callback: (v: unknown) => Promise, +) => Promise; + +const parsers: Record = { + readJsonlReferenceImpl, + readJsonlFile, +}; + +async function main() { + const args = process.argv.slice(2); + const file = + args.length > 0 + ? args[0] + : join( + __dirname, + "../unit-tests/data/evaluator-log-summaries/bad-join-order.jsonl", + ); + const numTrials = args.length > 1 ? Number(args[1]) : 100; + const referenceValues: any[] = []; + await readJsonlReferenceImpl(file, async (event) => { + referenceValues.push(event); + }); + const referenceValueString = JSON.stringify(referenceValues); + // Do warm-up runs and check against reference implementation + for (const [name, parser] of Object.entries(parsers)) { + const values: unknown[] = []; + await parser(file, async (event) => { + values.push(event); + }); + if (JSON.stringify(values) !== referenceValueString) { + console.error(`${name}: failed to match reference implementation`); + } + } + for (const [name, parser] of Object.entries(parsers)) { + const startTime = performance.now(); + for (let i = 0; i < numTrials; ++i) { + await Promise.all([ + parser(file, async () => {}), + parser(file, async () => {}), + ]); + } + const duration = performance.now() - startTime; + const durationPerTrial = duration / numTrials; + console.log(`${name}: ${durationPerTrial.toFixed(1)} ms`); + } +} + +main().catch((err: unknown) => { + console.error(err); +}); From bb1da9c6ff99e9faabce2aacba035d5ccde6ea25 Mon Sep 17 00:00:00 2001 From: Asger F Date: Tue, 19 Nov 2024 13:28:41 +0100 Subject: [PATCH 3/7] Explain why we need to stream and why not use readline --- extensions/ql-vscode/src/common/jsonl-reader.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/extensions/ql-vscode/src/common/jsonl-reader.ts b/extensions/ql-vscode/src/common/jsonl-reader.ts index b1d4932f8ce..e50d5312649 100644 --- a/extensions/ql-vscode/src/common/jsonl-reader.ts +++ b/extensions/ql-vscode/src/common/jsonl-reader.ts @@ -15,6 +15,8 @@ export async function readJsonlFile( handler: (value: T) => Promise, logger?: { log: (message: string) => void }, ): Promise { + // Stream the data as large evaluator logs won't fit in memory. + // Also avoid using 'readline' as it is slower than our manual line splitting. void logger?.log( `Parsing ${path} (${(await stat(path)).size / 1024 / 1024} MB)...`, ); From 38849f70f5ad352c18c2f83ba67e1322330637a6 Mon Sep 17 00:00:00 2001 From: Asger F Date: Wed, 20 Nov 2024 11:06:10 +0100 Subject: [PATCH 4/7] Apply suggestions from code review Co-authored-by: Andrew Eisenberg --- extensions/ql-vscode/src/common/jsonl-reader.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extensions/ql-vscode/src/common/jsonl-reader.ts b/extensions/ql-vscode/src/common/jsonl-reader.ts index e50d5312649..202720a9819 100644 --- a/extensions/ql-vscode/src/common/jsonl-reader.ts +++ b/extensions/ql-vscode/src/common/jsonl-reader.ts @@ -13,7 +13,7 @@ const doubleLineBreakRegexp = /\n\r?\n/; export async function readJsonlFile( path: string, handler: (value: T) => Promise, - logger?: { log: (message: string) => void }, + logger?: BaseLogger, ): Promise { // Stream the data as large evaluator logs won't fit in memory. // Also avoid using 'readline' as it is slower than our manual line splitting. @@ -48,7 +48,7 @@ export async function readJsonlFile( return; } } - void logger?.log(`Finishing parsing ${path}`); + void logger?.log(`Finished parsing ${path}`); resolve(); }); stream.on("error", reject); From d05cdf49ece6785acfa65015351146c8e590b70a Mon Sep 17 00:00:00 2001 From: Asger F Date: Wed, 20 Nov 2024 11:11:42 +0100 Subject: [PATCH 5/7] Fix missing import --- extensions/ql-vscode/src/common/jsonl-reader.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/extensions/ql-vscode/src/common/jsonl-reader.ts b/extensions/ql-vscode/src/common/jsonl-reader.ts index 202720a9819..d2f1edd926e 100644 --- a/extensions/ql-vscode/src/common/jsonl-reader.ts +++ b/extensions/ql-vscode/src/common/jsonl-reader.ts @@ -1,5 +1,6 @@ import { stat } from "fs/promises"; import { createReadStream } from "fs-extra"; +import type { BaseLogger } from "./logging"; const doubleLineBreakRegexp = /\n\r?\n/; From b90cfb670b8eec4d71fe53920bf80b10f9659edc Mon Sep 17 00:00:00 2001 From: Asger F Date: Wed, 20 Nov 2024 11:12:37 +0100 Subject: [PATCH 6/7] Move some calls into the try block --- extensions/ql-vscode/src/common/jsonl-reader.ts | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/extensions/ql-vscode/src/common/jsonl-reader.ts b/extensions/ql-vscode/src/common/jsonl-reader.ts index d2f1edd926e..fe9861aec6e 100644 --- a/extensions/ql-vscode/src/common/jsonl-reader.ts +++ b/extensions/ql-vscode/src/common/jsonl-reader.ts @@ -41,16 +41,15 @@ export async function readJsonlFile( } }); stream.on("end", async () => { - if (buffer.trim().length > 0) { - try { + try { + if (buffer.trim().length > 0) { await handler(JSON.parse(buffer)); - } catch (e) { - reject(e); - return; } + void logger?.log(`Finished parsing ${path}`); + resolve(); + } catch (e) { + reject(e); } - void logger?.log(`Finished parsing ${path}`); - resolve(); }); stream.on("error", reject); }); From 57e2b51b438c1d83fb3cec5786b3d0368b62dd6e Mon Sep 17 00:00:00 2001 From: Asger F Date: Wed, 20 Nov 2024 11:19:31 +0100 Subject: [PATCH 7/7] Add a file comment to the benchmark script --- .../test/benchmarks/jsonl-reader.bench.ts | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/extensions/ql-vscode/test/benchmarks/jsonl-reader.bench.ts b/extensions/ql-vscode/test/benchmarks/jsonl-reader.bench.ts index 0d61e9d7197..a5a3b64b0b2 100644 --- a/extensions/ql-vscode/test/benchmarks/jsonl-reader.bench.ts +++ b/extensions/ql-vscode/test/benchmarks/jsonl-reader.bench.ts @@ -1,3 +1,17 @@ +/** + * Benchmarks the jsonl-parser against a reference implementation and checks that it generates + * the same output. + * + * Usage: + * + * ts-node json-reader.bench.ts [evaluator-log.summary.jsonl] [count] + * + * The log file defaults to a small checked-in log and count defaults to 100 + * (and should be lowered significantly for large files). + * + * At the time of writing it is about as fast as the synchronous reference implementation, + * but doesn't run out of memory for large files. + */ import { readFile } from "fs-extra"; import { readJsonlFile } from "../../src/common/jsonl-reader"; import { performance } from "perf_hooks";