From 42fafca01b645cdccefd15f4d9cb16f155dd3801 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Thu, 26 Mar 2026 22:12:31 -0600 Subject: [PATCH 1/6] perf(ast): bulk-insert AST nodes via native Rust/rusqlite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move AST node SQLite inserts from per-row JS iteration to a single native Rust transaction via napi-rs + rusqlite. The new bulkInsertAstNodes function opens the DB directly from Rust, pre-fetches parent node definitions, and inserts all rows in one transaction — eliminating the JS-native FFI overhead per row. The JS-side buildAstNodes tries the native fast path first (when all files have native astNodes arrays), falling back to the existing JS loop for WASM or mixed-engine scenarios. Target: astMs < 50ms on native full builds (was ~393ms). --- crates/codegraph-core/Cargo.toml | 1 + crates/codegraph-core/src/ast_db.rs | 160 ++++++++++++++++++++++++++++ crates/codegraph-core/src/lib.rs | 1 + src/features/ast.ts | 42 ++++++++ src/types.ts | 13 +++ 5 files changed, 217 insertions(+) create mode 100644 crates/codegraph-core/src/ast_db.rs diff --git a/crates/codegraph-core/Cargo.toml b/crates/codegraph-core/Cargo.toml index d968ad1c..a9b5fda8 100644 --- a/crates/codegraph-core/Cargo.toml +++ b/crates/codegraph-core/Cargo.toml @@ -24,6 +24,7 @@ tree-sitter-ruby = "0.23" tree-sitter-php = "0.23" tree-sitter-hcl = "1" rayon = "1" +rusqlite = { version = "0.32", features = ["bundled"] } send_wrapper = "0.6" [build-dependencies] diff --git a/crates/codegraph-core/src/ast_db.rs b/crates/codegraph-core/src/ast_db.rs new file mode 100644 index 00000000..b2807943 --- /dev/null +++ b/crates/codegraph-core/src/ast_db.rs @@ -0,0 +1,160 @@ +//! Bulk AST node insertion via rusqlite. +//! +//! Bypasses the JS iteration loop by opening the SQLite database directly +//! from Rust and inserting all AST nodes in a single transaction. +//! 
Parent node IDs are resolved by querying the `nodes` table. + +use std::collections::HashMap; + +use napi_derive::napi; +use rusqlite::{params, Connection, OpenFlags}; +use serde::{Deserialize, Serialize}; + +/// A single AST node to insert (received from JS). +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AstInsertNode { + pub line: u32, + pub kind: String, + pub name: String, + pub text: Option<String>, + pub receiver: Option<String>, +} + +/// A batch of AST nodes for a single file. +#[napi(object)] +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FileAstBatch { + pub file: String, + pub nodes: Vec<AstInsertNode>, +} + +/// A definition row from the `nodes` table used for parent resolution. +struct NodeDef { + id: i64, + line: u32, + end_line: Option<u32>, +} + +/// Find the narrowest enclosing definition for a given source line. +/// Returns the node ID of the best match, or None if no definition encloses this line. +fn find_parent_id(defs: &[NodeDef], line: u32) -> Option<i64> { + let mut best_id: Option<i64> = None; + let mut best_span = u32::MAX; + for d in defs { + if d.line <= line { + if let Some(el) = d.end_line { + if el >= line { + let span = el - d.line; + if span < best_span { + best_id = Some(d.id); + best_span = span; + } + } + } + } + } + best_id +} + +/// Bulk-insert AST nodes into the database, resolving `parent_node_id` +/// from the `nodes` table. Runs all inserts in a single SQLite transaction. +/// +/// Returns the number of rows inserted. Returns 0 on any error (DB open +/// failure, missing table, transaction failure). 
+#[napi] +pub fn bulk_insert_ast_nodes(db_path: String, batches: Vec<FileAstBatch>) -> u32 { + if batches.is_empty() { + return 0; + } + + let flags = OpenFlags::SQLITE_OPEN_READ_WRITE | OpenFlags::SQLITE_OPEN_NO_MUTEX; + let mut conn = match Connection::open_with_flags(&db_path, flags) { + Ok(c) => c, + Err(_) => return 0, + }; + + // Match the JS-side performance pragmas + let _ = conn.execute_batch("PRAGMA synchronous = NORMAL"); + + // Bail out if the ast_nodes table doesn't exist (schema too old) + let has_table: bool = conn + .prepare("SELECT 1 FROM sqlite_master WHERE type='table' AND name='ast_nodes'") + .and_then(|mut s| s.query_row([], |_| Ok(true))) + .unwrap_or(false); + if !has_table { + return 0; + } + + // ── Phase 1: Pre-fetch node definitions for parent resolution ──────── + let mut file_defs: HashMap<String, Vec<NodeDef>> = HashMap::new(); + { + let Ok(mut stmt) = + conn.prepare("SELECT id, line, end_line FROM nodes WHERE file = ?1") + else { + return 0; + }; + + for batch in &batches { + if batch.nodes.is_empty() || file_defs.contains_key(&batch.file) { + continue; + } + let defs: Vec<NodeDef> = stmt + .query_map(params![&batch.file], |row| { + Ok(NodeDef { + id: row.get(0)?, + line: row.get(1)?, + end_line: row.get(2)?, + }) + }) + .map(|rows| rows.filter_map(|r| r.ok()).collect()) + .unwrap_or_default(); + file_defs.insert(batch.file.clone(), defs); + } + } // `stmt` dropped — releases the immutable borrow on `conn` + + // ── Phase 2: Bulk insert in a single transaction ───────────────────── + let Ok(tx) = conn.transaction() else { + return 0; + }; + + let mut total = 0u32; + { + let Ok(mut insert_stmt) = tx.prepare( + "INSERT INTO ast_nodes (file, line, kind, name, text, receiver, parent_node_id) \ + VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)", + ) else { + return 0; + }; + + for batch in &batches { + let empty = Vec::new(); + let defs = file_defs.get(&batch.file).unwrap_or(&empty); + + for node in &batch.nodes { + let parent_id = find_parent_id(defs, node.line); + + if insert_stmt 
.execute(params![ + &batch.file, + node.line, + &node.kind, + &node.name, + &node.text, + &node.receiver, + parent_id, + ]) + .is_ok() + { + total += 1; + } + } + } + } // `insert_stmt` dropped + + if tx.commit().is_err() { + return 0; + } + + total +} diff --git a/crates/codegraph-core/src/lib.rs b/crates/codegraph-core/src/lib.rs index 6d3aa6d0..391f0854 100644 --- a/crates/codegraph-core/src/lib.rs +++ b/crates/codegraph-core/src/lib.rs @@ -1,3 +1,4 @@ +pub mod ast_db; pub mod cfg; pub mod complexity; pub mod constants; diff --git a/src/features/ast.ts b/src/features/ast.ts index 55307fa0..30918662 100644 --- a/src/features/ast.ts +++ b/src/features/ast.ts @@ -6,6 +6,7 @@ import { createAstStoreVisitor } from '../ast-analysis/visitors/ast-store-visito import { bulkNodeIdsByFile, openReadonlyOrFail } from '../db/index.js'; import { buildFileConditionSQL } from '../db/query-builder.js'; import { debug } from '../infrastructure/logger.js'; +import { loadNative } from '../infrastructure/native.js'; import { outputResult } from '../infrastructure/result-formatter.js'; import { paginateResult } from '../shared/paginate.js'; import type { ASTNodeKind, BetterSqlite3Database, Definition, TreeSitterNode } from '../types.js'; @@ -67,6 +68,47 @@ export async function buildAstNodes( _rootDir: string, _engineOpts?: unknown, ): Promise { + // ── Native bulk-insert fast path ────────────────────────────────────── + const native = loadNative(); + if (native?.bulkInsertAstNodes) { + let needsJsFallback = false; + const batches: Array<{ + file: string; + nodes: Array<{ + line: number; + kind: string; + name: string; + text?: string | null; + receiver?: string | null; + }>; + }> = []; + + for (const [relPath, symbols] of fileSymbols) { + if (Array.isArray(symbols.astNodes)) { + batches.push({ + file: relPath, + nodes: symbols.astNodes.map((n) => ({ + line: n.line, + kind: n.kind, + name: n.name, + text: n.text, + receiver: n.receiver, + })), + }); + } else if (symbols.calls || 
symbols._tree) { + needsJsFallback = true; + break; + } + } + + if (!needsJsFallback) { + const inserted = native.bulkInsertAstNodes(db.name, batches); + debug(`AST extraction (native bulk): ${inserted} nodes stored`); + return; + } + } + + // ── JS fallback path ────────────────────────────────────────────────── let insertStmt: ReturnType; try { insertStmt = db.prepare( diff --git a/src/types.ts b/src/types.ts index 7dc1236b..41058cce 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1802,6 +1802,19 @@ export interface NativeAddon { computeConfidence(callerFile: string, targetFile: string, importedFrom: string | null): number; detectCycles(edges: Array<{ source: string; target: string }>): string[][]; buildCallEdges(files: unknown[], nodes: unknown[], builtinReceivers: string[]): unknown[]; + bulkInsertAstNodes( + dbPath: string, + batches: Array<{ + file: string; + nodes: Array<{ + line: number; + kind: string; + name: string; + text?: string | null; + receiver?: string | null; + }>; + }>, + ): number; engineVersion(): string; ParseTreeCache: new () => NativeParseTreeCache; } From 89a1ddb63d37f5b6130a9e63bccc9f836045fa31 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Thu, 26 Mar 2026 22:42:42 -0600 Subject: [PATCH 2/6] fix(ast): add busy_timeout pragma to Rust SQLite connection (#651) The Rust connection omitted busy_timeout = 5000 which the JS-side connection.ts sets. Without it, SQLITE_BUSY is returned immediately on WAL contention instead of retrying for 5 seconds. 
--- crates/codegraph-core/src/ast_db.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crates/codegraph-core/src/ast_db.rs b/crates/codegraph-core/src/ast_db.rs index b2807943..816ada7e 100644 --- a/crates/codegraph-core/src/ast_db.rs +++ b/crates/codegraph-core/src/ast_db.rs @@ -74,8 +74,10 @@ pub fn bulk_insert_ast_nodes(db_path: String, batches: Vec<FileAstBatch>) -> u32 Err(_) => return 0, }; - // Match the JS-side performance pragmas - let _ = conn.execute_batch("PRAGMA synchronous = NORMAL"); + // Match the JS-side performance pragmas (including busy_timeout for WAL contention) + let _ = conn.execute_batch( + "PRAGMA synchronous = NORMAL; PRAGMA busy_timeout = 5000", + ); // Bail out if the ast_nodes table doesn't exist (schema too old) let has_table: bool = conn From 4d91d0b14c74f3904869e83843b7e51d5fdc2e4d Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Thu, 26 Mar 2026 22:43:16 -0600 Subject: [PATCH 3/6] fix(ast): fall back to JS when native bulk insert count mismatches (#651) bulkInsertAstNodes returns 0 for both "nothing to insert" and hard errors (DB open failure, SQLITE_BUSY, etc). Compare expected vs actual count and fall through to the JS path on mismatch so errors don't silently drop all AST nodes. 
--- src/features/ast.ts | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/features/ast.ts b/src/features/ast.ts index 30918662..6edd428f 100644 --- a/src/features/ast.ts +++ b/src/features/ast.ts @@ -102,9 +102,16 @@ export async function buildAstNodes( } if (!needsJsFallback) { + const expectedNodes = batches.reduce((s, b) => s + b.nodes.length, 0); const inserted = native.bulkInsertAstNodes(db.name, batches); - debug(`AST extraction (native bulk): ${inserted} nodes stored`); - return; + if (inserted === expectedNodes) { + debug(`AST extraction (native bulk): ${inserted} nodes stored`); + return; + } + debug( + `AST extraction (native bulk): expected ${expectedNodes}, got ${inserted} — falling back to JS`, + ); + // fall through to JS path } } From d4cbd3274a4e2d6fd0c6c357377bb331a1d4c77e Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Thu, 26 Mar 2026 22:43:40 -0600 Subject: [PATCH 4/6] docs(cargo): document rusqlite bundled feature rationale (#651) Explain why bundled is intentional: Windows CI lacks system SQLite, and dual-instance WAL coordination is OS-safe. --- crates/codegraph-core/Cargo.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crates/codegraph-core/Cargo.toml b/crates/codegraph-core/Cargo.toml index a9b5fda8..e7cd155d 100644 --- a/crates/codegraph-core/Cargo.toml +++ b/crates/codegraph-core/Cargo.toml @@ -24,6 +24,9 @@ tree-sitter-ruby = "0.23" tree-sitter-php = "0.23" tree-sitter-hcl = "1" rayon = "1" +# `bundled` embeds a second SQLite copy (better-sqlite3 already bundles one). +# This is intentional: Windows CI lacks a system SQLite, and WAL coordination +# between the two instances is handled safely at the OS level. 
rusqlite = { version = "0.32", features = ["bundled"] } send_wrapper = "0.6" From 5be93ca96356fdc31e91be8406ef5343abe95bc9 Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Thu, 26 Mar 2026 22:53:35 -0600 Subject: [PATCH 5/6] fix(ast): match JS findParentDef semantics for null end_line (#651) The Rust find_parent_id skipped definitions with end_line = NULL, but the JS findParentDef treats them as always-enclosing with a negative span (preferred over wider defs). This caused parent_node_id mismatches between native and JS paths. --- crates/codegraph-core/src/ast_db.rs | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/crates/codegraph-core/src/ast_db.rs b/crates/codegraph-core/src/ast_db.rs index 816ada7e..be821bed 100644 --- a/crates/codegraph-core/src/ast_db.rs +++ b/crates/codegraph-core/src/ast_db.rs @@ -38,19 +38,24 @@ struct NodeDef { /// Find the narrowest enclosing definition for a given source line. /// Returns the node ID of the best match, or None if no definition encloses this line. +/// +/// Mirrors the JS `findParentDef` semantics: a definition with `end_line = NULL` +/// is treated as always enclosing, with a negative sentinel span so it is preferred +/// over definitions that have an explicit (wider) `end_line`. fn find_parent_id(defs: &[NodeDef], line: u32) -> Option<i64> { let mut best_id: Option<i64> = None; - let mut best_span = u32::MAX; + let mut best_span: i64 = i64::MAX; for d in defs { if d.line <= line { - if let Some(el) = d.end_line { - if el >= line { - let span = el - d.line; - if span < best_span { - best_id = Some(d.id); - best_span = span; - } - } + let span: i64 = match d.end_line { + Some(el) if el >= line => (el - d.line) as i64, + Some(_) => continue, + // JS: (def.endLine ?? 
0) - def.line → negative, always preferred + None => -(d.line as i64), + }; + if span < best_span { + best_id = Some(d.id); + best_span = span; } } } From 6dc0a80299d6bb450c722089c30e9a95555d66fa Mon Sep 17 00:00:00 2001 From: carlos-alm <127798846+carlos-alm@users.noreply.github.com> Date: Thu, 26 Mar 2026 23:18:22 -0600 Subject: [PATCH 6/6] fix(ast): treat row-level execute errors as fatal for transaction (#651) Return 0 immediately on any insert_stmt.execute() failure so the transaction drops and rolls back, ensuring all-or-nothing semantics. Previously, .is_ok() silently swallowed row-level errors which could commit partial data and misfire the JS fallback causing duplicate rows. --- crates/codegraph-core/src/ast_db.rs | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/crates/codegraph-core/src/ast_db.rs b/crates/codegraph-core/src/ast_db.rs index be821bed..4f317db1 100644 --- a/crates/codegraph-core/src/ast_db.rs +++ b/crates/codegraph-core/src/ast_db.rs @@ -141,19 +141,17 @@ pub fn bulk_insert_ast_nodes(db_path: String, batches: Vec<FileAstBatch>) -> u32 for node in &batch.nodes { let parent_id = find_parent_id(defs, node.line); - if insert_stmt - .execute(params![ - &batch.file, - node.line, - &node.kind, - &node.name, - &node.text, - &node.receiver, - parent_id, - ]) - .is_ok() - { - total += 1; + match insert_stmt.execute(params![ + &batch.file, + node.line, + &node.kind, + &node.name, + &node.text, + &node.receiver, + parent_id, + ]) { + Ok(_) => total += 1, + Err(_) => return 0, // abort; tx rolls back on drop } } }