Skip to content

Commit d2235f0

Browse files
authored
perf(ast): bulk-insert AST nodes via native Rust/rusqlite (#651)
* perf(ast): bulk-insert AST nodes via native Rust/rusqlite

  Move AST node SQLite inserts from per-row JS iteration to a single native Rust transaction via napi-rs + rusqlite. The new bulkInsertAstNodes function opens the DB directly from Rust, pre-fetches parent node definitions, and inserts all rows in one transaction — eliminating the JS-native FFI overhead per row. The JS-side buildAstNodes tries the native fast path first (when all files have native astNodes arrays), falling back to the existing JS loop for WASM or mixed-engine scenarios. Target: astMs < 50ms on native full builds (was ~393ms).

* fix(ast): add busy_timeout pragma to Rust SQLite connection (#651)

  The Rust connection omitted busy_timeout = 5000 which the JS-side connection.ts sets. Without it, SQLITE_BUSY is returned immediately on WAL contention instead of retrying for 5 seconds.

* fix(ast): fall back to JS when native bulk insert count mismatches (#651)

  bulkInsertAstNodes returns 0 for both "nothing to insert" and hard errors (DB open failure, SQLITE_BUSY, etc). Compare expected vs actual count and fall through to the JS path on mismatch so errors don't silently drop all AST nodes.

* docs(cargo): document rusqlite bundled feature rationale (#651)

  Explain why bundled is intentional: Windows CI lacks system SQLite, and dual-instance WAL coordination is OS-safe.

* fix(ast): match JS findParentDef semantics for null end_line (#651)

  The Rust find_parent_id skipped definitions with end_line = NULL, but the JS findParentDef treats them as always-enclosing with a negative span (preferred over wider defs). This caused parent_node_id mismatches between native and JS paths.

* fix(ast): treat row-level execute errors as fatal for transaction (#651)

  Return 0 immediately on any insert_stmt.execute() failure so the transaction drops and rolls back, ensuring all-or-nothing semantics. Previously, .is_ok() silently swallowed row-level errors which could commit partial data and misfire the JS fallback causing duplicate rows.
1 parent 398f9f8 commit d2235f0

5 files changed

Lines changed: 232 additions & 0 deletions

File tree

crates/codegraph-core/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@ tree-sitter-ruby = "0.23"
2424
tree-sitter-php = "0.23"
2525
tree-sitter-hcl = "1"
2626
rayon = "1"
27+
# `bundled` embeds a second SQLite copy (better-sqlite3 already bundles one).
28+
# This is intentional: Windows CI lacks a system SQLite, and WAL coordination
29+
# between the two instances is handled safely at the OS level.
30+
rusqlite = { version = "0.32", features = ["bundled"] }
2731
send_wrapper = "0.6"
2832

2933
[build-dependencies]
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
//! Bulk AST node insertion via rusqlite.
2+
//!
3+
//! Bypasses the JS iteration loop by opening the SQLite database directly
4+
//! from Rust and inserting all AST nodes in a single transaction.
5+
//! Parent node IDs are resolved by querying the `nodes` table.
6+
7+
use std::collections::HashMap;
8+
9+
use napi_derive::napi;
10+
use rusqlite::{params, Connection, OpenFlags};
11+
use serde::{Deserialize, Serialize};
12+
13+
/// A single AST node to insert (received from JS).
14+
#[napi(object)]
15+
#[derive(Debug, Clone, Serialize, Deserialize)]
16+
pub struct AstInsertNode {
17+
pub line: u32,
18+
pub kind: String,
19+
pub name: String,
20+
pub text: Option<String>,
21+
pub receiver: Option<String>,
22+
}
23+
24+
/// A batch of AST nodes for a single file.
25+
#[napi(object)]
26+
#[derive(Debug, Clone, Serialize, Deserialize)]
27+
pub struct FileAstBatch {
28+
pub file: String,
29+
pub nodes: Vec<AstInsertNode>,
30+
}
31+
32+
/// A definition row from the `nodes` table used for parent resolution.
///
/// Plain-old-data, so it derives the cheap standard traits: `Debug` for
/// diagnostics, `Clone`/`Copy` because every field is `Copy`, and
/// `PartialEq`/`Eq` so rows can be compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct NodeDef {
    /// Value of `nodes.id`; this is what ends up in `ast_nodes.parent_node_id`.
    id: i64,
    /// Value of `nodes.line`: the line on which the definition starts.
    line: u32,
    /// Value of `nodes.end_line`; `None` when the column is `NULL`.
    end_line: Option<u32>,
}
38+
39+
/// Find the narrowest enclosing definition for a given source line.
40+
/// Returns the node ID of the best match, or None if no definition encloses this line.
41+
///
42+
/// Mirrors the JS `findParentDef` semantics: a definition with `end_line = NULL`
43+
/// is treated as always enclosing, with a negative sentinel span so it is preferred
44+
/// over definitions that have an explicit (wider) `end_line`.
45+
fn find_parent_id(defs: &[NodeDef], line: u32) -> Option<i64> {
46+
let mut best_id: Option<i64> = None;
47+
let mut best_span: i64 = i64::MAX;
48+
for d in defs {
49+
if d.line <= line {
50+
let span: i64 = match d.end_line {
51+
Some(el) if el >= line => (el - d.line) as i64,
52+
Some(_) => continue,
53+
// JS: (def.endLine ?? 0) - def.line → negative, always preferred
54+
None => -(d.line as i64),
55+
};
56+
if span < best_span {
57+
best_id = Some(d.id);
58+
best_span = span;
59+
}
60+
}
61+
}
62+
best_id
63+
}
64+
65+
/// Bulk-insert AST nodes into the database, resolving `parent_node_id`
66+
/// from the `nodes` table. Runs all inserts in a single SQLite transaction.
67+
///
68+
/// Returns the number of rows inserted. Returns 0 on any error (DB open
69+
/// failure, missing table, transaction failure).
70+
#[napi]
71+
pub fn bulk_insert_ast_nodes(db_path: String, batches: Vec<FileAstBatch>) -> u32 {
72+
if batches.is_empty() {
73+
return 0;
74+
}
75+
76+
let flags = OpenFlags::SQLITE_OPEN_READ_WRITE | OpenFlags::SQLITE_OPEN_NO_MUTEX;
77+
let mut conn = match Connection::open_with_flags(&db_path, flags) {
78+
Ok(c) => c,
79+
Err(_) => return 0,
80+
};
81+
82+
// Match the JS-side performance pragmas (including busy_timeout for WAL contention)
83+
let _ = conn.execute_batch(
84+
"PRAGMA synchronous = NORMAL; PRAGMA busy_timeout = 5000",
85+
);
86+
87+
// Bail out if the ast_nodes table doesn't exist (schema too old)
88+
let has_table: bool = conn
89+
.prepare("SELECT 1 FROM sqlite_master WHERE type='table' AND name='ast_nodes'")
90+
.and_then(|mut s| s.query_row([], |_| Ok(true)))
91+
.unwrap_or(false);
92+
if !has_table {
93+
return 0;
94+
}
95+
96+
// ── Phase 1: Pre-fetch node definitions for parent resolution ────────
97+
let mut file_defs: HashMap<String, Vec<NodeDef>> = HashMap::new();
98+
{
99+
let Ok(mut stmt) =
100+
conn.prepare("SELECT id, line, end_line FROM nodes WHERE file = ?1")
101+
else {
102+
return 0;
103+
};
104+
105+
for batch in &batches {
106+
if batch.nodes.is_empty() || file_defs.contains_key(&batch.file) {
107+
continue;
108+
}
109+
let defs: Vec<NodeDef> = stmt
110+
.query_map(params![&batch.file], |row| {
111+
Ok(NodeDef {
112+
id: row.get(0)?,
113+
line: row.get(1)?,
114+
end_line: row.get(2)?,
115+
})
116+
})
117+
.map(|rows| rows.filter_map(|r| r.ok()).collect())
118+
.unwrap_or_default();
119+
file_defs.insert(batch.file.clone(), defs);
120+
}
121+
} // `stmt` dropped — releases the immutable borrow on `conn`
122+
123+
// ── Phase 2: Bulk insert in a single transaction ─────────────────────
124+
let Ok(tx) = conn.transaction() else {
125+
return 0;
126+
};
127+
128+
let mut total = 0u32;
129+
{
130+
let Ok(mut insert_stmt) = tx.prepare(
131+
"INSERT INTO ast_nodes (file, line, kind, name, text, receiver, parent_node_id) \
132+
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
133+
) else {
134+
return 0;
135+
};
136+
137+
for batch in &batches {
138+
let empty = Vec::new();
139+
let defs = file_defs.get(&batch.file).unwrap_or(&empty);
140+
141+
for node in &batch.nodes {
142+
let parent_id = find_parent_id(defs, node.line);
143+
144+
match insert_stmt.execute(params![
145+
&batch.file,
146+
node.line,
147+
&node.kind,
148+
&node.name,
149+
&node.text,
150+
&node.receiver,
151+
parent_id,
152+
]) {
153+
Ok(_) => total += 1,
154+
Err(_) => return 0, // abort; tx rolls back on drop
155+
}
156+
}
157+
}
158+
} // `insert_stmt` dropped
159+
160+
if tx.commit().is_err() {
161+
return 0;
162+
}
163+
164+
total
165+
}

crates/codegraph-core/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
pub mod ast_db;
12
pub mod cfg;
23
pub mod complexity;
34
pub mod constants;

src/features/ast.ts

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import { createAstStoreVisitor } from '../ast-analysis/visitors/ast-store-visito
66
import { bulkNodeIdsByFile, openReadonlyOrFail } from '../db/index.js';
77
import { buildFileConditionSQL } from '../db/query-builder.js';
88
import { debug } from '../infrastructure/logger.js';
9+
import { loadNative } from '../infrastructure/native.js';
910
import { outputResult } from '../infrastructure/result-formatter.js';
1011
import { paginateResult } from '../shared/paginate.js';
1112
import type { ASTNodeKind, BetterSqlite3Database, Definition, TreeSitterNode } from '../types.js';
@@ -67,6 +68,54 @@ export async function buildAstNodes(
6768
_rootDir: string,
6869
_engineOpts?: unknown,
6970
): Promise<void> {
71+
// ── Native bulk-insert fast path ──────────────────────────────────────
72+
const native = loadNative();
73+
if (native?.bulkInsertAstNodes) {
74+
let needsJsFallback = false;
75+
const batches: Array<{
76+
file: string;
77+
nodes: Array<{
78+
line: number;
79+
kind: string;
80+
name: string;
81+
text?: string | null;
82+
receiver?: string | null;
83+
}>;
84+
}> = [];
85+
86+
for (const [relPath, symbols] of fileSymbols) {
87+
if (Array.isArray(symbols.astNodes)) {
88+
batches.push({
89+
file: relPath,
90+
nodes: symbols.astNodes.map((n) => ({
91+
line: n.line,
92+
kind: n.kind,
93+
name: n.name,
94+
text: n.text,
95+
receiver: n.receiver,
96+
})),
97+
});
98+
} else if (symbols.calls || symbols._tree) {
99+
needsJsFallback = true;
100+
break;
101+
}
102+
}
103+
104+
if (!needsJsFallback) {
105+
const expectedNodes = batches.reduce((s, b) => s + b.nodes.length, 0);
106+
const inserted = native.bulkInsertAstNodes(db.name, batches);
107+
if (inserted === expectedNodes) {
108+
debug(`AST extraction (native bulk): ${inserted} nodes stored`);
109+
return;
110+
}
111+
debug(
112+
`AST extraction (native bulk): expected ${expectedNodes}, got ${inserted} — falling back to JS`,
113+
);
114+
// fall through to JS path
115+
}
116+
}
117+
118+
// ── JS fallback path ──────────────────────────────────────────────────
70119
let insertStmt: ReturnType<BetterSqlite3Database['prepare']>;
71120
try {
72121
insertStmt = db.prepare(

src/types.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1802,6 +1802,19 @@ export interface NativeAddon {
18021802
computeConfidence(callerFile: string, targetFile: string, importedFrom: string | null): number;
18031803
detectCycles(edges: Array<{ source: string; target: string }>): string[][];
18041804
buildCallEdges(files: unknown[], nodes: unknown[], builtinReceivers: string[]): unknown[];
1805+
bulkInsertAstNodes(
1806+
dbPath: string,
1807+
batches: Array<{
1808+
file: string;
1809+
nodes: Array<{
1810+
line: number;
1811+
kind: string;
1812+
name: string;
1813+
text?: string | null;
1814+
receiver?: string | null;
1815+
}>;
1816+
}>,
1817+
): number;
18051818
engineVersion(): string;
18061819
ParseTreeCache: new () => NativeParseTreeCache;
18071820
}

0 commit comments

Comments
 (0)