Skip to content
Merged
4 changes: 4 additions & 0 deletions crates/codegraph-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ tree-sitter-ruby = "0.23"
tree-sitter-php = "0.23"
tree-sitter-hcl = "1"
rayon = "1"
# `bundled` embeds a second SQLite copy (better-sqlite3 already bundles one).
# This is intentional: Windows CI lacks a system SQLite, and WAL coordination
# between the two instances is handled safely at the OS level.
rusqlite = { version = "0.32", features = ["bundled"] }
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 bundled feature embeds a second independent copy of SQLite into the process

With features = ["bundled"], rusqlite statically compiles its own SQLite C library. better-sqlite3 already bundles its own copy. The process will therefore contain two separate SQLite runtime instances that each open the same .db file via OS-level file locking.

This is functionally correct — WAL-mode coordination is handled by the OS — but it has a few downsides:

  • Binary size grows (statically compiling SQLite adds roughly 1 MB of compiled C code)
  • rusqlite's bundled SQLite version is independent of better-sqlite3's, requiring separate tracking when a SQLite CVE lands

If the target platform is guaranteed to have a system SQLite (Linux/macOS), removing bundled and relying on libsqlite3-sys with dynamic linking would keep a single SQLite instance. If portability is the priority (e.g., Windows CI without system SQLite), bundled is the safer choice — just worth a conscious decision here.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a comment documenting the rationale: bundled is intentional because Windows CI lacks a system SQLite, and WAL coordination between the two instances is handled safely at the OS level. Removing it would break Windows builds.

send_wrapper = "0.6"

[build-dependencies]
Expand Down
165 changes: 165 additions & 0 deletions crates/codegraph-core/src/ast_db.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
//! Bulk AST node insertion via rusqlite.
//!
//! Bypasses the JS iteration loop by opening the SQLite database directly
//! from Rust and inserting all AST nodes in a single transaction.
//! Parent node IDs are resolved by querying the `nodes` table.

use std::collections::HashMap;

use napi_derive::napi;
use rusqlite::{params, Connection, OpenFlags};
use serde::{Deserialize, Serialize};

/// A single AST node to insert (received from JS).
///
/// Mirrors the `ast_nodes` columns except `file` (supplied per batch by
/// [`FileAstBatch`]) and `parent_node_id` (resolved at insert time).
#[napi(object)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AstInsertNode {
    /// Source line of the node; also the key used to resolve `parent_node_id`.
    /// NOTE(review): assumed to use the same line-numbering convention as the
    /// `nodes` table — confirm against the JS extractor.
    pub line: u32,
    /// Node kind (stored verbatim in the `kind` column; vocabulary is defined
    /// on the JS side).
    pub kind: String,
    /// Symbol/identifier name associated with the node.
    pub name: String,
    /// Optional source-text snippet; inserted as NULL when absent.
    pub text: Option<String>,
    /// Optional receiver of the node (NULL when absent).
    pub receiver: Option<String>,
}

/// A batch of AST nodes for a single file.
#[napi(object)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileAstBatch {
    /// File path as stored in the `nodes.file` / `ast_nodes.file` columns
    /// (a relative path on the JS side).
    pub file: String,
    /// Nodes to insert for this file; an empty list is skipped entirely.
    pub nodes: Vec<AstInsertNode>,
}

/// A definition row from the `nodes` table used for parent resolution.
struct NodeDef {
    id: i64,
    line: u32,
    end_line: Option<u32>,
}

/// Return the node ID of the narrowest definition enclosing `line`,
/// or `None` when no definition encloses it.
///
/// Keeps parity with the JS `findParentDef`: a definition whose
/// `end_line` is NULL counts as always-enclosing and is given a negative
/// sentinel span (`-line`), so it beats any definition with an explicit
/// (and therefore non-negative) span.
fn find_parent_id(defs: &[NodeDef], line: u32) -> Option<i64> {
    defs.iter()
        .filter_map(|def| {
            // A definition starting after `line` can never enclose it.
            if def.line > line {
                return None;
            }
            let span = match def.end_line {
                // Explicit range covering `line`: span is the range width.
                Some(end) if end >= line => i64::from(end - def.line),
                // Explicit range ending before `line`: not enclosing.
                Some(_) => return None,
                // JS: (def.endLine ?? 0) - def.line → negative, always preferred.
                None => -i64::from(def.line),
            };
            Some((span, def.id))
        })
        // `min_by_key` returns the FIRST minimal element, matching the
        // original's strict `<` comparison (earlier defs win ties).
        .min_by_key(|&(span, _)| span)
        .map(|(_, id)| id)
}

/// Bulk-insert AST nodes into the database, resolving `parent_node_id`
/// from the `nodes` table. Runs all inserts in a single SQLite transaction.
///
/// Returns the number of rows inserted. Returns 0 on any error (DB open
/// failure, missing table, transaction failure). Note that 0 is also the
/// success value for empty batches, so callers should compare against the
/// expected count (as the JS fast path does) rather than test for 0.
#[napi]
pub fn bulk_insert_ast_nodes(db_path: String, batches: Vec<FileAstBatch>) -> u32 {
    if batches.is_empty() {
        return 0;
    }

    // NO_MUTEX disables SQLite's per-connection mutex; safe as long as the
    // connection never leaves this synchronous call. NOTE(review): confirm the
    // napi binding cannot invoke this concurrently from multiple threads.
    let flags = OpenFlags::SQLITE_OPEN_READ_WRITE | OpenFlags::SQLITE_OPEN_NO_MUTEX;
    let mut conn = match Connection::open_with_flags(&db_path, flags) {
        Ok(c) => c,
        Err(_) => return 0,
    };

    // Match the JS-side performance pragmas (including busy_timeout for WAL contention).
    // Best-effort: a pragma failure only costs performance, not correctness.
    let _ = conn.execute_batch(
        "PRAGMA synchronous = NORMAL; PRAGMA busy_timeout = 5000",
    );

    // Bail out if the ast_nodes table doesn't exist (schema too old).
    // `query_row` yields Err(QueryReturnedNoRows) when the table is absent,
    // which `unwrap_or(false)` maps to `false`.
    let has_table: bool = conn
        .prepare("SELECT 1 FROM sqlite_master WHERE type='table' AND name='ast_nodes'")
        .and_then(|mut s| s.query_row([], |_| Ok(true)))
        .unwrap_or(false);
    if !has_table {
        return 0;
    }

    // ── Phase 1: Pre-fetch node definitions for parent resolution ────────
    // One query per distinct file; `contains_key` guards against re-querying
    // when several batches reference the same file.
    let mut file_defs: HashMap<String, Vec<NodeDef>> = HashMap::new();
    {
        let Ok(mut stmt) =
            conn.prepare("SELECT id, line, end_line FROM nodes WHERE file = ?1")
        else {
            return 0;
        };

        for batch in &batches {
            if batch.nodes.is_empty() || file_defs.contains_key(&batch.file) {
                continue;
            }
            // Row-decode errors are dropped (filter_map) and a statement-level
            // error yields an empty def list — either way the file's nodes are
            // still inserted, just with a NULL parent_node_id.
            let defs: Vec<NodeDef> = stmt
                .query_map(params![&batch.file], |row| {
                    Ok(NodeDef {
                        id: row.get(0)?,
                        line: row.get(1)?,
                        end_line: row.get(2)?,
                    })
                })
                .map(|rows| rows.filter_map(|r| r.ok()).collect())
                .unwrap_or_default();
            file_defs.insert(batch.file.clone(), defs);
        }
    } // `stmt` dropped — releases the immutable borrow on `conn`

    // ── Phase 2: Bulk insert in a single transaction ─────────────────────
    let Ok(tx) = conn.transaction() else {
        return 0;
    };

    let mut total = 0u32;
    {
        // Prepared once, executed per node — avoids re-parsing the SQL.
        let Ok(mut insert_stmt) = tx.prepare(
            "INSERT INTO ast_nodes (file, line, kind, name, text, receiver, parent_node_id) \
             VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
        ) else {
            return 0; // `tx` rolls back on drop
        };

        for batch in &batches {
            let empty = Vec::new();
            let defs = file_defs.get(&batch.file).unwrap_or(&empty);

            for node in &batch.nodes {
                // NULL parent when no definition in `nodes` encloses this line.
                let parent_id = find_parent_id(defs, node.line);

                match insert_stmt.execute(params![
                    &batch.file,
                    node.line,
                    &node.kind,
                    &node.name,
                    &node.text,
                    &node.receiver,
                    parent_id,
                ]) {
                    Ok(_) => total += 1,
                    Err(_) => return 0, // abort; tx rolls back on drop
                }
            }
        }
    } // `insert_stmt` dropped — releases the borrow on `tx` so it can commit

    // All-or-nothing: a failed commit discards every row and reports 0, which
    // the JS caller detects as a count mismatch and uses to fall back.
    if tx.commit().is_err() {
        return 0;
    }

    total
}
1 change: 1 addition & 0 deletions crates/codegraph-core/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pub mod ast_db;
pub mod cfg;
pub mod complexity;
pub mod constants;
Expand Down
49 changes: 49 additions & 0 deletions src/features/ast.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import { createAstStoreVisitor } from '../ast-analysis/visitors/ast-store-visito
import { bulkNodeIdsByFile, openReadonlyOrFail } from '../db/index.js';
import { buildFileConditionSQL } from '../db/query-builder.js';
import { debug } from '../infrastructure/logger.js';
import { loadNative } from '../infrastructure/native.js';
import { outputResult } from '../infrastructure/result-formatter.js';
import { paginateResult } from '../shared/paginate.js';
import type { ASTNodeKind, BetterSqlite3Database, Definition, TreeSitterNode } from '../types.js';
Expand Down Expand Up @@ -67,6 +68,54 @@ export async function buildAstNodes(
_rootDir: string,
_engineOpts?: unknown,
): Promise<void> {
// ── Native bulk-insert fast path ──────────────────────────────────────
const native = loadNative();
if (native?.bulkInsertAstNodes) {
let needsJsFallback = false;
const batches: Array<{
file: string;
nodes: Array<{
line: number;
kind: string;
name: string;
text?: string | null;
receiver?: string | null;
}>;
}> = [];

for (const [relPath, symbols] of fileSymbols) {
if (Array.isArray(symbols.astNodes)) {
batches.push({
file: relPath,
nodes: symbols.astNodes.map((n) => ({
line: n.line,
kind: n.kind,
name: n.name,
text: n.text,
receiver: n.receiver,
})),
});
} else if (symbols.calls || symbols._tree) {
needsJsFallback = true;
break;
}
}

if (!needsJsFallback) {
const expectedNodes = batches.reduce((s, b) => s + b.nodes.length, 0);
const inserted = native.bulkInsertAstNodes(db.name, batches);
if (inserted === expectedNodes) {
debug(`AST extraction (native bulk): ${inserted} nodes stored`);
return;
}
debug(
`AST extraction (native bulk): expected ${expectedNodes}, got ${inserted} — falling back to JS`,
);
// fall through to JS path
}
}

// ── JS fallback path ──────────────────────────────────────────────────
let insertStmt: ReturnType<BetterSqlite3Database['prepare']>;
try {
insertStmt = db.prepare(
Expand Down
13 changes: 13 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1802,6 +1802,19 @@ export interface NativeAddon {
computeConfidence(callerFile: string, targetFile: string, importedFrom: string | null): number;
detectCycles(edges: Array<{ source: string; target: string }>): string[][];
buildCallEdges(files: unknown[], nodes: unknown[], builtinReceivers: string[]): unknown[];
bulkInsertAstNodes(
dbPath: string,
batches: Array<{
file: string;
nodes: Array<{
line: number;
kind: string;
name: string;
text?: string | null;
receiver?: string | null;
}>;
}>,
): number;
engineVersion(): string;
ParseTreeCache: new () => NativeParseTreeCache;
}
Expand Down
Loading