diff --git a/Cargo.lock b/Cargo.lock index 29add1d9..31fb2555 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -78,12 +78,56 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" +[[package]] +name = "anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + [[package]] name = "anstyle" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.1", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.1", +] + [[package]] name = "anyhow" version = "1.0.100" @@ -1218,6 +1262,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2134bb3ea021b78629caa971416385309e0131b351b25e01dc16fb54e1b5fae" dependencies = [ "clap_builder", + "clap_derive", ] [[package]] @@ -1226,8 +1271,22 @@ version = "4.5.48" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2ba64afa3c0a6df7fa517765e31314e983f51dda798ffba27b988194fb65dc9" dependencies = [ + "anstream", "anstyle", "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbfd7eae0b0f1a6e63d4b13c9c478de77c2eb546fba158ad50b4203dc24b9f9c" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.106", ] [[package]] @@ -1245,6 +1304,12 @@ dependencies = [ "cc", ] +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + [[package]] name = "comfy-table" version = "7.1.2" @@ -3666,6 +3731,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + [[package]] name = "itertools" version = "0.10.5" @@ -4135,6 +4206,28 @@ dependencies = [ "wiremock", ] +[[package]] +name = "lance-graph-cli" +version = "0.5.3" +dependencies = [ + "anyhow", + "arrow", + "arrow-array", + "arrow-cast", + "arrow-csv", + "arrow-json", + "arrow-schema", + "clap", + "datafusion", + "lance", + "lance-graph", + "lance-namespace", + "serde", + "serde_json", + "tokio", + "toml", +] + [[package]] name = "lance-graph-python" version = "0.5.3" @@ -4966,6 +5059,12 @@ version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + [[package]] name = "oneshot" version = "0.1.11" @@ -5437,7 +5536,7 @@ version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" dependencies = [ - "toml_edit", + "toml_edit 0.23.10+spec-1.0.0", ] [[package]] @@ -6388,6 +6487,15 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -7175,6 +7283,27 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime 0.6.11", + "toml_edit 0.22.27", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + [[package]] name = "toml_datetime" version = "0.7.5+spec-1.1.0" @@ -7184,6 +7313,20 @@ dependencies = [ "serde_core", ] +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap", + "serde", + "serde_spanned", + "toml_datetime 0.6.11", + "toml_write", + "winnow", +] + [[package]] name = "toml_edit" version = "0.23.10+spec-1.0.0" @@ -7191,7 +7334,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84c8b9f757e028cee9fa244aea147aab2a9ec09d5325a9b01e0a49730c2b5269" dependencies = [ "indexmap", - "toml_datetime", + "toml_datetime 0.7.5+spec-1.1.0", "toml_parser", "winnow", ] @@ -7205,6 +7348,12 @@ dependencies = [ "winnow", ] +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + [[package]] name = "tower" version = "0.5.2" @@ -7404,6 +7553,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "uuid" version = "1.18.1" diff --git a/Cargo.toml b/Cargo.toml index c66043bd..06abd2c5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,5 +4,6 @@ members = [ "crates/lance-graph-catalog", "crates/lance-graph-python", "crates/lance-graph-benches", + "crates/lance-graph-cli", ] resolver = "2" diff --git a/crates/lance-graph-cli/Cargo.toml b/crates/lance-graph-cli/Cargo.toml new file mode 100644 index 00000000..f5c14405 --- /dev/null +++ b/crates/lance-graph-cli/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "lance-graph-cli" +version = "0.5.3" +edition = "2021" +license = "Apache-2.0" +description = "CLI for lance-graph: run Cypher and SQL queries from the command line" + +[[bin]] +name = "lgraph" +path = "src/main.rs" + +[dependencies] +lance-graph = { path = "../lance-graph" } +lance = "1.0.0" +lance-namespace = "1.0.1" +clap = { version = "4", features = ["derive"] } +tokio = { version = "1", features = ["full"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +toml = "0.8" +arrow = { version = "56.2", features = ["prettyprint", "json"] } +arrow-array = "56.2" +arrow-cast = "56.2" +arrow-json = "56.2" +arrow-csv = "56.2" +arrow-schema = "56.2" +datafusion = { version = "50.3", default-features = false, features = ["parquet"] } +anyhow = "1" diff --git a/crates/lance-graph-cli/README.md b/crates/lance-graph-cli/README.md new file mode 100644 index 00000000..b41c8b59 --- /dev/null +++ b/crates/lance-graph-cli/README.md @@ -0,0 +1,201 @@ +# lgraph CLI + +Command-line interface for the lance-graph query engine. Run Cypher and SQL queries against Lance, Parquet, and Delta Lake datasets. + +## Installation + +```bash +cargo install --path crates/lance-graph-cli +``` + +Or build from the workspace: + +```bash +cargo build -p lance-graph-cli --release +``` + +The binary is named `lgraph`. + +## Quick Start + +```bash +# Generate a config file +lgraph init + +# Edit lgraph.toml to point to your data and define graph schema +# Then run queries: +lgraph cypher "MATCH (p:Person)-[:KNOWS]->(f:Person) RETURN p.name, f.name" +lgraph sql "SELECT name, age FROM person WHERE age > 30" +``` + +## Configuration + +`lgraph` uses a TOML config file (default: `./lgraph.toml`). Use `-c ` to specify a different location. + +### Namespace (local Lance/Parquet tables) + +```toml +namespace = "/path/to/tables" + +[graph.nodes.Person] +id_field = "person_id" + +[graph.nodes.Company] +id_field = "company_id" + +[graph.relationships.WORKS_AT] +source_field = "person_id" +target_field = "company_id" +``` + +The `namespace` path should point to a directory containing `.lance` datasets. By default, the section key (e.g., `Person`) is used as both the Cypher label and the table name. + +### Table name aliases + +When your physical table name differs from the graph label you want to use in Cypher, use the `table` field: + +```toml +namespace = "/path/to/tables" + +# Cypher label "Person" reads from the "person_entity" table +[graph.nodes.Person] +id_field = "person_id" +table = "person_entity" + +# Cypher label "Company" reads from a table also called "Company" (default) +[graph.nodes.Company] +id_field = "company_id" + +# Cypher type "WORKS_AT" reads from the "employment_info" table +[graph.relationships.WORKS_AT] +source_field = "person_id" +target_field = "company_id" +table = "employment_info" +``` + +Now you can query using the graph labels: + +```cypher +MATCH (p:Person)-[:WORKS_AT]->(c:Company) RETURN p.name, c.name +``` + +The engine reads from `person_entity` and `employment_info` under the hood. + +### Unity Catalog + +```toml +[catalog] +url = "http://localhost:8080/api/2.1/unity-catalog" +catalog_name = "main" +schema_name = "default" + +[catalog.storage_options] +aws_access_key_id = "..." +aws_secret_access_key = "..." +``` + +### Graph schema + +The `[graph]` section maps your tabular data to a property graph model. It is required for Cypher queries but optional for SQL. + +- **Nodes**: each entry names a node label and its ID column. Use `table` to map to a different physical table. +- **Relationships**: each entry names a relationship type and its source/target ID columns. Use `table` to map to a different physical table. + +## Commands + +### `cypher` — Run a Cypher query + +```bash +lgraph cypher "MATCH (p:Person) WHERE p.age > 30 RETURN p.name, p.age" +``` + +Requires a `[graph]` section in the config file. + +### `sql` — Run a SQL query + +```bash +lgraph sql "SELECT name, age FROM person ORDER BY age DESC LIMIT 10" +``` + +Tables defined in the `[graph]` section are registered and available by name (lowercased). + +### `tables` — List available tables + +```bash +lgraph tables +``` + +### `schema` — Show table schema + +```bash +lgraph schema Person +``` + +### `init` — Create a template config + +```bash +lgraph init # creates ./lgraph.toml +lgraph init -c my-config.toml # creates at custom path +``` + +## Output Formats + +Use `--format` (or `-f`) to control output. When omitted, the format is auto-detected: + +- **TTY** (interactive terminal): `table` +- **Piped** (scripts, Claude skills): `jsonl` + +| Format | Flag | Description | +|--------|------|-------------| +| Table | `-f table` | Human-readable ASCII table | +| JSONL | `-f jsonl` | One JSON object per row (best for agents) | +| JSON | `-f json` | Single JSON object with metadata envelope | +| CSV | `-f csv` | Comma-separated values with header | + +### Examples + +```bash +# Pretty table in terminal +lgraph cypher "MATCH (p:Person) RETURN p.name" -f table + +# JSONL for piping to jq +lgraph cypher "MATCH (p:Person) RETURN p.name" -f jsonl | jq '.name' + +# JSON envelope with metadata +lgraph sql "SELECT count(*) as cnt FROM person" -f json + +# CSV for data export +lgraph sql "SELECT * FROM person" -f csv > people.csv +``` + +### JSONL output (default when piped) + +```json +{"name": "Alice", "age": 28} +{"name": "Bob", "age": 34} +``` + +### JSON envelope output + +```json +{ + "columns": ["name", "age"], + "row_count": 2, + "rows": [ + {"name": "Alice", "age": 28}, + {"name": "Bob", "age": 34} + ] +} +``` + +## Usage with Claude Skills + +`lgraph` is designed to be called from Claude Code skills via bash. When piped, it defaults to JSONL output which is optimal for LLM consumption. + +```bash +# In a Claude skill, simply call lgraph: +lgraph cypher "MATCH (p:Person)-[:WORKS_AT]->(c:Company) RETURN p.name, c.name" + +# The JSONL output is self-describing and truncation-safe +lgraph sql "SELECT * FROM person WHERE city = 'New York'" -f json +``` diff --git a/crates/lance-graph-cli/src/catalog_helpers.rs b/crates/lance-graph-cli/src/catalog_helpers.rs new file mode 100644 index 00000000..45fd368e --- /dev/null +++ b/crates/lance-graph-cli/src/catalog_helpers.rs @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Helper for building a DataFusion SessionContext from a Unity Catalog. + +use anyhow::{Context, Result}; +use datafusion::execution::context::SessionContext; +use lance_graph::{default_table_readers, Connector, UnityCatalogConfig, UnityCatalogProvider}; +use std::sync::Arc; + +use crate::config::CatalogConfig; + +/// Build a SessionContext with tables from Unity Catalog registered. +pub async fn build_catalog_context(catalog_cfg: &CatalogConfig) -> Result { + let uc_config = UnityCatalogConfig::new(&catalog_cfg.url); + let provider = UnityCatalogProvider::new(uc_config).context("connecting to Unity Catalog")?; + let readers = default_table_readers(); + let connector = Connector::new(Arc::new(provider), readers) + .with_storage_options(catalog_cfg.storage_options.clone()); + let ctx = SessionContext::new(); + connector + .register_schema(&ctx, &catalog_cfg.catalog_name, &catalog_cfg.schema_name) + .await + .context("registering catalog schema")?; + Ok(ctx) +} diff --git a/crates/lance-graph-cli/src/commands/cypher.rs b/crates/lance-graph-cli/src/commands/cypher.rs new file mode 100644 index 00000000..7205f14d --- /dev/null +++ b/crates/lance-graph-cli/src/commands/cypher.rs @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use anyhow::{bail, Context, Result}; +use lance_graph::{CypherQuery, DirNamespace}; + +use crate::config::LGraphConfig; +use crate::output::{print_batch, OutputFormat}; + +pub async fn run(config: &LGraphConfig, query: &str, format: OutputFormat) -> Result<()> { + let graph_config = config + .build_graph_config()? + .context("Cypher queries require a [graph] section in the config file")?; + + let cypher = CypherQuery::new(query) + .context("parsing Cypher query")? + .with_config(graph_config); + + let result = if let Some(ns_path) = &config.namespace { + let namespace = DirNamespace::new(ns_path); + cypher + .execute_with_namespace(namespace, None) + .await + .context("executing Cypher query")? + } else if let Some(catalog_cfg) = &config.catalog { + let ctx = crate::catalog_helpers::build_catalog_context(catalog_cfg) + .await + .context("setting up catalog")?; + cypher + .execute_with_context(ctx) + .await + .context("executing Cypher query")? + } else { + bail!("No data source configured. Set 'namespace' or '[catalog]' in your config file."); + }; + + print_batch(&result, format)?; + Ok(()) +} diff --git a/crates/lance-graph-cli/src/commands/init.rs b/crates/lance-graph-cli/src/commands/init.rs new file mode 100644 index 00000000..39638709 --- /dev/null +++ b/crates/lance-graph-cli/src/commands/init.rs @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use anyhow::Result; +use std::path::Path; + +use crate::config::template_config; + +pub fn run(path: &Path) -> Result<()> { + if path.exists() { + eprintln!("Config file already exists: {}", path.display()); + std::process::exit(1); + } + std::fs::write(path, template_config())?; + println!("Created {}", path.display()); + Ok(()) +} diff --git a/crates/lance-graph-cli/src/commands/mod.rs b/crates/lance-graph-cli/src/commands/mod.rs new file mode 100644 index 00000000..74adbafc --- /dev/null +++ b/crates/lance-graph-cli/src/commands/mod.rs @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +pub mod cypher; +pub mod init; +pub mod schema; +pub mod sql; +pub mod tables; diff --git a/crates/lance-graph-cli/src/commands/schema.rs b/crates/lance-graph-cli/src/commands/schema.rs new file mode 100644 index 00000000..c997cb4e --- /dev/null +++ b/crates/lance-graph-cli/src/commands/schema.rs @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use anyhow::{bail, Context, Result}; + +use crate::config::LGraphConfig; +use crate::output::OutputFormat; + +pub async fn run(config: &LGraphConfig, table_name: &str, format: OutputFormat) -> Result<()> { + let ctx = if let Some(ns_path) = &config.namespace { + crate::namespace_helpers::build_namespace_context(config, ns_path) + .await + .context("setting up namespace tables")? + } else if let Some(catalog_cfg) = &config.catalog { + crate::catalog_helpers::build_catalog_context(catalog_cfg) + .await + .context("setting up catalog")? + } else { + bail!("No data source configured."); + }; + + let table_provider = ctx + .table_provider(&table_name.to_lowercase()) + .await + .context(format!("table '{}' not found", table_name))?; + let schema = table_provider.schema(); + + match format { + OutputFormat::Json | OutputFormat::Jsonl => { + for field in schema.fields() { + let obj = serde_json::json!({ + "name": field.name(), + "type": format!("{}", field.data_type()), + "nullable": field.is_nullable(), + }); + println!("{}", serde_json::to_string(&obj).unwrap()); + } + } + OutputFormat::Table => { + println!("Schema for '{table_name}':"); + println!("{:<30} {:<20} Nullable", "Name", "Type"); + println!("{}", "-".repeat(60)); + for field in schema.fields() { + println!( + "{:<30} {:<20} {}", + field.name(), + field.data_type(), + field.is_nullable() + ); + } + } + OutputFormat::Csv => { + println!("name,type,nullable"); + for field in schema.fields() { + println!( + "{},{},{}", + field.name(), + field.data_type(), + field.is_nullable() + ); + } + } + } + + Ok(()) +} diff --git a/crates/lance-graph-cli/src/commands/sql.rs b/crates/lance-graph-cli/src/commands/sql.rs new file mode 100644 index 00000000..25052647 --- /dev/null +++ b/crates/lance-graph-cli/src/commands/sql.rs @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use anyhow::{bail, Context, Result}; +use lance_graph::SqlQuery; + +use crate::config::LGraphConfig; +use crate::output::{print_batch, OutputFormat}; + +pub async fn run(config: &LGraphConfig, query: &str, format: OutputFormat) -> Result<()> { + let sql = SqlQuery::new(query); + + let ctx = if let Some(ns_path) = &config.namespace { + crate::namespace_helpers::build_namespace_context(config, ns_path) + .await + .context("setting up namespace tables")? + } else if let Some(catalog_cfg) = &config.catalog { + crate::catalog_helpers::build_catalog_context(catalog_cfg) + .await + .context("setting up catalog")? + } else { + bail!("No data source configured. Set 'namespace' or '[catalog]' in your config file."); + }; + + let result = sql + .execute_with_context(ctx) + .await + .context("executing SQL query")?; + + print_batch(&result, format)?; + Ok(()) +} diff --git a/crates/lance-graph-cli/src/commands/tables.rs b/crates/lance-graph-cli/src/commands/tables.rs new file mode 100644 index 00000000..9252141d --- /dev/null +++ b/crates/lance-graph-cli/src/commands/tables.rs @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use anyhow::{bail, Context, Result}; + +use crate::config::LGraphConfig; +use crate::output::OutputFormat; + +struct TableEntry { + label: String, + table: String, + kind: &'static str, // "node" or "relationship" +} + +pub async fn run(config: &LGraphConfig, format: OutputFormat) -> Result<()> { + if let Some(ns_path) = &config.namespace { + let graph = config.build_graph_config()?; + match graph { + Some(gc) => { + let mut entries: Vec = Vec::new(); + for nm in gc.node_mappings.values() { + entries.push(TableEntry { + label: nm.label.clone(), + table: nm.resolved_table_name().to_string(), + kind: "node", + }); + } + for rm in gc.relationship_mappings.values() { + entries.push(TableEntry { + label: rm.relationship_type.clone(), + table: rm.resolved_table_name().to_string(), + kind: "relationship", + }); + } + entries.sort_by(|a, b| a.label.cmp(&b.label)); + print_entries(&entries, &format!("namespace: {ns_path}"), format); + } + None => { + eprintln!("No [graph] section in config. Cannot determine tables in namespace."); + eprintln!("Add node and relationship mappings to your lgraph.toml."); + } + } + } else if let Some(catalog_cfg) = &config.catalog { + let ctx = crate::catalog_helpers::build_catalog_context(catalog_cfg) + .await + .context("setting up catalog")?; + let catalog_list = ctx.catalog_names(); + let mut entries: Vec = Vec::new(); + for catalog_name in &catalog_list { + if let Some(catalog) = ctx.catalog(catalog_name) { + for schema_name in catalog.schema_names() { + if let Some(schema) = catalog.schema(&schema_name) { + for name in schema.table_names() { + entries.push(TableEntry { + label: name.clone(), + table: name, + kind: "table", + }); + } + } + } + } + } + entries.sort_by(|a, b| a.label.cmp(&b.label)); + let source = format!( + "catalog: {}.{}", + catalog_cfg.catalog_name, catalog_cfg.schema_name + ); + print_entries(&entries, &source, format); + } else { + bail!("No data source configured. Set 'namespace' or '[catalog]' in your config file."); + } + + Ok(()) +} + +fn print_entries(entries: &[TableEntry], source: &str, format: OutputFormat) { + match format { + OutputFormat::Json | OutputFormat::Jsonl => { + for e in entries { + let mut obj = serde_json::json!({ + "label": e.label, + "kind": e.kind, + }); + if e.table != e.label { + obj["table"] = serde_json::Value::String(e.table.clone()); + } + println!("{}", serde_json::to_string(&obj).unwrap()); + } + } + OutputFormat::Table => { + println!("Tables ({source}):"); + for e in entries { + if e.table != e.label { + println!(" {} -> {} ({})", e.label, e.table, e.kind); + } else { + println!(" {} ({})", e.label, e.kind); + } + } + } + OutputFormat::Csv => { + println!("label,table,kind"); + for e in entries { + println!("{},{},{}", e.label, e.table, e.kind); + } + } + } +} diff --git a/crates/lance-graph-cli/src/config.rs b/crates/lance-graph-cli/src/config.rs new file mode 100644 index 00000000..71cec0f5 --- /dev/null +++ b/crates/lance-graph-cli/src/config.rs @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Configuration file loading for lgraph CLI. + +use anyhow::{Context, Result}; +use lance_graph::{GraphConfig, NodeMapping, RelationshipMapping}; +use serde::Deserialize; +use std::collections::HashMap; +use std::path::Path; + +/// Top-level config file structure (`lgraph.toml`). +#[derive(Debug, Deserialize)] +pub struct LGraphConfig { + /// Path to a directory of Lance tables. + pub namespace: Option, + + /// Unity Catalog connection settings. + pub catalog: Option, + + /// Graph schema mappings (needed for Cypher queries). + pub graph: Option, +} + +/// Unity Catalog connection config. +#[derive(Debug, Deserialize)] +pub struct CatalogConfig { + pub url: String, + pub catalog_name: String, + pub schema_name: String, + /// Cloud storage options (S3/Azure/GCS credentials). + #[serde(default)] + pub storage_options: HashMap, +} + +/// Graph schema section of the config. +#[derive(Debug, Deserialize)] +pub struct GraphSection { + #[serde(default)] + pub nodes: HashMap, + #[serde(default)] + pub relationships: HashMap, +} + +#[derive(Debug, Deserialize)] +pub struct NodeConfig { + pub id_field: String, + /// Optional: actual table name if different from the node label. + /// E.g., label "Person" can read from table "person_entity". + pub table: Option, +} + +#[derive(Debug, Deserialize)] +pub struct RelationshipConfig { + pub source_field: String, + pub target_field: String, + /// Optional: actual table name if different from the relationship type. + /// E.g., type "WORKS_AT" can read from table "employment_info". + pub table: Option, +} + +impl LGraphConfig { + /// Load config from a TOML file. + pub fn load(path: &Path) -> Result { + let content = + std::fs::read_to_string(path).with_context(|| format!("reading {}", path.display()))?; + let config: LGraphConfig = + toml::from_str(&content).with_context(|| format!("parsing {}", path.display()))?; + Ok(config) + } + + /// Build a `GraphConfig` from the `[graph]` section. + pub fn build_graph_config(&self) -> Result> { + let section = match &self.graph { + Some(g) => g, + None => return Ok(None), + }; + + let mut builder = GraphConfig::builder(); + for (label, node) in §ion.nodes { + let mut mapping = NodeMapping::new(label, &node.id_field); + if let Some(table) = &node.table { + mapping = mapping.with_table_name(table); + } + builder = builder.with_node_mapping(mapping); + } + for (rel_type, rel) in §ion.relationships { + let mut mapping = + RelationshipMapping::new(rel_type, &rel.source_field, &rel.target_field); + if let Some(table) = &rel.table { + mapping = mapping.with_table_name(table); + } + builder = builder.with_relationship_mapping(mapping); + } + let config = builder.build().context("building GraphConfig")?; + Ok(Some(config)) + } +} + +/// Generate a template config file. +pub fn template_config() -> &'static str { + r#"# lgraph configuration file +# Uncomment and configure the data source you want to use. + +# Option 1: Local directory of Lance / Parquet tables +# namespace = "/path/to/tables" + +# Option 2: Unity Catalog +# [catalog] +# url = "http://localhost:8080/api/2.1/unity-catalog" +# catalog_name = "main" +# schema_name = "default" +# [catalog.storage_options] +# aws_access_key_id = "..." +# aws_secret_access_key = "..." + +# Graph schema mappings (required for Cypher queries) +# Each section key is the graph label used in Cypher queries. +# Use "table" to map a label to a different physical table name. +# +# [graph.nodes.Person] +# id_field = "person_id" +# table = "person_entity" # optional: reads from "person_entity" table +# +# [graph.nodes.Company] +# id_field = "company_id" +# +# [graph.relationships.WORKS_AT] +# source_field = "person_id" +# target_field = "company_id" +# table = "employment_info" # optional: reads from "employment_info" table +"# +} diff --git a/crates/lance-graph-cli/src/main.rs b/crates/lance-graph-cli/src/main.rs new file mode 100644 index 00000000..e3f99f38 --- /dev/null +++ b/crates/lance-graph-cli/src/main.rs @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! lgraph — CLI for lance-graph query engine. + +mod catalog_helpers; +mod commands; +mod config; +mod namespace_helpers; +mod output; + +use clap::{Parser, Subcommand}; +use output::OutputFormat; +use std::path::PathBuf; + +#[derive(Parser)] +#[command(name = "lgraph", about = "Lance Graph query engine CLI")] +struct Cli { + /// Path to config file. + #[arg(short, long, default_value = "lgraph.toml")] + config: PathBuf, + + /// Output format (auto-detected if not specified). + #[arg(short, long)] + format: Option, + + #[command(subcommand)] + command: Command, +} + +#[derive(Subcommand)] +enum Command { + /// Run a Cypher query. + Cypher { + /// The Cypher query string. + query: String, + }, + /// Run a SQL query. + Sql { + /// The SQL query string. + query: String, + }, + /// List available tables. + Tables, + /// Show schema for a table. + Schema { + /// Table name. + table: String, + }, + /// Create a template lgraph.toml config file. + Init, +} + +#[tokio::main] +async fn main() { + let cli = Cli::parse(); + let format = cli.format.unwrap_or_else(OutputFormat::default_for_stdout); + + let result = match &cli.command { + Command::Init => commands::init::run(&cli.config), + cmd => { + let cfg = match config::LGraphConfig::load(&cli.config) { + Ok(c) => c, + Err(e) => { + eprintln!("Error: {e:#}"); + eprintln!("Run 'lgraph init' to create a config file."); + std::process::exit(1); + } + }; + match cmd { + Command::Cypher { query } => commands::cypher::run(&cfg, query, format).await, + Command::Sql { query } => commands::sql::run(&cfg, query, format).await, + Command::Tables => commands::tables::run(&cfg, format).await, + Command::Schema { table } => commands::schema::run(&cfg, table, format).await, + Command::Init => unreachable!(), + } + } + }; + + if let Err(e) = result { + eprintln!("Error: {e:#}"); + std::process::exit(1); + } +} diff --git a/crates/lance-graph-cli/src/namespace_helpers.rs b/crates/lance-graph-cli/src/namespace_helpers.rs new file mode 100644 index 00000000..6b1e16d6 --- /dev/null +++ b/crates/lance-graph-cli/src/namespace_helpers.rs @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Helper for building a DataFusion SessionContext from a namespace directory. + +use anyhow::{Context, Result}; +use datafusion::execution::context::SessionContext; +use lance::datafusion::LanceTableProvider; +use lance_graph::DirNamespace; +use lance_namespace::models::DescribeTableRequest; +use lance_namespace::LanceNamespace; +use std::sync::Arc; + +use crate::config::LGraphConfig; + +/// Build a SessionContext with tables from a namespace directory registered. +/// +/// Uses the `[graph]` config section to determine which tables to register. +/// When a node/relationship has a `table_name` override, the physical table is +/// resolved using that name but registered under the graph label so Cypher +/// queries can reference it by label. +pub async fn build_namespace_context( + config: &LGraphConfig, + ns_path: &str, +) -> Result { + let graph_config = config + .build_graph_config()? + .context("A [graph] section is required to resolve tables from a namespace")?; + + let namespace = DirNamespace::new(ns_path); + let ctx = SessionContext::new(); + + // Collect (physical_table_name, register_as_label) pairs. + let mut table_entries: Vec<(String, String)> = Vec::new(); + for nm in graph_config.node_mappings.values() { + table_entries.push(( + nm.resolved_table_name().to_string(), + nm.label.to_lowercase(), + )); + } + for rm in graph_config.relationship_mappings.values() { + table_entries.push(( + rm.resolved_table_name().to_string(), + rm.relationship_type.to_lowercase(), + )); + } + + for (physical_name, register_as) in &table_entries { + let mut request = DescribeTableRequest::new(); + request.id = Some(vec![physical_name.clone()]); + + let response = namespace + .describe_table(request) + .await + .with_context(|| format!("resolving table '{physical_name}' in namespace"))?; + + let location = response + .location + .with_context(|| format!("no location for table '{physical_name}'"))?; + + let dataset = lance::dataset::Dataset::open(&location) + .await + .with_context(|| format!("opening dataset for table '{physical_name}'"))?; + + let provider: Arc = + Arc::new(LanceTableProvider::new(Arc::new(dataset), true, true)); + + ctx.register_table(register_as, provider) + .with_context(|| format!("registering table '{physical_name}' as '{register_as}'"))?; + } + + Ok(ctx) +} diff --git a/crates/lance-graph-cli/src/output.rs b/crates/lance-graph-cli/src/output.rs new file mode 100644 index 00000000..012072b0 --- /dev/null +++ b/crates/lance-graph-cli/src/output.rs @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Output formatting for query results. + +use anyhow::Result; +use arrow::util::pretty::pretty_format_batches; +use arrow_array::RecordBatch; +use arrow_cast::display::{ArrayFormatter, FormatOptions}; +use std::io::{self, IsTerminal, Write}; + +/// Output format for query results. +#[derive(Debug, Clone, Copy, clap::ValueEnum)] +pub enum OutputFormat { + /// Full JSON object with metadata envelope. + Json, + /// One JSON object per row (streaming-friendly, best for agents). + Jsonl, + /// Human-readable ASCII table. + Table, + /// Comma-separated values with header. + Csv, +} + +impl OutputFormat { + /// Pick a sensible default: table for TTY, jsonl when piped. + pub fn default_for_stdout() -> Self { + if io::stdout().is_terminal() { + OutputFormat::Table + } else { + OutputFormat::Jsonl + } + } +} + +/// Format and print a `RecordBatch` to stdout. +pub fn print_batch(batch: &RecordBatch, format: OutputFormat) -> Result<()> { + let stdout = io::stdout(); + let mut out = stdout.lock(); + + match format { + OutputFormat::Json => print_json(&mut out, batch)?, + OutputFormat::Jsonl => print_jsonl(&mut out, batch)?, + OutputFormat::Table => print_table(&mut out, batch)?, + OutputFormat::Csv => print_csv(&mut out, batch)?, + } + + Ok(()) +} + +fn print_json(out: &mut impl Write, batch: &RecordBatch) -> Result<()> { + let rows = batch_to_json_rows(batch)?; + let envelope = serde_json::json!({ + "columns": batch.schema().fields().iter().map(|f| f.name().as_str()).collect::>(), + "row_count": batch.num_rows(), + "rows": rows, + }); + serde_json::to_writer_pretty(&mut *out, &envelope)?; + writeln!(out)?; + Ok(()) +} + +fn print_jsonl(out: &mut impl Write, batch: &RecordBatch) -> Result<()> { + let rows = batch_to_json_rows(batch)?; + for row in rows { + serde_json::to_writer(&mut *out, &row)?; + writeln!(out)?; + } + Ok(()) +} + +fn print_table(out: &mut impl Write, batch: &RecordBatch) -> Result<()> { + let formatted = pretty_format_batches(std::slice::from_ref(batch))?; + write!(out, "{formatted}")?; + writeln!(out)?; + Ok(()) +} + +fn print_csv(out: &mut impl Write, batch: &RecordBatch) -> Result<()> { + let mut writer = arrow_csv::WriterBuilder::new().with_header(true).build(out); + writer.write(batch)?; + Ok(()) +} + +/// Convert a RecordBatch to a Vec of JSON objects. +fn batch_to_json_rows(batch: &RecordBatch) -> Result> { + let schema = batch.schema(); + let opts = FormatOptions::default(); + let formatters: Vec = batch + .columns() + .iter() + .map(|col| ArrayFormatter::try_new(col.as_ref(), &opts)) + .collect::, _>>()?; + + let mut rows = Vec::with_capacity(batch.num_rows()); + for row_idx in 0..batch.num_rows() { + let mut map = serde_json::Map::new(); + for (col_idx, field) in schema.fields().iter().enumerate() { + let value = if batch.column(col_idx).is_null(row_idx) { + serde_json::Value::Null + } else { + let display = formatters[col_idx].value(row_idx); + let s = display.to_string(); + // Try to parse as number, otherwise use string. + if let Ok(n) = s.parse::() { + serde_json::Value::Number(n.into()) + } else if let Ok(n) = s.parse::() { + serde_json::Number::from_f64(n) + .map(serde_json::Value::Number) + .unwrap_or(serde_json::Value::String(s)) + } else { + serde_json::Value::String(s) + } + }; + map.insert(field.name().clone(), value); + } + rows.push(serde_json::Value::Object(map)); + } + Ok(rows) +} diff --git a/crates/lance-graph-python/src/catalog.rs b/crates/lance-graph-python/src/catalog.rs index 26b253dc..e7a060cd 100644 --- a/crates/lance-graph-python/src/catalog.rs +++ b/crates/lance-graph-python/src/catalog.rs @@ -250,9 +250,7 @@ impl PyUnityCatalog { let connector = self.connector.clone(); let cat = catalog_name.to_string(); let result = RT - .block_on(Some(py), async move { - connector.list_schemas(&cat).await - })? + .block_on(Some(py), async move { connector.list_schemas(&cat).await })? .map_err(|e| PyRuntimeError::new_err(e.to_string()))?; Ok(result .into_iter() @@ -278,9 +276,10 @@ impl PyUnityCatalog { let cat = catalog_name.to_string(); let sch = schema_name.to_string(); let result = RT - .block_on(Some(py), async move { - connector.list_tables(&cat, &sch).await - })? + .block_on( + Some(py), + async move { connector.list_tables(&cat, &sch).await }, + )? .map_err(|e| PyRuntimeError::new_err(e.to_string()))?; Ok(result .into_iter() diff --git a/crates/lance-graph/src/config.rs b/crates/lance-graph/src/config.rs index 6f986a75..0ad259fd 100644 --- a/crates/lance-graph/src/config.rs +++ b/crates/lance-graph/src/config.rs @@ -63,6 +63,12 @@ pub struct NodeMapping { pub label: String, /// Field name that serves as the node identifier pub id_field: String, + /// Optional table name override. When set, the engine reads data from this + /// table instead of using the label as the table name. + /// E.g., label="Person" with table_name="person_entity" reads from the + /// "person_entity" table but exposes it as `:Person` in Cypher. + #[serde(default)] + pub table_name: Option, /// Optional fields that define node properties pub property_fields: Vec, /// Optional filter conditions for this node type @@ -78,6 +84,12 @@ pub struct RelationshipMapping { pub source_id_field: String, /// Field containing the target node ID pub target_id_field: String, + /// Optional table name override. When set, the engine reads data from this + /// table instead of using the relationship type as the table name. + /// E.g., relationship_type="WORKS_AT" with table_name="employment_info" + /// reads from "employment_info" but exposes it as `[:WORKS_AT]` in Cypher. + #[serde(default)] + pub table_name: Option, /// Optional field containing the relationship type pub type_field: Option, /// Optional fields that define relationship properties @@ -204,6 +216,7 @@ impl GraphConfigBuilder { NodeMapping { label: label_str, // Keep original case for display id_field: id_field.into(), + table_name: None, property_fields: Vec::new(), filter_conditions: None, }, @@ -233,6 +246,7 @@ impl GraphConfigBuilder { relationship_type: type_str, // Keep original case for display source_id_field: source_field.into(), target_id_field: target_field.into(), + table_name: None, type_field: None, property_fields: Vec::new(), filter_conditions: None, @@ -284,11 +298,24 @@ impl NodeMapping { Self { label: label.into(), id_field: id_field.into(), + table_name: None, property_fields: Vec::new(), filter_conditions: None, } } + /// The actual table name to read data from. + /// Returns `table_name` if set, otherwise falls back to `label`. + pub fn resolved_table_name(&self) -> &str { + self.table_name.as_deref().unwrap_or(&self.label) + } + + /// Set an explicit table name that differs from the node label. + pub fn with_table_name>(mut self, table_name: S) -> Self { + self.table_name = Some(table_name.into()); + self + } + /// Add property fields to the mapping pub fn with_properties(mut self, fields: Vec) -> Self { self.property_fields = fields; @@ -309,12 +336,27 @@ impl RelationshipMapping { relationship_type: rel_type.into(), source_id_field: source_field.into(), target_id_field: target_field.into(), + table_name: None, type_field: None, property_fields: Vec::new(), filter_conditions: None, } } + /// The actual table name to read data from. + /// Returns `table_name` if set, otherwise falls back to `relationship_type`. + pub fn resolved_table_name(&self) -> &str { + self.table_name + .as_deref() + .unwrap_or(&self.relationship_type) + } + + /// Set an explicit table name that differs from the relationship type. + pub fn with_table_name>(mut self, table_name: S) -> Self { + self.table_name = Some(table_name.into()); + self + } + /// Set the type field for this relationship pub fn with_type_field>(mut self, type_field: S) -> Self { self.type_field = Some(type_field.into()); @@ -366,6 +408,7 @@ mod tests { NodeMapping { label: "Person".to_string(), id_field: "".to_string(), + table_name: None, property_fields: Vec::new(), filter_conditions: None, }, diff --git a/crates/lance-graph/src/logical_plan.rs b/crates/lance-graph/src/logical_plan.rs index 9d6c32f8..3fb83765 100644 --- a/crates/lance-graph/src/logical_plan.rs +++ b/crates/lance-graph/src/logical_plan.rs @@ -1330,6 +1330,7 @@ mod tests { .with_node_mapping(NodeMapping { label: "Person".to_string(), id_field: "id".to_string(), + table_name: None, property_fields: vec!["name".to_string(), "age".to_string()], filter_conditions: None, }) diff --git a/crates/lance-graph/src/query.rs b/crates/lance-graph/src/query.rs index 9625f273..2b1ffdbd 100644 --- a/crates/lance-graph/src/query.rs +++ b/crates/lance-graph/src/query.rs @@ -620,15 +620,19 @@ impl CypherQuery { let config = self.require_config()?; let mut required_tables: HashSet = HashSet::new(); - // Use original label/type names (not lowercase keys) for namespace resolution - // The namespace needs the original casing to find files on disk - required_tables.extend(config.node_mappings.values().map(|m| m.label.clone())); - required_tables.extend( - config - .relationship_mappings - .values() - .map(|m| m.relationship_type.clone()), - ); + // Map from resolved table name -> label (for registering under the right name) + let mut table_to_label: HashMap = HashMap::new(); + // Use resolved_table_name() which returns table_name if set, else falls back to label + for m in config.node_mappings.values() { + let table = m.resolved_table_name().to_string(); + required_tables.insert(table.clone()); + table_to_label.insert(table, m.label.clone()); + } + for m in config.relationship_mappings.values() { + let table = m.resolved_table_name().to_string(); + required_tables.insert(table.clone()); + table_to_label.insert(table, m.relationship_type.clone()); + } if required_tables.is_empty() { return Err(GraphError::ConfigError { @@ -678,9 +682,11 @@ impl CypherQuery { let provider: Arc = Arc::new(LanceTableProvider::new(dataset.clone(), true, true)); - // Register with lowercase table name for case-insensitive behavior - let normalized_table_name = table_name.to_lowercase(); - ctx.register_table(&normalized_table_name, provider.clone()) + // Register under the graph label (lowercased), not the physical table name. + // This allows the query planner to find the table using the Cypher label. + let label = table_to_label.get(&table_name).unwrap_or(&table_name); + let normalized_label = label.to_lowercase(); + ctx.register_table(&normalized_label, provider.clone()) .map_err(|e| GraphError::PlanError { message: format!( "Failed to register table '{}' in SessionContext: {}", @@ -689,8 +695,8 @@ impl CypherQuery { location: snafu::Location::new(file!(), line!(), column!()), })?; - // Store provider with normalized (lowercase) key for consistent lookup - providers.insert(normalized_table_name.clone(), provider); + // Store provider with normalized label key for consistent lookup + providers.insert(normalized_label, provider); } for label in config.node_mappings.keys() { diff --git a/crates/lance-graph/tests/test_complex_return_clauses.rs b/crates/lance-graph/tests/test_complex_return_clauses.rs index 24e53e32..0df008a1 100644 --- a/crates/lance-graph/tests/test_complex_return_clauses.rs +++ b/crates/lance-graph/tests/test_complex_return_clauses.rs @@ -68,6 +68,7 @@ fn create_graph_config() -> GraphConfig { .with_node_mapping(NodeMapping { label: "Person".to_string(), id_field: "id".to_string(), + table_name: None, property_fields: vec!["name".to_string()], filter_conditions: None, })