From caa792d4ed3b49dd6932a512fcc31d5d8a39c69e Mon Sep 17 00:00:00 2001 From: "jianjian.xie" Date: Wed, 25 Feb 2026 23:19:12 -0800 Subject: [PATCH 1/7] feat(sql): add direct SQL query support for data analytics Add SqlQuery and SqlEngine that let users run standard SQL directly against their datasets without requiring a GraphConfig. This is useful for data analytics workflows where users want explicit JOINs and aggregations against node/relationship tables. DataFusion handles SQL parsing and execution. --- crates/lance-graph-python/src/graph.rs | 255 ++++++++++++++- crates/lance-graph/src/lib.rs | 2 + crates/lance-graph/src/query.rs | 4 +- crates/lance-graph/src/sql_query.rs | 356 +++++++++++++++++++++ crates/lance-graph/tests/test_sql_query.rs | 338 +++++++++++++++++++ python/python/lance_graph/__init__.py | 4 + python/python/tests/test_sql.py | 175 ++++++++++ 7 files changed, 1115 insertions(+), 19 deletions(-) create mode 100644 crates/lance-graph/src/sql_query.rs create mode 100644 crates/lance-graph/tests/test_sql_query.rs create mode 100644 python/python/tests/test_sql.py diff --git a/crates/lance-graph-python/src/graph.rs b/crates/lance-graph-python/src/graph.rs index b7a4337c..4a9c3c30 100644 --- a/crates/lance-graph-python/src/graph.rs +++ b/crates/lance-graph-python/src/graph.rs @@ -27,7 +27,7 @@ use lance_graph::{ ast::{DistanceMetric as RustDistanceMetric, GraphPattern, ReadingClause}, CypherQuery as RustCypherQuery, ExecutionStrategy as RustExecutionStrategy, GraphConfig as RustGraphConfig, GraphError as RustGraphError, InMemoryCatalog, - VectorSearch as RustVectorSearch, + SqlQuery as RustSqlQuery, VectorSearch as RustVectorSearch, }; use pyo3::{ exceptions::{PyNotImplementedError, PyRuntimeError, PyValueError}, @@ -710,7 +710,9 @@ impl CypherQuery { vector_search: &VectorSearch, ) -> PyResult { if vector_search.use_lance_index { - if let Some(result) = try_execute_with_lance_index(py, &self.inner, datasets, vector_search)? 
{ + if let Some(result) = + try_execute_with_lance_index(py, &self.inner, datasets, vector_search)? + { return record_batch_to_python_table(py, &result); } } @@ -917,10 +919,7 @@ fn split_vector_column(column: &str) -> (Option, &str) { } } -fn resolve_vector_label( - query: &RustCypherQuery, - alias: Option<&str>, -) -> PyResult> { +fn resolve_vector_label(query: &RustCypherQuery, alias: Option<&str>) -> PyResult> { let alias_map = alias_map_from_query(query); if let Some(alias) = alias { return Ok(alias_map.get(alias).cloned()); @@ -956,11 +955,17 @@ fn collect_aliases_from_pattern(pattern: &GraphPattern, map: &mut HashMap { - if let (Some(var), Some(label)) = (path.start_node.variable.as_ref(), path.start_node.labels.first()) { + if let (Some(var), Some(label)) = ( + path.start_node.variable.as_ref(), + path.start_node.labels.first(), + ) { map.entry(var.clone()).or_insert_with(|| label.clone()); } for segment in &path.segments { - if let (Some(var), Some(label)) = (segment.end_node.variable.as_ref(), segment.end_node.labels.first()) { + if let (Some(var), Some(label)) = ( + segment.end_node.variable.as_ref(), + segment.end_node.labels.first(), + ) { map.entry(var.clone()).or_insert_with(|| label.clone()); } } @@ -1168,14 +1173,20 @@ impl CypherEngine { // Register all datasets as tables for (name, batch) in &arrow_datasets { let mem_table = Arc::new( - MemTable::try_new(batch.schema(), vec![vec![batch.clone()]]) - .map_err(|e| PyRuntimeError::new_err(format!("Failed to create MemTable for {}: {}", name, e)))?, + MemTable::try_new(batch.schema(), vec![vec![batch.clone()]]).map_err(|e| { + PyRuntimeError::new_err(format!( + "Failed to create MemTable for {}: {}", + name, e + )) + })?, ); // Register in session context for execution let normalized_name = name.to_lowercase(); ctx.register_table(&normalized_name, mem_table.clone()) - .map_err(|e| PyRuntimeError::new_err(format!("Failed to register table {}: {}", name, e)))?; + .map_err(|e| { + 
PyRuntimeError::new_err(format!("Failed to register table {}: {}", name, e)) + })?; let table_source = Arc::new(DefaultTableSource::new(mem_table)); @@ -1186,7 +1197,7 @@ impl CypherEngine { // based on the Cypher query pattern (e.g., MATCH (p:Person) vs -[:KNOWS]->). // // By registering all datasets in both catalogs, we allow the planner to look up - // the correct source based on query context. This pattern matches the Rust + // the correct source based on query context. This pattern matches the Rust // implementation in query.rs:build_catalog_and_context_from_datasets. catalog = catalog .with_node_source(name, table_source.clone()) @@ -1226,11 +1237,7 @@ impl CypherEngine { /// -------- /// >>> result = engine.execute("MATCH (p:Person) WHERE p.age > 30 RETURN p.name") /// >>> print(result.to_pandas()) - fn execute( - &self, - py: Python, - query: &str, - ) -> PyResult { + fn execute(&self, py: Python, query: &str) -> PyResult { // Parse the query let cypher_query = RustCypherQuery::new(query) .map_err(graph_error_to_pyerr)? @@ -1314,6 +1321,218 @@ impl CypherEngine { } } +/// Execute raw SQL queries against in-memory datasets +/// +/// This class allows executing standard SQL directly against Arrow tables, +/// without requiring a GraphConfig or Cypher parsing. DataFusion handles +/// SQL parsing and execution. 
+/// +/// Examples +/// -------- +/// >>> import pyarrow as pa +/// >>> from lance_graph import SqlQuery +/// >>> +/// >>> person = pa.table({"id": [1, 2], "name": ["Alice", "Bob"], "age": [28, 34]}) +/// >>> query = SqlQuery("SELECT name, age FROM person WHERE age > 30") +/// >>> result = query.execute({"person": person}) +/// >>> print(result.to_pandas()) +#[pyclass(name = "SqlQuery", module = "lance.graph")] +pub struct SqlQuery { + inner: RustSqlQuery, +} + +#[pymethods] +impl SqlQuery { + /// Create a new SQL query + /// + /// Parameters + /// ---------- + /// sql : str + /// The SQL query string + /// + /// Returns + /// ------- + /// SqlQuery + /// A new SQL query instance + #[new] + fn new(sql: &str) -> Self { + Self { + inner: RustSqlQuery::new(sql), + } + } + + /// Get the SQL query text + fn sql(&self) -> &str { + self.inner.sql() + } + + /// Execute query against in-memory datasets + /// + /// Parameters + /// ---------- + /// datasets : dict + /// Dictionary mapping table names to PyArrow tables or Lance datasets + /// + /// Returns + /// ------- + /// pyarrow.Table + /// Query results as Arrow table + /// + /// Raises + /// ------ + /// RuntimeError + /// If query execution fails + fn execute(&self, py: Python, datasets: &Bound<'_, PyDict>) -> PyResult { + let arrow_datasets = python_datasets_to_batches(datasets)?; + let inner = self.inner.clone(); + + let result_batch = RT + .block_on(Some(py), inner.execute(arrow_datasets))? 
+ .map_err(graph_error_to_pyerr)?; + + record_batch_to_python_table(py, &result_batch) + } + + /// Return the execution plan as a string + /// + /// Parameters + /// ---------- + /// datasets : dict + /// Dictionary mapping table names to PyArrow tables or Lance datasets + /// + /// Returns + /// ------- + /// str + /// The DataFusion logical and physical execution plan + /// + /// Raises + /// ------ + /// RuntimeError + /// If planning fails + fn explain(&self, py: Python, datasets: &Bound<'_, PyDict>) -> PyResult { + let arrow_datasets = python_datasets_to_batches(datasets)?; + let inner = self.inner.clone(); + + let plan = RT + .block_on(Some(py), inner.explain(arrow_datasets))? + .map_err(graph_error_to_pyerr)?; + + Ok(plan) + } + + fn __repr__(&self) -> String { + format!("SqlQuery(\"{}\")", self.inner.sql()) + } +} + +/// Cached SQL execution engine for running multiple queries against the same datasets +/// +/// This class registers datasets once during initialization and reuses the +/// DataFusion SessionContext for subsequent queries, avoiding repeated setup. +/// +/// Examples +/// -------- +/// >>> from lance_graph import SqlEngine +/// >>> import pyarrow as pa +/// >>> +/// >>> datasets = { +/// ... "person": pa.table({"id": [1, 2], "name": ["Alice", "Bob"], "age": [28, 34]}), +/// ... "knows": pa.table({"src": [1], "dst": [2]}), +/// ... } +/// >>> +/// >>> engine = SqlEngine(datasets) +/// >>> result1 = engine.execute("SELECT * FROM person WHERE age > 30") +/// >>> result2 = engine.execute("SELECT p.name FROM person p JOIN knows k ON p.id = k.src") +#[pyclass(name = "SqlEngine", module = "lance.graph")] +pub struct SqlEngine { + context: Arc, +} + +#[pymethods] +impl SqlEngine { + /// Create a new SqlEngine with cached datasets + /// + /// Parameters + /// ---------- + /// datasets : dict + /// Dictionary mapping table names to PyArrow tables or Lance datasets. + /// Table names are lowercased for consistency. 
+ /// + /// Returns + /// ------- + /// SqlEngine + /// A new engine instance ready to execute queries + /// + /// Raises + /// ------ + /// ValueError + /// If no datasets are provided + /// RuntimeError + /// If table registration fails + #[new] + fn new(datasets: &Bound<'_, PyDict>) -> PyResult { + let arrow_datasets = python_datasets_to_batches(datasets)?; + + if arrow_datasets.is_empty() { + return Err(PyValueError::new_err("No input datasets provided")); + } + + let ctx = SessionContext::new(); + + for (name, batch) in &arrow_datasets { + let mem_table = Arc::new( + MemTable::try_new(batch.schema(), vec![vec![batch.clone()]]).map_err(|e| { + PyRuntimeError::new_err(format!( + "Failed to create MemTable for {}: {}", + name, e + )) + })?, + ); + + let normalized_name = name.to_lowercase(); + ctx.register_table(&normalized_name, mem_table) + .map_err(|e| { + PyRuntimeError::new_err(format!("Failed to register table {}: {}", name, e)) + })?; + } + + Ok(Self { + context: Arc::new(ctx), + }) + } + + /// Execute a SQL query using the cached datasets + /// + /// Parameters + /// ---------- + /// sql : str + /// The SQL query string to execute + /// + /// Returns + /// ------- + /// pyarrow.Table + /// Query results as Arrow table + /// + /// Raises + /// ------ + /// RuntimeError + /// If query execution fails + fn execute(&self, py: Python, sql: &str) -> PyResult { + let query = RustSqlQuery::new(sql); + let context = self.context.as_ref().clone(); + + let result_batch = RT + .block_on(Some(py), query.execute_with_context(context))? 
+ .map_err(graph_error_to_pyerr)?; + + record_batch_to_python_table(py, &result_batch) + } + + fn __repr__(&self) -> String { + "SqlEngine(...)".to_string() + } +} + /// Register graph functionality with the Python module pub fn register_graph_module(py: Python, parent_module: &Bound<'_, PyModule>) -> PyResult<()> { let graph_module = PyModule::new(py, "graph")?; @@ -1324,6 +1543,8 @@ pub fn register_graph_module(py: Python, parent_module: &Bound<'_, PyModule>) -> graph_module.add_class::()?; graph_module.add_class::()?; graph_module.add_class::()?; + graph_module.add_class::()?; + graph_module.add_class::()?; graph_module.add_class::()?; graph_module.add_class::()?; diff --git a/crates/lance-graph/src/lib.rs b/crates/lance-graph/src/lib.rs index 692773ad..25d8ebec 100644 --- a/crates/lance-graph/src/lib.rs +++ b/crates/lance-graph/src/lib.rs @@ -47,6 +47,7 @@ pub mod parser; pub mod query; pub mod semantic; pub mod simple_executor; +pub mod sql_query; /// Maximum allowed hops for variable-length relationship expansion (e.g., *1..N) pub const MAX_VARIABLE_LENGTH_HOPS: u32 = 20; @@ -58,3 +59,4 @@ pub use lance_graph_catalog::{ }; pub use lance_vector_search::VectorSearch; pub use query::{CypherQuery, ExecutionStrategy}; +pub use sql_query::SqlQuery; diff --git a/crates/lance-graph/src/query.rs b/crates/lance-graph/src/query.rs index bdf33384..cc4f96c9 100644 --- a/crates/lance-graph/src/query.rs +++ b/crates/lance-graph/src/query.rs @@ -23,7 +23,7 @@ use std::sync::Arc; /// /// This ensures that column names in the dataset match the normalized /// qualified column names used internally (e.g., "fullName" becomes "fullname"). -fn normalize_schema(schema: SchemaRef) -> Result { +pub(crate) fn normalize_schema(schema: SchemaRef) -> Result { let fields: Vec<_> = schema .fields() .iter() @@ -42,7 +42,7 @@ fn normalize_schema(schema: SchemaRef) -> Result { /// /// This creates a new RecordBatch with a normalized schema while /// preserving all the data arrays. 
-fn normalize_record_batch(batch: &RecordBatch) -> Result { +pub(crate) fn normalize_record_batch(batch: &RecordBatch) -> Result { let normalized_schema = normalize_schema(batch.schema())?; RecordBatch::try_new(normalized_schema, batch.columns().to_vec()).map_err(|e| { GraphError::PlanError { diff --git a/crates/lance-graph/src/sql_query.rs b/crates/lance-graph/src/sql_query.rs new file mode 100644 index 00000000..97705d1b --- /dev/null +++ b/crates/lance-graph/src/sql_query.rs @@ -0,0 +1,356 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Direct SQL query interface for Lance datasets +//! +//! This module provides a way to execute standard SQL queries directly against +//! in-memory datasets (as RecordBatches) or a pre-configured DataFusion SessionContext, +//! without requiring a GraphConfig or Cypher parsing. + +use crate::error::{GraphError, Result}; +use crate::query::{normalize_record_batch, normalize_schema}; +use arrow_array::RecordBatch; +use datafusion::datasource::MemTable; +use datafusion::execution::context::SessionContext; +use std::collections::HashMap; +use std::sync::Arc; + +/// A SQL query that can be executed against in-memory datasets or a DataFusion SessionContext. +/// +/// Unlike `CypherQuery`, this does not require a `GraphConfig` — users write standard SQL +/// with explicit JOINs against their node/relationship tables. 
+/// +/// # Example +/// +/// ```no_run +/// use lance_graph::SqlQuery; +/// use arrow_array::RecordBatch; +/// use std::collections::HashMap; +/// +/// # async fn example() -> lance_graph::Result<()> { +/// let mut datasets: HashMap = HashMap::new(); +/// // datasets.insert("person".to_string(), person_batch); +/// +/// let query = SqlQuery::new("SELECT name, age FROM person WHERE age > 30"); +/// // let result = query.execute(datasets).await?; +/// # Ok(()) +/// # } +/// ``` +#[derive(Debug, Clone)] +pub struct SqlQuery { + sql: String, +} + +impl SqlQuery { + /// Create a new SQL query from a SQL string. + /// + /// No parsing is done at construction time — the SQL is validated when executed. + pub fn new(sql: &str) -> Self { + Self { + sql: sql.to_string(), + } + } + + /// Get the SQL query text. + pub fn sql(&self) -> &str { + &self.sql + } + + /// Execute the SQL query against in-memory datasets. + /// + /// Each entry in `datasets` is registered as a table in a fresh DataFusion + /// SessionContext. Table names are lowercased for consistency. + /// + /// # Arguments + /// * `datasets` - HashMap of table name to RecordBatch + /// + /// # Returns + /// A single `RecordBatch` containing all result rows. + pub async fn execute(&self, datasets: HashMap) -> Result { + let ctx = self.build_context(datasets)?; + self.execute_with_context(ctx).await + } + + /// Execute the SQL query against a pre-configured DataFusion SessionContext. + /// + /// Use this when tables are already registered (e.g., CSV/Parquet files, + /// external data sources, or a context shared across queries). 
+ pub async fn execute_with_context(&self, ctx: SessionContext) -> Result { + let df = ctx + .sql(&self.sql) + .await + .map_err(|e| GraphError::PlanError { + message: format!("SQL execution error: {}", e), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + + let batches = df.collect().await.map_err(|e| GraphError::PlanError { + message: format!("Failed to collect SQL results: {}", e), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + + if batches.is_empty() { + // Return an empty batch with the schema from the logical plan + let schema = df_schema_from_ctx(&ctx, &self.sql).await?; + return Ok(RecordBatch::new_empty(schema)); + } + + let schema = batches[0].schema(); + arrow::compute::concat_batches(&schema, &batches).map_err(|e| GraphError::PlanError { + message: format!("Failed to concatenate result batches: {}", e), + location: snafu::Location::new(file!(), line!(), column!()), + }) + } + + /// Return the DataFusion execution plan as a formatted string. + /// + /// Useful for debugging and understanding how the query will be executed. 
+ pub async fn explain(&self, datasets: HashMap) -> Result { + let ctx = self.build_context(datasets)?; + + let df = ctx + .sql(&self.sql) + .await + .map_err(|e| GraphError::PlanError { + message: format!("SQL explain error: {}", e), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + + let logical_plan = df.logical_plan(); + + let physical_plan = ctx + .state() + .create_physical_plan(logical_plan) + .await + .map_err(|e| GraphError::PlanError { + message: format!("Failed to create physical plan: {}", e), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + + let physical_plan_str = datafusion::physical_plan::displayable(physical_plan.as_ref()) + .indent(true) + .to_string(); + + Ok(format!( + "== Logical Plan ==\n{}\n\n== Physical Plan ==\n{}", + logical_plan.display_indent(), + physical_plan_str, + )) + } + + /// Build a DataFusion SessionContext from in-memory datasets. + fn build_context(&self, datasets: HashMap) -> Result { + let ctx = SessionContext::new(); + + for (name, batch) in datasets { + let normalized_batch = normalize_record_batch(&batch)?; + let schema = normalized_batch.schema(); + let mem_table = Arc::new( + MemTable::try_new(schema, vec![vec![normalized_batch]]).map_err(|e| { + GraphError::PlanError { + message: format!("Failed to create MemTable for {}: {}", name, e), + location: snafu::Location::new(file!(), line!(), column!()), + } + })?, + ); + + let normalized_name = name.to_lowercase(); + ctx.register_table(&normalized_name, mem_table) + .map_err(|e| GraphError::PlanError { + message: format!("Failed to register table {}: {}", name, e), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + } + + Ok(ctx) + } +} + +/// Helper to get the output schema from a SQL query without executing it. 
+async fn df_schema_from_ctx(ctx: &SessionContext, sql: &str) -> Result> { + let df = ctx.sql(sql).await.map_err(|e| GraphError::PlanError { + message: format!("Failed to plan SQL for schema: {}", e), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + let arrow_schema = Arc::new(arrow_schema::Schema::from(df.schema())); + normalize_schema(arrow_schema) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::{Float64Array, Int64Array, StringArray}; + use arrow_schema::{DataType, Field, Schema}; + + fn person_batch() -> RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::Utf8, false), + Field::new("age", DataType::Int64, false), + Field::new("city", DataType::Utf8, false), + ])); + RecordBatch::try_new( + schema, + vec![ + Arc::new(Int64Array::from(vec![1, 2, 3, 4])), + Arc::new(StringArray::from(vec!["Alice", "Bob", "Carol", "David"])), + Arc::new(Int64Array::from(vec![28, 34, 29, 42])), + Arc::new(StringArray::from(vec![ + "New York", + "San Francisco", + "New York", + "Chicago", + ])), + ], + ) + .unwrap() + } + + fn datasets_with(name: &str, batch: RecordBatch) -> HashMap { + let mut datasets = HashMap::new(); + datasets.insert(name.to_string(), batch); + datasets + } + + #[tokio::test] + async fn test_basic_select() { + let query = SqlQuery::new("SELECT name, age FROM person WHERE age > 30 ORDER BY age"); + let result = query + .execute(datasets_with("person", person_batch())) + .await + .unwrap(); + + let names: Vec<&str> = result + .column_by_name("name") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .map(|v| v.unwrap()) + .collect(); + assert_eq!(names, vec!["Bob", "David"]); + } + + #[tokio::test] + async fn test_select_star() { + let query = SqlQuery::new("SELECT * FROM person"); + let result = query + .execute(datasets_with("person", person_batch())) + .await + .unwrap(); + assert_eq!(result.num_rows(), 4); + 
assert_eq!(result.num_columns(), 4); + } + + #[tokio::test] + async fn test_limit() { + let query = SqlQuery::new("SELECT name FROM person ORDER BY name LIMIT 2"); + let result = query + .execute(datasets_with("person", person_batch())) + .await + .unwrap(); + assert_eq!(result.num_rows(), 2); + } + + #[tokio::test] + async fn test_aggregation() { + let query = SqlQuery::new( + "SELECT COUNT(*) as cnt, AVG(age) as avg_age, SUM(age) as total_age FROM person", + ); + let result = query + .execute(datasets_with("person", person_batch())) + .await + .unwrap(); + assert_eq!(result.num_rows(), 1); + + let cnt = result + .column_by_name("cnt") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + assert_eq!(cnt, 4); + + let avg_age = result + .column_by_name("avg_age") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + assert!((avg_age - 33.25).abs() < 0.01); + } + + #[tokio::test] + async fn test_group_by() { + let query = SqlQuery::new( + "SELECT city, COUNT(*) as cnt FROM person GROUP BY city ORDER BY cnt DESC", + ); + let result = query + .execute(datasets_with("person", person_batch())) + .await + .unwrap(); + + let cities: Vec<&str> = result + .column_by_name("city") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .map(|v| v.unwrap()) + .collect(); + // New York has 2, others have 1 + assert_eq!(cities[0], "New York"); + } + + #[tokio::test] + async fn test_invalid_sql() { + let query = SqlQuery::new("INVALID SQL STATEMENT"); + let result = query.execute(datasets_with("person", person_batch())).await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_explain() { + let query = SqlQuery::new("SELECT name FROM person WHERE age > 30"); + let plan = query + .explain(datasets_with("person", person_batch())) + .await + .unwrap(); + assert!(plan.contains("Logical Plan")); + assert!(plan.contains("Physical Plan")); + } + + #[tokio::test] + async fn test_execute_with_context() { + // Build context 
manually and execute against it + let ctx = SessionContext::new(); + let batch = person_batch(); + let schema = batch.schema(); + let mem_table = Arc::new(MemTable::try_new(schema, vec![vec![batch]]).unwrap()); + ctx.register_table("people", mem_table).unwrap(); + + let query = SqlQuery::new("SELECT name FROM people ORDER BY name LIMIT 1"); + let result = query.execute_with_context(ctx).await.unwrap(); + + let names: Vec<&str> = result + .column_by_name("name") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .map(|v| v.unwrap()) + .collect(); + assert_eq!(names, vec!["Alice"]); + } + + #[tokio::test] + async fn test_sql_text_accessor() { + let query = SqlQuery::new("SELECT 1"); + assert_eq!(query.sql(), "SELECT 1"); + } +} diff --git a/crates/lance-graph/tests/test_sql_query.rs b/crates/lance-graph/tests/test_sql_query.rs new file mode 100644 index 00000000..09baad9f --- /dev/null +++ b/crates/lance-graph/tests/test_sql_query.rs @@ -0,0 +1,338 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! 
Integration tests for SqlQuery + +use arrow_array::{Float64Array, Int64Array, RecordBatch, StringArray}; +use arrow_schema::{DataType, Field, Schema}; +use lance_graph::SqlQuery; +use std::collections::HashMap; +use std::sync::Arc; + +fn person_batch() -> RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("name", DataType::Utf8, false), + Field::new("age", DataType::Int64, false), + Field::new("city", DataType::Utf8, false), + ])); + RecordBatch::try_new( + schema, + vec![ + Arc::new(Int64Array::from(vec![1, 2, 3, 4])), + Arc::new(StringArray::from(vec!["Alice", "Bob", "Carol", "David"])), + Arc::new(Int64Array::from(vec![28, 34, 29, 42])), + Arc::new(StringArray::from(vec![ + "New York", + "San Francisco", + "New York", + "Chicago", + ])), + ], + ) + .unwrap() +} + +fn knows_batch() -> RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("src_id", DataType::Int64, false), + Field::new("dst_id", DataType::Int64, false), + Field::new("since_year", DataType::Int64, false), + ])); + RecordBatch::try_new( + schema, + vec![ + Arc::new(Int64Array::from(vec![1, 1, 2, 3])), + Arc::new(Int64Array::from(vec![2, 3, 4, 4])), + Arc::new(Int64Array::from(vec![2015, 2018, 2020, 2021])), + ], + ) + .unwrap() +} + +fn make_datasets() -> HashMap { + let mut datasets = HashMap::new(); + datasets.insert("person".to_string(), person_batch()); + datasets.insert("knows".to_string(), knows_batch()); + datasets +} + +// ============================================================================ +// Basic SELECT with WHERE, ORDER BY, LIMIT +// ============================================================================ + +#[tokio::test] +async fn test_select_with_where_order_by_limit() { + let query = SqlQuery::new("SELECT name, age FROM person WHERE age > 30 ORDER BY age LIMIT 10"); + let result = query.execute(make_datasets()).await.unwrap(); + + let names: Vec<&str> = result + .column_by_name("name") + 
.unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .map(|v| v.unwrap()) + .collect(); + + assert_eq!(names, vec!["Bob", "David"]); + + let ages: Vec = result + .column_by_name("age") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .map(|v| v.unwrap()) + .collect(); + + assert_eq!(ages, vec![34, 42]); +} + +#[tokio::test] +async fn test_select_star() { + let query = SqlQuery::new("SELECT * FROM person ORDER BY id"); + let result = query.execute(make_datasets()).await.unwrap(); + assert_eq!(result.num_rows(), 4); + assert_eq!(result.num_columns(), 4); +} + +#[tokio::test] +async fn test_select_limit() { + let query = SqlQuery::new("SELECT name FROM person ORDER BY name LIMIT 2"); + let result = query.execute(make_datasets()).await.unwrap(); + assert_eq!(result.num_rows(), 2); +} + +// ============================================================================ +// JOINs between node and relationship tables +// ============================================================================ + +#[tokio::test] +async fn test_inner_join() { + let query = SqlQuery::new( + "SELECT p.name, k.dst_id, k.since_year \ + FROM person p \ + JOIN knows k ON p.id = k.src_id \ + ORDER BY p.name, k.dst_id", + ); + let result = query.execute(make_datasets()).await.unwrap(); + + let names: Vec<&str> = result + .column_by_name("name") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .map(|v| v.unwrap()) + .collect(); + + // Alice->2, Alice->3, Bob->4, Carol->4 + assert_eq!(names, vec!["Alice", "Alice", "Bob", "Carol"]); +} + +#[tokio::test] +async fn test_self_join_friends() { + let query = SqlQuery::new( + "SELECT p1.name AS person, p2.name AS friend \ + FROM person p1 \ + JOIN knows k ON p1.id = k.src_id \ + JOIN person p2 ON p2.id = k.dst_id \ + ORDER BY p1.name, p2.name", + ); + let result = query.execute(make_datasets()).await.unwrap(); + + let persons: Vec<&str> = result + .column_by_name("person") + .unwrap() + .as_any() + 
.downcast_ref::() + .unwrap() + .iter() + .map(|v| v.unwrap()) + .collect(); + let friends: Vec<&str> = result + .column_by_name("friend") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .map(|v| v.unwrap()) + .collect(); + + assert_eq!(persons, vec!["Alice", "Alice", "Bob", "Carol"]); + assert_eq!(friends, vec!["Bob", "Carol", "David", "David"]); +} + +// ============================================================================ +// Aggregations (COUNT, SUM, AVG) +// ============================================================================ + +#[tokio::test] +async fn test_count() { + let query = SqlQuery::new("SELECT COUNT(*) AS cnt FROM person"); + let result = query.execute(make_datasets()).await.unwrap(); + + let cnt = result + .column_by_name("cnt") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + assert_eq!(cnt, 4); +} + +#[tokio::test] +async fn test_sum() { + let query = SqlQuery::new("SELECT SUM(age) AS total_age FROM person"); + let result = query.execute(make_datasets()).await.unwrap(); + + let total = result + .column_by_name("total_age") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + assert_eq!(total, 28 + 34 + 29 + 42); +} + +#[tokio::test] +async fn test_avg() { + let query = SqlQuery::new("SELECT AVG(age) AS avg_age FROM person"); + let result = query.execute(make_datasets()).await.unwrap(); + + let avg = result + .column_by_name("avg_age") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + assert!((avg - 33.25).abs() < 0.01); +} + +#[tokio::test] +async fn test_group_by_with_count() { + let query = SqlQuery::new( + "SELECT city, COUNT(*) AS cnt FROM person GROUP BY city ORDER BY cnt DESC, city", + ); + let result = query.execute(make_datasets()).await.unwrap(); + + let cities: Vec<&str> = result + .column_by_name("city") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .map(|v| v.unwrap()) + .collect(); + let counts: Vec = result + 
.column_by_name("cnt") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .map(|v| v.unwrap()) + .collect(); + + assert_eq!(cities[0], "New York"); + assert_eq!(counts[0], 2); +} + +// ============================================================================ +// Execute with SessionContext (pre-registered tables) +// ============================================================================ + +#[tokio::test] +async fn test_execute_with_session_context() { + use datafusion::datasource::MemTable; + use datafusion::execution::context::SessionContext; + + let ctx = SessionContext::new(); + + // Register person table + let batch = person_batch(); + let schema = batch.schema(); + let mem_table = Arc::new(MemTable::try_new(schema, vec![vec![batch]]).unwrap()); + ctx.register_table("people", mem_table).unwrap(); + + // Register knows table + let batch = knows_batch(); + let schema = batch.schema(); + let mem_table = Arc::new(MemTable::try_new(schema, vec![vec![batch]]).unwrap()); + ctx.register_table("relationships", mem_table).unwrap(); + + let query = SqlQuery::new( + "SELECT p.name, r.since_year \ + FROM people p \ + JOIN relationships r ON p.id = r.src_id \ + ORDER BY p.name, r.since_year", + ); + let result = query.execute_with_context(ctx).await.unwrap(); + + assert_eq!(result.num_rows(), 4); + + let names: Vec<&str> = result + .column_by_name("name") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .iter() + .map(|v| v.unwrap()) + .collect(); + assert_eq!(names[0], "Alice"); +} + +// ============================================================================ +// Explain +// ============================================================================ + +#[tokio::test] +async fn test_explain_output() { + let query = SqlQuery::new("SELECT p.name FROM person p JOIN knows k ON p.id = k.src_id"); + let plan = query.explain(make_datasets()).await.unwrap(); + assert!(plan.contains("Logical Plan")); + assert!(plan.contains("Physical Plan")); 
+} + +// ============================================================================ +// Error handling +// ============================================================================ + +#[tokio::test] +async fn test_invalid_sql() { + let query = SqlQuery::new("NOT VALID SQL"); + let result = query.execute(make_datasets()).await; + assert!(result.is_err()); +} + +#[tokio::test] +async fn test_missing_table() { + let query = SqlQuery::new("SELECT * FROM nonexistent_table"); + let result = query.execute(make_datasets()).await; + assert!(result.is_err()); +} + +// ============================================================================ +// Case insensitivity (table names are lowercased) +// ============================================================================ + +#[tokio::test] +async fn test_case_insensitive_table_names() { + let mut datasets = HashMap::new(); + datasets.insert("Person".to_string(), person_batch()); + + // Table registered as lowercase "person", so SQL should use lowercase + let query = SqlQuery::new("SELECT name FROM person ORDER BY name LIMIT 1"); + let result = query.execute(datasets).await.unwrap(); + assert_eq!(result.num_rows(), 1); +} diff --git a/python/python/lance_graph/__init__.py b/python/python/lance_graph/__init__.py index 8ea65ee1..3a7418e4 100644 --- a/python/python/lance_graph/__init__.py +++ b/python/python/lance_graph/__init__.py @@ -72,6 +72,8 @@ def _load_dev_build() -> ModuleType: GraphConfigBuilder = _bindings.graph.GraphConfigBuilder CypherQuery = _bindings.graph.CypherQuery CypherEngine = _bindings.graph.CypherEngine +SqlQuery = _bindings.graph.SqlQuery +SqlEngine = _bindings.graph.SqlEngine ExecutionStrategy = _bindings.graph.ExecutionStrategy VectorSearch = _bindings.graph.VectorSearch DistanceMetric = _bindings.graph.DistanceMetric @@ -83,6 +85,8 @@ def _load_dev_build() -> ModuleType: "GraphConfigBuilder", "CypherQuery", "CypherEngine", + "SqlQuery", + "SqlEngine", "ExecutionStrategy", "VectorSearch", 
"DistanceMetric", diff --git a/python/python/tests/test_sql.py b/python/python/tests/test_sql.py new file mode 100644 index 00000000..e9d1b533 --- /dev/null +++ b/python/python/tests/test_sql.py @@ -0,0 +1,175 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +import pyarrow as pa +import pytest +from lance_graph import SqlEngine, SqlQuery + + +@pytest.fixture +def person_table(): + return pa.table( + { + "id": [1, 2, 3, 4], + "name": ["Alice", "Bob", "Carol", "David"], + "age": [28, 34, 29, 42], + "city": ["New York", "San Francisco", "New York", "Chicago"], + } + ) + + +@pytest.fixture +def knows_table(): + return pa.table( + { + "src_id": [1, 1, 2, 3], + "dst_id": [2, 3, 4, 4], + "since_year": [2015, 2018, 2020, 2021], + } + ) + + +@pytest.fixture +def datasets(person_table, knows_table): + return {"person": person_table, "knows": knows_table} + + +# ========================================================================== +# SqlQuery tests +# ========================================================================== + + +class TestSqlQuery: + def test_basic_select(self, datasets): + query = SqlQuery("SELECT name, age FROM person WHERE age > 30 ORDER BY age") + result = query.execute(datasets) + data = result.to_pydict() + + assert data["name"] == ["Bob", "David"] + assert data["age"] == [34, 42] + + def test_select_star(self, datasets): + query = SqlQuery("SELECT * FROM person") + result = query.execute(datasets) + assert result.num_rows == 4 + assert result.num_columns == 4 + + def test_limit(self, datasets): + query = SqlQuery("SELECT name FROM person ORDER BY name LIMIT 2") + result = query.execute(datasets) + assert result.num_rows == 2 + + def test_join(self, datasets): + query = SqlQuery( + "SELECT p.name, k.dst_id " + "FROM person p " + "JOIN knows k ON p.id = k.src_id " + "ORDER BY p.name, k.dst_id" + ) + result = query.execute(datasets) + data = result.to_pydict() + + assert data["name"] == ["Alice", 
"Alice", "Bob", "Carol"] + + def test_self_join(self, datasets): + query = SqlQuery( + "SELECT p1.name AS person, p2.name AS friend " + "FROM person p1 " + "JOIN knows k ON p1.id = k.src_id " + "JOIN person p2 ON p2.id = k.dst_id " + "ORDER BY p1.name, p2.name" + ) + result = query.execute(datasets) + data = result.to_pydict() + + assert data["person"] == ["Alice", "Alice", "Bob", "Carol"] + assert data["friend"] == ["Bob", "Carol", "David", "David"] + + def test_count(self, datasets): + query = SqlQuery("SELECT COUNT(*) AS cnt FROM person") + result = query.execute(datasets) + assert result.to_pydict()["cnt"] == [4] + + def test_sum(self, datasets): + query = SqlQuery("SELECT SUM(age) AS total FROM person") + result = query.execute(datasets) + assert result.to_pydict()["total"] == [28 + 34 + 29 + 42] + + def test_avg(self, datasets): + query = SqlQuery("SELECT AVG(age) AS avg_age FROM person") + result = query.execute(datasets) + avg = result.to_pydict()["avg_age"][0] + assert abs(avg - 33.25) < 0.01 + + def test_group_by(self, datasets): + query = SqlQuery( + "SELECT city, COUNT(*) AS cnt " + "FROM person GROUP BY city ORDER BY cnt DESC, city" + ) + result = query.execute(datasets) + data = result.to_pydict() + assert data["city"][0] == "New York" + assert data["cnt"][0] == 2 + + def test_explain(self, datasets): + query = SqlQuery("SELECT name FROM person WHERE age > 30") + plan = query.explain(datasets) + assert "Logical Plan" in plan + assert "Physical Plan" in plan + + def test_sql_accessor(self): + query = SqlQuery("SELECT 1") + assert query.sql() == "SELECT 1" + + def test_repr(self): + query = SqlQuery("SELECT 1") + assert "SqlQuery" in repr(query) + + def test_invalid_sql(self, datasets): + query = SqlQuery("INVALID SQL") + with pytest.raises((RuntimeError, ValueError)): + query.execute(datasets) + + def test_case_insensitive_table_names(self, person_table): + """Table name 'Person' should be lowercased to 'person'.""" + query = SqlQuery("SELECT name FROM 
person LIMIT 1") + result = query.execute({"Person": person_table}) + assert result.num_rows == 1 + + +# ========================================================================== +# SqlEngine tests +# ========================================================================== + + +class TestSqlEngine: + def test_basic_query(self, datasets): + engine = SqlEngine(datasets) + result = engine.execute( + "SELECT name, age FROM person WHERE age > 30 ORDER BY age" + ) + data = result.to_pydict() + assert data["name"] == ["Bob", "David"] + + def test_multiple_queries(self, datasets): + engine = SqlEngine(datasets) + + r1 = engine.execute("SELECT COUNT(*) AS cnt FROM person") + r2 = engine.execute("SELECT name FROM person WHERE age > 30 ORDER BY name") + r3 = engine.execute( + "SELECT p.name, k.dst_id " + "FROM person p JOIN knows k ON p.id = k.src_id " + "ORDER BY p.name LIMIT 2" + ) + + assert r1.to_pydict()["cnt"] == [4] + assert r2.to_pydict()["name"] == ["Bob", "David"] + assert r3.num_rows == 2 + + def test_repr(self, datasets): + engine = SqlEngine(datasets) + assert "SqlEngine" in repr(engine) + + def test_empty_datasets_raises(self): + with pytest.raises(ValueError, match="No input datasets"): + SqlEngine({}) From 49e490d33f3bcfd8874b1dda75c08bf03e108188 Mon Sep 17 00:00:00 2001 From: "jianjian.xie" Date: Thu, 26 Feb 2026 13:27:49 -0800 Subject: [PATCH 2/7] docs: add SqlQuery/SqlEngine to Python and project READMEs --- README.md | 26 ++++++++++++++++++++ python/README.md | 34 +++++++++++++++++++++++++++ python/python/lance_graph/__init__.py | 10 +++++++- 3 files changed, 69 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f1ed4070..8e61c4b6 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,32 @@ result = query.execute({"Person": people}) print(result.to_pydict()) # {'name': ['Bob', 'David'], 'age': [34, 42]} ``` +## Python example: Direct SQL query + +For data analytics workflows where you prefer standard SQL, use `SqlQuery` or `SqlEngine`. 
No `GraphConfig` is needed: + +```python +import pyarrow as pa +from lance_graph import SqlQuery, SqlEngine + +person = pa.table({ + "id": [1, 2, 3], + "name": ["Alice", "Bob", "Carol"], + "age": [28, 34, 29], +}) + +# One-off query +result = SqlQuery( + "SELECT name, age FROM person WHERE age > 30" +).execute({"person": person}) +print(result.to_pydict()) # {'name': ['Bob'], 'age': [34]} + +# Multi-query with cached context +engine = SqlEngine({"person": person}) +r1 = engine.execute("SELECT COUNT(*) AS cnt FROM person") +r2 = engine.execute("SELECT name FROM person ORDER BY age DESC LIMIT 2") +``` + ## Knowledge Graph CLI & API The `knowledge_graph` package layers a simple Lance-backed knowledge graph diff --git a/python/README.md b/python/README.md index 2a44a911..21a64a8f 100644 --- a/python/README.md +++ b/python/README.md @@ -82,6 +82,40 @@ print(result1.to_pylist()) # [{'p.name': 'Alice'}] ``` +### 3. Direct SQL Queries + +For data analytics workflows where you prefer standard SQL over Cypher, use `SqlQuery` or `SqlEngine`. No `GraphConfig` is needed — write explicit JOINs against your tables directly: + +```python +import pyarrow as pa +from lance_graph import SqlQuery, SqlEngine + +person = pa.table({ + "id": [1, 2, 3], + "name": ["Alice", "Bob", "Carol"], + "age": [28, 34, 29], +}) +knows = pa.table({"src_id": [1, 1, 2], "dst_id": [2, 3, 3]}) +datasets = {"person": person, "knows": knows} + +# One-off query +result = SqlQuery( + "SELECT p.name, p.age FROM person p WHERE p.age > 30" +).execute(datasets) +print(result.to_pylist()) +# [{'name': 'Bob', 'age': 34}] + +# Multi-query with cached context +engine = SqlEngine(datasets) +r1 = engine.execute("SELECT COUNT(*) AS cnt FROM person") +r2 = engine.execute( + "SELECT p1.name AS person, p2.name AS friend " + "FROM person p1 " + "JOIN knows k ON p1.id = k.src_id " + "JOIN person p2 ON p2.id = k.dst_id" +) +``` + ### 3. 
Build a Knowledge Graph from Text ```python diff --git a/python/python/lance_graph/__init__.py b/python/python/lance_graph/__init__.py index 3a7418e4..4a348fd8 100644 --- a/python/python/lance_graph/__init__.py +++ b/python/python/lance_graph/__init__.py @@ -1,4 +1,12 @@ -"""Python bindings for the ``lance-graph`` crate.""" +"""Python bindings for the ``lance-graph`` crate. + +Provides two query interfaces: + +- **Cypher**: ``CypherQuery`` and ``CypherEngine`` for graph-pattern queries + (requires a ``GraphConfig`` with node/relationship mappings). +- **SQL**: ``SqlQuery`` and ``SqlEngine`` for standard SQL queries executed + directly against datasets via DataFusion (no ``GraphConfig`` needed). +""" from __future__ import annotations From 132c147b140c9c2ab9abf5b16b56c91140ef9fd4 Mon Sep 17 00:00:00 2001 From: "jianjian.xie" Date: Thu, 26 Feb 2026 23:23:06 -0800 Subject: [PATCH 3/7] feat(catalog): add Unity Catalog integration with extensible connector architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for browsing and querying tables from Unity Catalog (OSS). 
Inspired by Presto's connector SPI, the design cleanly separates: - CatalogProvider trait: catalog metadata browsing (UC first, extensible to Hive Metastore, AWS Glue, Iceberg REST Catalog) - TableReader trait: format-specific data reading (Parquet + Delta Lake, extensible to CSV, Iceberg, ORC) - Connector struct: facade bundling catalog + readers Key features: - Full UC REST API client (list/get catalogs, schemas, tables, columns) - UC type → Arrow type mapping (20 type mappings) - ParquetTableReader via DataFusion register_parquet() - DeltaTableReader via deltalake 0.29 (behind "delta" feature flag) - Auto-register UC tables into SqlEngine via create_sql_engine() - Python bindings: UnityCatalog, CatalogInfo, SchemaInfo, TableInfo - 15 wiremock integration tests for UC REST client - 12 type mapping unit tests - 9 Python unit tests Python API: uc = UnityCatalog("http://localhost:8080/api/2.1/unity-catalog") engine = uc.create_sql_engine("unity", "default") result = engine.execute("SELECT * FROM my_table") Co-Authored-By: Claude Opus 4.6 (1M context) --- Cargo.lock | 618 +++++++++++++++++- crates/lance-graph-catalog/Cargo.toml | 8 + .../src/catalog_provider.rs | 176 +++++ crates/lance-graph-catalog/src/connector.rs | 201 ++++++ crates/lance-graph-catalog/src/lib.rs | 26 + .../lance-graph-catalog/src/table_reader.rs | 57 ++ .../lance-graph-catalog/src/type_mapping.rs | 222 +++++++ .../lance-graph-catalog/src/unity_catalog.rs | 362 ++++++++++ .../tests/unity_catalog_integration.rs | 467 +++++++++++++ crates/lance-graph-python/src/catalog.rs | 352 ++++++++++ crates/lance-graph-python/src/graph.rs | 14 +- crates/lance-graph-python/src/lib.rs | 1 + crates/lance-graph/Cargo.toml | 9 + crates/lance-graph/src/lib.rs | 12 + crates/lance-graph/src/sql_catalog.rs | 47 ++ crates/lance-graph/src/table_readers.rs | 128 ++++ python/python/lance_graph/__init__.py | 9 + python/python/tests/test_unity_catalog.py | 167 +++++ python/uv.lock | 2 +- 19 files changed, 2858 insertions(+), 
20 deletions(-) create mode 100644 crates/lance-graph-catalog/src/catalog_provider.rs create mode 100644 crates/lance-graph-catalog/src/connector.rs create mode 100644 crates/lance-graph-catalog/src/table_reader.rs create mode 100644 crates/lance-graph-catalog/src/type_mapping.rs create mode 100644 crates/lance-graph-catalog/src/unity_catalog.rs create mode 100644 crates/lance-graph-catalog/tests/unity_catalog_integration.rs create mode 100644 crates/lance-graph-python/src/catalog.rs create mode 100644 crates/lance-graph/src/sql_catalog.rs create mode 100644 crates/lance-graph/src/table_readers.rs create mode 100644 python/python/tests/test_unity_catalog.py diff --git a/Cargo.lock b/Cargo.lock index 7d5c0d74..b5ab81ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -356,6 +356,16 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "assert-json-diff" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e4f2b81832e72834d7518d8487a0396a28cc408186a2e8854c0f98011faf12" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "async-channel" version = "2.5.0" @@ -946,7 +956,7 @@ version = "3.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d82020dadcb845a345591863adb65d74fa8dc5c18a0b6d408470e13b7adc7005" dependencies = [ - "darling", + "darling 0.21.3", "ident_case", "prettyplease", "proc-macro2", @@ -1204,8 +1214,10 @@ version = "7.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" dependencies = [ - "strum", - "strum_macros", + "crossterm 0.27.0", + "crossterm 0.28.1", + "strum 0.26.3", + "strum_macros 0.26.4", "unicode-width", ] @@ -1250,6 +1262,25 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" +[[package]] +name = "convert_case" +version = "0.8.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "baaaa0ecca5b51987b9423ccdc971514dd8b0bb7b4060b983d3664dad3f1f89f" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation" version = "0.10.1" @@ -1374,6 +1405,39 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crossterm" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f476fe445d41c9e991fd07515a6f463074b782242ccf4a5b7b1d1012e70824df" +dependencies = [ + "bitflags", + "crossterm_winapi", + "libc", + "parking_lot", + "winapi", +] + +[[package]] +name = "crossterm" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" +dependencies = [ + "bitflags", + "parking_lot", + "rustix 0.38.44", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" +dependencies = [ + "winapi", +] + [[package]] name = "crunchy" version = "0.2.4" @@ -1411,14 +1475,54 @@ dependencies = [ "memchr", ] +[[package]] +name = "ctor" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "424e0138278faeb2b401f174ad17e715c829512d74f3d1e81eb43365c2e0590e" +dependencies = [ + "ctor-proc-macro", + "dtor", +] + +[[package]] +name = "ctor-proc-macro" +version = "0.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1" + +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core 0.20.11", + "darling_macro 0.20.11", +] + [[package]] name = "darling" version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" dependencies = [ - "darling_core", - "darling_macro", + "darling_core 0.21.3", + "darling_macro 0.21.3", +] + +[[package]] +name = "darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.106", ] [[package]] @@ -1435,13 +1539,24 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core 0.20.11", + "quote", + "syn 2.0.106", +] + [[package]] name = "darling_macro" version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" dependencies = [ - "darling_core", + "darling_core 0.21.3", "quote", "syn 2.0.106", ] @@ -1506,7 +1621,7 @@ dependencies = [ "parquet", "rand 0.9.2", "regex", - "sqlparser", + "sqlparser 0.58.0", "tempfile", "tokio", "url", @@ -1584,7 +1699,7 @@ dependencies = [ "parquet", "paste", "recursive", - "sqlparser", + "sqlparser 0.58.0", "tokio", "web-time", ] @@ -1765,7 +1880,7 @@ dependencies = [ "paste", "recursive", "serde_json", - "sqlparser", + "sqlparser 0.58.0", ] [[package]] @@ -2044,6 +2159,33 @@ 
dependencies = [ "tokio", ] +[[package]] +name = "datafusion-proto" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7df9f606892e6af45763d94d210634eec69b9bb6ced5353381682ff090028a3" +dependencies = [ + "arrow", + "chrono", + "datafusion", + "datafusion-common", + "datafusion-expr", + "datafusion-proto-common", + "object_store", + "prost", +] + +[[package]] +name = "datafusion-proto-common" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4b14f288ca4ef77743d9672cafecf3adfffff0b9b04af9af79ecbeaaf736901" +dependencies = [ + "arrow", + "datafusion-common", + "prost", +] + [[package]] name = "datafusion-pruning" version = "50.3.0" @@ -2100,9 +2242,27 @@ dependencies = [ "log", "recursive", "regex", - "sqlparser", + "sqlparser 0.58.0", +] + +[[package]] +name = "deadpool" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0be2b1d1d6ec8d846f05e137292d0b89133caf95ef33695424c09568bdd39b1b" +dependencies = [ + "deadpool-runtime", + "lazy_static", + "num_cpus", + "tokio", ] +[[package]] +name = "deadpool-runtime" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "092966b41edc516079bdf31ec78a2e0588d1d0c08f78b91d8307215928642b2b" + [[package]] name = "deepsize" version = "0.2.0" @@ -2123,6 +2283,123 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "delta_kernel" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb6b80fa39021744edf13509bbdd7caef94c1bf101e384990210332dbddddf44" +dependencies = [ + "arrow", + "bytes", + "chrono", + "comfy-table", + "delta_kernel_derive", + "futures", + "indexmap", + "itertools 0.14.0", + "object_store", + "parquet", + "reqwest", + "roaring 0.11.3", + "rustc_version", + "serde", + "serde_json", + "strum 0.27.2", + "thiserror 2.0.17", + "tokio", + "tracing", + "url", + "uuid", + "z85", +] + 
+[[package]] +name = "delta_kernel_derive" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae1d02d9f5d886ae8bb7fc3f7a3cb8f1b75cd0f5c95f9b5f45bba308f1a0aa58" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "deltalake" +version = "0.29.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09169ef5ecf35911f5f1c3117844a4e00da1edcce58fe8593a237761525f6e3a" +dependencies = [ + "ctor", + "delta_kernel", + "deltalake-core", +] + +[[package]] +name = "deltalake-core" +version = "0.29.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b2a5e889f44e4cdf66a70fe6456857cee948d52fa2f6b5d7e0cc3f1639d21d8" +dependencies = [ + "arrow", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "async-trait", + "bytes", + "cfg-if", + "chrono", + "dashmap", + "datafusion", + "datafusion-proto", + "delta_kernel", + "deltalake-derive", + "dirs", + "either", + "futures", + "indexmap", + "itertools 0.14.0", + "num_cpus", + "object_store", + "parking_lot", + "parquet", + "percent-encoding", + "percent-encoding-rfc3986", + "pin-project-lite", + "rand 0.8.5", + "regex", + "serde", + "serde_json", + "sqlparser 0.59.0", + "strum 0.27.2", + "thiserror 2.0.17", + "tokio", + "tracing", + "url", + "uuid", + "validator", +] + +[[package]] +name = "deltalake-derive" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a785b4702c2d1b6ff286075f375fb2fd52dfbb2fadf17b9233f4d5eea35c6ec" +dependencies = [ + "convert_case", + "itertools 0.14.0", + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "der" version = "0.7.10" @@ -2203,6 +2480,21 @@ version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"117240f60069e65410b3ae1bb213295bd828f707b5bec6596a1afc8793ce0cbc" +[[package]] +name = "dtor" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "404d02eeb088a82cfd873006cb713fe411306c7d182c344905e101fb1167d301" +dependencies = [ + "dtor-proc-macro", +] + +[[package]] +name = "dtor-proc-macro" +version = "0.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f678cf4a922c215c63e0de95eb1ff08a958a81d47e485cf9da1e27bf6305cfa5" + [[package]] name = "dunce" version = "1.0.5" @@ -2346,6 +2638,21 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -2856,6 +3163,12 @@ version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + [[package]] name = "humantime" version = "2.3.0" @@ -2876,6 +3189,7 @@ dependencies = [ "http 1.3.1", "http-body 1.0.1", "httparse", + "httpdate", "itoa", "pin-project-lite", "pin-utils", @@ -2902,6 +3216,22 @@ dependencies = [ "webpki-roots", ] +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.17" @@ -2921,9 +3251,11 @@ dependencies = [ "percent-encoding", "pin-project-lite", "socket2", + "system-configuration", "tokio", "tower-service", "tracing", + "windows-registry", ] [[package]] @@ -3368,7 +3700,7 @@ dependencies = [ "prost", "prost-types", "rand 0.9.2", - "roaring", + "roaring 0.10.12", "semver", "serde", "serde_json", @@ -3439,7 +3771,7 @@ dependencies = [ "pin-project", "prost", "rand 0.9.2", - "roaring", + "roaring 0.10.12", "serde_json", "snafu", "tempfile", @@ -3533,7 +3865,7 @@ dependencies = [ "prost-types", "rand 0.9.2", "snafu", - "strum", + "strum 0.26.3", "tokio", "tracing", "xxhash-rust", @@ -3594,12 +3926,14 @@ dependencies = [ "arrow", "arrow-array", "arrow-schema", + "async-trait", "criterion", "datafusion", "datafusion-common", "datafusion-expr", "datafusion-functions-aggregate", "datafusion-sql", + "deltalake", "futures", "lance", "lance-arrow", @@ -3613,6 +3947,7 @@ dependencies = [ "snafu", "tempfile", "tokio", + "url", ] [[package]] @@ -3623,8 +3958,12 @@ dependencies = [ "async-trait", "datafusion", "lance-namespace", + "reqwest", + "serde", + "serde_json", "snafu", "tokio", + "wiremock", ] [[package]] @@ -3695,7 +4034,7 @@ dependencies = [ "rand 0.9.2", "rand_distr 0.5.1", "rayon", - "roaring", + "roaring 0.10.12", "serde", "serde_json", "snafu", @@ -3822,7 +4161,7 @@ dependencies = [ "prost-types", "rand 0.9.2", "rangemap", - "roaring", + "roaring 0.10.12", "semver", "serde", "serde_json", @@ -4200,6 +4539,23 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" +[[package]] +name = "native-tls" +version = "0.2.18" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe 0.2.1", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + [[package]] name = "ndarray" version = "0.16.1" @@ -4483,12 +4839,56 @@ dependencies = [ "uuid", ] +[[package]] +name = "openssl" +version = "0.10.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" +dependencies = [ + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "openssl-probe" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" +[[package]] +name = "openssl-probe" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" + +[[package]] +name = "openssl-sys" +version = "0.9.111" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -4657,6 +5057,12 @@ version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" +[[package]] +name = "percent-encoding-rfc3986" +version = "0.1.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3637c05577168127568a64e9dc5a6887da720efef07b3d9472d45f63ab191166" + [[package]] name = "permutation" version = "0.4.1" @@ -4865,6 +5271,28 @@ dependencies = [ "toml_edit", ] +[[package]] +name = "proc-macro-error-attr2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96de42df36bb9bba5542fe9f1a054b8cc87e172759a1868aa05c1f3acc89dfc5" +dependencies = [ + "proc-macro2", + "quote", +] + +[[package]] +name = "proc-macro-error2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11ec05c52be0a07b08061f7dd003e7d7092e0472bc731b4af7bb1ef876109802" +dependencies = [ + "proc-macro-error-attr2", + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "proc-macro2" version = "1.0.101" @@ -5351,11 +5779,13 @@ dependencies = [ "http-body-util", "hyper", "hyper-rustls", + "hyper-tls", "hyper-util", "js-sys", "log", "mime", "mime_guess", + "native-tls", "percent-encoding", "pin-project-lite", "quinn", @@ -5367,6 +5797,7 @@ dependencies = [ "serde_urlencoded", "sync_wrapper", "tokio", + "tokio-native-tls", "tokio-rustls", "tokio-util", "tower", @@ -5404,6 +5835,16 @@ dependencies = [ "byteorder", ] +[[package]] +name = "roaring" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ba9ce64a8f45d7fc86358410bb1a82e8c987504c0d4900e9141d69a9f26c885" +dependencies = [ + "bytemuck", + "byteorder", +] + [[package]] name = "robust" version = "1.2.0" @@ -5524,7 +5965,7 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3" dependencies = [ - "openssl-probe", + "openssl-probe 0.1.6", "rustls-pki-types", "schannel", "security-framework", @@ -5630,7 +6071,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" dependencies = [ "bitflags", - "core-foundation", + "core-foundation 0.10.1", "core-foundation-sys", "libc", "security-framework-sys", @@ -5916,6 +6357,16 @@ dependencies = [ "sqlparser_derive", ] +[[package]] +name = "sqlparser" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4591acadbcf52f0af60eafbb2c003232b2b4cd8de5f0e9437cb8b1b59046cc0f" +dependencies = [ + "log", + "recursive", +] + [[package]] name = "sqlparser_derive" version = "0.3.0" @@ -5970,7 +6421,16 @@ version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" dependencies = [ - "strum_macros", + "strum_macros 0.26.4", +] + +[[package]] +name = "strum" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" +dependencies = [ + "strum_macros 0.27.2", ] [[package]] @@ -5986,6 +6446,18 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "strum_macros" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "subtle" version = "2.6.1" @@ -6034,6 +6506,27 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tagptr" version = "0.2.0" @@ -6364,6 +6857,7 @@ dependencies = [ "bytes", "libc", "mio", + "parking_lot", "pin-project-lite", "signal-hook-registry", "socket2", @@ -6382,6 +6876,16 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.26.4" @@ -6502,6 +7006,7 @@ version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ + "log", "pin-project-lite", "tracing-attributes", "tracing-core", @@ -6652,16 +7157,53 @@ checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" dependencies = [ "getrandom 0.3.3", "js-sys", + "rand 0.9.2", "serde", "wasm-bindgen", ] +[[package]] +name = "validator" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0b4a29d8709210980a09379f27ee31549b73292c87ab9899beee1c0d3be6303" +dependencies = [ + "idna", + "once_cell", + "regex", + "serde", + "serde_derive", + "serde_json", + "url", + "validator_derive", +] + +[[package]] +name = "validator_derive" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bac855a2ce6f843beb229757e6e570a42e837bcb15e5f449dd48d5747d41bf77" +dependencies = [ + "darling 0.20.11", + "once_cell", + "proc-macro-error2", + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "valuable" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] 
+name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.5" @@ -6965,6 +7507,17 @@ dependencies = [ "windows-link 0.1.3", ] +[[package]] +name = "windows-registry" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b8a9ed28765efc97bbc954883f4e6796c33a06546ebafacbabee9696967499e" +dependencies = [ + "windows-link 0.1.3", + "windows-result 0.3.4", + "windows-strings 0.4.2", +] + [[package]] name = "windows-result" version = "0.3.4" @@ -7184,6 +7737,29 @@ dependencies = [ "memchr", ] +[[package]] +name = "wiremock" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08db1edfb05d9b3c1542e521aea074442088292f00b5f28e435c714a98f85031" +dependencies = [ + "assert-json-diff", + "base64", + "deadpool", + "futures", + "http 1.3.1", + "http-body-util", + "hyper", + "hyper-util", + "log", + "once_cell", + "regex", + "serde", + "serde_json", + "tokio", + "url", +] + [[package]] name = "wit-bindgen" version = "0.46.0" @@ -7275,6 +7851,12 @@ dependencies = [ "synstructure", ] +[[package]] +name = "z85" +version = "3.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6e61e59a957b7ccee15d2049f86e8bfd6f66968fcd88f018950662d9b86e675" + [[package]] name = "zerocopy" version = "0.8.27" diff --git a/crates/lance-graph-catalog/Cargo.toml b/crates/lance-graph-catalog/Cargo.toml index 1fe01f6d..471e98a1 100644 --- a/crates/lance-graph-catalog/Cargo.toml +++ b/crates/lance-graph-catalog/Cargo.toml @@ -15,7 +15,15 @@ arrow-schema = "56.2" async-trait = "0.1" datafusion = { version = "50.3", default-features = false } lance-namespace = "1.0.1" +reqwest = { version = "0.12", features = ["json"], optional = true } +serde = { version = "1", features = ["derive"] } +serde_json = "1" snafu = "0.8" 
+[features] +default = ["unity-catalog"] +unity-catalog = ["dep:reqwest"] + [dev-dependencies] tokio = { version = "1.37", features = ["macros", "rt-multi-thread"] } +wiremock = "0.6" diff --git a/crates/lance-graph-catalog/src/catalog_provider.rs b/crates/lance-graph-catalog/src/catalog_provider.rs new file mode 100644 index 00000000..b0e75580 --- /dev/null +++ b/crates/lance-graph-catalog/src/catalog_provider.rs @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Catalog provider trait and data types for external catalog integration. +//! +//! Inspired by Presto's `ConnectorMetadata` SPI, this module defines the +//! abstract interface for browsing external catalogs (Unity Catalog, Hive +//! Metastore, AWS Glue, etc.). + +use std::collections::HashMap; + +use arrow_schema::SchemaRef; +use async_trait::async_trait; + +/// Metadata about a catalog (top-level namespace). +#[derive(Debug, Clone)] +pub struct CatalogInfo { + pub name: String, + pub comment: Option, + pub properties: HashMap, + pub created_at: Option, + pub updated_at: Option, +} + +/// Metadata about a schema (second-level namespace within a catalog). +#[derive(Debug, Clone)] +pub struct SchemaInfo { + pub name: String, + pub catalog_name: String, + pub comment: Option, + pub properties: HashMap, + pub created_at: Option, + pub updated_at: Option, +} + +/// Metadata about a column in a table. +#[derive(Debug, Clone)] +pub struct ColumnInfo { + pub name: String, + /// Human-readable type string (e.g., "INT", "VARCHAR(255)"). + pub type_text: String, + /// Canonical type name from the catalog (e.g., "INT", "STRING"). + pub type_name: String, + /// Column position (0-based). + pub position: i32, + pub nullable: bool, + pub comment: Option, +} + +/// Data format of the underlying storage. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub enum DataSourceFormat { + Delta, + Parquet, + Csv, + Json, + Avro, + Orc, + Text, + Other(String), +} + +/// Type of table (managed vs external). +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum TableType { + Managed, + External, +} + +/// Full table metadata including columns and storage information. +#[derive(Debug, Clone)] +pub struct TableInfo { + pub name: String, + pub catalog_name: String, + pub schema_name: String, + pub table_type: TableType, + pub data_source_format: DataSourceFormat, + pub columns: Vec, + pub storage_location: Option, + pub comment: Option, + pub properties: HashMap, + pub created_at: Option, + pub updated_at: Option, +} + +/// Errors that can occur during catalog operations. +#[derive(Debug)] +pub enum CatalogError { + /// Network or HTTP error. + ConnectionError(String), + /// Resource not found (catalog, schema, or table). + NotFound(String), + /// Authentication or authorization failure. + AuthError(String), + /// Invalid or unparseable response from the catalog server. + InvalidResponse(String), + /// Failed to map a catalog type to an Arrow type. + TypeMappingError(String), + /// Other errors. + Other(String), +} + +impl std::fmt::Display for CatalogError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + CatalogError::ConnectionError(msg) => write!(f, "Catalog connection error: {}", msg), + CatalogError::NotFound(msg) => write!(f, "Not found: {}", msg), + CatalogError::AuthError(msg) => write!(f, "Auth error: {}", msg), + CatalogError::InvalidResponse(msg) => write!(f, "Invalid response: {}", msg), + CatalogError::TypeMappingError(msg) => write!(f, "Type mapping error: {}", msg), + CatalogError::Other(msg) => write!(f, "Catalog error: {}", msg), + } + } +} + +impl std::error::Error for CatalogError {} + +pub type CatalogResult = std::result::Result; + +/// Abstract trait for browsing an external catalog. 
+/// +/// Analogous to Presto's `ConnectorMetadata`. Implementations provide access +/// to catalog metadata (catalogs, schemas, tables, columns) without being +/// coupled to any specific data format or storage backend. +/// +/// # Extensibility +/// +/// Implement this trait to add support for new catalog backends: +/// - Unity Catalog (provided) +/// - Hive Metastore (future) +/// - AWS Glue (future) +/// - Iceberg REST Catalog (future) +#[async_trait] +pub trait CatalogProvider: Send + Sync { + /// Human-readable name of this catalog provider (e.g., "unity-catalog"). + fn name(&self) -> &str; + + /// List all catalogs available in this provider. + async fn list_catalogs(&self) -> CatalogResult>; + + /// Get information about a specific catalog. + async fn get_catalog(&self, name: &str) -> CatalogResult; + + /// List all schemas within a catalog. + async fn list_schemas(&self, catalog_name: &str) -> CatalogResult>; + + /// Get information about a specific schema. + async fn get_schema( + &self, + catalog_name: &str, + schema_name: &str, + ) -> CatalogResult; + + /// List all tables within a schema. + async fn list_tables( + &self, + catalog_name: &str, + schema_name: &str, + ) -> CatalogResult>; + + /// Get detailed information about a specific table, including columns. + async fn get_table( + &self, + catalog_name: &str, + schema_name: &str, + table_name: &str, + ) -> CatalogResult; + + /// Convert a table's column definitions to an Arrow schema. + /// + /// The default implementation uses the standard type mapping from + /// [`crate::type_mapping::columns_to_arrow_schema`]. 
+ fn table_to_arrow_schema(&self, table: &TableInfo) -> CatalogResult { + crate::type_mapping::columns_to_arrow_schema(&table.columns) + } +} diff --git a/crates/lance-graph-catalog/src/connector.rs b/crates/lance-graph-catalog/src/connector.rs new file mode 100644 index 00000000..8be44f95 --- /dev/null +++ b/crates/lance-graph-catalog/src/connector.rs @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Connector facade that bundles a [`CatalogProvider`] with [`TableReader`]s. +//! +//! Inspired by Presto's `Connector` interface which exposes `getMetadata()` + +//! `getPageSourceProvider()`, this struct provides a convenient entry point +//! for users who want to browse a catalog and register tables for querying. + +use std::sync::Arc; + +use arrow_schema::SchemaRef; +use datafusion::datasource::MemTable; +use datafusion::execution::context::SessionContext; + +use crate::catalog_provider::{CatalogError, CatalogInfo, CatalogProvider, CatalogResult, DataSourceFormat, SchemaInfo, TableInfo}; +use crate::table_reader::TableReader; + +/// Bundles a [`CatalogProvider`] with [`TableReader`]s for convenient use. +/// +/// The `Connector` is the primary entry point for interacting with an external +/// catalog. It delegates metadata operations to the catalog provider and data +/// reading to the appropriate table reader based on the table's data format. +/// +/// # Extensibility +/// +/// - Swap the catalog: pass a different `CatalogProvider` (e.g., AWS Glue). +/// - Add formats: pass additional `TableReader`s (e.g., Iceberg). 
+/// +/// # Example +/// +/// ```no_run +/// # use lance_graph_catalog::connector::Connector; +/// # use lance_graph_catalog::{UnityCatalogConfig, UnityCatalogProvider}; +/// # fn example() { +/// // let catalog = Arc::new(UnityCatalogProvider::new(config)?); +/// // let readers = default_table_readers(); // from lance-graph crate +/// // let connector = Connector::new(catalog, readers); +/// // let tables = connector.list_tables("unity", "default").await?; +/// # } +/// ``` +pub struct Connector { + catalog: Arc, + readers: Vec>, +} + +impl Connector { + /// Create a new connector with the given catalog provider and table readers. + pub fn new( + catalog: Arc, + readers: Vec>, + ) -> Self { + Self { catalog, readers } + } + + /// Get a reference to the underlying catalog provider. + pub fn catalog(&self) -> &dyn CatalogProvider { + self.catalog.as_ref() + } + + /// Find a table reader that supports the given data format. + pub fn reader_for(&self, format: &DataSourceFormat) -> Option<&dyn TableReader> { + self.readers + .iter() + .find(|r| r.supported_formats().contains(format)) + .map(|r| r.as_ref()) + } + + /// List all available table readers. 
+ pub fn readers(&self) -> &[Arc] { + &self.readers + } + + // ---- Delegate catalog operations ---- + + pub async fn list_catalogs(&self) -> CatalogResult> { + self.catalog.list_catalogs().await + } + + pub async fn get_catalog(&self, name: &str) -> CatalogResult { + self.catalog.get_catalog(name).await + } + + pub async fn list_schemas(&self, catalog_name: &str) -> CatalogResult> { + self.catalog.list_schemas(catalog_name).await + } + + pub async fn get_schema( + &self, + catalog_name: &str, + schema_name: &str, + ) -> CatalogResult { + self.catalog.get_schema(catalog_name, schema_name).await + } + + pub async fn list_tables( + &self, + catalog_name: &str, + schema_name: &str, + ) -> CatalogResult> { + self.catalog.list_tables(catalog_name, schema_name).await + } + + pub async fn get_table( + &self, + catalog_name: &str, + schema_name: &str, + table_name: &str, + ) -> CatalogResult { + self.catalog + .get_table(catalog_name, schema_name, table_name) + .await + } + + /// Register all tables from a catalog schema into a DataFusion `SessionContext`. + /// + /// For each table: + /// 1. Retrieves full table metadata (including columns) from the catalog. + /// 2. Converts columns to an Arrow schema. + /// 3. Finds an appropriate [`TableReader`] for the table's data format. + /// 4. Registers the table in the session context. + /// + /// If no reader matches the table's format, falls back to registering an + /// empty `MemTable` with the correct schema (schema-only, for planning). + /// + /// Individual table failures are logged as warnings but do not abort the + /// registration of remaining tables. + /// + /// Returns a list of `(table_name, schema)` for successfully registered tables. 
+ pub async fn register_schema( + &self, + ctx: &SessionContext, + catalog_name: &str, + schema_name: &str, + ) -> CatalogResult> { + let tables = self.catalog.list_tables(catalog_name, schema_name).await?; + let mut registered = Vec::new(); + + for table_summary in &tables { + match self + .register_single_table(ctx, catalog_name, schema_name, &table_summary.name) + .await + { + Ok((name, schema)) => { + registered.push((name, schema)); + } + Err(e) => { + eprintln!( + "Warning: failed to register table {}.{}.{}: {}", + catalog_name, schema_name, table_summary.name, e + ); + } + } + } + + Ok(registered) + } + + async fn register_single_table( + &self, + ctx: &SessionContext, + catalog_name: &str, + schema_name: &str, + table_name: &str, + ) -> CatalogResult<(String, SchemaRef)> { + let table_info = self + .catalog + .get_table(catalog_name, schema_name, table_name) + .await?; + let arrow_schema = self.catalog.table_to_arrow_schema(&table_info)?; + let normalized_name = table_info.name.to_lowercase(); + + // Find a reader for this format + let reader = self.reader_for(&table_info.data_source_format); + + match reader { + Some(r) => { + r.register_table(ctx, &normalized_name, &table_info, arrow_schema.clone()) + .await?; + } + None => { + // No reader — register schema-only (empty MemTable for planning) + let mem_table = MemTable::try_new(arrow_schema.clone(), vec![]).map_err(|e| { + CatalogError::Other(format!( + "Failed to create empty table '{}': {}", + normalized_name, e + )) + })?; + ctx.register_table(&normalized_name, Arc::new(mem_table)) + .map_err(|e| { + CatalogError::Other(format!( + "Failed to register table '{}': {}", + normalized_name, e + )) + })?; + } + } + + Ok((normalized_name, arrow_schema)) + } +} diff --git a/crates/lance-graph-catalog/src/lib.rs b/crates/lance-graph-catalog/src/lib.rs index b43457f6..cbd5c71d 100644 --- a/crates/lance-graph-catalog/src/lib.rs +++ b/crates/lance-graph-catalog/src/lib.rs @@ -2,9 +2,35 @@ // 
SPDX-FileCopyrightText: Copyright The Lance Authors //! Catalog and namespace utilities for Lance Graph. +//! +//! This crate provides the SPI (Service Provider Interface) layer for external +//! catalog integration, inspired by Presto's connector architecture: +//! +//! - [`CatalogProvider`] — browse catalog metadata (analogous to `ConnectorMetadata`) +//! - [`TableReader`] — read table data in specific formats (analogous to `ConnectorPageSourceProvider`) +//! - [`Connector`] — bundles catalog + readers (analogous to Presto's `Connector`) +pub mod catalog_provider; +pub mod connector; pub mod namespace; pub mod source_catalog; +pub mod table_reader; +pub mod type_mapping; +#[cfg(feature = "unity-catalog")] +pub mod unity_catalog; +// Existing exports pub use namespace::DirNamespace; pub use source_catalog::{GraphSourceCatalog, InMemoryCatalog, SimpleTableSource}; + +// Catalog provider exports +pub use catalog_provider::{ + CatalogError, CatalogInfo, CatalogProvider, CatalogResult, ColumnInfo, DataSourceFormat, + SchemaInfo, TableInfo, TableType, +}; +pub use connector::Connector; +pub use table_reader::TableReader; +pub use type_mapping::columns_to_arrow_schema; + +#[cfg(feature = "unity-catalog")] +pub use unity_catalog::{UnityCatalogConfig, UnityCatalogProvider}; diff --git a/crates/lance-graph-catalog/src/table_reader.rs b/crates/lance-graph-catalog/src/table_reader.rs new file mode 100644 index 00000000..3b7d2f1c --- /dev/null +++ b/crates/lance-graph-catalog/src/table_reader.rs @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Table reader trait for format-specific data reading. +//! +//! Inspired by Presto's `ConnectorPageSourceProvider`, this trait decouples +//! data format reading from catalog metadata. Each implementation handles +//! one or more data formats and is reusable across any [`CatalogProvider`]. 
+ +use arrow_schema::SchemaRef; +use async_trait::async_trait; +use datafusion::execution::context::SessionContext; + +use crate::catalog_provider::{CatalogResult, DataSourceFormat, TableInfo}; + +/// Reads table data in a specific format and registers it into a DataFusion +/// `SessionContext`. +/// +/// Analogous to Presto's `ConnectorPageSourceProvider` — decoupled from +/// catalog metadata so that format readers are reusable across any catalog. +/// +/// # Extensibility +/// +/// Implement this trait to add support for new data formats: +/// - Parquet (provided) +/// - Delta Lake (provided, behind `delta` feature) +/// - CSV (future) +/// - Iceberg (future) +/// - ORC (future) +#[async_trait] +pub trait TableReader: Send + Sync { + /// Human-readable name of this reader (e.g., "parquet", "delta"). + fn name(&self) -> &str; + + /// The data format(s) this reader can handle. + fn supported_formats(&self) -> &[DataSourceFormat]; + + /// Register a table into a DataFusion `SessionContext` using its storage + /// location. + /// + /// The reader should read (or reference) the data at `table_info.storage_location` + /// and register it as a DataFusion `TableProvider` so it can be queried via SQL. + /// + /// # Arguments + /// + /// * `ctx` - The DataFusion session context to register the table in. + /// * `table_name` - The name to register the table under (already lowercased). + /// * `table_info` - Full table metadata from the catalog, including `storage_location`. + /// * `schema` - Arrow schema derived from the table's column definitions. 
+ async fn register_table( + &self, + ctx: &SessionContext, + table_name: &str, + table_info: &TableInfo, + schema: SchemaRef, + ) -> CatalogResult<()>; +} diff --git a/crates/lance-graph-catalog/src/type_mapping.rs b/crates/lance-graph-catalog/src/type_mapping.rs new file mode 100644 index 00000000..878e2823 --- /dev/null +++ b/crates/lance-graph-catalog/src/type_mapping.rs @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Type mapping from Unity Catalog types to Arrow data types. + +use std::sync::Arc; + +use arrow_schema::{DataType, Field, Schema, SchemaRef}; + +use crate::catalog_provider::{CatalogError, CatalogResult, ColumnInfo}; + +/// Map a Unity Catalog `type_name` string to an Arrow `DataType`. +/// +/// Handles the standard UC type names. Case-insensitive. +pub fn uc_type_to_arrow(type_name: &str) -> CatalogResult { + match type_name.to_uppercase().as_str() { + // Boolean + "BOOLEAN" => Ok(DataType::Boolean), + + // Integer types + "BYTE" | "TINYINT" => Ok(DataType::Int8), + "SHORT" | "SMALLINT" => Ok(DataType::Int16), + "INT" | "INTEGER" => Ok(DataType::Int32), + "LONG" | "BIGINT" => Ok(DataType::Int64), + + // Floating-point types + "FLOAT" | "REAL" => Ok(DataType::Float32), + "DOUBLE" => Ok(DataType::Float64), + + // Decimal (default precision/scale; future: parse from type_text) + "DECIMAL" | "NUMERIC" | "DEC" => Ok(DataType::Decimal128(38, 10)), + + // String types + "STRING" | "VARCHAR" | "CHAR" | "TEXT" => Ok(DataType::Utf8), + + // Binary + "BINARY" => Ok(DataType::Binary), + + // Date and time + "DATE" => Ok(DataType::Date32), + "TIMESTAMP" | "TIMESTAMP_NTZ" => Ok(DataType::Timestamp( + arrow_schema::TimeUnit::Microsecond, + None, + )), + + // Null + "NULL" | "VOID" => Ok(DataType::Null), + + // Complex types — represented as Utf8 (JSON string) for now. + // A future iteration could parse type_text for ARRAY, MAP, STRUCT<...>. 
+ "ARRAY" => Ok(DataType::Utf8), + "MAP" => Ok(DataType::Utf8), + "STRUCT" => Ok(DataType::Utf8), + + other => Err(CatalogError::TypeMappingError(format!( + "Unsupported Unity Catalog type: '{}'", + other + ))), + } +} + +/// Convert a slice of [`ColumnInfo`] to an Arrow [`Schema`]. +/// +/// Columns are sorted by `position` to ensure correct field order. +pub fn columns_to_arrow_schema(columns: &[ColumnInfo]) -> CatalogResult { + let mut sorted: Vec<&ColumnInfo> = columns.iter().collect(); + sorted.sort_by_key(|c| c.position); + + let fields: Vec = sorted + .iter() + .map(|col| { + let data_type = uc_type_to_arrow(&col.type_name)?; + Ok(Field::new(&col.name, data_type, col.nullable)) + }) + .collect::>>()?; + + Ok(Arc::new(Schema::new(fields))) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_type_mappings() { + assert_eq!(uc_type_to_arrow("BOOLEAN").unwrap(), DataType::Boolean); + assert_eq!(uc_type_to_arrow("INT").unwrap(), DataType::Int32); + assert_eq!(uc_type_to_arrow("INTEGER").unwrap(), DataType::Int32); + assert_eq!(uc_type_to_arrow("LONG").unwrap(), DataType::Int64); + assert_eq!(uc_type_to_arrow("BIGINT").unwrap(), DataType::Int64); + assert_eq!(uc_type_to_arrow("FLOAT").unwrap(), DataType::Float32); + assert_eq!(uc_type_to_arrow("DOUBLE").unwrap(), DataType::Float64); + assert_eq!(uc_type_to_arrow("STRING").unwrap(), DataType::Utf8); + assert_eq!(uc_type_to_arrow("VARCHAR").unwrap(), DataType::Utf8); + assert_eq!(uc_type_to_arrow("BINARY").unwrap(), DataType::Binary); + assert_eq!(uc_type_to_arrow("DATE").unwrap(), DataType::Date32); + assert_eq!(uc_type_to_arrow("BYTE").unwrap(), DataType::Int8); + assert_eq!(uc_type_to_arrow("SHORT").unwrap(), DataType::Int16); + } + + #[test] + fn test_timestamp_types() { + assert_eq!( + uc_type_to_arrow("TIMESTAMP").unwrap(), + DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None) + ); + assert_eq!( + uc_type_to_arrow("TIMESTAMP_NTZ").unwrap(), + 
DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None) + ); + } + + #[test] + fn test_decimal() { + assert_eq!( + uc_type_to_arrow("DECIMAL").unwrap(), + DataType::Decimal128(38, 10) + ); + assert_eq!( + uc_type_to_arrow("NUMERIC").unwrap(), + DataType::Decimal128(38, 10) + ); + } + + #[test] + fn test_case_insensitive() { + assert_eq!(uc_type_to_arrow("int").unwrap(), DataType::Int32); + assert_eq!(uc_type_to_arrow("String").unwrap(), DataType::Utf8); + assert_eq!(uc_type_to_arrow("boolean").unwrap(), DataType::Boolean); + } + + #[test] + fn test_complex_types_fallback_to_utf8() { + assert_eq!(uc_type_to_arrow("ARRAY").unwrap(), DataType::Utf8); + assert_eq!(uc_type_to_arrow("MAP").unwrap(), DataType::Utf8); + assert_eq!(uc_type_to_arrow("STRUCT").unwrap(), DataType::Utf8); + } + + #[test] + fn test_unsupported_type() { + let err = uc_type_to_arrow("UNKNOWN_TYPE").unwrap_err(); + assert!(err.to_string().contains("UNKNOWN_TYPE")); + } + + #[test] + fn test_columns_to_schema() { + let columns = vec![ + ColumnInfo { + name: "id".into(), + type_text: "INT".into(), + type_name: "INT".into(), + position: 0, + nullable: false, + comment: None, + }, + ColumnInfo { + name: "name".into(), + type_text: "STRING".into(), + type_name: "STRING".into(), + position: 1, + nullable: true, + comment: None, + }, + ColumnInfo { + name: "age".into(), + type_text: "INT".into(), + type_name: "INT".into(), + position: 2, + nullable: true, + comment: Some("User age".into()), + }, + ]; + let schema = columns_to_arrow_schema(&columns).unwrap(); + assert_eq!(schema.fields().len(), 3); + assert_eq!(schema.field(0).name(), "id"); + assert_eq!(*schema.field(0).data_type(), DataType::Int32); + assert!(!schema.field(0).is_nullable()); + assert_eq!(schema.field(1).name(), "name"); + assert_eq!(*schema.field(1).data_type(), DataType::Utf8); + assert!(schema.field(1).is_nullable()); + } + + #[test] + fn test_columns_sorted_by_position() { + let columns = vec![ + ColumnInfo { + name: 
"b".into(), + type_text: "STRING".into(), + type_name: "STRING".into(), + position: 2, + nullable: true, + comment: None, + }, + ColumnInfo { + name: "a".into(), + type_text: "INT".into(), + type_name: "INT".into(), + position: 0, + nullable: false, + comment: None, + }, + ColumnInfo { + name: "c".into(), + type_text: "DOUBLE".into(), + type_name: "DOUBLE".into(), + position: 1, + nullable: true, + comment: None, + }, + ]; + let schema = columns_to_arrow_schema(&columns).unwrap(); + assert_eq!(schema.field(0).name(), "a"); + assert_eq!(schema.field(1).name(), "c"); + assert_eq!(schema.field(2).name(), "b"); + } + + #[test] + fn test_empty_columns() { + let schema = columns_to_arrow_schema(&[]).unwrap(); + assert_eq!(schema.fields().len(), 0); + } +} diff --git a/crates/lance-graph-catalog/src/unity_catalog.rs b/crates/lance-graph-catalog/src/unity_catalog.rs new file mode 100644 index 00000000..4e18aa12 --- /dev/null +++ b/crates/lance-graph-catalog/src/unity_catalog.rs @@ -0,0 +1,362 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Unity Catalog REST API client implementing the [`CatalogProvider`] trait. +//! +//! Connects to an OSS Unity Catalog server and provides catalog browsing +//! capabilities. This is the first `CatalogProvider` implementation. + +use std::collections::HashMap; + +use async_trait::async_trait; +use reqwest::Client; +use serde::Deserialize; + +use crate::catalog_provider::*; + +/// Configuration for connecting to a Unity Catalog server. +#[derive(Debug, Clone)] +pub struct UnityCatalogConfig { + /// Base URL of the UC server (e.g., `http://localhost:8080/api/2.1/unity-catalog`). + pub base_url: String, + /// Optional bearer token for authenticated access. + pub bearer_token: Option, + /// Optional request timeout in seconds (default: 30). 
+ pub timeout_secs: Option, +} + +impl UnityCatalogConfig { + pub fn new(base_url: impl Into) -> Self { + Self { + base_url: base_url.into().trim_end_matches('/').to_string(), + bearer_token: None, + timeout_secs: None, + } + } + + pub fn with_token(mut self, token: impl Into) -> Self { + self.bearer_token = Some(token.into()); + self + } + + pub fn with_timeout(mut self, secs: u64) -> Self { + self.timeout_secs = Some(secs); + self + } +} + +/// Unity Catalog REST API client. +pub struct UnityCatalogProvider { + config: UnityCatalogConfig, + client: Client, +} + +impl UnityCatalogProvider { + pub fn new(config: UnityCatalogConfig) -> CatalogResult { + let mut builder = Client::builder(); + if let Some(timeout) = config.timeout_secs { + builder = builder.timeout(std::time::Duration::from_secs(timeout)); + } + let client = builder + .build() + .map_err(|e| CatalogError::ConnectionError(format!("Failed to build HTTP client: {}", e)))?; + + Ok(Self { config, client }) + } + + fn request(&self, method: reqwest::Method, path: &str) -> reqwest::RequestBuilder { + let url = format!("{}{}", self.config.base_url, path); + let mut req = self.client.request(method, &url); + if let Some(ref token) = self.config.bearer_token { + req = req.bearer_auth(token); + } + req + } + + async fn handle_response( + &self, + resp: reqwest::Response, + resource_name: &str, + ) -> CatalogResult { + let status = resp.status(); + + if status == reqwest::StatusCode::NOT_FOUND { + return Err(CatalogError::NotFound(format!( + "{} not found", + resource_name + ))); + } + if status == reqwest::StatusCode::UNAUTHORIZED + || status == reqwest::StatusCode::FORBIDDEN + { + let body = resp.text().await.unwrap_or_default(); + return Err(CatalogError::AuthError(format!( + "HTTP {}: {}", + status, body + ))); + } + if !status.is_success() { + let body = resp.text().await.unwrap_or_default(); + return Err(CatalogError::ConnectionError(format!( + "HTTP {}: {}", + status, body + ))); + } + + resp.json::() + 
.await + .map_err(|e| CatalogError::InvalidResponse(e.to_string())) + } +} + +// ---- Serde models for UC REST API JSON responses ---- + +#[derive(Deserialize)] +struct ListCatalogsResponse { + #[serde(default)] + catalogs: Vec, +} + +#[derive(Deserialize)] +struct UcCatalog { + name: String, + comment: Option, + #[serde(default)] + properties: HashMap, + created_at: Option, + updated_at: Option, +} + +#[derive(Deserialize)] +struct ListSchemasResponse { + #[serde(default)] + schemas: Vec, +} + +#[derive(Deserialize)] +struct UcSchema { + name: String, + catalog_name: String, + comment: Option, + #[serde(default)] + properties: HashMap, + created_at: Option, + updated_at: Option, +} + +#[derive(Deserialize)] +struct ListTablesResponse { + #[serde(default)] + tables: Vec, +} + +#[derive(Deserialize)] +struct UcTable { + name: String, + catalog_name: String, + schema_name: String, + table_type: String, + data_source_format: Option, + #[serde(default)] + columns: Vec, + storage_location: Option, + comment: Option, + #[serde(default)] + properties: HashMap, + created_at: Option, + updated_at: Option, +} + +#[derive(Deserialize)] +struct UcColumn { + name: String, + type_text: String, + type_name: String, + position: i32, + #[serde(default = "default_nullable")] + nullable: bool, + comment: Option, +} + +fn default_nullable() -> bool { + true +} + +// ---- Conversion helpers ---- + +impl From for CatalogInfo { + fn from(uc: UcCatalog) -> Self { + CatalogInfo { + name: uc.name, + comment: uc.comment, + properties: uc.properties, + created_at: uc.created_at, + updated_at: uc.updated_at, + } + } +} + +impl From for SchemaInfo { + fn from(uc: UcSchema) -> Self { + SchemaInfo { + name: uc.name, + catalog_name: uc.catalog_name, + comment: uc.comment, + properties: uc.properties, + created_at: uc.created_at, + updated_at: uc.updated_at, + } + } +} + +impl From for TableInfo { + fn from(uc: UcTable) -> Self { + TableInfo { + name: uc.name, + catalog_name: uc.catalog_name, + 
schema_name: uc.schema_name, + table_type: match uc.table_type.as_str() { + "EXTERNAL" => TableType::External, + _ => TableType::Managed, + }, + data_source_format: match uc.data_source_format.as_deref() { + Some("DELTA") => DataSourceFormat::Delta, + Some("PARQUET") => DataSourceFormat::Parquet, + Some("CSV") => DataSourceFormat::Csv, + Some("JSON") => DataSourceFormat::Json, + Some("AVRO") => DataSourceFormat::Avro, + Some("ORC") => DataSourceFormat::Orc, + Some("TEXT") => DataSourceFormat::Text, + Some(other) => DataSourceFormat::Other(other.to_string()), + None => DataSourceFormat::Other("UNKNOWN".to_string()), + }, + columns: uc.columns.into_iter().map(Into::into).collect(), + storage_location: uc.storage_location, + comment: uc.comment, + properties: uc.properties, + created_at: uc.created_at, + updated_at: uc.updated_at, + } + } +} + +impl From for ColumnInfo { + fn from(uc: UcColumn) -> Self { + ColumnInfo { + name: uc.name, + type_text: uc.type_text, + type_name: uc.type_name, + position: uc.position, + nullable: uc.nullable, + comment: uc.comment, + } + } +} + +// ---- CatalogProvider implementation ---- + +#[async_trait] +impl CatalogProvider for UnityCatalogProvider { + fn name(&self) -> &str { + "unity-catalog" + } + + async fn list_catalogs(&self) -> CatalogResult> { + let resp = self + .request(reqwest::Method::GET, "/catalogs") + .send() + .await + .map_err(|e| CatalogError::ConnectionError(e.to_string()))?; + + let body: ListCatalogsResponse = self.handle_response(resp, "catalogs").await?; + Ok(body.catalogs.into_iter().map(Into::into).collect()) + } + + async fn get_catalog(&self, name: &str) -> CatalogResult { + let resp = self + .request(reqwest::Method::GET, &format!("/catalogs/{}", name)) + .send() + .await + .map_err(|e| CatalogError::ConnectionError(e.to_string()))?; + + let body: UcCatalog = self + .handle_response(resp, &format!("catalog '{}'", name)) + .await?; + Ok(body.into()) + } + + async fn list_schemas(&self, catalog_name: &str) -> 
CatalogResult> { + let resp = self + .request(reqwest::Method::GET, "/schemas") + .query(&[("catalog_name", catalog_name)]) + .send() + .await + .map_err(|e| CatalogError::ConnectionError(e.to_string()))?; + + let body: ListSchemasResponse = self + .handle_response(resp, &format!("schemas in '{}'", catalog_name)) + .await?; + Ok(body.schemas.into_iter().map(Into::into).collect()) + } + + async fn get_schema( + &self, + catalog_name: &str, + schema_name: &str, + ) -> CatalogResult { + let full_name = format!("{}.{}", catalog_name, schema_name); + let resp = self + .request(reqwest::Method::GET, &format!("/schemas/{}", full_name)) + .send() + .await + .map_err(|e| CatalogError::ConnectionError(e.to_string()))?; + + let body: UcSchema = self + .handle_response(resp, &format!("schema '{}'", full_name)) + .await?; + Ok(body.into()) + } + + async fn list_tables( + &self, + catalog_name: &str, + schema_name: &str, + ) -> CatalogResult> { + let resp = self + .request(reqwest::Method::GET, "/tables") + .query(&[ + ("catalog_name", catalog_name), + ("schema_name", schema_name), + ]) + .send() + .await + .map_err(|e| CatalogError::ConnectionError(e.to_string()))?; + + let body: ListTablesResponse = self + .handle_response( + resp, + &format!("tables in '{}.{}'", catalog_name, schema_name), + ) + .await?; + Ok(body.tables.into_iter().map(Into::into).collect()) + } + + async fn get_table( + &self, + catalog_name: &str, + schema_name: &str, + table_name: &str, + ) -> CatalogResult { + let full_name = format!("{}.{}.{}", catalog_name, schema_name, table_name); + let resp = self + .request(reqwest::Method::GET, &format!("/tables/{}", full_name)) + .send() + .await + .map_err(|e| CatalogError::ConnectionError(e.to_string()))?; + + let body: UcTable = self + .handle_response(resp, &format!("table '{}'", full_name)) + .await?; + Ok(body.into()) + } +} diff --git a/crates/lance-graph-catalog/tests/unity_catalog_integration.rs 
b/crates/lance-graph-catalog/tests/unity_catalog_integration.rs new file mode 100644 index 00000000..14f2f4db --- /dev/null +++ b/crates/lance-graph-catalog/tests/unity_catalog_integration.rs @@ -0,0 +1,467 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Integration tests for UnityCatalogProvider using wiremock to mock the REST API. + +use std::collections::HashMap; + +use lance_graph_catalog::{ + CatalogProvider, DataSourceFormat, TableType, UnityCatalogConfig, UnityCatalogProvider, +}; +use wiremock::matchers::{method, path, query_param}; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +async fn setup_provider(server: &MockServer) -> UnityCatalogProvider { + let config = UnityCatalogConfig::new(server.uri()); + UnityCatalogProvider::new(config).unwrap() +} + +// ---- list_catalogs ---- + +#[tokio::test] +async fn test_list_catalogs() { + let server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/catalogs")) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "catalogs": [ + { "name": "unity", "comment": "Main catalog" }, + { "name": "staging", "comment": null } + ] + }))) + .mount(&server) + .await; + + let provider = setup_provider(&server).await; + let catalogs = provider.list_catalogs().await.unwrap(); + + assert_eq!(catalogs.len(), 2); + assert_eq!(catalogs[0].name, "unity"); + assert_eq!(catalogs[0].comment.as_deref(), Some("Main catalog")); + assert_eq!(catalogs[1].name, "staging"); + assert_eq!(catalogs[1].comment, None); +} + +#[tokio::test] +async fn test_list_catalogs_empty() { + let server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/catalogs")) + .respond_with( + ResponseTemplate::new(200).set_body_json(serde_json::json!({ "catalogs": [] })), + ) + .mount(&server) + .await; + + let provider = setup_provider(&server).await; + let catalogs = provider.list_catalogs().await.unwrap(); + 
assert!(catalogs.is_empty()); +} + +// ---- get_catalog ---- + +#[tokio::test] +async fn test_get_catalog() { + let server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/catalogs/unity")) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "name": "unity", + "comment": "Main catalog", + "properties": { "owner": "admin" }, + "created_at": 1700000000000_i64, + "updated_at": 1700000001000_i64 + }))) + .mount(&server) + .await; + + let provider = setup_provider(&server).await; + let catalog = provider.get_catalog("unity").await.unwrap(); + + assert_eq!(catalog.name, "unity"); + assert_eq!(catalog.comment.as_deref(), Some("Main catalog")); + assert_eq!(catalog.properties.get("owner").unwrap(), "admin"); + assert_eq!(catalog.created_at, Some(1700000000000)); +} + +#[tokio::test] +async fn test_get_catalog_not_found() { + let server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/catalogs/nonexistent")) + .respond_with(ResponseTemplate::new(404)) + .mount(&server) + .await; + + let provider = setup_provider(&server).await; + let err = provider.get_catalog("nonexistent").await.unwrap_err(); + assert!(err.to_string().contains("not found")); +} + +// ---- list_schemas ---- + +#[tokio::test] +async fn test_list_schemas() { + let server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/schemas")) + .and(query_param("catalog_name", "unity")) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "schemas": [ + { + "name": "default", + "catalog_name": "unity", + "comment": "Default schema" + }, + { + "name": "staging", + "catalog_name": "unity", + "comment": null + } + ] + }))) + .mount(&server) + .await; + + let provider = setup_provider(&server).await; + let schemas = provider.list_schemas("unity").await.unwrap(); + + assert_eq!(schemas.len(), 2); + assert_eq!(schemas[0].name, "default"); + assert_eq!(schemas[0].catalog_name, "unity"); + 
assert_eq!(schemas[0].comment.as_deref(), Some("Default schema")); +} + +// ---- get_schema ---- + +#[tokio::test] +async fn test_get_schema() { + let server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/schemas/unity.default")) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "name": "default", + "catalog_name": "unity", + "comment": "Default schema" + }))) + .mount(&server) + .await; + + let provider = setup_provider(&server).await; + let schema = provider.get_schema("unity", "default").await.unwrap(); + + assert_eq!(schema.name, "default"); + assert_eq!(schema.catalog_name, "unity"); +} + +#[tokio::test] +async fn test_get_schema_not_found() { + let server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/schemas/unity.nonexistent")) + .respond_with(ResponseTemplate::new(404)) + .mount(&server) + .await; + + let provider = setup_provider(&server).await; + let err = provider.get_schema("unity", "nonexistent").await.unwrap_err(); + assert!(err.to_string().contains("not found")); +} + +// ---- list_tables ---- + +#[tokio::test] +async fn test_list_tables() { + let server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/tables")) + .and(query_param("catalog_name", "unity")) + .and(query_param("schema_name", "default")) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "tables": [ + { + "name": "marksheet", + "catalog_name": "unity", + "schema_name": "default", + "table_type": "MANAGED", + "data_source_format": "DELTA", + "columns": [], + "storage_location": "s3://bucket/marksheet", + "comment": "Student marks" + }, + { + "name": "users", + "catalog_name": "unity", + "schema_name": "default", + "table_type": "EXTERNAL", + "data_source_format": "PARQUET", + "columns": [], + "storage_location": "/data/users.parquet" + } + ] + }))) + .mount(&server) + .await; + + let provider = setup_provider(&server).await; + let tables = 
provider.list_tables("unity", "default").await.unwrap(); + + assert_eq!(tables.len(), 2); + assert_eq!(tables[0].name, "marksheet"); + assert_eq!(tables[0].table_type, TableType::Managed); + assert_eq!(tables[0].data_source_format, DataSourceFormat::Delta); + assert_eq!( + tables[0].storage_location.as_deref(), + Some("s3://bucket/marksheet") + ); + assert_eq!(tables[0].comment.as_deref(), Some("Student marks")); + + assert_eq!(tables[1].name, "users"); + assert_eq!(tables[1].table_type, TableType::External); + assert_eq!(tables[1].data_source_format, DataSourceFormat::Parquet); +} + +// ---- get_table with columns ---- + +#[tokio::test] +async fn test_get_table_with_columns() { + let server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/tables/unity.default.marksheet")) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "name": "marksheet", + "catalog_name": "unity", + "schema_name": "default", + "table_type": "MANAGED", + "data_source_format": "DELTA", + "columns": [ + { + "name": "id", + "type_text": "INT", + "type_name": "INT", + "position": 0, + "nullable": false, + "comment": "Primary key" + }, + { + "name": "name", + "type_text": "STRING", + "type_name": "STRING", + "position": 1, + "nullable": true, + "comment": "Student name" + }, + { + "name": "mark", + "type_text": "DOUBLE", + "type_name": "DOUBLE", + "position": 2, + "nullable": true + } + ], + "storage_location": "s3://bucket/marksheet", + "comment": "Student marks", + "properties": { "delta.minReaderVersion": "1" } + }))) + .mount(&server) + .await; + + let provider = setup_provider(&server).await; + let table = provider + .get_table("unity", "default", "marksheet") + .await + .unwrap(); + + assert_eq!(table.name, "marksheet"); + assert_eq!(table.catalog_name, "unity"); + assert_eq!(table.schema_name, "default"); + assert_eq!(table.table_type, TableType::Managed); + assert_eq!(table.data_source_format, DataSourceFormat::Delta); + 
assert_eq!(table.columns.len(), 3); + + assert_eq!(table.columns[0].name, "id"); + assert_eq!(table.columns[0].type_name, "INT"); + assert_eq!(table.columns[0].position, 0); + assert!(!table.columns[0].nullable); + assert_eq!(table.columns[0].comment.as_deref(), Some("Primary key")); + + assert_eq!(table.columns[1].name, "name"); + assert_eq!(table.columns[1].type_name, "STRING"); + assert!(table.columns[1].nullable); + + assert_eq!(table.columns[2].name, "mark"); + assert_eq!(table.columns[2].type_name, "DOUBLE"); + assert_eq!(table.columns[2].comment, None); + + assert_eq!( + table.properties.get("delta.minReaderVersion").unwrap(), + "1" + ); +} + +#[tokio::test] +async fn test_get_table_not_found() { + let server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/tables/unity.default.nonexistent")) + .respond_with(ResponseTemplate::new(404)) + .mount(&server) + .await; + + let provider = setup_provider(&server).await; + let err = provider + .get_table("unity", "default", "nonexistent") + .await + .unwrap_err(); + assert!(err.to_string().contains("not found")); +} + +// ---- table_to_arrow_schema ---- + +#[tokio::test] +async fn test_table_to_arrow_schema() { + let server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/tables/unity.default.test_table")) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "name": "test_table", + "catalog_name": "unity", + "schema_name": "default", + "table_type": "MANAGED", + "data_source_format": "PARQUET", + "columns": [ + { "name": "id", "type_text": "LONG", "type_name": "LONG", "position": 0, "nullable": false }, + { "name": "value", "type_text": "DOUBLE", "type_name": "DOUBLE", "position": 1, "nullable": true }, + { "name": "label", "type_text": "STRING", "type_name": "STRING", "position": 2, "nullable": true } + ], + "storage_location": "/data/test_table" + }))) + .mount(&server) + .await; + + let provider = setup_provider(&server).await; + let table 
= provider + .get_table("unity", "default", "test_table") + .await + .unwrap(); + let schema = provider.table_to_arrow_schema(&table).unwrap(); + + assert_eq!(schema.fields().len(), 3); + assert_eq!(schema.field(0).name(), "id"); + assert_eq!( + *schema.field(0).data_type(), + arrow_schema::DataType::Int64 + ); + assert!(!schema.field(0).is_nullable()); + assert_eq!(schema.field(1).name(), "value"); + assert_eq!( + *schema.field(1).data_type(), + arrow_schema::DataType::Float64 + ); + assert_eq!(schema.field(2).name(), "label"); + assert_eq!(*schema.field(2).data_type(), arrow_schema::DataType::Utf8); +} + +// ---- auth error ---- + +#[tokio::test] +async fn test_auth_error() { + let server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/catalogs")) + .respond_with(ResponseTemplate::new(401).set_body_string("Unauthorized")) + .mount(&server) + .await; + + let provider = setup_provider(&server).await; + let err = provider.list_catalogs().await.unwrap_err(); + assert!(err.to_string().contains("Auth error")); +} + +// ---- bearer token is sent ---- + +#[tokio::test] +async fn test_bearer_token_sent() { + let server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/catalogs")) + .and(wiremock::matchers::header("Authorization", "Bearer my-token")) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "catalogs": [{ "name": "unity" }] + }))) + .mount(&server) + .await; + + let config = UnityCatalogConfig::new(server.uri()).with_token("my-token"); + let provider = UnityCatalogProvider::new(config).unwrap(); + let catalogs = provider.list_catalogs().await.unwrap(); + + assert_eq!(catalogs.len(), 1); + assert_eq!(catalogs[0].name, "unity"); +} + +// ---- data source format parsing ---- + +#[tokio::test] +async fn test_data_source_format_parsing() { + let server = MockServer::start().await; + + Mock::given(method("GET")) + .and(path("/tables")) + 
.respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "tables": [ + { "name": "t1", "catalog_name": "c", "schema_name": "s", "table_type": "MANAGED", "data_source_format": "DELTA" }, + { "name": "t2", "catalog_name": "c", "schema_name": "s", "table_type": "EXTERNAL", "data_source_format": "PARQUET" }, + { "name": "t3", "catalog_name": "c", "schema_name": "s", "table_type": "MANAGED", "data_source_format": "CSV" }, + { "name": "t4", "catalog_name": "c", "schema_name": "s", "table_type": "MANAGED", "data_source_format": "JSON" }, + { "name": "t5", "catalog_name": "c", "schema_name": "s", "table_type": "MANAGED", "data_source_format": "AVRO" }, + { "name": "t6", "catalog_name": "c", "schema_name": "s", "table_type": "MANAGED", "data_source_format": "ORC" }, + { "name": "t7", "catalog_name": "c", "schema_name": "s", "table_type": "MANAGED", "data_source_format": "CUSTOM_FORMAT" }, + { "name": "t8", "catalog_name": "c", "schema_name": "s", "table_type": "MANAGED" } + ] + }))) + .mount(&server) + .await; + + let provider = setup_provider(&server).await; + let tables = provider.list_tables("c", "s").await.unwrap(); + + assert_eq!(tables[0].data_source_format, DataSourceFormat::Delta); + assert_eq!(tables[1].data_source_format, DataSourceFormat::Parquet); + assert_eq!(tables[2].data_source_format, DataSourceFormat::Csv); + assert_eq!(tables[3].data_source_format, DataSourceFormat::Json); + assert_eq!(tables[4].data_source_format, DataSourceFormat::Avro); + assert_eq!(tables[5].data_source_format, DataSourceFormat::Orc); + assert_eq!( + tables[6].data_source_format, + DataSourceFormat::Other("CUSTOM_FORMAT".to_string()) + ); + assert_eq!( + tables[7].data_source_format, + DataSourceFormat::Other("UNKNOWN".to_string()) + ); +} + +// ---- connection error ---- + +#[tokio::test] +async fn test_connection_error_on_bad_url() { + let config = UnityCatalogConfig::new("http://localhost:1") + .with_timeout(1); + let provider = 
UnityCatalogProvider::new(config).unwrap(); + let err = provider.list_catalogs().await.unwrap_err(); + assert!(err.to_string().contains("connection error")); +} diff --git a/crates/lance-graph-python/src/catalog.rs b/crates/lance-graph-python/src/catalog.rs new file mode 100644 index 00000000..255e2635 --- /dev/null +++ b/crates/lance-graph-python/src/catalog.rs @@ -0,0 +1,352 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Python bindings for the catalog integration. + +use std::sync::Arc; + +use lance_graph::sql_catalog::build_context_from_connector; +use lance_graph::{ + CatalogInfo, Connector, SchemaInfo, TableInfo, UnityCatalogConfig, UnityCatalogProvider, +}; +use lance_graph::table_readers::default_table_readers; +use pyo3::exceptions::PyRuntimeError; +use pyo3::prelude::*; +use pyo3::types::{PyDict, PyList}; + +use crate::graph::{graph_error_to_pyerr, SqlEngine}; +use crate::RT; + +// ---- Python wrapper for CatalogInfo ---- + +#[pyclass(name = "CatalogInfo", module = "lance.graph")] +#[derive(Clone)] +pub struct PyCatalogInfo { + inner: CatalogInfo, +} + +#[pymethods] +impl PyCatalogInfo { + #[getter] + fn name(&self) -> &str { + &self.inner.name + } + + #[getter] + fn comment(&self) -> Option<&str> { + self.inner.comment.as_deref() + } + + fn __repr__(&self) -> String { + format!("CatalogInfo(name='{}')", self.inner.name) + } +} + +// ---- Python wrapper for SchemaInfo ---- + +#[pyclass(name = "SchemaInfo", module = "lance.graph")] +#[derive(Clone)] +pub struct PySchemaInfo { + inner: SchemaInfo, +} + +#[pymethods] +impl PySchemaInfo { + #[getter] + fn name(&self) -> &str { + &self.inner.name + } + + #[getter] + fn catalog_name(&self) -> &str { + &self.inner.catalog_name + } + + #[getter] + fn comment(&self) -> Option<&str> { + self.inner.comment.as_deref() + } + + fn __repr__(&self) -> String { + format!( + "SchemaInfo(name='{}.{}')", + self.inner.catalog_name, self.inner.name + ) + } +} + +// 
---- Python wrapper for TableInfo ---- + +#[pyclass(name = "TableInfo", module = "lance.graph")] +#[derive(Clone)] +pub struct PyTableInfo { + inner: TableInfo, +} + +#[pymethods] +impl PyTableInfo { + #[getter] + fn name(&self) -> &str { + &self.inner.name + } + + #[getter] + fn catalog_name(&self) -> &str { + &self.inner.catalog_name + } + + #[getter] + fn schema_name(&self) -> &str { + &self.inner.schema_name + } + + #[getter] + fn table_type(&self) -> &str { + match self.inner.table_type { + lance_graph::TableType::Managed => "MANAGED", + lance_graph::TableType::External => "EXTERNAL", + } + } + + #[getter] + fn data_source_format(&self) -> String { + format!("{:?}", self.inner.data_source_format) + } + + #[getter] + fn storage_location(&self) -> Option<&str> { + self.inner.storage_location.as_deref() + } + + #[getter] + fn comment(&self) -> Option<&str> { + self.inner.comment.as_deref() + } + + #[getter] + fn num_columns(&self) -> usize { + self.inner.columns.len() + } + + /// Return column info as a list of dicts. + fn columns<'py>(&self, py: Python<'py>) -> PyResult> { + let list = PyList::empty(py); + for col in &self.inner.columns { + let d = PyDict::new(py); + d.set_item("name", &col.name)?; + d.set_item("type_text", &col.type_text)?; + d.set_item("type_name", &col.type_name)?; + d.set_item("position", col.position)?; + d.set_item("nullable", col.nullable)?; + d.set_item("comment", col.comment.as_deref())?; + list.append(d)?; + } + Ok(list) + } + + fn __repr__(&self) -> String { + format!( + "TableInfo(name='{}.{}.{}', format={:?}, columns={})", + self.inner.catalog_name, + self.inner.schema_name, + self.inner.name, + self.inner.data_source_format, + self.inner.columns.len() + ) + } +} + +// ---- Python UnityCatalog client ---- + +/// Unity Catalog client for browsing catalog metadata and auto-registering +/// tables into SqlEngine. 
+/// +/// Examples +/// -------- +/// >>> from lance_graph import UnityCatalog +/// >>> uc = UnityCatalog("http://localhost:8080/api/2.1/unity-catalog") +/// >>> catalogs = uc.list_catalogs() +/// >>> engine = uc.create_sql_engine("unity", "default") +/// >>> result = engine.execute("SELECT * FROM my_table LIMIT 10") +#[pyclass(name = "UnityCatalog", module = "lance.graph")] +pub struct PyUnityCatalog { + connector: Arc, +} + +#[pymethods] +impl PyUnityCatalog { + /// Create a new UnityCatalog client. + /// + /// Parameters + /// ---------- + /// base_url : str + /// Base URL of the Unity Catalog server + /// (e.g., "http://localhost:8080/api/2.1/unity-catalog") + /// token : str, optional + /// Bearer token for authentication + /// timeout : int, optional + /// Request timeout in seconds + #[new] + #[pyo3(signature = (base_url, token=None, timeout=None))] + fn new(base_url: &str, token: Option<&str>, timeout: Option) -> PyResult { + let mut config = UnityCatalogConfig::new(base_url); + if let Some(t) = token { + config = config.with_token(t); + } + if let Some(secs) = timeout { + config = config.with_timeout(secs); + } + + let provider = UnityCatalogProvider::new(config) + .map_err(|e| PyRuntimeError::new_err(e.to_string()))?; + let readers = default_table_readers(); + let connector = Connector::new(Arc::new(provider), readers); + + Ok(Self { + connector: Arc::new(connector), + }) + } + + /// List all catalogs. + fn list_catalogs(&self, py: Python) -> PyResult> { + let connector = self.connector.clone(); + let result = RT + .block_on(Some(py), async move { connector.list_catalogs().await })? + .map_err(|e| PyRuntimeError::new_err(e.to_string()))?; + Ok(result + .into_iter() + .map(|c| PyCatalogInfo { inner: c }) + .collect()) + } + + /// List schemas in a catalog. 
+ /// + /// Parameters + /// ---------- + /// catalog_name : str + /// Name of the catalog + fn list_schemas(&self, py: Python, catalog_name: &str) -> PyResult> { + let connector = self.connector.clone(); + let cat = catalog_name.to_string(); + let result = RT + .block_on(Some(py), async move { + connector.list_schemas(&cat).await + })? + .map_err(|e| PyRuntimeError::new_err(e.to_string()))?; + Ok(result + .into_iter() + .map(|s| PySchemaInfo { inner: s }) + .collect()) + } + + /// List tables in a schema. + /// + /// Parameters + /// ---------- + /// catalog_name : str + /// Name of the catalog + /// schema_name : str + /// Name of the schema + fn list_tables( + &self, + py: Python, + catalog_name: &str, + schema_name: &str, + ) -> PyResult> { + let connector = self.connector.clone(); + let cat = catalog_name.to_string(); + let sch = schema_name.to_string(); + let result = RT + .block_on(Some(py), async move { + connector.list_tables(&cat, &sch).await + })? + .map_err(|e| PyRuntimeError::new_err(e.to_string()))?; + Ok(result + .into_iter() + .map(|t| PyTableInfo { inner: t }) + .collect()) + } + + /// Get detailed table info including columns. + /// + /// Parameters + /// ---------- + /// catalog_name : str + /// Name of the catalog + /// schema_name : str + /// Name of the schema + /// table_name : str + /// Name of the table + fn get_table( + &self, + py: Python, + catalog_name: &str, + schema_name: &str, + table_name: &str, + ) -> PyResult { + let connector = self.connector.clone(); + let cat = catalog_name.to_string(); + let sch = schema_name.to_string(); + let tbl = table_name.to_string(); + let result = RT + .block_on(Some(py), async move { + connector.get_table(&cat, &sch, &tbl).await + })? + .map_err(|e| PyRuntimeError::new_err(e.to_string()))?; + Ok(PyTableInfo { inner: result }) + } + + /// Create an SqlEngine with all tables from a UC schema auto-registered. 
+ /// + /// Discovers all tables in the specified catalog.schema, registers them + /// using the appropriate format reader (Parquet, Delta, etc.), and returns + /// an SqlEngine ready for SQL queries. + /// + /// Parameters + /// ---------- + /// catalog_name : str + /// Name of the UC catalog + /// schema_name : str + /// Name of the UC schema + /// + /// Returns + /// ------- + /// SqlEngine + /// An SqlEngine with all UC tables registered, ready for SQL queries. + /// + /// Raises + /// ------ + /// RuntimeError + /// If connection to UC fails or table registration fails + fn create_sql_engine( + &self, + py: Python, + catalog_name: &str, + schema_name: &str, + ) -> PyResult { + let connector = self.connector.clone(); + let cat = catalog_name.to_string(); + let sch = schema_name.to_string(); + + let ctx = RT + .block_on(Some(py), async move { + build_context_from_connector(&connector, &cat, &sch).await + })? + .map_err(graph_error_to_pyerr)?; + + Ok(SqlEngine::from_context(ctx)) + } + + fn __repr__(&self) -> String { + "UnityCatalog(...)".to_string() + } +} + +/// Register catalog classes with the Python module. +pub fn register_catalog_module(_py: Python, module: &Bound<'_, PyModule>) -> PyResult<()> { + module.add_class::()?; + module.add_class::()?; + module.add_class::()?; + module.add_class::()?; + Ok(()) +} diff --git a/crates/lance-graph-python/src/graph.rs b/crates/lance-graph-python/src/graph.rs index 4a9c3c30..2997c8e5 100644 --- a/crates/lance-graph-python/src/graph.rs +++ b/crates/lance-graph-python/src/graph.rs @@ -282,7 +282,7 @@ impl VectorSearch { } /// Convert GraphError to PyErr -fn graph_error_to_pyerr(err: RustGraphError) -> PyErr { +pub(crate) fn graph_error_to_pyerr(err: RustGraphError) -> PyErr { match &err { RustGraphError::ParseError { .. } | RustGraphError::ConfigError { .. } @@ -1533,6 +1533,16 @@ impl SqlEngine { } } +impl SqlEngine { + /// Create SqlEngine from a pre-built SessionContext. 
+ /// Used internally by catalog integration (UnityCatalog.create_sql_engine). + pub(crate) fn from_context(ctx: SessionContext) -> Self { + Self { + context: Arc::new(ctx), + } + } +} + /// Register graph functionality with the Python module pub fn register_graph_module(py: Python, parent_module: &Bound<'_, PyModule>) -> PyResult<()> { let graph_module = PyModule::new(py, "graph")?; @@ -1548,6 +1558,8 @@ pub fn register_graph_module(py: Python, parent_module: &Bound<'_, PyModule>) -> graph_module.add_class::()?; graph_module.add_class::()?; + crate::catalog::register_catalog_module(py, &graph_module)?; + parent_module.add_submodule(&graph_module)?; Ok(()) } diff --git a/crates/lance-graph-python/src/lib.rs b/crates/lance-graph-python/src/lib.rs index d9f440fb..70fa2ab2 100644 --- a/crates/lance-graph-python/src/lib.rs +++ b/crates/lance-graph-python/src/lib.rs @@ -2,6 +2,7 @@ use std::sync::LazyLock; use pyo3::prelude::*; +mod catalog; mod executor; mod graph; mod namespace; diff --git a/crates/lance-graph/Cargo.toml b/crates/lance-graph/Cargo.toml index cc26fc27..782b6511 100644 --- a/crates/lance-graph/Cargo.toml +++ b/crates/lance-graph/Cargo.toml @@ -14,6 +14,7 @@ categories = ["database", "data-structures", "science"] arrow = { version = "56.2", features = ["prettyprint"] } arrow-array = "56.2" arrow-schema = "56.2" +async-trait = "0.1" datafusion = { version = "50.3", default-features = false, features = [ "nested_expressions", "regex_expressions", @@ -22,6 +23,7 @@ datafusion = { version = "50.3", default-features = false, features = [ "encoding_expressions", "datetime_expressions", "string_expressions", + "parquet", ] } datafusion-common = "50.3" datafusion-expr = "50.3" @@ -36,6 +38,13 @@ nom = "7.1" serde = { version = "1", features = ["derive"] } serde_json = "1" snafu = "0.8" +deltalake = { version = "0.29", features = ["datafusion"], optional = true } +url = { version = "2", optional = true } + +[features] +default = ["unity-catalog", "delta"] 
+unity-catalog = ["lance-graph-catalog/unity-catalog"] +delta = ["dep:deltalake", "dep:url"] [dev-dependencies] criterion = { version = "0.5", features = ["async", "async_tokio", "html_reports"] } diff --git a/crates/lance-graph/src/lib.rs b/crates/lance-graph/src/lib.rs index 25d8ebec..b2848eae 100644 --- a/crates/lance-graph/src/lib.rs +++ b/crates/lance-graph/src/lib.rs @@ -47,7 +47,9 @@ pub mod parser; pub mod query; pub mod semantic; pub mod simple_executor; +pub mod sql_catalog; pub mod sql_query; +pub mod table_readers; /// Maximum allowed hops for variable-length relationship expansion (e.g., *1..N) pub const MAX_VARIABLE_LENGTH_HOPS: u32 = 20; @@ -57,6 +59,16 @@ pub use error::{GraphError, Result}; pub use lance_graph_catalog::{ DirNamespace, GraphSourceCatalog, InMemoryCatalog, SimpleTableSource, }; +// Catalog provider re-exports +pub use lance_graph_catalog::{ + CatalogError, CatalogInfo, CatalogProvider, CatalogResult, ColumnInfo, Connector, + DataSourceFormat, SchemaInfo, TableInfo, TableReader, TableType, +}; +#[cfg(feature = "unity-catalog")] +pub use lance_graph_catalog::{UnityCatalogConfig, UnityCatalogProvider}; pub use lance_vector_search::VectorSearch; pub use query::{CypherQuery, ExecutionStrategy}; pub use sql_query::SqlQuery; +pub use table_readers::{default_table_readers, ParquetTableReader}; +#[cfg(feature = "delta")] +pub use table_readers::DeltaTableReader; diff --git a/crates/lance-graph/src/sql_catalog.rs b/crates/lance-graph/src/sql_catalog.rs new file mode 100644 index 00000000..619666aa --- /dev/null +++ b/crates/lance-graph/src/sql_catalog.rs @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Bridge between catalog connectors and the SQL query engine. +//! +//! Provides utilities for building a DataFusion `SessionContext` from a +//! [`Connector`], enabling users to auto-register tables from an external +//! 
catalog and query them via [`SqlQuery`] or [`SqlEngine`]. + +use crate::error::{GraphError, Result}; +use datafusion::execution::context::SessionContext; +use lance_graph_catalog::connector::Connector; + +/// Build a DataFusion `SessionContext` with all tables from a catalog schema +/// auto-registered. +/// +/// This discovers tables in the given catalog/schema via the connector's +/// [`CatalogProvider`] and registers them using the appropriate [`TableReader`] +/// based on each table's data format. +/// +/// # Example +/// +/// ```no_run +/// # use lance_graph::sql_catalog::build_context_from_connector; +/// # use lance_graph_catalog::Connector; +/// # async fn example(connector: &Connector) { +/// let ctx = build_context_from_connector(connector, "unity", "default") +/// .await +/// .unwrap(); +/// // ctx now has all tables from unity.default registered +/// # } +/// ``` +pub async fn build_context_from_connector( + connector: &Connector, + catalog_name: &str, + schema_name: &str, +) -> Result { + let ctx = SessionContext::new(); + connector + .register_schema(&ctx, catalog_name, schema_name) + .await + .map_err(|e| GraphError::PlanError { + message: format!("Failed to register catalog tables: {}", e), + location: snafu::Location::new(file!(), line!(), column!()), + })?; + Ok(ctx) +} diff --git a/crates/lance-graph/src/table_readers.rs b/crates/lance-graph/src/table_readers.rs new file mode 100644 index 00000000..7c20bd72 --- /dev/null +++ b/crates/lance-graph/src/table_readers.rs @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Built-in [`TableReader`] implementations for common data formats. +//! +//! - [`ParquetTableReader`] — reads Parquet tables using DataFusion's built-in support. +//! - [`DeltaTableReader`] — reads Delta Lake tables (behind `delta` feature flag). 
+
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use datafusion::execution::context::SessionContext;
+
+use lance_graph_catalog::catalog_provider::{
+    CatalogError, CatalogResult, DataSourceFormat, TableInfo,
+};
+use lance_graph_catalog::table_reader::TableReader;
+
+/// Reads Parquet tables using DataFusion's built-in `register_parquet()`.
+pub struct ParquetTableReader;
+
+#[async_trait]
+impl TableReader for ParquetTableReader {
+    fn name(&self) -> &str {
+        "parquet"
+    }
+
+    fn supported_formats(&self) -> &[DataSourceFormat] {
+        &[DataSourceFormat::Parquet]
+    }
+
+    /// Registers the table's `storage_location` in `ctx` under `table_name`
+    /// with default Parquet read options.
+    ///
+    /// The catalog-derived `_schema` is not used by this reader.
+    ///
+    /// # Errors
+    ///
+    /// Returns `CatalogError::Other` when the table has no
+    /// `storage_location` or when DataFusion fails to register it.
+    async fn register_table(
+        &self,
+        ctx: &SessionContext,
+        table_name: &str,
+        table_info: &TableInfo,
+        _schema: arrow_schema::SchemaRef,
+    ) -> CatalogResult<()> {
+        let location = table_info.storage_location.as_deref().ok_or_else(|| {
+            CatalogError::Other(format!(
+                "Table '{}' has no storage_location",
+                table_name
+            ))
+        })?;
+
+        ctx.register_parquet(
+            table_name,
+            location,
+            datafusion::datasource::file_format::options::ParquetReadOptions::default(),
+        )
+        .await
+        .map_err(|e| {
+            CatalogError::Other(format!(
+                "Failed to register Parquet table '{}' at '{}': {}",
+                table_name, location, e
+            ))
+        })
+    }
+}
+
+/// Reads Delta Lake tables using the `deltalake` crate.
+///
+/// Opens the Delta table at the storage location with
+/// `deltalake::open_table` and registers the opened table in the DataFusion
+/// `SessionContext` under the given table name. The storage location may be
+/// a URL or an absolute local filesystem path.
+#[cfg(feature = "delta")]
+pub struct DeltaTableReader;
+
+#[cfg(feature = "delta")]
+#[async_trait]
+impl TableReader for DeltaTableReader {
+    fn name(&self) -> &str {
+        "delta"
+    }
+
+    fn supported_formats(&self) -> &[DataSourceFormat] {
+        &[DataSourceFormat::Delta]
+    }
+
+    /// Opens the Delta table at the table's `storage_location` and registers
+    /// it in `ctx` under `table_name`.
+    ///
+    /// The catalog-derived `_schema` is not used by this reader.
+    ///
+    /// # Errors
+    ///
+    /// Returns `CatalogError::Other` when the table has no
+    /// `storage_location`, when the location is neither a valid URL nor an
+    /// absolute filesystem path, when the Delta table cannot be opened, or
+    /// when registration in `ctx` fails.
+    async fn register_table(
+        &self,
+        ctx: &SessionContext,
+        table_name: &str,
+        table_info: &TableInfo,
+        _schema: arrow_schema::SchemaRef,
+    ) -> CatalogResult<()> {
+        let location = table_info.storage_location.as_deref().ok_or_else(|| {
+            CatalogError::Other(format!(
+                "Table '{}' has no storage_location",
+                table_name
+            ))
+        })?;
+
+        // A location without a scheme (e.g. `/data/users`) fails `Url::parse`
+        // with `RelativeUrlWithoutBase`. Treat an absolute filesystem path as
+        // a `file://` URL instead of rejecting it; relative paths still
+        // surface the original parse error.
+        let table_url = url::Url::parse(location)
+            .or_else(|parse_err| match parse_err {
+                url::ParseError::RelativeUrlWithoutBase => url::Url::from_file_path(location)
+                    .map_err(|()| url::ParseError::RelativeUrlWithoutBase),
+                other => Err(other),
+            })
+            .map_err(|e| {
+                CatalogError::Other(format!(
+                    "Invalid storage location URL '{}': {}",
+                    location, e
+                ))
+            })?;
+
+        let delta_table = deltalake::open_table(table_url).await.map_err(|e| {
+            CatalogError::Other(format!(
+                "Failed to open Delta table '{}' at '{}': {}",
+                table_name, location, e
+            ))
+        })?;
+
+        ctx.register_table(table_name, Arc::new(delta_table))
+            .map_err(|e| {
+                CatalogError::Other(format!(
+                    "Failed to register Delta table '{}': {}",
+                    table_name, e
+                ))
+            })?;
+
+        Ok(())
+    }
+}
+
+/// Returns the default set of table readers.
+///
+/// Includes Parquet support, and Delta Lake support when the `delta` feature is enabled.
+pub fn default_table_readers() -> Vec> { + let mut readers: Vec> = vec![Arc::new(ParquetTableReader)]; + #[cfg(feature = "delta")] + readers.push(Arc::new(DeltaTableReader)); + readers +} diff --git a/python/python/lance_graph/__init__.py b/python/python/lance_graph/__init__.py index 4a348fd8..1d463c7d 100644 --- a/python/python/lance_graph/__init__.py +++ b/python/python/lance_graph/__init__.py @@ -88,6 +88,11 @@ def _load_dev_build() -> ModuleType: DirNamespace = _bindings.graph.DirNamespace +UnityCatalog = _bindings.graph.UnityCatalog +CatalogInfo = _bindings.graph.CatalogInfo +SchemaInfo = _bindings.graph.SchemaInfo +TableInfo = _bindings.graph.TableInfo + __all__ = [ "GraphConfig", "GraphConfigBuilder", @@ -99,6 +104,10 @@ def _load_dev_build() -> ModuleType: "VectorSearch", "DistanceMetric", "DirNamespace", + "UnityCatalog", + "CatalogInfo", + "SchemaInfo", + "TableInfo", ] __version__ = _bindings.__version__ diff --git a/python/python/tests/test_unity_catalog.py b/python/python/tests/test_unity_catalog.py new file mode 100644 index 00000000..bafabb35 --- /dev/null +++ b/python/python/tests/test_unity_catalog.py @@ -0,0 +1,167 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +"""Tests for Unity Catalog integration. + +Unit tests verify the Python binding classes work correctly. +Integration tests (marked with @pytest.mark.integration) require a running +Unity Catalog server and are skipped by default. 
+""" + +import pytest +from lance_graph import CatalogInfo, SchemaInfo, SqlEngine, TableInfo, UnityCatalog + + +# ========================================================================== +# Unit tests — verify Python class construction and repr +# ========================================================================== + + +class TestUnityCatalogConstruction: + def test_create_client(self): + """UnityCatalog client can be constructed.""" + uc = UnityCatalog("http://localhost:8080/api/2.1/unity-catalog") + assert repr(uc) == "UnityCatalog(...)" + + def test_create_client_with_token(self): + uc = UnityCatalog( + "http://localhost:8080/api/2.1/unity-catalog", + token="my-secret-token", + ) + assert repr(uc) == "UnityCatalog(...)" + + def test_create_client_with_timeout(self): + uc = UnityCatalog( + "http://localhost:8080/api/2.1/unity-catalog", + timeout=60, + ) + assert repr(uc) == "UnityCatalog(...)" + + def test_create_client_with_all_options(self): + uc = UnityCatalog( + "http://localhost:8080/api/2.1/unity-catalog", + token="tok", + timeout=30, + ) + assert repr(uc) == "UnityCatalog(...)" + + +class TestConnectionErrors: + def test_list_catalogs_connection_refused(self): + """Connecting to a non-existent server raises RuntimeError.""" + uc = UnityCatalog("http://localhost:1/api/2.1/unity-catalog", timeout=1) + with pytest.raises(RuntimeError, match="connection error|Connection refused|error"): + uc.list_catalogs() + + def test_list_schemas_connection_refused(self): + uc = UnityCatalog("http://localhost:1/api/2.1/unity-catalog", timeout=1) + with pytest.raises(RuntimeError): + uc.list_schemas("unity") + + def test_list_tables_connection_refused(self): + uc = UnityCatalog("http://localhost:1/api/2.1/unity-catalog", timeout=1) + with pytest.raises(RuntimeError): + uc.list_tables("unity", "default") + + def test_get_table_connection_refused(self): + uc = UnityCatalog("http://localhost:1/api/2.1/unity-catalog", timeout=1) + with pytest.raises(RuntimeError): + 
uc.get_table("unity", "default", "marksheet") + + def test_create_sql_engine_connection_refused(self): + uc = UnityCatalog("http://localhost:1/api/2.1/unity-catalog", timeout=1) + with pytest.raises((RuntimeError, ValueError)): + uc.create_sql_engine("unity", "default") + + +# ========================================================================== +# Integration tests — require a running UC server +# ========================================================================== + + +def _uc_available(): + """Check if a UC server is running on localhost:8080.""" + try: + uc = UnityCatalog( + "http://localhost:8080/api/2.1/unity-catalog", timeout=2 + ) + uc.list_catalogs() + return True + except Exception: + return False + + +@pytest.fixture +def uc(): + """Connect to local OSS Unity Catalog.""" + if not _uc_available(): + pytest.skip("Unity Catalog server not available on localhost:8080") + return UnityCatalog("http://localhost:8080/api/2.1/unity-catalog") + + +@pytest.mark.integration +class TestUnityCatalogBrowsing: + def test_list_catalogs(self, uc): + catalogs = uc.list_catalogs() + assert len(catalogs) > 0 + assert isinstance(catalogs[0], CatalogInfo) + assert catalogs[0].name + assert "CatalogInfo" in repr(catalogs[0]) + + def test_list_schemas(self, uc): + catalogs = uc.list_catalogs() + schemas = uc.list_schemas(catalogs[0].name) + assert len(schemas) > 0 + assert isinstance(schemas[0], SchemaInfo) + assert schemas[0].name + assert schemas[0].catalog_name == catalogs[0].name + assert "SchemaInfo" in repr(schemas[0]) + + def test_list_tables(self, uc): + catalogs = uc.list_catalogs() + schemas = uc.list_schemas(catalogs[0].name) + tables = uc.list_tables(catalogs[0].name, schemas[0].name) + assert len(tables) > 0 + assert isinstance(tables[0], TableInfo) + assert tables[0].name + assert tables[0].catalog_name == catalogs[0].name + assert tables[0].schema_name == schemas[0].name + assert tables[0].num_columns >= 0 + assert "TableInfo" in repr(tables[0]) + + def 
test_get_table(self, uc): + catalogs = uc.list_catalogs() + schemas = uc.list_schemas(catalogs[0].name) + tables = uc.list_tables(catalogs[0].name, schemas[0].name) + if not tables: + pytest.skip("No tables in the first schema") + + table = uc.get_table( + catalogs[0].name, schemas[0].name, tables[0].name + ) + assert table.name == tables[0].name + assert table.num_columns > 0 + cols = table.columns() + assert len(cols) == table.num_columns + assert all("name" in c and "type_name" in c for c in cols) + + def test_table_info_properties(self, uc): + catalogs = uc.list_catalogs() + schemas = uc.list_schemas(catalogs[0].name) + tables = uc.list_tables(catalogs[0].name, schemas[0].name) + if not tables: + pytest.skip("No tables") + + t = tables[0] + assert t.table_type in ("MANAGED", "EXTERNAL") + assert isinstance(t.data_source_format, str) + + +@pytest.mark.integration +class TestUnityCatalogSqlEngine: + def test_create_sql_engine(self, uc): + catalogs = uc.list_catalogs() + schemas = uc.list_schemas(catalogs[0].name) + engine = uc.create_sql_engine(catalogs[0].name, schemas[0].name) + assert isinstance(engine, SqlEngine) + assert "SqlEngine" in repr(engine) diff --git a/python/uv.lock b/python/uv.lock index a535bd3c..9555999c 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -1241,7 +1241,7 @@ wheels = [ [[package]] name = "lance-graph" -version = "0.5.2" +version = "0.5.3" source = { editable = "." 
} dependencies = [ { name = "fastapi" }, From 761b90887ad574e82b12593210b4a0c3b05fb4a8 Mon Sep 17 00:00:00 2001 From: "jianjian.xie" Date: Thu, 26 Feb 2026 23:35:04 -0800 Subject: [PATCH 4/7] style: fix formatting and clippy warnings - cargo fmt fixes across all new files - Replace EnumName::Variant with Self::Variant (clippy::unnecessary_structure_name_repetition) - Fix Python import sorting and line length (ruff) --- .../src/catalog_provider.rs | 18 +++++------- crates/lance-graph-catalog/src/connector.rs | 10 +++---- .../lance-graph-catalog/src/unity_catalog.rs | 29 +++++++------------ .../tests/unity_catalog_integration.rs | 23 +++++++-------- crates/lance-graph/src/lib.rs | 2 +- crates/lance-graph/src/table_readers.rs | 10 ++----- python/python/tests/test_unity_catalog.py | 13 ++++----- 7 files changed, 41 insertions(+), 64 deletions(-) diff --git a/crates/lance-graph-catalog/src/catalog_provider.rs b/crates/lance-graph-catalog/src/catalog_provider.rs index b0e75580..9bc802c1 100644 --- a/crates/lance-graph-catalog/src/catalog_provider.rs +++ b/crates/lance-graph-catalog/src/catalog_provider.rs @@ -103,12 +103,12 @@ pub enum CatalogError { impl std::fmt::Display for CatalogError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - CatalogError::ConnectionError(msg) => write!(f, "Catalog connection error: {}", msg), - CatalogError::NotFound(msg) => write!(f, "Not found: {}", msg), - CatalogError::AuthError(msg) => write!(f, "Auth error: {}", msg), - CatalogError::InvalidResponse(msg) => write!(f, "Invalid response: {}", msg), - CatalogError::TypeMappingError(msg) => write!(f, "Type mapping error: {}", msg), - CatalogError::Other(msg) => write!(f, "Catalog error: {}", msg), + Self::ConnectionError(msg) => write!(f, "Catalog connection error: {}", msg), + Self::NotFound(msg) => write!(f, "Not found: {}", msg), + Self::AuthError(msg) => write!(f, "Auth error: {}", msg), + Self::InvalidResponse(msg) => write!(f, "Invalid response: 
{}", msg), + Self::TypeMappingError(msg) => write!(f, "Type mapping error: {}", msg), + Self::Other(msg) => write!(f, "Catalog error: {}", msg), } } } @@ -145,11 +145,7 @@ pub trait CatalogProvider: Send + Sync { async fn list_schemas(&self, catalog_name: &str) -> CatalogResult>; /// Get information about a specific schema. - async fn get_schema( - &self, - catalog_name: &str, - schema_name: &str, - ) -> CatalogResult; + async fn get_schema(&self, catalog_name: &str, schema_name: &str) -> CatalogResult; /// List all tables within a schema. async fn list_tables( diff --git a/crates/lance-graph-catalog/src/connector.rs b/crates/lance-graph-catalog/src/connector.rs index 8be44f95..2f03ec87 100644 --- a/crates/lance-graph-catalog/src/connector.rs +++ b/crates/lance-graph-catalog/src/connector.rs @@ -13,7 +13,10 @@ use arrow_schema::SchemaRef; use datafusion::datasource::MemTable; use datafusion::execution::context::SessionContext; -use crate::catalog_provider::{CatalogError, CatalogInfo, CatalogProvider, CatalogResult, DataSourceFormat, SchemaInfo, TableInfo}; +use crate::catalog_provider::{ + CatalogError, CatalogInfo, CatalogProvider, CatalogResult, DataSourceFormat, SchemaInfo, + TableInfo, +}; use crate::table_reader::TableReader; /// Bundles a [`CatalogProvider`] with [`TableReader`]s for convenient use. @@ -46,10 +49,7 @@ pub struct Connector { impl Connector { /// Create a new connector with the given catalog provider and table readers. 
- pub fn new( - catalog: Arc, - readers: Vec>, - ) -> Self { + pub fn new(catalog: Arc, readers: Vec>) -> Self { Self { catalog, readers } } diff --git a/crates/lance-graph-catalog/src/unity_catalog.rs b/crates/lance-graph-catalog/src/unity_catalog.rs index 4e18aa12..9a4a7722 100644 --- a/crates/lance-graph-catalog/src/unity_catalog.rs +++ b/crates/lance-graph-catalog/src/unity_catalog.rs @@ -57,9 +57,9 @@ impl UnityCatalogProvider { if let Some(timeout) = config.timeout_secs { builder = builder.timeout(std::time::Duration::from_secs(timeout)); } - let client = builder - .build() - .map_err(|e| CatalogError::ConnectionError(format!("Failed to build HTTP client: {}", e)))?; + let client = builder.build().map_err(|e| { + CatalogError::ConnectionError(format!("Failed to build HTTP client: {}", e)) + })?; Ok(Self { config, client }) } @@ -86,9 +86,7 @@ impl UnityCatalogProvider { resource_name ))); } - if status == reqwest::StatusCode::UNAUTHORIZED - || status == reqwest::StatusCode::FORBIDDEN - { + if status == reqwest::StatusCode::UNAUTHORIZED || status == reqwest::StatusCode::FORBIDDEN { let body = resp.text().await.unwrap_or_default(); return Err(CatalogError::AuthError(format!( "HTTP {}: {}", @@ -186,7 +184,7 @@ fn default_nullable() -> bool { impl From for CatalogInfo { fn from(uc: UcCatalog) -> Self { - CatalogInfo { + Self { name: uc.name, comment: uc.comment, properties: uc.properties, @@ -198,7 +196,7 @@ impl From for CatalogInfo { impl From for SchemaInfo { fn from(uc: UcSchema) -> Self { - SchemaInfo { + Self { name: uc.name, catalog_name: uc.catalog_name, comment: uc.comment, @@ -211,7 +209,7 @@ impl From for SchemaInfo { impl From for TableInfo { fn from(uc: UcTable) -> Self { - TableInfo { + Self { name: uc.name, catalog_name: uc.catalog_name, schema_name: uc.schema_name, @@ -242,7 +240,7 @@ impl From for TableInfo { impl From for ColumnInfo { fn from(uc: UcColumn) -> Self { - ColumnInfo { + Self { name: uc.name, type_text: uc.type_text, type_name: 
uc.type_name, @@ -299,11 +297,7 @@ impl CatalogProvider for UnityCatalogProvider { Ok(body.schemas.into_iter().map(Into::into).collect()) } - async fn get_schema( - &self, - catalog_name: &str, - schema_name: &str, - ) -> CatalogResult { + async fn get_schema(&self, catalog_name: &str, schema_name: &str) -> CatalogResult { let full_name = format!("{}.{}", catalog_name, schema_name); let resp = self .request(reqwest::Method::GET, &format!("/schemas/{}", full_name)) @@ -324,10 +318,7 @@ impl CatalogProvider for UnityCatalogProvider { ) -> CatalogResult> { let resp = self .request(reqwest::Method::GET, "/tables") - .query(&[ - ("catalog_name", catalog_name), - ("schema_name", schema_name), - ]) + .query(&[("catalog_name", catalog_name), ("schema_name", schema_name)]) .send() .await .map_err(|e| CatalogError::ConnectionError(e.to_string()))?; diff --git a/crates/lance-graph-catalog/tests/unity_catalog_integration.rs b/crates/lance-graph-catalog/tests/unity_catalog_integration.rs index 14f2f4db..74a7b568 100644 --- a/crates/lance-graph-catalog/tests/unity_catalog_integration.rs +++ b/crates/lance-graph-catalog/tests/unity_catalog_integration.rs @@ -171,7 +171,10 @@ async fn test_get_schema_not_found() { .await; let provider = setup_provider(&server).await; - let err = provider.get_schema("unity", "nonexistent").await.unwrap_err(); + let err = provider + .get_schema("unity", "nonexistent") + .await + .unwrap_err(); assert!(err.to_string().contains("not found")); } @@ -302,10 +305,7 @@ async fn test_get_table_with_columns() { assert_eq!(table.columns[2].type_name, "DOUBLE"); assert_eq!(table.columns[2].comment, None); - assert_eq!( - table.properties.get("delta.minReaderVersion").unwrap(), - "1" - ); + assert_eq!(table.properties.get("delta.minReaderVersion").unwrap(), "1"); } #[tokio::test] @@ -359,10 +359,7 @@ async fn test_table_to_arrow_schema() { assert_eq!(schema.fields().len(), 3); assert_eq!(schema.field(0).name(), "id"); - assert_eq!( - 
*schema.field(0).data_type(), - arrow_schema::DataType::Int64 - ); + assert_eq!(*schema.field(0).data_type(), arrow_schema::DataType::Int64); assert!(!schema.field(0).is_nullable()); assert_eq!(schema.field(1).name(), "value"); assert_eq!( @@ -398,7 +395,10 @@ async fn test_bearer_token_sent() { Mock::given(method("GET")) .and(path("/catalogs")) - .and(wiremock::matchers::header("Authorization", "Bearer my-token")) + .and(wiremock::matchers::header( + "Authorization", + "Bearer my-token", + )) .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ "catalogs": [{ "name": "unity" }] }))) @@ -459,8 +459,7 @@ async fn test_data_source_format_parsing() { #[tokio::test] async fn test_connection_error_on_bad_url() { - let config = UnityCatalogConfig::new("http://localhost:1") - .with_timeout(1); + let config = UnityCatalogConfig::new("http://localhost:1").with_timeout(1); let provider = UnityCatalogProvider::new(config).unwrap(); let err = provider.list_catalogs().await.unwrap_err(); assert!(err.to_string().contains("connection error")); diff --git a/crates/lance-graph/src/lib.rs b/crates/lance-graph/src/lib.rs index b2848eae..a3e5778a 100644 --- a/crates/lance-graph/src/lib.rs +++ b/crates/lance-graph/src/lib.rs @@ -69,6 +69,6 @@ pub use lance_graph_catalog::{UnityCatalogConfig, UnityCatalogProvider}; pub use lance_vector_search::VectorSearch; pub use query::{CypherQuery, ExecutionStrategy}; pub use sql_query::SqlQuery; -pub use table_readers::{default_table_readers, ParquetTableReader}; #[cfg(feature = "delta")] pub use table_readers::DeltaTableReader; +pub use table_readers::{default_table_readers, ParquetTableReader}; diff --git a/crates/lance-graph/src/table_readers.rs b/crates/lance-graph/src/table_readers.rs index 7c20bd72..52314024 100644 --- a/crates/lance-graph/src/table_readers.rs +++ b/crates/lance-graph/src/table_readers.rs @@ -37,10 +37,7 @@ impl TableReader for ParquetTableReader { _schema: arrow_schema::SchemaRef, ) -> CatalogResult<()> 
{ let location = table_info.storage_location.as_deref().ok_or_else(|| { - CatalogError::Other(format!( - "Table '{}' has no storage_location", - table_name - )) + CatalogError::Other(format!("Table '{}' has no storage_location", table_name)) })?; ctx.register_parquet( @@ -85,10 +82,7 @@ impl TableReader for DeltaTableReader { _schema: arrow_schema::SchemaRef, ) -> CatalogResult<()> { let location = table_info.storage_location.as_deref().ok_or_else(|| { - CatalogError::Other(format!( - "Table '{}' has no storage_location", - table_name - )) + CatalogError::Other(format!("Table '{}' has no storage_location", table_name)) })?; let table_url = url::Url::parse(location).map_err(|e| { diff --git a/python/python/tests/test_unity_catalog.py b/python/python/tests/test_unity_catalog.py index bafabb35..2d593b34 100644 --- a/python/python/tests/test_unity_catalog.py +++ b/python/python/tests/test_unity_catalog.py @@ -11,7 +11,6 @@ import pytest from lance_graph import CatalogInfo, SchemaInfo, SqlEngine, TableInfo, UnityCatalog - # ========================================================================== # Unit tests — verify Python class construction and repr # ========================================================================== @@ -50,7 +49,9 @@ class TestConnectionErrors: def test_list_catalogs_connection_refused(self): """Connecting to a non-existent server raises RuntimeError.""" uc = UnityCatalog("http://localhost:1/api/2.1/unity-catalog", timeout=1) - with pytest.raises(RuntimeError, match="connection error|Connection refused|error"): + with pytest.raises( + RuntimeError, match="connection error|Connection refused|error" + ): uc.list_catalogs() def test_list_schemas_connection_refused(self): @@ -82,9 +83,7 @@ def test_create_sql_engine_connection_refused(self): def _uc_available(): """Check if a UC server is running on localhost:8080.""" try: - uc = UnityCatalog( - "http://localhost:8080/api/2.1/unity-catalog", timeout=2 - ) + uc = 
UnityCatalog("http://localhost:8080/api/2.1/unity-catalog", timeout=2) uc.list_catalogs() return True except Exception: @@ -136,9 +135,7 @@ def test_get_table(self, uc): if not tables: pytest.skip("No tables in the first schema") - table = uc.get_table( - catalogs[0].name, schemas[0].name, tables[0].name - ) + table = uc.get_table(catalogs[0].name, schemas[0].name, tables[0].name) assert table.name == tables[0].name assert table.num_columns > 0 cols = table.columns() From 1bc69e48d9008c8ee8640933127735479413f0b7 Mon Sep 17 00:00:00 2001 From: "jianjian.xie" Date: Thu, 26 Feb 2026 23:37:12 -0800 Subject: [PATCH 5/7] =?UTF-8?q?fix:=20typo=20unparseable=20=E2=86=92=20unp?= =?UTF-8?q?arsable?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/lance-graph-catalog/src/catalog_provider.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/lance-graph-catalog/src/catalog_provider.rs b/crates/lance-graph-catalog/src/catalog_provider.rs index 9bc802c1..d866a990 100644 --- a/crates/lance-graph-catalog/src/catalog_provider.rs +++ b/crates/lance-graph-catalog/src/catalog_provider.rs @@ -92,7 +92,7 @@ pub enum CatalogError { NotFound(String), /// Authentication or authorization failure. AuthError(String), - /// Invalid or unparseable response from the catalog server. + /// Invalid or unparsable response from the catalog server. InvalidResponse(String), /// Failed to map a catalog type to an Arrow type. 
TypeMappingError(String), From 2b1d32d858a1fc1a1022346d72b4e3b786ff9ae3 Mon Sep 17 00:00:00 2001 From: "jianjian.xie" Date: Fri, 27 Feb 2026 00:23:36 -0800 Subject: [PATCH 6/7] feat(catalog): add storage options for cloud storage support (S3, Azure, GCS) - Add `storage_options` parameter to `TableReader::register_table()` trait - `Connector::with_storage_options()` stores credentials and passes them to table readers during registration - `DeltaTableReader` uses `open_table_with_storage_options()` when storage options are provided - Enable deltalake cloud features: s3, azure, gcs - Python: `UnityCatalog(url, storage_options={...})` accepts cloud creds Usage: uc = UnityCatalog( "http://localhost:8080/api/2.1/unity-catalog", storage_options={ "azure_storage_account_name": "myaccount", "azure_storage_account_key": "...", } ) engine = uc.create_sql_engine("unity", "default") --- Cargo.lock | 333 +++++++++++++++--- crates/lance-graph-catalog/src/connector.rs | 52 ++- .../lance-graph-catalog/src/table_reader.rs | 5 + crates/lance-graph-python/src/catalog.rs | 29 +- crates/lance-graph/Cargo.toml | 2 +- crates/lance-graph/src/table_readers.rs | 12 +- 6 files changed, 373 insertions(+), 60 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b5ab81ae..4670bee0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -202,7 +202,7 @@ dependencies = [ "arrow-schema", "arrow-select", "atoi", - "base64", + "base64 0.22.1", "chrono", "comfy-table", "half", @@ -548,6 +548,28 @@ dependencies = [ "uuid", ] +[[package]] +name = "aws-sdk-dynamodb" +version = "1.93.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d5b0656080dc4061db88742d2426fc09369107eee2485dfedbc7098a04f21d1" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "regex-lite", + "tracing", +] + 
[[package]] name = "aws-sdk-sso" version = "1.84.0" @@ -677,17 +699,23 @@ dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", "aws-smithy-types", - "h2", + "h2 0.3.27", + "h2 0.4.12", + "http 0.2.12", "http 1.3.1", - "hyper", - "hyper-rustls", + "http-body 0.4.6", + "hyper 0.14.32", + "hyper 1.7.0", + "hyper-rustls 0.24.2", + "hyper-rustls 0.27.7", "hyper-util", "pin-project-lite", - "rustls", - "rustls-native-certs", + "rustls 0.21.12", + "rustls 0.23.32", + "rustls-native-certs 0.8.1", "rustls-pki-types", "tokio", - "tokio-rustls", + "tokio-rustls 0.26.4", "tower", "tracing", ] @@ -770,6 +798,7 @@ dependencies = [ "base64-simd", "bytes", "bytes-utils", + "futures-core", "http 0.2.12", "http 1.3.1", "http-body 0.4.6", @@ -782,6 +811,8 @@ dependencies = [ "ryu", "serde", "time", + "tokio", + "tokio-util", ] [[package]] @@ -818,6 +849,12 @@ dependencies = [ "tokio", ] +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + [[package]] name = "base64" version = "0.22.1" @@ -1688,7 +1725,7 @@ dependencies = [ "ahash", "arrow", "arrow-ipc", - "base64", + "base64 0.22.1", "chrono", "half", "hashbrown 0.14.5", @@ -1904,7 +1941,7 @@ checksum = "7de2782136bd6014670fd84fe3b0ca3b3e4106c96403c3ae05c0598577139977" dependencies = [ "arrow", "arrow-buffer", - "base64", + "base64 0.22.1", "blake2", "blake3", "chrono", @@ -2332,7 +2369,50 @@ checksum = "09169ef5ecf35911f5f1c3117844a4e00da1edcce58fe8593a237761525f6e3a" dependencies = [ "ctor", "delta_kernel", + "deltalake-aws", + "deltalake-azure", + "deltalake-core", + "deltalake-gcp", +] + +[[package]] +name = "deltalake-aws" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0210d644f4ab27e6d477da99e4b4bf0c7d739fd399ac38c005b6d0dfa4fe132" +dependencies = [ + "async-trait", + "aws-config", + "aws-credential-types", + 
"aws-sdk-dynamodb", + "aws-sdk-sts", + "aws-smithy-runtime-api", + "backon", + "bytes", + "chrono", + "deltalake-core", + "futures", + "object_store", + "regex", + "thiserror 2.0.17", + "tokio", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "deltalake-azure" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b21941be5334ce9725054198463e2f382a5d3945eb7679ae1ba4b9fde91ef10e" +dependencies = [ + "bytes", "deltalake-core", + "object_store", + "thiserror 2.0.17", + "tokio", + "url", ] [[package]] @@ -2364,6 +2444,7 @@ dependencies = [ "dirs", "either", "futures", + "humantime", "indexmap", "itertools 0.14.0", "num_cpus", @@ -2400,6 +2481,23 @@ dependencies = [ "syn 2.0.106", ] +[[package]] +name = "deltalake-gcp" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df2708856cfa92e8309141fda1ee8d41e174b0868f26e557d0b2ea8d30fb92e" +dependencies = [ + "async-trait", + "bytes", + "deltalake-core", + "futures", + "object_store", + "thiserror 2.0.17", + "tokio", + "tracing", + "url", +] + [[package]] name = "der" version = "0.7.10" @@ -2983,6 +3081,25 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "h2" +version = "0.3.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0beca50380b1fc32983fc1cb4587bfa4bb9e78fc259aad4a0032d2080309222d" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http 0.2.12", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "h2" version = "0.4.12" @@ -3175,6 +3292,30 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" +[[package]] +name = "hyper" +version = "0.14.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" 
+dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2 0.3.27", + "http 0.2.12", + "http-body 0.4.6", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2 0.5.10", + "tokio", + "tower-service", + "tracing", + "want", +] + [[package]] name = "hyper" version = "1.7.0" @@ -3185,7 +3326,7 @@ dependencies = [ "bytes", "futures-channel", "futures-core", - "h2", + "h2 0.4.12", "http 1.3.1", "http-body 1.0.1", "httparse", @@ -3198,6 +3339,22 @@ dependencies = [ "want", ] +[[package]] +name = "hyper-rustls" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" +dependencies = [ + "futures-util", + "http 0.2.12", + "hyper 0.14.32", + "log", + "rustls 0.21.12", + "rustls-native-certs 0.6.3", + "tokio", + "tokio-rustls 0.24.1", +] + [[package]] name = "hyper-rustls" version = "0.27.7" @@ -3205,13 +3362,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" dependencies = [ "http 1.3.1", - "hyper", + "hyper 1.7.0", "hyper-util", - "rustls", - "rustls-native-certs", + "rustls 0.23.32", + "rustls-native-certs 0.8.1", "rustls-pki-types", "tokio", - "tokio-rustls", + "tokio-rustls 0.26.4", "tower-service", "webpki-roots", ] @@ -3224,7 +3381,7 @@ checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" dependencies = [ "bytes", "http-body-util", - "hyper", + "hyper 1.7.0", "hyper-util", "native-tls", "tokio", @@ -3238,19 +3395,19 @@ version = "0.1.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c6995591a8f1380fcb4ba966a252a4b29188d51d2b89e3a252f5305be65aea8" dependencies = [ - "base64", + "base64 0.22.1", "bytes", "futures-channel", "futures-core", "futures-util", "http 1.3.1", "http-body 1.0.1", - "hyper", + "hyper 1.7.0", "ipnet", "libc", "percent-encoding", 
"pin-project-lite", - "socket2", + "socket2 0.6.0", "system-configuration", "tokio", "tower-service", @@ -3638,7 +3795,7 @@ version = "9.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a87cc7a48537badeae96744432de36f4be2b4a34a05a5ef32e9dd8a1c169dde" dependencies = [ - "base64", + "base64 0.22.1", "js-sys", "pem", "ring", @@ -4551,7 +4708,7 @@ dependencies = [ "openssl-probe 0.2.1", "openssl-sys", "schannel", - "security-framework", + "security-framework 3.5.1", "security-framework-sys", "tempfile", ] @@ -4744,7 +4901,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c1be0c6c22ec0817cdc77d3842f721a17fd30ab6965001415b5402a74e6b740" dependencies = [ "async-trait", - "base64", + "base64 0.22.1", "bytes", "chrono", "form_urlencoded", @@ -4753,7 +4910,7 @@ dependencies = [ "http-body-util", "httparse", "humantime", - "hyper", + "hyper 1.7.0", "itertools 0.14.0", "md-5", "parking_lot", @@ -4762,7 +4919,7 @@ dependencies = [ "rand 0.9.2", "reqwest", "ring", - "rustls-pemfile", + "rustls-pemfile 2.2.0", "serde", "serde_json", "serde_urlencoded", @@ -4817,7 +4974,7 @@ checksum = "d075ab8a203a6ab4bc1bce0a4b9fe486a72bf8b939037f4b78d95386384bc80a" dependencies = [ "anyhow", "backon", - "base64", + "base64 0.22.1", "bytes", "crc32c", "futures", @@ -4981,7 +5138,7 @@ dependencies = [ "arrow-ipc", "arrow-schema", "arrow-select", - "base64", + "base64 0.22.1", "brotli", "bytes", "chrono", @@ -5038,7 +5195,7 @@ version = "3.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38af38e8470ac9dee3ce1bae1af9c1671fffc44ddfd8bd1d0a3445bf349a8ef3" dependencies = [ - "base64", + "base64 0.22.1", "serde", ] @@ -5458,8 +5615,8 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash", - "rustls", - "socket2", + "rustls 0.23.32", + "socket2 0.6.0", "thiserror 2.0.17", "tokio", "tracing", @@ -5478,7 +5635,7 @@ dependencies = [ "rand 0.9.2", "ring", "rustc-hash", - "rustls", + "rustls 0.23.32", 
"rustls-pki-types", "slab", "thiserror 2.0.17", @@ -5496,7 +5653,7 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2", + "socket2 0.6.0", "tracing", "windows-sys 0.60.2", ] @@ -5738,7 +5895,7 @@ checksum = "43451dbf3590a7590684c25fb8d12ecdcc90ed3ac123433e500447c7d77ed701" dependencies = [ "anyhow", "async-trait", - "base64", + "base64 0.22.1", "chrono", "form_urlencoded", "getrandom 0.2.16", @@ -5768,17 +5925,17 @@ version = "0.12.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" dependencies = [ - "base64", + "base64 0.22.1", "bytes", "encoding_rs", "futures-core", "futures-util", - "h2", + "h2 0.4.12", "http 1.3.1", "http-body 1.0.1", "http-body-util", - "hyper", - "hyper-rustls", + "hyper 1.7.0", + "hyper-rustls 0.27.7", "hyper-tls", "hyper-util", "js-sys", @@ -5789,8 +5946,8 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls", - "rustls-native-certs", + "rustls 0.23.32", + "rustls-native-certs 0.8.1", "rustls-pki-types", "serde", "serde_json", @@ -5798,7 +5955,7 @@ dependencies = [ "sync_wrapper", "tokio", "tokio-native-tls", - "tokio-rustls", + "tokio-rustls 0.26.4", "tokio-util", "tower", "tower-http", @@ -5944,6 +6101,18 @@ dependencies = [ "windows-sys 0.61.1", ] +[[package]] +name = "rustls" +version = "0.21.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" +dependencies = [ + "log", + "ring", + "rustls-webpki 0.101.7", + "sct", +] + [[package]] name = "rustls" version = "0.23.32" @@ -5954,11 +6123,23 @@ dependencies = [ "once_cell", "ring", "rustls-pki-types", - "rustls-webpki", + "rustls-webpki 0.103.6", "subtle", "zeroize", ] +[[package]] +name = "rustls-native-certs" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" +dependencies = [ + "openssl-probe 0.1.6", + "rustls-pemfile 1.0.4", + "schannel", + "security-framework 2.11.1", +] + [[package]] name = "rustls-native-certs" version = "0.8.1" @@ -5968,7 +6149,16 @@ dependencies = [ "openssl-probe 0.1.6", "rustls-pki-types", "schannel", - "security-framework", + "security-framework 3.5.1", +] + +[[package]] +name = "rustls-pemfile" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" +dependencies = [ + "base64 0.21.7", ] [[package]] @@ -5990,6 +6180,16 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rustls-webpki" +version = "0.101.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" +dependencies = [ + "ring", + "untrusted", +] + [[package]] name = "rustls-webpki" version = "0.103.6" @@ -6064,6 +6264,29 @@ dependencies = [ "sha2", ] +[[package]] +name = "sct" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "security-framework" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags", + "core-foundation 0.9.4", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + [[package]] name = "security-framework" version = "3.5.1" @@ -6308,6 +6531,16 @@ version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" +[[package]] +name = "socket2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "socket2" version = "0.6.0" @@ -6541,7 +6774,7 @@ checksum = "64a966cb0e76e311f09cf18507c9af192f15d34886ee43d7ba7c7e3803660c43" dependencies = [ "aho-corasick", "arc-swap", - "base64", + "base64 0.22.1", "bitpacking", "bon", "byteorder", @@ -6860,7 +7093,7 @@ dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2", + "socket2 0.6.0", "tokio-macros", "windows-sys 0.61.1", ] @@ -6886,13 +7119,23 @@ dependencies = [ "tokio", ] +[[package]] +name = "tokio-rustls" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" +dependencies = [ + "rustls 0.21.12", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" dependencies = [ - "rustls", + "rustls 0.23.32", "tokio", ] @@ -7744,12 +7987,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08db1edfb05d9b3c1542e521aea074442088292f00b5f28e435c714a98f85031" dependencies = [ "assert-json-diff", - "base64", + "base64 0.22.1", "deadpool", "futures", "http 1.3.1", "http-body-util", - "hyper", + "hyper 1.7.0", "hyper-util", "log", "once_cell", diff --git a/crates/lance-graph-catalog/src/connector.rs b/crates/lance-graph-catalog/src/connector.rs index 2f03ec87..f65e761f 100644 --- a/crates/lance-graph-catalog/src/connector.rs +++ b/crates/lance-graph-catalog/src/connector.rs @@ -7,6 +7,7 @@ //! `getPageSourceProvider()`, this struct provides a convenient entry point //! for users who want to browse a catalog and register tables for querying. 
+use std::collections::HashMap; use std::sync::Arc; use arrow_schema::SchemaRef; @@ -25,6 +26,11 @@ use crate::table_reader::TableReader; /// catalog. It delegates metadata operations to the catalog provider and data /// reading to the appropriate table reader based on the table's data format. /// +/// # Storage Options +/// +/// Cloud storage credentials (S3, Azure, GCS) are passed via `storage_options` +/// and forwarded to each [`TableReader`] during table registration. +/// /// # Extensibility /// /// - Swap the catalog: pass a different `CatalogProvider` (e.g., AWS Glue). @@ -34,23 +40,45 @@ use crate::table_reader::TableReader; /// /// ```no_run /// # use lance_graph_catalog::connector::Connector; -/// # use lance_graph_catalog::{UnityCatalogConfig, UnityCatalogProvider}; +/// # use std::collections::HashMap; /// # fn example() { -/// // let catalog = Arc::new(UnityCatalogProvider::new(config)?); -/// // let readers = default_table_readers(); // from lance-graph crate -/// // let connector = Connector::new(catalog, readers); -/// // let tables = connector.list_tables("unity", "default").await?; +/// // let connector = Connector::new(catalog, readers) +/// // .with_storage_options(HashMap::from([ +/// // ("aws_access_key_id".into(), "...".into()), +/// // ("aws_secret_access_key".into(), "...".into()), +/// // ])); /// # } /// ``` pub struct Connector { catalog: Arc, readers: Vec>, + storage_options: HashMap, } impl Connector { /// Create a new connector with the given catalog provider and table readers. pub fn new(catalog: Arc, readers: Vec>) -> Self { - Self { catalog, readers } + Self { + catalog, + readers, + storage_options: HashMap::new(), + } + } + + /// Set storage options for cloud storage access (S3, Azure, GCS). 
+ /// + /// Common keys: + /// - S3: `aws_access_key_id`, `aws_secret_access_key`, `aws_region` + /// - Azure: `azure_storage_account_name`, `azure_storage_account_key` + /// - GCS: `google_service_account_path` + pub fn with_storage_options(mut self, options: HashMap) -> Self { + self.storage_options = options; + self + } + + /// Get the current storage options. + pub fn storage_options(&self) -> &HashMap { + &self.storage_options } /// Get a reference to the underlying catalog provider. @@ -118,7 +146,7 @@ impl Connector { /// 1. Retrieves full table metadata (including columns) from the catalog. /// 2. Converts columns to an Arrow schema. /// 3. Finds an appropriate [`TableReader`] for the table's data format. - /// 4. Registers the table in the session context. + /// 4. Registers the table in the session context with storage options for cloud access. /// /// If no reader matches the table's format, falls back to registering an /// empty `MemTable` with the correct schema (schema-only, for planning). @@ -175,8 +203,14 @@ impl Connector { match reader { Some(r) => { - r.register_table(ctx, &normalized_name, &table_info, arrow_schema.clone()) - .await?; + r.register_table( + ctx, + &normalized_name, + &table_info, + arrow_schema.clone(), + &self.storage_options, + ) + .await?; } None => { // No reader — register schema-only (empty MemTable for planning) diff --git a/crates/lance-graph-catalog/src/table_reader.rs b/crates/lance-graph-catalog/src/table_reader.rs index 3b7d2f1c..64d6a070 100644 --- a/crates/lance-graph-catalog/src/table_reader.rs +++ b/crates/lance-graph-catalog/src/table_reader.rs @@ -7,6 +7,8 @@ //! data format reading from catalog metadata. Each implementation handles //! one or more data formats and is reusable across any [`CatalogProvider`]. 
+use std::collections::HashMap; + use arrow_schema::SchemaRef; use async_trait::async_trait; use datafusion::execution::context::SessionContext; @@ -47,11 +49,14 @@ pub trait TableReader: Send + Sync { /// * `table_name` - The name to register the table under (already lowercased). /// * `table_info` - Full table metadata from the catalog, including `storage_location`. /// * `schema` - Arrow schema derived from the table's column definitions. + /// * `storage_options` - Key-value pairs for cloud storage credentials + /// (e.g., `azure_storage_account_name`, `aws_access_key_id`, etc.). async fn register_table( &self, ctx: &SessionContext, table_name: &str, table_info: &TableInfo, schema: SchemaRef, + storage_options: &HashMap, ) -> CatalogResult<()>; } diff --git a/crates/lance-graph-python/src/catalog.rs b/crates/lance-graph-python/src/catalog.rs index 255e2635..26b253dc 100644 --- a/crates/lance-graph-python/src/catalog.rs +++ b/crates/lance-graph-python/src/catalog.rs @@ -3,13 +3,14 @@ //! Python bindings for the catalog integration. +use std::collections::HashMap; use std::sync::Arc; use lance_graph::sql_catalog::build_context_from_connector; +use lance_graph::table_readers::default_table_readers; use lance_graph::{ CatalogInfo, Connector, SchemaInfo, TableInfo, UnityCatalogConfig, UnityCatalogProvider, }; -use lance_graph::table_readers::default_table_readers; use pyo3::exceptions::PyRuntimeError; use pyo3::prelude::*; use pyo3::types::{PyDict, PyList}; @@ -168,6 +169,13 @@ impl PyTableInfo { /// >>> catalogs = uc.list_catalogs() /// >>> engine = uc.create_sql_engine("unity", "default") /// >>> result = engine.execute("SELECT * FROM my_table LIMIT 10") +/// +/// For cloud storage (S3, Azure, GCS): +/// +/// >>> uc = UnityCatalog( +/// ... "http://localhost:8080/api/2.1/unity-catalog", +/// ... storage_options={"azure_storage_account_name": "myaccount", "azure_storage_account_key": "..."} +/// ... 
) #[pyclass(name = "UnityCatalog", module = "lance.graph")] pub struct PyUnityCatalog { connector: Arc, @@ -186,9 +194,19 @@ impl PyUnityCatalog { /// Bearer token for authentication /// timeout : int, optional /// Request timeout in seconds + /// storage_options : dict, optional + /// Key-value pairs for cloud storage credentials. + /// S3: aws_access_key_id, aws_secret_access_key, aws_region + /// Azure: azure_storage_account_name, azure_storage_account_key + /// GCS: google_service_account_path #[new] - #[pyo3(signature = (base_url, token=None, timeout=None))] - fn new(base_url: &str, token: Option<&str>, timeout: Option) -> PyResult { + #[pyo3(signature = (base_url, token=None, timeout=None, storage_options=None))] + fn new( + base_url: &str, + token: Option<&str>, + timeout: Option, + storage_options: Option>, + ) -> PyResult { let mut config = UnityCatalogConfig::new(base_url); if let Some(t) = token { config = config.with_token(t); @@ -200,7 +218,10 @@ impl PyUnityCatalog { let provider = UnityCatalogProvider::new(config) .map_err(|e| PyRuntimeError::new_err(e.to_string()))?; let readers = default_table_readers(); - let connector = Connector::new(Arc::new(provider), readers); + let mut connector = Connector::new(Arc::new(provider), readers); + if let Some(opts) = storage_options { + connector = connector.with_storage_options(opts); + } Ok(Self { connector: Arc::new(connector), diff --git a/crates/lance-graph/Cargo.toml b/crates/lance-graph/Cargo.toml index 782b6511..716a0888 100644 --- a/crates/lance-graph/Cargo.toml +++ b/crates/lance-graph/Cargo.toml @@ -38,7 +38,7 @@ nom = "7.1" serde = { version = "1", features = ["derive"] } serde_json = "1" snafu = "0.8" -deltalake = { version = "0.29", features = ["datafusion"], optional = true } +deltalake = { version = "0.29", features = ["datafusion", "s3", "azure", "gcs"], optional = true } url = { version = "2", optional = true } [features] diff --git a/crates/lance-graph/src/table_readers.rs 
b/crates/lance-graph/src/table_readers.rs index 52314024..b3148f4a 100644 --- a/crates/lance-graph/src/table_readers.rs +++ b/crates/lance-graph/src/table_readers.rs @@ -6,6 +6,7 @@ //! - [`ParquetTableReader`] — reads Parquet tables using DataFusion's built-in support. //! - [`DeltaTableReader`] — reads Delta Lake tables (behind `delta` feature flag). +use std::collections::HashMap; use std::sync::Arc; use async_trait::async_trait; @@ -35,6 +36,7 @@ impl TableReader for ParquetTableReader { table_name: &str, table_info: &TableInfo, _schema: arrow_schema::SchemaRef, + _storage_options: &HashMap, ) -> CatalogResult<()> { let location = table_info.storage_location.as_deref().ok_or_else(|| { CatalogError::Other(format!("Table '{}' has no storage_location", table_name)) @@ -60,6 +62,8 @@ impl TableReader for ParquetTableReader { /// Opens the Delta table at the storage location and registers it as a /// DataFusion `TableProvider`, enabling full SQL query support including /// time travel, schema evolution, and partition pruning. +/// +/// Supports cloud storage (S3, Azure, GCS) via `storage_options`. 
#[cfg(feature = "delta")] pub struct DeltaTableReader; @@ -80,6 +84,7 @@ impl TableReader for DeltaTableReader { table_name: &str, table_info: &TableInfo, _schema: arrow_schema::SchemaRef, + storage_options: &HashMap, ) -> CatalogResult<()> { let location = table_info.storage_location.as_deref().ok_or_else(|| { CatalogError::Other(format!("Table '{}' has no storage_location", table_name)) @@ -92,7 +97,12 @@ impl TableReader for DeltaTableReader { )) })?; - let delta_table = deltalake::open_table(table_url).await.map_err(|e| { + let delta_table = if storage_options.is_empty() { + deltalake::open_table(table_url).await + } else { + deltalake::open_table_with_storage_options(table_url, storage_options.clone()).await + } + .map_err(|e| { CatalogError::Other(format!( "Failed to open Delta table '{}' at '{}': {}", table_name, location, e From f3b240f498d36916ac2c333205aa23d0d8e6750b Mon Sep 17 00:00:00 2001 From: "jianjian.xie" Date: Fri, 27 Feb 2026 00:28:29 -0800 Subject: [PATCH 7/7] docs: add Unity Catalog integration to READMEs Add examples for UnityCatalog browsing, create_sql_engine, and cloud storage options (S3, Azure, GCS) to both project and Python READMEs. 
--- README.md | 33 +++++++++++++++++++++++++++++++++ python/README.md | 43 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8e61c4b6..7ed086b7 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,39 @@ r1 = engine.execute("SELECT COUNT(*) AS cnt FROM person") r2 = engine.execute("SELECT name FROM person ORDER BY age DESC LIMIT 2") ``` +## Python example: Unity Catalog integration + +Connect to [Unity Catalog](https://github.com/unitycatalog/unitycatalog) (OSS) to discover and query Delta Lake or Parquet tables directly: + +```python +from lance_graph import UnityCatalog + +# Connect to Unity Catalog +uc = UnityCatalog("http://localhost:8080/api/2.1/unity-catalog") + +# Browse catalog metadata +catalogs = uc.list_catalogs() +schemas = uc.list_schemas("unity") +tables = uc.list_tables("unity", "default") +table = uc.get_table("unity", "default", "marksheet") +print(table.columns()) # [{"name": "id", "type_name": "INT", ...}, ...] + +# Auto-register tables (Delta + Parquet) and query via SQL +engine = uc.create_sql_engine("unity", "default") +result = engine.execute("SELECT * FROM marksheet WHERE mark > 80") +print(result.to_pandas()) + +# For cloud storage (S3, Azure, GCS), pass storage options: +uc = UnityCatalog( + "http://localhost:8080/api/2.1/unity-catalog", + storage_options={ + "aws_access_key_id": "...", + "aws_secret_access_key": "...", + "aws_region": "us-east-1", + } +) +``` + ## Knowledge Graph CLI & API The `knowledge_graph` package layers a simple Lance-backed knowledge graph diff --git a/python/README.md b/python/README.md index 21a64a8f..520d8f27 100644 --- a/python/README.md +++ b/python/README.md @@ -116,7 +116,46 @@ r2 = engine.execute( ) ``` -### 3. Build a Knowledge Graph from Text +### 4. 
Unity Catalog Integration + +Connect to [Unity Catalog](https://github.com/unitycatalog/unitycatalog) (OSS) to discover and query Delta Lake or Parquet tables without manually loading data: + +```python +from lance_graph import UnityCatalog + +# Connect to Unity Catalog +uc = UnityCatalog("http://localhost:8080/api/2.1/unity-catalog") + +# Browse catalog hierarchy +catalogs = uc.list_catalogs() +schemas = uc.list_schemas("unity") +tables = uc.list_tables("unity", "default") + +# Inspect table metadata +table = uc.get_table("unity", "default", "marksheet") +print(table.data_source_format) # "Delta" +print(table.columns()) # [{"name": "id", "type_name": "INT", ...}, ...] + +# Auto-register all tables and query via SQL +engine = uc.create_sql_engine("unity", "default") +result = engine.execute("SELECT * FROM marksheet WHERE mark > 80") +print(result.to_pandas()) +``` + +For tables on cloud storage (S3, Azure, GCS): + +```python +uc = UnityCatalog( + "http://localhost:8080/api/2.1/unity-catalog", + storage_options={ + "azure_storage_account_name": "myaccount", + "azure_storage_account_key": "...", + } +) +engine = uc.create_sql_engine("unity", "default") +``` + +### 5. Build a Knowledge Graph from Text ```python from pathlib import Path @@ -175,7 +214,7 @@ result = kg.query(""" print(result.to_pylist()) ``` -### 4. Natural Language Q&A +### 6. Natural Language Q&A ```python from knowledge_graph.llm.qa import ask_question