diff --git a/crates/catalog/glue/src/schema.rs b/crates/catalog/glue/src/schema.rs index 864320dae4..4f5c1f664a 100644 --- a/crates/catalog/glue/src/schema.rs +++ b/crates/catalog/glue/src/schema.rs @@ -178,6 +178,12 @@ impl SchemaVisitor for GlueSchemaBuilder { PrimitiveType::Decimal { precision, scale } => { format!("decimal({precision},{scale})") } + PrimitiveType::Variant => { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + "Conversion from Variant to Glue type is not supported", + )); + } }; Ok(glue_type) diff --git a/crates/catalog/hms/src/schema.rs b/crates/catalog/hms/src/schema.rs index c23b48719d..f48d163b30 100644 --- a/crates/catalog/hms/src/schema.rs +++ b/crates/catalog/hms/src/schema.rs @@ -135,6 +135,12 @@ impl SchemaVisitor for HiveSchemaBuilder { PrimitiveType::Decimal { precision, scale } => { format!("decimal({precision},{scale})") } + PrimitiveType::Variant => { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + "Conversion from Variant to Hive type is not supported", + )); + } }; Ok(hive_type) diff --git a/crates/iceberg/src/arrow/schema.rs b/crates/iceberg/src/arrow/schema.rs index 9b504421ae..68fc288ce1 100644 --- a/crates/iceberg/src/arrow/schema.rs +++ b/crates/iceberg/src/arrow/schema.rs @@ -690,6 +690,10 @@ impl SchemaVisitor for ToArrowSchemaConverter { crate::spec::PrimitiveType::Binary => { Ok(ArrowSchemaOrFieldOrType::Type(DataType::LargeBinary)) } + crate::spec::PrimitiveType::Variant => Err(crate::Error::new( + crate::ErrorKind::FeatureUnsupported, + "Arrow schema conversion for Variant is not yet implemented", + )), } } } @@ -1131,6 +1135,7 @@ pub fn datum_to_arrow_type_with_ree(datum: &Datum) -> DataType { PrimitiveType::Uuid => make_ree(DataType::Binary), PrimitiveType::Fixed(_) => make_ree(DataType::Binary), PrimitiveType::Binary => make_ree(DataType::Binary), + PrimitiveType::Variant => make_ree(DataType::Binary), PrimitiveType::Decimal { precision, scale } => { make_ree(DataType::Decimal128(*precision as 
u8, *scale as i8)) } diff --git a/crates/iceberg/src/arrow/value.rs b/crates/iceberg/src/arrow/value.rs index d07233c420..e349af2392 100644 --- a/crates/iceberg/src/arrow/value.rs +++ b/crates/iceberg/src/arrow/value.rs @@ -424,6 +424,10 @@ impl SchemaWithPartnerVisitor for ArrowArrayToIcebergStructConverter { )) } } + PrimitiveType::Variant => Err(Error::new( + ErrorKind::FeatureUnsupported, + "Arrow value extraction for Variant is not yet implemented", + )), } } } diff --git a/crates/iceberg/src/avro/schema.rs b/crates/iceberg/src/avro/schema.rs index fdbc680977..dbe70a482f 100644 --- a/crates/iceberg/src/avro/schema.rs +++ b/crates/iceberg/src/avro/schema.rs @@ -237,6 +237,7 @@ impl SchemaVisitor for SchemaToAvroSchema { PrimitiveType::Uuid => AvroSchema::Uuid, PrimitiveType::Fixed(len) => avro_fixed_schema((*len) as usize)?, PrimitiveType::Binary => AvroSchema::Bytes, + PrimitiveType::Variant => AvroSchema::Bytes, PrimitiveType::Decimal { precision, scale } => { avro_decimal_schema(*precision as usize, *scale as usize)? } diff --git a/crates/iceberg/src/spec/datatypes.rs b/crates/iceberg/src/spec/datatypes.rs index ad4aea758f..ecf8ceb0a9 100644 --- a/crates/iceberg/src/spec/datatypes.rs +++ b/crates/iceberg/src/spec/datatypes.rs @@ -247,6 +247,8 @@ pub enum PrimitiveType { Fixed(u64), /// Arbitrary-length byte array. Binary, + /// Semi-structured data type (Iceberg spec v3). Stored in Parquet as `LogicalType::Variant`. 
+ Variant, } impl PrimitiveType { @@ -382,6 +384,7 @@ impl fmt::Display for PrimitiveType { PrimitiveType::Uuid => write!(f, "uuid"), PrimitiveType::Fixed(size) => write!(f, "fixed({size})"), PrimitiveType::Binary => write!(f, "binary"), + PrimitiveType::Variant => write!(f, "variant"), } } } @@ -884,7 +887,8 @@ mod tests { {"id": 13, "name": "uuid_field", "required": true, "type": "uuid"}, {"id": 14, "name": "fixed_field", "required": true, "type": "fixed[10]"}, {"id": 15, "name": "binary_field", "required": true, "type": "binary"}, - {"id": 16, "name": "string_field", "required": true, "type": "string"} + {"id": 16, "name": "string_field", "required": true, "type": "string"}, + {"id": 17, "name": "variant_field", "required": false, "type": "variant"} ] } "#; @@ -964,6 +968,12 @@ mod tests { Type::Primitive(PrimitiveType::String), ) .into(), + NestedField::optional( + 17, + "variant_field", + Type::Primitive(PrimitiveType::Variant), + ) + .into(), ], id_lookup: OnceLock::default(), name_lookup: OnceLock::default(), @@ -1320,4 +1330,25 @@ mod tests { .contains("expected type 'struct'") ); } + + #[test] + fn variant_type_display() { + assert_eq!(PrimitiveType::Variant.to_string(), "variant"); + } + + #[test] + fn variant_type_serde() { + let json = r#"{"id": 1, "name": "v", "required": false, "type": "variant"}"#; + let field: NestedField = serde_json::from_str(json).unwrap(); + assert_eq!(*field.field_type, Type::Primitive(PrimitiveType::Variant)); + let serialized = serde_json::to_string(&field).unwrap(); + assert!(serialized.contains("\"variant\"")); + } + + #[test] + fn variant_type_not_compatible_with_literals() { + assert!(!PrimitiveType::Variant.compatible(&PrimitiveLiteral::Boolean(true))); + assert!(!PrimitiveType::Variant.compatible(&PrimitiveLiteral::Int(0))); + assert!(!PrimitiveType::Variant.compatible(&PrimitiveLiteral::Binary(vec![]))); + } } diff --git a/crates/iceberg/src/spec/values/datum.rs b/crates/iceberg/src/spec/values/datum.rs index 
68ea6b3d46..46a783a770 100644 --- a/crates/iceberg/src/spec/values/datum.rs +++ b/crates/iceberg/src/spec/values/datum.rs @@ -419,6 +419,7 @@ impl Datum { } PrimitiveType::Fixed(_) => PrimitiveLiteral::Binary(Vec::from(bytes)), PrimitiveType::Binary => PrimitiveLiteral::Binary(Vec::from(bytes)), + PrimitiveType::Variant => PrimitiveLiteral::Binary(Vec::from(bytes)), PrimitiveType::Decimal { .. } => { PrimitiveLiteral::Int128(i128_from_be_bytes(bytes).ok_or_else(|| { Error::new( diff --git a/docs/variant-progress.md b/docs/variant-progress.md new file mode 100644 index 0000000000..7af7f16e70 --- /dev/null +++ b/docs/variant-progress.md @@ -0,0 +1,147 @@ +# Variant Support Progress + +Tracking implementation of Iceberg spec v3 Variant type across iceberg-rust and arrow-rs. + +--- + +## Status + +| PR | Crate | Description | Status | +|----|-------|-------------|--------| +| PR 1 | iceberg-rust | `PrimitiveType::Variant` — type system foundation | DONE — branch `feat/variant-primitive-type` | +| PR 2 | iceberg-rust | Arrow schema mapping | PENDING | +| PR 3 | iceberg-rust | Statistics suppression | PENDING | +| PR 4 | arrow-rs | `VariantArrayBuilder::with_shredding` | PENDING | +| PR 5 | iceberg-rust | Iceberg scan path — read shredded Parquet | PENDING | +| PR 6 | iceberg-rust + datafusion | Predicate pushdown on shredded children | PENDING | + +--- + +## PR 1 — DONE + +**Branch:** `feat/variant-primitive-type` +**Files changed:** `spec/datatypes.rs`, `avro/schema.rs`, `spec/values/datum.rs`, `arrow/schema.rs`, `arrow/value.rs`, `catalog/hms/src/schema.rs`, `catalog/glue/src/schema.rs` + +What was done: +- `PrimitiveType::Variant` enum arm with serde rename `"variant"` +- `Display` — `"variant"` +- `compatible()` — `false` for all literals (no scalar predicate against Variant) +- Avro mapping — `AvroSchema::Bytes` +- `try_from_bytes` — `PrimitiveLiteral::Binary` +- Arrow schema/value stubs — `FeatureUnsupported` (Arrow mapping is PR 2) +- HMS/Glue stubs — 
`FeatureUnsupported` +- Tests: canonical `primitive_type_serde` updated, + 3 targeted Variant tests + +--- + +## PR 2 — PENDING: Arrow Schema Mapping + +**Crate:** `iceberg-rust` — `crates/iceberg/src/arrow/schema.rs` + +Tasks: +- `ToArrowSchemaConverter::primitive` for `PrimitiveType::Variant`: + - Return `ArrowSchemaOrFieldOrType::Field(...)` (not `Type`) + - `DataType::Struct([required binary "metadata", optional binary "value"])` + - Field metadata: `"ARROW:extension:name" = "arrow.parquet.variant"` +- New `arrow_field_to_type(field: &Field) -> Result<Type>`: + - Check `field.metadata().get("ARROW:extension:name") == Some("arrow.parquet.variant")` + - Return `Ok(Type::Primitive(PrimitiveType::Variant))` + - Otherwise delegate to `arrow_type_to_type(field.data_type())` +- Replace `FeatureUnsupported` stubs in `arrow/schema.rs` and `arrow/value.rs` with real implementations + +Notes: +- Requires `parquet` crate with `variant_experimental` feature as dependency to get `VariantType` +- Check `parquet-variant-compute/src/variant_array.rs` for the `{metadata, value}` field name constants +- `ArrowSchemaOrFieldOrType::Field` variant already exists in the visitor return type + +--- + +## PR 3 — PENDING: Statistics Suppression + +**Crate:** `iceberg-rust` — `crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs` + +Tasks: +- Add `PrimitiveType::Variant => None` arm — no min/max applicable, binary comparison is meaningless +- Also check `strict_metrics_evaluator.rs` for same pattern +- Ensure `make_ree_type` in `arrow/schema.rs` is cleaned up (currently returns `DataType::Binary` as placeholder) + +Notes: +- `null_count` is valid for Variant but min/max is not — however current `EnabledStatistics` API is all-or-nothing +- Statistics on `typed_value` children (shredded fields) are meaningful and must be preserved +- See `VariantShredding.md` line 168 for spec wording + +--- + +## PR 4 — PENDING: Shredded Write (arrow-rs) + +**Crate:** `arrow-rs` — 
`parquet-variant-compute/src/variant_array_builder.rs` +**Tracking issue:** https://github.com/apache/arrow-rs/issues/7895 + +Tasks: +- Add `shredding_schema: Option<DataType>` field to `VariantArrayBuilder` +- Add `fn with_shredding(mut self, schema: DataType) -> Self` + - Validate: schema must be `DataType::Struct` (only valid shredding root) +- In `build()`: if `shredding_schema.is_some()`, call `shred_variant(&unshredded, schema)` and return result +- End-to-end test: build shredded → write Parquet via `ArrowWriter` → read back → `unshred_variant` → values match + +Notes: +- `shred_variant` already exists and is well-tested — this PR just wires it into the builder +- Memory: two representations held simultaneously at `build()` time — acceptable for row-group-sized batches +- `ArrowWriter` already handles shredded structs correctly — no writer changes needed for data correctness +- Statistics suppression (auto-disable min/max on `value`/`metadata`) should be added in `ArrowWriter::try_new_with_options` as a follow-on + +--- + +## PR 5 — PENDING: Iceberg Scan Path + +**Crate:** `iceberg-rust` — `crates/iceberg/src/arrow/` + +Tasks: +- `ArrowReader` Variant column handling: + - Detect `LogicalType::Variant` (via Arrow extension metadata on the field) + - Unshredded case: reconstruct `VariantArray` from `metadata` + `value` columns + - Shredded case: call `unshred_variant` when `typed_value` children present +- Projection pushdown: `variant_get(col, "$.brand")` should project only `typed_value.brand` leaf + +Dependencies: PR 2 (Arrow schema mapping) must be merged first + +--- + +## PR 6 — PENDING: Predicate Pushdown (DataFusion/Comet) + +**Crates:** `iceberg-rust` — `crates/integrations/datafusion/`, `arrow-rs` — `parquet-variant-compute/` + +Tasks: +- Map `variant_get(col, "$.brand") = 'Apple'` to a physical predicate on `typed_value.brand` +- `inclusive_projection` / `manifest_evaluator` — translate path-extraction expressions to shredded column references +- DataFusion 
`ScalarUDF` for `variant_get` that operates natively on `VariantArray` +- Bloom filter support on shredded `typed_value` children + +Notes: +- This is where Comet (datafusion-comet) gets the full benefit: + - File-level pruning via iceberg-rust `InclusiveMetricsEvaluator` + - Row-group pruning via DataFusion Parquet page index on `typed_value.brand` + - Zero JSON parsing for shredded fields — native column read +- Expected: 20x I/O reduction, ~100x compute reduction for selective queries on shredded fields + +--- + +## Pre-commit Checklist + +Every commit must pass: +``` +cargo fmt --all +cargo clippy --all-targets --all-features --workspace +cargo test -p <crate> --lib +``` + +Commit style: short one-line imperative message, logical batches. + +--- + +## Key Spec References + +- [VariantEncoding.md](https://github.com/apache/parquet-format/blob/master/VariantEncoding.md) +- [VariantShredding.md](https://github.com/apache/parquet-format/blob/master/VariantShredding.md) +- [arrow-rs tracking issue #6736](https://github.com/apache/arrow-rs/issues/6736) +- [arrow-rs shredded write issue #7895](https://github.com/apache/arrow-rs/issues/7895)