Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions crates/catalog/glue/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,12 @@ impl SchemaVisitor for GlueSchemaBuilder {
PrimitiveType::Decimal { precision, scale } => {
format!("decimal({precision},{scale})")
}
PrimitiveType::Variant => {
return Err(Error::new(
ErrorKind::FeatureUnsupported,
"Conversion from Variant to Glue type is not supported",
));
}
};

Ok(glue_type)
Expand Down
6 changes: 6 additions & 0 deletions crates/catalog/hms/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,12 @@ impl SchemaVisitor for HiveSchemaBuilder {
PrimitiveType::Decimal { precision, scale } => {
format!("decimal({precision},{scale})")
}
PrimitiveType::Variant => {
return Err(Error::new(
ErrorKind::FeatureUnsupported,
"Conversion from Variant to Hive type is not supported",
));
}
};

Ok(hive_type)
Expand Down
5 changes: 5 additions & 0 deletions crates/iceberg/src/arrow/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,10 @@ impl SchemaVisitor for ToArrowSchemaConverter {
crate::spec::PrimitiveType::Binary => {
Ok(ArrowSchemaOrFieldOrType::Type(DataType::LargeBinary))
}
crate::spec::PrimitiveType::Variant => Err(crate::Error::new(
crate::ErrorKind::FeatureUnsupported,
"Arrow schema conversion for Variant is not yet implemented",
)),
}
}
}
Expand Down Expand Up @@ -1131,6 +1135,7 @@ pub fn datum_to_arrow_type_with_ree(datum: &Datum) -> DataType {
PrimitiveType::Uuid => make_ree(DataType::Binary),
PrimitiveType::Fixed(_) => make_ree(DataType::Binary),
PrimitiveType::Binary => make_ree(DataType::Binary),
PrimitiveType::Variant => make_ree(DataType::Binary),
PrimitiveType::Decimal { precision, scale } => {
make_ree(DataType::Decimal128(*precision as u8, *scale as i8))
}
Expand Down
4 changes: 4 additions & 0 deletions crates/iceberg/src/arrow/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,10 @@ impl SchemaWithPartnerVisitor<ArrayRef> for ArrowArrayToIcebergStructConverter {
))
}
}
PrimitiveType::Variant => Err(Error::new(
ErrorKind::FeatureUnsupported,
"Arrow value extraction for Variant is not yet implemented",
)),
}
}
}
Expand Down
1 change: 1 addition & 0 deletions crates/iceberg/src/avro/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,7 @@ impl SchemaVisitor for SchemaToAvroSchema {
PrimitiveType::Uuid => AvroSchema::Uuid,
PrimitiveType::Fixed(len) => avro_fixed_schema((*len) as usize)?,
PrimitiveType::Binary => AvroSchema::Bytes,
PrimitiveType::Variant => AvroSchema::Bytes,
PrimitiveType::Decimal { precision, scale } => {
avro_decimal_schema(*precision as usize, *scale as usize)?
}
Expand Down
33 changes: 32 additions & 1 deletion crates/iceberg/src/spec/datatypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,8 @@ pub enum PrimitiveType {
Fixed(u64),
/// Arbitrary-length byte array.
Binary,
/// Semi-structured data type (Iceberg spec v3). Stored in Parquet as `LogicalType::Variant`.
Variant,
}

impl PrimitiveType {
Expand Down Expand Up @@ -382,6 +384,7 @@ impl fmt::Display for PrimitiveType {
PrimitiveType::Uuid => write!(f, "uuid"),
PrimitiveType::Fixed(size) => write!(f, "fixed({size})"),
PrimitiveType::Binary => write!(f, "binary"),
PrimitiveType::Variant => write!(f, "variant"),
}
}
}
Expand Down Expand Up @@ -884,7 +887,8 @@ mod tests {
{"id": 13, "name": "uuid_field", "required": true, "type": "uuid"},
{"id": 14, "name": "fixed_field", "required": true, "type": "fixed[10]"},
{"id": 15, "name": "binary_field", "required": true, "type": "binary"},
{"id": 16, "name": "string_field", "required": true, "type": "string"}
{"id": 16, "name": "string_field", "required": true, "type": "string"},
{"id": 17, "name": "variant_field", "required": false, "type": "variant"}
]
}
"#;
Expand Down Expand Up @@ -964,6 +968,12 @@ mod tests {
Type::Primitive(PrimitiveType::String),
)
.into(),
NestedField::optional(
17,
"variant_field",
Type::Primitive(PrimitiveType::Variant),
)
.into(),
],
id_lookup: OnceLock::default(),
name_lookup: OnceLock::default(),
Expand Down Expand Up @@ -1320,4 +1330,25 @@ mod tests {
.contains("expected type 'struct'")
);
}

#[test]
fn variant_type_display() {
assert_eq!(PrimitiveType::Variant.to_string(), "variant");
}

#[test]
fn variant_type_serde() {
let json = r#"{"id": 1, "name": "v", "required": false, "type": "variant"}"#;
let field: NestedField = serde_json::from_str(json).unwrap();
assert_eq!(*field.field_type, Type::Primitive(PrimitiveType::Variant));
let serialized = serde_json::to_string(&field).unwrap();
assert!(serialized.contains("\"variant\""));
}

#[test]
fn variant_type_not_compatible_with_literals() {
assert!(!PrimitiveType::Variant.compatible(&PrimitiveLiteral::Boolean(true)));
assert!(!PrimitiveType::Variant.compatible(&PrimitiveLiteral::Int(0)));
assert!(!PrimitiveType::Variant.compatible(&PrimitiveLiteral::Binary(vec![])));
}
}
1 change: 1 addition & 0 deletions crates/iceberg/src/spec/values/datum.rs
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,7 @@ impl Datum {
}
PrimitiveType::Fixed(_) => PrimitiveLiteral::Binary(Vec::from(bytes)),
PrimitiveType::Binary => PrimitiveLiteral::Binary(Vec::from(bytes)),
PrimitiveType::Variant => PrimitiveLiteral::Binary(Vec::from(bytes)),
PrimitiveType::Decimal { .. } => {
PrimitiveLiteral::Int128(i128_from_be_bytes(bytes).ok_or_else(|| {
Error::new(
Expand Down
147 changes: 147 additions & 0 deletions docs/variant-progress.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
# Variant Support Progress

Tracking implementation of Iceberg spec v3 Variant type across iceberg-rust and arrow-rs.

---

## Status

| PR | Crate | Description | Status |
|----|-------|-------------|--------|
| PR 1 | iceberg-rust | `PrimitiveType::Variant` — type system foundation | DONE — branch `feat/variant-primitive-type` |
| PR 2 | iceberg-rust | Arrow schema mapping | PENDING |
| PR 3 | iceberg-rust | Statistics suppression | PENDING |
| PR 4 | arrow-rs | `VariantArrayBuilder::with_shredding` | PENDING |
| PR 5 | iceberg-rust | Iceberg scan path — read shredded Parquet | PENDING |
| PR 6 | iceberg-rust + datafusion | Predicate pushdown on shredded children | PENDING |

---

## PR 1 — DONE

**Branch:** `feat/variant-primitive-type`
**Files changed:** `spec/datatypes.rs`, `avro/schema.rs`, `spec/values/datum.rs`, `arrow/schema.rs`, `arrow/value.rs`, `catalog/hms/src/schema.rs`, `catalog/glue/src/schema.rs`

What was done:
- `PrimitiveType::Variant` enum arm with serde rename `"variant"`
- `Display` — `"variant"`
- `compatible()` — `false` for all literals (no scalar predicate against Variant)
- Avro mapping — `AvroSchema::Bytes`
- `try_from_bytes` — `PrimitiveLiteral::Binary`
- Arrow schema/value stubs — `FeatureUnsupported` (Arrow mapping is PR 2)
- HMS/Glue stubs — `FeatureUnsupported`
- Tests: canonical `primitive_type_serde` test updated, plus 3 targeted Variant tests added

---

## PR 2 — PENDING: Arrow Schema Mapping

**Crate:** `iceberg-rust` — `crates/iceberg/src/arrow/schema.rs`

Tasks:
- `ToArrowSchemaConverter::primitive` for `PrimitiveType::Variant`:
- Return `ArrowSchemaOrFieldOrType::Field(...)` (not `Type`)
- `DataType::Struct([required binary "metadata", optional binary "value"])`
- Field metadata: `"ARROW:extension:name" = "arrow.parquet.variant"`
- New `arrow_field_to_type(field: &Field) -> Result<Type>`:
- Check `field.metadata().get("ARROW:extension:name") == Some("arrow.parquet.variant")`
- Return `Ok(Type::Primitive(PrimitiveType::Variant))`
- Otherwise delegate to `arrow_type_to_type(field.data_type())`
- Replace `FeatureUnsupported` stubs in `arrow/schema.rs` and `arrow/value.rs` with real implementations

Notes:
- Requires `parquet` crate with `variant_experimental` feature as dependency to get `VariantType`
- Check `parquet-variant-compute/src/variant_array.rs` for the `{metadata, value}` field name constants
- `ArrowSchemaOrFieldOrType::Field` variant already exists in the visitor return type

---

## PR 3 — PENDING: Statistics Suppression

**Crate:** `iceberg-rust` — `crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs`

Tasks:
- Add `PrimitiveType::Variant => None` arm — no min/max applicable, binary comparison is meaningless
- Also check `strict_metrics_evaluator.rs` for same pattern
- Ensure `make_ree_type` in `arrow/schema.rs` is cleaned up (currently returns `DataType::Binary` as placeholder)

Notes:
- `null_count` is valid for Variant but min/max is not — however the current `EnabledStatistics` API is all-or-nothing per column, so suppression applies to the whole Variant column
- Statistics on `typed_value` children (shredded fields) are meaningful and must be preserved
- See `VariantShredding.md` line 168 for spec wording

---

## PR 4 — PENDING: Shredded Write (arrow-rs)

**Crate:** `arrow-rs` — `parquet-variant-compute/src/variant_array_builder.rs`
**Tracking issue:** https://github.com/apache/arrow-rs/issues/7895

Tasks:
- Add `shredding_schema: Option<DataType>` field to `VariantArrayBuilder`
- Add `fn with_shredding(mut self, schema: DataType) -> Self`
- Validate: schema must be `DataType::Struct` (only valid shredding root)
- In `build()`: if `shredding_schema.is_some()`, call `shred_variant(&unshredded, schema)` and return result
- End-to-end test: build shredded → write Parquet via `ArrowWriter` → read back → `unshred_variant` → values match

Notes:
- `shred_variant` already exists and is well-tested — this PR just wires it into the builder
- Memory: two representations held simultaneously at `build()` time — acceptable for row-group-sized batches
- `ArrowWriter` already handles shredded structs correctly — no writer changes needed for data correctness
- Statistics suppression (auto-disable min/max on `value`/`metadata`) should be added in `ArrowWriter::try_new_with_options` as a follow-on

---

## PR 5 — PENDING: Iceberg Scan Path

**Crate:** `iceberg-rust` — `crates/iceberg/src/arrow/`

Tasks:
- `ArrowReader` Variant column handling:
- Detect `LogicalType::Variant` (via Arrow extension metadata on the field)
- Unshredded case: reconstruct `VariantArray` from `metadata` + `value` columns
- Shredded case: call `unshred_variant` when `typed_value` children present
- Projection pushdown: `variant_get(col, "$.brand")` should project only `typed_value.brand` leaf

Dependencies: PR 2 (Arrow schema mapping) must be merged first

---

## PR 6 — PENDING: Predicate Pushdown (DataFusion/Comet)

**Crates:** `iceberg-rust` — `crates/integrations/datafusion/`, `arrow-rs` — `parquet-variant-compute/`

Tasks:
- Map `variant_get(col, "$.brand") = 'Apple'` to a physical predicate on `typed_value.brand`
- `inclusive_projection` / `manifest_evaluator` — translate path-extraction expressions to shredded column references
- DataFusion `ScalarUDF` for `variant_get` that operates natively on `VariantArray`
- Bloom filter support on shredded `typed_value` children

Notes:
- This is where Comet (datafusion-comet) gets the full benefit:
- File-level pruning via iceberg-rust `InclusiveMetricsEvaluator`
- Row-group pruning via DataFusion Parquet page index on `typed_value.brand`
- Zero JSON parsing for shredded fields — native column read
- Expected (estimate, to be validated by benchmarks): ~20x I/O reduction and ~100x compute reduction for selective queries on shredded fields

---

## Pre-commit Checklist

Every commit must pass:
```
cargo fmt --all
cargo clippy --all-targets --all-features --workspace
cargo test -p <affected_crate> --lib
```

Commit style: short one-line imperative message, logical batches.

---

## Key Spec References

- [VariantEncoding.md](https://github.com/apache/parquet-format/blob/master/VariantEncoding.md)
- [VariantShredding.md](https://github.com/apache/parquet-format/blob/master/VariantShredding.md)
- [arrow-rs tracking issue #6736](https://github.com/apache/arrow-rs/issues/6736)
- [arrow-rs shredded write issue #7895](https://github.com/apache/arrow-rs/issues/7895)
Loading