diff --git a/crates/catalog/glue/src/schema.rs b/crates/catalog/glue/src/schema.rs index 864320dae4..86ad86ff96 100644 --- a/crates/catalog/glue/src/schema.rs +++ b/crates/catalog/glue/src/schema.rs @@ -157,6 +157,12 @@ impl SchemaVisitor for GlueSchemaBuilder { fn primitive(&mut self, p: &iceberg::spec::PrimitiveType) -> iceberg::Result { let glue_type = match p { + PrimitiveType::Unknown => { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + format!("Conversion from {p:?} is not supported"), + )); + } PrimitiveType::Boolean => "boolean".to_string(), PrimitiveType::Int => "int".to_string(), PrimitiveType::Long => "bigint".to_string(), diff --git a/crates/catalog/hms/src/schema.rs b/crates/catalog/hms/src/schema.rs index c23b48719d..6cac2d1b22 100644 --- a/crates/catalog/hms/src/schema.rs +++ b/crates/catalog/hms/src/schema.rs @@ -114,6 +114,12 @@ impl SchemaVisitor for HiveSchemaBuilder { fn primitive(&mut self, p: &iceberg::spec::PrimitiveType) -> iceberg::Result { let hive_type = match p { + PrimitiveType::Unknown => { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + format!("Conversion from {p:?} is not supported"), + )); + } PrimitiveType::Boolean => "boolean".to_string(), PrimitiveType::Int => "int".to_string(), PrimitiveType::Long => "bigint".to_string(), diff --git a/crates/iceberg/src/arrow/reader/projection.rs b/crates/iceberg/src/arrow/reader/projection.rs index deae027e14..180715c23b 100644 --- a/crates/iceberg/src/arrow/reader/projection.rs +++ b/crates/iceberg/src/arrow/reader/projection.rs @@ -61,6 +61,7 @@ impl ArrowReader { /// Nested types (struct/list/map) are flattened in Parquet's columnar format. fn include_leaf_field_id(field: &NestedField, field_ids: &mut Vec) { match field.field_type.as_ref() { + Type::Primitive(PrimitiveType::Unknown) => {} Type::Primitive(_) => { field_ids.push(field.id); } @@ -94,6 +95,7 @@ impl ArrowReader { (Some(lhs), Some(rhs)) if lhs == rhs => true, (Some(PrimitiveType::Int), Some(PrimitiveType::Long)) => true, (Some(PrimitiveType::Float), Some(PrimitiveType::Double)) => true, + (Some(PrimitiveType::Unknown), Some(_)) => true, ( Some(PrimitiveType::Decimal { precision: file_precision, diff --git a/crates/iceberg/src/arrow/schema.rs b/crates/iceberg/src/arrow/schema.rs index 9b504421ae..22261cacc0 100644 --- a/crates/iceberg/src/arrow/schema.rs +++ b/crates/iceberg/src/arrow/schema.rs @@ -120,6 +120,7 @@ fn visit_type(r#type: &DataType, visitor: &mut V) -> Resu | DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View + | DataType::Null | DataType::Binary | DataType::LargeBinary | DataType::BinaryView @@ -428,6 +429,7 @@ impl ArrowSchemaVisitor for ArrowSchemaConverter { fn primitive(&mut self, p: &DataType) -> Result { match p { + DataType::Null => Ok(Type::Primitive(PrimitiveType::Unknown)), DataType::Boolean => Ok(Type::Primitive(PrimitiveType::Boolean)), DataType::Int8 | DataType::Int16 | DataType::Int32 => { Ok(Type::Primitive(PrimitiveType::Int)) @@ -613,6 +615,9 @@ impl SchemaVisitor for ToArrowSchemaConverter { p: &crate::spec::PrimitiveType, ) -> crate::Result { match p { + crate::spec::PrimitiveType::Unknown => { + Ok(ArrowSchemaOrFieldOrType::Type(DataType::Null)) + } crate::spec::PrimitiveType::Boolean => { Ok(ArrowSchemaOrFieldOrType::Type(DataType::Boolean)) } @@ -1116,6 +1121,7 @@ pub fn datum_to_arrow_type_with_ree(datum: &Datum) -> DataType { // Match on the PrimitiveType from the Datum to determine the Arrow type match datum.data_type() { + PrimitiveType::Unknown => make_ree(DataType::Null), PrimitiveType::Boolean => make_ree(DataType::Boolean), PrimitiveType::Int => make_ree(DataType::Int32), PrimitiveType::Long => make_ree(DataType::Int64), @@ -1915,6 +1921,13 @@ mod tests { assert_eq!(iceberg_type, arrow_type_to_type(&arrow_type).unwrap()); } + { + let arrow_type = DataType::Null; + let iceberg_type = Type::Primitive(PrimitiveType::Unknown); + assert_eq!(arrow_type, type_to_arrow_type(&iceberg_type).unwrap()); + assert_eq!(iceberg_type, arrow_type_to_type(&arrow_type).unwrap()); + } + // test struct type { // no metadata will cause error diff --git a/crates/iceberg/src/arrow/value.rs b/crates/iceberg/src/arrow/value.rs index d07233c420..c462cbdacf 100644 --- a/crates/iceberg/src/arrow/value.rs +++ b/crates/iceberg/src/arrow/value.rs @@ -206,6 +206,7 @@ impl SchemaWithPartnerVisitor for ArrowArrayToIcebergStructConverter { fn primitive(&mut self, p: &PrimitiveType, partner: &ArrayRef) -> Result>> { match p { + PrimitiveType::Unknown => Ok(vec![None; partner.len()]), PrimitiveType::Boolean => { let array = partner .as_any() @@ -629,6 +630,7 @@ pub(crate) fn create_primitive_array_single_element( prim_lit: &Option, ) -> Result { match (data_type, prim_lit) { + (DataType::Null, _) => Ok(Arc::new(arrow_array::NullArray::new(1))), (DataType::Boolean, Some(PrimitiveLiteral::Boolean(v))) => { Ok(Arc::new(BooleanArray::from(vec![*v]))) } diff --git a/crates/iceberg/src/avro/schema.rs b/crates/iceberg/src/avro/schema.rs index fdbc680977..e2641a6b41 100644 --- a/crates/iceberg/src/avro/schema.rs +++ b/crates/iceberg/src/avro/schema.rs @@ -74,7 +74,7 @@ impl SchemaVisitor for SchemaToAvroSchema { record.name = Name::from(format!("r{}", field.id).as_str()); } - if !field.required { + if !field.required && !matches!(field_schema, AvroSchema::Null) { field_schema = avro_optional(field_schema)?; } @@ -126,7 +126,7 @@ impl SchemaVisitor for SchemaToAvroSchema { record.name = Name::from(format!("r{}", list.element_field.id).as_str()); } - if !list.element_field.required { + if !list.element_field.required && !matches!(field_schema, AvroSchema::Null) { field_schema = avro_optional(field_schema)?; } @@ -147,7 +147,7 @@ impl SchemaVisitor for SchemaToAvroSchema { ) -> Result { let key_field_schema = key_value.unwrap_left(); let mut value_field_schema = value.unwrap_left(); - if !map.value_field.required { + if !map.value_field.required && !matches!(value_field_schema, AvroSchema::Null) { value_field_schema = avro_optional(value_field_schema)?; } @@ -222,6 +222,7 @@ impl SchemaVisitor for SchemaToAvroSchema { fn primitive(&mut self, p: &PrimitiveType) -> Result { let avro_schema = match p { + PrimitiveType::Unknown => AvroSchema::Null, PrimitiveType::Boolean => AvroSchema::Boolean, PrimitiveType::Int => AvroSchema::Int, PrimitiveType::Long => AvroSchema::Long, @@ -304,6 +305,10 @@ pub(crate) fn avro_decimal_schema(precision: usize, scale: usize) -> Result Result { + if matches!(avro_schema, AvroSchema::Null) { + return Ok(AvroSchema::Null); + } + Ok(AvroSchema::Union(UnionSchema::new(vec![ AvroSchema::Null, avro_schema, @@ -440,10 +445,11 @@ impl AvroSchemaVisitor for AvroSchemaToSchema { let field_id = Self::get_element_id_from_attributes(&avro_field.custom_attributes, FIELD_ID_PROP)?; - let optional = is_avro_optional(&avro_field.schema); + let optional = is_avro_optional(&avro_field.schema) + || matches!(&avro_field.schema, AvroSchema::Null); - let mut field = - NestedField::new(field_id, &avro_field.name, field_type.unwrap(), !optional); + let field_type = field_type.unwrap_or(Type::Primitive(PrimitiveType::Unknown)); + let mut field = NestedField::new(field_id, &avro_field.name, field_type, !optional); if let Some(doc) = &avro_field.doc { field = field.with_doc(doc); @@ -475,7 +481,9 @@ impl AvroSchemaVisitor for AvroSchemaToSchema { } if options.len() == 1 { - Ok(Some(options.remove(0).unwrap())) + Ok(options + .remove(0) + .or(Some(Type::Primitive(PrimitiveType::Unknown)))) } else { Ok(Some(options.remove(1).unwrap())) } @@ -483,10 +491,11 @@ impl AvroSchemaVisitor for AvroSchemaToSchema { fn array(&mut self, array: &ArraySchema, item: Option) -> Result { let element_field_id = Self::get_element_id_from_attributes(&array.attributes, ELEMENT_ID)?; + let item = item.unwrap_or(Type::Primitive(PrimitiveType::Unknown)); let element_field = NestedField::list_element( element_field_id, - item.unwrap(), - !is_avro_optional(&array.items), + item, + !is_avro_optional(&array.items) && !matches!(array.items.as_ref(), AvroSchema::Null), ) .into(); Ok(Some(Type::List(ListType { element_field }))) @@ -497,10 +506,11 @@ impl AvroSchemaVisitor for AvroSchemaToSchema { let key_field = NestedField::map_key_element(key_field_id, Type::Primitive(PrimitiveType::String)); let value_field_id = Self::get_element_id_from_attributes(&map.attributes, VALUE_ID)?; + let value = value.unwrap_or(Type::Primitive(PrimitiveType::Unknown)); let value_field = NestedField::map_value_element( value_field_id, - value.unwrap(), - !is_avro_optional(&map.types), + value, + !is_avro_optional(&map.types) && !matches!(map.types.as_ref(), AvroSchema::Null), ); Ok(Some(Type::Map(MapType { key_field: key_field.into(), @@ -550,12 +560,7 @@ impl AvroSchemaVisitor for AvroSchemaToSchema { "Can't convert avro map schema, missing key schema.", ) })?; - let value = value.ok_or_else(|| { - Error::new( - ErrorKind::DataInvalid, - "Can't convert avro map schema, missing value schema.", - ) - })?; + let value = value.unwrap_or(Type::Primitive(PrimitiveType::Unknown)); let key_id = Self::get_element_id_from_attributes( &array.fields[0].custom_attributes, FIELD_ID_PROP, @@ -568,7 +573,8 @@ impl AvroSchemaVisitor for AvroSchemaToSchema { let value_field = NestedField::map_value_element( value_id, value, - !is_avro_optional(&array.fields[1].schema), + !is_avro_optional(&array.fields[1].schema) + && !matches!(&array.fields[1].schema, AvroSchema::Null), ); Ok(Some(Type::Map(MapType { key_field: key_field.into(), @@ -650,6 +656,25 @@ mod tests { assert_eq!(iceberg_schema, converted_avro_converted_iceberg_schema); } + #[test] + fn test_unknown_type_schema_conversion() { + let schema = Schema::builder() + .with_fields(vec![ + NestedField::optional(1, "empty", PrimitiveType::Unknown.into()).into(), + ]) + .build() + .unwrap(); + + let avro_schema = schema_to_avro_schema("table", &schema).unwrap(); + let AvroSchema::Record(record) = &avro_schema else { + panic!("expected avro record schema"); + }; + assert!(matches!(record.fields[0].schema, AvroSchema::Null)); + assert_eq!(record.fields[0].default, Some(Value::Null)); + + assert_eq!(schema, avro_schema_to_schema(&avro_schema).unwrap()); + } + #[test] fn test_manifest_file_v1_schema() { let fields = vec![ diff --git a/crates/iceberg/src/expr/predicate.rs b/crates/iceberg/src/expr/predicate.rs index 35e88ff3e7..0d8817acde 100644 --- a/crates/iceberg/src/expr/predicate.rs +++ b/crates/iceberg/src/expr/predicate.rs @@ -310,7 +310,12 @@ impl Bind for SetExpression { impl Display for SetExpression { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - let mut literal_strs = self.literals.iter().map(|l| format!("{l}")); + let mut literals = self.literals.iter().collect_vec(); + literals.sort_by(|left, right| { + left.partial_cmp(right) + .unwrap_or_else(|| left.to_string().cmp(&right.to_string())) + }); + let mut literal_strs = literals.into_iter().map(ToString::to_string); write!(f, "{} {} ({})", self.term, self.op, literal_strs.join(", ")) } @@ -1363,7 +1368,7 @@ mod tests { let schema = table_schema_simple(); let expr = Reference::new("bar").is_in([Datum::int(10), Datum::int(20)]); let bound_expr = expr.bind(schema, true).unwrap(); - assert_eq!(&format!("{bound_expr}"), "bar IN (20, 10)"); + assert_eq!(&format!("{bound_expr}"), "bar IN (10, 20)"); test_bound_predicate_serialize_diserialize(bound_expr); } @@ -1398,7 +1403,7 @@ mod tests { let schema = table_schema_simple(); let expr = Reference::new("bar").is_not_in([Datum::int(10), Datum::int(20)]); let bound_expr = expr.bind(schema, true).unwrap(); - assert_eq!(&format!("{bound_expr}"), "bar NOT IN (20, 10)"); + assert_eq!(&format!("{bound_expr}"), "bar NOT IN (10, 20)"); test_bound_predicate_serialize_diserialize(bound_expr); } @@ -1571,13 +1576,7 @@ mod tests { let expected_bound = expected_predicate.bind(schema, true).unwrap(); assert_eq!(result, expected_bound); - // Note: HashSet order may vary, so we check that it contains the expected format - let result_str = format!("{result}"); - assert!( - result_str.contains("bar NOT IN") - && result_str.contains("10") - && result_str.contains("20") - ); + assert_eq!(&format!("{result}"), "bar NOT IN (10, 20)"); } #[test] diff --git a/crates/iceberg/src/expr/visitors/strict_projection.rs b/crates/iceberg/src/expr/visitors/strict_projection.rs index ebc6212c76..0e8f5c18c9 100644 --- a/crates/iceberg/src/expr/visitors/strict_projection.rs +++ b/crates/iceberg/src/expr/visitors/strict_projection.rs @@ -424,7 +424,7 @@ mod tests { .bind(schema.clone(), false) .unwrap(); let result = strict_projection.strict_project(&predicate).unwrap(); - assert_eq!(result.to_string(), "((((pcol1 NOT IN (0, -1)) AND (pcol2 NOT IN (0, -1))) AND (pcol3 NOT IN (0, -1))) AND (pcol4 NOT IN (0, -1))) AND (pcol5 NOT IN (0, -1))".to_string()); + assert_eq!(result.to_string(), "((((pcol1 NOT IN (-1, 0)) AND (pcol2 NOT IN (-1, 0))) AND (pcol3 NOT IN (-1, 0))) AND (pcol4 NOT IN (-1, 0))) AND (pcol5 NOT IN (-1, 0))".to_string()); // test in let predicate = @@ -658,7 +658,7 @@ mod tests { .bind(schema.clone(), false) .unwrap(); let result = strict_projection.strict_project(&predicate).unwrap(); - assert_eq!(result.to_string(), "((((pcol1 NOT IN (575, 564)) AND (pcol2 NOT IN (575, 564))) AND (pcol3 NOT IN (575, 564))) AND (pcol4 NOT IN (575, 564))) AND (pcol5 NOT IN (575, 564))".to_string()); + assert_eq!(result.to_string(), "((((pcol1 NOT IN (564, 575)) AND (pcol2 NOT IN (564, 575))) AND (pcol3 NOT IN (564, 575))) AND (pcol4 NOT IN (564, 575))) AND (pcol5 NOT IN (564, 575))".to_string()); // test in let predicate = Reference::new("col1") @@ -886,7 +886,7 @@ mod tests { .bind(schema.clone(), false) .unwrap(); let result = strict_projection.strict_project(&predicate).unwrap(); - assert_eq!(result.to_string(), "((((pcol1 NOT IN (-1, -12)) AND (pcol2 NOT IN (-1, -12))) AND (pcol3 NOT IN (-1, -12))) AND (pcol4 NOT IN (-1, -12))) AND (pcol5 NOT IN (-1, -12))".to_string()); + assert_eq!(result.to_string(), "((((pcol1 NOT IN (-12, -1)) AND (pcol2 NOT IN (-12, -1))) AND (pcol3 NOT IN (-12, -1))) AND (pcol4 NOT IN (-12, -1))) AND (pcol5 NOT IN (-12, -1))".to_string()); // test in let predicate = Reference::new("col1") @@ -1114,7 +1114,7 @@ mod tests { .bind(schema.clone(), false) .unwrap(); let result = strict_projection.strict_project(&predicate).unwrap(); - assert_eq!(result.to_string(), "((((pcol1 NOT IN (575, 564)) AND (pcol2 NOT IN (575, 564))) AND (pcol3 NOT IN (575, 564))) AND (pcol4 NOT IN (575, 564))) AND (pcol5 NOT IN (575, 564))".to_string()); + assert_eq!(result.to_string(), "((((pcol1 NOT IN (564, 575)) AND (pcol2 NOT IN (564, 575))) AND (pcol3 NOT IN (564, 575))) AND (pcol4 NOT IN (564, 575))) AND (pcol5 NOT IN (564, 575))".to_string()); // test in let predicate = Reference::new("col1") @@ -1724,7 +1724,7 @@ mod tests { .bind(schema.clone(), false) .unwrap(); let result = strict_projection.strict_project(&predicate).unwrap(); - assert_eq!(result.to_string(), "((((pcol1 NOT IN (1969-12-31, 1969-12-30)) AND (pcol2 NOT IN (1969-12-31, 1969-12-30))) AND (pcol3 NOT IN (1969-12-31, 1969-12-30))) AND (pcol4 NOT IN (1969-12-31, 1969-12-30))) AND (pcol5 NOT IN (1969-12-31, 1969-12-30))".to_string()); + assert_eq!(result.to_string(), "((((pcol1 NOT IN (1969-12-30, 1969-12-31)) AND (pcol2 NOT IN (1969-12-30, 1969-12-31))) AND (pcol3 NOT IN (1969-12-30, 1969-12-31))) AND (pcol4 NOT IN (1969-12-30, 1969-12-31))) AND (pcol5 NOT IN (1969-12-30, 1969-12-31))".to_string()); // test in let predicate = Reference::new("col1") @@ -1952,7 +1952,7 @@ mod tests { .bind(schema.clone(), false) .unwrap(); let result = strict_projection.strict_project(&predicate).unwrap(); - assert_eq!(result.to_string(), "((((pcol1 NOT IN (47, 46)) AND (pcol2 NOT IN (47, 46))) AND (pcol3 NOT IN (47, 46))) AND (pcol4 NOT IN (47, 46))) AND (pcol5 NOT IN (47, 46))".to_string()); + assert_eq!(result.to_string(), "((((pcol1 NOT IN (46, 47)) AND (pcol2 NOT IN (46, 47))) AND (pcol3 NOT IN (46, 47))) AND (pcol4 NOT IN (46, 47))) AND (pcol5 NOT IN (46, 47))".to_string()); // test in let predicate = Reference::new("col1") @@ -2384,7 +2384,7 @@ mod tests { .bind(schema.clone(), false) .unwrap(); let result = strict_projection.strict_project(&predicate).unwrap(); - assert_eq!(result.to_string(), "((((pcol1 NOT IN (47, 46)) AND (pcol2 NOT IN (47, 46))) AND (pcol3 NOT IN (47, 46))) AND (pcol4 NOT IN (47, 46))) AND (pcol5 NOT IN (47, 46))".to_string()); + assert_eq!(result.to_string(), "((((pcol1 NOT IN (46, 47)) AND (pcol2 NOT IN (46, 47))) AND (pcol3 NOT IN (46, 47))) AND (pcol4 NOT IN (46, 47))) AND (pcol5 NOT IN (46, 47))".to_string()); // test in let predicate = Reference::new("col1") @@ -2593,7 +2593,7 @@ mod tests { .bind(schema.clone(), false) .unwrap(); let result = strict_projection.strict_project(&predicate).unwrap(); - assert_eq!(result.to_string(), "(((((pcol1 NOT IN (8, 7, 6)) AND (pcol2 NOT IN (8, 7, 6))) AND (pcol3 NOT IN (6, 2))) AND (pcol4 NOT IN (9, 4))) AND (pcol5 NOT IN (4, 6))) AND (pcol6 NOT IN (4, 6))".to_string()); + assert_eq!(result.to_string(), "(((((pcol1 NOT IN (6, 7, 8)) AND (pcol2 NOT IN (6, 7, 8))) AND (pcol3 NOT IN (2, 6))) AND (pcol4 NOT IN (4, 9))) AND (pcol5 NOT IN (4, 6))) AND (pcol6 NOT IN (4, 6))".to_string()); } #[tokio::test] @@ -2690,7 +2690,7 @@ mod tests { .bind(schema.clone(), false) .unwrap(); let result = strict_projection.strict_project(&predicate).unwrap(); - assert_eq!(result.to_string(), "pcol1 IN (101, 100)".to_string()); + assert_eq!(result.to_string(), "pcol1 IN (100, 101)".to_string()); // test not in let predicate = Reference::new("col1") @@ -2698,7 +2698,7 @@ mod tests { .bind(schema.clone(), false) .unwrap(); let result = strict_projection.strict_project(&predicate).unwrap(); - assert_eq!(result.to_string(), "pcol1 NOT IN (101, 100)".to_string()); + assert_eq!(result.to_string(), "pcol1 NOT IN (100, 101)".to_string()); } #[tokio::test] @@ -2880,7 +2880,7 @@ mod tests { let result = strict_projection.strict_project(&predicate).unwrap(); assert_eq!( result.to_string(), - "((pcol1 NOT IN (100, 90)) AND (pcol2 NOT IN (100, 90))) AND (pcol3 NOT IN (10000, 10100, 9900))" + "((pcol1 NOT IN (90, 100)) AND (pcol2 NOT IN (90, 100))) AND (pcol3 NOT IN (9900, 10000, 10100))" .to_string() ); @@ -3103,7 +3103,7 @@ mod tests { let result = strict_projection.strict_project(&predicate).unwrap(); assert_eq!( result.to_string(), - "((pcol1 NOT IN (100, 90)) AND (pcol2 NOT IN (100, 90))) AND (pcol3 NOT IN (9890, 9990, 10090))" + "((pcol1 NOT IN (90, 100)) AND (pcol2 NOT IN (90, 100))) AND (pcol3 NOT IN (9890, 9990, 10090))" .to_string() ); diff --git a/crates/iceberg/src/spec/datatypes.rs b/crates/iceberg/src/spec/datatypes.rs index ad4aea758f..14595ee6d0 100644 --- a/crates/iceberg/src/spec/datatypes.rs +++ b/crates/iceberg/src/spec/datatypes.rs @@ -19,7 +19,6 @@ * Data Types */ use std::collections::HashMap; -use std::convert::identity; use std::fmt; use std::ops::Index; use std::sync::{Arc, OnceLock}; @@ -208,6 +207,8 @@ impl From for Type { #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone, Hash)] #[serde(rename_all = "lowercase", remote = "Self")] pub enum PrimitiveType { + /// Default / null column type used when a more specific type is not known. + Unknown, /// True or False Boolean, /// 32-bit signed integer @@ -364,6 +365,7 @@ where S: Serializer { impl fmt::Display for PrimitiveType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { + PrimitiveType::Unknown => write!(f, "unknown"), PrimitiveType::Boolean => write!(f, "boolean"), PrimitiveType::Int => write!(f, "int"), PrimitiveType::Long => write!(f, "long"), @@ -527,7 +529,7 @@ impl fmt::Display for StructType { } #[derive(Debug, PartialEq, Serialize, Deserialize, Eq, Clone)] -#[serde(from = "SerdeNestedField", into = "SerdeNestedField")] +#[serde(try_from = "SerdeNestedField", into = "SerdeNestedField")] /// A struct is a tuple of typed values. Each field in the tuple is named and has an integer id that is unique in the table schema. /// Each field can be either optional or required, meaning that values can (or cannot) be null. Fields may be any type. /// Fields may have an optional comment or doc string. Fields can have default values. @@ -564,25 +566,30 @@ struct SerdeNestedField { pub write_default: Option, } -impl From for NestedField { - fn from(value: SerdeNestedField) -> Self { - NestedField { +impl TryFrom for NestedField { + type Error = crate::Error; + + fn try_from(value: SerdeNestedField) -> Result { + let initial_default = value + .initial_default + .map(|x| Literal::try_from_json(x, &value.field_type)) + .transpose()? + .flatten(); + let write_default = value + .write_default + .map(|x| Literal::try_from_json(x, &value.field_type)) + .transpose()? + .flatten(); + + Ok(NestedField { id: value.id, name: value.name, required: value.required, - initial_default: value.initial_default.and_then(|x| { - Literal::try_from_json(x, &value.field_type) - .ok() - .and_then(identity) - }), - write_default: value.write_default.and_then(|x| { - Literal::try_from_json(x, &value.field_type) - .ok() - .and_then(identity) - }), + initial_default, + write_default, field_type: value.field_type, doc: value.doc, - } + }) } } @@ -869,6 +876,7 @@ mod tests { { "type": "struct", "fields": [ + {"id": 17, "name": "unknown_field", "required": false, "type": "unknown"}, {"id": 1, "name": "bool_field", "required": true, "type": "boolean"}, {"id": 2, "name": "int_field", "required": true, "type": "int"}, {"id": 3, "name": "long_field", "required": true, "type": "long"}, @@ -893,6 +901,12 @@ mod tests { record, Type::Struct(StructType { fields: vec![ + NestedField::optional( + 17, + "unknown_field", + Type::Primitive(PrimitiveType::Unknown), + ) + .into(), NestedField::required(1, "bool_field", Type::Primitive(PrimitiveType::Boolean)) .into(), NestedField::required(2, "int_field", Type::Primitive(PrimitiveType::Int)) @@ -1274,6 +1288,8 @@ mod tests { for (ty, literal) in pairs { assert!(ty.compatible(&literal)); } + + assert!(!PrimitiveType::Unknown.compatible(&PrimitiveLiteral::Int(1))); } #[test] diff --git a/crates/iceberg/src/spec/schema/mod.rs b/crates/iceberg/src/spec/schema/mod.rs index 13ad41818b..4d43e41155 100644 --- a/crates/iceberg/src/spec/schema/mod.rs +++ b/crates/iceberg/src/spec/schema/mod.rs @@ -127,6 +127,10 @@ impl SchemaBuilder { /// Builds the schema. pub fn build(self) -> Result { + for field in &self.fields { + Self::validate_unknown_type_field(field)?; + } + let field_id_to_accessor = self.build_accessors(); let r#struct = StructType::new(self.fields); @@ -184,6 +188,38 @@ impl SchemaBuilder { Ok(schema) } + fn validate_unknown_type_field(field: &NestedFieldRef) -> Result<()> { + match field.field_type.as_ref() { + Type::Primitive(PrimitiveType::Unknown) => { + ensure_data_valid!( + !field.required, + "Field {} cannot be required because unknown type must be optional", + field.name + ); + ensure_data_valid!( + field.initial_default.is_none() && field.write_default.is_none(), + "Field {} cannot have non-null defaults because unknown type requires null defaults", + field.name + ); + } + Type::Struct(struct_type) => { + for nested_field in struct_type.fields() { + Self::validate_unknown_type_field(nested_field)?; + } + } + Type::List(list_type) => { + Self::validate_unknown_type_field(&list_type.element_field)?; + } + Type::Map(map_type) => { + Self::validate_unknown_type_field(&map_type.key_field)?; + Self::validate_unknown_type_field(&map_type.value_field)?; + } + Type::Primitive(_) => {} + } + + Ok(()) + } + fn build_accessors(&self) -> HashMap> { let mut map = HashMap::new(); @@ -1228,4 +1264,90 @@ table { .is_err() ); } + + #[test] + fn test_unknown_type_deserialization_rejects_non_null_default() { + let schema_json = serde_json::json!({ + "type": "struct", + "schema-id": 1, + "fields": [ + { + "id": 1, + "name": "empty", + "required": false, + "type": "unknown", + "initial-default": 1 + } + ] + }); + + let error = serde_json::from_value::(schema_json).unwrap_err(); + assert!( + error + .to_string() + .contains("did not match any variant of untagged enum SchemaEnum"), + "unexpected error: {error}" + ); + } + + #[test] + fn test_unknown_type_deserialization_accepts_null_defaults() { + let schema_json = serde_json::json!({ + "type": "struct", + "schema-id": 1, + "fields": [ + { + "id": 1, + "name": "empty", + "required": false, + "type": "unknown", + "initial-default": null, + "write-default": null + } + ] + }); + + serde_json::from_value::(schema_json).unwrap(); + } + + #[test] + fn test_unknown_type_must_be_optional_with_null_defaults() { + assert!( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "empty", Primitive(PrimitiveType::Unknown)).into() + ]) + .build() + .is_ok() + ); + + let required_error = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "empty", Primitive(PrimitiveType::Unknown)).into(), + ]) + .build() + .unwrap_err(); + assert!( + required_error + .message() + .contains("unknown type must be optional") + ); + + let default_error = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "empty", Primitive(PrimitiveType::Unknown)) + .with_initial_default(Literal::int(1)) + .into(), + ]) + .build() + .unwrap_err(); + assert!( + default_error + .message() + .contains("unknown type requires null defaults") + ); + } } diff --git a/crates/iceberg/src/spec/values/datum.rs b/crates/iceberg/src/spec/values/datum.rs index 68ea6b3d46..26a94a6eb9 100644 --- a/crates/iceberg/src/spec/values/datum.rs +++ b/crates/iceberg/src/spec/values/datum.rs @@ -368,6 +368,12 @@ impl Datum { /// See [this spec](https://iceberg.apache.org/spec/#binary-single-value-serialization) for reference. pub fn try_from_bytes(bytes: &[u8], data_type: PrimitiveType) -> Result { let literal = match data_type { + PrimitiveType::Unknown => { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + "Cannot create datum for unknown type from bytes", + )); + } PrimitiveType::Boolean => { if bytes.len() == 1 && bytes[0] == 0u8 { PrimitiveLiteral::Boolean(false) diff --git a/crates/iceberg/src/transform/bucket.rs b/crates/iceberg/src/transform/bucket.rs index 52fb72f1cf..befdb019f3 100644 --- a/crates/iceberg/src/transform/bucket.rs +++ b/crates/iceberg/src/transform/bucket.rs @@ -519,7 +519,7 @@ mod test { Datum::string(value), Datum::string(another), ]), - Some("name IN (9, 4)"), + Some("name IN (4, 9)"), )?; fixture.assert_projection( @@ -656,7 +656,7 @@ mod test { Datum::long(value), Datum::long(value + 1), ]), - Some("name IN (8, 7, 6)"), + Some("name IN (6, 7, 8)"), )?; fixture.assert_projection( @@ -716,7 +716,7 @@ mod test { Datum::int(value), Datum::int(value + 1), ]), - Some("name IN (8, 7, 6)"), + Some("name IN (6, 7, 8)"), )?; fixture.assert_projection( diff --git a/crates/iceberg/src/transform/temporal.rs b/crates/iceberg/src/transform/temporal.rs index 1cd4d6a436..890e011ab1 100644 --- a/crates/iceberg/src/transform/temporal.rs +++ b/crates/iceberg/src/transform/temporal.rs @@ -661,7 +661,7 @@ mod test { Datum::timestamp_from_str(value)?, Datum::timestamp_from_str(another)?, ]), - Some("name IN (420034, 412007)"), + Some("name IN (412007, 420034)"), )?; fixture.assert_projection( @@ -807,7 +807,7 @@ mod test { Datum::timestamp_from_str(value)?, Datum::timestamp_from_str(another)?, ]), - Some("name IN (47, 46)"), + Some("name IN (46, 47)"), )?; fixture.assert_projection( @@ -879,7 +879,7 @@ mod test { Datum::timestamp_from_str(value)?, Datum::timestamp_from_str(another)?, ]), - Some("name IN (47, 46)"), + Some("name IN (46, 47)"), )?; fixture.assert_projection( @@ -951,7 +951,7 @@ mod test { Datum::timestamp_from_str(value)?, Datum::timestamp_from_str(another)?, ]), - Some("name IN (0, -1)"), + Some("name IN (-1, 0)"), )?; fixture.assert_projection( @@ -1023,7 +1023,7 @@ mod test { Datum::timestamp_from_str(value)?, Datum::timestamp_from_str(another)?, ]), - Some("name IN (575, 574)"), + Some("name IN (574, 575)"), )?; fixture.assert_projection( @@ -1094,7 +1094,7 @@ mod test { Datum::timestamp_from_str(value)?, Datum::timestamp_from_str(another)?, ]), - Some("name IN (-10, -9, -12, -11)"), + Some("name IN (-12, -11, -10, -9)"), )?; fixture.assert_projection( @@ -1240,7 +1240,7 @@ mod test { Datum::timestamp_from_str(value)?, Datum::timestamp_from_str(another)?, ]), - Some("name IN (1970-01-01, 1969-12-31)"), + Some("name IN (1969-12-31, 1970-01-01)"), )?; fixture.assert_projection( @@ -1314,7 +1314,7 @@ mod test { Datum::timestamp_from_str(value)?, Datum::timestamp_from_str(another)?, ]), - Some("name IN (2017-12-02, 2017-12-01)"), + Some("name IN (2017-12-01, 2017-12-02)"), )?; fixture.assert_projection( @@ -1388,7 +1388,7 @@ mod test { Datum::timestamp_from_str(value)?, Datum::timestamp_from_str(another)?, ]), - Some("name IN (1969-01-02, 1969-01-01, 1969-01-03)"), + Some("name IN (1969-01-01, 1969-01-02, 1969-01-03)"), )?; fixture.assert_projection( @@ -1462,7 +1462,7 @@ mod test { Datum::timestamp_from_str(value)?, Datum::timestamp_from_str(another)?, ]), - Some("name IN (2017-12-02, 2017-12-01)"), + Some("name IN (2017-12-01, 2017-12-02)"), )?; fixture.assert_projection( @@ -1740,7 +1740,7 @@ mod test { Datum::date_from_str(value)?, Datum::date_from_str(another)?, ]), - Some("name IN (-1, -12, -11, 0)"), + Some("name IN (-12, -11, -1, 0)"), )?; fixture.assert_projection( @@ -1808,7 +1808,7 @@ mod test { Datum::date_from_str(value)?, Datum::date_from_str(another)?, ]), - Some("name IN (575, 564)"), + Some("name IN (564, 575)"), )?; fixture.assert_projection( @@ -1876,7 +1876,7 @@ mod test { Datum::date_from_str(value)?, Datum::date_from_str(another)?, ]), - Some("name IN (-1, -12, -11, 0)"), + Some("name IN (-12, -11, -1, 0)"), )?; fixture.assert_projection( @@ -1944,7 +1944,7 @@ mod test { Datum::date_from_str(value)?, Datum::date_from_str(another)?, ]), - Some("name IN (575, 564)"), + Some("name IN (564, 575)"), )?; fixture.assert_projection( @@ -2012,7 +2012,7 @@ mod test { Datum::date_from_str(value)?, Datum::date_from_str(another)?, ]), - Some("name IN (0, -1)"), + Some("name IN (-1, 0)"), )?; fixture.assert_projection( @@ -2079,7 +2079,7 @@ mod test { Datum::date_from_str(value)?, Datum::date_from_str(another)?, ]), - Some("name IN (0, -1)"), + Some("name IN (-1, 0)"), )?; fixture.assert_projection( @@ -2147,7 +2147,7 @@ mod test { Datum::date_from_str(value)?, Datum::date_from_str(another)?, ]), - Some("name IN (47, 46)"), + Some("name IN (46, 47)"), )?; fixture.assert_projection( @@ -2215,7 +2215,7 @@ mod test { Datum::date_from_str(value)?, Datum::date_from_str(another)?, ]), - Some("name IN (0, -1)"), + Some("name IN (-1, 0)"), )?; fixture.assert_projection( @@ -2283,7 +2283,7 @@ mod test { Datum::date_from_str(value)?, Datum::date_from_str(another)?, ]), - Some("name IN (47, 46)"), + Some("name IN (46, 47)"), )?; fixture.assert_projection( diff --git a/crates/iceberg/src/transform/truncate.rs b/crates/iceberg/src/transform/truncate.rs index 4fac48f7d5..cf7e3485e6 100644 --- a/crates/iceberg/src/transform/truncate.rs +++ b/crates/iceberg/src/transform/truncate.rs @@ -469,7 +469,7 @@ mod test { Datum::decimal_from_str(curr)?, Datum::decimal_from_str(next)?, ]), - Some("name IN (10000, 10100, 9900)"), + Some("name IN (9900, 10000, 10100)"), )?; fixture.assert_projection( @@ -524,7 +524,7 @@ mod test { Datum::long(value), Datum::long(value + 1), ]), - Some("name IN (100, 90)"), + Some("name IN (90, 100)"), )?; fixture.assert_projection( @@ -579,7 +579,7 @@ mod test { Datum::long(value), Datum::long(value + 1), ]), - Some("name IN (100, 90)"), + Some("name IN (90, 100)"), )?; fixture.assert_projection( @@ -634,7 +634,7 @@ mod test { Datum::int(value), Datum::int(value + 1), ]), - Some("name IN (100, 90)"), + Some("name IN (90, 100)"), )?; fixture.assert_projection( @@ -689,7 +689,7 @@ mod test { Datum::int(value), Datum::int(value + 1), ]), - Some("name IN (100, 90)"), + Some("name IN (90, 100)"), )?; fixture.assert_projection(