From bbbb49e684daeb0ede9acc5cb5fa93ff2214fcd9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 20 Mar 2026 10:38:15 -0600 Subject: [PATCH 1/6] feat: add native support for get_json_object expression Implement the Spark GetJsonObject expression natively using serde_json for JSON parsing and a custom JSONPath evaluator supporting field access, array indexing, bracket notation, and wildcards. Closes #3162 --- docs/spark_expressions_support.md | 2 +- native/spark-expr/src/comet_scalar_funcs.rs | 4 + .../src/string_funcs/get_json_object.rs | 465 ++++++++++++++++++ native/spark-expr/src/string_funcs/mod.rs | 2 + .../apache/comet/serde/QueryPlanSerde.scala | 1 + .../org/apache/comet/serde/strings.scala | 20 +- .../expressions/string/get_json_object.sql | 112 +++++ 7 files changed, 604 insertions(+), 2 deletions(-) create mode 100644 native/spark-expr/src/string_funcs/get_json_object.rs create mode 100644 spark/src/test/resources/sql-tests/expressions/string/get_json_object.sql diff --git a/docs/spark_expressions_support.md b/docs/spark_expressions_support.md index 5474894108..6892eb358d 100644 --- a/docs/spark_expressions_support.md +++ b/docs/spark_expressions_support.md @@ -245,7 +245,7 @@ ### json_funcs - [ ] from_json -- [ ] get_json_object +- [x] get_json_object - [ ] json_array_length - [ ] json_object_keys - [ ] json_tuple diff --git a/native/spark-expr/src/comet_scalar_funcs.rs b/native/spark-expr/src/comet_scalar_funcs.rs index ff75de763b..48cf00bc69 100644 --- a/native/spark-expr/src/comet_scalar_funcs.rs +++ b/native/spark-expr/src/comet_scalar_funcs.rs @@ -181,6 +181,10 @@ pub fn create_comet_physical_fun_with_eval_mode( let func = Arc::new(crate::string_funcs::spark_split); make_comet_scalar_udf!("split", func, without data_type) } + "get_json_object" => { + let func = Arc::new(crate::string_funcs::spark_get_json_object); + make_comet_scalar_udf!("get_json_object", func, without data_type) + } _ => registry.udf(fun_name).map_err(|e| { DataFusionError::Execution(format!( "Function {fun_name} not found in the registry: {e}", diff --git a/native/spark-expr/src/string_funcs/get_json_object.rs b/native/spark-expr/src/string_funcs/get_json_object.rs new file mode 100644 index 0000000000..21e4116c5a --- /dev/null +++ b/native/spark-expr/src/string_funcs/get_json_object.rs @@ -0,0 +1,465 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{Array, ArrayRef, StringArray}; +use datafusion::common::{ + cast::as_generic_string_array, exec_err, Result as DataFusionResult, ScalarValue, +}; +use datafusion::logical_expr::ColumnarValue; +use serde_json::Value; +use std::sync::Arc; + +/// Spark-compatible `get_json_object` function. +/// +/// Extracts a JSON value from a JSON string using a JSONPath expression. +/// Returns the result as a string, or null if the path doesn't match or input is invalid. +/// +/// Supported JSONPath syntax: +/// - `$` — root element +/// - `.name` or `['name']` — named child +/// - `[n]` — array index (0-based) +/// - `[*]` — array wildcard +/// - `.*` or `['*']` — object wildcard +pub fn spark_get_json_object(args: &[ColumnarValue]) -> DataFusionResult { + if args.len() != 2 { + return exec_err!( + "get_json_object expects 2 arguments (json, path), got {}", + args.len() + ); + } + + match (&args[0], &args[1]) { + // Column json, scalar path (most common case) + (ColumnarValue::Array(json_array), ColumnarValue::Scalar(path_scalar)) => { + let path_str = match path_scalar { + ScalarValue::Utf8(Some(p)) | ScalarValue::LargeUtf8(Some(p)) => p.as_str(), + ScalarValue::Utf8(None) | ScalarValue::LargeUtf8(None) => { + let null_array: ArrayRef = Arc::new(StringArray::new_null(json_array.len())); + return Ok(ColumnarValue::Array(null_array)); + } + _ => return exec_err!("get_json_object path must be a string"), + }; + + let parsed_path = match parse_json_path(path_str) { + Some(p) => p, + None => { + let null_array: ArrayRef = Arc::new(StringArray::new_null(json_array.len())); + return Ok(ColumnarValue::Array(null_array)); + } + }; + + let json_strings = as_generic_string_array::(json_array)?; + let mut builder = StringBuilder::new(); + + for i in 0..json_strings.len() { + if json_strings.is_null(i) { + builder.append_null(); + } else { + let json_str = json_strings.value(i); + match evaluate_path(json_str, &parsed_path) { + Some(result) => builder.append_value(&result), + None => builder.append_null(), + } + } + } + + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } + // Scalar json, scalar path + (ColumnarValue::Scalar(json_scalar), ColumnarValue::Scalar(path_scalar)) => { + let json_str = match json_scalar { + ScalarValue::Utf8(Some(s)) | ScalarValue::LargeUtf8(Some(s)) => s.as_str(), + ScalarValue::Utf8(None) | ScalarValue::LargeUtf8(None) => { + return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))); + } + _ => return exec_err!("get_json_object json must be a string"), + }; + let path_str = match path_scalar { + ScalarValue::Utf8(Some(p)) | ScalarValue::LargeUtf8(Some(p)) => p.as_str(), + ScalarValue::Utf8(None) | ScalarValue::LargeUtf8(None) => { + return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))); + } + _ => return exec_err!("get_json_object path must be a string"), + }; + + let parsed_path = match parse_json_path(path_str) { + Some(p) => p, + None => return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))), + }; + + let result = evaluate_path(json_str, &parsed_path); + Ok(ColumnarValue::Scalar(ScalarValue::Utf8(result))) + } + // Column json, column path + (ColumnarValue::Array(json_array), ColumnarValue::Array(path_array)) => { + let json_strings = as_generic_string_array::(json_array)?; + let path_strings = as_generic_string_array::(path_array)?; + let mut builder = StringBuilder::new(); + + for i in 0..json_strings.len() { + if json_strings.is_null(i) || path_strings.is_null(i) { + builder.append_null(); + } else { + let json_str = json_strings.value(i); + let path_str = path_strings.value(i); + match parse_json_path(path_str) { + Some(parsed_path) => match evaluate_path(json_str, &parsed_path) { + Some(result) => builder.append_value(&result), + None => builder.append_null(), + }, + None => builder.append_null(), + } + } + } + + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } + _ => exec_err!("get_json_object: unsupported argument types"), + } +} + +use arrow::array::StringBuilder; + +/// A parsed JSONPath segment. +#[derive(Debug, Clone)] +enum PathSegment { + /// Named field: `.name` or `['name']` + Field(String), + /// Array index: `[n]` + Index(usize), + /// Wildcard: `[*]` or `.*` + Wildcard, +} + +/// Parse a Spark-compatible JSONPath expression. +/// Returns None for invalid paths. +fn parse_json_path(path: &str) -> Option> { + let mut chars = path.chars().peekable(); + + // Must start with '$' + if chars.next()? != '$' { + return None; + } + + let mut segments = Vec::new(); + + while chars.peek().is_some() { + match chars.peek()? { + '.' => { + chars.next(); + if chars.peek() == Some(&'.') { + // Recursive descent not supported + return None; + } + if chars.peek() == Some(&'*') { + chars.next(); + segments.push(PathSegment::Wildcard); + } else { + // Read field name + let mut name = String::new(); + while let Some(&c) = chars.peek() { + if c == '.' || c == '[' { + break; + } + name.push(c); + chars.next(); + } + if name.is_empty() { + return None; + } + segments.push(PathSegment::Field(name)); + } + } + '[' => { + chars.next(); + if chars.peek() == Some(&'\'') { + // Bracket notation with quotes: ['name'] or ['*'] + chars.next(); + let mut name = String::new(); + loop { + match chars.next()? { + '\'' => break, + c => name.push(c), + } + } + if chars.next()? != ']' { + return None; + } + if name == "*" { + segments.push(PathSegment::Wildcard); + } else { + segments.push(PathSegment::Field(name)); + } + } else if chars.peek() == Some(&'*') { + // [*] + chars.next(); + if chars.next()? != ']' { + return None; + } + segments.push(PathSegment::Wildcard); + } else { + // [n] — numeric index + let mut num_str = String::new(); + while let Some(&c) = chars.peek() { + if c == ']' { + break; + } + num_str.push(c); + chars.next(); + } + if chars.next()? != ']' { + return None; + } + let idx: usize = num_str.parse().ok()?; + segments.push(PathSegment::Index(idx)); + } + } + _ => { + // Unexpected character + return None; + } + } + } + + Some(segments) +} + +/// Evaluate a parsed JSONPath against a JSON string. +/// Returns the result as a string, or None if no match. +fn evaluate_path(json_str: &str, path: &[PathSegment]) -> Option { + let value: Value = serde_json::from_str(json_str).ok()?; + let results = evaluate_segments(&value, path); + + match results.len() { + 0 => None, + 1 => value_to_string(results[0]), + _ => { + // Multiple results: wrap in JSON array + let arr = Value::Array(results.into_iter().cloned().collect()); + Some(arr.to_string()) + } + } +} + +/// Recursively evaluate path segments against a JSON value. +/// Returns references to all matching values. +fn evaluate_segments<'a>(value: &'a Value, segments: &[PathSegment]) -> Vec<&'a Value> { + if segments.is_empty() { + return vec![value]; + } + + let segment = &segments[0]; + let rest = &segments[1..]; + + match segment { + PathSegment::Field(name) => match value { + Value::Object(map) => match map.get(name) { + Some(v) => evaluate_segments(v, rest), + None => vec![], + }, + _ => vec![], + }, + PathSegment::Index(idx) => match value { + Value::Array(arr) => match arr.get(*idx) { + Some(v) => evaluate_segments(v, rest), + None => vec![], + }, + _ => vec![], + }, + PathSegment::Wildcard => match value { + Value::Array(arr) => arr + .iter() + .flat_map(|v| evaluate_segments(v, rest)) + .collect(), + Value::Object(map) => map + .values() + .flat_map(|v| evaluate_segments(v, rest)) + .collect(), + _ => vec![], + }, + } +} + +/// Convert a JSON value to its string representation matching Spark behavior. +/// - Strings are returned without quotes +/// - null returns None +/// - Numbers, booleans, objects, arrays are serialized as JSON +fn value_to_string(value: &Value) -> Option { + match value { + Value::Null => None, + Value::String(s) => Some(s.clone()), + _ => Some(value.to_string()), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_json_path() { + // Root only + let path = parse_json_path("$").unwrap(); + assert!(path.is_empty()); + + // Simple field + let path = parse_json_path("$.name").unwrap(); + assert!(matches!(&path[0], PathSegment::Field(n) if n == "name")); + + // Array index + let path = parse_json_path("$[0]").unwrap(); + assert!(matches!(&path[0], PathSegment::Index(0))); + + // Bracket notation + let path = parse_json_path("$['key with spaces']").unwrap(); + assert!(matches!(&path[0], PathSegment::Field(n) if n == "key with spaces")); + + // Wildcard + let path = parse_json_path("$[*]").unwrap(); + assert!(matches!(&path[0], PathSegment::Wildcard)); + + let path = parse_json_path("$.*").unwrap(); + assert!(matches!(&path[0], PathSegment::Wildcard)); + + // Recursive descent not supported + assert!(parse_json_path("$..name").is_none()); + + // Must start with $ + assert!(parse_json_path("name").is_none()); + assert!(parse_json_path("[0]").is_none()); + } + + #[test] + fn test_evaluate_simple_field() { + assert_eq!( + evaluate_path( + r#"{"name":"John","age":30}"#, + &parse_json_path("$.name").unwrap() + ), + Some("John".to_string()) + ); + assert_eq!( + evaluate_path( + r#"{"name":"John","age":30}"#, + &parse_json_path("$.age").unwrap() + ), + Some("30".to_string()) + ); + } + + #[test] + fn test_evaluate_nested() { + let json = r#"{"user":{"profile":{"name":"Alice"}}}"#; + assert_eq!( + evaluate_path(json, &parse_json_path("$.user.profile.name").unwrap()), + Some("Alice".to_string()) + ); + } + + #[test] + fn test_evaluate_array_index() { + assert_eq!( + evaluate_path(r#"[1,2,3]"#, &parse_json_path("$[0]").unwrap()), + Some("1".to_string()) + ); + assert_eq!( + evaluate_path(r#"[1,2,3]"#, &parse_json_path("$[3]").unwrap()), + None + ); + } + + #[test] + fn test_evaluate_root() { + assert_eq!( + evaluate_path(r#"{"a":"b"}"#, &parse_json_path("$").unwrap()), + Some(r#"{"a":"b"}"#.to_string()) + ); + } + + #[test] + fn test_evaluate_null_value() { + assert_eq!( + evaluate_path(r#"{"a":null}"#, &parse_json_path("$.a").unwrap()), + None + ); + } + + #[test] + fn test_evaluate_missing_field() { + assert_eq!( + evaluate_path(r#"{"a":"b"}"#, &parse_json_path("$.c").unwrap()), + None + ); + } + + #[test] + fn test_evaluate_invalid_json() { + assert_eq!( + evaluate_path("not json", &parse_json_path("$.a").unwrap()), + None + ); + } + + #[test] + fn test_evaluate_wildcard() { + let json = r#"[{"a":"b"},{"a":"c"}]"#; + assert_eq!( + evaluate_path(json, &parse_json_path("$[*].a").unwrap()), + Some(r#"["b","c"]"#.to_string()) + ); + } + + #[test] + fn test_evaluate_string_unquoted() { + // Strings should be returned without quotes + assert_eq!( + evaluate_path(r#"["a","b"]"#, &parse_json_path("$[1]").unwrap()), + Some("b".to_string()) + ); + } + + #[test] + fn test_evaluate_nested_array_field() { + let json = r#"{"items":["apple","banana","cherry"]}"#; + assert_eq!( + evaluate_path(json, &parse_json_path("$.items[1]").unwrap()), + Some("banana".to_string()) + ); + } + + #[test] + fn test_evaluate_bracket_notation_with_spaces() { + let json = r#"{"key with spaces":"it works"}"#; + assert_eq!( + evaluate_path(json, &parse_json_path("$['key with spaces']").unwrap()), + Some("it works".to_string()) + ); + } + + #[test] + fn test_evaluate_boolean_and_nested_object() { + let json = r#"{"a":true,"b":{"c":1}}"#; + assert_eq!( + evaluate_path(json, &parse_json_path("$.a").unwrap()), + Some("true".to_string()) + ); + assert_eq!( + evaluate_path(json, &parse_json_path("$.b").unwrap()), + Some(r#"{"c":1}"#.to_string()) + ); + } +} diff --git a/native/spark-expr/src/string_funcs/mod.rs b/native/spark-expr/src/string_funcs/mod.rs index 919b9dc197..bb785bdb44 100644 --- a/native/spark-expr/src/string_funcs/mod.rs +++ b/native/spark-expr/src/string_funcs/mod.rs @@ -16,9 +16,11 @@ // under the License. mod contains; +mod get_json_object; mod split; mod substring; pub use contains::SparkContains; +pub use get_json_object::spark_get_json_object; pub use split::spark_split; pub use substring::SubstringExpr; diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 8c39ba779d..1942d6b3fe 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -156,6 +156,7 @@ object QueryPlanSerde extends Logging with CometExprShim { classOf[Concat] -> CometConcat, classOf[Contains] -> CometScalarFunction("contains"), classOf[EndsWith] -> CometScalarFunction("ends_with"), + classOf[GetJsonObject] -> CometGetJsonObject, classOf[InitCap] -> CometInitCap, classOf[Length] -> CometLength, classOf[Like] -> CometLike, diff --git a/spark/src/main/scala/org/apache/comet/serde/strings.scala b/spark/src/main/scala/org/apache/comet/serde/strings.scala index 64ba644048..47dfb95e3d 100644 --- a/spark/src/main/scala/org/apache/comet/serde/strings.scala +++ b/spark/src/main/scala/org/apache/comet/serde/strings.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Concat, ConcatWs, Expression, If, InitCap, IsNull, Left, Length, Like, Literal, Lower, RegExpReplace, Right, RLike, StringLPad, StringRepeat, StringRPad, StringSplit, Substring, Upper} +import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Concat, ConcatWs, Expression, GetJsonObject, If, InitCap, IsNull, Left, Length, Like, Literal, Lower, RegExpReplace, Right, RLike, StringLPad, StringRepeat, StringRPad, StringSplit, Substring, Upper} import org.apache.spark.sql.types.{BinaryType, DataTypes, LongType, StringType} import org.apache.spark.unsafe.types.UTF8String @@ -382,6 +382,24 @@ object CometStringSplit extends CometExpressionSerde[StringSplit] { } } +object CometGetJsonObject extends CometExpressionSerde[GetJsonObject] { + + override def convert( + expr: GetJsonObject, + inputs: Seq[Attribute], + binding: Boolean): Option[Expr] = { + val jsonExpr = exprToProtoInternal(expr.json, inputs, binding) + val pathExpr = exprToProtoInternal(expr.path, inputs, binding) + val optExpr = scalarFunctionExprToProtoWithReturnType( + "get_json_object", + expr.dataType, + false, + jsonExpr, + pathExpr) + optExprWithInfo(optExpr, expr, expr.json, expr.path) + } +} + trait CommonStringExprs { def stringDecode( diff --git a/spark/src/test/resources/sql-tests/expressions/string/get_json_object.sql b/spark/src/test/resources/sql-tests/expressions/string/get_json_object.sql new file mode 100644 index 0000000000..d754dda9aa --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/string/get_json_object.sql @@ -0,0 +1,112 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- ConfigMatrix: parquet.enable.dictionary=false,true + +statement +CREATE TABLE test_get_json_object(json_str string, path string) USING parquet + +statement +INSERT INTO test_get_json_object VALUES ('{"name":"John","age":30}', '$.name'), ('{"a":{"b":{"c":"deep"}}}', '$.a.b.c'), ('{"items":[1,2,3]}', '$.items[1]'), (NULL, '$.name'), ('{"name":"Alice"}', NULL), ('invalid json', '$.name'), ('{"a":"b"}', '$'), ('{"key":"value"}', '$.missing'), ('{"arr":["x","y","z"]}', '$.arr[0]'), ('{"a":null}', '$.a') + +-- column json, column path +query +SELECT get_json_object(json_str, path) FROM test_get_json_object + +-- column json, literal path +query +SELECT get_json_object(json_str, '$.name') FROM test_get_json_object + +-- simple field extraction +query +SELECT get_json_object('{"name":"John","age":30}', '$.name') + +-- numeric value +query +SELECT get_json_object('{"name":"John","age":30}', '$.age') + +-- nested field +query +SELECT get_json_object('{"user":{"profile":{"name":"Alice"}}}', '$.user.profile.name') + +-- array index +query +SELECT get_json_object('[1,2,3]', '$[0]'), get_json_object('[1,2,3]', '$[2]') + +-- out of bounds array index +query +SELECT get_json_object('[1,2,3]', '$[3]') + +-- nested array field +query +SELECT get_json_object('{"items":["apple","banana","cherry"]}', '$.items[1]') + +-- root path returns entire input +query +SELECT get_json_object('{"a":"b"}', '$') + +-- null json +query +SELECT get_json_object(NULL, '$.name') + +-- null path +query +SELECT get_json_object('{"a":"b"}', NULL) + +-- invalid json returns null +query +SELECT get_json_object('not json', '$.a') + +-- missing field returns null +query +SELECT get_json_object('{"a":"b"}', '$.c') + +-- json null value returns null +query +SELECT get_json_object('{"a":null}', '$.a') + +-- boolean value +query +SELECT get_json_object('{"flag":true}', '$.flag') + +-- nested object returned as json +query +SELECT get_json_object('{"a":{"b":1}}', '$.a') + +-- array returned as json +query +SELECT get_json_object('{"a":[1,2,3]}', '$.a') + +-- bracket notation +query +SELECT get_json_object('{"key with spaces":"it works"}', '$[''key with spaces'']') + +-- wildcard on array +query +SELECT get_json_object('[{"a":"b"},{"a":"c"}]', '$[*].a') + +-- empty string json +query +SELECT get_json_object('', '$.a') + +-- deeply nested +query +SELECT get_json_object('{"a":{"b":{"c":{"d":"found"}}}}', '$.a.b.c.d') + +-- array of arrays +query +SELECT get_json_object('[[1,2],[3,4]]', '$[0][1]') From 257af9e0928b87507a74e2f65050f5e54cb3aba2 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 20 Mar 2026 10:40:02 -0600 Subject: [PATCH 2/6] feat: add getSupportLevel for CometGetJsonObject Mark as Incompatible since Spark's Jackson parser allows single-quoted JSON and unescaped control characters which serde_json does not support. Add allowIncompatible config to SQL test file. --- spark/src/main/scala/org/apache/comet/serde/strings.scala | 6 ++++++ .../sql-tests/expressions/string/get_json_object.sql | 1 + 2 files changed, 7 insertions(+) diff --git a/spark/src/main/scala/org/apache/comet/serde/strings.scala b/spark/src/main/scala/org/apache/comet/serde/strings.scala index 47dfb95e3d..50621fc389 100644 --- a/spark/src/main/scala/org/apache/comet/serde/strings.scala +++ b/spark/src/main/scala/org/apache/comet/serde/strings.scala @@ -384,6 +384,12 @@ object CometStringSplit extends CometExpressionSerde[StringSplit] { object CometGetJsonObject extends CometExpressionSerde[GetJsonObject] { + override def getSupportLevel(expr: GetJsonObject): SupportLevel = + Incompatible( + Some( + "Spark allows single-quoted JSON and unescaped control characters " + + "which Comet does not support")) + override def convert( expr: GetJsonObject, inputs: Seq[Attribute], diff --git a/spark/src/test/resources/sql-tests/expressions/string/get_json_object.sql b/spark/src/test/resources/sql-tests/expressions/string/get_json_object.sql index d754dda9aa..87ba6bbb37 100644 --- a/spark/src/test/resources/sql-tests/expressions/string/get_json_object.sql +++ b/spark/src/test/resources/sql-tests/expressions/string/get_json_object.sql @@ -15,6 +15,7 @@ -- specific language governing permissions and limitations -- under the License. +-- Config: spark.comet.expression.GetJsonObject.allowIncompatible=true -- ConfigMatrix: parquet.enable.dictionary=false,true statement From b7ddc4a742c04c4e9f7b79a1fcd5f4469603ada1 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 20 Mar 2026 10:55:35 -0600 Subject: [PATCH 3/6] fix: address self-review findings for get_json_object - Enable serde_json preserve_order feature to maintain JSON key ordering - Fix wildcard to only work on arrays (not objects), matching Spark - Fix single wildcard match to preserve JSON string quoting - Add user-facing docs in expressions.md - Add more SQL tests: object wildcard, single match, missing fields, invalid paths, field names with special chars, key ordering - Add Rust unit tests for new edge cases --- docs/source/user-guide/latest/expressions.md | 6 ++ native/Cargo.lock | 1 + native/spark-expr/Cargo.toml | 2 +- .../src/string_funcs/get_json_object.rs | 83 +++++++++++++++++-- .../expressions/string/get_json_object.sql | 44 ++++++++++ 5 files changed, 130 insertions(+), 6 deletions(-) diff --git a/docs/source/user-guide/latest/expressions.md b/docs/source/user-guide/latest/expressions.md index 57b7a3455a..9b6511d42d 100644 --- a/docs/source/user-guide/latest/expressions.md +++ b/docs/source/user-guide/latest/expressions.md @@ -90,6 +90,12 @@ Expressions that are not Spark-compatible will fall back to Spark by default and | Substring | Yes | | | Upper | No | Results can vary depending on locale and character set. Requires `spark.comet.caseConversion.enabled=true` | +## JSON Functions + +| Expression | Spark-Compatible? | Compatibility Notes | +| --------------- | ----------------- | ----------------------------------------------------------------------------------------------------------------------------- | +| GetJsonObject | No | Spark allows single-quoted JSON and unescaped control characters which Comet does not support | + ## Date/Time Functions | Expression | SQL | Spark-Compatible? | Compatibility Notes | diff --git a/native/Cargo.lock b/native/Cargo.lock index ef3ce87ad3..b467788c60 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -5514,6 +5514,7 @@ version = "1.0.149" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" dependencies = [ + "indexmap 2.13.0", "itoa", "memchr", "serde", diff --git a/native/spark-expr/Cargo.toml b/native/spark-expr/Cargo.toml index d4639c86ea..ffc395317f 100644 --- a/native/spark-expr/Cargo.toml +++ b/native/spark-expr/Cargo.toml @@ -34,7 +34,7 @@ chrono-tz = { workspace = true } num = { workspace = true } regex = { workspace = true } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0" +serde_json = { version = "1.0", features = ["preserve_order"] } thiserror = { workspace = true } futures = { workspace = true } twox-hash = "2.1.2" diff --git a/native/spark-expr/src/string_funcs/get_json_object.rs b/native/spark-expr/src/string_funcs/get_json_object.rs index 21e4116c5a..af08ae0e5e 100644 --- a/native/spark-expr/src/string_funcs/get_json_object.rs +++ b/native/spark-expr/src/string_funcs/get_json_object.rs @@ -242,11 +242,28 @@ fn parse_json_path(path: &str) -> Option> { /// Returns the result as a string, or None if no match. fn evaluate_path(json_str: &str, path: &[PathSegment]) -> Option { let value: Value = serde_json::from_str(json_str).ok()?; + + // Check if path contains any wildcards + let has_wildcard = path.iter().any(|s| matches!(s, PathSegment::Wildcard)); + let results = evaluate_segments(&value, path); match results.len() { 0 => None, - 1 => value_to_string(results[0]), + 1 if !has_wildcard => value_to_string(results[0]), + 1 => { + // Single wildcard match: Spark strips outer array brackets from the + // JSON representation. For strings this means the quotes are preserved. + // We simulate by serializing as a JSON array and stripping the brackets. + let arr_str = serde_json::to_string(&Value::Array(vec![results[0].clone()])).ok()?; + // Strip leading '[' and trailing ']' + let inner = &arr_str[1..arr_str.len() - 1]; + if inner == "null" { + None + } else { + Some(inner.to_string()) + } + } _ => { // Multiple results: wrap in JSON array let arr = Value::Array(results.into_iter().cloned().collect()); @@ -285,10 +302,6 @@ fn evaluate_segments<'a>(value: &'a Value, segments: &[PathSegment]) -> Vec<&'a .iter() .flat_map(|v| evaluate_segments(v, rest)) .collect(), - Value::Object(map) => map - .values() - .flat_map(|v| evaluate_segments(v, rest)) - .collect(), _ => vec![], }, } @@ -462,4 +475,64 @@ mod tests { Some(r#"{"c":1}"#.to_string()) ); } + + #[test] + fn test_object_key_order_preserved() { + // serde_json with preserve_order feature should maintain insertion order + let json = r#"{"z":1,"a":2}"#; + assert_eq!( + evaluate_path(json, &parse_json_path("$").unwrap()), + Some(r#"{"z":1,"a":2}"#.to_string()) + ); + } + + #[test] + fn test_wildcard_single_match() { + // Single wildcard match on string: Spark preserves JSON quotes + let json = r#"[{"a":"only"}]"#; + assert_eq!( + evaluate_path(json, &parse_json_path("$[*].a").unwrap()), + Some(r#""only""#.to_string()) + ); + + // Single wildcard match on number: no quotes + let json = r#"[{"a":42}]"#; + assert_eq!( + evaluate_path(json, &parse_json_path("$[*].a").unwrap()), + Some("42".to_string()) + ); + } + + #[test] + fn test_wildcard_missing_fields() { + // Wildcard should skip elements where the field is missing + let json = r#"[{"a":1},{"b":2},{"a":3}]"#; + assert_eq!( + evaluate_path(json, &parse_json_path("$[*].a").unwrap()), + Some("[1,3]".to_string()) + ); + } + + #[test] + fn test_field_with_colon() { + let json = r#"{"fb:testid":"123"}"#; + assert_eq!( + evaluate_path(json, &parse_json_path("$.fb:testid").unwrap()), + Some("123".to_string()) + ); + } + + #[test] + fn test_dot_bracket_invalid() { + // $.[0] is not valid path syntax in Spark + assert!(parse_json_path("$.[0]").is_none()); + } + + #[test] + fn test_object_wildcard() { + // Spark returns null for $.* on objects (wildcard only works in array contexts) + let json = r#"{"a":1,"b":2,"c":3}"#; + let result = evaluate_path(json, &parse_json_path("$.*").unwrap()); + assert_eq!(result, None); + } } diff --git a/spark/src/test/resources/sql-tests/expressions/string/get_json_object.sql b/spark/src/test/resources/sql-tests/expressions/string/get_json_object.sql index 87ba6bbb37..aa2f482077 100644 --- a/spark/src/test/resources/sql-tests/expressions/string/get_json_object.sql +++ b/spark/src/test/resources/sql-tests/expressions/string/get_json_object.sql @@ -111,3 +111,47 @@ SELECT get_json_object('{"a":{"b":{"c":{"d":"found"}}}}', '$.a.b.c.d') -- array of arrays query SELECT get_json_object('[[1,2],[3,4]]', '$[0][1]') + +-- wildcard on simple array returns the whole array +query +SELECT get_json_object('[1,2,3]', '$[*]') + +-- object wildcard (Spark returns null for $.* on objects) +query +SELECT get_json_object('{"a":1,"b":2,"c":3}', '$.*') + +-- single wildcard match returns unwrapped value +query +SELECT get_json_object('[{"a":"only"}]', '$[*].a') + +-- wildcard with missing fields in some elements +query +SELECT get_json_object('[{"a":1},{"b":2},{"a":3}]', '$[*].a') + +-- invalid path: recursive descent +query +SELECT get_json_object('{"a":1}', '$..a') + +-- invalid path: no dollar sign +query +SELECT get_json_object('{"a":1}', 'a') + +-- invalid path: dot-bracket +query +SELECT get_json_object('[1,2,3]', '$.[0]') + +-- field name with special chars +query +SELECT get_json_object('{"fb:testid":"123"}', '$.fb:testid') + +-- escaped quotes in string values +query +SELECT get_json_object('{"a":"b\\"c"}', '$.a') + +-- numeric precision preserved +query +SELECT get_json_object('{"price":8.95}', '$.price') + +-- object key ordering preserved in output +query +SELECT get_json_object('{"z":1,"a":2}', '$') From b73fdc1072944e7969c8fbda50940794430e5e1c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 20 Mar 2026 11:10:45 -0600 Subject: [PATCH 4/6] feat: add microbenchmark for get_json_object Benchmarks simple field, numeric field, nested field, array element, and nested object extraction with 1M rows of JSON data. --- .../CometGetJsonObjectBenchmark.scala | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 spark/src/test/scala/org/apache/spark/sql/benchmark/CometGetJsonObjectBenchmark.scala diff --git a/spark/src/test/scala/org/apache/spark/sql/benchmark/CometGetJsonObjectBenchmark.scala b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometGetJsonObjectBenchmark.scala new file mode 100644 index 0000000000..fa63945347 --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/benchmark/CometGetJsonObjectBenchmark.scala @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.benchmark + +/** + * Benchmark to measure performance of Comet get_json_object expression. To run this benchmark: + * {{{ + * SPARK_GENERATE_BENCHMARK_FILES=1 make benchmark-org.apache.spark.sql.benchmark.CometGetJsonObjectBenchmark + * }}} + * Results will be written to "spark/benchmarks/CometGetJsonObjectBenchmark-**results.txt". + */ +object CometGetJsonObjectBenchmark extends CometBenchmarkBase { + + override def runCometBenchmark(mainArgs: Array[String]): Unit = { + val numRows = 1024 * 1024 + runBenchmarkWithTable("get_json_object", numRows) { v => + withTempPath { dir => + withTempTable("parquetV1Table") { + import spark.implicits._ + + prepareTable( + dir, + spark + .range(numRows) + .map { i => + val name = s"user_$i" + val age = (i % 80 + 18).toInt + val nested = s"""{"city":"city_${i % 100}","zip":"${10000 + i % 90000}"}""" + val items = + (0 until (i % 5 + 1).toInt).map(j => s""""item_$j"""").mkString("[", ",", "]") + s"""{"name":"$name","age":$age,"address":$nested,"items":$items}""" + } + .toDF("c1")) + + val extraConfigs = + Map("spark.comet.expression.GetJsonObject.allowIncompatible" -> "true") + + val benchmarks = List( + StringExprConfig( + "simple field", + "select get_json_object(c1, '$.name') from parquetV1Table", + extraConfigs), + StringExprConfig( + "numeric field", + "select get_json_object(c1, '$.age') from parquetV1Table", + extraConfigs), + StringExprConfig( + "nested field", + "select get_json_object(c1, '$.address.city') from parquetV1Table", + extraConfigs), + StringExprConfig( + "array element", + "select get_json_object(c1, '$.items[0]') from parquetV1Table", + extraConfigs), + StringExprConfig( + "nested object", + "select get_json_object(c1, '$.address') from parquetV1Table", + extraConfigs)) + + benchmarks.foreach { config => + runBenchmark(config.name) { + runExpressionBenchmark(config.name, v, config.query, config.extraCometConfigs) + } + } + } + } + } + } +} From 3d6d560735e9d5bdd6a960376aed19d7e7288f80 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 20 Mar 2026 11:17:55 -0600 Subject: [PATCH 5/6] refactor: simplify get_json_object implementation - Move StringBuilder import to top-level imports - Fix doc comment to not mention object wildcard (unsupported) - Pre-compute has_wildcard in ParsedPath struct (avoids per-row scan) - Split evaluation into evaluate_no_wildcard (returns Option, zero Vec allocations) and evaluate_with_wildcard (returns Vec) - Simplify single wildcard match: serialize directly instead of array-wrap-then-strip-brackets hack - Add comment in Cargo.toml explaining preserve_order requirement --- native/spark-expr/Cargo.toml | 1 + .../src/string_funcs/get_json_object.rs | 222 +++++++++--------- 2 files changed, 111 insertions(+), 112 deletions(-) diff --git a/native/spark-expr/Cargo.toml b/native/spark-expr/Cargo.toml index ffc395317f..dc78303e07 100644 --- a/native/spark-expr/Cargo.toml +++ b/native/spark-expr/Cargo.toml @@ -34,6 +34,7 @@ chrono-tz = { workspace = true } num = { workspace = true } regex = { workspace = true } serde = { version = "1.0", features = ["derive"] } +# preserve_order: needed for get_json_object to match Spark's JSON key ordering serde_json = { version = "1.0", features = ["preserve_order"] } thiserror = { workspace = true } futures = { workspace = true } diff --git a/native/spark-expr/src/string_funcs/get_json_object.rs b/native/spark-expr/src/string_funcs/get_json_object.rs index af08ae0e5e..c1b30d2806 100644 --- a/native/spark-expr/src/string_funcs/get_json_object.rs +++ b/native/spark-expr/src/string_funcs/get_json_object.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{Array, ArrayRef, StringArray}; +use arrow::array::{Array, ArrayRef, StringArray, StringBuilder}; use datafusion::common::{ cast::as_generic_string_array, exec_err, Result as DataFusionResult, ScalarValue, }; @@ -32,8 +32,7 @@ use std::sync::Arc; /// - `$` — root element /// - `.name` or `['name']` — named child /// - `[n]` — array index (0-based) -/// - `[*]` — array wildcard -/// - `.*` or `['*']` — object wildcard +/// - `[*]` — array wildcard (iterates over array elements) pub fn spark_get_json_object(args: &[ColumnarValue]) -> DataFusionResult { if args.len() != 2 { return exec_err!( @@ -132,8 +131,6 @@ pub fn spark_get_json_object(args: &[ColumnarValue]) -> DataFusionResult, + has_wildcard: bool, +} + /// Parse a Spark-compatible JSONPath expression. /// Returns None for invalid paths. -fn parse_json_path(path: &str) -> Option> { +fn parse_json_path(path: &str) -> Option { let mut chars = path.chars().peekable(); // Must start with '$' @@ -156,6 +159,7 @@ fn parse_json_path(path: &str) -> Option> { } let mut segments = Vec::new(); + let mut has_wildcard = false; while chars.peek().is_some() { match chars.peek()? { @@ -168,6 +172,7 @@ fn parse_json_path(path: &str) -> Option> { if chars.peek() == Some(&'*') { chars.next(); segments.push(PathSegment::Wildcard); + has_wildcard = true; } else { // Read field name let mut name = String::new(); @@ -201,6 +206,7 @@ fn parse_json_path(path: &str) -> Option> { } if name == "*" { segments.push(PathSegment::Wildcard); + has_wildcard = true; } else { segments.push(PathSegment::Field(name)); } @@ -211,6 +217,7 @@ fn parse_json_path(path: &str) -> Option> { return None; } segments.push(PathSegment::Wildcard); + has_wildcard = true; } else { // [n] — numeric index let mut num_str = String::new(); @@ -235,33 +242,36 @@ fn parse_json_path(path: &str) -> Option> { } } - Some(segments) + Some(ParsedPath { + segments, + has_wildcard, + }) } /// Evaluate a parsed JSONPath against a JSON string. /// Returns the result as a string, or None if no match. -fn evaluate_path(json_str: &str, path: &[PathSegment]) -> Option { +fn evaluate_path(json_str: &str, path: &ParsedPath) -> Option { let value: Value = serde_json::from_str(json_str).ok()?; - // Check if path contains any wildcards - let has_wildcard = path.iter().any(|s| matches!(s, PathSegment::Wildcard)); + if !path.has_wildcard { + // Fast path: no wildcards, no Vec allocations + let result = evaluate_no_wildcard(&value, &path.segments)?; + return value_to_string(result); + } - let results = evaluate_segments(&value, path); + // Wildcard path: may return multiple results + let results = evaluate_with_wildcard(&value, &path.segments); match results.len() { 0 => None, - 1 if !has_wildcard => value_to_string(results[0]), 1 => { - // Single wildcard match: Spark strips outer array brackets from the - // JSON representation. For strings this means the quotes are preserved. - // We simulate by serializing as a JSON array and stripping the brackets. - let arr_str = serde_json::to_string(&Value::Array(vec![results[0].clone()])).ok()?; - // Strip leading '[' and trailing ']' - let inner = &arr_str[1..arr_str.len() - 1]; - if inner == "null" { + // Single wildcard match: Spark preserves JSON serialization format + // (strings keep their quotes, numbers don't) + let s = serde_json::to_string(results[0]).ok()?; + if s == "null" { None } else { - Some(inner.to_string()) + Some(s) } } _ => { @@ -272,27 +282,46 @@ fn evaluate_path(json_str: &str, path: &[PathSegment]) -> Option { } } -/// Recursively evaluate path segments against a JSON value. +/// Fast-path evaluation for paths without wildcards. +/// Returns a reference to the matched value, or None if no match. +fn evaluate_no_wildcard<'a>(value: &'a Value, segments: &[PathSegment]) -> Option<&'a Value> { + if segments.is_empty() { + return Some(value); + } + + match &segments[0] { + PathSegment::Field(name) => { + let child = value.as_object()?.get(name)?; + evaluate_no_wildcard(child, &segments[1..]) + } + PathSegment::Index(idx) => { + let child = value.as_array()?.get(*idx)?; + evaluate_no_wildcard(child, &segments[1..]) + } + PathSegment::Wildcard => unreachable!("wildcard in no-wildcard path"), + } +} + +/// Evaluation for paths containing wildcards. /// Returns references to all matching values. -fn evaluate_segments<'a>(value: &'a Value, segments: &[PathSegment]) -> Vec<&'a Value> { +fn evaluate_with_wildcard<'a>(value: &'a Value, segments: &[PathSegment]) -> Vec<&'a Value> { if segments.is_empty() { return vec![value]; } - let segment = &segments[0]; let rest = &segments[1..]; - match segment { + match &segments[0] { PathSegment::Field(name) => match value { Value::Object(map) => match map.get(name) { - Some(v) => evaluate_segments(v, rest), + Some(v) => evaluate_with_wildcard(v, rest), None => vec![], }, _ => vec![], }, PathSegment::Index(idx) => match value { Value::Array(arr) => match arr.get(*idx) { - Some(v) => evaluate_segments(v, rest), + Some(v) => evaluate_with_wildcard(v, rest), None => vec![], }, _ => vec![], @@ -300,7 +329,7 @@ fn evaluate_segments<'a>(value: &'a Value, segments: &[PathSegment]) -> Vec<&'a PathSegment::Wildcard => match value { Value::Array(arr) => arr .iter() - .flat_map(|v| evaluate_segments(v, rest)) + .flat_map(|v| evaluate_with_wildcard(v, rest)) .collect(), _ => vec![], }, @@ -327,26 +356,30 @@ mod tests { fn test_parse_json_path() { // Root only let path = parse_json_path("$").unwrap(); - assert!(path.is_empty()); + assert!(path.segments.is_empty()); + assert!(!path.has_wildcard); // Simple field let path = parse_json_path("$.name").unwrap(); - assert!(matches!(&path[0], PathSegment::Field(n) if n == "name")); + assert!(matches!(&path.segments[0], PathSegment::Field(n) if n == "name")); + assert!(!path.has_wildcard); // Array index let path = parse_json_path("$[0]").unwrap(); - assert!(matches!(&path[0], PathSegment::Index(0))); + assert!(matches!(&path.segments[0], PathSegment::Index(0))); // Bracket notation let path = parse_json_path("$['key with spaces']").unwrap(); - assert!(matches!(&path[0], PathSegment::Field(n) if n == "key with spaces")); + assert!(matches!(&path.segments[0], PathSegment::Field(n) if n == "key with spaces")); // Wildcard let path = parse_json_path("$[*]").unwrap(); - assert!(matches!(&path[0], PathSegment::Wildcard)); + assert!(matches!(&path.segments[0], PathSegment::Wildcard)); + assert!(path.has_wildcard); let path = parse_json_path("$.*").unwrap(); - assert!(matches!(&path[0], PathSegment::Wildcard)); + assert!(matches!(&path.segments[0], PathSegment::Wildcard)); + assert!(path.has_wildcard); // Recursive descent not supported assert!(parse_json_path("$..name").is_none()); @@ -358,18 +391,14 @@ mod tests { #[test] fn test_evaluate_simple_field() { + let path = parse_json_path("$.name").unwrap(); assert_eq!( - evaluate_path( - r#"{"name":"John","age":30}"#, - &parse_json_path("$.name").unwrap() - ), + evaluate_path(r#"{"name":"John","age":30}"#, &path), Some("John".to_string()) ); + let path = parse_json_path("$.age").unwrap(); assert_eq!( - evaluate_path( - r#"{"name":"John","age":30}"#, - &parse_json_path("$.age").unwrap() - ), + evaluate_path(r#"{"name":"John","age":30}"#, &path), Some("30".to_string()) ); } @@ -377,111 +406,89 @@ mod tests { #[test] fn test_evaluate_nested() { let json = r#"{"user":{"profile":{"name":"Alice"}}}"#; - assert_eq!( - evaluate_path(json, &parse_json_path("$.user.profile.name").unwrap()), - Some("Alice".to_string()) - ); + let path = parse_json_path("$.user.profile.name").unwrap(); + assert_eq!(evaluate_path(json, &path), Some("Alice".to_string())); } #[test] fn test_evaluate_array_index() { - assert_eq!( - evaluate_path(r#"[1,2,3]"#, &parse_json_path("$[0]").unwrap()), - Some("1".to_string()) - ); - assert_eq!( - evaluate_path(r#"[1,2,3]"#, &parse_json_path("$[3]").unwrap()), - None - ); + let path = parse_json_path("$[0]").unwrap(); + assert_eq!(evaluate_path(r#"[1,2,3]"#, &path), Some("1".to_string())); + let path = parse_json_path("$[3]").unwrap(); + assert_eq!(evaluate_path(r#"[1,2,3]"#, &path), None); } #[test] fn test_evaluate_root() { + let path = parse_json_path("$").unwrap(); assert_eq!( - evaluate_path(r#"{"a":"b"}"#, &parse_json_path("$").unwrap()), + evaluate_path(r#"{"a":"b"}"#, &path), Some(r#"{"a":"b"}"#.to_string()) ); } #[test] fn test_evaluate_null_value() { - assert_eq!( - evaluate_path(r#"{"a":null}"#, &parse_json_path("$.a").unwrap()), - None - ); + let path = parse_json_path("$.a").unwrap(); + assert_eq!(evaluate_path(r#"{"a":null}"#, &path), None); } #[test] fn test_evaluate_missing_field() { - assert_eq!( - evaluate_path(r#"{"a":"b"}"#, &parse_json_path("$.c").unwrap()), - None - ); + let path = parse_json_path("$.c").unwrap(); + assert_eq!(evaluate_path(r#"{"a":"b"}"#, &path), None); } #[test] fn test_evaluate_invalid_json() { - assert_eq!( - evaluate_path("not json", &parse_json_path("$.a").unwrap()), - None - ); + let path = parse_json_path("$.a").unwrap(); + assert_eq!(evaluate_path("not json", &path), None); } #[test] fn test_evaluate_wildcard() { let json = r#"[{"a":"b"},{"a":"c"}]"#; - assert_eq!( - evaluate_path(json, &parse_json_path("$[*].a").unwrap()), - Some(r#"["b","c"]"#.to_string()) - ); + let path = parse_json_path("$[*].a").unwrap(); + assert_eq!(evaluate_path(json, &path), Some(r#"["b","c"]"#.to_string())); } #[test] fn test_evaluate_string_unquoted() { - // Strings should be returned without quotes - assert_eq!( - evaluate_path(r#"["a","b"]"#, &parse_json_path("$[1]").unwrap()), - Some("b".to_string()) - ); + // Strings should be returned without quotes for non-wildcard paths + let path = parse_json_path("$[1]").unwrap(); + assert_eq!(evaluate_path(r#"["a","b"]"#, &path), Some("b".to_string())); } #[test] fn test_evaluate_nested_array_field() { let json = r#"{"items":["apple","banana","cherry"]}"#; - assert_eq!( - evaluate_path(json, &parse_json_path("$.items[1]").unwrap()), - Some("banana".to_string()) - ); + let path = parse_json_path("$.items[1]").unwrap(); + assert_eq!(evaluate_path(json, &path), Some("banana".to_string())); } #[test] fn test_evaluate_bracket_notation_with_spaces() { let json = r#"{"key with spaces":"it works"}"#; - assert_eq!( - evaluate_path(json, &parse_json_path("$['key with spaces']").unwrap()), - Some("it works".to_string()) - ); + let path = parse_json_path("$['key with spaces']").unwrap(); + assert_eq!(evaluate_path(json, &path), Some("it works".to_string())); } #[test] fn test_evaluate_boolean_and_nested_object() { let json = r#"{"a":true,"b":{"c":1}}"#; - assert_eq!( - evaluate_path(json, &parse_json_path("$.a").unwrap()), - Some("true".to_string()) - ); - assert_eq!( - evaluate_path(json, &parse_json_path("$.b").unwrap()), - Some(r#"{"c":1}"#.to_string()) - ); + let path_a = parse_json_path("$.a").unwrap(); + assert_eq!(evaluate_path(json, &path_a), Some("true".to_string())); + let path_b = parse_json_path("$.b").unwrap(); + assert_eq!(evaluate_path(json, &path_b), Some(r#"{"c":1}"#.to_string())); } #[test] fn test_object_key_order_preserved() { - // serde_json with preserve_order feature should maintain insertion order + // Depends on serde_json "preserve_order" feature (see Cargo.toml) let json = r#"{"z":1,"a":2}"#; + let path = parse_json_path("$").unwrap(); assert_eq!( - evaluate_path(json, &parse_json_path("$").unwrap()), + evaluate_path(json, &path), Some(r#"{"z":1,"a":2}"#.to_string()) ); } @@ -490,36 +497,27 @@ mod tests { fn test_wildcard_single_match() { // Single wildcard match on string: Spark preserves JSON quotes let json = r#"[{"a":"only"}]"#; - assert_eq!( - evaluate_path(json, &parse_json_path("$[*].a").unwrap()), - Some(r#""only""#.to_string()) - ); + let path = parse_json_path("$[*].a").unwrap(); + assert_eq!(evaluate_path(json, &path), Some(r#""only""#.to_string())); // Single wildcard match on number: no quotes let json = r#"[{"a":42}]"#; - assert_eq!( - evaluate_path(json, &parse_json_path("$[*].a").unwrap()), - Some("42".to_string()) - ); + assert_eq!(evaluate_path(json, &path), Some("42".to_string())); } #[test] fn test_wildcard_missing_fields() { // Wildcard should skip elements where the field is missing let json = r#"[{"a":1},{"b":2},{"a":3}]"#; - assert_eq!( - evaluate_path(json, &parse_json_path("$[*].a").unwrap()), - Some("[1,3]".to_string()) - ); + let path = parse_json_path("$[*].a").unwrap(); + assert_eq!(evaluate_path(json, &path), Some("[1,3]".to_string())); } #[test] fn test_field_with_colon() { let json = r#"{"fb:testid":"123"}"#; - assert_eq!( - evaluate_path(json, &parse_json_path("$.fb:testid").unwrap()), - Some("123".to_string()) - ); + let path = parse_json_path("$.fb:testid").unwrap(); + assert_eq!(evaluate_path(json, &path), Some("123".to_string())); } #[test] @@ -532,7 +530,7 @@ mod tests { fn test_object_wildcard() { // Spark returns null for $.* on objects (wildcard only works in array contexts) let json = r#"{"a":1,"b":2,"c":3}"#; - let result = evaluate_path(json, &parse_json_path("$.*").unwrap()); - assert_eq!(result, None); + let path = parse_json_path("$.*").unwrap(); + assert_eq!(evaluate_path(json, &path), None); } } From f558e0fd72ac42156b83e6f8da01592ded3a3430 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 20 Mar 2026 11:28:53 -0600 Subject: [PATCH 6/6] style: format markdown with prettier --- docs/source/user-guide/latest/expressions.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/user-guide/latest/expressions.md b/docs/source/user-guide/latest/expressions.md index 9b6511d42d..9934499401 100644 --- a/docs/source/user-guide/latest/expressions.md +++ b/docs/source/user-guide/latest/expressions.md @@ -92,9 +92,9 @@ Expressions that are not Spark-compatible will fall back to Spark by default and ## JSON Functions -| Expression | Spark-Compatible? | Compatibility Notes | -| --------------- | ----------------- | ----------------------------------------------------------------------------------------------------------------------------- | -| GetJsonObject | No | Spark allows single-quoted JSON and unescaped control characters which Comet does not support | +| Expression | Spark-Compatible? | Compatibility Notes | +| ------------- | ----------------- | --------------------------------------------------------------------------------------------- | +| GetJsonObject | No | Spark allows single-quoted JSON and unescaped control characters which Comet does not support | ## Date/Time Functions