diff --git a/native/spark-expr/src/conversion_funcs/string.rs b/native/spark-expr/src/conversion_funcs/string.rs index 7c193716d0..2db0df7c98 100644 --- a/native/spark-expr/src/conversion_funcs/string.rs +++ b/native/spark-expr/src/conversion_funcs/string.rs @@ -25,7 +25,7 @@ use arrow::datatypes::{ i256, is_validate_decimal_precision, DataType, Date32Type, Decimal256Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, TimestampMicrosecondType, }; -use chrono::{DateTime, NaiveDate, TimeZone, Timelike}; +use chrono::{DateTime, NaiveDate, TimeZone, Timelike, Utc}; use num::traits::CheckedNeg; use num::{CheckedSub, Integer}; use regex::Regex; @@ -34,9 +34,13 @@ use std::str::FromStr; use std::sync::Arc; macro_rules! cast_utf8_to_timestamp { - ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident, $tz:expr) => {{ + ($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident, $tz:expr, $with_tz:expr) => {{ let len = $array.len(); - let mut cast_array = PrimitiveArray::<$array_type>::builder(len).with_timezone("UTC"); + let mut cast_array = if let Some(tz_str) = $with_tz { + PrimitiveArray::<$array_type>::builder(len).with_timezone(tz_str) + } else { + PrimitiveArray::<$array_type>::builder(len) + }; for i in 0..len { if $array.is_null(i) { cast_array.append_null() @@ -665,16 +669,28 @@ pub(crate) fn cast_string_to_timestamp( .downcast_ref::>() .expect("Expected a string array"); - let tz = &timezone::Tz::from_str(timezone_str).unwrap(); - let cast_array: ArrayRef = match to_type { - DataType::Timestamp(_, _) => { + DataType::Timestamp(_, Some(_)) => { + let tz = &timezone::Tz::from_str(timezone_str).unwrap(); cast_utf8_to_timestamp!( string_array, eval_mode, TimestampMicrosecondType, timestamp_parser, - tz + tz, + Some("UTC") + ) + } + DataType::Timestamp(_, None) => { + // TimestampNTZ: use a dedicated parser that strips timezone offsets + // before parsing, matching Spark's behavior of ignoring offsets for NTZ. + cast_utf8_to_timestamp!( + string_array, + eval_mode, + TimestampMicrosecondType, + timestamp_ntz_parser, + &Utc, + None::<&str> ) } _ => unreachable!("Invalid data type {:?} in cast from string", to_type), @@ -1082,7 +1098,29 @@ fn parse_str_to_microsecond_timestamp( get_timestamp_values(value, "microsecond", tz) } -// used in tests only +/// Regex to match a timezone offset suffix in a timestamp string. +/// Matches: Z, +HH:MM, -HH:MM, +HHMM, -HHMM, +HH, -HH +fn strip_timezone_offset(value: &str) -> &str { + // This regex is anchored to the end of the string and matches common timezone offset formats. + // We use a lazy_static-style approach via Regex::new (consistent with the rest of this file). + let tz_suffix = Regex::new(r"([+-]\d{2}(:\d{2})?|Z)$").unwrap(); + match tz_suffix.find(value) { + Some(m) => &value[..m.start()], + None => value, + } +} + +/// Strips timezone offset before parsing, matching Spark's TimestampNTZ behavior. +/// Separate from `timestamp_parser` to preserve offsets for Timestamp WITH TZ. +fn timestamp_ntz_parser( + value: &str, + eval_mode: EvalMode, + tz: &T, +) -> SparkResult> { + let stripped = strip_timezone_offset(value.trim()); + timestamp_parser(stripped, eval_mode, tz) +} + fn timestamp_parser( value: &str, eval_mode: EvalMode, @@ -1340,7 +1378,8 @@ mod tests { eval_mode, TimestampMicrosecondType, timestamp_parser, - tz + tz, + Some("UTC") ); assert_eq!( @@ -1350,6 +1389,184 @@ mod tests { assert_eq!(result.len(), 4); } + #[test] + #[cfg_attr(miri, ignore)] + fn test_cast_string_to_timestamp_ntz() { + let array: ArrayRef = Arc::new(StringArray::from(vec![ + Some("2020-01-01T12:34:56.123456"), + Some("not_a_timestamp"), + Some("2020-01-01"), + ])); + + let string_array = array + .as_any() + .downcast_ref::>() + .expect("Expected a string array"); + + let eval_mode = EvalMode::Legacy; + let result = cast_utf8_to_timestamp!( + &string_array, + eval_mode, + TimestampMicrosecondType, + timestamp_parser, + &Utc, + None::<&str> + ); + + // Key assertion: TimestampNTZ should have NO timezone + assert_eq!( + result.data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + assert_eq!(result.len(), 3); + + // Verify the actual values are not timezone-converted + let ts_array = result + .as_any() + .downcast_ref::>() + .expect("Expected a timestamp array"); + + // 2020-01-01T12:34:56.123456 as naive epoch micros + assert_eq!(ts_array.value(0), 1577882096123456); + // "not_a_timestamp" is invalid and should be null + assert!(ts_array.is_null(1)); + // 2020-01-01 as naive epoch micros (midnight) + assert_eq!(ts_array.value(2), 1577836800000000); + } + + #[test] + #[cfg_attr(miri, ignore)] + fn test_cast_string_with_timezone_offset_to_timestamp_ntz() { + // Offset strings: offset stripped, local datetime preserved (Spark NTZ behavior) + let array: ArrayRef = Arc::new(StringArray::from(vec![ + Some("2020-01-01T12:34:56+05:00"), + Some("2020-01-01T12:34:56-08:00"), + Some("2020-01-01T12:34:56Z"), + Some("2020-01-01T12:34:56.123456+00:00"), + Some("2020-01-01T12:34:56.123456"), + ])); + + let string_array = array + .as_any() + .downcast_ref::>() + .expect("Expected a string array"); + + let eval_mode = EvalMode::Legacy; + let result = cast_utf8_to_timestamp!( + &string_array, + eval_mode, + TimestampMicrosecondType, + timestamp_ntz_parser, + &Utc, + None::<&str> + ); + + assert_eq!( + result.data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + assert_eq!(result.len(), 5); + + let ts_array = result + .as_any() + .downcast_ref::>() + .expect("Expected a timestamp array"); + + // Offset strings: offset stripped, local datetime kept + assert!(!ts_array.is_null(0), "'+05:00' offset should be parsed"); + assert_eq!(ts_array.value(0), 1577882096000000); + assert!(!ts_array.is_null(1), "'-08:00' offset should be parsed"); + assert_eq!(ts_array.value(1), 1577882096000000); + assert!(!ts_array.is_null(2), "'Z' suffix should be parsed"); + assert_eq!(ts_array.value(2), 1577882096000000); + assert!( + !ts_array.is_null(3), + "'+00:00' offset with micros should be parsed" + ); + assert_eq!(ts_array.value(3), 1577882096123456); + + // The one without offset should also parse correctly + assert!(!ts_array.is_null(4)); + assert_eq!(ts_array.value(4), 1577882096123456); + } + + #[test] + fn test_cast_string_to_timestamp_ntz_via_cast_array() -> DataFusionResult<()> { + let array: ArrayRef = Arc::new(StringArray::from(vec![ + Some("2020-01-01T12:34:56.123456"), + Some("T2"), + ])); + + let timezone = "America/New_York".to_string(); + let cast_options = SparkCastOptions::new(EvalMode::Legacy, &timezone, false); + + // Cast to Timestamp with timezone + let result_tz = cast_array( + Arc::clone(&array), + &DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())), + &cast_options, + )?; + assert_eq!( + *result_tz.data_type(), + DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())) + ); + + // Cast to TimestampNTZ (no timezone) + let result_ntz = cast_array( + Arc::clone(&array), + &DataType::Timestamp(TimeUnit::Microsecond, None), + &cast_options, + )?; + assert_eq!( + *result_ntz.data_type(), + DataType::Timestamp(TimeUnit::Microsecond, None) + ); + + // The NTZ result should NOT have timezone metadata + let ntz_array = result_ntz + .as_any() + .downcast_ref::>() + .expect("Expected a timestamp array"); + // 2020-01-01T12:34:56.123456 stored as-is (no timezone conversion) + assert_eq!(ntz_array.value(0), 1577882096123456); + + Ok(()) + } + + #[test] + fn test_cast_string_with_offset_via_cast_array() -> DataFusionResult<()> { + let array: ArrayRef = Arc::new(StringArray::from(vec![ + Some("2020-01-01T12:34:56+05:00"), + Some("2020-01-01T12:34:56"), + ])); + + let timezone = "America/Los_Angeles".to_string(); + let cast_options = SparkCastOptions::new(EvalMode::Legacy, &timezone, false); + + let result = cast_array( + Arc::clone(&array), + &DataType::Timestamp(TimeUnit::Microsecond, None), + &cast_options, + )?; + + let ts_array = result + .as_any() + .downcast_ref::>() + .expect("Expected a timestamp array"); + + // Offset string should be parsed: offset stripped, local datetime kept + assert!( + !ts_array.is_null(0), + "Offset string should be parsed for NTZ" + ); + assert_eq!(ts_array.value(0), 1577882096000000); // 2020-01-01T12:34:56 + // Non-offset string should parse correctly + assert!(!ts_array.is_null(1)); + assert_eq!(ts_array.value(1), 1577882096000000); + + Ok(()) + } + #[test] fn test_cast_dict_string_to_timestamp() -> DataFusionResult<()> { // prepare input data diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala index 95d5366907..a6d65d37af 100644 --- a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala +++ b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala @@ -21,7 +21,7 @@ package org.apache.comet.expressions import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Expression, Literal} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType, NullType, StructType, TimestampType} +import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType, NullType, StructType, TimestampNTZType, TimestampType} import org.apache.comet.CometConf import org.apache.comet.CometSparkSessionExtensions.withInfo @@ -222,6 +222,10 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim { case DataTypes.TimestampType => // https://github.com/apache/datafusion-comet/issues/328 Incompatible(Some("Not all valid formats are supported")) + case _: TimestampNTZType if evalMode == CometEvalMode.ANSI => + Incompatible(Some("ANSI mode not supported")) + case _: TimestampNTZType => + Incompatible(Some("Not all valid formats are supported")) case _ => unsupported(DataTypes.StringType, toType) } diff --git a/spark/src/test/resources/sql-tests/expressions/cast/cast_string_to_timestamp_ntz.sql b/spark/src/test/resources/sql-tests/expressions/cast/cast_string_to_timestamp_ntz.sql new file mode 100644 index 0000000000..aded08750b --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/cast/cast_string_to_timestamp_ntz.sql @@ -0,0 +1,41 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Config: spark.comet.expression.Cast.allowIncompatible=true +-- ConfigMatrix: parquet.enable.dictionary=false,true + +-- Test casting string to timestamp_ntz +-- https://github.com/apache/datafusion-comet/issues/3179 + +statement +CREATE TABLE test_cast_ts_ntz(s string) USING parquet + +statement +INSERT INTO test_cast_ts_ntz VALUES ('2020-01-01T12:34:56.123456'), ('2020-01-01'), ('2020-01-01T12:34:56'), ('2020'), ('2020-01'), (NULL), ('not_a_timestamp'), ('2020-01-01T12:34:56+05:00') + +-- Cast string to timestamp_ntz: valid formats should parse, invalid should be null +query +SELECT s, cast(s AS timestamp_ntz) FROM test_cast_ts_ntz + +-- Verify that timestamp_ntz values are not affected by session timezone +query +SELECT s, cast(s AS timestamp_ntz) FROM test_cast_ts_ntz WHERE s = '2020-01-01T12:34:56.123456' + +-- Compare timestamp_ntz vs timestamp (with timezone) to show they differ +query +-- Exclude offset string: cast(s AS timestamp) does not yet parse offsets natively +SELECT s, cast(s AS timestamp_ntz) as ts_ntz, cast(s AS timestamp) as ts FROM test_cast_ts_ntz WHERE s IS NOT NULL AND s != 'not_a_timestamp' AND s != '2020-01-01T12:34:56+05:00'