From 1b9e68478875f820c61d75500adf24e50d0b4e6b Mon Sep 17 00:00:00 2001 From: daidai Date: Thu, 26 Mar 2026 15:54:24 +0800 Subject: [PATCH 1/4] [fix](parquet)fix parquet write timestamp int96 type. --- .../transformer/vparquet_transformer.cpp | 2 +- thirdparty/download-thirdparty.sh | 7 +- ...-17.0.0-force-write-int96-timestamps.patch | 104 ++++++++++++++++++ 3 files changed, 111 insertions(+), 2 deletions(-) create mode 100644 thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch diff --git a/be/src/format/transformer/vparquet_transformer.cpp b/be/src/format/transformer/vparquet_transformer.cpp index fa2d651be8bc1a..4d89261a379677 100644 --- a/be/src/format/transformer/vparquet_transformer.cpp +++ b/be/src/format/transformer/vparquet_transformer.cpp @@ -209,7 +209,7 @@ Status VParquetTransformer::_parse_properties() { //build arrow writer properties ::parquet::ArrowWriterProperties::Builder arrow_builder; if (_parquet_options.enable_int96_timestamps) { - arrow_builder.enable_deprecated_int96_timestamps(); + arrow_builder.enable_force_write_int96_timestamps(); } arrow_builder.store_schema(); _arrow_properties = arrow_builder.build(); diff --git a/thirdparty/download-thirdparty.sh b/thirdparty/download-thirdparty.sh index f57cab9de1a6cb..5498f5292df56f 100755 --- a/thirdparty/download-thirdparty.sh +++ b/thirdparty/download-thirdparty.sh @@ -429,8 +429,13 @@ if [[ " ${TP_ARCHIVES[*]} " =~ " ARROW " ]]; then cd "${TP_SOURCE_DIR}/${ARROW_SOURCE}" if [[ ! -f "${PATCHED_MARK}" ]]; then # Paimon-cpp parquet patches: row-group-aware batch reader, max_row_group_size, - # GetBufferedSize(), int96 NANO guard, and Thrift_VERSION empty fix. + # GetBufferedSize(), int96 NANO guard, force-write INT96 override, and + # Thrift_VERSION empty fix. 
patch -p1 <"${TP_PATCH_DIR}/apache-arrow-17.0.0-paimon.patch" + + # apache-arrow-17.0.0-force-write-int96-timestamps.patch : + # Introducing the parameter that forces writing int96 timestamps for compatibility with Paimon cpp. + patch -p1 <"${TP_PATCH_DIR}/apache-arrow-17.0.0-force-write-int96-timestamps.patch" touch "${PATCHED_MARK}" fi cd - diff --git a/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch b/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch new file mode 100644 index 00000000000000..15e814899aa715 --- /dev/null +++ b/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch @@ -0,0 +1,104 @@ +diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc +index 943f69bb6c..d26c74e6e8 100644 +--- a/cpp/src/parquet/arrow/schema.cc ++++ b/cpp/src/parquet/arrow/schema.cc +@@ -178,7 +178,8 @@ static Status GetTimestampMetadata(const ::arrow::TimestampType& type, + + // The user is explicitly asking for Impala int96 encoding, there is no + // logical type. 
+- if (arrow_properties.support_deprecated_int96_timestamps() && target_unit == ::arrow::TimeUnit::NANO) { ++ if (arrow_properties.force_write_int96_timestamps() || ++ (arrow_properties.support_deprecated_int96_timestamps() && target_unit == ::arrow::TimeUnit::NANO)) { + *physical_type = ParquetType::INT96; + return Status::OK(); + } + +diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h +index 3906ff3c59..c3ce83d996 100644 +--- a/cpp/src/parquet/properties.h ++++ b/cpp/src/parquet/properties.h +@@ -980,6 +980,7 @@ class PARQUET_EXPORT ArrowWriterProperties { + public: + Builder() + : write_timestamps_as_int96_(false), ++ force_write_int96_timestamps_(false), + coerce_timestamps_enabled_(false), + coerce_timestamps_unit_(::arrow::TimeUnit::SECOND), + truncated_timestamps_allowed_(false), +@@ -1008,6 +1009,22 @@ class PARQUET_EXPORT ArrowWriterProperties { + return this; + } + ++ /// \brief Force writing legacy int96 timestamps. ++ /// ++ /// This bypasses unit-based guards and writes INT96 whenever timestamp ++ /// metadata is resolved. ++ Builder* enable_force_write_int96_timestamps() { ++ force_write_int96_timestamps_ = true; ++ return this; ++ } ++ ++ /// \brief Disable forcing legacy int96 timestamps (default). ++ Builder* disable_force_write_int96_timestamps() { ++ force_write_int96_timestamps_ = false; ++ return this; ++ } ++ + /// \brief Coerce all timestamps to the specified time unit. + /// \param unit time unit to truncate to. + /// For Parquet versions 1.0 and 2.4, nanoseconds are casted to microseconds. +@@ -1085,17 +1102,21 @@ class PARQUET_EXPORT ArrowWriterProperties { + /// Create the final properties. 
+ std::shared_ptr build() { + return std::shared_ptr(new ArrowWriterProperties( +- write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_, +- truncated_timestamps_allowed_, store_schema_, compliant_nested_types_, +- engine_version_, use_threads_, executor_)); ++ write_timestamps_as_int96_, force_write_int96_timestamps_, ++ coerce_timestamps_enabled_, coerce_timestamps_unit_, ++ truncated_timestamps_allowed_, store_schema_, compliant_nested_types_, ++ engine_version_, use_threads_, executor_)); + } + + private: + bool write_timestamps_as_int96_; + ++ bool force_write_int96_timestamps_; ++ + bool coerce_timestamps_enabled_; + ::arrow::TimeUnit::type coerce_timestamps_unit_; + bool truncated_timestamps_allowed_; + +@@ -1108,6 +1129,8 @@ class PARQUET_EXPORT ArrowWriterProperties { + + bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; } + ++ bool force_write_int96_timestamps() const { return force_write_int96_timestamps_; } ++ + bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; } + ::arrow::TimeUnit::type coerce_timestamps_unit() const { + return coerce_timestamps_unit_; +@@ -1140,13 +1163,15 @@ class PARQUET_EXPORT ArrowWriterProperties { + + private: + explicit ArrowWriterProperties(bool write_nanos_as_int96, ++ bool force_write_int96_timestamps, + bool coerce_timestamps_enabled, + ::arrow::TimeUnit::type coerce_timestamps_unit, + bool truncated_timestamps_allowed, bool store_schema, + bool compliant_nested_types, + EngineVersion engine_version, bool use_threads, + ::arrow::internal::Executor* executor) + : write_timestamps_as_int96_(write_nanos_as_int96), ++ force_write_int96_timestamps_(force_write_int96_timestamps), + coerce_timestamps_enabled_(coerce_timestamps_enabled), + coerce_timestamps_unit_(coerce_timestamps_unit), + truncated_timestamps_allowed_(truncated_timestamps_allowed), +@@ -1157,6 +1182,7 @@ class PARQUET_EXPORT ArrowWriterProperties { + executor_(executor) {} + + 
const bool write_timestamps_as_int96_; ++ const bool force_write_int96_timestamps_; + const bool coerce_timestamps_enabled_; + const ::arrow::TimeUnit::type coerce_timestamps_unit_; + const bool truncated_timestamps_allowed_; From dabf393ed0854bdba8d9bf38e704bca8db4a43f3 Mon Sep 17 00:00:00 2001 From: daidai Date: Thu, 26 Mar 2026 23:12:30 +0800 Subject: [PATCH 2/4] Update download-thirdparty.sh --- thirdparty/download-thirdparty.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/thirdparty/download-thirdparty.sh b/thirdparty/download-thirdparty.sh index 5498f5292df56f..75ba6313529c2b 100755 --- a/thirdparty/download-thirdparty.sh +++ b/thirdparty/download-thirdparty.sh @@ -429,8 +429,7 @@ if [[ " ${TP_ARCHIVES[*]} " =~ " ARROW " ]]; then cd "${TP_SOURCE_DIR}/${ARROW_SOURCE}" if [[ ! -f "${PATCHED_MARK}" ]]; then # Paimon-cpp parquet patches: row-group-aware batch reader, max_row_group_size, - # GetBufferedSize(), int96 NANO guard, force-write INT96 override, and - # Thrift_VERSION empty fix. + # GetBufferedSize(), int96 NANO guard, and Thrift_VERSION empty fix. 
patch -p1 <"${TP_PATCH_DIR}/apache-arrow-17.0.0-paimon.patch" # apache-arrow-17.0.0-force-write-int96-timestamps.patch : From 1c7230b935be95fd147570e4d7478276c4c56570 Mon Sep 17 00:00:00 2001 From: morningman Date: Fri, 27 Mar 2026 01:31:45 +0800 Subject: [PATCH 3/4] fix patch --- ...-17.0.0-force-write-int96-timestamps.patch | 40 ++++++++----------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch b/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch index 15e814899aa715..5a75424756671d 100644 --- a/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch +++ b/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch @@ -1,8 +1,7 @@ -diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc -index 943f69bb6c..d26c74e6e8 100644 ---- a/cpp/src/parquet/arrow/schema.cc -+++ b/cpp/src/parquet/arrow/schema.cc -@@ -178,7 +178,8 @@ static Status GetTimestampMetadata(const ::arrow::TimestampType& type, +diff -ruN arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/arrow/schema.cc arrow-apache-arrow-17.0.0/cpp/src/parquet/arrow/schema.cc +--- arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/arrow/schema.cc 2026-03-27 01:23:23.651831424 +0800 ++++ arrow-apache-arrow-17.0.0/cpp/src/parquet/arrow/schema.cc 2026-03-27 01:28:36.855281965 +0800 +@@ -178,7 +178,8 @@ // The user is explicitly asking for Impala int96 encoding, there is no // logical type. 
@@ -12,12 +11,10 @@ index 943f69bb6c..d26c74e6e8 100644 *physical_type = ParquetType::INT96; return Status::OK(); } - -diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h -index 3906ff3c59..c3ce83d996 100644 ---- a/cpp/src/parquet/properties.h -+++ b/cpp/src/parquet/properties.h -@@ -980,6 +980,7 @@ class PARQUET_EXPORT ArrowWriterProperties { +diff -ruN arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/properties.h arrow-apache-arrow-17.0.0/cpp/src/parquet/properties.h +--- arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/properties.h 2026-03-27 01:23:23.643831362 +0800 ++++ arrow-apache-arrow-17.0.0/cpp/src/parquet/properties.h 2026-03-27 01:27:47.717897537 +0800 +@@ -980,6 +980,7 @@ public: Builder() : write_timestamps_as_int96_(false), @@ -25,7 +22,7 @@ index 3906ff3c59..c3ce83d996 100644 coerce_timestamps_enabled_(false), coerce_timestamps_unit_(::arrow::TimeUnit::SECOND), truncated_timestamps_allowed_(false), -@@ -1008,6 +1009,22 @@ class PARQUET_EXPORT ArrowWriterProperties { +@@ -1005,6 +1006,21 @@ return this; } @@ -47,19 +44,17 @@ index 3906ff3c59..c3ce83d996 100644 /// \brief Coerce all timestamps to the specified time unit. /// \param unit time unit to truncate to. /// For Parquet versions 1.0 and 2.4, nanoseconds are casted to microseconds. -@@ -1085,17 +1102,21 @@ class PARQUET_EXPORT ArrowWriterProperties { +@@ -1085,7 +1101,8 @@ /// Create the final properties. 
std::shared_ptr build() { return std::shared_ptr(new ArrowWriterProperties( - write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_, -- truncated_timestamps_allowed_, store_schema_, compliant_nested_types_, -- engine_version_, use_threads_, executor_)); + write_timestamps_as_int96_, force_write_int96_timestamps_, + coerce_timestamps_enabled_, coerce_timestamps_unit_, -+ truncated_timestamps_allowed_, store_schema_, compliant_nested_types_, -+ engine_version_, use_threads_, executor_)); + truncated_timestamps_allowed_, store_schema_, compliant_nested_types_, + engine_version_, use_threads_, executor_)); } - +@@ -1093,6 +1110,8 @@ private: bool write_timestamps_as_int96_; @@ -68,8 +63,7 @@ index 3906ff3c59..c3ce83d996 100644 bool coerce_timestamps_enabled_; ::arrow::TimeUnit::type coerce_timestamps_unit_; bool truncated_timestamps_allowed_; - -@@ -1108,6 +1129,8 @@ class PARQUET_EXPORT ArrowWriterProperties { +@@ -1107,6 +1126,8 @@ bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; } @@ -78,7 +72,7 @@ index 3906ff3c59..c3ce83d996 100644 bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; } ::arrow::TimeUnit::type coerce_timestamps_unit() const { return coerce_timestamps_unit_; -@@ -1140,13 +1163,15 @@ class PARQUET_EXPORT ArrowWriterProperties { +@@ -1138,6 +1159,7 @@ private: explicit ArrowWriterProperties(bool write_nanos_as_int96, @@ -86,7 +80,7 @@ index 3906ff3c59..c3ce83d996 100644 bool coerce_timestamps_enabled, ::arrow::TimeUnit::type coerce_timestamps_unit, bool truncated_timestamps_allowed, bool store_schema, - bool compliant_nested_types, +@@ -1145,6 +1167,7 @@ EngineVersion engine_version, bool use_threads, ::arrow::internal::Executor* executor) : write_timestamps_as_int96_(write_nanos_as_int96), @@ -94,7 +88,7 @@ index 3906ff3c59..c3ce83d996 100644 coerce_timestamps_enabled_(coerce_timestamps_enabled), coerce_timestamps_unit_(coerce_timestamps_unit), 
truncated_timestamps_allowed_(truncated_timestamps_allowed), -@@ -1157,6 +1182,7 @@ class PARQUET_EXPORT ArrowWriterProperties { +@@ -1155,6 +1178,7 @@ executor_(executor) {} const bool write_timestamps_as_int96_; From ecb25520238e5e88e2d000622eeea7856849d391 Mon Sep 17 00:00:00 2001 From: daidai Date: Fri, 27 Mar 2026 14:34:20 +0800 Subject: [PATCH 4/4] Change timestamp option to deprecated in Arrow writer --- be/src/format/transformer/vparquet_transformer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/format/transformer/vparquet_transformer.cpp b/be/src/format/transformer/vparquet_transformer.cpp index 4d89261a379677..fa2d651be8bc1a 100644 --- a/be/src/format/transformer/vparquet_transformer.cpp +++ b/be/src/format/transformer/vparquet_transformer.cpp @@ -209,7 +209,7 @@ Status VParquetTransformer::_parse_properties() { //build arrow writer properties ::parquet::ArrowWriterProperties::Builder arrow_builder; if (_parquet_options.enable_int96_timestamps) { - arrow_builder.enable_force_write_int96_timestamps(); + arrow_builder.enable_deprecated_int96_timestamps(); } arrow_builder.store_schema(); _arrow_properties = arrow_builder.build();