From 2261cbaf98c13924488ac8fa7b34075e33cdd173 Mon Sep 17 00:00:00 2001 From: daidai Date: Sat, 28 Mar 2026 09:15:03 +0800 Subject: [PATCH] [fix](parquet)fix parquet write timestamp int96 type. (1/2) (#61760) ### What problem does this PR solve? PR #60946 Problem Summary: This pull request fixes a patch introduced in #60946 that caused Doris exports to fail to write Parquet int96 data types. This issue is resolved by adding a new patch to arrow that introduces a parameter that forces writing to int96. This PR only updates thirdparty; the next PR will update the BE code. --- thirdparty/download-thirdparty.sh | 4 + ...-17.0.0-force-write-int96-timestamps.patch | 98 +++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch diff --git a/thirdparty/download-thirdparty.sh b/thirdparty/download-thirdparty.sh index d85b86704d3bb7..52481b24fca863 100755 --- a/thirdparty/download-thirdparty.sh +++ b/thirdparty/download-thirdparty.sh @@ -431,6 +431,10 @@ if [[ " ${TP_ARCHIVES[*]} " =~ " ARROW " ]]; then # Paimon-cpp parquet patches: row-group-aware batch reader, max_row_group_size, # GetBufferedSize(), int96 NANO guard, and Thrift_VERSION empty fix. patch -p1 <"${TP_PATCH_DIR}/apache-arrow-17.0.0-paimon.patch" + + # apache-arrow-17.0.0-force-write-int96-timestamps.patch : + # Introduces a parameter that forces writing int96 timestamps for compatibility with Paimon cpp. 
+ patch -p1 <"${TP_PATCH_DIR}/apache-arrow-17.0.0-force-write-int96-timestamps.patch" touch "${PATCHED_MARK}" fi cd - diff --git a/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch b/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch new file mode 100644 index 00000000000000..5a75424756671d --- /dev/null +++ b/thirdparty/patches/apache-arrow-17.0.0-force-write-int96-timestamps.patch @@ -0,0 +1,98 @@ +diff -ruN arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/arrow/schema.cc arrow-apache-arrow-17.0.0/cpp/src/parquet/arrow/schema.cc +--- arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/arrow/schema.cc 2026-03-27 01:23:23.651831424 +0800 ++++ arrow-apache-arrow-17.0.0/cpp/src/parquet/arrow/schema.cc 2026-03-27 01:28:36.855281965 +0800 +@@ -178,7 +178,8 @@ + + // The user is explicitly asking for Impala int96 encoding, there is no + // logical type. +- if (arrow_properties.support_deprecated_int96_timestamps() && target_unit == ::arrow::TimeUnit::NANO) { ++ if (arrow_properties.force_write_int96_timestamps() || ++ (arrow_properties.support_deprecated_int96_timestamps() && target_unit == ::arrow::TimeUnit::NANO)) { + *physical_type = ParquetType::INT96; + return Status::OK(); + } +diff -ruN arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/properties.h arrow-apache-arrow-17.0.0/cpp/src/parquet/properties.h +--- arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/properties.h 2026-03-27 01:23:23.643831362 +0800 ++++ arrow-apache-arrow-17.0.0/cpp/src/parquet/properties.h 2026-03-27 01:27:47.717897537 +0800 +@@ -980,6 +980,7 @@ + public: + Builder() + : write_timestamps_as_int96_(false), ++ force_write_int96_timestamps_(false), + coerce_timestamps_enabled_(false), + coerce_timestamps_unit_(::arrow::TimeUnit::SECOND), + truncated_timestamps_allowed_(false), +@@ -1005,6 +1006,21 @@ + return this; + } + ++ /// \brief Force writing legacy int96 timestamps. 
++ /// ++ /// This bypasses unit-based guards and writes INT96 whenever timestamp ++ /// metadata is resolved. ++ Builder* enable_force_write_int96_timestamps() { ++ force_write_int96_timestamps_ = true; ++ return this; ++ } ++ ++ /// \brief Disable forcing legacy int96 timestamps (default). ++ Builder* disable_force_write_int96_timestamps() { ++ force_write_int96_timestamps_ = false; ++ return this; ++ } ++ + /// \brief Coerce all timestamps to the specified time unit. + /// \param unit time unit to truncate to. + /// For Parquet versions 1.0 and 2.4, nanoseconds are casted to microseconds. +@@ -1085,7 +1101,8 @@ + /// Create the final properties. + std::shared_ptr build() { + return std::shared_ptr(new ArrowWriterProperties( +- write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_, ++ write_timestamps_as_int96_, force_write_int96_timestamps_, ++ coerce_timestamps_enabled_, coerce_timestamps_unit_, + truncated_timestamps_allowed_, store_schema_, compliant_nested_types_, + engine_version_, use_threads_, executor_)); + } +@@ -1093,6 +1110,8 @@ + private: + bool write_timestamps_as_int96_; + ++ bool force_write_int96_timestamps_; ++ + bool coerce_timestamps_enabled_; + ::arrow::TimeUnit::type coerce_timestamps_unit_; + bool truncated_timestamps_allowed_; +@@ -1107,6 +1126,8 @@ + + bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; } + ++ bool force_write_int96_timestamps() const { return force_write_int96_timestamps_; } ++ + bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; } + ::arrow::TimeUnit::type coerce_timestamps_unit() const { + return coerce_timestamps_unit_; +@@ -1138,6 +1159,7 @@ + + private: + explicit ArrowWriterProperties(bool write_nanos_as_int96, ++ bool force_write_int96_timestamps, + bool coerce_timestamps_enabled, + ::arrow::TimeUnit::type coerce_timestamps_unit, + bool truncated_timestamps_allowed, bool store_schema, +@@ -1145,6 +1167,7 @@ + EngineVersion 
engine_version, bool use_threads, + ::arrow::internal::Executor* executor) + : write_timestamps_as_int96_(write_nanos_as_int96), ++ force_write_int96_timestamps_(force_write_int96_timestamps), + coerce_timestamps_enabled_(coerce_timestamps_enabled), + coerce_timestamps_unit_(coerce_timestamps_unit), + truncated_timestamps_allowed_(truncated_timestamps_allowed), +@@ -1155,6 +1178,7 @@ + executor_(executor) {} + + const bool write_timestamps_as_int96_; ++ const bool force_write_int96_timestamps_; + const bool coerce_timestamps_enabled_; + const ::arrow::TimeUnit::type coerce_timestamps_unit_; + const bool truncated_timestamps_allowed_;