Skip to content

Commit f1e1020

Browse files
authored
[fix](parquet)fix parquet write timestamp int96 type. (1/2) (#61760)
### What problem does this PR solve? PR: #60946. Problem Summary: This pull request fixes a problem in a patch introduced in #60946 that caused Doris exports to fail when writing the Parquet int96 data type. The issue is resolved by adding a new Arrow patch that introduces a parameter which forces writing timestamps as int96. This PR only updates thirdparty; the next PR will update the BE code.
1 parent 1cac2c5 commit f1e1020

2 files changed

Lines changed: 102 additions & 0 deletions

File tree

thirdparty/download-thirdparty.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,10 @@ if [[ " ${TP_ARCHIVES[*]} " =~ " ARROW " ]]; then
431431
# Paimon-cpp parquet patches: row-group-aware batch reader, max_row_group_size,
432432
# GetBufferedSize(), int96 NANO guard, and Thrift_VERSION empty fix.
433433
patch -p1 <"${TP_PATCH_DIR}/apache-arrow-17.0.0-paimon.patch"
434+
435+
# apache-arrow-17.0.0-force-write-int96-timestamps.patch :
436+
# Introduces a parameter that forces writing int96 timestamps, for compatibility with Paimon-cpp.
437+
patch -p1 <"${TP_PATCH_DIR}/apache-arrow-17.0.0-force-write-int96-timestamps.patch"
434438
touch "${PATCHED_MARK}"
435439
fi
436440
cd -
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
diff -ruN arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/arrow/schema.cc arrow-apache-arrow-17.0.0/cpp/src/parquet/arrow/schema.cc
2+
--- arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/arrow/schema.cc 2026-03-27 01:23:23.651831424 +0800
3+
+++ arrow-apache-arrow-17.0.0/cpp/src/parquet/arrow/schema.cc 2026-03-27 01:28:36.855281965 +0800
4+
@@ -178,7 +178,8 @@
5+
6+
// The user is explicitly asking for Impala int96 encoding, there is no
7+
// logical type.
8+
- if (arrow_properties.support_deprecated_int96_timestamps() && target_unit == ::arrow::TimeUnit::NANO) {
9+
+ if (arrow_properties.force_write_int96_timestamps() ||
10+
+ (arrow_properties.support_deprecated_int96_timestamps() && target_unit == ::arrow::TimeUnit::NANO)) {
11+
*physical_type = ParquetType::INT96;
12+
return Status::OK();
13+
}
14+
diff -ruN arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/properties.h arrow-apache-arrow-17.0.0/cpp/src/parquet/properties.h
15+
--- arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/properties.h 2026-03-27 01:23:23.643831362 +0800
16+
+++ arrow-apache-arrow-17.0.0/cpp/src/parquet/properties.h 2026-03-27 01:27:47.717897537 +0800
17+
@@ -980,6 +980,7 @@
18+
public:
19+
Builder()
20+
: write_timestamps_as_int96_(false),
21+
+ force_write_int96_timestamps_(false),
22+
coerce_timestamps_enabled_(false),
23+
coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
24+
truncated_timestamps_allowed_(false),
25+
@@ -1005,6 +1006,21 @@
26+
return this;
27+
}
28+
29+
+ /// \brief Force writing legacy int96 timestamps.
30+
+ ///
31+
+ /// This bypasses unit-based guards and writes INT96 whenever timestamp
32+
+ /// metadata is resolved.
33+
+ Builder* enable_force_write_int96_timestamps() {
34+
+ force_write_int96_timestamps_ = true;
35+
+ return this;
36+
+ }
37+
+
38+
+ /// \brief Disable forcing legacy int96 timestamps (default).
39+
+ Builder* disable_force_write_int96_timestamps() {
40+
+ force_write_int96_timestamps_ = false;
41+
+ return this;
42+
+ }
43+
+
44+
/// \brief Coerce all timestamps to the specified time unit.
45+
/// \param unit time unit to truncate to.
46+
/// For Parquet versions 1.0 and 2.4, nanoseconds are casted to microseconds.
47+
@@ -1085,7 +1101,8 @@
48+
/// Create the final properties.
49+
std::shared_ptr<ArrowWriterProperties> build() {
50+
return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
51+
- write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_,
52+
+ write_timestamps_as_int96_, force_write_int96_timestamps_,
53+
+ coerce_timestamps_enabled_, coerce_timestamps_unit_,
54+
truncated_timestamps_allowed_, store_schema_, compliant_nested_types_,
55+
engine_version_, use_threads_, executor_));
56+
}
57+
@@ -1093,6 +1110,8 @@
58+
private:
59+
bool write_timestamps_as_int96_;
60+
61+
+ bool force_write_int96_timestamps_;
62+
+
63+
bool coerce_timestamps_enabled_;
64+
::arrow::TimeUnit::type coerce_timestamps_unit_;
65+
bool truncated_timestamps_allowed_;
66+
@@ -1107,6 +1126,8 @@
67+
68+
bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; }
69+
70+
+ bool force_write_int96_timestamps() const { return force_write_int96_timestamps_; }
71+
+
72+
bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; }
73+
::arrow::TimeUnit::type coerce_timestamps_unit() const {
74+
return coerce_timestamps_unit_;
75+
@@ -1138,6 +1159,7 @@
76+
77+
private:
78+
explicit ArrowWriterProperties(bool write_nanos_as_int96,
79+
+ bool force_write_int96_timestamps,
80+
bool coerce_timestamps_enabled,
81+
::arrow::TimeUnit::type coerce_timestamps_unit,
82+
bool truncated_timestamps_allowed, bool store_schema,
83+
@@ -1145,6 +1167,7 @@
84+
EngineVersion engine_version, bool use_threads,
85+
::arrow::internal::Executor* executor)
86+
: write_timestamps_as_int96_(write_nanos_as_int96),
87+
+ force_write_int96_timestamps_(force_write_int96_timestamps),
88+
coerce_timestamps_enabled_(coerce_timestamps_enabled),
89+
coerce_timestamps_unit_(coerce_timestamps_unit),
90+
truncated_timestamps_allowed_(truncated_timestamps_allowed),
91+
@@ -1155,6 +1178,7 @@
92+
executor_(executor) {}
93+
94+
const bool write_timestamps_as_int96_;
95+
+ const bool force_write_int96_timestamps_;
96+
const bool coerce_timestamps_enabled_;
97+
const ::arrow::TimeUnit::type coerce_timestamps_unit_;
98+
const bool truncated_timestamps_allowed_;

0 commit comments

Comments
 (0)