Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions thirdparty/download-thirdparty.sh
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,10 @@ if [[ " ${TP_ARCHIVES[*]} " =~ " ARROW " ]]; then
# Paimon-cpp parquet patches: row-group-aware batch reader, max_row_group_size,
# GetBufferedSize(), int96 NANO guard, and Thrift_VERSION empty fix.
patch -p1 <"${TP_PATCH_DIR}/apache-arrow-17.0.0-paimon.patch"

# apache-arrow-17.0.0-force-write-int96-timestamps.patch :
# Introduces a writer property that forces writing INT96 timestamps, for compatibility with Paimon-cpp.
patch -p1 <"${TP_PATCH_DIR}/apache-arrow-17.0.0-force-write-int96-timestamps.patch"
touch "${PATCHED_MARK}"
fi
cd -
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
diff -ruN arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/arrow/schema.cc arrow-apache-arrow-17.0.0/cpp/src/parquet/arrow/schema.cc
--- arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/arrow/schema.cc 2026-03-27 01:23:23.651831424 +0800
+++ arrow-apache-arrow-17.0.0/cpp/src/parquet/arrow/schema.cc 2026-03-27 01:28:36.855281965 +0800
@@ -178,7 +178,8 @@

// The user is explicitly asking for Impala int96 encoding, there is no
// logical type.
- if (arrow_properties.support_deprecated_int96_timestamps() && target_unit == ::arrow::TimeUnit::NANO) {
+ if (arrow_properties.force_write_int96_timestamps() ||
+ (arrow_properties.support_deprecated_int96_timestamps() && target_unit == ::arrow::TimeUnit::NANO)) {
*physical_type = ParquetType::INT96;
return Status::OK();
}
diff -ruN arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/properties.h arrow-apache-arrow-17.0.0/cpp/src/parquet/properties.h
--- arrow-apache-arrow-17.0.0-after-paimon/cpp/src/parquet/properties.h 2026-03-27 01:23:23.643831362 +0800
+++ arrow-apache-arrow-17.0.0/cpp/src/parquet/properties.h 2026-03-27 01:27:47.717897537 +0800
@@ -980,6 +980,7 @@
public:
Builder()
: write_timestamps_as_int96_(false),
+ force_write_int96_timestamps_(false),
coerce_timestamps_enabled_(false),
coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
truncated_timestamps_allowed_(false),
@@ -1005,6 +1006,21 @@
return this;
}

+ /// \brief Force writing legacy int96 timestamps.
+ ///
+ /// When enabled, timestamps are written as INT96 regardless of their time
+ /// unit, even if support_deprecated_int96_timestamps() is false.
+ Builder* enable_force_write_int96_timestamps() {
+ force_write_int96_timestamps_ = true;
+ return this;
+ }
+
+ /// \brief Disable forcing legacy int96 timestamps (default).
+ Builder* disable_force_write_int96_timestamps() {
+ force_write_int96_timestamps_ = false;
+ return this;
+ }
+
/// \brief Coerce all timestamps to the specified time unit.
/// \param unit time unit to truncate to.
/// For Parquet versions 1.0 and 2.4, nanoseconds are casted to microseconds.
@@ -1085,7 +1101,8 @@
/// Create the final properties.
std::shared_ptr<ArrowWriterProperties> build() {
return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
- write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_,
+ write_timestamps_as_int96_, force_write_int96_timestamps_,
+ coerce_timestamps_enabled_, coerce_timestamps_unit_,
truncated_timestamps_allowed_, store_schema_, compliant_nested_types_,
engine_version_, use_threads_, executor_));
}
@@ -1093,6 +1110,8 @@
private:
bool write_timestamps_as_int96_;

+ bool force_write_int96_timestamps_;
+
bool coerce_timestamps_enabled_;
::arrow::TimeUnit::type coerce_timestamps_unit_;
bool truncated_timestamps_allowed_;
@@ -1107,6 +1126,8 @@

bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; }

+ bool force_write_int96_timestamps() const { return force_write_int96_timestamps_; }
+
bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; }
::arrow::TimeUnit::type coerce_timestamps_unit() const {
return coerce_timestamps_unit_;
@@ -1138,6 +1159,7 @@

private:
explicit ArrowWriterProperties(bool write_nanos_as_int96,
+ bool force_write_int96_timestamps,
bool coerce_timestamps_enabled,
::arrow::TimeUnit::type coerce_timestamps_unit,
bool truncated_timestamps_allowed, bool store_schema,
@@ -1145,6 +1167,7 @@
EngineVersion engine_version, bool use_threads,
::arrow::internal::Executor* executor)
: write_timestamps_as_int96_(write_nanos_as_int96),
+ force_write_int96_timestamps_(force_write_int96_timestamps),
coerce_timestamps_enabled_(coerce_timestamps_enabled),
coerce_timestamps_unit_(coerce_timestamps_unit),
truncated_timestamps_allowed_(truncated_timestamps_allowed),
@@ -1155,6 +1178,7 @@
executor_(executor) {}

const bool write_timestamps_as_int96_;
+ const bool force_write_int96_timestamps_;
const bool coerce_timestamps_enabled_;
const ::arrow::TimeUnit::type coerce_timestamps_unit_;
const bool truncated_timestamps_allowed_;
Loading