From 1f40adb4feed235b8c3c5c6e215682a37a4efb68 Mon Sep 17 00:00:00 2001 From: lihangyu-x Date: Sun, 17 May 2026 15:30:26 +0800 Subject: [PATCH] [fix](be) Preserve Variant predefined decimal precision ### What problem does this PR solve? Issue Number: N/A (CIR-20240) Related PR: N/A Problem Summary: Variant predefined decimal typed paths could lose precision because JSON numeric values were materialized through binary numeric values before the later decimal cast. This change keeps raw number tokens in the simdjson-based Variant parser and, only when TabletSchema predefined Variant decimal typed paths are configured through ParseConfig, materializes matching numeric tokens as strings for the decimal cast. Untyped Variant numeric paths keep their existing numeric behavior. ### Release note Fix Variant predefined decimal typed path precision loss for high precision and scientific notation JSON numbers. ### Check List (For Author) - Test: - BE unit test: `./run-be-ut.sh --run --filter='JsonParserTest.*'` - Regression test: `./run-regression-test.sh --conf tmp/regression-conf.auto.groovy --run -d variant_p0/predefine -s test_variant_high_precision_decimal` - Build: `BUILD_TYPE=ASAN USE_MEM_TRACKER=ON ./build.sh --be` - Format: `PATH=/tmp/doris-codex-clang:$PATH build-support/check-format.sh`, `git diff --check` - Static analysis: Not run to completion; clang-tidy was attempted but blocked by local toolchain/pre-existing header issues. - Behavior changed: Yes. Variant predefined decimal typed paths preserve matching JSON numeric tokens for decimal materialization. - Does this need documentation: No --- be/src/exec/common/variant_util.cpp | 49 +++ be/src/util/json/json_parser.cpp | 37 +- be/src/util/json/json_parser.h | 29 +- be/src/util/json/simd_json_parser.h | 351 +++++++++++++++--- be/test/core/jsonb/json_parser_test.cpp | 203 +++++++++- ...variant_high_precision_decimal_stream.json | 1 + ...test_variant_high_precision_decimal.groovy | 168 +++++++++ 7 files changed, 773 insertions(+), 65 deletions(-) create mode 100644 regression-test/data/variant_p0/predefine/test_variant_high_precision_decimal_stream.json create mode 100644 regression-test/suites/variant_p0/predefine/test_variant_high_precision_decimal.groovy diff --git a/be/src/exec/common/variant_util.cpp b/be/src/exec/common/variant_util.cpp index 39e8f236ecd16e..93a0e179207050 100644 --- a/be/src/exec/common/variant_util.cpp +++ b/be/src/exec/common/variant_util.cpp @@ -104,6 +104,54 @@ namespace doris::variant_util { +static bool is_decimal_typed_path_column(const TabletColumn& column) { + if (column.is_array_type()) { + CHECK_EQ(column.get_sub_columns().size(), 1); + return is_decimal_typed_path_column(*column.get_sub_columns()[0]); + } + return is_decimal(TabletColumn::get_primitive_type_by_field_type(column.type())); +} + +struct DecimalNumberPreservePathRule { + std::string pattern; + PatternTypePB pattern_type; + bool is_decimal = false; +}; + +static void configure_decimal_number_preserve_paths(const TabletColumn& column, + ParseConfig* config) { + std::vector path_rules; + bool has_decimal_path = false; + for (const auto& sub_column : column.get_sub_columns()) { + const bool is_decimal_path = is_decimal_typed_path_column(*sub_column); + has_decimal_path |= is_decimal_path; + path_rules.push_back({sub_column->name(), sub_column->pattern_type(), is_decimal_path}); + } + if (has_decimal_path) { + config->preserve_decimal_number_path_matcher = + [path_rules = std::move(path_rules)](std::string_view path) { + std::string candidate_path(path); + for (const auto& rule : path_rules) { + switch (rule.pattern_type) { + case PatternTypePB::MATCH_NAME: + if (rule.pattern == candidate_path) { + return rule.is_decimal; + } + break; + case PatternTypePB::MATCH_NAME_GLOB: + if (glob_match_re2(rule.pattern, candidate_path)) { + return rule.is_decimal; + } + break; + default: + break; + } + } + return false; + }; + } +} + inline void append_escaped_regex_char(std::string* regex_output, char ch) { switch (ch) { case '.': @@ -2245,6 +2293,7 @@ Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& t return Status::InternalError("column is not variant type, column name: {}", column.name()); } + configure_decimal_number_preserve_paths(column, &configs[i]); // if doc mode is not enabled, no need to parse to doc value column if (!column.variant_enable_doc_mode()) { configs[i].parse_to = ParseConfig::ParseTo::OnlySubcolumns; diff --git a/be/src/util/json/json_parser.cpp b/be/src/util/json/json_parser.cpp index 3df723c3849eac..14345ab35a3473 100644 --- a/be/src/util/json/json_parser.cpp +++ b/be/src/util/json/json_parser.cpp @@ -31,6 +31,7 @@ #include "common/cast_set.h" // IWYU pragma: keep #include "common/status.h" +#include "util/defer_op.h" #include "util/json/path_in_data.h" #include "util/json/simd_json_parser.h" @@ -43,12 +44,15 @@ std::optional JSONDataParser::parse(const char* begin, if (!parser.parse(begin, length, document)) { return {}; } + Defer release_parser {[&]() { parser.release(); }}; ParseContext context; // deprecated_enable_flatten_nested controls nested path traversal // NestedGroup expansion is now handled at storage layer context.deprecated_enable_flatten_nested = config.deprecated_enable_flatten_nested; context.check_duplicate_json_path = config.check_duplicate_json_path; context.is_top_array = document.isArray(); + context.preserve_decimal_number_paths = &config.preserve_decimal_number_paths; + context.preserve_decimal_number_path_matcher = &config.preserve_decimal_number_path_matcher; traverse(document, context); ParseResult result; result.values = std::move(context.values); @@ -84,8 +88,30 @@ void JSONDataParser::traverse(const Element& element, ParseContext& // we should set has_nested_in_flatten to false when traverse array finished for next array otherwise it will be true for next array ctx.has_nested_in_flatten = false; } else { - appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(), getValueAsField(element)); + const bool preserve_number_as_string = shouldPreserveNumberAsString(ctx); + appendValueIfNotDuplicate(ctx, ctx.builder.get_parts(), + getValueAsField(element, preserve_number_as_string)); + } +} + +template +bool JSONDataParser::shouldPreserveNumberAsString(const ParseContext& ctx) const { + const bool has_exact_paths = ctx.preserve_decimal_number_paths != nullptr && + !ctx.preserve_decimal_number_paths->empty(); + const bool has_path_matcher = ctx.preserve_decimal_number_path_matcher != nullptr && + *ctx.preserve_decimal_number_path_matcher; + if (!has_exact_paths && !has_path_matcher) { + return false; + } + PathInData::Parts path = ctx.path_prefix_for_typed_paths; + const auto& current_parts = ctx.builder.get_parts(); + path.insert(path.end(), current_parts.begin(), current_parts.end()); + const auto current_path = PathInData(path).get_path(); + if (has_exact_paths && ctx.preserve_decimal_number_paths->find(current_path) != + ctx.preserve_decimal_number_paths->end()) { + return true; } + return has_path_matcher && (*ctx.preserve_decimal_number_path_matcher)(current_path); } template @@ -201,6 +227,12 @@ void JSONDataParser::traverseArray(const JSONArray& array, ParseCont array_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten; array_ctx.is_top_array = ctx.is_top_array; array_ctx.check_duplicate_json_path = ctx.check_duplicate_json_path; + array_ctx.preserve_decimal_number_paths = ctx.preserve_decimal_number_paths; + array_ctx.preserve_decimal_number_path_matcher = ctx.preserve_decimal_number_path_matcher; + array_ctx.path_prefix_for_typed_paths = ctx.path_prefix_for_typed_paths; + const auto& current_parts = ctx.builder.get_parts(); + array_ctx.path_prefix_for_typed_paths.insert(array_ctx.path_prefix_for_typed_paths.end(), + current_parts.begin(), current_parts.end()); array_ctx.total_size = array.size(); for (auto it = array.begin(); it != array.end(); ++it) { traverseArrayElement(*it, array_ctx); @@ -231,6 +263,9 @@ void JSONDataParser::traverseArrayElement(const Element& element, element_ctx.has_nested_in_flatten = ctx.has_nested_in_flatten; element_ctx.is_top_array = ctx.is_top_array; element_ctx.check_duplicate_json_path = ctx.check_duplicate_json_path; + element_ctx.preserve_decimal_number_paths = ctx.preserve_decimal_number_paths; + element_ctx.preserve_decimal_number_path_matcher = ctx.preserve_decimal_number_path_matcher; + element_ctx.path_prefix_for_typed_paths = ctx.path_prefix_for_typed_paths; traverse(element, element_ctx); auto& paths = element_ctx.paths; auto& values = element_ctx.values; diff --git a/be/src/util/json/json_parser.h b/be/src/util/json/json_parser.h index c4a165e899546f..ff783124b2e179 100644 --- a/be/src/util/json/json_parser.h +++ b/be/src/util/json/json_parser.h @@ -21,10 +21,12 @@ #pragma once #include -#include +#include +#include #include #include +#include #include #include @@ -40,7 +42,10 @@ namespace doris { template -Field getValueAsField(const Element& element) { +Field getValueAsField(const Element& element, bool preserve_number_as_string = false) { + if (preserve_number_as_string && element.isNumber()) { + return Field::create_field(String(element.getRawNumber())); + } // bool will convert to type FiledType::UInt64 if (element.isBool()) { return Field::create_field(element.getBool()); @@ -53,6 +58,9 @@ Field getValueAsField(const Element& element) { if (element.isUInt64()) { return Field::create_field(static_cast(element.getUInt64())); } + if (element.isBigInteger()) { + return Field::create_field(element.getDouble()); + } if (element.isDouble()) { return Field::create_field(element.getDouble()); } @@ -60,9 +68,9 @@ Field getValueAsField(const Element& element) { return Field::create_field(String(element.getString())); } if (element.isNull()) { - return Field(); + return {}; } - return Field(); + return {}; } template @@ -82,6 +90,10 @@ void writeValueAsJsonb(const Element& element, JsonbWriter& writer) { writer.writeInt128(static_cast(element.getUInt64())); return; } + if (element.isBigInteger()) { + writer.writeDouble(element.getDouble()); + return; + } if (element.isDouble()) { writer.writeDouble(element.getDouble()); return; @@ -107,6 +119,8 @@ struct ParseConfig { OnlyDocValueColumn = 1, }; ParseTo parse_to = ParseTo::OnlySubcolumns; + phmap::flat_hash_set preserve_decimal_number_paths; + std::function preserve_decimal_number_path_matcher; }; /// Result of parsing of a document. /// Contains all paths extracted from document @@ -133,6 +147,9 @@ class JSONDataParser { bool check_duplicate_json_path = false; bool has_nested_in_flatten = false; bool is_top_array = false; + const phmap::flat_hash_set* preserve_decimal_number_paths = nullptr; + const std::function* preserve_decimal_number_path_matcher = nullptr; + PathInData::Parts path_prefix_for_typed_paths; }; using PathPartsWithArray = std::pair; using PathToArray = phmap::flat_hash_map; @@ -145,12 +162,16 @@ class JSONDataParser { bool has_nested_in_flatten = false; bool is_top_array = false; bool check_duplicate_json_path = false; + const phmap::flat_hash_set* preserve_decimal_number_paths = nullptr; + const std::function* preserve_decimal_number_path_matcher = nullptr; + PathInData::Parts path_prefix_for_typed_paths; }; void traverse(const Element& element, ParseContext& ctx); void traverseObject(const JSONObject& object, ParseContext& ctx); void traverseArray(const JSONArray& array, ParseContext& ctx); void appendValueIfNotDuplicate(ParseContext& ctx, const PathInData::Parts& path, Field&& value); void traverseArrayElement(const Element& element, ParseArrayContext& ctx); + bool shouldPreserveNumberAsString(const ParseContext& ctx) const; void checkAmbiguousStructure(const ParseArrayContext& ctx, const std::vector& paths); void handleExistingPath(std::pair& path_data, diff --git a/be/src/util/json/simd_json_parser.h b/be/src/util/json/simd_json_parser.h index f450ac4a05f7bf..a3683c31a27d79 100644 --- a/be/src/util/json/simd_json_parser.h +++ b/be/src/util/json/simd_json_parser.h @@ -20,9 +20,15 @@ #pragma once -#include #include +#include +#include +#include +#include +#include +#include + #include "core/types.h" namespace doris { @@ -30,6 +36,31 @@ namespace doris { /// This class can be used as an argument for the template class FunctionJSON. /// It provides ability to parse JSONs using simdjson library. class SimdJSONParser { + struct Node { + enum class Type { + INT64, + UINT64, + BIG_INTEGER, + DOUBLE, + STRING, + ARRAY, + OBJECT, + BOOL, + NULL_VALUE, + }; + + Type type = Type::NULL_VALUE; + Int64 int64_value = 0; + UInt64 uint64_value = 0; + double double_value = 0; + bool bool_value = false; + std::string string_value; + std::string raw_number; + std::vector array_values; + std::vector object_keys; + std::vector object_values; + }; + public: class Array; class Object; @@ -37,54 +68,87 @@ class SimdJSONParser { /// array or object. class Element { public: - ALWAYS_INLINE Element() {} /// NOLINT - ALWAYS_INLINE Element(const simdjson::dom::element& element_) - : element(element_) {} /// NOLINT + ALWAYS_INLINE Element() {} /// NOLINT + ALWAYS_INLINE explicit Element(const Node* node_) : node(node_) {} /// NOLINT ALWAYS_INLINE bool isInt64() const { - return element.type() == simdjson::dom::element_type::INT64; + assert(node != nullptr); + return node->type == Node::Type::INT64; } ALWAYS_INLINE bool isUInt64() const { - return element.type() == simdjson::dom::element_type::UINT64; + assert(node != nullptr); + return node->type == Node::Type::UINT64; + } + ALWAYS_INLINE bool isBigInteger() const { + assert(node != nullptr); + return node->type == Node::Type::BIG_INTEGER; + } + ALWAYS_INLINE bool isNumber() const { + assert(node != nullptr); + return node->type == Node::Type::INT64 || node->type == Node::Type::UINT64 || + node->type == Node::Type::BIG_INTEGER || node->type == Node::Type::DOUBLE; } ALWAYS_INLINE bool isDouble() const { - return element.type() == simdjson::dom::element_type::DOUBLE; + assert(node != nullptr); + return node->type == Node::Type::DOUBLE; } ALWAYS_INLINE bool isString() const { - return element.type() == simdjson::dom::element_type::STRING; + assert(node != nullptr); + return node->type == Node::Type::STRING; } ALWAYS_INLINE bool isArray() const { - return element.type() == simdjson::dom::element_type::ARRAY; + assert(node != nullptr); + return node->type == Node::Type::ARRAY; } ALWAYS_INLINE bool isObject() const { - return element.type() == simdjson::dom::element_type::OBJECT; + assert(node != nullptr); + return node->type == Node::Type::OBJECT; } ALWAYS_INLINE bool isBool() const { - return element.type() == simdjson::dom::element_type::BOOLEAN; + assert(node != nullptr); + return node->type == Node::Type::BOOL; } ALWAYS_INLINE bool isNull() const { - return element.type() == simdjson::dom::element_type::NULL_VALUE; + assert(node != nullptr); + return node->type == Node::Type::NULL_VALUE; + } + ALWAYS_INLINE Int64 getInt64() const { + assert(node != nullptr); + return node->int64_value; + } + ALWAYS_INLINE double getDouble() const { + assert(node != nullptr); + return node->double_value; + } + ALWAYS_INLINE bool getBool() const { + assert(node != nullptr); + return node->bool_value; } - ALWAYS_INLINE Int64 getInt64() const { return element.get_int64().value_unsafe(); } - ALWAYS_INLINE double getDouble() const { return element.get_double().value_unsafe(); } - ALWAYS_INLINE bool getBool() const { return element.get_bool().value_unsafe(); } ALWAYS_INLINE std::string_view getString() const { - return element.get_string().value_unsafe(); + assert(node != nullptr); + return node->string_value; + } + ALWAYS_INLINE UInt64 getUInt64() const { + assert(node != nullptr); + return node->uint64_value; + } + ALWAYS_INLINE std::string_view getRawNumber() const { + assert(node != nullptr); + return node->raw_number; } - ALWAYS_INLINE UInt64 getUInt64() const { return element.get_uint64().value_unsafe(); } ALWAYS_INLINE Array getArray() const; ALWAYS_INLINE Object getObject() const; private: - simdjson::dom::element element; + const Node* node = nullptr; }; /// References an array in a JSON document. class Array { public: class Iterator { public: - ALWAYS_INLINE Iterator(const simdjson::dom::array::iterator& it_) - : it(it_) {} /// NOLINT - ALWAYS_INLINE Element operator*() const { return *it; } + using NodeIterator = std::vector::const_iterator; + ALWAYS_INLINE explicit Iterator(NodeIterator it_) : it(it_) {} /// NOLINT + ALWAYS_INLINE Element operator*() const { return Element(&*it); } ALWAYS_INLINE Iterator& operator++() { ++it; return *this; @@ -94,19 +158,19 @@ class SimdJSONParser { } private: - simdjson::dom::array::iterator it; + NodeIterator it; }; - ALWAYS_INLINE Array(const simdjson::dom::array& array_) : array(array_) {} /// NOLINT - ALWAYS_INLINE Iterator begin() const { return array.begin(); } - ALWAYS_INLINE Iterator end() const { return array.end(); } - ALWAYS_INLINE size_t size() const { return array.size(); } + ALWAYS_INLINE explicit Array(const std::vector* array_) : array(array_) {} /// NOLINT + ALWAYS_INLINE Iterator begin() const { return Iterator(array->begin()); } + ALWAYS_INLINE Iterator end() const { return Iterator(array->end()); } + ALWAYS_INLINE size_t size() const { return array->size(); } ALWAYS_INLINE Element operator[](size_t index) const { assert(index < size()); - return array.at(index).value_unsafe(); + return Element(&(*array)[index]); } private: - simdjson::dom::array array; + const std::vector* array; }; using KeyValuePair = std::pair; /// References an object in a JSON document. @@ -114,67 +178,244 @@ class SimdJSONParser { public: class Iterator { public: - ALWAYS_INLINE Iterator(const simdjson::dom::object::iterator& it_) - : it(it_) {} /// NOLINT + ALWAYS_INLINE explicit Iterator(const std::vector* keys_, + const std::vector* values_, size_t index_) + : index(index_), keys(keys_), values(values_) {} /// NOLINT ALWAYS_INLINE KeyValuePair operator*() const { - const auto& res = *it; - return {res.key, res.value}; + return {(*keys)[index], Element(&(*values)[index])}; } ALWAYS_INLINE Iterator& operator++() { - ++it; + ++index; return *this; } ALWAYS_INLINE Iterator operator++(int) { auto res = *this; - ++it; + ++(*this); return res; } /// NOLINT ALWAYS_INLINE friend bool operator!=(const Iterator& left, const Iterator& right) { - return left.it != right.it; + return left.index != right.index; } ALWAYS_INLINE friend bool operator==(const Iterator& left, const Iterator& right) { return !(left != right); } private: - simdjson::dom::object::iterator it; + size_t index; + const std::vector* keys; + const std::vector* values; }; - ALWAYS_INLINE Object(const simdjson::dom::object& object_) : object(object_) {} /// NOLINT - ALWAYS_INLINE Iterator begin() const { return object.begin(); } - ALWAYS_INLINE Iterator end() const { return object.end(); } - ALWAYS_INLINE size_t size() const { return object.size(); } + ALWAYS_INLINE explicit Object(const std::vector* keys_, + const std::vector* values_) + : keys(keys_), values(values_) {} /// NOLINT + ALWAYS_INLINE Iterator begin() const { return Iterator(keys, values, 0); } + ALWAYS_INLINE Iterator end() const { return Iterator(keys, values, size()); } + ALWAYS_INLINE size_t size() const { return values->size(); } /// Optional: Provides access to an object's element by index. KeyValuePair operator[](size_t index) const { assert(index < size()); - auto it = object.begin(); - while (index--) { - ++it; - } - const auto& res = *it; - return {res.key, res.value}; + return {(*keys)[index], Element(&(*values)[index])}; } private: - simdjson::dom::object object; + const std::vector* keys; + const std::vector* values; }; /// Parses a JSON document, returns the reference to its root element if succeeded. bool parse(const char* data, size_t size, Element& result) { - auto document = parser.parse(data, size); - if (document.error()) { + root = Node(); + return parse_ondemand(data, size, result); + } + void release() { root = Node(); } + +private: + bool parse_ondemand(const char* data, size_t size, Element& result) { + simdjson::padded_string padded_json(data, size); + simdjson::ondemand::document document; + auto error = ondemand_parser.iterate(padded_json).get(document); + if (error) { return false; } - result = document.value_unsafe(); + if (!build_node(document, &root)) { + root = Node(); + return false; + } + result = Element(&root); return true; } -private: - simdjson::dom::parser parser; + static std::string_view trim_raw_number(std::string_view raw_number) { + auto is_space = [](char ch) { return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'; }; + while (!raw_number.empty() && is_space(raw_number.front())) { + raw_number.remove_prefix(1); + } + while (!raw_number.empty() && is_space(raw_number.back())) { + raw_number.remove_suffix(1); + } + return raw_number; + } + + template + static bool assign_raw_number(RawNumber&& raw_number, std::string* out) { + if constexpr (std::is_same_v, std::string_view>) { + *out = std::string(trim_raw_number(raw_number)); + return true; + } else { + std::string_view raw_number_view; + auto error = std::move(raw_number).get(raw_number_view); + if (error) { + return false; + } + *out = std::string(trim_raw_number(raw_number_view)); + return true; + } + } + + template + bool build_array_node(Value& value, Node* out) { + simdjson::ondemand::array array; + auto error = value.get_array().get(array); + if (error) { + return false; + } + out->type = Node::Type::ARRAY; + for (auto element_result : array) { + simdjson::ondemand::value element; + error = std::move(element_result).get(element); + if (error) { + return false; + } + Node element_node; + if (!build_node(element, &element_node)) { + return false; + } + out->array_values.push_back(std::move(element_node)); + } + return true; + } + + template + bool build_object_node(Value& value, Node* out) { + simdjson::ondemand::object object; + auto error = value.get_object().get(object); + if (error) { + return false; + } + out->type = Node::Type::OBJECT; + for (auto field_result : object) { + simdjson::ondemand::field field; + error = std::move(field_result).get(field); + if (error) { + return false; + } + std::string_view key; + error = field.unescaped_key().get(key); + if (error) { + return false; + } + std::string key_copy(key); + simdjson::ondemand::value field_value = field.value(); + Node field_node; + if (!build_node(field_value, &field_node)) { + return false; + } + out->object_keys.push_back(std::move(key_copy)); + out->object_values.push_back(std::move(field_node)); + } + return true; + } + + template + bool build_number_node(Value& value, Node* out) { + simdjson::ondemand::number_type number_type; + auto error = value.get_number_type().get(number_type); + if (error) { + return false; + } + switch (number_type) { + case simdjson::ondemand::number_type::signed_integer: + if (!assign_raw_number(value.raw_json_token(), &out->raw_number)) { + return false; + } + out->type = Node::Type::INT64; + error = value.get_int64().get(out->int64_value); + return !error; + case simdjson::ondemand::number_type::unsigned_integer: + if (!assign_raw_number(value.raw_json_token(), &out->raw_number)) { + return false; + } + out->type = Node::Type::UINT64; + error = value.get_uint64().get(out->uint64_value); + return !error; + case simdjson::ondemand::number_type::floating_point_number: + if (!assign_raw_number(value.raw_json_token(), &out->raw_number)) { + return false; + } + out->type = Node::Type::DOUBLE; + error = value.get_double().get(out->double_value); + return !error; + case simdjson::ondemand::number_type::big_integer: { + if (!assign_raw_number(value.raw_json_token(), &out->raw_number)) { + return false; + } + out->type = Node::Type::BIG_INTEGER; + error = value.get_double().get(out->double_value); + return !error; + } + } + return false; + } + + template + bool build_string_node(Value& value, Node* out) { + std::string_view str; + auto error = value.get_string().get(str); + if (error) { + return false; + } + out->type = Node::Type::STRING; + out->string_value = std::string(str); + return true; + } + + template + bool build_node(Value& value, Node* out) { + simdjson::ondemand::json_type type; + auto error = value.type().get(type); + if (error) { + return false; + } + switch (type) { + case simdjson::ondemand::json_type::array: + return build_array_node(value, out); + case simdjson::ondemand::json_type::object: + return build_object_node(value, out); + case simdjson::ondemand::json_type::number: + return build_number_node(value, out); + case simdjson::ondemand::json_type::string: { + return build_string_node(value, out); + } + case simdjson::ondemand::json_type::boolean: + out->type = Node::Type::BOOL; + error = value.get_bool().get(out->bool_value); + return !error; + case simdjson::ondemand::json_type::null: + out->type = Node::Type::NULL_VALUE; + return true; + } + return false; + } + + simdjson::ondemand::parser ondemand_parser; + Node root; }; inline ALWAYS_INLINE SimdJSONParser::Array SimdJSONParser::Element::getArray() const { - return element.get_array().value_unsafe(); + assert(node != nullptr); + return Array(&node->array_values); } inline ALWAYS_INLINE SimdJSONParser::Object SimdJSONParser::Element::getObject() const { - return element.get_object().value_unsafe(); + assert(node != nullptr); + return Object(&node->object_keys, &node->object_values); } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/test/core/jsonb/json_parser_test.cpp b/be/test/core/jsonb/json_parser_test.cpp index f8cfdf8e1626bd..c013a610890aa2 100644 --- a/be/test/core/jsonb/json_parser_test.cpp +++ b/be/test/core/jsonb/json_parser_test.cpp @@ -19,6 +19,7 @@ #include +#include #include #include "common/config.h" @@ -49,6 +50,77 @@ TEST(JsonParserTest, ParseSimpleTypes) { EXPECT_EQ(parse_result_double.paths[0].get_path(), ""); EXPECT_EQ(parse_result_double.values[0].get_type(), doris::PrimitiveType::TYPE_DOUBLE); + // untyped high precision decimal tokens keep the regular numeric behavior + std::string high_precision_decimal = "999999999999999999999999999.999999999"; + result = parser.parse(high_precision_decimal.c_str(), high_precision_decimal.size(), config); + ASSERT_TRUE(result.has_value()); + + auto parse_result_decimal = result.value(); + EXPECT_EQ(parse_result_decimal.paths[0].get_path(), ""); + EXPECT_EQ(parse_result_decimal.values[0].get_type(), doris::PrimitiveType::TYPE_DOUBLE); + + // typed decimal paths keep the original text for later decimal casts + config.preserve_decimal_number_paths.emplace(""); + result = parser.parse(high_precision_decimal.c_str(), high_precision_decimal.size(), config); + ASSERT_TRUE(result.has_value()); + + parse_result_decimal = result.value(); + EXPECT_EQ(parse_result_decimal.paths[0].get_path(), ""); + EXPECT_EQ(parse_result_decimal.values[0].get_type(), doris::PrimitiveType::TYPE_STRING); + EXPECT_EQ(parse_result_decimal.values[0].get(), + high_precision_decimal); + + std::string scale_sensitive_decimal = "0.57"; + result = parser.parse(scale_sensitive_decimal.c_str(), scale_sensitive_decimal.size(), config); + ASSERT_TRUE(result.has_value()); + + parse_result_decimal = result.value(); + EXPECT_EQ(parse_result_decimal.paths[0].get_path(), ""); + EXPECT_EQ(parse_result_decimal.values[0].get_type(), doris::PrimitiveType::TYPE_STRING); + EXPECT_EQ(parse_result_decimal.values[0].get(), + scale_sensitive_decimal); + + std::string high_precision_decimal_with_spaces = high_precision_decimal + " \n"; + result = parser.parse(high_precision_decimal_with_spaces.c_str(), + high_precision_decimal_with_spaces.size(), config); + ASSERT_TRUE(result.has_value()); + + parse_result_decimal = result.value(); + EXPECT_EQ(parse_result_decimal.paths[0].get_path(), ""); + EXPECT_EQ(parse_result_decimal.values[0].get_type(), doris::PrimitiveType::TYPE_STRING); + EXPECT_EQ(parse_result_decimal.values[0].get(), + high_precision_decimal); + + std::string high_precision_decimal_exponent = "999999999999999999999999999.999999999e0"; + result = parser.parse(high_precision_decimal_exponent.c_str(), + high_precision_decimal_exponent.size(), config); + ASSERT_TRUE(result.has_value()); + + parse_result_decimal = result.value(); + EXPECT_EQ(parse_result_decimal.paths[0].get_path(), ""); + EXPECT_EQ(parse_result_decimal.values[0].get_type(), doris::PrimitiveType::TYPE_STRING); + EXPECT_EQ(parse_result_decimal.values[0].get(), + high_precision_decimal_exponent); + + std::string typed_integer = "123"; + result = parser.parse(typed_integer.c_str(), typed_integer.size(), config); + ASSERT_TRUE(result.has_value()); + + auto parse_result_typed_integer = result.value(); + EXPECT_EQ(parse_result_typed_integer.paths[0].get_path(), ""); + EXPECT_EQ(parse_result_typed_integer.values[0].get_type(), doris::PrimitiveType::TYPE_STRING); + EXPECT_EQ(parse_result_typed_integer.values[0].get(), + typed_integer); + + ParseConfig untyped_decimal_config; + result = parser.parse(high_precision_decimal_exponent.c_str(), + high_precision_decimal_exponent.size(), untyped_decimal_config); + ASSERT_TRUE(result.has_value()); + + parse_result_decimal = result.value(); + EXPECT_EQ(parse_result_decimal.paths[0].get_path(), ""); + EXPECT_EQ(parse_result_decimal.values[0].get_type(), doris::PrimitiveType::TYPE_DOUBLE); + // bool result = parser.parse("true", 4, config); ASSERT_TRUE(result.has_value()); @@ -71,11 +143,57 @@ TEST(JsonParserTest, ParseSimpleTypes) { EXPECT_EQ(parse_result_string.values[0].get_type(), doris::PrimitiveType::TYPE_STRING); // largeint - result = parser.parse("12345678901234567890", 20, config); + ParseConfig untyped_integer_config; + result = parser.parse("12345678901234567890", 20, untyped_integer_config); ASSERT_TRUE(result.has_value()); auto parse_result_bigint = result.value(); EXPECT_EQ(parse_result_bigint.paths[0].get_path(), ""); EXPECT_EQ(parse_result_bigint.values[0].get_type(), doris::PrimitiveType::TYPE_LARGEINT); + + // untyped integers beyond uint64 keep the regular numeric behavior. + std::string big_integer = "18446744073709551616"; + result = parser.parse(big_integer.c_str(), big_integer.size(), untyped_integer_config); + ASSERT_TRUE(result.has_value()); + auto parse_result_untyped_big_integer = result.value(); + EXPECT_EQ(parse_result_untyped_big_integer.paths[0].get_path(), ""); + EXPECT_EQ(parse_result_untyped_big_integer.values[0].get_type(), + doris::PrimitiveType::TYPE_DOUBLE); + + ParseConfig typed_integer_config; + typed_integer_config.preserve_decimal_number_paths.emplace(""); + result = parser.parse(big_integer.c_str(), big_integer.size(), typed_integer_config); + ASSERT_TRUE(result.has_value()); + auto parse_result_big_integer = result.value(); + EXPECT_EQ(parse_result_big_integer.paths[0].get_path(), ""); + EXPECT_EQ(parse_result_big_integer.values[0].get_type(), doris::PrimitiveType::TYPE_STRING); + EXPECT_EQ(parse_result_big_integer.values[0].get(), + big_integer); + + std::string decimal256_integer = "99999999999999999999999999999999999999999999999999"; + result = parser.parse(decimal256_integer.c_str(), decimal256_integer.size(), + typed_integer_config); + ASSERT_TRUE(result.has_value()); + auto parse_result_decimal256_integer = result.value(); + EXPECT_EQ(parse_result_decimal256_integer.paths[0].get_path(), ""); + EXPECT_EQ(parse_result_decimal256_integer.values[0].get_type(), + doris::PrimitiveType::TYPE_STRING); + EXPECT_EQ(parse_result_decimal256_integer.values[0].get(), + decimal256_integer); + + ParseConfig mixed_number_config; + mixed_number_config.preserve_decimal_number_paths.emplace("amount"); + std::string mixed_json = + R"({"amount":0.57,"id":18446744073709551616,"normal":12345678901234567890})"; + result = parser.parse(mixed_json.c_str(), mixed_json.size(), mixed_number_config); + ASSERT_TRUE(result.has_value()); + auto parse_result_mixed_number = result.value(); + ASSERT_EQ(parse_result_mixed_number.paths.size(), 3); + EXPECT_EQ(parse_result_mixed_number.paths[0].get_path(), "amount"); + EXPECT_EQ(parse_result_mixed_number.values[0].get_type(), doris::PrimitiveType::TYPE_STRING); + EXPECT_EQ(parse_result_mixed_number.paths[1].get_path(), "id"); + EXPECT_EQ(parse_result_mixed_number.values[1].get_type(), doris::PrimitiveType::TYPE_DOUBLE); + EXPECT_EQ(parse_result_mixed_number.paths[2].get_path(), "normal"); + EXPECT_EQ(parse_result_mixed_number.values[2].get_type(), doris::PrimitiveType::TYPE_LARGEINT); } TEST(JsonParserTest, ParseObjectAndArray) { @@ -117,6 +235,69 @@ TEST(JsonParserTest, ParseObjectAndArray) { EXPECT_EQ(array_field[5].get_type(), doris::PrimitiveType::TYPE_LARGEINT); } +TEST(JsonParserTest, PreserveDecimalNumbersForTypedPaths) { + JSONDataParser parser; + ParseConfig config; + config.preserve_decimal_number_paths.emplace("a"); + + std::string json = R"({"a":[999999999999999999999999999.999999999 ]})"; + auto result = parser.parse(json.c_str(), json.size(), config); + ASSERT_TRUE(result.has_value()); + auto& parse_result_decimal_array = result.value(); + EXPECT_EQ(parse_result_decimal_array.paths[0].get_path(), "a"); + EXPECT_EQ(parse_result_decimal_array.values[0].get_type(), doris::PrimitiveType::TYPE_ARRAY); + auto& decimal_array = + parse_result_decimal_array.values[0].get(); + ASSERT_EQ(decimal_array.size(), 1); + EXPECT_EQ(decimal_array[0].get_type(), doris::PrimitiveType::TYPE_STRING); + EXPECT_EQ(decimal_array[0].get(), + "999999999999999999999999999.999999999"); + + ParseConfig untyped_config; + result = parser.parse(json.c_str(), json.size(), untyped_config); + ASSERT_TRUE(result.has_value()); + auto& parse_result_untyped_decimal_array = result.value(); + auto& untyped_decimal_array = + parse_result_untyped_decimal_array.values[0].get(); + ASSERT_EQ(untyped_decimal_array.size(), 1); + EXPECT_EQ(untyped_decimal_array[0].get_type(), doris::PrimitiveType::TYPE_DOUBLE); +} + +TEST(JsonParserTest, PreserveDecimalNumbersByPathMatcher) { + JSONDataParser parser; + + ParseConfig matcher_config; + matcher_config.preserve_decimal_number_path_matcher = [](std::string_view path) { + return path.size() >= 8 && path.substr(0, 8) == "decimal_"; + }; + + std::string json = + R"({"decimal_1":999999999999999999999999999.999999999,"other":999999999999999999999999999.999999999})"; + auto result = parser.parse(json.c_str(), json.size(), matcher_config); + ASSERT_TRUE(result.has_value()); + auto& parse_result_matcher = result.value(); + EXPECT_EQ(parse_result_matcher.paths[0].get_path(), "decimal_1"); + EXPECT_EQ(parse_result_matcher.values[0].get_type(), doris::PrimitiveType::TYPE_STRING); + EXPECT_EQ(parse_result_matcher.paths[1].get_path(), "other"); + EXPECT_EQ(parse_result_matcher.values[1].get_type(), doris::PrimitiveType::TYPE_DOUBLE); +} + +TEST(JsonParserTest, PreserveDecimalNumbersForEscapedTypedPath) { + JSONDataParser parser; + ParseConfig escaped_key_config; + escaped_key_config.preserve_decimal_number_paths.emplace("ab.decimal"); + + std::string json = R"({"a\u0062":{"decimal":0.57,"nested_key":"value"}})"; + auto result = parser.parse(json.c_str(), json.size(), escaped_key_config); + ASSERT_TRUE(result.has_value()); + auto& parse_result_escaped_key = result.value(); + ASSERT_EQ(parse_result_escaped_key.paths.size(), 2); + EXPECT_EQ(parse_result_escaped_key.paths[0].get_path(), "ab.decimal"); + EXPECT_EQ(parse_result_escaped_key.values[0].get_type(), doris::PrimitiveType::TYPE_STRING); + EXPECT_EQ(parse_result_escaped_key.values[0].get(), "0.57"); + EXPECT_EQ(parse_result_escaped_key.paths[1].get_path(), "ab.nested_key"); +} + TEST(JsonParserTest, ParseMultiLevelNestedArray) { JSONDataParser parser; ParseConfig config; @@ -313,6 +494,18 @@ TEST(JsonParserTest, TestNestedArrayWithDifferentConfigs) { // EXPECT_ANY_THROW(parser.parse(json1.c_str(), json1.size(), config2)); } +TEST(JsonParserTest, BigIntegerInJsonbKeepsNumericParse) { + JSONDataParser parser; + ParseConfig config; + config.deprecated_enable_flatten_nested = false; + + std::string json = R"({"nested": [{"big": 18446744073709551616}]})"; + auto result = parser.parse(json.c_str(), json.size(), config); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result->values.size(), 1); + EXPECT_EQ(result->values[0].get_type(), doris::PrimitiveType::TYPE_JSONB); +} + // Test case for directly calling handleNewPath to cover the if (!nested_key.empty()) branch TEST(JsonParserTest, TestHandleNewPathDirectCall) { JSONDataParser parser; @@ -455,11 +648,11 @@ TEST(JsonParserTest, KeyLengthLimitByConfig) { ScopedMaxJsonKeyLength guard(10); std::string key11(11, 'a'); - std::string obj_json = "{\"" + key11 + "\": 1}"; + std::string obj_json = R"({")" + key11 + R"(": 1})"; EXPECT_ANY_THROW(parser.parse(obj_json.c_str(), obj_json.size(), config)); config.deprecated_enable_flatten_nested = false; - std::string jsonb_json = "{\"a\": [{\"" + key11 + "\": 1}]}"; + std::string jsonb_json = R"({"a": [{")" + key11 + R"(": 1}]})"; EXPECT_ANY_THROW(parser.parse(jsonb_json.c_str(), jsonb_json.size(), config)); } @@ -467,12 +660,12 @@ TEST(JsonParserTest, KeyLengthLimitByConfig) { ScopedMaxJsonKeyLength guard(255); std::string key255(255, 'b'); - std::string obj_json = "{\"" + key255 + "\": 1}"; + std::string obj_json = R"({")" + key255 + R"(": 1})"; auto result = parser.parse(obj_json.c_str(), obj_json.size(), config); ASSERT_TRUE(result.has_value()); config.deprecated_enable_flatten_nested = false; - std::string jsonb_json = "{\"a\": [{\"" + key255 + "\": 1}]}"; + std::string jsonb_json = R"({"a": [{")" + key255 + R"(": 1}]})"; result = parser.parse(jsonb_json.c_str(), jsonb_json.size(), config); ASSERT_TRUE(result.has_value()); ASSERT_EQ(result->values.size(), 1); diff --git a/regression-test/data/variant_p0/predefine/test_variant_high_precision_decimal_stream.json b/regression-test/data/variant_p0/predefine/test_variant_high_precision_decimal_stream.json new file mode 100644 index 00000000000000..2633dd34a384be --- /dev/null +++ b/regression-test/data/variant_p0/predefine/test_variant_high_precision_decimal_stream.json @@ -0,0 +1 @@ +{"id":2,"v":"{\"number_1\":999999999999999999999999999.999999999,\"number_2\":0.0000000001,\"number_3\":-999999999999999999999999999.999999999,\"number_4\":0.57,\"number_big_integer\":99999999999999999999999999999999999999999999999999,\"number_scientific_1\":999999999999999999999999999.999999999e0,\"number_scientific_2\":1e-10,\"numberArray_1\":[999999999999999999999999999.999999999,-999999999999999999999999999.999999999],\"glob_decimal_1\":999999999999999999999999999.999999999,\"string_first_decimal\":999999999999999999999999999.999999999,\"untyped_decimal\":999999999999999999999999999.999999999,\"untyped_big_integer\":340282366920938463463374607431768211456}"} diff --git a/regression-test/suites/variant_p0/predefine/test_variant_high_precision_decimal.groovy b/regression-test/suites/variant_p0/predefine/test_variant_high_precision_decimal.groovy new file mode 100644 index 00000000000000..63c05e18069c04 --- /dev/null +++ b/regression-test/suites/variant_p0/predefine/test_variant_high_precision_decimal.groovy @@ -0,0 +1,168 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_variant_high_precision_decimal", "p0") { + sql """ set default_variant_enable_doc_mode = false """ + sql """ set default_variant_enable_typed_paths_to_sparse = false """ + sql """ set enable_decimal256 = true """ + + def decimal256Integer = "99999999999999999999999999999999999999999999999999" + + sql "DROP TABLE IF EXISTS test_variant_high_precision_decimal" + sql "DROP TABLE IF EXISTS test_variant_high_precision_decimal_json_stage" + sql """ + CREATE TABLE test_variant_high_precision_decimal ( + `id` bigint NOT NULL, + `v` variant< + 'number_1':decimalv3(38,10), + 'number_2':decimalv3(38,10), + 'number_3':decimalv3(38,10), + 'number_4':decimalv3(38,18), + 'number_big_integer':decimalv3(76,0), + 'number_scientific_1':decimalv3(38,10), + 'number_scientific_2':decimalv3(38,10), + 'numberArray_1':array, + 'glob_decimal_*':decimalv3(38,10), + 'string_first_*':string, + 'string_first_decimal':decimalv3(38,10), + properties ( + "variant_enable_typed_paths_to_sparse" = "true", + "variant_max_subcolumns_count" = "1" + ) + > NULL + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "disable_auto_compaction" = "true" + ) + """ + sql """ + CREATE TABLE test_variant_high_precision_decimal_json_stage ( + `id` bigint NOT NULL, + `v` string NULL + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ) + """ + + sql """ + INSERT INTO test_variant_high_precision_decimal VALUES + ( + 1, + '{ + "number_1": 999999999999999999999999999.999999999, + "number_2": 0.0000000001, + "number_3": -999999999999999999999999999.999999999, + "number_4": 0.57, + "number_big_integer": ${decimal256Integer}, + "number_scientific_1": 999999999999999999999999999.999999999e0, + "number_scientific_2": 1e-10, + "numberArray_1": [ + 999999999999999999999999999.999999999, + -999999999999999999999999999.999999999 + ], + "glob_decimal_1": 999999999999999999999999999.999999999, + "string_first_decimal": 999999999999999999999999999.999999999, + "untyped_decimal": 999999999999999999999999999.999999999, + "untyped_big_integer": 340282366920938463463374607431768211456 + }' + ) + """ + sql "sync" + + streamLoad { + table "test_variant_high_precision_decimal_json_stage" + set 'read_json_by_line', 'true' + set 'format', 'json' + set 'max_filter_ratio', '0' + file "test_variant_high_precision_decimal_stream.json" + time 10000 + + check { result, exception, startTime, endTime -> + if (exception != null) { + throw exception + } + logger.info("Stream load test_variant_high_precision_decimal_stream.json result: ${result}".toString()) + def json = parseJson(result) + assertEquals("success", json.Status.toLowerCase()) + assertEquals(1, json.NumberLoadedRows as int) + } + } + sql """ + INSERT INTO test_variant_high_precision_decimal + SELECT id, v FROM test_variant_high_precision_decimal_json_stage + """ + sql "sync" + + streamLoad { + table "test_variant_high_precision_decimal" + set 'read_json_by_line', 'true' + set 'format', 'json' + set 'max_filter_ratio', '0' + file "test_variant_high_precision_decimal_stream.json" + time 10000 + + check { result, exception, startTime, endTime -> + if (exception != null) { + throw exception + } + logger.info("Stream load test_variant_high_precision_decimal_stream.json to variant result: ${result}".toString()) + def json = parseJson(result) + assertEquals("success", json.Status.toLowerCase()) + assertEquals(1, json.NumberLoadedRows as int) + } + } + + sql "sync" + + def assertAllRowsMatch = { String predicate -> + def result = sql "SELECT count(*) FROM test_variant_high_precision_decimal WHERE ${predicate}" + assertEquals(3, result[0][0] as int) + } + + assertAllRowsMatch("cast(v['number_1'] as decimalv3(38,10)) = cast('999999999999999999999999999.9999999990' as decimalv3(38,10))") + assertAllRowsMatch("cast(v['number_2'] as decimalv3(38,10)) = cast('0.0000000001' as decimalv3(38,10))") + assertAllRowsMatch("cast(v['number_3'] as decimalv3(38,10)) = cast('-999999999999999999999999999.9999999990' as decimalv3(38,10))") + assertAllRowsMatch("cast(v['number_4'] as decimalv3(38,18)) = cast('0.570000000000000000' as decimalv3(38,18))") + assertAllRowsMatch("cast(v['number_big_integer'] as decimalv3(76,0)) = cast('${decimal256Integer}' as decimalv3(76,0))") + assertAllRowsMatch("cast(v['number_scientific_1'] as decimalv3(38,10)) = cast('999999999999999999999999999.9999999990' as decimalv3(38,10))") + assertAllRowsMatch("cast(v['number_scientific_2'] as decimalv3(38,10)) = cast('0.0000000001' as decimalv3(38,10))") + assertAllRowsMatch("cast(v['glob_decimal_1'] as decimalv3(38,10)) = cast('999999999999999999999999999.9999999990' as decimalv3(38,10))") + assertAllRowsMatch("array_contains(cast(v['numberArray_1'] as array), cast('999999999999999999999999999.9999999990' as decimalv3(38,10)))") + assertAllRowsMatch("array_contains(cast(v['numberArray_1'] as array), cast('-999999999999999999999999999.9999999990' as decimalv3(38,10)))") + + def stringFirstTypeCount = sql """ SELECT count(*) + FROM test_variant_high_precision_decimal + WHERE cast(variant_type(v['string_first_decimal']) as string) = '{"":"string"}' + AND cast(v['string_first_decimal'] as string) != '999999999999999999999999999.999999999' """ + assertEquals(3, stringFirstTypeCount[0][0] as int) + + def untypedTypeCount = sql """ SELECT count(*) + FROM test_variant_high_precision_decimal + WHERE cast(variant_type(v['untyped_decimal']) as string) = '{"":"double"}' """ + assertEquals(3, untypedTypeCount[0][0] as int) + + def untypedBigIntegerTypeCount = sql """ SELECT count(*) + FROM test_variant_high_precision_decimal + WHERE cast(variant_type(v['untyped_big_integer']) as string) = '{"":"double"}' """ + assertEquals(3, untypedBigIntegerTypeCount[0][0] as int) +}