From fdf8418cdb3a11ae2d9ffe5f8ddc0f347326534e Mon Sep 17 00:00:00 2001 From: Hu Shenggang Date: Sat, 16 May 2026 10:39:21 +0800 Subject: [PATCH 1/2] [fix](json) Fix strict json validation ### What problem does this PR solve? Issue Number: DORIS-25567 Related PR: None Problem Summary: Fix json_valid accepting incomplete JSON input or JSON with trailing non-whitespace content. DORIS-25568 is documented as intentional behavior because typed jsonb_extract_* keeps the historical jsonb_extract plus cast rewrite and follows normal cast rules. ### Release note Fix json_valid to reject incomplete or trailing JSON input. ### Check List (For Author) - Test: Unit Test / FE build - ./run-be-ut.sh --run --filter=JsonBinaryValueTest.TestValidation:FunctionJsonbTEST.JsonValidStrictTest - ./build.sh --fe - Added regression-test/suites/jsonb_p0/test_json_valid_strict.groovy, but local regression run did not complete because the worktree FE metadata/port state prevented a stable cluster startup. - Behavior changed: Yes. json_valid rejects incomplete/trailing JSON input. typed jsonb_extract_* behavior is unchanged and documented as intentional. - Does this need documentation: No --- be/src/util/jsonb_parser_simd.h | 32 +++++++++++++++++ be/test/core/value/jsonb_value_test.cpp | 5 +-- .../exprs/function/function_jsonb_test.cpp | 18 ++++++++++ .../functions/scalar/JsonbExtractBigint.java | 2 ++ .../functions/scalar/JsonbExtractBool.java | 2 ++ .../functions/scalar/JsonbExtractDouble.java | 2 ++ .../functions/scalar/JsonbExtractInt.java | 2 ++ .../scalar/JsonbExtractLargeint.java | 2 ++ .../functions/scalar/JsonbExtractString.java | 2 ++ .../jsonb_p0/test_json_valid_strict.groovy | 34 +++++++++++++++++++ 10 files changed, 99 insertions(+), 2 deletions(-) create mode 100644 regression-test/suites/jsonb_p0/test_json_valid_strict.groovy diff --git a/be/src/util/jsonb_parser_simd.h b/be/src/util/jsonb_parser_simd.h index 43f12f024ed4c1..7632eb45fba1a7 100644 --- a/be/src/util/jsonb_parser_simd.h +++ b/be/src/util/jsonb_parser_simd.h @@ -93,6 +93,29 @@ struct JsonbParser { simdjson::padded_string json_str {pch, len}; simdjson::ondemand::document doc = simdjson_parser.iterate(json_str); + auto is_json_whitespace = [](char c) { + return c == ' ' || c == '\t' || c == '\n' || c == '\r'; + }; + const char* json_begin = json_str.data(); + const char* json_end = json_str.data() + len; + while (json_begin < json_end && is_json_whitespace(*json_begin)) { + ++json_begin; + } + while (json_end > json_begin && is_json_whitespace(*(json_end - 1))) { + --json_end; + } + + std::string_view raw_json; + simdjson::error_code raw_res = doc.raw_json().get(raw_json); + if (raw_res != simdjson::SUCCESS) { + return Status::InvalidArgument(fmt::format("simdjson raw_json failed: {}", + simdjson::error_message(raw_res))); + } + if (raw_json.data() != json_begin || raw_json.data() + raw_json.size() != json_end) { + return Status::InvalidArgument("simdjson parse exception: trailing content"); + } + doc.rewind(); + // simdjson process top level primitive types specially // so some repeated code here switch (doc.type()) { @@ -102,6 +125,12 @@ struct JsonbParser { break; } case simdjson::ondemand::json_type::null: { + bool is_null = false; + simdjson::error_code res = doc.is_null().get(is_null); + if (res != simdjson::SUCCESS || !is_null) { + return Status::InvalidArgument(fmt::format("simdjson get null failed: {}", + simdjson::error_message(res))); + } if (writer.writeNull() == 0) { return Status::InvalidArgument("writeNull failed"); } @@ -130,6 +159,9 @@ struct JsonbParser { break; } } + if (!doc.at_end()) { + return Status::InvalidArgument("simdjson parse exception: trailing content"); + } } catch (simdjson::simdjson_error& e) { return Status::InvalidArgument(fmt::format("simdjson parse exception: {}", e.what())); } diff --git a/be/test/core/value/jsonb_value_test.cpp b/be/test/core/value/jsonb_value_test.cpp index 811717950802ff..2592153822628b 100644 --- a/be/test/core/value/jsonb_value_test.cpp +++ b/be/test/core/value/jsonb_value_test.cpp @@ -29,7 +29,8 @@ TEST(JsonBinaryValueTest, TestValidation) { JsonBinaryValue json_val; // Empty string and non-JSON text should fail to parse - std::vector invalid_strs = {"", "abc"}; + std::vector invalid_strs = {"", "abc", "not", + "not json", "null junk", R"({"a":1} junk)"}; for (size_t i = 0; i < invalid_strs.size(); i++) { auto status = json_val.from_json_string(invalid_strs[i].c_str(), invalid_strs[i].size()); EXPECT_FALSE(status.ok()) << "Should fail for: [" << invalid_strs[i] << "]"; @@ -55,4 +56,4 @@ TEST(JsonBinaryValueTest, TestValidation) { EXPECT_TRUE(status.ok()) << "Should succeed for: [" << valid_strs[i] << "]"; } } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/test/exprs/function/function_jsonb_test.cpp b/be/test/exprs/function/function_jsonb_test.cpp index c321fffb4f440c..7db6b9febbd338 100644 --- a/be/test/exprs/function/function_jsonb_test.cpp +++ b/be/test/exprs/function/function_jsonb_test.cpp @@ -186,6 +186,24 @@ TEST(FunctionJsonbTEST, JsonbParseErrorToValueTest) { ASSERT_EQ(st.code(), ErrorCode::INVALID_ARGUMENT) << st.to_string(); } +TEST(FunctionJsonbTEST, JsonValidStrictTest) { + std::string func_name = "json_valid"; + InputTypeSet input_types = {Nullable {PrimitiveType::TYPE_VARCHAR}}; + + DataSet data_set = { + {{Null()}, Null()}, + {{STRING("not")}, INT(0)}, + {{STRING("not json")}, INT(0)}, + {{STRING("null junk")}, INT(0)}, + {{STRING(R"({"a":1} junk)")}, INT(0)}, + {{STRING("null")}, INT(1)}, + {{STRING("true")}, INT(1)}, + {{STRING(R"({"a":1})")}, INT(1)}, + }; + + static_cast(check_function(func_name, input_types, data_set)); +} + TEST(FunctionJsonbTEST, JsonbExtractTest) { std::string func_name = "jsonb_extract"; InputTypeSet input_types = {PrimitiveType::TYPE_JSONB, PrimitiveType::TYPE_VARCHAR}; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractBigint.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractBigint.java index 64bbcbaee91254..7027dbc29f901d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractBigint.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractBigint.java @@ -80,6 +80,8 @@ public R accept(ExpressionVisitor visitor, C context) { @Override public Expression rewriteWhenAnalyze() { JsonbExtract jsonExtract = new JsonbExtract(children.get(0), children.get(1)); + // Keep the historical JSON typed-extract behavior: extract JSONB first and then use the + // normal cast rules, so convertible mismatched JSON values are allowed by design. return new Cast(jsonExtract, BigIntType.INSTANCE, false); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractBool.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractBool.java index 6091c4f717061d..90488a68418421 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractBool.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractBool.java @@ -80,6 +80,8 @@ public R accept(ExpressionVisitor visitor, C context) { @Override public Expression rewriteWhenAnalyze() { JsonbExtract jsonExtract = new JsonbExtract(children.get(0), children.get(1)); + // Keep the historical JSON typed-extract behavior: extract JSONB first and then use the + // normal cast rules, so convertible mismatched JSON values are allowed by design. return new Cast(jsonExtract, BooleanType.INSTANCE, false); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractDouble.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractDouble.java index 35dd98ad17e6a6..832ae65b4fa542 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractDouble.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractDouble.java @@ -80,6 +80,8 @@ public R accept(ExpressionVisitor visitor, C context) { @Override public Expression rewriteWhenAnalyze() { JsonbExtract jsonExtract = new JsonbExtract(children.get(0), children.get(1)); + // Keep the historical JSON typed-extract behavior: extract JSONB first and then use the + // normal cast rules, so convertible mismatched JSON values are allowed by design. return new Cast(jsonExtract, DoubleType.INSTANCE, false); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractInt.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractInt.java index fdc0c20f544cc7..d30ec49dfd30bd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractInt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractInt.java @@ -80,6 +80,8 @@ public R accept(ExpressionVisitor visitor, C context) { @Override public Expression rewriteWhenAnalyze() { JsonbExtract jsonExtract = new JsonbExtract(children.get(0), children.get(1)); + // Keep the historical JSON typed-extract behavior: extract JSONB first and then use the + // normal cast rules, so convertible mismatched JSON values are allowed by design. return new Cast(jsonExtract, IntegerType.INSTANCE, false); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractLargeint.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractLargeint.java index 4c16a1e0828c1a..aadc357599f52a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractLargeint.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractLargeint.java @@ -80,6 +80,8 @@ public R accept(ExpressionVisitor visitor, C context) { @Override public Expression rewriteWhenAnalyze() { JsonbExtract jsonExtract = new JsonbExtract(children.get(0), children.get(1)); + // Keep the historical JSON typed-extract behavior: extract JSONB first and then use the + // normal cast rules, so convertible mismatched JSON values are allowed by design. return new Cast(jsonExtract, LargeIntType.INSTANCE, false); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractString.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractString.java index 641045ae1f96fb..16f0b0861857e5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractString.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/JsonbExtractString.java @@ -79,6 +79,8 @@ public R accept(ExpressionVisitor visitor, C context) { @Override public Expression rewriteWhenAnalyze() { JsonbExtract jsonExtract = new JsonbExtract(children.get(0), children.get(1)); + // Keep the historical JSON typed-extract behavior: extract JSONB first and then use the + // normal cast rules, so convertible mismatched JSON values are allowed by design. return new Cast(jsonExtract, StringType.INSTANCE, false); } } diff --git a/regression-test/suites/jsonb_p0/test_json_valid_strict.groovy b/regression-test/suites/jsonb_p0/test_json_valid_strict.groovy new file mode 100644 index 00000000000000..4cc85446ee4cac --- /dev/null +++ b/regression-test/suites/jsonb_p0/test_json_valid_strict.groovy @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_json_valid_strict") { + def jsonValid = sql """ + select + json_valid('not json'), + json_valid('not'), + json_valid('null junk'), + json_valid('{"a":1} junk'), + json_valid('{"a":1}') + from (select 1) t + order by 1 + """ + assertEquals(0, jsonValid[0][0]) + assertEquals(0, jsonValid[0][1]) + assertEquals(0, jsonValid[0][2]) + assertEquals(0, jsonValid[0][3]) + assertEquals(1, jsonValid[0][4]) +} From ab4c47c9aa8ca22563434743b7dbd9d2a194fd88 Mon Sep 17 00:00:00 2001 From: Hu Shenggang Date: Sat, 16 May 2026 22:41:45 +0800 Subject: [PATCH 2/2] [fix](json) Address strict JSON review comments ### What problem does this PR solve? Issue Number: None Related PR: #63309 Problem Summary: Avoid the extra raw_json pre-pass in strict JSONB parsing, and keep const JSON plus const path extract results sized to the input block. ### Release note None ### Check List (For Author) - Test: Unit Test - ./run-be-ut.sh --run --filter=JsonBinaryValueTest.TestValidation:FunctionJsonbTEST.JsonbParseTest:FunctionJsonbTEST.JsonbParseErrorToNullTest:FunctionJsonbTEST.JsonValidStrictTest:FunctionJsonbTEST.JsonExtractConstConstMultiRow - build-support/check-format.sh - build-support/run-clang-tidy.sh --build-dir be/ut_build_ASAN (failed due existing complexity/NOLINT diagnostics and local toolchain stddef.h lookup) - Behavior changed: Yes. Const JSON plus const path non-string JSONB extract returns a correctly sized repeated result. - Does this need documentation: No --- be/src/exprs/function/function_jsonb.cpp | 14 +++++ be/src/util/jsonb_parser_simd.h | 23 -------- .../exprs/function/function_jsonb_test.cpp | 55 ++++++++++++++++++- 3 files changed, 68 insertions(+), 24 deletions(-) diff --git a/be/src/exprs/function/function_jsonb.cpp b/be/src/exprs/function/function_jsonb.cpp index 40c10cb4e6183d..ce1cbfc3c37cb4 100644 --- a/be/src/exprs/function/function_jsonb.cpp +++ b/be/src/exprs/function/function_jsonb.cpp @@ -450,6 +450,20 @@ class FunctionJsonbExtract : public IFunction { return create_all_null_result(); } + if (path_const[0]) { + auto const_null_map = ColumnUInt8::create(1, 0); + auto const_res = Impl::ColumnType::create(); + RETURN_IF_ERROR(Impl::scalar_vector( + context, jsonb_data_column->get_data_at(0), rdata, roffsets, + path_null_maps[0], const_res->get_data(), const_null_map->get_data())); + DCHECK_EQ(const_res->size(), 1); + auto nullable_column = + ColumnNullable::create(std::move(const_res), std::move(const_null_map)); + block.get_by_position(result).column = + ColumnConst::create(std::move(nullable_column), input_rows_count); + return Status::OK(); + } + RETURN_IF_ERROR(Impl::scalar_vector(context, jsonb_data_column->get_data_at(0), rdata, roffsets, path_null_maps[0], res->get_data(), null_map->get_data())); diff --git a/be/src/util/jsonb_parser_simd.h b/be/src/util/jsonb_parser_simd.h index 7632eb45fba1a7..202ecb8d8b0f18 100644 --- a/be/src/util/jsonb_parser_simd.h +++ b/be/src/util/jsonb_parser_simd.h @@ -93,29 +93,6 @@ struct JsonbParser { simdjson::padded_string json_str {pch, len}; simdjson::ondemand::document doc = simdjson_parser.iterate(json_str); - auto is_json_whitespace = [](char c) { - return c == ' ' || c == '\t' || c == '\n' || c == '\r'; - }; - const char* json_begin = json_str.data(); - const char* json_end = json_str.data() + len; - while (json_begin < json_end && is_json_whitespace(*json_begin)) { - ++json_begin; - } - while (json_end > json_begin && is_json_whitespace(*(json_end - 1))) { - --json_end; - } - - std::string_view raw_json; - simdjson::error_code raw_res = doc.raw_json().get(raw_json); - if (raw_res != simdjson::SUCCESS) { - return Status::InvalidArgument(fmt::format("simdjson raw_json failed: {}", - simdjson::error_message(raw_res))); - } - if (raw_json.data() != json_begin || raw_json.data() + raw_json.size() != json_end) { - return Status::InvalidArgument("simdjson parse exception: trailing content"); - } - doc.rewind(); - // simdjson process top level primitive types specially // so some repeated code here switch (doc.type()) { diff --git a/be/test/exprs/function/function_jsonb_test.cpp b/be/test/exprs/function/function_jsonb_test.cpp index 7db6b9febbd338..9d7b96f02e407c 100644 --- a/be/test/exprs/function/function_jsonb_test.cpp +++ b/be/test/exprs/function/function_jsonb_test.cpp @@ -16,8 +16,8 @@ // under the License. #include -#include +#include #include #include @@ -384,6 +384,59 @@ TEST(FunctionJsonbTEST, JsonExtractCheckArg) { ASSERT_EQ(st.code(), ErrorCode::INVALID_ARGUMENT); } +TEST(FunctionJsonbTEST, JsonExtractConstConstMultiRow) { + constexpr size_t input_rows_count = 3; + auto json_data_type = std::make_shared(); + auto path_data_type = std::make_shared(); + auto return_type = make_nullable(std::make_shared()); + + JsonbWriter writer; + ASSERT_TRUE(writer.writeStartObject()); + ASSERT_TRUE(writer.writeKey("a")); + ASSERT_TRUE(writer.writeNull()); + ASSERT_TRUE(writer.writeEndObject()); + + auto json_column = json_data_type->create_column(); + json_column->insert_data(writer.getOutput()->getBuffer(), writer.getOutput()->getSize()); + + auto path_column = path_data_type->create_column(); + path_column->insert_data("$.a", 3); + + Block block; + block.insert({ColumnConst::create(std::move(json_column), input_rows_count), json_data_type, + "json_col"}); + block.insert({ColumnConst::create(std::move(path_column), input_rows_count), path_data_type, + "path_col"}); + + FunctionBasePtr func = SimpleFunctionFactory::instance().get_function( + "json_extract_isnull", block.get_columns_with_type_and_name(), return_type); + ASSERT_TRUE(func != nullptr); + + FunctionUtils fn_utils(return_type, {json_data_type, path_data_type}, 0); + auto* fn_ctx = fn_utils.get_fn_ctx(); + auto st = func->open(fn_ctx, FunctionContext::FRAGMENT_LOCAL); + ASSERT_TRUE(st.ok()) << "open failed: " << st.to_string(); + st = func->open(fn_ctx, FunctionContext::THREAD_LOCAL); + ASSERT_TRUE(st.ok()) << "open failed: " << st.to_string(); + + block.insert({nullptr, return_type, "result"}); + auto result = block.columns() - 1; + st = func->execute(fn_ctx, block, {0, 1}, result, input_rows_count); + ASSERT_TRUE(st.ok()) << "execute failed: " << st.to_string(); + + auto result_column = block.get_by_position(result).column->convert_to_full_column_if_const(); + ASSERT_EQ(result_column->size(), input_rows_count); + const auto& result_nullable = assert_cast(*result_column); + const auto& result_data = assert_cast(result_nullable.get_nested_column()); + for (size_t i = 0; i < input_rows_count; ++i) { + EXPECT_FALSE(result_nullable.is_null_at(i)); + EXPECT_EQ(result_data.get_data()[i], 1); + } + + static_cast(func->close(fn_ctx, FunctionContext::THREAD_LOCAL)); + static_cast(func->close(fn_ctx, FunctionContext::FRAGMENT_LOCAL)); +} + TEST(FunctionJsonbTEST, JsonParseCheckArg) { ColumnsWithTypeAndName args; args.emplace_back(