|
23 | 23 |
|
24 | 24 | #include "function_test_util.h" |
25 | 25 | #include "util/encryption_util.h" |
| 26 | +#include "vec/columns/column_array.h" |
| 27 | +#include "vec/columns/column_nullable.h" |
| 28 | +#include "vec/columns/column_string.h" |
26 | 29 | #include "vec/core/field.h" |
27 | 30 | #include "vec/core/types.h" |
28 | 31 | #include "vec/data_types/data_type_number.h" |
@@ -3869,4 +3872,202 @@ TEST(function_string_test, function_unicode_normalize_invalid_mode) { |
3869 | 3872 | EXPECT_NE(Status::OK(), st); |
3870 | 3873 | } |
3871 | 3874 |
|
| 3875 | +// Helper: run split_by_string with 3 args (str, delimiter, limit) on a single row |
| 3876 | +// Returns the result column (Array<Nullable<String>>) |
| 3877 | +static ColumnPtr run_split_by_string_3arg(const std::string& str, const std::string& delimiter, |
| 3878 | + Int32 limit_val) { |
| 3879 | + Block block; |
| 3880 | + auto str_type = std::make_shared<DataTypeString>(); |
| 3881 | + auto int_type = std::make_shared<DataTypeInt32>(); |
| 3882 | + auto ret_type = std::make_shared<DataTypeArray>(make_nullable(str_type)); |
| 3883 | + |
| 3884 | + // Build input columns with one row each |
| 3885 | + auto str_col = ColumnString::create(); |
| 3886 | + str_col->insert_data(str.data(), str.size()); |
| 3887 | + auto delim_col = ColumnString::create(); |
| 3888 | + delim_col->insert_data(delimiter.data(), delimiter.size()); |
| 3889 | + auto limit_col = ColumnInt32::create(); |
| 3890 | + limit_col->insert_value(limit_val); |
| 3891 | + |
| 3892 | + block.insert({std::move(str_col), str_type, "str"}); |
| 3893 | + block.insert({std::move(delim_col), str_type, "delim"}); |
| 3894 | + block.insert({std::move(limit_col), int_type, "limit"}); |
| 3895 | + block.insert({nullptr, ret_type, "result"}); |
| 3896 | + |
| 3897 | + auto func = SimpleFunctionFactory::instance().get_function( |
| 3898 | + "split_by_string", block.get_columns_with_type_and_name(), ret_type); |
| 3899 | + EXPECT_TRUE(func != nullptr); |
| 3900 | + auto st = func->execute(nullptr, block, {0, 1, 2}, 3, 1); |
| 3901 | + EXPECT_EQ(Status::OK(), st); |
| 3902 | + |
| 3903 | + return block.get_by_position(3).column; |
| 3904 | +} |
| 3905 | + |
| 3906 | +// Helper: run split_by_string with 2 args (str, delimiter) on a single row |
| 3907 | +static ColumnPtr run_split_by_string_2arg(const std::string& str, const std::string& delimiter) { |
| 3908 | + Block block; |
| 3909 | + auto str_type = std::make_shared<DataTypeString>(); |
| 3910 | + auto ret_type = std::make_shared<DataTypeArray>(make_nullable(str_type)); |
| 3911 | + |
| 3912 | + auto str_col = ColumnString::create(); |
| 3913 | + str_col->insert_data(str.data(), str.size()); |
| 3914 | + auto delim_col = ColumnString::create(); |
| 3915 | + delim_col->insert_data(delimiter.data(), delimiter.size()); |
| 3916 | + |
| 3917 | + block.insert({std::move(str_col), str_type, "str"}); |
| 3918 | + block.insert({std::move(delim_col), str_type, "delim"}); |
| 3919 | + block.insert({nullptr, ret_type, "result"}); |
| 3920 | + |
| 3921 | + auto func = SimpleFunctionFactory::instance().get_function( |
| 3922 | + "split_by_string", block.get_columns_with_type_and_name(), ret_type); |
| 3923 | + EXPECT_TRUE(func != nullptr); |
| 3924 | + auto st = func->execute(nullptr, block, {0, 1}, 2, 1); |
| 3925 | + EXPECT_EQ(Status::OK(), st); |
| 3926 | + |
| 3927 | + return block.get_by_position(2).column; |
| 3928 | +} |
| 3929 | + |
| 3930 | +// Helper: extract array elements as vector of strings from row 0 of an array column |
| 3931 | +static std::vector<std::string> get_array_strings(const ColumnPtr& col) { |
| 3932 | + const auto* array_col = assert_cast<const ColumnArray*>(col.get()); |
| 3933 | + auto offsets = array_col->get_offsets(); |
| 3934 | + size_t start = 0; |
| 3935 | + size_t end = offsets[0]; |
| 3936 | + |
| 3937 | + std::vector<std::string> result; |
| 3938 | + const auto& nested = array_col->get_data(); |
| 3939 | + // nested is ColumnNullable<ColumnString> |
| 3940 | + const auto* nullable_col = assert_cast<const ColumnNullable*>(&nested); |
| 3941 | + const auto* str_col = assert_cast<const ColumnString*>(&nullable_col->get_nested_column()); |
| 3942 | + |
| 3943 | + for (size_t i = start; i < end; i++) { |
| 3944 | + auto ref = str_col->get_data_at(i); |
| 3945 | + result.emplace_back(ref.data, ref.size); |
| 3946 | + } |
| 3947 | + return result; |
| 3948 | +} |
| 3949 | + |
| 3950 | +TEST(function_string_test, function_split_by_string_with_limit_test) { |
| 3951 | + // Basic limit functionality |
| 3952 | + { |
| 3953 | + auto col = run_split_by_string_3arg("one,two,three,", ",", 2); |
| 3954 | + auto arr = get_array_strings(col); |
| 3955 | + ASSERT_EQ(arr.size(), 2); |
| 3956 | + EXPECT_EQ(arr[0], "one"); |
| 3957 | + EXPECT_EQ(arr[1], "two,three,"); |
| 3958 | + } |
| 3959 | + { |
| 3960 | + auto col = run_split_by_string_3arg("one,two,three,", ",", 3); |
| 3961 | + auto arr = get_array_strings(col); |
| 3962 | + ASSERT_EQ(arr.size(), 3); |
| 3963 | + EXPECT_EQ(arr[0], "one"); |
| 3964 | + EXPECT_EQ(arr[1], "two"); |
| 3965 | + EXPECT_EQ(arr[2], "three,"); |
| 3966 | + } |
| 3967 | + // limit = 1: no split |
| 3968 | + { |
| 3969 | + auto col = run_split_by_string_3arg("one,two,three", ",", 1); |
| 3970 | + auto arr = get_array_strings(col); |
| 3971 | + ASSERT_EQ(arr.size(), 1); |
| 3972 | + EXPECT_EQ(arr[0], "one,two,three"); |
| 3973 | + } |
| 3974 | + // limit >= parts: return all |
| 3975 | + { |
| 3976 | + auto col = run_split_by_string_3arg("a,b,c", ",", 10); |
| 3977 | + auto arr = get_array_strings(col); |
| 3978 | + ASSERT_EQ(arr.size(), 3); |
| 3979 | + EXPECT_EQ(arr[0], "a"); |
| 3980 | + EXPECT_EQ(arr[1], "b"); |
| 3981 | + EXPECT_EQ(arr[2], "c"); |
| 3982 | + } |
| 3983 | + // Multi-char delimiter + limit |
| 3984 | + { |
| 3985 | + auto col = run_split_by_string_3arg("a::b::c::d", "::", 2); |
| 3986 | + auto arr = get_array_strings(col); |
| 3987 | + ASSERT_EQ(arr.size(), 2); |
| 3988 | + EXPECT_EQ(arr[0], "a"); |
| 3989 | + EXPECT_EQ(arr[1], "b::c::d"); |
| 3990 | + } |
| 3991 | +} |
| 3992 | + |
| 3993 | +TEST(function_string_test, function_split_by_string_limit_empty_delim_test) { |
| 3994 | + // Empty delimiter + limit: splits by character (ASCII) |
| 3995 | + { |
| 3996 | + auto col = run_split_by_string_3arg("abcde", "", 3); |
| 3997 | + auto arr = get_array_strings(col); |
| 3998 | + ASSERT_EQ(arr.size(), 3); |
| 3999 | + EXPECT_EQ(arr[0], "a"); |
| 4000 | + EXPECT_EQ(arr[1], "b"); |
| 4001 | + EXPECT_EQ(arr[2], "cde"); |
| 4002 | + } |
| 4003 | + { |
| 4004 | + auto col = run_split_by_string_3arg("abcde", "", 1); |
| 4005 | + auto arr = get_array_strings(col); |
| 4006 | + ASSERT_EQ(arr.size(), 1); |
| 4007 | + EXPECT_EQ(arr[0], "abcde"); |
| 4008 | + } |
| 4009 | + { |
| 4010 | + auto col = run_split_by_string_3arg("abcde", "", 10); |
| 4011 | + auto arr = get_array_strings(col); |
| 4012 | + ASSERT_EQ(arr.size(), 5); |
| 4013 | + EXPECT_EQ(arr[0], "a"); |
| 4014 | + EXPECT_EQ(arr[4], "e"); |
| 4015 | + } |
| 4016 | + // Empty delimiter + limit: UTF-8 |
| 4017 | + { |
| 4018 | + // "你好世" = 3 UTF-8 characters |
| 4019 | + std::string utf8_str = "\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96"; |
| 4020 | + auto col = run_split_by_string_3arg(utf8_str, "", 2); |
| 4021 | + auto arr = get_array_strings(col); |
| 4022 | + ASSERT_EQ(arr.size(), 2); |
| 4023 | + EXPECT_EQ(arr[0], "\xe4\xbd\xa0"); // 你 |
| 4024 | + EXPECT_EQ(arr[1], "\xe5\xa5\xbd\xe4\xb8\x96"); // 好世 |
| 4025 | + } |
| 4026 | +} |
| 4027 | + |
| 4028 | +TEST(function_string_test, function_split_by_string_limit_edge_cases_test) { |
| 4029 | + // limit <= 0: behaves like no limit |
| 4030 | + { |
| 4031 | + auto col = run_split_by_string_3arg("a,b,c", ",", -1); |
| 4032 | + auto arr = get_array_strings(col); |
| 4033 | + ASSERT_EQ(arr.size(), 3); |
| 4034 | + EXPECT_EQ(arr[0], "a"); |
| 4035 | + EXPECT_EQ(arr[1], "b"); |
| 4036 | + EXPECT_EQ(arr[2], "c"); |
| 4037 | + } |
| 4038 | + { |
| 4039 | + auto col = run_split_by_string_3arg("a,b,c", ",", 0); |
| 4040 | + auto arr = get_array_strings(col); |
| 4041 | + ASSERT_EQ(arr.size(), 3); |
| 4042 | + } |
| 4043 | + // Empty source string |
| 4044 | + { |
| 4045 | + auto col = run_split_by_string_3arg("", ",", 2); |
| 4046 | + auto arr = get_array_strings(col); |
| 4047 | + ASSERT_EQ(arr.size(), 0); |
| 4048 | + } |
| 4049 | + // Consecutive delimiters + limit |
| 4050 | + { |
| 4051 | + auto col = run_split_by_string_3arg(",,,", ",", 2); |
| 4052 | + auto arr = get_array_strings(col); |
| 4053 | + ASSERT_EQ(arr.size(), 2); |
| 4054 | + EXPECT_EQ(arr[0], ""); |
| 4055 | + EXPECT_EQ(arr[1], ",,"); |
| 4056 | + } |
| 4057 | + // 2-arg version still works after refactoring |
| 4058 | + { |
| 4059 | + auto col = run_split_by_string_2arg("a,b,c", ","); |
| 4060 | + auto arr = get_array_strings(col); |
| 4061 | + ASSERT_EQ(arr.size(), 3); |
| 4062 | + EXPECT_EQ(arr[0], "a"); |
| 4063 | + EXPECT_EQ(arr[1], "b"); |
| 4064 | + EXPECT_EQ(arr[2], "c"); |
| 4065 | + } |
| 4066 | + { |
| 4067 | + auto col = run_split_by_string_2arg("abcde", ""); |
| 4068 | + auto arr = get_array_strings(col); |
| 4069 | + ASSERT_EQ(arr.size(), 5); |
| 4070 | + } |
| 4071 | +} |
| 4072 | + |
3872 | 4073 | } // namespace doris::vectorized |
0 commit comments