Skip to content

Commit 5d70879

Browse files
fix: improve test coverage for split_by_string limit parameter
1 parent e89a95b commit 5d70879

3 files changed

Lines changed: 435 additions & 0 deletions

File tree

be/test/vec/function/function_string_test.cpp

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@
2323

2424
#include "function_test_util.h"
2525
#include "util/encryption_util.h"
26+
#include "vec/columns/column_array.h"
27+
#include "vec/columns/column_nullable.h"
28+
#include "vec/columns/column_string.h"
2629
#include "vec/core/field.h"
2730
#include "vec/core/types.h"
2831
#include "vec/data_types/data_type_number.h"
@@ -3869,4 +3872,202 @@ TEST(function_string_test, function_unicode_normalize_invalid_mode) {
38693872
EXPECT_NE(Status::OK(), st);
38703873
}
38713874

3875+
// Helper: run split_by_string with 3 args (str, delimiter, limit) on a single row
3876+
// Returns the result column (Array<Nullable<String>>)
3877+
static ColumnPtr run_split_by_string_3arg(const std::string& str, const std::string& delimiter,
3878+
Int32 limit_val) {
3879+
Block block;
3880+
auto str_type = std::make_shared<DataTypeString>();
3881+
auto int_type = std::make_shared<DataTypeInt32>();
3882+
auto ret_type = std::make_shared<DataTypeArray>(make_nullable(str_type));
3883+
3884+
// Build input columns with one row each
3885+
auto str_col = ColumnString::create();
3886+
str_col->insert_data(str.data(), str.size());
3887+
auto delim_col = ColumnString::create();
3888+
delim_col->insert_data(delimiter.data(), delimiter.size());
3889+
auto limit_col = ColumnInt32::create();
3890+
limit_col->insert_value(limit_val);
3891+
3892+
block.insert({std::move(str_col), str_type, "str"});
3893+
block.insert({std::move(delim_col), str_type, "delim"});
3894+
block.insert({std::move(limit_col), int_type, "limit"});
3895+
block.insert({nullptr, ret_type, "result"});
3896+
3897+
auto func = SimpleFunctionFactory::instance().get_function(
3898+
"split_by_string", block.get_columns_with_type_and_name(), ret_type);
3899+
EXPECT_TRUE(func != nullptr);
3900+
auto st = func->execute(nullptr, block, {0, 1, 2}, 3, 1);
3901+
EXPECT_EQ(Status::OK(), st);
3902+
3903+
return block.get_by_position(3).column;
3904+
}
3905+
3906+
// Helper: run split_by_string with 2 args (str, delimiter) on a single row
3907+
static ColumnPtr run_split_by_string_2arg(const std::string& str, const std::string& delimiter) {
3908+
Block block;
3909+
auto str_type = std::make_shared<DataTypeString>();
3910+
auto ret_type = std::make_shared<DataTypeArray>(make_nullable(str_type));
3911+
3912+
auto str_col = ColumnString::create();
3913+
str_col->insert_data(str.data(), str.size());
3914+
auto delim_col = ColumnString::create();
3915+
delim_col->insert_data(delimiter.data(), delimiter.size());
3916+
3917+
block.insert({std::move(str_col), str_type, "str"});
3918+
block.insert({std::move(delim_col), str_type, "delim"});
3919+
block.insert({nullptr, ret_type, "result"});
3920+
3921+
auto func = SimpleFunctionFactory::instance().get_function(
3922+
"split_by_string", block.get_columns_with_type_and_name(), ret_type);
3923+
EXPECT_TRUE(func != nullptr);
3924+
auto st = func->execute(nullptr, block, {0, 1}, 2, 1);
3925+
EXPECT_EQ(Status::OK(), st);
3926+
3927+
return block.get_by_position(2).column;
3928+
}
3929+
3930+
// Helper: extract array elements as vector of strings from row 0 of an array column
3931+
static std::vector<std::string> get_array_strings(const ColumnPtr& col) {
3932+
const auto* array_col = assert_cast<const ColumnArray*>(col.get());
3933+
auto offsets = array_col->get_offsets();
3934+
size_t start = 0;
3935+
size_t end = offsets[0];
3936+
3937+
std::vector<std::string> result;
3938+
const auto& nested = array_col->get_data();
3939+
// nested is ColumnNullable<ColumnString>
3940+
const auto* nullable_col = assert_cast<const ColumnNullable*>(&nested);
3941+
const auto* str_col = assert_cast<const ColumnString*>(&nullable_col->get_nested_column());
3942+
3943+
for (size_t i = start; i < end; i++) {
3944+
auto ref = str_col->get_data_at(i);
3945+
result.emplace_back(ref.data, ref.size);
3946+
}
3947+
return result;
3948+
}
3949+
3950+
// split_by_string(str, delimiter, limit) with a non-empty delimiter and a positive limit.
TEST(function_string_test, function_split_by_string_with_limit_test) {
    // limit = 2: one split, the remainder (delimiters included) is the last element
    {
        const auto result_col = run_split_by_string_3arg("one,two,three,", ",", 2);
        const auto parts = get_array_strings(result_col);
        ASSERT_EQ(parts.size(), 2);
        EXPECT_EQ(parts[0], "one");
        EXPECT_EQ(parts[1], "two,three,");
    }
    // limit = 3: two splits, the trailing delimiter stays in the last element
    {
        const auto result_col = run_split_by_string_3arg("one,two,three,", ",", 3);
        const auto parts = get_array_strings(result_col);
        ASSERT_EQ(parts.size(), 3);
        EXPECT_EQ(parts[0], "one");
        EXPECT_EQ(parts[1], "two");
        EXPECT_EQ(parts[2], "three,");
    }
    // limit = 1: the input comes back unsplit
    {
        const auto result_col = run_split_by_string_3arg("one,two,three", ",", 1);
        const auto parts = get_array_strings(result_col);
        ASSERT_EQ(parts.size(), 1);
        EXPECT_EQ(parts[0], "one,two,three");
    }
    // limit larger than the number of parts: every part is returned
    {
        const auto result_col = run_split_by_string_3arg("a,b,c", ",", 10);
        const auto parts = get_array_strings(result_col);
        ASSERT_EQ(parts.size(), 3);
        EXPECT_EQ(parts[0], "a");
        EXPECT_EQ(parts[1], "b");
        EXPECT_EQ(parts[2], "c");
    }
    // multi-character delimiter combined with a limit
    {
        const auto result_col = run_split_by_string_3arg("a::b::c::d", "::", 2);
        const auto parts = get_array_strings(result_col);
        ASSERT_EQ(parts.size(), 2);
        EXPECT_EQ(parts[0], "a");
        EXPECT_EQ(parts[1], "b::c::d");
    }
}
3992+
3993+
// split_by_string(str, "", limit): an empty delimiter splits character by character.
TEST(function_string_test, function_split_by_string_limit_empty_delim_test) {
    // ASCII input, limit below the character count: the tail is kept in one element
    {
        const auto result_col = run_split_by_string_3arg("abcde", "", 3);
        const auto parts = get_array_strings(result_col);
        ASSERT_EQ(parts.size(), 3);
        EXPECT_EQ(parts[0], "a");
        EXPECT_EQ(parts[1], "b");
        EXPECT_EQ(parts[2], "cde");
    }
    // ASCII input, limit = 1: the whole string is the only element
    {
        const auto result_col = run_split_by_string_3arg("abcde", "", 1);
        const auto parts = get_array_strings(result_col);
        ASSERT_EQ(parts.size(), 1);
        EXPECT_EQ(parts[0], "abcde");
    }
    // ASCII input, limit above the character count: one element per character
    {
        const auto result_col = run_split_by_string_3arg("abcde", "", 10);
        const auto parts = get_array_strings(result_col);
        ASSERT_EQ(parts.size(), 5);
        EXPECT_EQ(parts[0], "a");
        EXPECT_EQ(parts[4], "e");
    }
    // UTF-8 input: splitting happens on characters, not on bytes
    {
        // "你好世" = 3 UTF-8 characters, 3 bytes each
        const std::string three_chars = "\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xb8\x96";
        const auto result_col = run_split_by_string_3arg(three_chars, "", 2);
        const auto parts = get_array_strings(result_col);
        ASSERT_EQ(parts.size(), 2);
        EXPECT_EQ(parts[0], "\xe4\xbd\xa0");             // 你
        EXPECT_EQ(parts[1], "\xe5\xa5\xbd\xe4\xb8\x96"); // 好世
    }
}
4027+
4028+
// Edge cases of the limit argument, plus a check of the 2-argument form.
TEST(function_string_test, function_split_by_string_limit_edge_cases_test) {
    // negative limit: same result as splitting without a limit
    {
        const auto result_col = run_split_by_string_3arg("a,b,c", ",", -1);
        const auto parts = get_array_strings(result_col);
        ASSERT_EQ(parts.size(), 3);
        EXPECT_EQ(parts[0], "a");
        EXPECT_EQ(parts[1], "b");
        EXPECT_EQ(parts[2], "c");
    }
    // zero limit: also treated as "no limit"
    {
        const auto result_col = run_split_by_string_3arg("a,b,c", ",", 0);
        const auto parts = get_array_strings(result_col);
        ASSERT_EQ(parts.size(), 3);
    }
    // empty source string: the resulting array has no elements
    {
        const auto result_col = run_split_by_string_3arg("", ",", 2);
        const auto parts = get_array_strings(result_col);
        ASSERT_EQ(parts.size(), 0);
    }
    // input made only of delimiters, with a limit
    {
        const auto result_col = run_split_by_string_3arg(",,,", ",", 2);
        const auto parts = get_array_strings(result_col);
        ASSERT_EQ(parts.size(), 2);
        EXPECT_EQ(parts[0], "");
        EXPECT_EQ(parts[1], ",,");
    }
    // the 2-argument form keeps working after the refactoring
    {
        const auto result_col = run_split_by_string_2arg("a,b,c", ",");
        const auto parts = get_array_strings(result_col);
        ASSERT_EQ(parts.size(), 3);
        EXPECT_EQ(parts[0], "a");
        EXPECT_EQ(parts[1], "b");
        EXPECT_EQ(parts[2], "c");
    }
    // 2-argument form with an empty delimiter: one element per character
    {
        const auto result_col = run_split_by_string_2arg("abcde", "");
        const auto parts = get_array_strings(result_col);
        ASSERT_EQ(parts.size(), 5);
    }
}
4072+
38724073
} // namespace doris::vectorized
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
package org.apache.doris.nereids.trees.expressions.functions.executable;
19+
20+
import org.apache.doris.nereids.trees.expressions.Expression;
21+
import org.apache.doris.nereids.trees.expressions.literal.ArrayLiteral;
22+
import org.apache.doris.nereids.trees.expressions.literal.IntegerLiteral;
23+
import org.apache.doris.nereids.trees.expressions.literal.Literal;
24+
import org.apache.doris.nereids.trees.expressions.literal.StringLiteral;
25+
import org.apache.doris.nereids.types.ArrayType;
26+
import org.apache.doris.nereids.types.StringType;
27+
28+
import com.google.common.collect.ImmutableList;
29+
import org.junit.jupiter.api.Assertions;
30+
import org.junit.jupiter.api.Test;
31+
32+
import java.util.Arrays;
33+
import java.util.List;
34+
import java.util.stream.Collectors;
35+
36+
public class StringArithmeticSplitByStringTest {
37+
38+
private static ArrayLiteral makeArray(String... values) {
39+
List<Literal> items = Arrays.stream(values)
40+
.map(StringLiteral::new)
41+
.collect(Collectors.toList());
42+
return new ArrayLiteral(items);
43+
}
44+
45+
private static ArrayLiteral makeEmptyArray() {
46+
return new ArrayLiteral(ImmutableList.of(), ArrayType.of(StringType.INSTANCE));
47+
}
48+
49+
@Test
50+
public void testSplitByStringWithLimitBasic() {
51+
// limit < parts: "a,b,c,d" split by "," limit 2 -> ["a", "b,c,d"]
52+
Expression result = StringArithmetic.splitByString(
53+
new StringLiteral("a,b,c,d"), new StringLiteral(","), new IntegerLiteral(2));
54+
Assertions.assertEquals(makeArray("a", "b,c,d"), result);
55+
56+
// limit = 3
57+
result = StringArithmetic.splitByString(
58+
new StringLiteral("a,b,c,d"), new StringLiteral(","), new IntegerLiteral(3));
59+
Assertions.assertEquals(makeArray("a", "b", "c,d"), result);
60+
61+
// limit = 1: no split at all
62+
result = StringArithmetic.splitByString(
63+
new StringLiteral("one,two,three"), new StringLiteral(","), new IntegerLiteral(1));
64+
Assertions.assertEquals(makeArray("one,two,three"), result);
65+
66+
// multi-char delimiter + limit
67+
result = StringArithmetic.splitByString(
68+
new StringLiteral("a::b::c::d"), new StringLiteral("::"), new IntegerLiteral(2));
69+
Assertions.assertEquals(makeArray("a", "b::c::d"), result);
70+
}
71+
72+
@Test
73+
public void testSplitByStringWithLimitExceedParts() {
74+
// limit >= parts: "a,b,c" split by "," limit 10 -> ["a","b","c"]
75+
Expression result = StringArithmetic.splitByString(
76+
new StringLiteral("a,b,c"), new StringLiteral(","), new IntegerLiteral(10));
77+
Assertions.assertEquals(makeArray("a", "b", "c"), result);
78+
79+
// limit == parts
80+
result = StringArithmetic.splitByString(
81+
new StringLiteral("a,b,c"), new StringLiteral(","), new IntegerLiteral(3));
82+
Assertions.assertEquals(makeArray("a", "b", "c"), result);
83+
}
84+
85+
@Test
86+
public void testSplitByStringWithLimitZeroAndNegative() {
87+
// limit = 0 -> delegates to 2-arg version
88+
Expression result = StringArithmetic.splitByString(
89+
new StringLiteral("a,b,c"), new StringLiteral(","), new IntegerLiteral(0));
90+
Assertions.assertEquals(makeArray("a", "b", "c"), result);
91+
92+
// limit = -1 -> delegates to 2-arg version
93+
result = StringArithmetic.splitByString(
94+
new StringLiteral("a,b,c"), new StringLiteral(","), new IntegerLiteral(-1));
95+
Assertions.assertEquals(makeArray("a", "b", "c"), result);
96+
97+
// limit = -100 -> delegates to 2-arg version
98+
result = StringArithmetic.splitByString(
99+
new StringLiteral("a,b,c"), new StringLiteral(","), new IntegerLiteral(-100));
100+
Assertions.assertEquals(makeArray("a", "b", "c"), result);
101+
}
102+
103+
@Test
104+
public void testSplitByStringWithLimitEmptyFirst() {
105+
// empty source string -> empty array
106+
Expression result = StringArithmetic.splitByString(
107+
new StringLiteral(""), new StringLiteral(","), new IntegerLiteral(2));
108+
Assertions.assertEquals(makeEmptyArray(), result);
109+
110+
result = StringArithmetic.splitByString(
111+
new StringLiteral(""), new StringLiteral(","), new IntegerLiteral(0));
112+
// limit <= 0 delegates to 2-arg, which also returns empty array for empty input
113+
Assertions.assertEquals(makeEmptyArray(), result);
114+
}
115+
116+
@Test
117+
public void testSplitByStringWithLimitEmptyDelimiter() {
118+
// empty delimiter splits by character, with limit < chars
119+
Expression result = StringArithmetic.splitByString(
120+
new StringLiteral("abcde"), new StringLiteral(""), new IntegerLiteral(3));
121+
Assertions.assertEquals(makeArray("a", "b", "cde"), result);
122+
123+
// limit = 1 -> entire string as single element
124+
result = StringArithmetic.splitByString(
125+
new StringLiteral("abcde"), new StringLiteral(""), new IntegerLiteral(1));
126+
Assertions.assertEquals(makeArray("abcde"), result);
127+
}
128+
129+
@Test
130+
public void testSplitByStringWithLimitEmptyDelimiterExceed() {
131+
// empty delimiter + limit >= chars -> all characters
132+
Expression result = StringArithmetic.splitByString(
133+
new StringLiteral("abcde"), new StringLiteral(""), new IntegerLiteral(10));
134+
Assertions.assertEquals(makeArray("a", "b", "c", "d", "e"), result);
135+
136+
// exact match
137+
result = StringArithmetic.splitByString(
138+
new StringLiteral("abc"), new StringLiteral(""), new IntegerLiteral(3));
139+
Assertions.assertEquals(makeArray("a", "b", "c"), result);
140+
}
141+
142+
@Test
143+
public void testSplitByStringWithLimitConsecutiveDelimiters() {
144+
// consecutive delimiters produce empty strings
145+
Expression result = StringArithmetic.splitByString(
146+
new StringLiteral(",,,"), new StringLiteral(","), new IntegerLiteral(2));
147+
Assertions.assertEquals(makeArray("", ",,"), result);
148+
149+
result = StringArithmetic.splitByString(
150+
new StringLiteral(",,a,b,c,"), new StringLiteral(","), new IntegerLiteral(3));
151+
Assertions.assertEquals(makeArray("", "", "a,b,c,"), result);
152+
}
153+
}

0 commit comments

Comments
 (0)