Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions be/src/core/block/block.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1081,15 +1081,6 @@ std::unique_ptr<Block> Block::create_same_struct_block(size_t size, bool is_rese
return temp_block;
}

void Block::shrink_char_type_column_suffix_zero(const std::vector<size_t>& char_type_idx) {
for (auto idx : char_type_idx) {
if (idx < data.size()) {
auto& col_and_name = this->get_by_position(idx);
col_and_name.column->assume_mutable()->shrink_padding_chars();
}
}
}

size_t MutableBlock::allocated_bytes() const {
size_t res = 0;
for (const auto& col : _columns) {
Expand Down
3 changes: 0 additions & 3 deletions be/src/core/block/block.h
Original file line number Diff line number Diff line change
Expand Up @@ -346,9 +346,6 @@ class Block {
return res;
}

// for String type or Array<String> type
void shrink_char_type_column_suffix_zero(const std::vector<size_t>& char_type_idx);

void clear_column_mem_not_keep(const std::vector<bool>& column_keep_flags,
bool need_keep_first);

Expand Down
4 changes: 0 additions & 4 deletions be/src/core/column/column_array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,6 @@ ColumnArray::ColumnArray(MutableColumnPtr&& nested_column) : data(std::move(nest
offsets = ColumnOffsets::create();
}

// Forwards the request to the nested `data` column; nothing else is touched here.
void ColumnArray::shrink_padding_chars() {
data->shrink_padding_chars();
}

// Builds the column's name by wrapping the nested column's name: "Array(<nested>)".
std::string ColumnArray::get_name() const {
    std::string name = "Array(";
    name += get_data().get_name();
    name += ")";
    return name;
}
Expand Down
2 changes: 0 additions & 2 deletions be/src/core/column/column_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,6 @@ class ColumnArray final : public COWHelper<IColumn, ColumnArray> {
offsets->sanity_check();
}

void shrink_padding_chars() override;

/** On the index i there is an offset to the beginning of the i + 1 -th element. */
using ColumnOffsets = ColumnOffset64;

Expand Down
27 changes: 3 additions & 24 deletions be/src/core/column/column_dictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ class ColumnDictI32 final : public COWHelper<IColumn, ColumnDictI32> {
_dict.initialize_hash_values_for_runtime_filter();
}

uint32_t get_hash_value(uint32_t idx) const { return _dict.get_hash_value(_codes[idx], _type); }
uint32_t get_hash_value(uint32_t idx) const { return _dict.get_hash_value(_codes[idx]); }

template <typename HybridSetType>
void find_codes(const HybridSetType* values, std::vector<UInt8>& selected) const {
Expand Down Expand Up @@ -278,14 +278,6 @@ class ColumnDictI32 final : public COWHelper<IColumn, ColumnDictI32> {

// Looks up `code` in `_dict` and returns the reference it hands back, unmodified.
inline const StringRef& get_value(value_type code) const { return _dict.get_value(code); }

// Returns a copy of the dictionary value for `code`. When the column's field
// type is CHAR, the copy's size is cut at the first '\0' found within the
// original size; for any other field type the value is returned as stored.
inline StringRef get_shrink_value(value_type code) const {
    StringRef value = _dict.get_value(code);
    if (_type != FieldType::OLAP_FIELD_TYPE_CHAR) {
        return value;
    }
    value.size = strnlen(value.data, value.size);
    return value;
}

// Returns the size reported by the underlying `_dict`.
size_t dict_size() const { return _dict.size(); }

// Returns whatever `_dict.debug_string()` produces; intended for diagnostics.
std::string dict_debug_string() const { return _dict.debug_string(); }
Expand Down Expand Up @@ -326,26 +318,13 @@ class ColumnDictI32 final : public COWHelper<IColumn, ColumnDictI32> {
}
}

inline uint32_t get_hash_value(Int32 code, FieldType type) const {
inline uint32_t get_hash_value(Int32 code) const {
if (_compute_hash_value_flags[code]) {
return _hash_values[code];
} else {
auto& sv = (*_dict_data)[code];
// The char data is stored in the disk with the schema length,
// and zeros are filled if the length is insufficient

// When reading data, use shrink_char_type_column_suffix_zero(_char_type_idx)
// Remove the suffix 0
// When writing data, use the CharField::consume function to fill in the trailing 0.

// For dictionary data of char type, sv.size is the schema length,
// so use strnlen to remove the 0 at the end to get the actual length.
size_t len = sv.size;
if (type == FieldType::OLAP_FIELD_TYPE_CHAR) {
len = strnlen(sv.data, sv.size);
}
uint32_t hash_val =
crc32c::Extend(0, (const uint8_t*)sv.data, static_cast<uint32_t>(len));
crc32c::Extend(0, (const uint8_t*)sv.data, static_cast<uint32_t>(sv.size));
_hash_values[code] = hash_val;
_compute_hash_value_flags[code] = 1;
return _hash_values[code];
Expand Down
5 changes: 0 additions & 5 deletions be/src/core/column/column_map.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -643,11 +643,6 @@ Status ColumnMap::deduplicate_keys(bool recursive) {
return Status::OK();
}

// Delegates to both halves of the map: first the keys column, then the values
// column. The map column itself does no additional work here.
void ColumnMap::shrink_padding_chars() {
keys_column->shrink_padding_chars();
values_column->shrink_padding_chars();
}

void ColumnMap::reserve(size_t n) {
get_offsets().reserve(n);
keys_column->reserve(n);
Expand Down
1 change: 0 additions & 1 deletion be/src/core/column/column_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,6 @@ class ColumnMap final : public COWHelper<IColumn, ColumnMap> {
const char* deserialize_and_insert_from_arena(const char* pos) override;

void update_hash_with_value(size_t n, SipHash& hash) const override;
void shrink_padding_chars() override;
ColumnPtr filter(const Filter& filt, ssize_t result_size_hint) const override;
size_t filter(const Filter& filter) override;
MutableColumnPtr permute(const Permutation& perm, size_t limit) const override;
Expand Down
6 changes: 0 additions & 6 deletions be/src/core/column/column_struct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -338,12 +338,6 @@ MutableColumnPtr ColumnStruct::permute(const Permutation& perm, size_t limit) co
return ColumnStruct::create(new_columns);
}

void ColumnStruct::shrink_padding_chars() {
for (auto& column : columns) {
column->shrink_padding_chars();
}
}

void ColumnStruct::reserve(size_t n) {
const size_t tuple_size = columns.size();
for (size_t i = 0; i < tuple_size; ++i) {
Expand Down
2 changes: 0 additions & 2 deletions be/src/core/column/column_struct.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,6 @@ class ColumnStruct final : public COWHelper<IColumn, ColumnStruct> {

int compare_at(size_t n, size_t m, const IColumn& rhs_, int nan_direction_hint) const override;

void shrink_padding_chars() override;

void reserve(size_t n) override;
void resize(size_t n) override;
size_t byte_size() const override;
Expand Down
14 changes: 9 additions & 5 deletions be/src/core/column/predicate_column.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,18 +104,22 @@ class PredicateColumnType final : public COWHelper<IColumn, PredicateColumnType<

StringRef get_data_at(size_t n) const override {
if constexpr (std::is_same_v<T, StringRef>) {
auto res = reinterpret_cast<const StringRef&>(data[n]);
if constexpr (Type == TYPE_CHAR) {
res.size = strnlen(res.data, res.size);
}
return res;
return reinterpret_cast<const StringRef&>(data[n]);
} else {
throw doris::Exception(
ErrorCode::INTERNAL_ERROR,
"should not call get_data_at in predicate column except for string type");
}
}

// For CHAR columns, rewrites the size of every stored value to the number of
// bytes before its first '\0' (never more than the current size). For every
// other column type this compiles to a no-op.
void shrink_padding_chars() override {
    if constexpr (Type == TYPE_CHAR) {
        for (auto& value : data) {
            value.size = strnlen(value.data, value.size);
        }
    }
}

void insert_from(const IColumn& src, size_t n) override {
throw doris::Exception(ErrorCode::INTERNAL_ERROR,
"should not call insert_from in predicate column");
Expand Down
16 changes: 3 additions & 13 deletions be/src/core/data_type_serde/data_type_string_serde.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -462,22 +462,12 @@ Status DataTypeStringSerDeBase<ColumnType>::from_string(StringRef& str, IColumn&

// Deserializes a STRING/VARCHAR/CHAR value from its OLAP string representation
// (e.g. from ZoneMap protobuf). This is the inverse of to_olap_string().
//
// For CHAR type: if the string is shorter than the declared column length (_len),
// pads with '\0' bytes to reach _len. This preserves CHAR's fixed-length semantics.
// For STRING/VARCHAR: stores the string as-is.
//
// Examples:
// CHAR(10), str="hello" => field = "hello\0\0\0\0\0" (10 bytes)
// VARCHAR, str="hello" => field = "hello" (5 bytes)
template <typename ColumnType>
Status DataTypeStringSerDeBase<ColumnType>::from_olap_string(const std::string& str, Field& field,
const FormatOptions& options) const {
if (cast_set<int>(str.size()) < _len) {
DCHECK_EQ(_type, TYPE_CHAR);
std::string tmp(_len, '\0');
memcpy(tmp.data(), str.data(), str.size());
field = Field::create_field<TYPE_CHAR>(std::move(tmp));
if (_type == TYPE_CHAR) {
size_t real_len = strnlen(str.data(), str.size());
field = Field::create_field<TYPE_CHAR>(std::string(str.data(), real_len));
} else {
field = Field::create_field<TYPE_STRING>(str);
}
Expand Down
46 changes: 0 additions & 46 deletions be/src/exec/rowid_fetcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,30 +206,6 @@ Status RowIDFetcher::_merge_rpc_results(const PMultiGetRequest& request,
return Status::OK();
}

bool _has_char_type(const DataTypePtr& type) {
switch (type->get_primitive_type()) {
case TYPE_CHAR: {
return true;
}
case TYPE_ARRAY: {
const auto* arr_type = assert_cast<const DataTypeArray*>(remove_nullable(type).get());
return _has_char_type(arr_type->get_nested_type());
}
case TYPE_MAP: {
const auto* map_type = assert_cast<const DataTypeMap*>(remove_nullable(type).get());
return _has_char_type(map_type->get_key_type()) ||
_has_char_type(map_type->get_value_type());
}
case TYPE_STRUCT: {
const auto* struct_type = assert_cast<const DataTypeStruct*>(remove_nullable(type).get());
return std::any_of(struct_type->get_elements().begin(), struct_type->get_elements().end(),
[&](const DataTypePtr& dt) -> bool { return _has_char_type(dt); });
}
default:
return false;
}
}

Status RowIDFetcher::fetch(const ColumnPtr& column_row_ids, Block* res_block) {
CHECK(!_stubs.empty());
PMultiGetRequest mget_req = _init_fetch_request(
Expand Down Expand Up @@ -279,16 +255,6 @@ Status RowIDFetcher::fetch(const ColumnPtr& column_row_ids, Block* res_block) {
}
// Check row consistency
RETURN_IF_CATCH_EXCEPTION(res_block->check_number_of_rows());
// shrink for char type
std::vector<size_t> char_type_idx;
for (size_t i = 0; i < _fetch_option.desc->slots().size(); i++) {
const auto& column_desc = _fetch_option.desc->slots()[i];
const auto type = column_desc->type();
if (_has_char_type(type)) {
char_type_idx.push_back(i);
}
}
res_block->shrink_char_type_column_suffix_zero(char_type_idx);
VLOG_DEBUG << "dump block:" << res_block->dump_data(0, 10);
return Status::OK();
}
Expand Down Expand Up @@ -561,15 +527,6 @@ Status RowIdStorageReader::read_by_rowids(const PMultiGetRequestV2& request,
for (const auto& pslot : request_block_desc.slots()) {
slots.push_back(SlotDescriptor(pslot));
}
// prepare block char vector shrink for char type
std::vector<size_t> char_type_idx;
for (int j = 0; j < slots.size(); ++j) {
auto slot = slots[j];
if (_has_char_type(slot.type())) {
char_type_idx.push_back(j);
}
}

try {
if (first_file_mapping->type == FileMappingType::INTERNAL) {
RETURN_IF_ERROR(read_batch_doris_format_row(
Expand All @@ -587,9 +544,6 @@ Status RowIdStorageReader::read_by_rowids(const PMultiGetRequestV2& request,
return Status::Error<false>(e.code(), "Row id fetch failed because {}",
e.what());
}

// after read the block, shrink char type block
result_blocks[i].shrink_char_type_column_suffix_zero(char_type_idx);
}

[[maybe_unused]] size_t compressed_size = 0;
Expand Down
5 changes: 0 additions & 5 deletions be/src/service/point_query_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -567,11 +567,6 @@ Status PointQueryExecutor::_lookup_row_data() {
RETURN_IF_ERROR(segment->seek_and_read_by_rowid(*_tablet->tablet_schema(), slot,
row_id, column,
storage_read_options, iter));
if (_tablet->tablet_schema()
->column_by_uid(slot->col_unique_id())
.has_char_type()) {
column->shrink_padding_chars();
}
}
}
}
Expand Down
20 changes: 1 addition & 19 deletions be/src/storage/delete/delete_handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,6 @@ Status convert(const DataTypePtr& data_type, const std::list<std::string>& str,
// Parses a single condition value string into a Field and creates a comparison predicate.
// Uses serde->from_fe_string to do the parsing, which handles all type-specific
// conversions (including decimal scale, etc.).
// For CHAR type, the value is padded with '\0' to the declared column length, consistent
// with the IN list path in convert() above.
// For VARCHAR/STRING, the Field is created directly from the raw string.
Status parse_to_predicate(const uint32_t index, const std::string col_name, const DataTypePtr& type,
DeleteHandler::ConditionParseResult& res, Arena& arena,
std::shared_ptr<ColumnPredicate>& predicate) {
Expand All @@ -128,22 +125,7 @@ Status parse_to_predicate(const uint32_t index, const std::string col_name, cons
}

Field v;
if (type->get_primitive_type() == TYPE_CHAR) {
// CHAR type: create Field and pad with '\0' to the declared column length,
// consistent with IN list path (convert() above) and create_comparison_predicate.
const auto& str = res.value_str.front();
auto char_len = cast_set<size_t>(
assert_cast<const DataTypeString*>(remove_nullable(type).get())->len());
auto target = std::max(char_len, str.size());
if (target > str.size()) {
std::string padded(target, '\0');
memcpy(padded.data(), str.data(), str.size());
v = Field::create_field<TYPE_CHAR>(std::move(padded));
} else {
v = Field::create_field<TYPE_CHAR>(str);
}
} else if (is_string_type(type->get_primitive_type())) {
// VARCHAR/STRING: create Field directly from the raw string, no padding needed.
if (is_string_type(type->get_primitive_type())) {
v = Field::create_field<TYPE_STRING>(res.value_str.front());
} else {
auto serde = type->get_serde();
Expand Down
4 changes: 2 additions & 2 deletions be/src/storage/field.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,8 @@ class StorageField {
}

// encode the provided `value` into `buf`.
void full_encode_ascending(const void* value, std::string* buf) const {
_key_coder->full_encode_ascending(value, buf);
void full_encode_ascending(const void* value, std::string* buf, size_t char_len = 0) const {
_key_coder->full_encode_ascending(value, buf, char_len);
}

// Exposes the key coder that full_encode_ascending() delegates to. Non-owning pointer.
const KeyCoder* key_coder() const { return _key_coder; }
Expand Down
10 changes: 10 additions & 0 deletions be/src/storage/index/bloom_filter/bloom_filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,13 @@ class BloomFilter {

// Returns the flag that `_has_null` points to. `_has_null` is declared with a
// nullptr default and is dereferenced here without a check, so it must already
// point at valid storage when this is called.
virtual bool has_null() const { return *_has_null; }

// Set by the BF index reader to mirror BloomFilterIndexPB.unpadded_char_filter.
// Predicate evaluation uses this to decide whether a CHAR BF can be safely
// probed with an unpadded predicate value (true) or must be skipped to avoid
// false negatives against an old padded-hash BF (false / unset).
// The backing member `_unpadded_char_filter` defaults to false, so a filter on
// which the setter was never called reports false.
void set_unpadded_char_filter(bool v) { _unpadded_char_filter = v; }
bool unpadded_char_filter() const { return _unpadded_char_filter; }

virtual void add_hash(uint64_t hash) = 0;
virtual bool test_hash(uint64_t hash) const = 0;

Expand Down Expand Up @@ -237,6 +244,9 @@ class BloomFilter {
bool* _has_null = nullptr;
// is this bf used for write
bool _is_write = false;
// mirrors BloomFilterIndexPB.unpadded_char_filter; only meaningful for CHAR
// columns and only set by the BF index reader after deserialization.
bool _unpadded_char_filter = false;

std::function<void(const void*, const int64_t, const uint64_t, void*)> _hash_func;
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ Status BloomFilterIndexIterator::read_bloom_filter(rowid_t ordinal,
BloomFilter::create(_reader->_bloom_filter_index_meta->algorithm(), bf, value.size));
RETURN_IF_ERROR((*bf)->init(value.data, value.size,
_reader->_bloom_filter_index_meta->hash_strategy()));
(*bf)->set_unpadded_char_filter(_reader->_bloom_filter_index_meta->unpadded_char_filter());
return Status::OK();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,12 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter {
BloomFilterIndexPB* meta = index_meta->mutable_bloom_filter_index();
meta->set_hash_strategy(_bf_options.strategy);
meta->set_algorithm(BLOCK_BLOOM_FILTER);
if constexpr (field_type == FieldType::OLAP_FIELD_TYPE_CHAR) {
// Mark that this BF was built from unpadded CHAR bytes so the reader can
// safely probe it with the (also unpadded) predicate value. Segments written
// before this flag existed lack it, and their CHAR BFs are skipped at probe time.
meta->set_unpadded_char_filter(true);
}

// write bloom filters
IndexedColumnWriterOptions options;
Expand Down
4 changes: 2 additions & 2 deletions be/src/storage/index/indexed_column_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,8 @@ Status IndexedColumnIterator::seek_to_ordinal(ordinal_t idx) {
// need to read the data page containing row at idx
if (_reader->_has_index_page) {
std::string key;
KeyCoderTraits<FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT>::full_encode_ascending(&idx,
&key);
KeyCoderTraits<FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT>::full_encode_ascending(
&idx, &key, /*char_len=*/0);
RETURN_IF_ERROR(_ordinal_iter.seek_at_or_before(key));
RETURN_IF_ERROR(_read_data_page(_ordinal_iter.current_page_pointer()));
_current_iter = &_ordinal_iter;
Expand Down
2 changes: 1 addition & 1 deletion be/src/storage/index/indexed_column_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ Status IndexedColumnWriter::_finish_current_data_page(size_t& num_val) {
if (_options.write_ordinal_index) {
std::string key;
KeyCoderTraits<FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT>::full_encode_ascending(
&first_ordinal, &key);
&first_ordinal, &key, /*char_len=*/0);
_ordinal_index_builder->add(key, _last_data_page);
}

Expand Down
Loading
Loading