Skip to content

Commit 093dee7

Browse files
committed
test cases for column sketches
1 parent 4c7b352 commit 093dee7

File tree

4 files changed

+129
-23
lines changed

4 files changed

+129
-23
lines changed

CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@ if(COMPILER_SUPPORTS_MARCH_NATIVE)
3737
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
3838
endif()
3939

40-
#add_compile_options(-fsanitize=address)
41-
#add_link_options(-fsanitize=address)
40+
# add_compile_options(-fsanitize=address)
41+
# add_link_options(-fsanitize=address)
4242
#add_compile_options(-fsanitize=undefined)
4343
#add_link_options(-fsanitize=undefined)
4444

include/bucket.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ struct Bucket {
2323
bool operator!=(const Bucket &rhs) const {
2424
return alpha != rhs.alpha || gamma != rhs.gamma;
2525
};
26+
friend std::ostream &operator<<(std::ostream &os, const Bucket &b) {
27+
os << "(a: " << b.alpha << " g: " << b.gamma << ")";
28+
return os;
29+
}
2630
};
2731
#pragma pack(pop)
2832

include/sketch/sketch_columns.h

Lines changed: 64 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -12,38 +12,55 @@
1212
*/
1313
class FixedSizeSketchColumn {
1414
private:
15-
static uint64_t seed;
1615

17-
Bucket *buckets;
16+
std::unique_ptr<Bucket[]> buckets;
1817
Bucket deterministic_bucket = {0, 0};
18+
uint16_t col_idx; // determines column seeding
1919
uint8_t capacity;
20-
uint8_t col_idx; // determines column seeding
2120
public:
21+
static uint64_t seed;
2222
static void set_seed(uint64_t new_seed) {
2323
seed = new_seed;
2424
};
2525
static const uint64_t get_seed() {
2626
return seed;
2727
};
2828

29-
FixedSizeSketchColumn(uint8_t capacity, uint8_t col_idx);
29+
FixedSizeSketchColumn(uint8_t capacity, uint16_t col_idx);
30+
FixedSizeSketchColumn(const FixedSizeSketchColumn &other);
3031
~FixedSizeSketchColumn();
3132
SketchSample<vec_t> sample() const;
3233
void clear();
3334
void update(const vec_t update);
3435
void merge(FixedSizeSketchColumn &other);
3536
uint8_t get_depth() const;
3637
void serialize(std::ostream &binary_out) const;
38+
friend std::ostream& operator<<(std::ostream &os, const FixedSizeSketchColumn &sketch) {
39+
os << "FixedSizeSketchColumn: " << std::endl;
40+
os << "Capacity: " << (int)sketch.capacity << std::endl;
41+
os << "Column Index: " << (int)sketch.col_idx << std::endl;
42+
os << "Deterministic Bucket: " << sketch.deterministic_bucket << std::endl;
43+
for (size_t i = 0; i < sketch.capacity; ++i) {
44+
os << "Bucket[" << i << "]: " << sketch.buckets[i] << std::endl;
45+
}
46+
return os;
47+
}
3748
};
3849

39-
FixedSizeSketchColumn::FixedSizeSketchColumn(uint8_t capacity, uint8_t col_idx) :
50+
FixedSizeSketchColumn::FixedSizeSketchColumn(uint8_t capacity, uint16_t col_idx) :
4051
capacity(capacity), col_idx(col_idx) {
41-
buckets = new Bucket[capacity];
42-
// std::memset(buckets, 0, capacity * sizeof(Bucket));
52+
buckets = std::make_unique<Bucket[]>(capacity);
53+
// std::memset(buckets.get(), 0, capacity * sizeof(Bucket));
54+
}
55+
56+
FixedSizeSketchColumn::FixedSizeSketchColumn(const FixedSizeSketchColumn &other) :
57+
capacity(other.capacity), col_idx(other.col_idx), deterministic_bucket(other.deterministic_bucket) {
58+
buckets = std::make_unique<Bucket[]>(capacity);
59+
std::memcpy(buckets.get(), other.buckets.get(), capacity * sizeof(Bucket));
4360
}
4461

4562
FixedSizeSketchColumn::~FixedSizeSketchColumn() {
46-
delete[] buckets;
63+
// delete[] buckets;
4764
}
4865

4966
uint8_t FixedSizeSketchColumn::get_depth() const {
@@ -56,7 +73,7 @@ uint8_t FixedSizeSketchColumn::get_depth() const {
5673

5774
// TODO - implement actual deserialization
5875
void FixedSizeSketchColumn::serialize(std::ostream &binary_out) const {
59-
binary_out.write((char *) buckets, capacity * sizeof(Bucket));
76+
binary_out.write((char *) buckets.get(), capacity * sizeof(Bucket));
6077
binary_out.write((char *) &deterministic_bucket, sizeof(Bucket));
6178
binary_out.write((char *) &capacity, sizeof(uint8_t));
6279
binary_out.write((char *) &col_idx, sizeof(uint8_t));
@@ -75,7 +92,7 @@ SketchSample<vec_t> FixedSizeSketchColumn::sample() const {
7592
}
7693

7794
void FixedSizeSketchColumn::clear() {
78-
std::memset(buckets, 0, capacity * sizeof(Bucket));
95+
std::memset(buckets.get(), 0, capacity * sizeof(Bucket));
7996
deterministic_bucket = {0, 0};
8097
}
8198

@@ -89,6 +106,7 @@ void FixedSizeSketchColumn::merge(FixedSizeSketchColumn &other) {
89106
void FixedSizeSketchColumn::update(const vec_t update) {
90107
vec_hash_t checksum = Bucket_Boruvka::get_index_hash(update, seed);
91108
col_hash_t depth = Bucket_Boruvka::get_index_depth(update, seed, col_idx, capacity);
109+
assert(depth < capacity);
92110
buckets[depth] ^= {update, checksum};
93111
deterministic_bucket ^= {update, checksum};
94112
}
@@ -98,16 +116,16 @@ void FixedSizeSketchColumn::update(const vec_t update) {
98116
class ResizeableSketchColumn {
99117
private:
100118
static uint64_t seed;
101-
102119
hwy::AlignedFreeUniquePtr<Bucket[]> aligned_buckets;
103120
Bucket deterministic_bucket = {0, 0};
121+
uint16_t col_idx; // determines column seeding
104122
uint8_t capacity;
105-
uint8_t col_idx; // determines column seeding
106123
public:
107124
static void set_seed(uint64_t new_seed) { seed = new_seed; };
108125
static const uint64_t get_seed() { return seed; };
109126

110-
ResizeableSketchColumn(uint8_t start_capacity, uint8_t col_idx);
127+
ResizeableSketchColumn(uint8_t start_capacity, uint16_t col_idx);
128+
ResizeableSketchColumn(const ResizeableSketchColumn &other);
111129
~ResizeableSketchColumn();
112130
SketchSample<vec_t> sample() const;
113131
void clear();
@@ -119,15 +137,27 @@ class ResizeableSketchColumn {
119137
void reallocate(uint8_t new_capacity);
120138
};
121139

140+
uint64_t ResizeableSketchColumn::seed = 0;
141+
uint64_t FixedSizeSketchColumn::seed = 0;
142+
122143

123-
ResizeableSketchColumn::ResizeableSketchColumn(uint8_t start_capacity, uint8_t col_idx) :
144+
ResizeableSketchColumn::ResizeableSketchColumn(uint8_t start_capacity, uint16_t col_idx) :
124145
capacity(start_capacity), col_idx(col_idx) {
125146

126147
// auto aligned_memptr = hwy::MakeUniqueAlignedArray<Bucket>(start_capacity);
127148
aligned_buckets = hwy::AllocateAligned<Bucket>(start_capacity);
128149
std::memset(aligned_buckets.get(), 0, capacity * sizeof(Bucket));
129150
}
130151

152+
ResizeableSketchColumn::ResizeableSketchColumn(const ResizeableSketchColumn &other) :
153+
capacity(other.capacity), col_idx(other.col_idx), deterministic_bucket(other.deterministic_bucket) {
154+
aligned_buckets = hwy::AllocateAligned<Bucket>(capacity);
155+
std::memcpy(aligned_buckets.get(), other.aligned_buckets.get(), capacity * sizeof(Bucket));
156+
}
157+
158+
ResizeableSketchColumn::~ResizeableSketchColumn() {
159+
}
160+
131161
/*
132162
Note: this DROPS bucket contents if the capacity is reduced too far.
133163
*/
@@ -154,13 +184,29 @@ void ResizeableSketchColumn::serialize(std::ostream &binary_out) const {
154184
binary_out.write((char *) &col_idx, sizeof(uint8_t));
155185
}
156186

187+
SketchSample<vec_t> ResizeableSketchColumn::sample() const {
188+
if (Bucket_Boruvka::is_empty(deterministic_bucket)) {
189+
return {0, ZERO}; // the "first" bucket is deterministic so if all zero then no edges to return
190+
}
191+
for (size_t i = capacity; i > 0; --i) {
192+
if (Bucket_Boruvka::is_good(aligned_buckets[i - 1], seed)) {
193+
return {aligned_buckets[i - 1].alpha, GOOD};
194+
}
195+
}
196+
return {0, FAIL};
197+
}
198+
157199
void ResizeableSketchColumn::update(const vec_t update) {
158200
vec_hash_t checksum = Bucket_Boruvka::get_index_hash(update, seed);
159-
col_hash_t depth = Bucket_Boruvka::get_index_depth(update, seed, col_idx, capacity);
201+
// TODO - remove magic number
202+
// TODO - get_index_depth needs to be fixed. hashes need to be longer
203+
// than 32 bits if we're not using the deep bucket buffer idea.
204+
col_hash_t depth = Bucket_Boruvka::get_index_depth(update, seed, col_idx, 32);
160205
deterministic_bucket ^= {update, checksum};
161206

162-
if (depth >= capacity) {
163-
reallocate(depth + 4);
207+
while (depth >= capacity) {
208+
// grow capacity in steps of 4 buckets until it exceeds depth
209+
reallocate(capacity + 4);
164210
}
165211
aligned_buckets[depth] ^= {update, checksum};
166212
}
@@ -187,6 +233,7 @@ uint8_t ResizeableSketchColumn::get_depth() const {
187233
}
188234

189235

236+
190237
static_assert(SketchColumnConcept<FixedSizeSketchColumn, vec_t>,
191238
"FixedSizeSketchColumn does not satisfy SketchColumnConcept");
192239

test/sketch_test.cpp

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -479,7 +479,62 @@ TEST(SketchTestSuite, TestRawBucketUpdate) {
479479
}
480480

481481

482-
// TEST(SketchColumnTestSuite, TestSketchColumnZero) {
483-
// ResizeableSketchColumn::set_seed(get_seed());
484-
// ResizeableSketchColumn column(4, 0);
485-
// }
482+
TEST(SketchColumnTestSuite, TestSketchColumnSampling) {
483+
ResizeableSketchColumn::set_seed(get_seed());
484+
// ResizeableSketchColumn::seed = get_seed();
485+
ResizeableSketchColumn column(18, 0);
486+
column.update(10);
487+
auto sample = column.sample();
488+
ASSERT_EQ(sample.result, GOOD);
489+
ASSERT_EQ(sample.idx, 10);
490+
column.update(20);
491+
sample = column.sample();
492+
ASSERT_NE(sample.result, ZERO);
493+
column.update(10);
494+
sample = column.sample();
495+
ASSERT_EQ(sample.result, GOOD);
496+
ASSERT_EQ(sample.idx, 20);
497+
column.update(20);
498+
sample = column.sample();
499+
ASSERT_EQ(sample.result, ZERO);
500+
}
501+
502+
TEST(SketchColumnTestSuite, TestSketchColumnMerging) {
503+
ResizeableSketchColumn::set_seed(get_seed());
504+
for (size_t col_idx =0; col_idx < 16; col_idx++) {
505+
ResizeableSketchColumn column1(18, col_idx);
506+
ResizeableSketchColumn column2(18, col_idx);
507+
for (vec_t i = 0; i < (1 << 11); i++) {
508+
column1.update(i);
509+
column2.update(i + 128);
510+
}
511+
column1.merge(column2);
512+
// at this point, a good sample must lie in [0, 127] or in [1 << 11, (1 << 11) + 127]
513+
auto sample = column1.sample();
514+
if (sample.result == GOOD) {
515+
// std::cout << "sample.idx: " << sample.idx << std::endl;
516+
ASSERT_TRUE(
517+
sample.idx < 128 || (
518+
sample.idx >= (1 << 11) && sample.idx < (1 << 11) + 128
519+
)
520+
);
521+
}
522+
}
523+
}
524+
525+
TEST(SketchColumnTestSuite, TestSketchColumnMergeMany) {
526+
ResizeableSketchColumn::set_seed(get_seed());
527+
std::vector<ResizeableSketchColumn> columns(1 << 13, ResizeableSketchColumn(4, 0));
528+
for (size_t i = 0; i < columns.size(); i++) {
529+
columns[i].update(i);
530+
}
531+
for (size_t i = 1; i < columns.size(); i++) {
532+
columns[0].merge(columns[i]);
533+
}
534+
for (size_t i = 1; i < columns.size(); i++) {
535+
columns[0].update(i);
536+
}
537+
auto sample = columns[0].sample();
538+
ASSERT_EQ(sample.result, GOOD);
539+
ASSERT_EQ(sample.idx, 0);
540+
}

0 commit comments

Comments
 (0)