diff --git a/benchmarks/bulk-insert-and-query.cc b/benchmarks/bulk-insert-and-query.cc index c507a23..eecc930 100644 --- a/benchmarks/bulk-insert-and-query.cc +++ b/benchmarks/bulk-insert-and-query.cc @@ -13,38 +13,41 @@ // 55: // Million Find Find Find Find Find optimal wasted // adds/sec 0% 25% 50% 75% 100% ε bits/item bits/item space -// Cuckoo12 23.78 37.24 35.04 37.17 37.35 36.35 0.131% 18.30 9.58 91.1% -// SemiSort13 11.63 17.55 17.08 17.14 17.54 22.32 0.064% 18.30 10.62 72.4% -// Cuckoo8 35.31 49.32 50.24 49.98 48.32 50.49 2.044% 12.20 5.61 117.4% -// SemiSort9 13.99 22.23 22.78 22.13 23.16 24.06 1.207% 12.20 6.37 91.5% -// Cuckoo16 27.06 36.94 37.12 35.31 36.81 35.10 0.009% 24.40 13.46 81.4% -// SemiSort17 10.37 15.70 15.84 15.78 15.55 15.93 0.004% 24.40 14.72 65.8% -// SimdBlock8 74.22 72.34 74.23 74.34 74.69 74.32 0.508% 12.20 7.62 60.1% -// time: 14.34 seconds +// Cuckoo12 27.15 30.20 40.99 41.18 40.83 41.61 0.128% 18.30 9.61 90.5% +// SemiSort13 11.21 18.29 18.15 18.26 18.46 17.55 0.065% 18.30 10.58 72.9% +// Shingle12 21.34 40.58 40.80 40.82 40.66 40.91 0.062% 18.30 10.66 71.8% +// Cuckoo8 42.06 45.61 54.74 53.58 55.83 54.35 2.071% 12.20 5.59 118.1% +// SemiSort9 15.18 24.40 25.77 14.41 25.57 26.05 1.214% 12.20 6.36 91.7% +// Cuckoo16 31.81 39.52 40.61 40.41 40.09 40.08 0.010% 24.40 13.30 83.5% +// SemiSort17 11.24 16.73 16.55 16.71 16.77 16.34 0.005% 24.40 14.44 69.0% +// SimdBlock8 81.48 84.58 86.63 86.63 83.58 87.26 0.485% 12.20 7.69 58.7% +// time: 14.06 seconds // // 75: // Million Find Find Find Find Find optimal wasted // adds/sec 0% 25% 50% 75% 100% ε bits/item bits/item space -// Cuckoo12 15.61 37.24 37.23 37.34 37.15 37.36 0.173% 13.42 9.18 46.2% -// SemiSort13 8.77 17.11 15.70 17.34 17.73 18.86 0.087% 13.42 10.17 31.9% -// Cuckoo8 23.46 48.81 48.14 39.48 49.28 49.65 2.806% 8.95 5.16 73.6% -// SemiSort9 11.14 23.98 20.80 23.37 24.35 21.41 1.428% 8.95 6.13 46.0% -// Cuckoo16 15.08 36.64 36.75 36.83 36.59 36.74 0.011% 17.90 13.11 36.5% -// 
SemiSort17 8.02 15.63 15.66 15.87 15.67 15.88 0.006% 17.90 14.02 27.6% -// SimdBlock8 73.26 74.41 74.28 70.86 72.02 70.69 2.071% 8.95 5.59 60.0% -// time: 18.06 seconds +// Cuckoo12 18.27 41.87 41.80 40.89 39.83 41.95 0.170% 13.42 9.20 45.9% +// SemiSort13 8.65 18.48 14.31 18.63 18.78 14.83 0.087% 13.42 10.17 31.9% +// Shingle12 11.00 40.80 41.14 41.34 41.30 41.41 0.088% 13.42 10.16 32.1% +// Cuckoo8 28.13 53.47 55.73 56.40 56.30 56.50 2.797% 8.95 5.16 73.4% +// SemiSort9 12.43 25.76 26.30 25.91 16.99 26.46 1.438% 8.95 6.12 46.2% +// Cuckoo16 17.71 40.93 41.09 41.19 41.31 40.84 0.012% 17.90 13.00 37.7% +// SemiSort17 8.46 16.99 17.06 15.84 13.75 17.06 0.006% 17.90 14.10 26.9% +// SimdBlock8 88.56 88.43 84.02 87.45 88.91 88.38 2.054% 8.95 5.61 59.6% +// time: 16.27 seconds // // 85: // Million Find Find Find Find Find optimal wasted // adds/sec 0% 25% 50% 75% 100% ε bits/item bits/item space -// Cuckoo12 22.74 32.49 32.69 32.58 32.85 32.71 0.102% 23.69 9.94 138.3% -// SemiSort13 9.97 13.16 13.15 13.54 16.01 19.58 0.056% 23.69 10.80 119.4% -// Cuckoo8 30.67 36.86 36.79 37.09 36.97 36.87 1.581% 15.79 5.98 163.9% -// SemiSort9 10.96 15.49 15.37 15.40 15.18 15.63 1.047% 15.79 6.58 140.1% -// Cuckoo16 27.84 33.74 33.72 33.69 33.75 33.62 0.007% 31.58 13.80 128.8% -// SemiSort17 9.51 12.83 12.80 12.64 12.86 12.50 0.004% 31.58 14.65 115.6% -// SimdBlock8 54.84 58.37 59.73 59.13 60.11 60.12 0.144% 15.79 9.44 67.3% -// time: 19.43 seconds +// Cuckoo12 25.80 37.66 37.97 38.01 37.94 37.87 0.098% 23.69 9.99 137.1% +// SemiSort13 9.60 14.38 14.51 14.34 12.69 14.56 0.048% 23.69 11.02 114.8% +// Shingle12 21.77 37.25 36.65 37.44 37.55 35.79 0.052% 23.69 10.91 117.1% +// Cuckoo8 36.73 40.92 40.99 41.51 40.96 41.39 1.574% 15.79 5.99 163.6% +// SemiSort9 11.39 16.76 16.57 16.68 16.25 16.82 1.049% 15.79 6.57 140.2% +// Cuckoo16 33.98 37.85 38.70 38.92 38.76 38.95 0.006% 31.58 13.98 125.9% +// SemiSort17 10.30 13.39 14.30 14.21 14.34 14.40 0.004% 31.58 14.61 116.2% +// SimdBlock8 66.62 
72.34 72.50 71.38 72.43 72.09 0.141% 15.79 9.48 66.6% +// time: 16.50 seconds // #include @@ -54,7 +57,9 @@ #include #include "cuckoofilter.h" +#include "filter-api.h" #include "random.h" +#include "shingle.h" #include "simd-block.h" #include "timing.h" @@ -119,38 +124,6 @@ basic_ostream& operator<<( return os; } -template -struct FilterAPI {}; - -template class TableType> -struct FilterAPI> { - using Table = CuckooFilter; - static Table ConstructFromAddCount(size_t add_count) { return Table(add_count); } - static void Add(uint64_t key, Table * table) { - if (0 != table->Add(key)) { - throw logic_error("The filter is too small to hold all of the elements"); - } - } - static bool Contain(uint64_t key, const Table * table) { - return (0 == table->Contain(key)); - } -}; - -template <> -struct FilterAPI> { - using Table = SimdBlockFilter<>; - static Table ConstructFromAddCount(size_t add_count) { - Table ans(ceil(log2(add_count * 8.0 / CHAR_BIT))); - return ans; - } - static void Add(uint64_t key, Table* table) { - table->Add(key); - } - static bool Contain(uint64_t key, const Table * table) { - return table->Find(key); - } -}; - template Statistics FilterBenchmark( size_t add_count, const vector& to_add, const vector& to_lookup) { @@ -224,6 +197,10 @@ int main(int argc, char * argv[]) { cout << setw(NAME_WIDTH) << "SemiSort13" << cf << endl; + cf = FilterBenchmark>(add_count, to_add, to_lookup); + + cout << setw(NAME_WIDTH) << "Shingle12" << cf << endl; + cf = FilterBenchmark< CuckooFilter>( add_count, to_add, to_lookup); diff --git a/benchmarks/conext-figure5.cc b/benchmarks/conext-figure5.cc index 2f759de..fc1eb3d 100644 --- a/benchmarks/conext-figure5.cc +++ b/benchmarks/conext-figure5.cc @@ -1,22 +1,24 @@ // This benchmark reproduces the CoNEXT 2014 results found in "Figure 5: Lookup -// performance when a filter achieves its capacity." It takes about two minutes to run on -// an Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz. 
+// performance when a filter achieves its capacity." It takes about three minutes to run +// on an Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz. // // Results: // fraction of queries on existing items/lookup throughput (million OPS) -// CF ss-CF -// 0.00% 24.79 9.37 -// 25.00% 24.65 9.57 -// 50.00% 24.84 9.57 -// 75.00% 24.86 9.62 -// 100.00% 24.89 9.96 +// CF ss-CF Shingle +// 0.00% 26.10 10.07 27.37 +// 25.00% 25.92 10.65 27.40 +// 50.00% 26.00 10.65 27.18 +// 75.00% 25.95 10.79 27.21 +// 100.00% 25.89 10.64 27.28 #include #include #include #include "cuckoofilter.h" +#include "filter-api.h" #include "random.h" +#include "shingle.h" #include "timing.h" using namespace std; @@ -29,14 +31,21 @@ const size_t SAMPLE_SIZE = 1000 * 1000; // The time (in seconds) to lookup SAMPLE_SIZE keys in which 0%, 25%, 50%, 75%, and 100% // of the keys looked up are found. template -array CuckooBenchmark( +array Benchmark( size_t add_count, const vector& to_add, const vector& to_lookup) { - Table cuckoo(add_count); + Table filter = FilterAPI::ConstructFromAddCount(add_count); array result; // Add values until failure or until we run out of values to add: size_t added = 0; - while (added < to_add.size() && 0 == cuckoo.Add(to_add[added])) ++added; + while (added < to_add.size()) { + try { + FilterAPI
::Add(to_add[added], &filter); + } catch (...) { + break; + } + ++added; + } // A value to track to prevent the compiler from optimizing out all lookups: size_t found_count = 0; @@ -44,7 +53,9 @@ array CuckooBenchmark( const auto to_lookup_mixed = MixIn(&to_lookup[0], &to_lookup[SAMPLE_SIZE], &to_add[0], &to_add[added], found_percent); auto start_time = NowNanos(); - for (const auto v : to_lookup_mixed) found_count += (0 == cuckoo.Contain(v)); + for (const auto v : to_lookup_mixed) { + found_count += FilterAPI
::Contain(v, &filter); + } auto lookup_time = NowNanos() - start_time; result[found_percent * 4] = lookup_time / (1000.0 * 1000.0 * 1000.0); } @@ -64,21 +75,24 @@ int main() { const vector to_lookup = GenerateRandom64(SAMPLE_SIZE); // Calculate metrics: - const auto cf = CuckooBenchmark< - CuckooFilter>( + const auto cf = Benchmark>( add_count, to_add, to_lookup); - const auto sscf = CuckooBenchmark< - CuckooFilter>( + const auto sscf = Benchmark>( add_count, to_add, to_lookup); + const auto qcf = Benchmark>(add_count, to_add, to_lookup); cout << "fraction of queries on existing items/lookup throughput (million OPS) " << endl; cout << setw(10) << "" - << " " << setw(10) << right << "CF" << setw(10) << right << "ss-CF" << endl; + << " " << setw(10) << right << "CF" << setw(10) << right << "ss-CF" + << setw(10) << right << "Shingle" << endl; for (const double found_percent : {0.0, 0.25, 0.50, 0.75, 1.00}) { cout << fixed << setprecision(2) << setw(10) << right << 100 * found_percent << "%"; cout << setw(10) << right << (SAMPLE_SIZE / cf[found_percent * 4]) / (1000 * 1000); cout << setw(10) << right << (SAMPLE_SIZE / sscf[found_percent * 4]) / (1000 * 1000); + cout << setw(10) << right << (SAMPLE_SIZE / qcf[found_percent * 4]) / (1000 * 1000); cout << endl; } } diff --git a/benchmarks/filter-api.h b/benchmarks/filter-api.h new file mode 100644 index 0000000..b99a6e5 --- /dev/null +++ b/benchmarks/filter-api.h @@ -0,0 +1,63 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "cuckoofilter.h" +#include "shingle.h" +#include "simd-block.h" + +template +struct FilterAPI {}; + +template class TableType> +struct FilterAPI< + cuckoofilter::CuckooFilter> { + using Table = cuckoofilter::CuckooFilter; + static Table ConstructFromAddCount(std::size_t add_count) { + return Table(add_count); + } + static void Add(std::uint64_t key, Table *table) { + if (0 != table->Add(key)) { + throw std::logic_error( + "The cuckoo filter is too small to hold all of 
the elements"); + } + } + static bool Contain(std::uint64_t key, const Table *table) { + return (0 == table->Contain(key)); + } +}; + +template <> +struct FilterAPI> { + using Table = SimdBlockFilter<>; + static Table ConstructFromAddCount(std::size_t add_count) { + Table ans(std::ceil(std::log2(add_count * 8.0 / CHAR_BIT))); + return ans; + } + static void Add(std::uint64_t key, Table *table) { table->Add(key); } + static bool Contain(std::uint64_t key, const Table *table) { + return table->Find(key); + } +}; + +template +struct FilterAPI> { + using Table = Shingle; + static Table ConstructFromAddCount(size_t add_count) { + return Table(ceil(log2(add_count * 12.75 / 12.0))); + } + static void Add(std::uint64_t key, Table *table) { + if (!table->Add(key)) { + throw std::logic_error( + "The quotient filter is too small to hold all of the elements"); + } + } + static bool Contain(std::uint64_t key, const Table *table) { + return table->Contain(key); + } +}; diff --git a/src/shingle.h b/src/shingle.h new file mode 100644 index 0000000..29d0900 --- /dev/null +++ b/src/shingle.h @@ -0,0 +1,208 @@ +#pragma once + +// Cuckoo filters in which the buckets can overlap. See Lehman, Eric, and Rina +// Panigrahy. "3.5-way cuckoo hashing for the price of 2-and-a-bit." European +// Symposium on Algorithms. Springer, Berlin, Heidelberg, 2009. + +#include +#include +#include +#include + +#include "bitsutil.h" +#include "hashutil.h" + +template +class Shingle { + using uint16_t = ::std::uint16_t; + using uint64_t = ::std::uint64_t; + + // The low-order `bits` bits of the result are 1; all others are 0. + static constexpr uint64_t Mask(int bits) { + return (static_cast(1) << bits) - 1; + } + + // The two halves of the table are stored interleaved, A[0] then B[0] then + // A[1] then B[1], and so on. Each slot has 12 bits, and we store A[i] and + // B[i] together in a `Cell` of three bytes (24 bits). 
We use the eleven + // high-order bits to store a fingerprint and the bottom bit to indicate if + // the fingerprint is offset from the original bucket it hashed to. + // + // In this class and below, methods that can operate on Cells have a + // template parameter that is true if the value from the array A is + // to be manipulated and false if the value from the array B is to be + // manipulated. + // + // The fingerprint 0x0 is reserved to indicate an empty slot. Keys hashing to + // 0x0 are considered to have a hash of 0x1. + + using Cell = ::std::array; + + static_assert(sizeof(Cell[3]) == 9, "Cells are not packed tightly"); + + HashFamily hasher_; + // A and B have the same length, which is a power of 2. imask_ is one less + // than that length + const uint64_t imask_; + // fp_hash_ uses delta-universal hashing (of the multiply-shift type) to + // derive an index in B from the index in A plus a hash of the fingerprint. + const uint64_t fp_hash_; + Cell *const data_; + size_t filled_; // Number of non-empty slots. + + // Get the fingerprint and offset from index i. The table is A if ISA is true. + template + [[gnu::always_inline]] uint64_t Get(uint64_t i) const { + const uint16_t result = + *reinterpret_cast(&data_[i][1 - ISA]); + if (ISA) { + return result & 0x0fff; + } else { + return result >> 4; + } + } + + // Set the fingerprint and offset at index i to the low-order 12 bits of + // x. The table is A if ISA is true. + template + [[gnu::always_inline]] void Set(uint64_t i, uint64_t x) { + uint16_t &result = *reinterpret_cast(&data_[i][1 - ISA]); + if (ISA) { + result = x | (result & 0xf000); + } else { + result = (x << 4) | (result & 0x000f); + } + } + + uint64_t ReIndex(uint64_t idx, uint64_t fp) const { + return (idx ^ ((fp_hash_ * fp) >> 11)) & imask_; + } + + // Set (ISA ? A : B)[idx + offset] = fp and return the index and fingerprint + // that was previously there. 
+ template + [[gnu::always_inline]] void Swap(uint64_t idx, uint64_t offset, uint64_t fp, + uint64_t *result_idx, + uint64_t *result_fp) { + idx += offset; + fp = offset | (fp << 1); + *result_idx = idx; + *result_fp = Get(idx); + if (*result_fp & 1) --*result_idx; + *result_fp >>= 1; + Set(idx, fp); + } + + // Helper function for Add(), below. Places fp in one of its two slots (idx or + // idx+1) in (ISA ? A : B), and recurses if necessary. + template + void AddHelp(uint64_t idx, uint64_t fp) { + for (uint64_t offset : {0, 1}) { + const uint64_t q = idx + offset; + const uint64_t fp_now = Get(q); + if (0 == fp_now) { + uint64_t fp_later = offset | (fp << 1); + Set(q, fp_later); + ++filled_; + return; + } + } + + // Do a short local search to see if some items in the next bucket can be + // pushed to later slots, ala robin-hood linear probing. + if (0 == (Get(idx + 1) & 0x1)) { + if (0 == Get(idx + 2)) { + Set(idx + 2, 0x1 | Get(idx + 1)); + Set(idx + 1, 0x1 | (fp << 1)); + ++filled_; + return; + } else if (0 == (Get(idx + 2) & 0x1)) { + if (0 == Get(idx + 3)) { + Set(idx + 3, 0x1 | Get(idx + 2)); + Set(idx + 2, 0x1 | Get(idx + 1)); + Set(idx + 1, 0x1 | (fp << 1)); + ++filled_; + return; + } + } + } + + // Kick out a random key from the two slots: + uint64_t offset = std::rand() % 2; + // TODO: replace random search with BFS or iterative deepening + Swap(idx, offset, fp, &idx, &fp); + // TODO: replace recursion with iteration + return AddHelp(ReIndex(idx, fp), fp); + } + + // Helper for Delete(), below. Returns true if the key was found. 
+ template + [[gnu::always_inline]] bool DeleteHelp(uint64_t idx, uint64_t fp) { + for (uint64_t offset : {0, 1}) { + uint64_t i = idx + offset, f = offset | (fp << 1); + if (Get(i) == f) { + Set(i, 0); + return true; + } + } + if (ISA) return DeleteHelp(ReIndex(idx, fp), fp); + return false; + } + + public: + explicit Shingle(int log2_slots) + : hasher_(), + // Each array has half of the slots + imask_(Mask(log2_slots - 1)), + fp_hash_([]() { + ::std::random_device random; + uint64_t result = random(); + return (result << 32) | random(); + }()), + // Add two extra Cells at the end so 64-bit operations don't read + // past the end and SEGFAULT. + data_(new Cell[imask_ + 3]()), + filled_(0) {} + + ~Shingle() { delete[] data_; } + + uint64_t SizeInBytes() const { return sizeof(Cell) * (imask_ + 3); } + + bool Add(uint64_t key) { + if ((static_cast(filled_) / (2 * (imask_ + 1))) > (12.0 / 12.75)) { + return false; + } + key = hasher_(key); + uint64_t idx = (key >> 11) & imask_, fp = key & Mask(11); + fp += (0 == fp); // Since 0 is the empty slot, re-target zero remainders. + AddHelp(idx, fp); + return true; + } + + [[gnu::always_inline]] bool Contain(uint64_t key) const { + key = hasher_(key); + uint64_t idx = (key >> 11) & imask_, fp = key & Mask(11); + fp += (fp == 0); + auto idx2 = ReIndex(idx, fp); + constexpr uint64_t A_SLOTS_MASK = Mask(12) + (Mask(12) << 24), + B_SLOTS_MASK = A_SLOTS_MASK << 12; + uint64_t slots = + (~A_SLOTS_MASK) | *reinterpret_cast(&data_[idx]); + auto slots2 = + (~B_SLOTS_MASK) | *reinterpret_cast(&data_[idx2]); + auto slots_all = slots & slots2; + + uint64_t fp_all = fp * 0x002002002002ull; + fp_all |= 0x001001000000ull; + + return haszero12(fp_all ^ slots_all); + } + + bool Delete(uint64_t key) { + key = hasher_(key); + const uint64_t idx = (key >> 11) & imask_; + uint64_t fp = key & Mask(11); + fp += (0 == fp); + return DeleteHelp<>(idx, fp); + } +};