diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 00000000..786c761c --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,17 @@ +{ + "hooks": { + "PostToolUse": [ + { + "matcher": "Edit", + "hooks": [ + { + "type": "command", + "command": "make format-fix 2>/dev/null || true", + "timeout": 30, + "statusMessage": "Running format-fix..." + } + ] + } + ] + } +} diff --git a/.claude/skills/explain-dp/SKILL.md b/.claude/skills/explain-dp/SKILL.md new file mode 100644 index 00000000..352a2437 --- /dev/null +++ b/.claude/skills/explain-dp/SKILL.md @@ -0,0 +1,82 @@ +--- +name: explain-dp +description: Reference material for differential privacy concepts. Auto-loaded when discussing privacy, attacks, sensitivity, or clipping. +--- + +## Differential Privacy (DP) + +### Definition + +A randomized mechanism M satisfies (ε,δ)-differential privacy if for all +neighboring datasets D, D' (differing in one individual) and all outputs S: + + P[M(D) ∈ S] ≤ e^ε · P[M(D') ∈ S] + δ + +Smaller ε = stronger privacy. δ is the probability of catastrophic failure. + +### Key concepts + +- **Sensitivity**: Maximum change in query output when one individual is + added/removed. For SUM with values in [L,U]: sensitivity = U-L. +- **Laplace mechanism**: Add Laplace(0, sensitivity/ε) noise. Standard for counting queries. +- **Gaussian mechanism**: Add N(0, sensitivity²·2ln(1.25/δ)/ε²) noise. Better for composition. +- **Composition**: Running k queries on the same data costs k·ε total (basic), + or O(√k·ε) with advanced composition. +- **Post-processing**: Any function of a DP output is still DP. Free to clip/transform after noise. + +### Membership Inference Attack (MIA) + +The adversary's game: given a query result, determine whether a specific individual +is in the dataset. Attack accuracy = fraction of correct guesses across trials. +50% = random (DP working). >50% = information leakage. + +### Bounded user contribution (Wilson et al. 
2019) + +Standard approach for DP SQL: +1. GROUP BY user_id → compute per-user contribution +2. Clip each user's contribution to [L, U] +3. Sum clipped contributions +4. Add noise calibrated to U-L + +This handles both single-large-value outliers and many-small-values users. +Reference: "Differentially Private SQL with Bounded User Contribution" (Google). + +### How PAC differs from DP + +| | DP | PAC | +|---|---|---| +| **Guarantee type** | Input-independent (worst-case) | Instance-dependent (distribution D) | +| **Noise calibration** | Sensitivity s → noise ∝ s/ε | Variance σ² → noise ∝ σ²/(2β) | +| **White-boxing** | Required (analyze algorithm) | Not needed (black-box simulation) | +| **Composition** | k queries → k·ε (basic) | k queries → Σ MIᵢ (linear, Theorem 2) | +| **Privacy metric** | ε (log-likelihood ratio) | MI (mutual information, in nats) | +| **Conversion** | MI=1/128 ≈ ε=0.25 for prior=50% | See Table 3.2 in thesis | +| **Stable algorithms** | Same noise regardless | Less noise automatically | +| **Outlier impact** | Sensitivity explodes | Variance explodes (same practical problem) | + +Key insight: PAC guarantees are **loose** — the theoretical bound on MIA success +rate is conservative. Empirical attacks achieve lower success than the bound +predicts. This means the bounds are hard to violate. + +### Input clipping (Winsorization) + +Clip individual values to [μ-tσ, μ+tσ] before aggregation. Reduces sensitivity. +Well-established in DP literature. Limitations: doesn't catch users with many +small values (need per-user contribution clipping instead). + +### Privacy-conscious design + +Rather than post-hoc privatization (build algorithm, then add noise), PAC enables +**privacy-conscious design**: optimize algorithm parameters jointly with +the privacy budget. + +Key result: For a privatized estimator with budget B: + MSE = Bias² + (1/(2B) + 1) · Var + error + +This means privatization amplifies the variance by 1/(2B). 
At tight budgets +(small B), the optimal algorithm shifts toward lower-variance (higher-bias) +models. E.g., stronger regularization in ridge regression. + +For databases: this suggests that queries producing high-variance outputs (due to +outliers, small groups, etc.) are inherently harder to privatize. Clipping reduces +variance and thus the noise needed, improving the privacy-utility tradeoff. diff --git a/.claude/skills/explain-pac-ddl/SKILL.md b/.claude/skills/explain-pac-ddl/SKILL.md new file mode 100644 index 00000000..110fbcda --- /dev/null +++ b/.claude/skills/explain-pac-ddl/SKILL.md @@ -0,0 +1,108 @@ +--- +name: explain-pac-ddl +description: Reference for PAC DDL syntax — PAC_KEY, PAC_LINK, PROTECTED, SET PU, and the parser. Auto-loaded when discussing table setup, privacy units, or protected columns. +--- + +## PAC DDL Overview + +PAC extends SQL DDL with privacy annotations. The parser (`src/parser/pac_parser.cpp`, +`src/parser/pac_parser_helpers.cpp`) intercepts CREATE TABLE and ALTER TABLE statements +to extract PAC-specific clauses before forwarding to DuckDB. + +### Privacy Unit (PU) table + +The PU table is the entity being protected (e.g., customer). One row = one individual. + +```sql +-- Mark a table as the privacy unit +ALTER TABLE customer ADD PAC_KEY (c_custkey); +ALTER TABLE customer SET PU; + +-- Protect specific columns from direct projection +ALTER PU TABLE customer ADD PROTECTED (c_acctbal, c_name, c_address); +``` + +- `PAC_KEY (col)`: Designates the column(s) that uniquely identify a privacy unit. + Must be set before `SET PU`. +- `SET PU`: Marks the table as the privacy unit. After this, aggregates on linked + tables get PAC noise. +- `PROTECTED (col1, col2, ...)`: Columns that cannot be directly projected. + Aggregates (SUM, COUNT, AVG) on protected columns go through PAC. 
+
+### Linking tables to the PU
+
+Non-PU tables reference the PU table via foreign-key-like links:
+
+```sql
+ALTER TABLE orders ADD PAC_LINK (o_custkey) REFERENCES customer (c_custkey);
+ALTER TABLE lineitem ADD PAC_LINK (l_orderkey) REFERENCES orders (o_orderkey);
+```
+
+- `PAC_LINK (local_col) REFERENCES table(ref_col)`: Declares how to join this
+  table back to the PU. The compiler uses these links to inject the PU hash
+  into the query plan.
+- Links can be chained: `lineitem → orders → customer`.
+
+### CREATE TABLE syntax (inline)
+
+PAC clauses can be inlined in CREATE TABLE:
+
+```sql
+CREATE PU TABLE employees (
+    id INTEGER,
+    department VARCHAR,
+    salary DECIMAL(10,2),
+    PAC_KEY (id),
+    PROTECTED (salary)
+);
+```
+
+The parser strips PAC_KEY, PAC_LINK, and PROTECTED clauses from the CREATE
+statement, forwards the clean SQL to DuckDB, then applies the PAC metadata
+via ALTER TABLE internally.
+
+### Common mistakes
+
+- `PAC_LINK(col, table, ref)` — wrong. Use `PAC_LINK (col) REFERENCES table(ref)`.
+- `PROTECTED salary` — wrong. Must have parentheses: `PROTECTED (salary)`.
+- ALTER TABLE on a PU table requires `ALTER PU TABLE`, not `ALTER TABLE`.
+
+### Metadata files
+
+PAC metadata (PU tables, links, protected columns) is stored in JSON sidecar files
+next to the database file. The naming convention is:
+
+```
+pac_metadata_<dbname>_<schema>.json
+```
+
+For example, `tpch_sf1.db` produces `pac_metadata_tpch_sf1_main.json` in the same
+directory.
+
+**Auto-loading**: When the PAC extension loads (`LOAD pac`), it automatically looks
+for a matching metadata file next to the attached database and loads it. No manual
+`PRAGMA load_pac_metadata` needed for persistent databases. 
+ +**Saving**: After setting up PAC_KEY/PAC_LINK/PROTECTED, save with: +```sql +PRAGMA save_pac_metadata('/path/to/pac_metadata_mydb_main.json'); +``` + +**Clearing**: Reset all in-memory PAC metadata: +```sql +PRAGMA clear_pac_metadata; +``` + +**Important**: If you delete or recreate a database file, also delete the +corresponding `pac_metadata_*.json` file. Stale metadata causes confusing errors +(references to tables/columns that no longer exist). + +For in-memory databases, metadata file is named `pac_metadata_memory_main.json` +in the current working directory. + +### Key source files + +- `src/parser/pac_parser.cpp` — main parser hook (intercepts SQL statements) +- `src/parser/pac_parser_helpers.cpp` — extraction of PAC_KEY, PAC_LINK, PROTECTED +- `src/core/pac_metadata.cpp` — in-memory metadata storage for PU/link/protected info +- `src/core/pac_extension.cpp` — auto-loading of metadata on extension load (LoadInternal) diff --git a/.claude/skills/explain-pac/SKILL.md b/.claude/skills/explain-pac/SKILL.md new file mode 100644 index 00000000..19ea9124 --- /dev/null +++ b/.claude/skills/explain-pac/SKILL.md @@ -0,0 +1,98 @@ +--- +name: explain-pac +description: Reference material for PAC privacy internals. Auto-loaded when discussing PAC mechanism, noise, counters, or clipping. +--- + +## PAC Privacy Overview + +PAC (Probably Approximately Correct) privacy is a framework for privatizing +algorithms with provable guarantees, described in [SIMD-PAC-DB](https://arxiv.org/abs/2603.15023). + +### Formal definition + +Given a data distribution D, a query Q satisfies (δ, ρ, D)-PAC Privacy if no +adversary who knows D can, after observing Q(X) where X ~ D, produce an +estimate X̂ such that ρ(X̂, X) = 1 with probability ≥ (1-δ). + +The key insight: **noise scales with the variance of the algorithm's output across +random subsamples** of the data. Stable algorithms (low variance) need less noise. + +### The 4-step privatization template + +1. 
**Subsample**: Draw m random 50%-subsets X₁...Xₘ from the full dataset +2. **Compute**: Run the query Q on each subset → outputs y₁...yₘ +3. **Estimate noise**: Compute variance σ² across the yᵢ. Required noise: Δ = σ²/(2β) + where β is the MI budget +4. **Release**: Pick a random subset Xⱼ, return Q(Xⱼ) + N(0, Δ) + +This is the theoretical foundation. SIMD-PAC-DB encodes this efficiently using +64 parallel counters (one per possible subset assignment bit). + +### MI → posterior success rate + +| MI | Max posterior (prior=50%) | Max posterior (prior=25%) | +|----|--------------------------|--------------------------| +| 1/128 | 56.2% | 30.5% | +| 1/64 | 58.8% | 32.9% | +| 1/32 | 62.4% | 36.3% | +| 1/16 | 67.5% | 41.2% | +| 1/8 | 74.5% | 48.2% | +| 1/4 | 83.8% | 58.4% | +| 1/2 | 95.2% | 72.7% | +| 1 | 100% | 91.4% | + +### PAC Composition + +For T adaptive queries with independent random sampling per query, the total +MI is bounded by the sum: MI(total) ≤ Σᵢ MIᵢ. This is linear composition — +each query's MI adds to the budget. The key requirement: **independent random +sampling per query** (each query uses a fresh random subset). + +### PAC vs DP + +- **DP**: input-independent guarantee. Requires white-boxing to compute sensitivity. + Noise ∝ sensitivity/ε. Works for worst-case neighboring datasets. +- **PAC**: instance-dependent guarantee. No white-boxing needed. Noise ∝ Var[Q(X)]/β. + Stable queries get less noise automatically. But the guarantee depends on the + data distribution D. 
+ +### Core mechanism (SIMD-PAC-DB implementation) + +- Each aggregate maintains **64 parallel counters** (one per bit of a hashed key) +- Each row's value is added to ~32 counters (determined by pac_hash of the PU key) +- At finalization, noise calibrated to a **mutual information bound** (pac_mi) is + added, and the result is estimated from the counters +- PAC does NOT compute sensitivity (unlike differential privacy) +- The 64 counters encode m=64 possible subsets in one pass (SIMD-efficient) + +### SWAR bitslice encoding + +- Counters are packed as 4 × uint16_t per uint64_t (SWAR = SIMD Within A Register) +- This enables processing 4 counters per instruction without actual SIMD intrinsics +- Overflow cascades to 32-bit overflow counters when 16-bit counters saturate + +### pac_clip_sum (contribution clipping) + +- **Pre-aggregation**: Query rewriter inserts `GROUP BY pu_hash` to sum each user's + rows into a single contribution (handles the "50K small items" case) +- **Magnitude levels**: Values decomposed into levels (4x per level, 2-bit shift). + Level 0: 0-255, Level 1: 256-1023, Level 2: 1024-4095, etc. 
+- **Bitmap tracking**: Each level maintains a 64-bit bitmap of distinct contributors + (using birthday-paradox estimation from popcount) +- **Hard-zero**: Levels with fewer distinct contributors than `pac_clip_support` + contribute nothing to the result (prevents variance side-channel attacks) + +### Key settings + +- `pac_mi`: Mutual information bound (0 = deterministic/no noise) +- `pac_seed`: RNG seed for reproducible noise +- `pac_clip_support`: Minimum distinct contributors per magnitude level (NULL = disabled) +- `pac_hash_repair`: Ensure pac_hash outputs exactly 32 bits set + +### DDL + +```sql +ALTER TABLE customer ADD PAC_KEY (c_custkey); +ALTER TABLE customer SET PU; +ALTER TABLE orders ADD PAC_LINK (o_custkey) REFERENCES customer (c_custkey); +``` diff --git a/.claude/skills/run-attacks/SKILL.md b/.claude/skills/run-attacks/SKILL.md new file mode 100644 index 00000000..a01e7c13 --- /dev/null +++ b/.claude/skills/run-attacks/SKILL.md @@ -0,0 +1,23 @@ +--- +name: run-attacks +description: Run the pac_clip_sum membership inference attack test suite and summarize results. +--- + +## Context + +PAC (Probably Approximately Correct) privacy privatizes SQL aggregates via 64 parallel +SWAR bitslice counters with MI-bounded noise. pac_clip_sum adds per-user contribution +clipping using magnitude-level decomposition (4x bands, 2-bit shift) with distinct-contributor +bitmaps. Unsupported outlier levels are hard-zeroed to prevent variance side-channel attacks. + +## Instructions + +1. Build if needed: `GEN=ninja make 2>&1 | tail -5` +2. Run the main attack suite: `bash attacks/clip_attack_test.sh 2>/dev/null` +3. Run the multi-row attack: `bash attacks/clip_multirow_test.sh 2>/dev/null` +4. 
Run stress tests if available: `bash attacks/clip_hardzero_stress.sh 2>/dev/null` + +Summarize results as a table: +- Attack scenario, clip_support value, attack accuracy, std_in, std_out, std ratio +- Flag any accuracy above 60% as a potential regression +- Compare to baselines in `attacks/clip_attack_results.md` diff --git a/.claude/skills/shared b/.claude/skills/shared new file mode 160000 index 00000000..9d673ac7 --- /dev/null +++ b/.claude/skills/shared @@ -0,0 +1 @@ +Subproject commit 9d673ac7a0eade0ae0d729e9d6080d5172810728 diff --git a/.claude/skills/test-clip/SKILL.md b/.claude/skills/test-clip/SKILL.md new file mode 100644 index 00000000..66f7dc8a --- /dev/null +++ b/.claude/skills/test-clip/SKILL.md @@ -0,0 +1,11 @@ +--- +name: test-clip +description: Build and run pac_clip_sum unit tests. +--- + +## Instructions + +1. Build: `GEN=ninja make 2>&1 | tail -5` +2. Run clip_sum tests: `build/release/test/unittest "test/sql/pac_clip_sum*" 2>&1` +3. Report: number of assertions passed/failed +4. If any fail, show the failing test name and expected vs actual values diff --git a/.gitmodules b/.gitmodules index e2e8a723..a809e05e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -9,3 +9,6 @@ [submodule "benchmark/sqlstorm/SQLStorm"] path = benchmark/sqlstorm/SQLStorm url = https://github.com/SQL-Storm/SQLStorm.git +[submodule ".claude/skills/shared"] + path = .claude/skills/shared + url = https://github.com/ila/duckdb-claude-skills.git diff --git a/CLAUDE.md b/CLAUDE.md index e497ad57..f457036c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -10,9 +10,19 @@ Always test your changes with real queries (e.g., TPC-H on sf1) before declaring Never execute git commands that could lose code. Always ask the user for permission on those. +## Development rules + +- **New features must have tests.** Ask the user whether to create a new test file or extend an existing one in `test/sql/`. +- **Never remove a failing test to "fix" a failure.** If a test fails, fix the underlying bug. 
Tests exist for a reason. +- **Before implementing anything, search the existing codebase** for similar patterns or solutions. Check if a helper function, utility, or prior approach already addresses the problem. Reuse before reinventing. +- **Use helper functions.** Factor shared logic into helpers rather than duplicating code. Check `src/include/utils/` and existing helpers in the file you're editing. +- **Never edit the `duckdb/` submodule.** The DuckDB source is read-only. All PAC logic lives in `src/` and `test/`. If you need DuckDB internals, use the public API or ask the user. +- **Keep the paper in mind.** The PAC mechanism is described in [SIMD-PAC-DB: Pretty Performant PAC Privacy](https://arxiv.org/abs/2603.15023). Refer to it for the theoretical foundations (noise calibration, mutual information bounds, counter semantics) before making changes to core aggregate logic. +- **Add `PAC_DEBUG_PRINT` statements** at major code flow points (entry/exit of compilation phases, aggregate rewrites, clipping decisions). Use the existing `PAC_DEBUG_PRINT` macro from `src/include/pac_debug.hpp` — it's compiled out when `PAC_DEBUG` is 0. + ## What is PAC? -PAC (Pretty Accurate Counting) is a DuckDB extension that automatically privatizes SQL aggregate queries. It protects against Membership Inference Attacks by maintaining 64 parallel counters per aggregate (one per "world" bit), adding calibrated noise at finalization. Queries are rewritten transparently — users write normal SQL and PAC transforms it. +PAC (Probably Approximately Correct) Privacy, or short: pac, is a DuckDB extension that automatically privatizes SQL aggregate queries. It protects against Membership Inference Attacks by maintaining 64 parallel counters per aggregate (one per "world" bit), adding calibrated noise at finalization. Queries are rewritten transparently — users write normal SQL and PAC transforms it. 
## Build & Test @@ -88,3 +98,36 @@ SET pac_mi = 0; -- disable noise for testing SET pac_seed = 42; -- reproducible results SET pac_clip_support = 40; -- enable clip rewrite with support threshold ``` + +## Code style (clang-tidy) + +The project uses clang-tidy with DuckDB's configuration (`.clang-tidy`). Key naming rules: + +- **Classes/Enums**: `CamelCase` (e.g., `PacClipSumIntState`) +- **Functions**: `CamelCase` (e.g., `GetLevel`, `AllocateLevel`) +- **Variables/parameters/members**: `lower_case` (e.g., `max_level_used`, `key_hash`) +- **Constants/static/constexpr**: `UPPER_CASE` (e.g., `PAC2_NUM_LEVELS`, `PAC2_LEVEL_SHIFT`) +- **Macros**: `UPPER_CASE` (e.g., `PAC_DEBUG_PRINT`) +- **Typedefs**: `lower_case_t` suffix (e.g., `aggregate_update_t`) + +Other style rules (from `.clang-format`, based on LLVM): + +- **Tabs for indentation**, width 4 +- **Column limit**: 120 +- **Braces**: same line as statement (K&R / Allman-attached) +- **Pointers**: right-aligned (`int *ptr`, not `int* ptr`) +- **No short functions on single line** +- **Templates**: always break after `template<...>` +- **Long arguments**: align after open bracket + +Run `make format-fix` to auto-format. Formatting runs automatically via hook after edits. + +## Attack evaluation + +Attack scripts live in `attacks/`. Results are documented in `attacks/clip_attack_results.md`. 
+ +```bash +bash attacks/clip_attack_test.sh 2>/dev/null # main attack suite +bash attacks/clip_multirow_test.sh 2>/dev/null # 20K small items test +bash attacks/clip_hardzero_stress.sh 2>/dev/null # stress tests +``` diff --git a/CMakeLists.txt b/CMakeLists.txt index 93e0c917..fac7d1b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,8 @@ set(EXTENSION_SOURCES src/aggregates/pac_count.cpp src/aggregates/pac_min_max.cpp src/aggregates/pac_sum.cpp + src/aggregates/pac_clip_sum.cpp + src/aggregates/pac_clip_min_max.cpp src/compiler/pac_bitslice_compiler.cpp src/compiler/pac_compiler_helpers.cpp src/query_processing/pac_avg_rewriter.cpp diff --git a/attacks/clip_attack_results.md b/attacks/clip_attack_results.md new file mode 100644 index 00000000..4cd0e8cb --- /dev/null +++ b/attacks/clip_attack_results.md @@ -0,0 +1,382 @@ +# pac_clip_sum Attack Evaluation + +Evaluates whether `pac_clip_sum`'s support-based outlier attenuation defeats the +variance side-channel membership inference attack (MIA). + +## Background + +`pac_clip_sum` (commit 948a504) introduces a two-level aggregation: + +1. **Lower aggregate**: `GROUP BY pu_hash` with plain `SUM` — pre-aggregates per user +2. **Upper aggregate**: `pac_noised_clip_sum` — decomposes values into magnitude levels + (each 16x the previous), tracks a 64-bit bitmap of distinct contributors per level, + and attenuates levels with fewer contributors than `pac_clip_support` + +Supported clip aggregates: SUM, COUNT, MIN, MAX (MIN/MAX not fully implemented). +float/double types not yet supported. 
+ +## Parameters + +- `pac_mi = 0.0078125` (1/128) +- 30 trials per condition unless noted +- Random guess baseline: 50% + +--- + +## Attack 1: Single-query variance classifier (small filter) + +**Setup**: N=1000 users, target=999999, filter<=3 (3-4 users in aggregation) + +| clip_support | Var>200k accuracy | Standard accuracy | std_in | std_out | std ratio | +|-------------|-------------------|-------------------|--------|---------|-----------| +| off | **96.0%** | 80.0% | 10,688,008 | 117,662 | 90.8x | +| 2 | **72.5%** | 62.7% | 613,511 | 122,026 | 5.0x | +| 3 | **72.5%** | 62.7% | 613,511 | 122,026 | 5.0x | + +**Finding**: Clipping reduces attack accuracy from 96% to 72.5% and collapses the +variance ratio from 91x to 5x. The outlier's contribution is attenuated by ~16x +(one magnitude level). However, 72.5% is still well above random — the residual +5x variance gap remains exploitable. + +clip=2 and clip=3 produce identical results because with only 3-4 users per level, +the bitmap support is at most 2-3 regardless of threshold. + +--- + +## Attack 2: Wide filter (clipping best-case) + +**Setup**: N=1000 users, target=999999, filter<=999 (all users in aggregation) + +| clip_support | Var>200k accuracy | Standard accuracy | std_in | std_out | std ratio | +|-------------|-------------------|-------------------|--------|---------|-----------| +| off | 53.3% | 60.0% | 10,424,631 | 1,822,936 | 5.7x | +| 2 | **55.0%** | 53.3% | 1,816,604 | 1,875,047 | **1.0x** | +| 5 | **55.0%** | 53.3% | 1,816,604 | 1,875,047 | **1.0x** | +| 10 | **55.0%** | 53.3% | 1,816,604 | 1,875,047 | **1.0x** | + +**Finding**: With many users in the aggregation, clipping completely eliminates the +variance side-channel (std ratio goes to 1.0x). Attack accuracy drops to ~53-55%, near +random. The outlier's magnitude level has only 1 bitmap contributor vs hundreds at +normal levels — it is cleanly identified and attenuated. 
+ +Note: even without clipping, the attack barely works (53%) because the outlier +signal (1M) is diluted by the large background (5M from 999 users). Clipping +further equalizes the means (5.1M vs 4.9M). + +clip=2, 5, and 10 all produce identical results — with ~1000 users, all normal +levels have saturated bitmaps (est. distinct >> 10), so only the outlier level +is affected. + +--- + +## Attack 3: 10K users, extreme outlier + +**Setup**: N=10000 users, target=9,999,999, filter<=2 (2-3 users in aggregation) + +| clip_support | Var>200k accuracy | Standard accuracy | std_in | std_out | std ratio | +|-------------|-------------------|-------------------|--------|---------|-----------| +| off | **97.8%** | 77.8% | 104,410,859 | 110,259 | 947x | +| 2 | **75.0%** | 47.9% | 396,383 | 112,778 | 3.5x | +| 3 | **47.9%** | 47.9% | 0 | 0 | — | + +**Finding**: clip=2 reduces accuracy from 97.8% to 75% (variance ratio 947x to 3.5x). +clip=3 zeroes ALL results (returns 0 for both in and out) — with only 2-3 users, +no level reaches 3 distinct contributors, so everything is zeroed. The attack is +"defeated" at 47.9% but utility is completely destroyed. + +--- + +## Attack 4: Over-clipping + +**Setup**: N=1000 users, target=999999, filter<=3, 15 trials + +| clip_support | Var>200k accuracy | mean_in | mean_out | +|-------------|-------------------|---------|----------| +| off | 91.3% | -2,067,562 | 42,892 | +| 5 | **50.0%** | 0 | 0 | +| 10 | **50.0%** | 0 | 0 | + +**Finding**: With only 3-4 users in the filter, clip_support >= 5 zeroes all output. +Attack accuracy = 50% (random), but every query returns 0. This is not a useful +defense — it's equivalent to refusing to answer. + +**Takeaway**: `pac_clip_support` must be set below the minimum expected number of +users in any aggregation group. For small filters, this severely limits the +clipping threshold. 
+ +--- + +## Attack 5: Wide filter + aggressive clipping + +**Setup**: N=1000 users, target=999999, filter<=999, 15 trials + +| clip_support | Var>200k accuracy | Standard accuracy | std_in | std_out | +|-------------|-------------------|-------------------|--------|---------| +| off | 53.3% | 63.3% | 11,391,246 | 2,172,216 | +| 50 | **50.0%** | 50.0% | 1,219,189 | 2,049,795 | +| 100 | **50.0%** | 50.0% | 1,219,189 | 2,049,795 | + +**Finding**: With a wide filter (1000 users), even aggressive clipping (support=50, 100) +works perfectly — attack accuracy = 50% (random) and noise stds are equalized. +Normal magnitude levels have hundreds of bitmap contributors, far exceeding the +threshold. Only the outlier level (1 contributor) is affected. + +--- + +## Attack 6: Clip-after-filter vs clip-full-table (Dandan's hypothesis) + +Dandan's concern: clipping applied after filtering may leak more than clipping +applied to the entire dataset, because the filter changes which users contribute +to the bitmap, affecting which levels appear "supported." + +**Setup**: N=1000 users, target=999999, filter<=3 + +| Method | Var>200k accuracy | std_in | std_out | std ratio | +|--------|-------------------|--------|---------|-----------| +| No clipping | **96.0%** | 10,688,008 | 117,662 | 90.8x | +| clip-after-filter (pac_clip_support=2) | **72.5%** | 613,511 | 122,026 | 5.0x | +| clip-full-table (pre-clip to mu+3sigma) | **56.9%** | 180,457 | 124,641 | 1.4x | + +**Finding: Dandan is correct.** Pre-clipping the full table then filtering gives +significantly better protection (56.9% vs 72.5%). The reasons: + +1. **Full-table pre-clipping** clamps the billionaire to 13,661 BEFORE PAC sees it. + PAC computes noise from the clamped range [1, 13661], and the noise is nearly + the same for in/out (std ratio 1.4x). + +2. **Clip-after-filter** only sees 3-4 rows. The bitmap has very few bits set per + level, making it harder to distinguish the outlier level from normal levels. 
+ The attenuation is only ~16x (one level), leaving a 5x variance gap. + +However, clip-after-filter is still much better than no clipping (72.5% vs 96%), +confirming Dandan's second point: "this approach is still significantly better +than not applying clipping at all." + +--- + +## Summary + +| Scenario | No clip | pac_clip_support=2 | Pre-clip full table | +|----------|---------|-------------------|---------------------| +| Small filter (3-4 users) | 96% | 72.5% | 56.9% | +| Wide filter (1000 users) | 53% | 55% (side-channel gone) | — | +| 10K users, filter<=2 | 97.8% | 75% | — | + +## Attack 7: 20K small items user (multi-row outlier) + +The core argument for per-user pre-aggregation: a user with 20,000 purchases of +$50 each has normal individual values but a total contribution of $1,000,000. +Per-row Winsorization won't catch this. Does pac_clip_sum's GROUP BY pu_hash? + +**Setup**: N=1000 background users (1 row each, acctbal in [1,10000]). +Target user_id=0: 20,000 rows x $50 = $1,000,000 total. filter<=3. + +| Method | Var>200k accuracy | std_in | std_out | std ratio | +|--------|-------------------|--------|---------|-----------| +| No clipping | **96.0%** | 10,686,722 | 117,662 | 90.8x | +| Winsorization (per-row clip to 13661) | **94.1%** | 9,436,439 | 124,641 | 75.7x | +| pac_clip_sum (clip_support=2) | **72.5%** | 613,511 | 122,026 | 5.0x | + +**Finding: Winsorization completely fails.** Each $50 value is well within the +[0, 13661] clip bounds, so nothing gets clipped. The 20,000 small rows pass +through untouched and the attack succeeds at 94.1% (barely below the 96% baseline). + +**pac_clip_sum catches it** because the pre-aggregation step (`GROUP BY pu_hash`) +sums user_id=0's 20,000 rows into a single $1,000,000 entry. This lands at an +outlier magnitude level with only 1 bitmap contributor, and gets attenuated. +Attack accuracy drops to 72.5%. 
+ +This is precisely Peter's argument for why per-user contribution clipping (via +pre-aggregation) is needed instead of per-row value clipping. It validates the +two-level aggregation design from Wilson et al. 2019. + +--- + +## Suffix attenuation modes compared + +We tested three suffix attenuation strategies for unsupported outlier levels. +"Soft-clamp" is Peter's original (scale by 16^distance). "Bitmap-proportional" +adds a factor of estimated_distinct/threshold. "Hard-zero" skips the level entirely. + +**Attack 1 results (N=1000, filter<=3, clip=2):** + +| Mode | Var>200k | std_in | std_out | std ratio | +|------|----------|--------|---------|-----------| +| No clipping | **96.0%** | 10,688,008 | 117,662 | 90.8x | +| Soft-clamp | **72.5%** | 613,511 | 122,026 | 5.0x | +| Bitmap-proportional | **66.7%** | 327,625 | 122,026 | 2.7x | +| **Hard zero** | **47.1%** | 106,223 | 122,026 | **0.87x** | + +--- + +## Hard-zero stress tests + +Comprehensive adversarial evaluation of the hard-zero mode. + +### TEST 1: High trial count (60 trials, best-threshold search) + +| truth | mean | std | n | +|-------|------|-----|---| +| in | 12,397 | 87,366 | 49 | +| out | 29,758 | 109,140 | 54 | + +Best threshold accuracy (searched 10k-500k in 10k steps): **52.4%**. +Midpoint classifier: **52.4%**. Likelihood ratio: **52.4%**. +All classifiers are indistinguishable from random. + +### TEST 2: Composed queries (30 trials x 10 queries) + +| n_queries | accuracy | +|-----------|----------| +| 1 | 43.4% | +| 5 | 50.9% | +| 10 | 48.1% | +| Majority vote | **50.0%** | + +Per-trial variance: in_std=83,747, out_std=85,116, **ratio=0.98**. +Composing 10 queries and averaging does not help the attacker. + +### TEST 3: Moderate outlier (target=50,000, same magnitude level as normal) + +**THIS BREAKS IT.** Target 50,000 is in level 2 (4096-65535), same as normal users. +The bitmap shows this level as supported → no clipping occurs. 
+ +| truth | mean | std | +|-------|------|-----| +| in | 139,946 | 497,518 | +| out | 20,093 | 122,026 | + +Best threshold accuracy: **76.5%**. Std ratio: 4.1x. + +**Implication**: pac_clip_sum only clips outliers that are at a DIFFERENT magnitude +level than normal users. A 10x outlier within the same level passes through. + +### TEST 4: Two colluding outliers + +Two users with 999,999 — level 3 has 2 bitmap bits, meeting threshold=2. + +| truth | mean | std | +|-------|------|-----| +| in | 1,783,465 | 12,547,986 | +| out | 20,093 | 122,026 | + +Best threshold accuracy: **100.0%**. Attack fully succeeds. + +**Implication**: Two colluding users at the same magnitude level make that level +"supported." The clipping mechanism assumes outlier levels have few contributors. +Collusion (or any scenario with 2+ users at the same extreme level) defeats it. + +### TEST 5: Dandan's filter probing + +Attacker uses two queries with different filters to probe clipping behavior. + +| Query | Best accuracy | +|-------|--------------| +| Filter<=3 (narrow) | 52.9% | +| Filter<=999 (wide) | 51.7% | +| Cross-filter differential | **51.0%** | + +**Dandan's concern is NOT exploitable with hard-zero.** The narrow query zeroes the +outlier level, giving identical counter distributions for in/out. The wide query +has the outlier's level zeroed too (1 contributor < threshold). The cross-filter +differential reveals nothing. + +### TEST 6: 20K small items ($50 x 20,000) + +Best threshold accuracy: **52.9%**. Attack defeated. +Pre-aggregation collapses 20K rows into one $1M entry at level 3, which is zeroed. + +### TEST 7: Borderline outlier (target=65,536, exactly level 3 boundary) + +Best threshold accuracy: **52.9%**. Attack defeated. +Even the minimum level-3 value is zeroed when it's the sole contributor. + +--- + +### Key takeaways + +1. **Hard-zero fully defeats the variance side-channel** for outliers at unsupported + magnitude levels. 
Attack accuracy = 50% across all classifiers, even with + composed queries, different thresholds, and cross-filter probing. + +2. **Moderate outliers within the same magnitude level are NOT caught.** A 10x outlier + (50,000 vs normal ~5,000) sits in the same level and passes through unclipped. + Attack accuracy: 76.5%. This is a fundamental limitation of the magnitude-level + granularity (each level spans 16x). + +3. **Two colluding outliers defeat the clipping** by making their level "supported" + (2 contributors >= threshold 2). Attack accuracy: 100%. + +4. **Dandan's filter-probing concern does not apply with hard-zero.** The zeroed level + contributes nothing regardless of filter, so different filters reveal no info. + +5. **The pre-aggregation step remains essential** — 20K small items are correctly + collapsed and clipped. + +--- + +## pac_clip_scale comparison (2026-04-02) + +Tests `pac_clip_scale = true` (scale unsupported outlier levels to nearest supported +level) vs `false` (hard-zero / omit). Peter's hypothesis: scaling should be safe +because outliers become a minority in the already-supported bucket. + +Code version: after Peter's refactoring into `pac_clip_aggr.hpp`, CLIP_LEVEL_SHIFT=2 +(4x per level, was 16x previously). 
+ +### Setup + +- N=1000 background users, acctbal ∈ [1, 10000] +- pac_mi = 0.0078125 (1/128), 30 trials per condition +- Background sum: filter<=3 = 18,347; filter<=999 = 4,871,091 + +### Small filter results (filter<=3, clip_support=2) + +| Test | Outlier | scale=false best% | scale=true best% | scale=false std_in | scale=true std_in | std_out | +|------|---------|-------------------|------------------|--------------------|-------------------|---------| +| Extreme | tv=999,999 | **55.6%** | **64.8%** | 94,882 | 169,449 | 107,976 | +| Moderate | tv=50,000 | **55.6%** | **61.1%** | 94,882 | 147,406 | 107,976 | +| Multi-row | 20K×$50 | **55.6%** | **64.8%** | 94,882 | 169,449 | 107,976 | +| Borderline | tv=65,536 | **55.6%** | **57.4%** | 94,882 | 104,640 | 107,976 | + +With **hard-zero** (scale=false): all outlier contributions are completely zeroed. The +"in" distribution is identical to "out" — attack accuracy ~55% (random). No side-channel. + +With **scaling** (scale=true): outlier values are scaled down to the nearest supported +level. This creates a mild variance side-channel (std ratio ~1.57x). Attack accuracy +rises to 57-65% — measurable but not catastrophic. + +### Small filter results (clip_support=10 and 50) + +All conditions return 0 for both in/out (no level reaches 10 or 50 distinct +contributors with only 3-4 users). Scale mode is irrelevant — both modes identical. + +### Wide filter results (filter<=999, clip_support=2) + +| clip | scale | mean_in | std_in | mean_out | std_out | best% | +|------|-------|---------|--------|----------|---------|-------| +| 2 | false | 4,737,114 | 1,516,576 | 4,973,475 | 1,642,654 | 53.3% | +| 2 | true | 4,750,242 | 1,538,381 | 4,973,475 | 1,642,654 | 53.3% | +| 50 | false | 4,734,354 | 1,516,680 | 4,970,423 | 1,643,039 | 53.3% | +| 50 | true | 4,791,594 | 1,539,287 | 5,019,631 | 1,639,877 | 55.0% | + +No meaningful difference. Both modes near random with wide filters. + +### Key findings + +1. 
**Peter is partially right**: scaling does NOT cause a variance explosion. The leak + is moderate (~10 percentage points above random at worst), not catastrophic. + His intuition that scaled values become a minority in the supported bucket holds. + +2. **Hard-zero is still strictly better for privacy**: it produces zero information + leakage in all tested scenarios. Scaling leaks mildly. + +3. **Major improvement from 4x granularity**: the moderate outlier (tv=50,000) that + previously defeated clipping at 76.5% accuracy (when levels were 16x wide, both + 5000 and 50000 in same level) is now caught by both modes (~55-61%). The shift + from CLIP_LEVEL_SHIFT=4 (16x) to CLIP_LEVEL_SHIFT=2 (4x) dramatically improved + detection of moderate outliers. + +4. **Recommendation**: keep hard-zero as default (no leakage), but scaling is a + reasonable option where utility matters more. The ~10pp accuracy gap may be + acceptable in many threat models. diff --git a/attacks/clip_attack_test.sh b/attacks/clip_attack_test.sh new file mode 100755 index 00000000..e37d47cb --- /dev/null +++ b/attacks/clip_attack_test.sh @@ -0,0 +1,196 @@ +#!/usr/bin/env bash +# Test pac_clip_sum against membership inference attacks. 
+set -euo pipefail + +DUCKDB="/home/ila/Code/pac/build/release/duckdb" +PAC_EXT="/home/ila/Code/pac/build/release/extension/pac/pac.duckdb_extension" + +run_sum() { + local cond=$1 seed=$2 n_users=$3 target_val=$4 filter=$5 clip=$6 + local insert="" + [ "$cond" = "in" ] && insert="INSERT INTO users VALUES (0, ${target_val});" + local clip_sql="" + [ "$clip" != "off" ] && clip_sql="SET pac_clip_support = ${clip};" + $DUCKDB -noheader -list 2>/dev/null <> "$IN_F" + echo "out,$(run_sum out $seed $n $tv $filt $clip)" >> "$OUT_F" + done + + echo "=== $label | N=$n filt<=$filt tv=$tv clip=$clip ===" + $DUCKDB -markdown < ${FBG} + ${tv}/2.0 THEN 1 + WHEN truth='out' AND v <= ${FBG} + ${tv}/2.0 THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*)) AS accuracy +FROM raw WHERE v IS NOT NULL +UNION ALL +SELECT 'Var>200k', + printf('%.1f%%', 100.0*SUM(CASE + WHEN truth='in' AND ABS(v - ${FBG}) > 200000 THEN 1 + WHEN truth='out' AND ABS(v - ${FBG}) <= 200000 THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*)) +FROM raw WHERE v IS NOT NULL; +SQL + echo "" + rm -f "$IN_F" "$OUT_F" +} + +NT=30 + +echo "==========================================" +echo " pac_clip_sum ATTACK EVALUATION" +echo "==========================================" +echo "" + +# --- Attack 1: Baseline variance classifier (simplest) --- +echo "## ATTACK 1: Single-query variance classifier" +echo "N=1000, target=999999, filter<=3, $NT trials" +echo "" +for CLIP in off 2 3; do + run_scenario "atk1" 1000 999999 3 "$CLIP" $NT +done + +# --- Attack 2: Wide filter (clipping best-case) --- +echo "## ATTACK 2: Wide filter (all users in aggregation)" +echo "N=1000, target=999999, filter<=999, $NT trials" +echo "" +for CLIP in off 2 5 10; do + run_scenario "atk2" 1000 999999 999 "$CLIP" $NT +done + +# --- Attack 3: 10K users --- +echo "## ATTACK 3: 10K users, extreme outlier" +echo "N=10000, target=9999999, filter<=2, $NT trials" +echo "" +for CLIP in off 2 3; do + run_scenario "atk3" 10000 9999999 2 "$CLIP" $NT +done + +# --- Attack 4: 
Over-clipping ---
echo "## ATTACK 4: Over-clipping (too aggressive)"
echo "N=1000, target=999999, filter<=3, 15 trials"
echo "clip_support=10 with only 3-4 users => no supported levels"
echo ""
for CLIP in off 5 10; do
  run_scenario "atk4" 1000 999999 3 "$CLIP" 15
done

# --- Attack 5: Wide filter + over-clipping ---
echo "## ATTACK 5: Wide filter + aggressive clipping"
echo "N=1000, target=999999, filter<=999, 15 trials"
echo ""
for CLIP in off 50 100; do
  run_scenario "atk5" 1000 999999 999 "$CLIP" 15
done

# --- Attack 6: Clip after filter vs clip on full table (Dandan's concern) ---
# pac_clip_sum clips AFTER filtering (only filtered rows enter the aggregate).
# An adversary might exploit this: the clipping behavior differs depending on
# which users are in the filter. Compare filter-then-clip (what pac_clip_sum does)
# vs clip-all-then-filter (manual pre-clipping of the full table, then query).
echo "## ATTACK 6: Clip-after-filter vs clip-full-table"
echo "N=1000, target=999999, filter<=3, $NT trials"
echo "Tests Dandan's hypothesis: clipping after filtering leaks more than"
echo "clipping the entire dataset. We compare pac_clip_sum (clips filtered rows)"
echo "vs manual pre-clipping of all rows then querying without clip_support."
echo ""

# 6a: pac_clip_sum (clip after filter) — already covered in atk1 clip=2
echo "### 6a: clip-after-filter (pac_clip_support=2)"
echo "(Same as Attack 1 clip=2)"
echo ""

# 6b: clip-full-table-then-query (no pac_clip_support, but data is pre-clipped)
run_sum_preclipped() {
  local cond=$1 seed=$2 n_users=$3 target_val=$4 filter=$5 clip_support=$6
  local insert=""
  [ "$cond" = "in" ] && insert="INSERT INTO users VALUES (0, ${target_val});"
  # Pre-clip ALL rows at percentile bounds (simulating clip-on-full-table)
  # Clip every row at mu + 3*sigma = 13661, matching the Winsorization bound used below
  # This ensures the billionaire is clipped BEFORE filtering. 
+ $DUCKDB -noheader -list 2>/dev/null < 13661; +ALTER TABLE users ADD PAC_KEY(user_id); +ALTER TABLE users SET PU; +SET pac_mi = 0.0078125; +SET pac_seed = ${seed}; +SELECT SUM(acctbal) FROM users WHERE user_id <= ${filter} OR user_id = 0; +SQL +} + +echo "### 6b: clip-full-table-then-query (pre-clip all to mu+3sigma=13661)" +echo "" +FBG_CLIP=$($DUCKDB -noheader -list -c \ + "SELECT SUM(LEAST((hash(i*31+7)%10000+1)::INTEGER, 13661)) FROM generate_series(1,3) t(i);" | tr -d '[:space:]') +IN_F=$(mktemp); OUT_F=$(mktemp) +for seed in $(seq 1 $NT); do + echo "in,$(run_sum_preclipped in $seed 1000 999999 3 off)" >> "$IN_F" + echo "out,$(run_sum_preclipped out $seed 1000 999999 3 off)" >> "$OUT_F" +done +echo "=== atk6b | N=1000 filt<=3 tv=999999 pre-clip=13661 ===" +$DUCKDB -markdown < ${FBG_CLIP} + 999999/2.0 THEN 1 + WHEN truth='out' AND v <= ${FBG_CLIP} + 999999/2.0 THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*)) AS accuracy +FROM raw WHERE v IS NOT NULL +UNION ALL +SELECT 'Var>200k', + printf('%.1f%%', 100.0*SUM(CASE + WHEN truth='in' AND ABS(v - ${FBG_CLIP}) > 200000 THEN 1 + WHEN truth='out' AND ABS(v - ${FBG_CLIP}) <= 200000 THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*)) +FROM raw WHERE v IS NOT NULL; +SQL +echo "" +rm -f "$IN_F" "$OUT_F" diff --git a/attacks/clip_hardzero_stress.sh b/attacks/clip_hardzero_stress.sh new file mode 100755 index 00000000..33e78154 --- /dev/null +++ b/attacks/clip_hardzero_stress.sh @@ -0,0 +1,297 @@ +#!/usr/bin/env bash +# Stress-test hard-zero clipping: try every attack angle we can think of. 
+set -euo pipefail + +DUCKDB="/home/ila/Code/pac/build/release/duckdb" +PAC_EXT="/home/ila/Code/pac/build/release/extension/pac/pac.duckdb_extension" +CLIP=2 + +run_sum() { + local cond=$1 seed=$2 n_users=$3 target_val=$4 filter=$5 extra_sql="${6:-}" + local insert="" + [ "$cond" = "in" ] && insert="$target_val" + $DUCKDB -noheader -list 2>/dev/null < threshold THEN 1 + WHEN truth='out' AND ABS(v - ${fbg}) <= threshold THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*) AS acc + FROM raw, generate_series(10000, 500000, 10000) thresholds(threshold) + WHERE v IS NOT NULL + GROUP BY threshold +); + +-- Mean-based classifier (v > midpoint) +SELECT 'Midpoint clf' AS clf, + printf('%.1f%%', 100.0*SUM(CASE + WHEN truth='in' AND v > (${fbg} + ${fbg} + ${tv}) / 2.0 THEN 1 + WHEN truth='out' AND v <= (${fbg} + ${fbg} + ${tv}) / 2.0 THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*)) AS accuracy +FROM raw WHERE v IS NOT NULL; + +-- Likelihood ratio: compare distance to expected in vs expected out +SELECT 'LR clf' AS clf, + printf('%.1f%%', 100.0*SUM(CASE + WHEN truth='in' AND ABS(v - ${fbg}::DOUBLE - ${tv}) < ABS(v - ${fbg}::DOUBLE) THEN 1 + WHEN truth='out' AND ABS(v - ${fbg}::DOUBLE - ${tv}) >= ABS(v - ${fbg}::DOUBLE) THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*)) AS accuracy +FROM raw WHERE v IS NOT NULL; +SQL + echo "" +} + +analyze_composed() { + local label=$1 in_f=$2 out_f=$3 fbg=$4 tv=$5 nq=$6 + echo "=== $label ===" + $DUCKDB -markdown < ${fbg} + ${tv}/2.0 THEN 1 + WHEN truth='out' AND ravg <= ${fbg} + ${tv}/2.0 THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*)) AS accuracy +FROM cum GROUP BY qid ORDER BY qid; + +-- Majority vote +WITH votes AS ( + SELECT truth, trial, + SUM(CASE WHEN v > ${fbg} + ${tv}/2.0 THEN 1 ELSE 0 END) AS yes, COUNT(*) AS total + FROM raw WHERE v IS NOT NULL GROUP BY truth, trial +) +SELECT 'Majority vote (${nq}q)' AS method, + printf('%.1f%%', 100.0*SUM(CASE + WHEN truth='in' AND yes > total/2.0 THEN 1 + WHEN truth='out' AND yes <= total/2.0 THEN 1 + ELSE 0 END)::DOUBLE / 
 COUNT(*)) AS accuracy
FROM votes;

-- Variance-based: use per-trial variance across queries
WITH trial_stats AS (
  SELECT truth, trial, STDDEV(v) AS trial_std
  FROM raw WHERE v IS NOT NULL GROUP BY truth, trial
)
SELECT 'Variance of queries' AS method,
  printf('in_std=%.0f out_std=%.0f ratio=%.2f',
    AVG(CASE WHEN truth='in' THEN trial_std END),
    AVG(CASE WHEN truth='out' THEN trial_std END),
    AVG(CASE WHEN truth='in' THEN trial_std END) /
    NULLIF(AVG(CASE WHEN truth='out' THEN trial_std END), 0)) AS stats
FROM trial_stats;
SQL
  echo ""
}

FBG=$($DUCKDB -noheader -list -c \
  "SELECT SUM((hash(i*31+7)%10000+1)::INTEGER) FROM generate_series(1,3) t(i);" | tr -d '[:space:]')
FBG999=$($DUCKDB -noheader -list -c \
  "SELECT SUM((hash(i*31+7)%10000+1)::INTEGER) FROM generate_series(1,999) t(i);" | tr -d '[:space:]')

echo "============================================="
echo " HARD-ZERO STRESS TEST"
echo " pac_clip_support=$CLIP, trying to break it"
echo "============================================="
echo "Background: filter<=3 sum=$FBG, filter<=999 sum=$FBG999"
echo ""

# ---------------------------------------------------------------
# TEST 1: High trial count (60 trials for statistical power)
# ---------------------------------------------------------------
echo "## TEST 1: High trial count (60 trials)"
echo ""
NT=60
IN_F=$(mktemp); OUT_F=$(mktemp)
for seed in $(seq 1 $NT); do
  echo "in,$(run_sum in $seed 1000 "INSERT INTO users VALUES (0, 999999);" 3)" >> "$IN_F"
  echo "out,$(run_sum out $seed 1000 "" 3)" >> "$OUT_F"
done
analyze "60 trials, N=1000, tv=999999, filt<=3" "$IN_F" "$OUT_F" "$FBG" 999999
rm -f "$IN_F" "$OUT_F"

# ---------------------------------------------------------------
# TEST 2: Composed queries (10 queries, reduce noise by averaging)
# ---------------------------------------------------------------
echo "## TEST 2: Composed queries (10 queries)"
echo ""
NT=30; NQ=10
IN_F=$(mktemp); OUT_F=$(mktemp) 
+
for trial in $(seq 1 $NT); do
  for q in $(seq 1 $NQ); do
    s=$((trial * 1000 + q))
    echo "in,${trial},${q},$(run_sum in $s 1000 "INSERT INTO users VALUES (0, 999999);" 3)" >> "$IN_F"
    echo "out,${trial},${q},$(run_sum out $s 1000 "" 3)" >> "$OUT_F"
  done
done
analyze_composed "30 trials x 10 queries" "$IN_F" "$OUT_F" "$FBG" 999999 10
rm -f "$IN_F" "$OUT_F"

# ---------------------------------------------------------------
# TEST 3: Moderate outlier (50000 — same magnitude level as normal)
# ---------------------------------------------------------------
echo "## TEST 3: Moderate outlier (target=50000, same magnitude level)"
echo "Normal users ~5000, target ~50000 — both in level 2 (4096-65535)"
echo "The bitmap should show this level as supported, so NO clipping occurs"
echo ""
NT=30
IN_F=$(mktemp); OUT_F=$(mktemp)
for seed in $(seq 1 $NT); do
  echo "in,$(run_sum in $seed 1000 "INSERT INTO users VALUES (0, 50000);" 3)" >> "$IN_F"
  echo "out,$(run_sum out $seed 1000 "" 3)" >> "$OUT_F"
done
analyze "Moderate outlier tv=50000" "$IN_F" "$OUT_F" "$FBG" 50000
rm -f "$IN_F" "$OUT_F"

# ---------------------------------------------------------------
# TEST 4: Two colluding outliers
# ---------------------------------------------------------------
echo "## TEST 4: Two colluding outliers"
echo "Two users with 999999 — level 3 now has 2 bitmap bits (meets threshold=2)"
echo "Hard-zero might NOT clip because level has enough support!"
echo ""
NT=30
IN_F=$(mktemp); OUT_F=$(mktemp)
TWO_INSERT="INSERT INTO users VALUES (0, 999999); INSERT INTO users VALUES (-1, 999999);"
for seed in $(seq 1 $NT); do
  echo "in,$(run_sum in $seed 1000 "$TWO_INSERT" 3)" >> "$IN_F"
  echo "out,$(run_sum out $seed 1000 "" 3)" >> "$OUT_F"
done
# For two outliers: "in" filter catches user 0 (user -1 is NOT in filter <= 3)
# But user -1's value still goes into the table and affects the bitmap! 
+analyze "Two outliers (0 and -1), filt<=3" "$IN_F" "$OUT_F" "$FBG" 999999 +rm -f "$IN_F" "$OUT_F" + +# --------------------------------------------------------------- +# TEST 5: Dandan's filter probing attack +# --------------------------------------------------------------- +echo "## TEST 5: Dandan's filter probing" +echo "Attacker tries different filters to see if clipping behavior changes." +echo "If the outlier is present, the bitmap at level 3 has a bit set." +echo "Query 1: filter<=3 (includes user 0 if present)" +echo "Query 2: filter<=999 (includes everyone)" +echo "Difference in results might reveal membership." +echo "" +NT=30 +IN_F1=$(mktemp); OUT_F1=$(mktemp) +IN_F2=$(mktemp); OUT_F2=$(mktemp) +for seed in $(seq 1 $NT); do + echo "in,$(run_sum in $seed 1000 "INSERT INTO users VALUES (0, 999999);" 3)" >> "$IN_F1" + echo "out,$(run_sum out $seed 1000 "" 3)" >> "$OUT_F1" + echo "in,$(run_sum in $((seed+10000)) 1000 "INSERT INTO users VALUES (0, 999999);" 999)" >> "$IN_F2" + echo "out,$(run_sum out $((seed+10000)) 1000 "" 999)" >> "$OUT_F2" +done +analyze "Filter<=3 (narrow)" "$IN_F1" "$OUT_F1" "$FBG" 999999 +analyze "Filter<=999 (wide)" "$IN_F2" "$OUT_F2" "$FBG999" 999999 + +echo "=== Cross-filter differential ===" +$DUCKDB -markdown < 0 THEN 1 + WHEN n.truth='out' AND w.v - n.v <= 0 THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*)) AS accuracy +FROM narrow n JOIN wide w ON n.truth = w.truth AND n.trial = w.trial +WHERE n.v IS NOT NULL AND w.v IS NOT NULL; +SQL +echo "" +rm -f "$IN_F1" "$OUT_F1" "$IN_F2" "$OUT_F2" + +# --------------------------------------------------------------- +# TEST 6: 20K small items with high trial count +# --------------------------------------------------------------- +echo "## TEST 6: 20K small items, 50 trials" +echo "" +NT=30 +IN_F=$(mktemp); OUT_F=$(mktemp) +MULTI_INSERT="INSERT INTO users SELECT 0, 50 FROM generate_series(1,20000) t(i);" +for seed in $(seq 1 $NT); do + echo "in,$(run_sum in $seed 1000 "$MULTI_INSERT" 3)" >> "$IN_F" 
+ echo "out,$(run_sum out $seed 1000 "" 3)" >> "$OUT_F" +done +analyze "20K items x \$50, filt<=3" "$IN_F" "$OUT_F" "$FBG" 1000000 +rm -f "$IN_F" "$OUT_F" + +# --------------------------------------------------------------- +# TEST 7: Borderline outlier (value at level boundary) +# --------------------------------------------------------------- +echo "## TEST 7: Borderline outlier (target=65536, exactly level 3 boundary)" +echo "Just barely crosses into level 3 — minimum unsupported value" +echo "" +NT=30 +IN_F=$(mktemp); OUT_F=$(mktemp) +for seed in $(seq 1 $NT); do + echo "in,$(run_sum in $seed 1000 "INSERT INTO users VALUES (0, 65536);" 3)" >> "$IN_F" + echo "out,$(run_sum out $seed 1000 "" 3)" >> "$OUT_F" +done +analyze "Borderline tv=65536" "$IN_F" "$OUT_F" "$FBG" 65536 +rm -f "$IN_F" "$OUT_F" + +echo "=============================================" +echo " STRESS TEST COMPLETE" +echo "=============================================" diff --git a/attacks/clip_multirow_test.sh b/attacks/clip_multirow_test.sh new file mode 100755 index 00000000..c818360f --- /dev/null +++ b/attacks/clip_multirow_test.sh @@ -0,0 +1,136 @@ +#!/usr/bin/env bash +# Attack 7: "20K small items" user — tests whether clipping catches +# a user whose individual rows are normal but total contribution is huge. 
+set -euo pipefail + +DUCKDB="/home/ila/Code/pac/build/release/duckdb" +PAC_EXT="/home/ila/Code/pac/build/release/extension/pac/pac.duckdb_extension" +NT=30 +N=1000; FILT=3; TV_PER_ROW=50; TV_ROWS=20000 +# Total contribution: 20000 * 50 = 1,000,000 + +echo "=============================================" +echo " ATTACK 7: 20K small items user" +echo "=============================================" +echo " N=$N background users (1 row each)" +echo " Target user_id=0: $TV_ROWS rows x \$$TV_PER_ROW = \$$(( TV_ROWS * TV_PER_ROW ))" +echo " filter<=3, $NT trials" +echo "" + +FBG=$($DUCKDB -noheader -list -c \ + "SELECT SUM((hash(i*31+7)%10000+1)::INTEGER) FROM generate_series(1,${FILT}) t(i);" | tr -d '[:space:]') +TV_TOTAL=$((TV_ROWS * TV_PER_ROW)) +echo "Background SUM=$FBG, target total=$TV_TOTAL" +echo "" + +# --- 7a: No clipping (baseline) --- +run_noprotection() { + local cond=$1 seed=$2 insert="" + [ "$cond" = "in" ] && insert="INSERT INTO users SELECT 0, ${TV_PER_ROW} FROM generate_series(1,${TV_ROWS}) t(i);" + $DUCKDB -noheader -list 2>/dev/null </dev/null < 13661; +ALTER TABLE users ADD PAC_KEY(user_id); +ALTER TABLE users SET PU; +SET pac_mi = 0.0078125; +SET pac_seed = ${seed}; +SELECT SUM(acctbal) FROM users WHERE user_id <= ${FILT} OR user_id = 0; +SQL +} + +# --- 7c: pac_clip_sum (clip after filter, with pre-aggregation) --- +run_clipsum() { + local cond=$1 seed=$2 clip=$3 insert="" + [ "$cond" = "in" ] && insert="INSERT INTO users SELECT 0, ${TV_PER_ROW} FROM generate_series(1,${TV_ROWS}) t(i);" + $DUCKDB -noheader -list 2>/dev/null < ${FBG} + ${TV_TOTAL}/2.0 THEN 1 + WHEN truth='out' AND v <= ${FBG} + ${TV_TOTAL}/2.0 THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*)) AS accuracy +FROM raw WHERE v IS NOT NULL +UNION ALL +SELECT 'Var>200k', + printf('%.1f%%', 100.0*SUM(CASE + WHEN truth='in' AND ABS(v - ${FBG}) > 200000 THEN 1 + WHEN truth='out' AND ABS(v - ${FBG}) <= 200000 THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*)) +FROM raw WHERE v IS NOT NULL; +SQL + echo "" +} + 
+# 7a: No clipping +IN_F=$(mktemp); OUT_F=$(mktemp) +for seed in $(seq 1 $NT); do + echo "in,$(run_noprotection in $seed)" >> "$IN_F" + echo "out,$(run_noprotection out $seed)" >> "$OUT_F" +done +analyze "7a: No clipping (baseline)" "$IN_F" "$OUT_F" +rm -f "$IN_F" "$OUT_F" + +# 7b: Full-table Winsorization +IN_F=$(mktemp); OUT_F=$(mktemp) +for seed in $(seq 1 $NT); do + echo "in,$(run_winsorized in $seed)" >> "$IN_F" + echo "out,$(run_winsorized out $seed)" >> "$OUT_F" +done +analyze "7b: Winsorization (per-row clip to 13661)" "$IN_F" "$OUT_F" +rm -f "$IN_F" "$OUT_F" + +# 7c: pac_clip_sum with clip_support=2 +IN_F=$(mktemp); OUT_F=$(mktemp) +for seed in $(seq 1 $NT); do + echo "in,$(run_clipsum in $seed 2)" >> "$IN_F" + echo "out,$(run_clipsum out $seed 2)" >> "$OUT_F" +done +analyze "7c: pac_clip_sum (clip_support=2)" "$IN_F" "$OUT_F" +rm -f "$IN_F" "$OUT_F" diff --git a/attacks/clip_scale_test.sh b/attacks/clip_scale_test.sh new file mode 100644 index 00000000..a12319e0 --- /dev/null +++ b/attacks/clip_scale_test.sh @@ -0,0 +1,152 @@ +#!/usr/bin/env bash +# Test pac_clip_scale=true (scale outliers to nearest supported level) +# vs default false (hard-zero / omit). Peter's hypothesis: scaling should +# be safe because outliers become a minority in an already-supported bucket. 
+set -euo pipefail + +DUCKDB="/home/ila/Code/pac/build/release/duckdb" +PAC_EXT="/home/ila/Code/pac/build/release/extension/pac/pac.duckdb_extension" + +run_sum() { + local cond=$1 seed=$2 n_users=$3 target_val=$4 filter=$5 clip=$6 scale=$7 + local insert="" + [ "$cond" = "in" ] && insert="$target_val" + $DUCKDB -noheader -list 2>/dev/null < threshold THEN 1 + WHEN truth='out' AND ABS(v - ${fbg}) <= threshold THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*) AS acc + FROM raw, generate_series(10000, 500000, 10000) thresholds(threshold) + WHERE v IS NOT NULL + GROUP BY threshold +); + +-- Midpoint classifier +SELECT 'Midpoint clf' AS clf, + printf('%.1f%%', 100.0*SUM(CASE + WHEN truth='in' AND v > (${fbg} + ${fbg} + ${tv}) / 2.0 THEN 1 + WHEN truth='out' AND v <= (${fbg} + ${fbg} + ${tv}) / 2.0 THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*)) AS accuracy +FROM raw WHERE v IS NOT NULL; + +-- Likelihood ratio +SELECT 'LR clf' AS clf, + printf('%.1f%%', 100.0*SUM(CASE + WHEN truth='in' AND ABS(v - ${fbg}::DOUBLE - ${tv}) < ABS(v - ${fbg}::DOUBLE) THEN 1 + WHEN truth='out' AND ABS(v - ${fbg}::DOUBLE - ${tv}) >= ABS(v - ${fbg}::DOUBLE) THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*)) AS accuracy +FROM raw WHERE v IS NOT NULL; +SQL + echo "" +} + +FBG=$($DUCKDB -noheader -list -c \ + "SELECT SUM((hash(i*31+7)%10000+1)::INTEGER) FROM generate_series(1,3) t(i);" | tr -d '[:space:]') +FBG999=$($DUCKDB -noheader -list -c \ + "SELECT SUM((hash(i*31+7)%10000+1)::INTEGER) FROM generate_series(1,999) t(i);" | tr -d '[:space:]') + +echo "=================================================================" +echo " pac_clip_scale COMPARISON TEST" +echo " Comparing scale=true (Peter's preference) vs scale=false (hard-zero)" +echo "=================================================================" +echo "Background: filter<=3 sum=$FBG, filter<=999 sum=$FBG999" +echo "" + +run_test() { + local test_name=$1 n_users=$2 target_sql=$3 filter=$4 tv=$5 fbg=$6 + local nt=30 + + for clip in 2 10 50; do + for scale in 
false true; do + local label="${test_name} [clip=${clip}, scale=${scale}]" + IN_F=$(mktemp); OUT_F=$(mktemp) + for seed in $(seq 1 $nt); do + echo "in,$(run_sum in $seed $n_users "$target_sql" $filter $clip $scale)" >> "$IN_F" + echo "out,$(run_sum out $seed $n_users "" $filter $clip $scale)" >> "$OUT_F" + done + analyze "$label" "$IN_F" "$OUT_F" "$fbg" "$tv" + rm -f "$IN_F" "$OUT_F" + done + done +} + +# --------------------------------------------------------------- +# TEST 1: Extreme outlier, small filter +# Hard-zero baseline: 52.4% (random) at clip=2 +# --------------------------------------------------------------- +echo "## TEST 1: Extreme outlier (tv=999999), small filter (<=3), N=1000" +echo "" +run_test "T1" 1000 "INSERT INTO users VALUES (0, 999999);" 3 999999 "$FBG" + +# --------------------------------------------------------------- +# TEST 2: Extreme outlier, wide filter +# Hard-zero baseline: ~55% at clip=2 +# --------------------------------------------------------------- +echo "## TEST 2: Extreme outlier (tv=999999), wide filter (<=999), N=1000" +echo "" +run_test "T2" 1000 "INSERT INTO users VALUES (0, 999999);" 999 999999 "$FBG999" + +# --------------------------------------------------------------- +# TEST 3: Moderate outlier (same magnitude level as normal users) +# Hard-zero baseline: 76.5% at clip=2 (already leaks) +# --------------------------------------------------------------- +echo "## TEST 3: Moderate outlier (tv=50000), small filter (<=3)" +echo "Normal users ~5000, target ~50000 — both in level 2 (4096-65535)" +echo "" +run_test "T3" 1000 "INSERT INTO users VALUES (0, 50000);" 3 50000 "$FBG" + +# --------------------------------------------------------------- +# TEST 4: 20K small items (multi-row outlier) +# Hard-zero baseline: 52.9% at clip=2 +# --------------------------------------------------------------- +echo "## TEST 4: 20K small items (50 x 20000 = 1M), small filter (<=3)" +echo "" +run_test "T4" 1000 "INSERT INTO users SELECT 
0, 50 FROM generate_series(1,20000) t(i);" 3 1000000 "$FBG" + +# --------------------------------------------------------------- +# TEST 5: Borderline outlier (exactly at level boundary) +# Hard-zero baseline: 52.9% at clip=2 +# --------------------------------------------------------------- +echo "## TEST 5: Borderline outlier (tv=65536, level 3 boundary), small filter (<=3)" +echo "" +run_test "T5" 1000 "INSERT INTO users VALUES (0, 65536);" 3 65536 "$FBG" + +echo "=================================================================" +echo " SCALE TEST COMPLETE" +echo "=================================================================" diff --git a/attacks/clip_shift2_stress.sh b/attacks/clip_shift2_stress.sh new file mode 100755 index 00000000..88567708 --- /dev/null +++ b/attacks/clip_shift2_stress.sh @@ -0,0 +1,241 @@ +#!/usr/bin/env bash +# Stress-test shift=2 (4x levels) with hard-zero clipping. +# Focus on edge cases that 4x granularity might miss. +set -euo pipefail + +DUCKDB="/home/ila/Code/pac/build/release/duckdb" +PAC_EXT="/home/ila/Code/pac/build/release/extension/pac/pac.duckdb_extension" +CLIP=2 + +run_sum() { + local cond=$1 seed=$2 n_users=$3 target_insert="$4" filter=$5 + $DUCKDB -noheader -list 2>/dev/null < threshold THEN 1 + WHEN truth='out' AND ABS(v - ${fbg}) <= threshold THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*) AS acc + FROM raw, generate_series(5000, 500000, 5000) thresholds(threshold) + WHERE v IS NOT NULL + GROUP BY threshold +); +SQL + echo "" +} + +FBG=$($DUCKDB -noheader -list -c \ + "SELECT SUM((hash(i*31+7)%10000+1)::INTEGER) FROM generate_series(1,3) t(i);" | tr -d '[:space:]') + +echo "=============================================" +echo " SHIFT=2 STRESS TEST (4x levels, hard-zero)" +echo " pac_clip_support=$CLIP" +echo "=============================================" +echo "" + +NT=30 + +# --------------------------------------------------------------- +# TEST 1: 3.5x outlier (within 4x boundary — should NOT be caught) +# Normal ~5000, 
target=17000 (3.4x) +# Both in level 3 (4096-16383)? Let's check: +# 5000: bit_pos=12, (12-4)>>1 = 4. Level 4. +# 17000: bit_pos=14, (14-4)>>1 = 5. Level 5. DIFFERENT! +# Actually 17000 might be caught. Let's try 15000: +# 15000: bit_pos=13, (13-4)>>1 = 4. Level 4. SAME as 5000! +# --------------------------------------------------------------- +echo "## TEST 1: 3x outlier (target=15000, same level as normal)" +echo "5000→level 4, 15000→level 4 (bit_pos 13, (13-4)/2=4). Same level." +echo "" +IN_F=$(mktemp); OUT_F=$(mktemp) +for seed in $(seq 1 $NT); do + echo "in,$(run_sum in $seed 1000 "INSERT INTO users VALUES (0, 15000);" 3)" >> "$IN_F" + echo "out,$(run_sum out $seed 1000 "" 3)" >> "$OUT_F" +done +analyze "3x outlier tv=15000" "$IN_F" "$OUT_F" "$FBG" 15000 +rm -f "$IN_F" "$OUT_F" + +# --------------------------------------------------------------- +# TEST 2: Just above 4x (target=20000) +# 20000: bit_pos=14, (14-4)>>1 = 5. Level 5. Different from 5000 (level 4). +# --------------------------------------------------------------- +echo "## TEST 2: 4x outlier (target=20000, different level)" +echo "5000→level 4, 20000→level 5. Should be caught." +echo "" +IN_F=$(mktemp); OUT_F=$(mktemp) +for seed in $(seq 1 $NT); do + echo "in,$(run_sum in $seed 1000 "INSERT INTO users VALUES (0, 20000);" 3)" >> "$IN_F" + echo "out,$(run_sum out $seed 1000 "" 3)" >> "$OUT_F" +done +analyze "4x outlier tv=20000" "$IN_F" "$OUT_F" "$FBG" 20000 +rm -f "$IN_F" "$OUT_F" + +# --------------------------------------------------------------- +# TEST 3: Two colluding outliers (still breaks it?) 
+# --------------------------------------------------------------- +echo "## TEST 3: Two colluding outliers (999999)" +echo "" +IN_F=$(mktemp); OUT_F=$(mktemp) +for seed in $(seq 1 $NT); do + echo "in,$(run_sum in $seed 1000 "INSERT INTO users VALUES (0, 999999); INSERT INTO users VALUES (-1, 999999);" 3)" >> "$IN_F" + echo "out,$(run_sum out $seed 1000 "" 3)" >> "$OUT_F" +done +analyze "Two colluders tv=999999" "$IN_F" "$OUT_F" "$FBG" 999999 +rm -f "$IN_F" "$OUT_F" + +# --------------------------------------------------------------- +# TEST 4: Outlier at exact level boundary (16384 = start of level 5) +# 16384: bit_pos=14, (14-4)>>1 = 5. Normal at level 4. +# --------------------------------------------------------------- +echo "## TEST 4: Boundary outlier (target=16384, exact level 5 start)" +echo "" +IN_F=$(mktemp); OUT_F=$(mktemp) +for seed in $(seq 1 $NT); do + echo "in,$(run_sum in $seed 1000 "INSERT INTO users VALUES (0, 16384);" 3)" >> "$IN_F" + echo "out,$(run_sum out $seed 1000 "" 3)" >> "$OUT_F" +done +analyze "Boundary tv=16384" "$IN_F" "$OUT_F" "$FBG" 16384 +rm -f "$IN_F" "$OUT_F" + +# --------------------------------------------------------------- +# TEST 5: Outlier just below boundary (16383 = max of level 4) +# 16383: bit_pos=13, (13-4)>>1 = 4. Same level as 5000. 
+# --------------------------------------------------------------- +echo "## TEST 5: Just-below-boundary (target=16383, still level 4)" +echo "" +IN_F=$(mktemp); OUT_F=$(mktemp) +for seed in $(seq 1 $NT); do + echo "in,$(run_sum in $seed 1000 "INSERT INTO users VALUES (0, 16383);" 3)" >> "$IN_F" + echo "out,$(run_sum out $seed 1000 "" 3)" >> "$OUT_F" +done +analyze "Just-below tv=16383" "$IN_F" "$OUT_F" "$FBG" 16383 +rm -f "$IN_F" "$OUT_F" + +# --------------------------------------------------------------- +# TEST 6: Many small outliers (10 users at 15000, all in same level) +# They all go to level 4 like normal users → supported → no clipping +# --------------------------------------------------------------- +echo "## TEST 6: 10 users at 15000 (3x, same level, all 'supported')" +echo "" +IN_F=$(mktemp); OUT_F=$(mktemp) +MULTI="INSERT INTO users SELECT -(i+1), 15000 FROM generate_series(1,10) t(i); INSERT INTO users VALUES (0, 15000);" +for seed in $(seq 1 $NT); do + echo "in,$(run_sum in $seed 1000 "$MULTI" 3)" >> "$IN_F" + echo "out,$(run_sum out $seed 1000 "" 3)" >> "$OUT_F" +done +analyze "10 users at 15000" "$IN_F" "$OUT_F" "$FBG" 15000 +rm -f "$IN_F" "$OUT_F" + +# --------------------------------------------------------------- +# TEST 7: Wide filter + moderate outlier (best case for clipping) +# --------------------------------------------------------------- +FBG999=$($DUCKDB -noheader -list -c \ + "SELECT SUM((hash(i*31+7)%10000+1)::INTEGER) FROM generate_series(1,999) t(i);" | tr -d '[:space:]') +echo "## TEST 7: Wide filter + moderate outlier (tv=50000)" +echo "" +IN_F=$(mktemp); OUT_F=$(mktemp) +for seed in $(seq 1 $NT); do + echo "in,$(run_sum in $seed 1000 "INSERT INTO users VALUES (0, 50000);" 999)" >> "$IN_F" + echo "out,$(run_sum out $seed 1000 "" 999)" >> "$OUT_F" +done +analyze "Wide filter tv=50000" "$IN_F" "$OUT_F" "$FBG999" 50000 +rm -f "$IN_F" "$OUT_F" + +# --------------------------------------------------------------- +# TEST 8: 20K small 
items (pre-aggregation test) +# --------------------------------------------------------------- +echo "## TEST 8: 20K small items (\$50 x 20000 = \$1M)" +echo "" +IN_F=$(mktemp); OUT_F=$(mktemp) +for seed in $(seq 1 $NT); do + echo "in,$(run_sum in $seed 1000 "INSERT INTO users SELECT 0, 50 FROM generate_series(1,20000) t(i);" 3)" >> "$IN_F" + echo "out,$(run_sum out $seed 1000 "" 3)" >> "$OUT_F" +done +analyze "20K items" "$IN_F" "$OUT_F" "$FBG" 1000000 +rm -f "$IN_F" "$OUT_F" + +# --------------------------------------------------------------- +# TEST 9: Composed queries (10 queries averaged) +# --------------------------------------------------------------- +echo "## TEST 9: 10 composed queries, tv=999999" +echo "" +IN_F=$(mktemp); OUT_F=$(mktemp) +for trial in $(seq 1 $NT); do + for q in $(seq 1 10); do + s=$((trial * 1000 + q)) + echo "in,${trial},${q},$(run_sum in $s 1000 "INSERT INTO users VALUES (0, 999999);" 3)" >> "$IN_F" + echo "out,${trial},${q},$(run_sum out $s 1000 "" 3)" >> "$OUT_F" + done +done +echo "=== 10 composed queries ===" +$DUCKDB -markdown < ${FBG} + 999999/2.0 THEN 1 ELSE 0 END) AS yes, COUNT(*) AS total + FROM raw WHERE v IS NOT NULL GROUP BY truth, trial +) +SELECT 'Majority vote (10q)' AS method, + printf('%.1f%%', 100.0*SUM(CASE + WHEN truth='in' AND yes > total/2.0 THEN 1 + WHEN truth='out' AND yes <= total/2.0 THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*)) AS accuracy +FROM votes; + +WITH trial_stats AS ( + SELECT truth, trial, STDDEV(v) AS trial_std + FROM raw WHERE v IS NOT NULL GROUP BY truth, trial +) +SELECT 'Per-trial variance' AS method, + printf('in_std=%.0f out_std=%.0f ratio=%.2f', + AVG(CASE WHEN truth='in' THEN trial_std END), + AVG(CASE WHEN truth='out' THEN trial_std END), + AVG(CASE WHEN truth='in' THEN trial_std END) / + NULLIF(AVG(CASE WHEN truth='out' THEN trial_std END), 0)) AS stats +FROM trial_stats; +SQL +echo "" +rm -f "$IN_F" "$OUT_F" + +echo "=============================================" +echo " STRESS TEST 
COMPLETE" +echo "=============================================" diff --git a/attacks/clipping_experiment.sh b/attacks/clipping_experiment.sh new file mode 100755 index 00000000..ef2c7842 --- /dev/null +++ b/attacks/clipping_experiment.sh @@ -0,0 +1,215 @@ +#!/usr/bin/env bash +# Clipping experiment: does pre-PAC outlier clipping reduce attack success + improve utility? +# +# For each clipping threshold t in {1, 2, 3, 5, inf}, clips data at μ ± t·σ +# (recursive until convergence), then runs PAC and measures attack accuracy + utility. +set -euo pipefail + +DUCKDB="/home/ila/Code/pac/build/release/duckdb" +PAC_EXT="/home/ila/Code/pac/build/release/extension/pac/pac.duckdb_extension" + +N=1000; TV=999999; FILT=3; NTRIALS=30 +CLIP_ITERS=20 +T_VALUES="1 2 3 5 inf" + +echo "============================================================" +echo " CLIPPING EXPERIMENT" +echo " Does pre-PAC outlier clipping reduce attack accuracy?" +echo "============================================================" +echo " N=$N users, target=$TV, filter<=$FILT, $NTRIALS trials" +echo " Clipping thresholds (t): $T_VALUES" +echo "============================================================" +echo "" + +# Generate SQL for recursive clipping: CLIP_ITERS rounds of UPDATE at μ ± t·σ. +# Each round recomputes μ,σ from the current data. The WHERE clause ensures +# convergence — once all values are within bounds, subsequent rounds are no-ops. 
+gen_clip_sql() { + local t=$1 sql="" i + for i in $(seq 1 $CLIP_ITERS); do + sql+="UPDATE users SET acctbal = LEAST(GREATEST(acctbal, + (SELECT (AVG(acctbal) - ${t} * STDDEV_POP(acctbal))::INTEGER FROM users)), + (SELECT (AVG(acctbal) + ${t} * STDDEV_POP(acctbal))::INTEGER FROM users)) +WHERE acctbal < (SELECT (AVG(acctbal) - ${t} * STDDEV_POP(acctbal))::INTEGER FROM users) + OR acctbal > (SELECT (AVG(acctbal) + ${t} * STDDEV_POP(acctbal))::INTEGER FROM users); +" + done + printf '%s' "$sql" +} + +# Ground truth: no clipping, no PAC noise +run_true_unclipped() { + local cond=$1 insert="" + [ "$cond" = "in" ] && insert="INSERT INTO users VALUES (0, ${TV});" + $DUCKDB -noheader -list </dev/null <> "$GROUND_F" + echo "${t},out,${CLIP_OUT}" >> "$GROUND_F" + + # Run PAC trials + for seed in $(seq 1 $NTRIALS); do + v_in=$(run_clipped_pac in "$seed" "$t" | tr -d '[:space:]') + v_out=$(run_clipped_pac out "$seed" "$t" | tr -d '[:space:]') + echo "${t},in,${seed},${v_in}" >> "$RESULTS_F" + echo "${t},out,${seed},${v_out}" >> "$RESULTS_F" + printf "." 
+ done + echo " done" +done + +echo "" +echo "============================================================" +echo " RESULTS" +echo "============================================================" +echo "" + +# --- Analysis --- +$DUCKDB -markdown < 200000 + 100.0 * SUM(CASE + WHEN r.truth='in' AND ABS(r.v - ${TRUE_OUT}) > 200000 THEN 1 + WHEN r.truth='out' AND ABS(r.v - ${TRUE_OUT}) <= 200000 THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*) AS var_acc, + -- Midpoint classifier: v > (clipped_in + clipped_out) / 2 + 100.0 * SUM(CASE + WHEN r.truth='in' AND r.v > (g_in.clipped_true + g_out.clipped_true) / 2.0 THEN 1 + WHEN r.truth='out' AND r.v <= (g_in.clipped_true + g_out.clipped_true) / 2.0 THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*) AS mid_acc, + STDDEV(CASE WHEN r.truth='in' THEN r.v END) AS std_in, + STDDEV(CASE WHEN r.truth='out' THEN r.v END) AS std_out + FROM results r + JOIN ground g_in ON r.t_val = g_in.t_val AND g_in.truth = 'in' + JOIN ground g_out ON r.t_val = g_out.t_val AND g_out.truth = 'out' + WHERE r.v IS NOT NULL + GROUP BY r.t_val, g_in.clipped_true, g_out.clipped_true +), +utility AS ( + SELECT r.t_val, + AVG(CASE WHEN r.truth='in' + THEN ABS(r.v - ${TRUE_IN}::DOUBLE) / NULLIF(ABS(${TRUE_IN}::DOUBLE), 0) * 100 + ELSE ABS(r.v - ${TRUE_OUT}::DOUBLE) / NULLIF(ABS(${TRUE_OUT}::DOUBLE), 0) * 100 + END) AS mape + FROM results r + WHERE r.v IS NOT NULL + GROUP BY r.t_val +), +bias AS ( + SELECT g.t_val, + MAX(CASE WHEN g.truth='in' THEN ABS(g.clipped_true - ${TRUE_IN}::DOUBLE) END) AS bias_in, + MAX(CASE WHEN g.truth='out' THEN ABS(g.clipped_true - ${TRUE_OUT}::DOUBLE) END) AS bias_out + FROM ground g + GROUP BY g.t_val +) +SELECT a.t_val AS t, + printf('%.1f%%', a.var_acc) AS attack_acc_200k, + printf('%.1f%%', a.mid_acc) AS attack_acc_mid, + printf('%.1f%%', ut.mape) AS mape_vs_true, + printf('%.0f', a.std_in) AS noise_std_in, + printf('%.0f', a.std_out) AS noise_std_out, + printf('%.0f', b.bias_in) AS clip_bias_in, + printf('%.0f', b.bias_out) AS clip_bias_out +FROM 
attack a +JOIN utility ut ON a.t_val = ut.t_val +JOIN bias b ON a.t_val = b.t_val +ORDER BY CASE a.t_val WHEN 'inf' THEN 999 ELSE a.t_val::INT END; + +-- Detailed per-condition stats +SELECT r.t_val AS t, r.truth, + printf('%.0f', AVG(r.v)) AS mean_pac, + printf('%.0f', STDDEV(r.v)) AS std_pac, + printf('%.0f', g.clipped_true) AS truth_clipped, + COUNT(*) AS n +FROM results r +JOIN ground g ON r.t_val = g.t_val AND r.truth = g.truth +WHERE r.v IS NOT NULL +GROUP BY r.t_val, r.truth, g.clipped_true +ORDER BY CASE r.t_val WHEN 'inf' THEN 999 ELSE r.t_val::INT END, r.truth; +SQL + +rm -f "$RESULTS_F" "$GROUND_F" + +echo "" +echo "============================================================" +echo " INTERPRETATION" +echo "============================================================" +echo " attack_acc_200k: variance classifier (|v - bg| > 200k), 50% = random" +echo " attack_acc_mid: midpoint classifier (optimal threshold), 50% = random" +echo " mape_vs_true: mean |noised - unclipped_truth| / |unclipped_truth|" +echo " clip_bias_in/out: |clipped_truth - unclipped_truth|" +echo " Unclipped truths: in=$TRUE_IN out=$TRUE_OUT" +echo "============================================================" diff --git a/attacks/output_clipping_experiment.sh b/attacks/output_clipping_experiment.sh new file mode 100755 index 00000000..2fb161a4 --- /dev/null +++ b/attacks/output_clipping_experiment.sh @@ -0,0 +1,244 @@ +#!/usr/bin/env bash +# Output clipping experiment: clip the PAC query RESULT (not input data) at bounds +# derived from baseline column statistics. +# +# Unlike input clipping (which modifies stored values before PAC), output clipping +# leaves the data untouched. After PAC returns a noised result, we clamp it to +# [n·(μ - t·σ), n·(μ + t·σ)] where μ,σ are pre-computed column stats and n is the +# expected number of users in the filter. +# +# Key property: if the billionaire is NOT in the filter, the result is already +# within bounds and nothing changes. 
Clipping only fires when an outlier inflates +# the result beyond the expected range. +set -euo pipefail + +DUCKDB="/home/ila/Code/pac/build/release/duckdb" +PAC_EXT="/home/ila/Code/pac/build/release/extension/pac/pac.duckdb_extension" + +N=1000; TV=999999; FILT=3; NTRIALS=30 +T_VALUES="1 2 3 5 inf" + +echo "============================================================" +echo " OUTPUT CLIPPING EXPERIMENT" +echo " Clip the PAC result post-hoc at n·(μ ± t·σ)" +echo "============================================================" +echo " N=$N users, target=$TV, filter<=$FILT, $NTRIALS trials" +echo " Clipping thresholds (t): $T_VALUES" +echo "============================================================" +echo "" + +# --- Baseline column statistics (from N users, NO target) --- +# These represent the "known" column distribution used for clipping bounds. +read MU SIGMA <<< "$($DUCKDB -noheader -csv -separator ' ' </dev/null <> "$RESULTS_F" + echo "out,${seed},${v_out}" >> "$RESULTS_F" + printf "." 
+done +echo " done" + +echo "" +echo "============================================================" +echo " RESULTS" +echo "============================================================" +echo "" + +# --- Analysis: for each t, apply output clipping and compute metrics --- +$DUCKDB -markdown < 200000 + 100.0 * SUM(CASE + WHEN c.truth='in' AND ABS(c.v - ${TRUE_OUT}) > 200000 THEN 1 + WHEN c.truth='out' AND ABS(c.v - ${TRUE_OUT}) <= 200000 THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*) AS var_acc, + -- Midpoint classifier: v > (clipped_in_truth + clipped_out_truth) / 2 + 100.0 * SUM(CASE + WHEN c.truth='in' AND c.v > (g_in.clipped_true + g_out.clipped_true) / 2.0 THEN 1 + WHEN c.truth='out' AND c.v <= (g_in.clipped_true + g_out.clipped_true) / 2.0 THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*) AS mid_acc, + STDDEV(CASE WHEN c.truth='in' THEN c.v END) AS std_in, + STDDEV(CASE WHEN c.truth='out' THEN c.v END) AS std_out, + -- Fraction of results that hit a clip bound + 100.0 * SUM(CASE WHEN c.truth='in' AND (c.v = c.lo OR c.v = c.hi) THEN 1 ELSE 0 END)::DOUBLE + / NULLIF(SUM(CASE WHEN c.truth='in' THEN 1 ELSE 0 END), 0) AS pct_clipped_in, + 100.0 * SUM(CASE WHEN c.truth='out' AND (c.v = c.lo OR c.v = c.hi) THEN 1 ELSE 0 END)::DOUBLE + / NULLIF(SUM(CASE WHEN c.truth='out' THEN 1 ELSE 0 END), 0) AS pct_clipped_out + FROM clipped c + JOIN ground g_in ON c.t_val = g_in.t_val AND g_in.truth = 'in' + JOIN ground g_out ON c.t_val = g_out.t_val AND g_out.truth = 'out' + GROUP BY c.t_val, g_in.clipped_true, g_out.clipped_true +), +utility AS ( + SELECT c.t_val, + AVG(CASE WHEN c.truth='in' + THEN ABS(c.v - ${TRUE_IN}::DOUBLE) / NULLIF(ABS(${TRUE_IN}::DOUBLE), 0) * 100 + ELSE ABS(c.v - ${TRUE_OUT}::DOUBLE) / NULLIF(ABS(${TRUE_OUT}::DOUBLE), 0) * 100 + END) AS mape + FROM clipped c + GROUP BY c.t_val +), +bias AS ( + SELECT g.t_val, + MAX(CASE WHEN g.truth='in' THEN ABS(g.clipped_true - ${TRUE_IN}::DOUBLE) END) AS bias_in, + MAX(CASE WHEN g.truth='out' THEN ABS(g.clipped_true - 
${TRUE_OUT}::DOUBLE) END) AS bias_out + FROM ground g + GROUP BY g.t_val +) +SELECT CASE WHEN a.t_val = 999 THEN 'inf' ELSE a.t_val::VARCHAR END AS t, + printf('%.1f%%', a.var_acc) AS attack_acc_200k, + printf('%.1f%%', a.mid_acc) AS attack_acc_mid, + printf('%.1f%%', ut.mape) AS mape_vs_true, + printf('%.0f', a.std_in) AS noise_std_in, + printf('%.0f', a.std_out) AS noise_std_out, + printf('%.0f', b.bias_in) AS clip_bias_in, + printf('%.0f', b.bias_out) AS clip_bias_out, + printf('%.0f%%', a.pct_clipped_in) AS pct_clip_in, + printf('%.0f%%', a.pct_clipped_out) AS pct_clip_out +FROM attack a +JOIN utility ut ON a.t_val = ut.t_val +JOIN bias b ON a.t_val = b.t_val +ORDER BY CASE WHEN a.t_val = 999 THEN 999 ELSE a.t_val END; + +-- Detailed per-condition stats +SELECT CASE WHEN c.t_val = 999 THEN 'inf' ELSE c.t_val::VARCHAR END AS t, + c.truth, + printf('%.0f', AVG(c.v)) AS mean_clipped, + printf('%.0f', STDDEV(c.v)) AS std_clipped, + printf('%.0f', AVG(c.raw_v)) AS mean_raw, + printf('%.0f', STDDEV(c.raw_v)) AS std_raw, + printf('%.0f', g.clipped_true) AS truth_clipped, + COUNT(*) AS n +FROM clipped c +JOIN ground g ON c.t_val = g.t_val AND c.truth = g.truth +GROUP BY c.t_val, c.truth, g.clipped_true +ORDER BY CASE WHEN c.t_val = 999 THEN 999 ELSE c.t_val END, c.truth; +SQL + +rm -f "$RESULTS_F" + +echo "" +echo "============================================================" +echo " INTERPRETATION" +echo "============================================================" +echo " Output clipping: CLAMP(pac_result, n·(μ-tσ), n·(μ+tσ))" +echo " Bounds use baseline stats (μ=$MU, σ=$SIGMA, n=$N_BASE)" +echo " attack_acc_200k: variance classifier, 50% = random" +echo " attack_acc_mid: midpoint classifier, 50% = random" +echo " pct_clip_in/out: fraction of results hitting a bound" +echo " Unclipped truths: in=$TRUE_IN out=$TRUE_OUT" +echo "============================================================" diff --git a/attacks/output_clipping_v2_experiment.sh 
b/attacks/output_clipping_v2_experiment.sh new file mode 100755 index 00000000..9fb3ac71 --- /dev/null +++ b/attacks/output_clipping_v2_experiment.sh @@ -0,0 +1,220 @@ +#!/usr/bin/env bash +# Output clipping v2: clip values at QUERY TIME using pre-computed baseline bounds, +# BEFORE PAC computes sensitivity. +# +# Pipeline per query: +# 1. Data is stored unmodified (billionaire's 999999 stays as-is) +# 2. At query time, clamp each value to [μ-tσ, μ+tσ] (baseline stats, single pass) +# 3. PAC computes sensitivity from the CLIPPED range → noise ∝ 2tσ +# 4. Return noised result +# +# Key property: bounds are identical for in/out (derived from baseline, not current data), +# so PAC calibrates the SAME noise regardless of membership. No side-channel from +# differing sensitivities. +# +# Simulated by: UPDATE + PAC_KEY in the same session. The UPDATE models the query-time +# clamping; PAC then sees the clipped range for sensitivity. +set -euo pipefail + +DUCKDB="/home/ila/Code/pac/build/release/duckdb" +PAC_EXT="/home/ila/Code/pac/build/release/extension/pac/pac.duckdb_extension" + +N=1000; TV=999999; FILT=3; NTRIALS=30 +T_VALUES="1 2 3 5 inf" + +echo "============================================================" +echo " OUTPUT CLIPPING v2" +echo " Clip at query time, BEFORE PAC sensitivity computation" +echo "============================================================" +echo " N=$N users, target=$TV, filter<=$FILT, $NTRIALS trials" +echo " Clipping thresholds (t): $T_VALUES" +echo "============================================================" +echo "" + +# --- Baseline column stats (from N users, NO target) --- +read MU SIGMA <<< "$($DUCKDB -noheader -csv -separator ' ' </dev/null <> "$GROUND_F" + echo "${t},out,${CLIP_OUT}" >> "$GROUND_F" + + # Run PAC trials + for seed in $(seq 1 $NTRIALS); do + v_in=$(run_clipped_pac in "$seed" "$t" | tr -d '[:space:]') + v_out=$(run_clipped_pac out "$seed" "$t" | tr -d '[:space:]') + echo "${t},in,${seed},${v_in}" >> "$RESULTS_F" + 
echo "${t},out,${seed},${v_out}" >> "$RESULTS_F" + printf "." + done + echo " done" +done + +echo "" +echo "============================================================" +echo " RESULTS" +echo "============================================================" +echo "" + +# --- Analysis --- +$DUCKDB -markdown < 200000 THEN 1 + WHEN r.truth='out' AND ABS(r.v - ${TRUE_OUT}) <= 200000 THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*) AS var_acc, + 100.0 * SUM(CASE + WHEN r.truth='in' AND r.v > (g_in.clipped_true + g_out.clipped_true) / 2.0 THEN 1 + WHEN r.truth='out' AND r.v <= (g_in.clipped_true + g_out.clipped_true) / 2.0 THEN 1 + ELSE 0 END)::DOUBLE / COUNT(*) AS mid_acc, + STDDEV(CASE WHEN r.truth='in' THEN r.v END) AS std_in, + STDDEV(CASE WHEN r.truth='out' THEN r.v END) AS std_out + FROM results r + JOIN ground g_in ON r.t_val = g_in.t_val AND g_in.truth = 'in' + JOIN ground g_out ON r.t_val = g_out.t_val AND g_out.truth = 'out' + WHERE r.v IS NOT NULL + GROUP BY r.t_val, g_in.clipped_true, g_out.clipped_true +), +utility AS ( + SELECT r.t_val, + AVG(CASE WHEN r.truth='in' + THEN ABS(r.v - ${TRUE_IN}::DOUBLE) / NULLIF(ABS(${TRUE_IN}::DOUBLE), 0) * 100 + ELSE ABS(r.v - ${TRUE_OUT}::DOUBLE) / NULLIF(ABS(${TRUE_OUT}::DOUBLE), 0) * 100 + END) AS mape + FROM results r + WHERE r.v IS NOT NULL + GROUP BY r.t_val +), +bias AS ( + SELECT g.t_val, + MAX(CASE WHEN g.truth='in' THEN ABS(g.clipped_true - ${TRUE_IN}::DOUBLE) END) AS bias_in, + MAX(CASE WHEN g.truth='out' THEN ABS(g.clipped_true - ${TRUE_OUT}::DOUBLE) END) AS bias_out + FROM ground g + GROUP BY g.t_val +) +SELECT a.t_val AS t, + printf('%.1f%%', a.var_acc) AS attack_acc_200k, + printf('%.1f%%', a.mid_acc) AS attack_acc_mid, + printf('%.1f%%', ut.mape) AS mape_vs_true, + printf('%.0f', a.std_in) AS noise_std_in, + printf('%.0f', a.std_out) AS noise_std_out, + printf('%.1fx', a.std_in / NULLIF(a.std_out, 0)) AS std_ratio, + printf('%.0f', b.bias_in) AS clip_bias_in, + printf('%.0f', b.bias_out) AS clip_bias_out +FROM attack a 
+JOIN utility ut ON a.t_val = ut.t_val +JOIN bias b ON a.t_val = b.t_val +ORDER BY CASE a.t_val WHEN 'inf' THEN 999 ELSE a.t_val::INT END; + +-- Detailed per-condition stats +SELECT r.t_val AS t, r.truth, + printf('%.0f', AVG(r.v)) AS mean_pac, + printf('%.0f', STDDEV(r.v)) AS std_pac, + printf('%.0f', g.clipped_true) AS truth_clipped, + COUNT(*) AS n +FROM results r +JOIN ground g ON r.t_val = g.t_val AND r.truth = g.truth +WHERE r.v IS NOT NULL +GROUP BY r.t_val, r.truth, g.clipped_true +ORDER BY CASE r.t_val WHEN 'inf' THEN 999 ELSE r.t_val::INT END, r.truth; +SQL + +rm -f "$RESULTS_F" "$GROUND_F" + +echo "" +echo "============================================================" +echo " INTERPRETATION" +echo "============================================================" +echo " Single-pass clip at [μ-tσ, μ+tσ] using baseline stats" +echo " μ=$MU, σ=$SIGMA (from $N background users)" +echo " PAC sees clipped range → sensitivity = 2tσ for both in/out" +echo " std_ratio: noise_std_in / noise_std_out (1.0 = no side-channel)" +echo " Unclipped truths: in=$TRUE_IN out=$TRUE_OUT" +echo "============================================================" diff --git a/src/aggregates/pac_clip_min_max.cpp b/src/aggregates/pac_clip_min_max.cpp new file mode 100644 index 00000000..e1228563 --- /dev/null +++ b/src/aggregates/pac_clip_min_max.cpp @@ -0,0 +1,797 @@ +#include "aggregates/pac_clip_min_max.hpp" +#include "categorical/pac_categorical.hpp" +#include "duckdb/common/types/decimal.hpp" +#include "duckdb/parser/parsed_data/create_aggregate_function_info.hpp" +#include + +namespace duckdb { + +// ============================================================================ +// Inner state update: always unsigned (caller provides abs value) +// ============================================================================ +template +AUTOVECTORIZE inline void PacClipMinMaxUpdateOneInternal(PacClipMinMaxIntState &state, uint64_t key_hash, + uint64_t value, ArenaAllocator &allocator) { 
+ state.key_hash |= key_hash; + + int level = PacClipMinMaxIntState::GetLevel(value); + int shift = level << 1; + uint8_t shifted_val = static_cast((value >> shift) & 0xFF); + + state.EnsureLevelAllocated(allocator, level); + uint64_t *buf = state.levels[level]; + + // Set bitmap bit (always, even if BOUNDOPT skips the extreme update) + buf[PCMM_SWAR] |= (1ULL << (key_hash >> 58)); + + // BOUNDOPT: skip expensive SIMD update if value can't improve any extreme at this level + if (!PAC_IS_BETTER(shifted_val, state.level_bounds[level])) { + return; + } + state.UpdateExtreme(buf, shifted_val, key_hash); + + // Periodically recompute bound + if ((state.update_count & (BOUND_RECOMPUTE_INTERVAL - 1)) == 0) { + state.RecomputeBound(level); + } +} + +// ============================================================================ +// Route signed value to pos or neg state (two-sided, like clip_sum) +// ============================================================================ +template +inline void PacClipMinMaxRouteValue(PacClipMinMaxStateWrapper &wrapper, + PacClipMinMaxIntState *pos_state, uint64_t hash, int64_t value, + ArenaAllocator &a) { + if (value < 0) { + auto *neg = wrapper.EnsureNegState(a); + PacClipMinMaxUpdateOneInternal(*neg, hash, static_cast(-value), a); + neg->update_count++; + } else { + PacClipMinMaxUpdateOneInternal(*pos_state, hash, static_cast(value), a); + pos_state->update_count++; + } +} + +// ============================================================================ +// Buffered update (two-sided: SIGNED routes to pos/neg, !SIGNED always pos) +// ============================================================================ +template +AUTOVECTORIZE inline void PacClipMinMaxUpdateOne(PacClipMinMaxStateWrapper &agg, uint64_t key_hash, + ValueT value, ArenaAllocator &a) { + uint64_t cnt = agg.n_buffered & PacClipMinMaxStateWrapper::BUF_MASK; + if (DUCKDB_UNLIKELY(cnt == PacClipMinMaxStateWrapper::BUF_SIZE)) { + auto *dst_state = agg.EnsureState(a); 
+ for (int i = 0; i < PacClipMinMaxStateWrapper::BUF_SIZE; i++) { + if (SIGNED) { + PacClipMinMaxRouteValue(agg, dst_state, agg.hash_buf[i], agg.val_buf[i], a); + } else { + PacClipMinMaxUpdateOneInternal(*dst_state, agg.hash_buf[i], + static_cast(agg.val_buf[i]), a); + dst_state->update_count++; + } + } + if (SIGNED) { + PacClipMinMaxRouteValue(agg, dst_state, key_hash, static_cast(value), a); + } else { + PacClipMinMaxUpdateOneInternal(*dst_state, key_hash, static_cast(value), a); + dst_state->update_count++; + } + agg.n_buffered &= ~PacClipMinMaxStateWrapper::BUF_MASK; + } else { + agg.val_buf[cnt] = static_cast(value); + agg.hash_buf[cnt] = key_hash; + agg.n_buffered++; + } +} + +// ============================================================================ +// Buffer flush +// ============================================================================ +template +inline void PacClipMinMaxFlushBuffer(PacClipMinMaxStateWrapper &src, PacClipMinMaxStateWrapper &dst, + ArenaAllocator &a) { + uint64_t cnt = src.n_buffered & PacClipMinMaxStateWrapper::BUF_MASK; + if (cnt > 0) { + auto *dst_state = dst.EnsureState(a); + for (uint64_t i = 0; i < cnt; i++) { + if (SIGNED) { + PacClipMinMaxRouteValue(dst, dst_state, src.hash_buf[i], src.val_buf[i], a); + } else { + PacClipMinMaxUpdateOneInternal(*dst_state, src.hash_buf[i], + static_cast(src.val_buf[i]), a); + dst_state->update_count++; + } + } + src.n_buffered &= ~PacClipMinMaxStateWrapper::BUF_MASK; + } +} + +// ============================================================================ +// Vectorized Update and ScatterUpdate +// ============================================================================ +template +static void PacClipMinMaxUpdate(Vector inputs[], PacClipMinMaxStateWrapper &state, idx_t count, + ArenaAllocator &allocator) { + UnifiedVectorFormat hash_data, value_data; + inputs[0].ToUnifiedFormat(count, hash_data); + inputs[1].ToUnifiedFormat(count, value_data); + auto hashes = 
UnifiedVectorFormat::GetData(hash_data); + auto values = UnifiedVectorFormat::GetData(value_data); + + if (hash_data.validity.AllValid() && value_data.validity.AllValid()) { + for (idx_t i = 0; i < count; i++) { + auto h_idx = hash_data.sel->get_index(i); + auto v_idx = value_data.sel->get_index(i); + PacClipMinMaxUpdateOne(state, hashes[h_idx], + ConvertValue::convert(values[v_idx]), allocator); + } + } else { + for (idx_t i = 0; i < count; i++) { + auto h_idx = hash_data.sel->get_index(i); + auto v_idx = value_data.sel->get_index(i); + if (!hash_data.validity.RowIsValid(h_idx) || !value_data.validity.RowIsValid(v_idx)) { + continue; + } + PacClipMinMaxUpdateOne(state, hashes[h_idx], + ConvertValue::convert(values[v_idx]), allocator); + } + } +} + +template +static void PacClipMinMaxScatterUpdate(Vector inputs[], Vector &states, idx_t count, ArenaAllocator &allocator) { + UnifiedVectorFormat hash_data, value_data, sdata; + inputs[0].ToUnifiedFormat(count, hash_data); + inputs[1].ToUnifiedFormat(count, value_data); + states.ToUnifiedFormat(count, sdata); + + auto hashes = UnifiedVectorFormat::GetData(hash_data); + auto values = UnifiedVectorFormat::GetData(value_data); + auto state_ptrs = UnifiedVectorFormat::GetData *>(sdata); + + for (idx_t i = 0; i < count; i++) { + auto h_idx = hash_data.sel->get_index(i); + auto v_idx = value_data.sel->get_index(i); + auto state = state_ptrs[sdata.sel->get_index(i)]; + if (!hash_data.validity.RowIsValid(h_idx) || !value_data.validity.RowIsValid(v_idx)) { + continue; + } + PacClipMinMaxUpdateOne(*state, hashes[h_idx], ConvertValue::convert(values[v_idx]), + allocator); + } +} + +// ============================================================================ +// X-macro: generate Update/ScatterUpdate for integer types +// ============================================================================ +#define PCMM_INT_TYPES_SIGNED \ + X(TinyInt, int64_t, int8_t, true) \ + X(SmallInt, int64_t, int16_t, true) \ + X(Integer, int64_t, 
int32_t, true) \ + X(BigInt, int64_t, int64_t, true) + +#define PCMM_INT_TYPES_UNSIGNED \ + X(UTinyInt, uint64_t, uint8_t, false) \ + X(USmallInt, uint64_t, uint16_t, false) \ + X(UInteger, uint64_t, uint32_t, false) \ + X(UBigInt, uint64_t, uint64_t, false) + +// Generate for IS_MAX=true (MAX) +#define X(NAME, VALUE_T, INPUT_T, SIGNED_VAL) \ + static void PacClipMaxUpdate##NAME(Vector inputs[], AggregateInputData &aggr, idx_t, data_ptr_t state_p, \ + idx_t count) { \ + auto &state = *reinterpret_cast *>(state_p); \ + PacClipMinMaxUpdate(inputs, state, count, aggr.allocator); \ + } \ + static void PacClipMaxScatterUpdate##NAME(Vector inputs[], AggregateInputData &aggr, idx_t, Vector &states, \ + idx_t count) { \ + PacClipMinMaxScatterUpdate(inputs, states, count, aggr.allocator); \ + } +PCMM_INT_TYPES_SIGNED +PCMM_INT_TYPES_UNSIGNED +#undef X + +// Generate for IS_MAX=false (MIN) +#define X(NAME, VALUE_T, INPUT_T, SIGNED_VAL) \ + static void PacClipMinUpdate##NAME(Vector inputs[], AggregateInputData &aggr, idx_t, data_ptr_t state_p, \ + idx_t count) { \ + auto &state = *reinterpret_cast *>(state_p); \ + PacClipMinMaxUpdate(inputs, state, count, aggr.allocator); \ + } \ + static void PacClipMinScatterUpdate##NAME(Vector inputs[], AggregateInputData &aggr, idx_t, Vector &states, \ + idx_t count) { \ + PacClipMinMaxScatterUpdate(inputs, states, count, aggr.allocator); \ + } +PCMM_INT_TYPES_SIGNED +PCMM_INT_TYPES_UNSIGNED +#undef X + +// ============================================================================ +// Float/double update: scale to int64, route through signed path +// ============================================================================ +template +static void PacClipMinMaxUpdateFloat(Vector inputs[], PacClipMinMaxStateWrapper &state, idx_t count, + ArenaAllocator &allocator) { + UnifiedVectorFormat hash_data, value_data; + inputs[0].ToUnifiedFormat(count, hash_data); + inputs[1].ToUnifiedFormat(count, value_data); + auto hashes = 
UnifiedVectorFormat::GetData(hash_data); + auto values = UnifiedVectorFormat::GetData(value_data); + + if (hash_data.validity.AllValid() && value_data.validity.AllValid()) { + for (idx_t i = 0; i < count; i++) { + auto h_idx = hash_data.sel->get_index(i); + auto v_idx = value_data.sel->get_index(i); + PacClipMinMaxUpdateOne(state, hashes[h_idx], + ScaleFloatToInt64(values[v_idx]), allocator); + } + } else { + for (idx_t i = 0; i < count; i++) { + auto h_idx = hash_data.sel->get_index(i); + auto v_idx = value_data.sel->get_index(i); + if (!hash_data.validity.RowIsValid(h_idx) || !value_data.validity.RowIsValid(v_idx)) { + continue; + } + PacClipMinMaxUpdateOne(state, hashes[h_idx], + ScaleFloatToInt64(values[v_idx]), allocator); + } + } +} + +template +static void PacClipMinMaxScatterUpdateFloat(Vector inputs[], Vector &states, idx_t count, ArenaAllocator &allocator) { + UnifiedVectorFormat hash_data, value_data, sdata; + inputs[0].ToUnifiedFormat(count, hash_data); + inputs[1].ToUnifiedFormat(count, value_data); + states.ToUnifiedFormat(count, sdata); + auto hashes = UnifiedVectorFormat::GetData(hash_data); + auto values = UnifiedVectorFormat::GetData(value_data); + auto state_ptrs = UnifiedVectorFormat::GetData *>(sdata); + + for (idx_t i = 0; i < count; i++) { + auto h_idx = hash_data.sel->get_index(i); + auto v_idx = value_data.sel->get_index(i); + auto state = state_ptrs[sdata.sel->get_index(i)]; + if (!hash_data.validity.RowIsValid(h_idx) || !value_data.validity.RowIsValid(v_idx)) { + continue; + } + PacClipMinMaxUpdateOne(*state, hashes[h_idx], ScaleFloatToInt64(values[v_idx]), + allocator); + } +} + +// X-macro: generate float/double Update/ScatterUpdate for MAX and MIN +#define PCMM_FLOAT_TYPES \ + XF(SingleFloat, float, CLIP_FLOAT_SHIFT) \ + XF(SingleDouble, double, CLIP_DOUBLE_SHIFT) + +#define XF(NAME, FLOAT_T, SHIFT_VAL) \ + static void PacClipMaxUpdate##NAME(Vector inputs[], AggregateInputData &aggr, idx_t, data_ptr_t state_p, \ + idx_t count) { \ + 
auto &state = *reinterpret_cast *>(state_p); \ + PacClipMinMaxUpdateFloat(inputs, state, count, aggr.allocator); \ + } \ + static void PacClipMaxScatterUpdate##NAME(Vector inputs[], AggregateInputData &aggr, idx_t, Vector &states, \ + idx_t count) { \ + PacClipMinMaxScatterUpdateFloat(inputs, states, count, aggr.allocator); \ + } \ + static void PacClipMinUpdate##NAME(Vector inputs[], AggregateInputData &aggr, idx_t, data_ptr_t state_p, \ + idx_t count) { \ + auto &state = *reinterpret_cast *>(state_p); \ + PacClipMinMaxUpdateFloat(inputs, state, count, aggr.allocator); \ + } \ + static void PacClipMinScatterUpdate##NAME(Vector inputs[], AggregateInputData &aggr, idx_t, Vector &states, \ + idx_t count) { \ + PacClipMinMaxScatterUpdateFloat(inputs, states, count, aggr.allocator); \ + } +PCMM_FLOAT_TYPES +#undef XF + +// ============================================================================ +// Combine +// ============================================================================ +template +static void PacClipMinMaxCombineInt(Vector &src, Vector &dst, idx_t count, ArenaAllocator &allocator) { + auto src_wrapper = FlatVector::GetData *>(src); + auto dst_wrapper = FlatVector::GetData *>(dst); + + for (idx_t i = 0; i < count; i++) { + // Flush src's buffer into dst (always signed — values stored as int64 in buffer) + PacClipMinMaxFlushBuffer(*src_wrapper[i], *dst_wrapper[i], allocator); + + auto *s = src_wrapper[i]->GetState(); + if (s) { + auto *d = dst_wrapper[i]->EnsureState(allocator); + d->CombineFrom(s, allocator); + } + + // Combine neg states + auto *s_neg = src_wrapper[i]->GetNegState(); + if (s_neg) { + auto *d_neg = dst_wrapper[i]->GetNegState(); + if (!d_neg) { + dst_wrapper[i]->neg_state = s_neg; // steal + } else { + d_neg->CombineFrom(s_neg, allocator); + } + } + } +} + +static void PacClipMaxCombine(Vector &src, Vector &dst, AggregateInputData &aggr, idx_t count) { + PacClipMinMaxCombineInt(src, dst, count, aggr.allocator); +} +static void 
PacClipMinCombine(Vector &src, Vector &dst, AggregateInputData &aggr, idx_t count) { + PacClipMinMaxCombineInt(src, dst, count, aggr.allocator); +} + +// PacClipBindData is defined in pac_clip_aggr.hpp + +// ============================================================================ +// Finalize: noised scalar output +// ============================================================================ +template +static void PacClipMinMaxFinalize(Vector &states, AggregateInputData &input, Vector &result, idx_t count, + idx_t offset) { + auto state_ptrs = FlatVector::GetData *>(states); + auto data = FlatVector::GetData(result); + auto &result_mask = FlatVector::Validity(result); + auto &bind = static_cast(*input.bind_data); + double mi = bind.mi; + double correction = bind.correction; + uint64_t query_hash = bind.query_hash; + auto pstate = bind.pstate; + int clip_support = bind.clip_support_threshold; + bool clip_scale = bind.clip_scale; + + for (idx_t i = 0; i < count; i++) { + PacClipMinMaxFlushBuffer(*state_ptrs[i], *state_ptrs[i], input.allocator); + + PAC_FLOAT buf[64] = {0}; + auto *pos = state_ptrs[i]->GetState(); + auto *neg = state_ptrs[i]->GetNegState(); + if (!pos && !neg) { + result_mask.SetInvalid(offset + i); + continue; + } + uint64_t key_hash = (pos ? pos->key_hash : 0) | (neg ? 
neg->key_hash : 0); + std::mt19937_64 gen(bind.seed); + if (PacNoiseInNull(key_hash, mi, correction, gen)) { + result_mask.SetInvalid(offset + i); + continue; + } + + uint64_t update_count = 0; + if (pos) { + pos->GetTotals(buf, clip_support, clip_scale); + update_count = pos->update_count; + } + + // Merge neg state: negate absolute extremes back to negative values + if (neg) { + PAC_FLOAT neg_buf[64] = {0}; + neg->GetTotals(neg_buf, clip_support, clip_scale); + for (int j = 0; j < 64; j++) { + // Only merge if neg had a surviving contribution (not fully clipped) + if (neg_buf[j] != 0) { + PAC_FLOAT neg_val = -neg_buf[j]; + if (IS_MAX) { + buf[j] = std::max(buf[j], neg_val); + } else { + buf[j] = std::min(buf[j], neg_val); + } + } + } + update_count += neg->update_count; + } + + CheckPacSampleDiversity(key_hash, buf, update_count, IS_MAX ? "pac_noised_clip_max" : "pac_noised_clip_min", + bind); + PAC_FLOAT result_val = PacNoisySampleFrom64Counters(buf, mi, correction, gen, ~key_hash, query_hash, pstate); + result_val /= static_cast(bind.float_scale); + data[offset + i] = FromDouble(result_val); + } +} + +// Noised finalize instantiations — return type matches input type +// Integer inputs: return same type as non-clip min/max (the type itself) +// For clip variants, noised output returns the value type. We use templates to handle all types. + +// Helper to deduce return type from value type. For integers, the noised clip min/max +// returns the same type. For float/double, returns float/double. 
+// X-macro: generate noised finalize wrappers for all output types × MAX/MIN +#define PCMM_FINALIZE_TYPES \ + XFIN(BigInt, int64_t) \ + XFIN(Float, float) \ + XFIN(Double, double) \ + XFIN(HugeInt, hugeint_t) + +#define XFIN(NAME, ACC_T) \ + static void PacClipMaxNoisedFinalize##NAME(Vector &s, AggregateInputData &i, Vector &r, idx_t c, idx_t o) { \ + PacClipMinMaxFinalize(s, i, r, c, o); \ + } \ + static void PacClipMinNoisedFinalize##NAME(Vector &s, AggregateInputData &i, Vector &r, idx_t c, idx_t o) { \ + PacClipMinMaxFinalize(s, i, r, c, o); \ + } +PCMM_FINALIZE_TYPES +#undef XFIN + +// ============================================================================ +// Counters finalize (LIST output) +// ============================================================================ +template +static void PacClipMinMaxFinalizeCounters(Vector &states, AggregateInputData &input, Vector &result, idx_t count, + idx_t offset) { + auto state_ptrs = FlatVector::GetData *>(states); + auto &bind = static_cast(*input.bind_data); + int clip_support = bind.clip_support_threshold; + double correction = bind.correction; + double float_scale = bind.float_scale; + bool clip_scale = bind.clip_scale; + + auto list_entries = FlatVector::GetData(result); + auto &child_vec = ListVector::GetEntry(result); + + idx_t total_elements = count * 64; + ListVector::Reserve(result, total_elements); + ListVector::SetListSize(result, total_elements); + + auto child_data = FlatVector::GetData(child_vec); + + for (idx_t i = 0; i < count; i++) { + PacClipMinMaxFlushBuffer(*state_ptrs[i], *state_ptrs[i], input.allocator); + + list_entries[offset + i].offset = i * 64; + list_entries[offset + i].length = 64; + + PAC_FLOAT buf[64] = {0}; + uint64_t key_hash = 0; + uint64_t update_count = 0; + + auto *pos = state_ptrs[i]->GetState(); + auto *neg = state_ptrs[i]->GetNegState(); + if (pos) { + key_hash = pos->key_hash; + update_count = pos->update_count; + pos->GetTotals(buf, clip_support, clip_scale); + } + 
if (neg) { + PAC_FLOAT neg_buf[64] = {0}; + neg->GetTotals(neg_buf, clip_support, clip_scale); + key_hash |= neg->key_hash; + for (int j = 0; j < 64; j++) { + if (neg_buf[j] != 0) { + PAC_FLOAT neg_val = -neg_buf[j]; + if (IS_MAX) { + buf[j] = std::max(buf[j], neg_val); + } else { + buf[j] = std::min(buf[j], neg_val); + } + } + } + update_count += neg->update_count; + } + + CheckPacSampleDiversity(key_hash, buf, update_count, IS_MAX ? "pac_clip_max" : "pac_clip_min", bind); + + idx_t base = i * 64; + for (int j = 0; j < 64; j++) { + if ((key_hash >> j) & 1ULL) { + child_data[base + j] = static_cast(buf[j] * correction / float_scale); + } else { + child_data[base + j] = 0.0; + } + } + } +} + +static void PacClipMaxFinalizeCounters(Vector &s, AggregateInputData &i, Vector &r, idx_t c, idx_t o) { + PacClipMinMaxFinalizeCounters(s, i, r, c, o); +} +static void PacClipMinFinalizeCounters(Vector &s, AggregateInputData &i, Vector &r, idx_t c, idx_t o) { + PacClipMinMaxFinalizeCounters(s, i, r, c, o); +} + +// ============================================================================ +// State size / init / bind +// ============================================================================ +template +static idx_t PacClipMinMaxStateSize(const AggregateFunction &) { + return sizeof(PacClipMinMaxStateWrapper); +} + +template +static void PacClipMinMaxInitialize(const AggregateFunction &, data_ptr_t state_p) { + memset(state_p, 0, sizeof(PacClipMinMaxStateWrapper)); +} + +// PacClipBind, PacClipBindFloat, PacClipBindDouble are defined in pac_clip_aggr.hpp + +// ============================================================================ +// DECIMAL support: dispatch by physical type +// ============================================================================ +template +static AggregateFunction GetPacClipMinMaxNoisedAggregate(PhysicalType type) { + const char *name = IS_MAX ? "pac_noised_clip_max" : "pac_noised_clip_min"; + auto finalize = IS_MAX ? 
PacClipMaxNoisedFinalizeBigInt : PacClipMinNoisedFinalizeBigInt; + auto combine = IS_MAX ? PacClipMaxCombine : PacClipMinCombine; + auto state_size = PacClipMinMaxStateSize; + auto init = PacClipMinMaxInitialize; + + switch (type) { + case PhysicalType::INT16: + return AggregateFunction(name, {LogicalType::UBIGINT, LogicalType::SMALLINT}, LogicalType::HUGEINT, state_size, + init, IS_MAX ? PacClipMaxScatterUpdateSmallInt : PacClipMinScatterUpdateSmallInt, + combine, finalize, FunctionNullHandling::DEFAULT_NULL_HANDLING, + IS_MAX ? PacClipMaxUpdateSmallInt : PacClipMinUpdateSmallInt); + case PhysicalType::INT32: + return AggregateFunction(name, {LogicalType::UBIGINT, LogicalType::INTEGER}, LogicalType::HUGEINT, state_size, + init, IS_MAX ? PacClipMaxScatterUpdateInteger : PacClipMinScatterUpdateInteger, + combine, finalize, FunctionNullHandling::DEFAULT_NULL_HANDLING, + IS_MAX ? PacClipMaxUpdateInteger : PacClipMinUpdateInteger); + case PhysicalType::INT64: + return AggregateFunction(name, {LogicalType::UBIGINT, LogicalType::BIGINT}, LogicalType::HUGEINT, state_size, + init, IS_MAX ? PacClipMaxScatterUpdateBigInt : PacClipMinScatterUpdateBigInt, combine, + finalize, FunctionNullHandling::DEFAULT_NULL_HANDLING, + IS_MAX ? PacClipMaxUpdateBigInt : PacClipMinUpdateBigInt); + case PhysicalType::INT128: + return AggregateFunction(name, {LogicalType::UBIGINT, LogicalType::HUGEINT}, LogicalType::HUGEINT, state_size, + init, IS_MAX ? PacClipMaxScatterUpdateBigInt : PacClipMinScatterUpdateBigInt, combine, + IS_MAX ? PacClipMaxNoisedFinalizeHugeInt : PacClipMinNoisedFinalizeHugeInt, + FunctionNullHandling::DEFAULT_NULL_HANDLING, + IS_MAX ? 
PacClipMaxUpdateBigInt : PacClipMinUpdateBigInt); + default: + throw InternalException("pac_noised_clip_min/max: unsupported decimal physical type"); + } +} + +template +static unique_ptr BindDecimalPacNoisedClipMinMax(ClientContext &ctx, AggregateFunction &function, + vector> &args) { + auto decimal_type = args[1]->return_type; + function = GetPacClipMinMaxNoisedAggregate(decimal_type.InternalType()); + function.name = IS_MAX ? "pac_noised_clip_max" : "pac_noised_clip_min"; + function.arguments[1] = decimal_type; + function.return_type = LogicalType::DECIMAL(Decimal::MAX_WIDTH_DECIMAL, DecimalType::GetScale(decimal_type)); + return PacClipBind(ctx, function, args); +} + +// ============================================================================ +// Registration helpers +// ============================================================================ +template +static void AddClipMinMaxCountersFcn(AggregateFunctionSet &set, const string &name, const LogicalType &value_type, + aggregate_update_t scatter, aggregate_finalize_t finalize, + aggregate_simple_update_t update) { + auto list_type = LogicalType::LIST(PacFloatLogicalType()); + set.AddFunction(AggregateFunction(name, {LogicalType::UBIGINT, value_type}, list_type, + PacClipMinMaxStateSize, PacClipMinMaxInitialize, scatter, + IS_MAX ? PacClipMaxCombine : PacClipMinCombine, finalize, + FunctionNullHandling::DEFAULT_NULL_HANDLING, update, PacClipBind)); + set.AddFunction(AggregateFunction(name, {LogicalType::UBIGINT, value_type, LogicalType::DOUBLE}, list_type, + PacClipMinMaxStateSize, PacClipMinMaxInitialize, scatter, + IS_MAX ? 
PacClipMaxCombine : PacClipMinCombine, finalize, + FunctionNullHandling::DEFAULT_NULL_HANDLING, update, PacClipBind)); +} + +template +static void AddNoisedClipMinMaxFcn(AggregateFunctionSet &set, const string &name, const LogicalType &value_type, + const LogicalType &result_type, aggregate_update_t scatter, + aggregate_finalize_t finalize, aggregate_simple_update_t update) { + set.AddFunction(AggregateFunction(name, {LogicalType::UBIGINT, value_type}, result_type, + PacClipMinMaxStateSize, PacClipMinMaxInitialize, scatter, + IS_MAX ? PacClipMaxCombine : PacClipMinCombine, finalize, + FunctionNullHandling::DEFAULT_NULL_HANDLING, update, PacClipBind)); + set.AddFunction(AggregateFunction(name, {LogicalType::UBIGINT, value_type, LogicalType::DOUBLE}, result_type, + PacClipMinMaxStateSize, PacClipMinMaxInitialize, scatter, + IS_MAX ? PacClipMaxCombine : PacClipMinCombine, finalize, + FunctionNullHandling::DEFAULT_NULL_HANDLING, update, PacClipBind)); +} + +// Helper to register all type overloads +template +static void RegisterClipMinMaxTypeOverloads(AggregateFunctionSet &set, const string &name, bool counters) { + auto counters_finalize = IS_MAX ? PacClipMaxFinalizeCounters : PacClipMinFinalizeCounters; + auto noised_finalize = IS_MAX ? PacClipMaxNoisedFinalizeBigInt : PacClipMinNoisedFinalizeBigInt; + + if (counters) { + // Counters (LIST) variants — signed types + AddClipMinMaxCountersFcn(set, name, LogicalType::TINYINT, + IS_MAX ? PacClipMaxScatterUpdateTinyInt : PacClipMinScatterUpdateTinyInt, + counters_finalize, IS_MAX ? PacClipMaxUpdateTinyInt : PacClipMinUpdateTinyInt); + AddClipMinMaxCountersFcn(set, name, LogicalType::BOOLEAN, + IS_MAX ? PacClipMaxScatterUpdateTinyInt : PacClipMinScatterUpdateTinyInt, + counters_finalize, IS_MAX ? PacClipMaxUpdateTinyInt : PacClipMinUpdateTinyInt); + AddClipMinMaxCountersFcn(set, name, LogicalType::SMALLINT, + IS_MAX ? PacClipMaxScatterUpdateSmallInt : PacClipMinScatterUpdateSmallInt, + counters_finalize, + IS_MAX ? 
PacClipMaxUpdateSmallInt : PacClipMinUpdateSmallInt); + AddClipMinMaxCountersFcn(set, name, LogicalType::INTEGER, + IS_MAX ? PacClipMaxScatterUpdateInteger : PacClipMinScatterUpdateInteger, + counters_finalize, IS_MAX ? PacClipMaxUpdateInteger : PacClipMinUpdateInteger); + AddClipMinMaxCountersFcn(set, name, LogicalType::BIGINT, + IS_MAX ? PacClipMaxScatterUpdateBigInt : PacClipMinScatterUpdateBigInt, + counters_finalize, IS_MAX ? PacClipMaxUpdateBigInt : PacClipMinUpdateBigInt); + // Unsigned types + AddClipMinMaxCountersFcn(set, name, LogicalType::UTINYINT, + IS_MAX ? PacClipMaxScatterUpdateUTinyInt : PacClipMinScatterUpdateUTinyInt, + counters_finalize, + IS_MAX ? PacClipMaxUpdateUTinyInt : PacClipMinUpdateUTinyInt); + AddClipMinMaxCountersFcn(set, name, LogicalType::USMALLINT, + IS_MAX ? PacClipMaxScatterUpdateUSmallInt : PacClipMinScatterUpdateUSmallInt, + counters_finalize, + IS_MAX ? PacClipMaxUpdateUSmallInt : PacClipMinUpdateUSmallInt); + AddClipMinMaxCountersFcn(set, name, LogicalType::UINTEGER, + IS_MAX ? PacClipMaxScatterUpdateUInteger : PacClipMinScatterUpdateUInteger, + counters_finalize, + IS_MAX ? PacClipMaxUpdateUInteger : PacClipMinUpdateUInteger); + AddClipMinMaxCountersFcn(set, name, LogicalType::UBIGINT, + IS_MAX ? PacClipMaxScatterUpdateUBigInt : PacClipMinScatterUpdateUBigInt, + counters_finalize, IS_MAX ? PacClipMaxUpdateUBigInt : PacClipMinUpdateUBigInt); + } else { + // Noised (scalar) variants — signed types + AddNoisedClipMinMaxFcn(set, name, LogicalType::TINYINT, LogicalType::BIGINT, + IS_MAX ? PacClipMaxScatterUpdateTinyInt : PacClipMinScatterUpdateTinyInt, + noised_finalize, IS_MAX ? PacClipMaxUpdateTinyInt : PacClipMinUpdateTinyInt); + AddNoisedClipMinMaxFcn(set, name, LogicalType::BOOLEAN, LogicalType::BIGINT, + IS_MAX ? PacClipMaxScatterUpdateTinyInt : PacClipMinScatterUpdateTinyInt, + noised_finalize, IS_MAX ? 
PacClipMaxUpdateTinyInt : PacClipMinUpdateTinyInt); + AddNoisedClipMinMaxFcn(set, name, LogicalType::SMALLINT, LogicalType::BIGINT, + IS_MAX ? PacClipMaxScatterUpdateSmallInt : PacClipMinScatterUpdateSmallInt, + noised_finalize, IS_MAX ? PacClipMaxUpdateSmallInt : PacClipMinUpdateSmallInt); + AddNoisedClipMinMaxFcn(set, name, LogicalType::INTEGER, LogicalType::BIGINT, + IS_MAX ? PacClipMaxScatterUpdateInteger : PacClipMinScatterUpdateInteger, + noised_finalize, IS_MAX ? PacClipMaxUpdateInteger : PacClipMinUpdateInteger); + AddNoisedClipMinMaxFcn(set, name, LogicalType::BIGINT, LogicalType::BIGINT, + IS_MAX ? PacClipMaxScatterUpdateBigInt : PacClipMinScatterUpdateBigInt, + noised_finalize, IS_MAX ? PacClipMaxUpdateBigInt : PacClipMinUpdateBigInt); + // Unsigned types + AddNoisedClipMinMaxFcn(set, name, LogicalType::UTINYINT, LogicalType::BIGINT, + IS_MAX ? PacClipMaxScatterUpdateUTinyInt : PacClipMinScatterUpdateUTinyInt, + noised_finalize, IS_MAX ? PacClipMaxUpdateUTinyInt : PacClipMinUpdateUTinyInt); + AddNoisedClipMinMaxFcn(set, name, LogicalType::USMALLINT, LogicalType::BIGINT, + IS_MAX ? PacClipMaxScatterUpdateUSmallInt : PacClipMinScatterUpdateUSmallInt, + noised_finalize, IS_MAX ? PacClipMaxUpdateUSmallInt : PacClipMinUpdateUSmallInt); + AddNoisedClipMinMaxFcn(set, name, LogicalType::UINTEGER, LogicalType::BIGINT, + IS_MAX ? PacClipMaxScatterUpdateUInteger : PacClipMinScatterUpdateUInteger, + noised_finalize, IS_MAX ? PacClipMaxUpdateUInteger : PacClipMinUpdateUInteger); + AddNoisedClipMinMaxFcn(set, name, LogicalType::UBIGINT, LogicalType::BIGINT, + IS_MAX ? PacClipMaxScatterUpdateUBigInt : PacClipMinScatterUpdateUBigInt, + noised_finalize, IS_MAX ? 
PacClipMaxUpdateUBigInt : PacClipMinUpdateUBigInt); + } +} + +// ============================================================================ +// Add float/double overloads to a function set +// ============================================================================ +template +static void AddFloatDoubleOverloads(AggregateFunctionSet &set, const string &name, bool counters) { + auto combine = IS_MAX ? PacClipMaxCombine : PacClipMinCombine; + auto state_size = PacClipMinMaxStateSize; + auto init = PacClipMinMaxInitialize; + + if (counters) { + auto finalize = IS_MAX ? PacClipMaxFinalizeCounters : PacClipMinFinalizeCounters; + auto list_type = LogicalType::LIST(PacFloatLogicalType()); + + // FLOAT + set.AddFunction( + AggregateFunction(name, {LogicalType::UBIGINT, LogicalType::FLOAT}, list_type, state_size, init, + IS_MAX ? PacClipMaxScatterUpdateSingleFloat : PacClipMinScatterUpdateSingleFloat, combine, + finalize, FunctionNullHandling::DEFAULT_NULL_HANDLING, + IS_MAX ? PacClipMaxUpdateSingleFloat : PacClipMinUpdateSingleFloat, PacClipBindFloat)); + set.AddFunction(AggregateFunction( + name, {LogicalType::UBIGINT, LogicalType::FLOAT, LogicalType::DOUBLE}, list_type, state_size, init, + IS_MAX ? PacClipMaxScatterUpdateSingleFloat : PacClipMinScatterUpdateSingleFloat, combine, finalize, + FunctionNullHandling::DEFAULT_NULL_HANDLING, + IS_MAX ? PacClipMaxUpdateSingleFloat : PacClipMinUpdateSingleFloat, PacClipBindFloat)); + + // DOUBLE + set.AddFunction( + AggregateFunction(name, {LogicalType::UBIGINT, LogicalType::DOUBLE}, list_type, state_size, init, + IS_MAX ? PacClipMaxScatterUpdateSingleDouble : PacClipMinScatterUpdateSingleDouble, + combine, finalize, FunctionNullHandling::DEFAULT_NULL_HANDLING, + IS_MAX ? PacClipMaxUpdateSingleDouble : PacClipMinUpdateSingleDouble, PacClipBindDouble)); + set.AddFunction(AggregateFunction( + name, {LogicalType::UBIGINT, LogicalType::DOUBLE, LogicalType::DOUBLE}, list_type, state_size, init, + IS_MAX ? 
PacClipMaxScatterUpdateSingleDouble : PacClipMinScatterUpdateSingleDouble, combine, finalize, + FunctionNullHandling::DEFAULT_NULL_HANDLING, + IS_MAX ? PacClipMaxUpdateSingleDouble : PacClipMinUpdateSingleDouble, PacClipBindDouble)); + } else { + auto float_finalize = IS_MAX ? PacClipMaxNoisedFinalizeFloat : PacClipMinNoisedFinalizeFloat; + auto double_finalize = IS_MAX ? PacClipMaxNoisedFinalizeDouble : PacClipMinNoisedFinalizeDouble; + + // FLOAT → FLOAT + set.AddFunction( + AggregateFunction(name, {LogicalType::UBIGINT, LogicalType::FLOAT}, LogicalType::FLOAT, state_size, init, + IS_MAX ? PacClipMaxScatterUpdateSingleFloat : PacClipMinScatterUpdateSingleFloat, combine, + float_finalize, FunctionNullHandling::DEFAULT_NULL_HANDLING, + IS_MAX ? PacClipMaxUpdateSingleFloat : PacClipMinUpdateSingleFloat, PacClipBindFloat)); + set.AddFunction(AggregateFunction( + name, {LogicalType::UBIGINT, LogicalType::FLOAT, LogicalType::DOUBLE}, LogicalType::FLOAT, state_size, init, + IS_MAX ? PacClipMaxScatterUpdateSingleFloat : PacClipMinScatterUpdateSingleFloat, combine, float_finalize, + FunctionNullHandling::DEFAULT_NULL_HANDLING, + IS_MAX ? PacClipMaxUpdateSingleFloat : PacClipMinUpdateSingleFloat, PacClipBindFloat)); + + // DOUBLE → DOUBLE + set.AddFunction( + AggregateFunction(name, {LogicalType::UBIGINT, LogicalType::DOUBLE}, LogicalType::DOUBLE, state_size, init, + IS_MAX ? PacClipMaxScatterUpdateSingleDouble : PacClipMinScatterUpdateSingleDouble, + combine, double_finalize, FunctionNullHandling::DEFAULT_NULL_HANDLING, + IS_MAX ? PacClipMaxUpdateSingleDouble : PacClipMinUpdateSingleDouble, PacClipBindDouble)); + set.AddFunction(AggregateFunction( + name, {LogicalType::UBIGINT, LogicalType::DOUBLE, LogicalType::DOUBLE}, LogicalType::DOUBLE, state_size, + init, IS_MAX ? PacClipMaxScatterUpdateSingleDouble : PacClipMinScatterUpdateSingleDouble, combine, + double_finalize, FunctionNullHandling::DEFAULT_NULL_HANDLING, + IS_MAX ? 
PacClipMaxUpdateSingleDouble : PacClipMinUpdateSingleDouble, PacClipBindDouble)); + } +} + +// ============================================================================ +// Registration: templated helpers to avoid duplicating MIN/MAX registration +// ============================================================================ +template +static void RegisterPacClipMinMaxCountersFunctions(ExtensionLoader &loader) { + const string name = IS_MAX ? "pac_clip_max" : "pac_clip_min"; + const string short_name = IS_MAX ? "clip_max" : "clip_min"; + AggregateFunctionSet fcn_set(name); + RegisterClipMinMaxTypeOverloads(fcn_set, name, true); + + // DECIMAL overloads + auto list_type = LogicalType::LIST(PacFloatLogicalType()); + fcn_set.AddFunction(AggregateFunction({LogicalType::UBIGINT, LogicalTypeId::DECIMAL}, list_type, nullptr, nullptr, + nullptr, nullptr, nullptr, FunctionNullHandling::DEFAULT_NULL_HANDLING, + nullptr, BindDecimalPacNoisedClipMinMax)); + + AddFloatDoubleOverloads(fcn_set, name, true); + AddPacListAggregateOverload(fcn_set, short_name); + + CreateAggregateFunctionInfo info(fcn_set); + FunctionDescription desc; + desc.description = IS_MAX ? "[INTERNAL] Returns 64 PAC subsample max values with per-level clipping as LIST." + : "[INTERNAL] Returns 64 PAC subsample min values with per-level clipping as LIST."; + info.descriptions.push_back(std::move(desc)); + loader.RegisterFunction(std::move(info)); +} + +template +static void RegisterPacNoisedClipMinMaxFunctions(ExtensionLoader &loader) { + const string name = IS_MAX ? 
"pac_noised_clip_max" : "pac_noised_clip_min"; + AggregateFunctionSet fcn_set(name); + RegisterClipMinMaxTypeOverloads(fcn_set, name, false); + + // DECIMAL overloads + fcn_set.AddFunction(AggregateFunction( + {LogicalType::UBIGINT, LogicalTypeId::DECIMAL}, LogicalTypeId::DECIMAL, nullptr, nullptr, nullptr, nullptr, + nullptr, FunctionNullHandling::DEFAULT_NULL_HANDLING, nullptr, BindDecimalPacNoisedClipMinMax)); + fcn_set.AddFunction(AggregateFunction({LogicalType::UBIGINT, LogicalTypeId::DECIMAL, LogicalType::DOUBLE}, + LogicalTypeId::DECIMAL, nullptr, nullptr, nullptr, nullptr, nullptr, + FunctionNullHandling::DEFAULT_NULL_HANDLING, nullptr, + BindDecimalPacNoisedClipMinMax)); + + AddFloatDoubleOverloads(fcn_set, name, false); + + CreateAggregateFunctionInfo info(fcn_set); + FunctionDescription desc; + desc.description = IS_MAX ? "Privacy-preserving MAX with per-level clipping and noising." + : "Privacy-preserving MIN with per-level clipping and noising."; + info.descriptions.push_back(std::move(desc)); + loader.RegisterFunction(std::move(info)); +} + +// Public registration functions (called from pac_extension.cpp) +void RegisterPacClipMinFunctions(ExtensionLoader &loader) { + RegisterPacClipMinMaxCountersFunctions(loader); +} +void RegisterPacClipMaxFunctions(ExtensionLoader &loader) { + RegisterPacClipMinMaxCountersFunctions(loader); +} +void RegisterPacNoisedClipMinFunctions(ExtensionLoader &loader) { + RegisterPacNoisedClipMinMaxFunctions(loader); +} +void RegisterPacNoisedClipMaxFunctions(ExtensionLoader &loader) { + RegisterPacNoisedClipMinMaxFunctions(loader); +} + +} // namespace duckdb diff --git a/src/aggregates/pac_clip_sum.cpp b/src/aggregates/pac_clip_sum.cpp new file mode 100644 index 00000000..b9a13ba6 --- /dev/null +++ b/src/aggregates/pac_clip_sum.cpp @@ -0,0 +1,960 @@ +#include "aggregates/pac_clip_sum.hpp" +#include "categorical/pac_categorical.hpp" +#include "duckdb/common/types/decimal.hpp" +#include 
"duckdb/parser/parsed_data/create_aggregate_function_info.hpp" +#include + +namespace duckdb { + +// ============================================================================ +// Inner state update: add one unsigned value to the state +// ============================================================================ +template +AUTOVECTORIZE inline void PacClipSumUpdateOneInternal(PacClipSumIntState &state, uint64_t key_hash, uint64_t value, + ArenaAllocator &allocator) { + state.key_hash |= key_hash; + + int level = PacClipSumIntState::GetLevel(value); + uint64_t shift = level << 1; + uint16_t shifted_val = static_cast(value >> shift); // max 255 (8 bits) + + state.EnsureLevelAllocated(allocator, level); + uint64_t *buf = state.levels[level]; + + // Set bitmap bit + buf[17] |= (1ULL << (key_hash >> 58)); + + // Update exact_count (may cascade top 4 bits to overflow) + state.AddToExactCount(buf, shifted_val, allocator); + + // Add to SWAR counters + Pac2AddToTotalsSWAR16(buf, shifted_val, key_hash); +} + +// Overload for hugeint_t +template +AUTOVECTORIZE inline void PacClipSumUpdateOneInternal(PacClipSumIntState &state, uint64_t key_hash, hugeint_t value, + ArenaAllocator &allocator) { + state.key_hash |= key_hash; + + uint64_t upper, lower; + if (value.upper < 0) { + hugeint_t abs_val = -value; + upper = static_cast(abs_val.upper); + lower = abs_val.lower; + } else { + upper = static_cast(value.upper); + lower = value.lower; + } + + int level = PacClipSumIntState::GetLevel128(upper, lower); + uint64_t shift = level << 1; + + // Shift the 128-bit value right by shift bits, take lower 8 bits + uint16_t shifted_val; + if (shift >= 64) { + shifted_val = static_cast(upper >> (shift - 64)); + } else if (shift > 0) { + shifted_val = static_cast((lower >> shift) | (upper << (64 - shift))); + } else { + shifted_val = static_cast(lower); + } + shifted_val &= 0xFF; // max 255 + + state.EnsureLevelAllocated(allocator, level); + uint64_t *buf = state.levels[level]; + buf[17] 
|= (1ULL << (key_hash >> 58)); + state.AddToExactCount(buf, shifted_val, allocator); + Pac2AddToTotalsSWAR16(buf, shifted_val, key_hash); +} + +// ============================================================================ +// Value routing: two-sided (pos/neg) dispatch +// ============================================================================ +// Route a uint64_t value — when SIGNED, the bits represent a signed int64_t (two's complement) +template +inline void PacClipSumRouteValue(PacClipSumStateWrapper &wrapper, PacClipSumIntState *pos_state, uint64_t hash, + uint64_t value, ArenaAllocator &a) { + if (DUCKDB_LIKELY(hash)) { + int64_t sval = static_cast(value); // reinterpret bits as signed + if (SIGNED && sval < 0) { + auto *neg = wrapper.EnsureNegState(a); + PacClipSumUpdateOneInternal(*neg, hash, static_cast(-sval), a); + neg->update_count++; + } else { + PacClipSumUpdateOneInternal(*pos_state, hash, value, a); + pos_state->update_count++; + } + } +} + +// Overload for hugeint routing (signed) +template +inline void PacClipSumRouteHugeint(PacClipSumStateWrapper &wrapper, PacClipSumIntState *pos_state, + uint64_t hash, hugeint_t value, ArenaAllocator &a, bool is_signed) { + if (DUCKDB_LIKELY(hash)) { + if (is_signed && value.upper < 0) { + auto *neg = wrapper.EnsureNegState(a); + hugeint_t abs_val = -value; + uint64_t upper = static_cast(abs_val.upper); + uint64_t lower = abs_val.lower; + int level = PacClipSumIntState::GetLevel128(upper, lower); + uint64_t shift = level << 1; + uint16_t shifted_val; + if (shift >= 64) { + shifted_val = static_cast(upper >> (shift - 64)); + } else if (shift > 0) { + shifted_val = static_cast((lower >> shift) | (upper << (64 - shift))); + } else { + shifted_val = static_cast(lower); + } + shifted_val &= 0xFF; + neg->key_hash |= hash; + neg->EnsureLevelAllocated(a, level); + uint64_t *lbuf = neg->levels[level]; + lbuf[17] |= (1ULL << (hash >> 58)); + neg->AddToExactCount(lbuf, shifted_val, a); + Pac2AddToTotalsSWAR16(lbuf, 
shifted_val, hash); + neg->update_count++; + } else { + PacClipSumUpdateOneInternal(*pos_state, hash, value, a); + pos_state->update_count++; + } + } +} + +// ============================================================================ +// Buffer flush +// ============================================================================ +template +inline void PacClipSumFlushBuffer(PacClipSumStateWrapper &src, PacClipSumStateWrapper &dst, ArenaAllocator &a) { + uint64_t cnt = src.n_buffered & PacClipSumStateWrapper::BUF_MASK; + if (cnt > 0) { + auto *dst_state = dst.EnsureState(a); + for (uint64_t i = 0; i < cnt; i++) { + PacClipSumRouteValue(dst, dst_state, src.hash_buf[i], src.val_buf[i], a); + } + src.n_buffered &= ~PacClipSumStateWrapper::BUF_MASK; + } +} + +// ============================================================================ +// Buffered update +// ============================================================================ +template +AUTOVECTORIZE inline void PacClipSumUpdateOne(PacClipSumStateWrapper &agg, uint64_t key_hash, ValueT value, + ArenaAllocator &a) { + uint64_t cnt = agg.n_buffered & PacClipSumStateWrapper::BUF_MASK; + if (DUCKDB_UNLIKELY(cnt == PacClipSumStateWrapper::BUF_SIZE)) { + auto *dst_state = agg.EnsureState(a); + for (int i = 0; i < PacClipSumStateWrapper::BUF_SIZE; i++) { + PacClipSumRouteValue(agg, dst_state, agg.hash_buf[i], agg.val_buf[i], a); + } + PacClipSumRouteValue(agg, dst_state, key_hash, static_cast(value), a); + agg.n_buffered &= ~PacClipSumStateWrapper::BUF_MASK; + } else { + agg.val_buf[cnt] = static_cast(value); + agg.hash_buf[cnt] = key_hash; + agg.n_buffered++; + } +} + +// Hugeint buffered update — bypass buffer, update directly +template +inline void PacClipSumUpdateOne(PacClipSumStateWrapper &agg, uint64_t key_hash, hugeint_t value, + ArenaAllocator &a) { + PacClipSumFlushBuffer(agg, agg, a); // flush any buffered values first + auto *state = agg.EnsureState(a); + PacClipSumRouteHugeint(agg, state, key_hash, 
value, a, SIGNED); +} + +// ============================================================================ +// Vectorized Update and ScatterUpdate +// ============================================================================ +template +static void PacClipSumUpdate(Vector inputs[], PacClipSumStateWrapper &state, idx_t count, + ArenaAllocator &allocator) { + UnifiedVectorFormat hash_data, value_data; + inputs[0].ToUnifiedFormat(count, hash_data); + inputs[1].ToUnifiedFormat(count, value_data); + auto hashes = UnifiedVectorFormat::GetData(hash_data); + auto values = UnifiedVectorFormat::GetData(value_data); + + if (hash_data.validity.AllValid() && value_data.validity.AllValid()) { + for (idx_t i = 0; i < count; i++) { + auto h_idx = hash_data.sel->get_index(i); + auto v_idx = value_data.sel->get_index(i); + PacClipSumUpdateOne(state, hashes[h_idx], ConvertValue::convert(values[v_idx]), + allocator); + } + } else { + for (idx_t i = 0; i < count; i++) { + auto h_idx = hash_data.sel->get_index(i); + auto v_idx = value_data.sel->get_index(i); + if (!hash_data.validity.RowIsValid(h_idx) || !value_data.validity.RowIsValid(v_idx)) { + continue; + } + PacClipSumUpdateOne(state, hashes[h_idx], ConvertValue::convert(values[v_idx]), + allocator); + } + } +} + +template +static void PacClipSumScatterUpdate(Vector inputs[], Vector &states, idx_t count, ArenaAllocator &allocator) { + UnifiedVectorFormat hash_data, value_data, sdata; + inputs[0].ToUnifiedFormat(count, hash_data); + inputs[1].ToUnifiedFormat(count, value_data); + states.ToUnifiedFormat(count, sdata); + + auto hashes = UnifiedVectorFormat::GetData(hash_data); + auto values = UnifiedVectorFormat::GetData(value_data); + auto state_ptrs = UnifiedVectorFormat::GetData *>(sdata); + + for (idx_t i = 0; i < count; i++) { + auto h_idx = hash_data.sel->get_index(i); + auto v_idx = value_data.sel->get_index(i); + auto state = state_ptrs[sdata.sel->get_index(i)]; + if (!hash_data.validity.RowIsValid(h_idx) || 
!value_data.validity.RowIsValid(v_idx)) { + continue; + } + PacClipSumUpdateOne(*state, hashes[h_idx], ConvertValue::convert(values[v_idx]), + allocator); + } +} + +// ============================================================================ +// X-macro: generate Update/ScatterUpdate for integer types +// ============================================================================ +#define CLIP_SUM_INT_TYPES_SIGNED \ + X(TinyInt, int64_t, int8_t, true) \ + X(SmallInt, int64_t, int16_t, true) \ + X(Integer, int64_t, int32_t, true) \ + X(BigInt, int64_t, int64_t, true) + +#define CLIP_SUM_INT_TYPES_UNSIGNED \ + X(UTinyInt, uint64_t, uint8_t, false) \ + X(USmallInt, uint64_t, uint16_t, false) \ + X(UInteger, uint64_t, uint32_t, false) \ + X(UBigInt, uint64_t, uint64_t, false) + +#define X(NAME, VALUE_T, INPUT_T, SIGNED) \ + static void PacClipSumUpdate##NAME(Vector input[], AggregateInputData &agg, idx_t, data_ptr_t state_p, \ + idx_t cnt) { \ + auto &state = *reinterpret_cast *>(state_p); \ + PacClipSumUpdate(input, state, cnt, agg.allocator); \ + } \ + static void PacClipSumScatterUpdate##NAME(Vector input[], AggregateInputData &agg, idx_t, Vector &sts, \ + idx_t cnt) { \ + PacClipSumScatterUpdate(input, sts, cnt, agg.allocator); \ + } +CLIP_SUM_INT_TYPES_SIGNED +CLIP_SUM_INT_TYPES_UNSIGNED +#undef X + +// HugeInt update (signed, via hugeint routing — 128-bit needs full 62 levels) +static void PacClipSumUpdateHugeInt(Vector inputs[], AggregateInputData &aggr, idx_t, data_ptr_t state_p, idx_t count) { + auto &state = *reinterpret_cast *>(state_p); + UnifiedVectorFormat hash_data, value_data; + inputs[0].ToUnifiedFormat(count, hash_data); + inputs[1].ToUnifiedFormat(count, value_data); + auto hashes = UnifiedVectorFormat::GetData(hash_data); + auto values = UnifiedVectorFormat::GetData(value_data); + for (idx_t i = 0; i < count; i++) { + auto h_idx = hash_data.sel->get_index(i); + auto v_idx = value_data.sel->get_index(i); + if 
(!hash_data.validity.RowIsValid(h_idx) || !value_data.validity.RowIsValid(v_idx)) { + continue; + } + PacClipSumUpdateOne(state, hashes[h_idx], values[v_idx], aggr.allocator); + } +} +static void PacClipSumScatterUpdateHugeInt(Vector inputs[], AggregateInputData &aggr, idx_t, Vector &states, + idx_t count) { + UnifiedVectorFormat hash_data, value_data, sdata; + inputs[0].ToUnifiedFormat(count, hash_data); + inputs[1].ToUnifiedFormat(count, value_data); + states.ToUnifiedFormat(count, sdata); + auto hashes = UnifiedVectorFormat::GetData(hash_data); + auto values = UnifiedVectorFormat::GetData(value_data); + auto state_ptrs = UnifiedVectorFormat::GetData *>(sdata); + for (idx_t i = 0; i < count; i++) { + auto h_idx = hash_data.sel->get_index(i); + auto v_idx = value_data.sel->get_index(i); + auto state = state_ptrs[sdata.sel->get_index(i)]; + if (!hash_data.validity.RowIsValid(h_idx) || !value_data.validity.RowIsValid(v_idx)) { + continue; + } + PacClipSumUpdateOne(*state, hashes[h_idx], values[v_idx], aggr.allocator); + } +} + +// UHugeInt update (unsigned, convert to hugeint for routing — 128-bit needs full 62 levels) +static void PacClipSumUpdateUHugeInt(Vector inputs[], AggregateInputData &aggr, idx_t, data_ptr_t state_p, + idx_t count) { + auto &state = *reinterpret_cast *>(state_p); + UnifiedVectorFormat hash_data, value_data; + inputs[0].ToUnifiedFormat(count, hash_data); + inputs[1].ToUnifiedFormat(count, value_data); + auto hashes = UnifiedVectorFormat::GetData(hash_data); + auto values = UnifiedVectorFormat::GetData(value_data); + for (idx_t i = 0; i < count; i++) { + auto h_idx = hash_data.sel->get_index(i); + auto v_idx = value_data.sel->get_index(i); + if (!hash_data.validity.RowIsValid(h_idx) || !value_data.validity.RowIsValid(v_idx)) { + continue; + } + // uhugeint_t is always positive; treat as 128-bit unsigned + auto &v = values[v_idx]; + auto *pos_state = state.EnsureState(aggr.allocator); + if (DUCKDB_LIKELY(hashes[h_idx])) { + uint64_t upper = 
static_cast(v.upper); + uint64_t lower = v.lower; + int level = PacClipSumIntState::GetLevel128(upper, lower); + uint64_t shift = level << 1; + uint16_t shifted_val; + if (shift >= 64) { + shifted_val = static_cast(upper >> (shift - 64)); + } else if (shift > 0) { + shifted_val = static_cast((lower >> shift) | (upper << (64 - shift))); + } else { + shifted_val = static_cast(lower); + } + shifted_val &= 0xFF; + pos_state->key_hash |= hashes[h_idx]; + pos_state->EnsureLevelAllocated(aggr.allocator, level); + uint64_t *buf = pos_state->levels[level]; + buf[17] |= (1ULL << (hashes[h_idx] >> 58)); + pos_state->AddToExactCount(buf, shifted_val, aggr.allocator); + Pac2AddToTotalsSWAR16(buf, shifted_val, hashes[h_idx]); + pos_state->update_count++; + } + } +} +static void PacClipSumScatterUpdateUHugeInt(Vector inputs[], AggregateInputData &aggr, idx_t, Vector &states, + idx_t count) { + UnifiedVectorFormat hash_data, value_data, sdata; + inputs[0].ToUnifiedFormat(count, hash_data); + inputs[1].ToUnifiedFormat(count, value_data); + states.ToUnifiedFormat(count, sdata); + auto hashes = UnifiedVectorFormat::GetData(hash_data); + auto values = UnifiedVectorFormat::GetData(value_data); + auto state_ptrs = UnifiedVectorFormat::GetData *>(sdata); + for (idx_t i = 0; i < count; i++) { + auto h_idx = hash_data.sel->get_index(i); + auto v_idx = value_data.sel->get_index(i); + auto state = state_ptrs[sdata.sel->get_index(i)]; + if (!hash_data.validity.RowIsValid(h_idx) || !value_data.validity.RowIsValid(v_idx)) { + continue; + } + auto &v = values[v_idx]; + auto *pos_state = state->EnsureState(aggr.allocator); + if (DUCKDB_LIKELY(hashes[h_idx])) { + uint64_t upper = static_cast(v.upper); + uint64_t lower = v.lower; + int level = PacClipSumIntState::GetLevel128(upper, lower); + uint64_t shift = level << 1; + uint16_t shifted_val; + if (shift >= 64) { + shifted_val = static_cast(upper >> (shift - 64)); + } else if (shift > 0) { + shifted_val = static_cast((lower >> shift) | (upper << 
(64 - shift))); + } else { + shifted_val = static_cast(lower); + } + shifted_val &= 0xFF; + pos_state->key_hash |= hashes[h_idx]; + pos_state->EnsureLevelAllocated(aggr.allocator, level); + uint64_t *buf = pos_state->levels[level]; + buf[17] |= (1ULL << (hashes[h_idx] >> 58)); + pos_state->AddToExactCount(buf, shifted_val, aggr.allocator); + Pac2AddToTotalsSWAR16(buf, shifted_val, hashes[h_idx]); + pos_state->update_count++; + } + } +} + +// ============================================================================ +// Float/Double update: scale to int64, route through signed path +// ============================================================================ +template +static void PacClipSumUpdateFloat(Vector inputs[], AggregateInputData &aggr, idx_t, data_ptr_t state_p, idx_t count) { + auto &state = *reinterpret_cast *>(state_p); + UnifiedVectorFormat hash_data, value_data; + inputs[0].ToUnifiedFormat(count, hash_data); + inputs[1].ToUnifiedFormat(count, value_data); + auto hashes = UnifiedVectorFormat::GetData(hash_data); + auto values = UnifiedVectorFormat::GetData(value_data); + + if (hash_data.validity.AllValid() && value_data.validity.AllValid()) { + for (idx_t i = 0; i < count; i++) { + auto h_idx = hash_data.sel->get_index(i); + auto v_idx = value_data.sel->get_index(i); + PacClipSumUpdateOne( + state, hashes[h_idx], ScaleFloatToInt64(values[v_idx]), aggr.allocator); + } + } else { + for (idx_t i = 0; i < count; i++) { + auto h_idx = hash_data.sel->get_index(i); + auto v_idx = value_data.sel->get_index(i); + if (!hash_data.validity.RowIsValid(h_idx) || !value_data.validity.RowIsValid(v_idx)) { + continue; + } + PacClipSumUpdateOne( + state, hashes[h_idx], ScaleFloatToInt64(values[v_idx]), aggr.allocator); + } + } +} + +template +static void PacClipSumScatterUpdateFloat(Vector inputs[], AggregateInputData &aggr, idx_t, Vector &states, + idx_t count) { + UnifiedVectorFormat hash_data, value_data, sdata; + inputs[0].ToUnifiedFormat(count, hash_data); + 
inputs[1].ToUnifiedFormat(count, value_data); + states.ToUnifiedFormat(count, sdata); + auto hashes = UnifiedVectorFormat::GetData(hash_data); + auto values = UnifiedVectorFormat::GetData(value_data); + auto state_ptrs = UnifiedVectorFormat::GetData *>(sdata); + + for (idx_t i = 0; i < count; i++) { + auto h_idx = hash_data.sel->get_index(i); + auto v_idx = value_data.sel->get_index(i); + auto state = state_ptrs[sdata.sel->get_index(i)]; + if (!hash_data.validity.RowIsValid(h_idx) || !value_data.validity.RowIsValid(v_idx)) { + continue; + } + PacClipSumUpdateOne( + *state, hashes[h_idx], ScaleFloatToInt64(values[v_idx]), aggr.allocator); + } +} + +// Instantiate float/double update functions +static void PacClipSumUpdateSingleFloat(Vector inputs[], AggregateInputData &aggr, idx_t n, data_ptr_t state_p, + idx_t count) { + PacClipSumUpdateFloat(inputs, aggr, n, state_p, count); +} +static void PacClipSumScatterUpdateSingleFloat(Vector inputs[], AggregateInputData &aggr, idx_t n, Vector &states, + idx_t count) { + PacClipSumScatterUpdateFloat(inputs, aggr, n, states, count); +} +static void PacClipSumUpdateSingleDouble(Vector inputs[], AggregateInputData &aggr, idx_t n, data_ptr_t state_p, + idx_t count) { + PacClipSumUpdateFloat(inputs, aggr, n, state_p, count); +} +static void PacClipSumScatterUpdateSingleDouble(Vector inputs[], AggregateInputData &aggr, idx_t n, Vector &states, + idx_t count) { + PacClipSumScatterUpdateFloat(inputs, aggr, n, states, count); +} + +// ============================================================================ +// Combine +// ============================================================================ +template +AUTOVECTORIZE static void PacClipSumCombineInt(Vector &src, Vector &dst, idx_t count, ArenaAllocator &allocator) { + auto src_wrapper = FlatVector::GetData *>(src); + auto dst_wrapper = FlatVector::GetData *>(dst); + + for (idx_t i = 0; i < count; i++) { + // Flush src's buffer into dst + 
PacClipSumFlushBuffer(*src_wrapper[i], *dst_wrapper[i], allocator); + + auto *s = src_wrapper[i]->GetState(); + if (!s) { + continue; + } + auto *d = dst_wrapper[i]->EnsureState(allocator); + d->CombineFrom(s, allocator); + + // Combine neg states + auto *s_neg = src_wrapper[i]->GetNegState(); + if (s_neg) { + auto *d_neg = dst_wrapper[i]->GetNegState(); + if (!d_neg) { + dst_wrapper[i]->neg_state = s_neg; // steal + } else { + d_neg->CombineFrom(s_neg, allocator); + } + } + } +} + +static void PacClipSumCombine(Vector &src, Vector &dst, AggregateInputData &aggr, idx_t count) { + PacClipSumCombineInt<>(src, dst, count, aggr.allocator); +} +static void PacClipSumCombine128(Vector &src, Vector &dst, AggregateInputData &aggr, idx_t count) { + PacClipSumCombineInt(src, dst, count, aggr.allocator); +} + +// PacClipBindData is defined in pac_clip_aggr.hpp + +// ============================================================================ +// Finalize +// ============================================================================ +template +static void PacClipSumFinalize(Vector &states, AggregateInputData &input, Vector &result, idx_t count, idx_t offset) { + auto state_ptrs = FlatVector::GetData *>(states); + auto data = FlatVector::GetData(result); + auto &result_mask = FlatVector::Validity(result); + auto &bind = static_cast(*input.bind_data); + double mi = bind.mi; + double correction = bind.correction; + uint64_t query_hash = bind.query_hash; + auto pstate = bind.pstate; + int clip_support = bind.clip_support_threshold; + bool clip_scale = bind.clip_scale; + + for (idx_t i = 0; i < count; i++) { + PacClipSumFlushBuffer(*state_ptrs[i], *state_ptrs[i], input.allocator); + + PAC_FLOAT buf[64] = {0}; + auto *pos = state_ptrs[i]->GetState(); + if (!pos) { + result_mask.SetInvalid(offset + i); + continue; + } + uint64_t key_hash = pos->key_hash; + std::mt19937_64 gen(bind.seed); + if (PacNoiseInNull(key_hash, mi, correction, gen)) { + result_mask.SetInvalid(offset + i); + 
continue; + } + + // Non-mutating: just read totals with clip_support filtering + pos->GetTotals(buf, clip_support, clip_scale); + uint64_t update_count = pos->update_count; + + // Subtract neg state + auto *neg = state_ptrs[i]->GetNegState(); + if (neg) { + PAC_FLOAT neg_buf[64] = {0}; + neg->GetTotals(neg_buf, clip_support, clip_scale); + key_hash |= neg->key_hash; + for (int j = 0; j < 64; j++) { + buf[j] -= neg_buf[j]; + } + update_count += neg->update_count; + } + + CheckPacSampleDiversity(key_hash, buf, update_count, "pac_clip_sum", bind); + PAC_FLOAT result_val = PacNoisySampleFrom64Counters(buf, mi, correction, gen, ~key_hash, query_hash, pstate); + result_val *= PAC_FLOAT(2.0); // 2x compensation for ~50% sampling + result_val /= static_cast(bind.float_scale); // undo float→int64 scaling (1.0 for integers) + data[offset + i] = FromDouble(result_val); + } +} + +// Instantiate noised finalize (scalar output for pac_noised_clip_sum) +// 64-bit types +static void PacClipSumNoisedFinalizeSigned(Vector &states, AggregateInputData &input, Vector &result, idx_t count, + idx_t offset) { + PacClipSumFinalize(states, input, result, count, offset); +} +static void PacClipSumNoisedFinalizeUnsigned(Vector &states, AggregateInputData &input, Vector &result, idx_t count, + idx_t offset) { + PacClipSumFinalize(states, input, result, count, offset); +} +// 128-bit types +static void PacClipSumNoisedFinalizeSigned128(Vector &states, AggregateInputData &input, Vector &result, idx_t count, + idx_t offset) { + PacClipSumFinalize(states, input, result, count, offset); +} +static void PacClipSumNoisedFinalizeUnsigned128(Vector &states, AggregateInputData &input, Vector &result, idx_t count, + idx_t offset) { + PacClipSumFinalize(states, input, result, count, offset); +} +// BIGINT output variant — used for count→sum conversion where the original returned BIGINT +static void PacClipSumNoisedFinalizeBigInt(Vector &states, AggregateInputData &input, Vector &result, idx_t count, + 
idx_t offset) { + PacClipSumFinalize(states, input, result, count, offset); +} +// Float/double output variants +static void PacClipSumNoisedFinalizeFloat(Vector &states, AggregateInputData &input, Vector &result, idx_t count, + idx_t offset) { + PacClipSumFinalize(states, input, result, count, offset); +} +static void PacClipSumNoisedFinalizeDouble(Vector &states, AggregateInputData &input, Vector &result, idx_t count, + idx_t offset) { + PacClipSumFinalize(states, input, result, count, offset); +} + +// ============================================================================ +// Counters finalize (LIST output for pac_clip_sum) +// ============================================================================ +template +static void PacClipSumFinalizeCounters(Vector &states, AggregateInputData &input, Vector &result, idx_t count, + idx_t offset) { + auto state_ptrs = FlatVector::GetData *>(states); + auto &bind = static_cast(*input.bind_data); + int clip_support = bind.clip_support_threshold; + double correction = bind.correction; + double float_scale = bind.float_scale; + bool clip_scale = bind.clip_scale; + + // Result is LIST + auto list_entries = FlatVector::GetData(result); + auto &child_vec = ListVector::GetEntry(result); + + idx_t total_elements = count * 64; + ListVector::Reserve(result, total_elements); + ListVector::SetListSize(result, total_elements); + + auto child_data = FlatVector::GetData(child_vec); + + for (idx_t i = 0; i < count; i++) { + PacClipSumFlushBuffer(*state_ptrs[i], *state_ptrs[i], input.allocator); + + list_entries[offset + i].offset = i * 64; + list_entries[offset + i].length = 64; + + PAC_FLOAT buf[64] = {0}; + uint64_t key_hash = 0; + uint64_t update_count = 0; + + auto *pos = state_ptrs[i]->GetState(); + if (pos) { + key_hash = pos->key_hash; + update_count = pos->update_count; + pos->GetTotals(buf, clip_support, clip_scale); + + auto *neg = state_ptrs[i]->GetNegState(); + if (neg) { + PAC_FLOAT neg_buf[64] = {0}; + 
neg->GetTotals(neg_buf, clip_support, clip_scale); + key_hash |= neg->key_hash; + for (int j = 0; j < 64; j++) { + buf[j] -= neg_buf[j]; + } + update_count += neg->update_count; + } + } + + CheckPacSampleDiversity(key_hash, buf, update_count, "pac_clip_sum", bind); + + idx_t base = i * 64; + for (int j = 0; j < 64; j++) { + if ((key_hash >> j) & 1ULL) { + child_data[base + j] = static_cast(buf[j] * 2.0 * correction / float_scale); + } else { + child_data[base + j] = 0.0; + } + } + } +} + +// 64-bit counters finalize +static void PacClipSumFinalizeCountersSigned(Vector &states, AggregateInputData &input, Vector &result, idx_t count, + idx_t offset) { + PacClipSumFinalizeCounters(states, input, result, count, offset); +} +static void PacClipSumFinalizeCountersUnsigned(Vector &states, AggregateInputData &input, Vector &result, idx_t count, + idx_t offset) { + PacClipSumFinalizeCounters(states, input, result, count, offset); +} +// 128-bit counters finalize +static void PacClipSumFinalizeCountersSigned128(Vector &states, AggregateInputData &input, Vector &result, idx_t count, + idx_t offset) { + PacClipSumFinalizeCounters(states, input, result, count, offset); +} +static void PacClipSumFinalizeCountersUnsigned128(Vector &states, AggregateInputData &input, Vector &result, + idx_t count, idx_t offset) { + PacClipSumFinalizeCounters(states, input, result, count, offset); +} + +// ============================================================================ +// State size / init / bind +// ============================================================================ +static idx_t PacClipSumStateSize(const AggregateFunction &) { + return sizeof(PacClipSumStateWrapper<>); +} +static idx_t PacClipSumStateSize128(const AggregateFunction &) { + return sizeof(PacClipSumStateWrapper); +} + +static void PacClipSumInitialize(const AggregateFunction &, data_ptr_t state_p) { + memset(state_p, 0, sizeof(PacClipSumStateWrapper<>)); +} +static void PacClipSumInitialize128(const 
AggregateFunction &, data_ptr_t state_p) { + memset(state_p, 0, sizeof(PacClipSumStateWrapper)); +} + +// PacClipBind, PacClipBindFloat, PacClipBindDouble are defined in pac_clip_aggr.hpp + +// ============================================================================ +// DECIMAL support: dispatch by physical type, same pattern as pac_noised_sum +// ============================================================================ +static AggregateFunction GetPacClipSumNoisedAggregate(PhysicalType type) { + switch (type) { + case PhysicalType::INT16: + return AggregateFunction("pac_noised_clip_sum", {LogicalType::UBIGINT, LogicalType::SMALLINT}, + LogicalType::HUGEINT, PacClipSumStateSize, PacClipSumInitialize, + PacClipSumScatterUpdateSmallInt, PacClipSumCombine, PacClipSumNoisedFinalizeSigned, + FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateSmallInt); + case PhysicalType::INT32: + return AggregateFunction("pac_noised_clip_sum", {LogicalType::UBIGINT, LogicalType::INTEGER}, + LogicalType::HUGEINT, PacClipSumStateSize, PacClipSumInitialize, + PacClipSumScatterUpdateInteger, PacClipSumCombine, PacClipSumNoisedFinalizeSigned, + FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateInteger); + case PhysicalType::INT64: + return AggregateFunction("pac_noised_clip_sum", {LogicalType::UBIGINT, LogicalType::BIGINT}, + LogicalType::HUGEINT, PacClipSumStateSize, PacClipSumInitialize, + PacClipSumScatterUpdateBigInt, PacClipSumCombine, PacClipSumNoisedFinalizeSigned, + FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateBigInt); + case PhysicalType::INT128: + return AggregateFunction( + "pac_noised_clip_sum", {LogicalType::UBIGINT, LogicalType::HUGEINT}, LogicalType::HUGEINT, + PacClipSumStateSize128, PacClipSumInitialize128, PacClipSumScatterUpdateHugeInt, PacClipSumCombine128, + PacClipSumNoisedFinalizeSigned128, FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateHugeInt); + default: + throw InternalException("pac_noised_clip_sum: 
unsupported decimal physical type"); + } +} + +static AggregateFunction GetPacClipSumCountersAggregate(PhysicalType type) { + auto list_type = LogicalType::LIST(PacFloatLogicalType()); + switch (type) { + case PhysicalType::INT16: + return AggregateFunction("pac_clip_sum", {LogicalType::UBIGINT, LogicalType::SMALLINT}, list_type, + PacClipSumStateSize, PacClipSumInitialize, PacClipSumScatterUpdateSmallInt, + PacClipSumCombine, PacClipSumFinalizeCountersSigned, + FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateSmallInt); + case PhysicalType::INT32: + return AggregateFunction("pac_clip_sum", {LogicalType::UBIGINT, LogicalType::INTEGER}, list_type, + PacClipSumStateSize, PacClipSumInitialize, PacClipSumScatterUpdateInteger, + PacClipSumCombine, PacClipSumFinalizeCountersSigned, + FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateInteger); + case PhysicalType::INT64: + return AggregateFunction("pac_clip_sum", {LogicalType::UBIGINT, LogicalType::BIGINT}, list_type, + PacClipSumStateSize, PacClipSumInitialize, PacClipSumScatterUpdateBigInt, + PacClipSumCombine, PacClipSumFinalizeCountersSigned, + FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateBigInt); + case PhysicalType::INT128: + return AggregateFunction("pac_clip_sum", {LogicalType::UBIGINT, LogicalType::HUGEINT}, list_type, + PacClipSumStateSize128, PacClipSumInitialize128, PacClipSumScatterUpdateHugeInt, + PacClipSumCombine128, PacClipSumFinalizeCountersSigned128, + FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateHugeInt); + default: + throw InternalException("pac_clip_sum: unsupported decimal physical type"); + } +} + +static unique_ptr BindDecimalPacNoisedClipSum(ClientContext &ctx, AggregateFunction &function, + vector> &args) { + auto decimal_type = args[1]->return_type; + function = GetPacClipSumNoisedAggregate(decimal_type.InternalType()); + function.name = "pac_noised_clip_sum"; + function.arguments[1] = decimal_type; + function.return_type = 
LogicalType::DECIMAL(Decimal::MAX_WIDTH_DECIMAL, DecimalType::GetScale(decimal_type)); + return PacClipBind(ctx, function, args); +} + +static unique_ptr BindDecimalPacClipSum(ClientContext &ctx, AggregateFunction &function, + vector> &args) { + auto decimal_type = args[1]->return_type; + function = GetPacClipSumCountersAggregate(decimal_type.InternalType()); + function.name = "pac_clip_sum"; + function.arguments[1] = decimal_type; + // counters always return LIST, no DECIMAL return type needed + return PacClipBind(ctx, function, args); +} + +// ============================================================================ +// Registration helpers +// ============================================================================ +static void AddClipSumCountersFcn(AggregateFunctionSet &set, const string &name, const LogicalType &value_type, + aggregate_update_t scatter, aggregate_finalize_t finalize, + aggregate_simple_update_t update) { + auto list_type = LogicalType::LIST(PacFloatLogicalType()); + set.AddFunction(AggregateFunction(name, {LogicalType::UBIGINT, value_type}, list_type, PacClipSumStateSize, + PacClipSumInitialize, scatter, PacClipSumCombine, finalize, + FunctionNullHandling::DEFAULT_NULL_HANDLING, update, PacClipBind)); + set.AddFunction(AggregateFunction(name, {LogicalType::UBIGINT, value_type, LogicalType::DOUBLE}, list_type, + PacClipSumStateSize, PacClipSumInitialize, scatter, PacClipSumCombine, finalize, + FunctionNullHandling::DEFAULT_NULL_HANDLING, update, PacClipBind)); +} + +static void AddNoisedClipSumFcn(AggregateFunctionSet &set, const string &name, const LogicalType &value_type, + const LogicalType &result_type, aggregate_update_t scatter, + aggregate_finalize_t finalize, aggregate_simple_update_t update) { + set.AddFunction(AggregateFunction(name, {LogicalType::UBIGINT, value_type}, result_type, PacClipSumStateSize, + PacClipSumInitialize, scatter, PacClipSumCombine, finalize, + FunctionNullHandling::DEFAULT_NULL_HANDLING, update, 
PacClipBind)); + set.AddFunction(AggregateFunction(name, {LogicalType::UBIGINT, value_type, LogicalType::DOUBLE}, result_type, + PacClipSumStateSize, PacClipSumInitialize, scatter, PacClipSumCombine, finalize, + FunctionNullHandling::DEFAULT_NULL_HANDLING, update, PacClipBind)); +} + +// Helper to register all type overloads for a clip sum function set +static void RegisterClipSumTypeOverloads(AggregateFunctionSet &set, const string &name, bool counters) { + if (counters) { + // Counters (LIST) variants + AddClipSumCountersFcn(set, name, LogicalType::TINYINT, PacClipSumScatterUpdateTinyInt, + PacClipSumFinalizeCountersSigned, PacClipSumUpdateTinyInt); + AddClipSumCountersFcn(set, name, LogicalType::BOOLEAN, PacClipSumScatterUpdateTinyInt, + PacClipSumFinalizeCountersSigned, PacClipSumUpdateTinyInt); + AddClipSumCountersFcn(set, name, LogicalType::SMALLINT, PacClipSumScatterUpdateSmallInt, + PacClipSumFinalizeCountersSigned, PacClipSumUpdateSmallInt); + AddClipSumCountersFcn(set, name, LogicalType::INTEGER, PacClipSumScatterUpdateInteger, + PacClipSumFinalizeCountersSigned, PacClipSumUpdateInteger); + AddClipSumCountersFcn(set, name, LogicalType::BIGINT, PacClipSumScatterUpdateBigInt, + PacClipSumFinalizeCountersSigned, PacClipSumUpdateBigInt); + AddClipSumCountersFcn(set, name, LogicalType::UTINYINT, PacClipSumScatterUpdateUTinyInt, + PacClipSumFinalizeCountersUnsigned, PacClipSumUpdateUTinyInt); + AddClipSumCountersFcn(set, name, LogicalType::USMALLINT, PacClipSumScatterUpdateUSmallInt, + PacClipSumFinalizeCountersUnsigned, PacClipSumUpdateUSmallInt); + AddClipSumCountersFcn(set, name, LogicalType::UINTEGER, PacClipSumScatterUpdateUInteger, + PacClipSumFinalizeCountersUnsigned, PacClipSumUpdateUInteger); + AddClipSumCountersFcn(set, name, LogicalType::UBIGINT, PacClipSumScatterUpdateUBigInt, + PacClipSumFinalizeCountersUnsigned, PacClipSumUpdateUBigInt); + // HUGEINT/UHUGEINT: use 128-bit state (62 levels) + { + auto lt = LogicalType::LIST(PacFloatLogicalType()); 
+ set.AddFunction(AggregateFunction( + name, {LogicalType::UBIGINT, LogicalType::HUGEINT}, lt, PacClipSumStateSize128, PacClipSumInitialize128, + PacClipSumScatterUpdateHugeInt, PacClipSumCombine128, PacClipSumFinalizeCountersSigned128, + FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateHugeInt, PacClipBind)); + set.AddFunction( + AggregateFunction(name, {LogicalType::UBIGINT, LogicalType::UHUGEINT}, lt, PacClipSumStateSize128, + PacClipSumInitialize128, PacClipSumScatterUpdateUHugeInt, PacClipSumCombine128, + PacClipSumFinalizeCountersUnsigned128, FunctionNullHandling::DEFAULT_NULL_HANDLING, + PacClipSumUpdateUHugeInt, PacClipBind)); + } + } else { + // Noised (scalar HUGEINT) variants + AddNoisedClipSumFcn(set, name, LogicalType::TINYINT, LogicalType::HUGEINT, PacClipSumScatterUpdateTinyInt, + PacClipSumNoisedFinalizeSigned, PacClipSumUpdateTinyInt); + AddNoisedClipSumFcn(set, name, LogicalType::BOOLEAN, LogicalType::HUGEINT, PacClipSumScatterUpdateTinyInt, + PacClipSumNoisedFinalizeSigned, PacClipSumUpdateTinyInt); + AddNoisedClipSumFcn(set, name, LogicalType::SMALLINT, LogicalType::HUGEINT, PacClipSumScatterUpdateSmallInt, + PacClipSumNoisedFinalizeSigned, PacClipSumUpdateSmallInt); + AddNoisedClipSumFcn(set, name, LogicalType::INTEGER, LogicalType::HUGEINT, PacClipSumScatterUpdateInteger, + PacClipSumNoisedFinalizeSigned, PacClipSumUpdateInteger); + AddNoisedClipSumFcn(set, name, LogicalType::BIGINT, LogicalType::HUGEINT, PacClipSumScatterUpdateBigInt, + PacClipSumNoisedFinalizeSigned, PacClipSumUpdateBigInt); + AddNoisedClipSumFcn(set, name, LogicalType::UTINYINT, LogicalType::HUGEINT, PacClipSumScatterUpdateUTinyInt, + PacClipSumNoisedFinalizeUnsigned, PacClipSumUpdateUTinyInt); + AddNoisedClipSumFcn(set, name, LogicalType::USMALLINT, LogicalType::HUGEINT, PacClipSumScatterUpdateUSmallInt, + PacClipSumNoisedFinalizeUnsigned, PacClipSumUpdateUSmallInt); + AddNoisedClipSumFcn(set, name, LogicalType::UINTEGER, LogicalType::HUGEINT, 
PacClipSumScatterUpdateUInteger, + PacClipSumNoisedFinalizeUnsigned, PacClipSumUpdateUInteger); + AddNoisedClipSumFcn(set, name, LogicalType::UBIGINT, LogicalType::HUGEINT, PacClipSumScatterUpdateUBigInt, + PacClipSumNoisedFinalizeUnsigned, PacClipSumUpdateUBigInt); + // HUGEINT/UHUGEINT: use 128-bit state (62 levels) + set.AddFunction( + AggregateFunction(name, {LogicalType::UBIGINT, LogicalType::HUGEINT}, LogicalType::HUGEINT, + PacClipSumStateSize128, PacClipSumInitialize128, PacClipSumScatterUpdateHugeInt, + PacClipSumCombine128, PacClipSumNoisedFinalizeSigned128, + FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateHugeInt, PacClipBind)); + set.AddFunction( + AggregateFunction(name, {LogicalType::UBIGINT, LogicalType::UHUGEINT}, LogicalType::HUGEINT, + PacClipSumStateSize128, PacClipSumInitialize128, PacClipSumScatterUpdateUHugeInt, + PacClipSumCombine128, PacClipSumNoisedFinalizeUnsigned128, + FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateUHugeInt, PacClipBind)); + } +} + +// ============================================================================ +// Registration: pac_clip_sum (counters, LIST) +// ============================================================================ +void RegisterPacClipSumFunctions(ExtensionLoader &loader) { + AggregateFunctionSet fcn_set("pac_clip_sum"); + RegisterClipSumTypeOverloads(fcn_set, "pac_clip_sum", true); + + // DECIMAL overloads + auto list_type = LogicalType::LIST(PacFloatLogicalType()); + fcn_set.AddFunction(AggregateFunction({LogicalType::UBIGINT, LogicalTypeId::DECIMAL}, list_type, nullptr, nullptr, + nullptr, nullptr, nullptr, FunctionNullHandling::DEFAULT_NULL_HANDLING, + nullptr, BindDecimalPacClipSum)); + fcn_set.AddFunction(AggregateFunction({LogicalType::UBIGINT, LogicalTypeId::DECIMAL, LogicalType::DOUBLE}, + list_type, nullptr, nullptr, nullptr, nullptr, nullptr, + FunctionNullHandling::DEFAULT_NULL_HANDLING, nullptr, BindDecimalPacClipSum)); + + // FLOAT/DOUBLE overloads (scale 
to int64 internally) + fcn_set.AddFunction(AggregateFunction( + "pac_clip_sum", {LogicalType::UBIGINT, LogicalType::FLOAT}, list_type, PacClipSumStateSize, + PacClipSumInitialize, PacClipSumScatterUpdateSingleFloat, PacClipSumCombine, PacClipSumFinalizeCountersSigned, + FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateSingleFloat, PacClipBindFloat)); + fcn_set.AddFunction(AggregateFunction( + "pac_clip_sum", {LogicalType::UBIGINT, LogicalType::FLOAT, LogicalType::DOUBLE}, list_type, PacClipSumStateSize, + PacClipSumInitialize, PacClipSumScatterUpdateSingleFloat, PacClipSumCombine, PacClipSumFinalizeCountersSigned, + FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateSingleFloat, PacClipBindFloat)); + fcn_set.AddFunction(AggregateFunction( + "pac_clip_sum", {LogicalType::UBIGINT, LogicalType::DOUBLE}, list_type, PacClipSumStateSize, + PacClipSumInitialize, PacClipSumScatterUpdateSingleDouble, PacClipSumCombine, PacClipSumFinalizeCountersSigned, + FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateSingleDouble, PacClipBindDouble)); + fcn_set.AddFunction(AggregateFunction( + "pac_clip_sum", {LogicalType::UBIGINT, LogicalType::DOUBLE, LogicalType::DOUBLE}, list_type, + PacClipSumStateSize, PacClipSumInitialize, PacClipSumScatterUpdateSingleDouble, PacClipSumCombine, + PacClipSumFinalizeCountersSigned, FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateSingleDouble, + PacClipBindDouble)); + + // Add list aggregate overload (LIST → LIST) for categorical/subquery + AddPacListAggregateOverload(fcn_set, "clip_sum"); + + CreateAggregateFunctionInfo info(fcn_set); + FunctionDescription desc; + desc.description = "[INTERNAL] Returns 64 PAC subsample counters with per-level clipping as LIST."; + desc.examples = {"SELECT c_mktsegment, pac_clip_sum(pac_hash(hash(c_custkey)), c_acctbal) FROM customer GROUP BY " + "c_mktsegment"}; + info.descriptions.push_back(std::move(desc)); + loader.RegisterFunction(std::move(info)); +} + +// 
============================================================================ +// Registration: pac_noised_clip_sum (fused noised, scalar HUGEINT) +// ============================================================================ +void RegisterPacNoisedClipSumFunctions(ExtensionLoader &loader) { + AggregateFunctionSet fcn_set("pac_noised_clip_sum"); + RegisterClipSumTypeOverloads(fcn_set, "pac_noised_clip_sum", false); + + // DECIMAL overloads + fcn_set.AddFunction(AggregateFunction( + {LogicalType::UBIGINT, LogicalTypeId::DECIMAL}, LogicalTypeId::DECIMAL, nullptr, nullptr, nullptr, nullptr, + nullptr, FunctionNullHandling::DEFAULT_NULL_HANDLING, nullptr, BindDecimalPacNoisedClipSum)); + fcn_set.AddFunction(AggregateFunction( + {LogicalType::UBIGINT, LogicalTypeId::DECIMAL, LogicalType::DOUBLE}, LogicalTypeId::DECIMAL, nullptr, nullptr, + nullptr, nullptr, nullptr, FunctionNullHandling::DEFAULT_NULL_HANDLING, nullptr, BindDecimalPacNoisedClipSum)); + + // FLOAT/DOUBLE overloads (return FLOAT/DOUBLE respectively) + fcn_set.AddFunction(AggregateFunction( + "pac_noised_clip_sum", {LogicalType::UBIGINT, LogicalType::FLOAT}, LogicalType::FLOAT, PacClipSumStateSize, + PacClipSumInitialize, PacClipSumScatterUpdateSingleFloat, PacClipSumCombine, PacClipSumNoisedFinalizeFloat, + FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateSingleFloat, PacClipBindFloat)); + fcn_set.AddFunction( + AggregateFunction("pac_noised_clip_sum", {LogicalType::UBIGINT, LogicalType::FLOAT, LogicalType::DOUBLE}, + LogicalType::FLOAT, PacClipSumStateSize, PacClipSumInitialize, + PacClipSumScatterUpdateSingleFloat, PacClipSumCombine, PacClipSumNoisedFinalizeFloat, + FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateSingleFloat, PacClipBindFloat)); + fcn_set.AddFunction(AggregateFunction( + "pac_noised_clip_sum", {LogicalType::UBIGINT, LogicalType::DOUBLE}, LogicalType::DOUBLE, PacClipSumStateSize, + PacClipSumInitialize, PacClipSumScatterUpdateSingleDouble, PacClipSumCombine, 
PacClipSumNoisedFinalizeDouble, + FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateSingleDouble, PacClipBindDouble)); + fcn_set.AddFunction(AggregateFunction( + "pac_noised_clip_sum", {LogicalType::UBIGINT, LogicalType::DOUBLE, LogicalType::DOUBLE}, LogicalType::DOUBLE, + PacClipSumStateSize, PacClipSumInitialize, PacClipSumScatterUpdateSingleDouble, PacClipSumCombine, + PacClipSumNoisedFinalizeDouble, FunctionNullHandling::DEFAULT_NULL_HANDLING, PacClipSumUpdateSingleDouble, + PacClipBindDouble)); + + CreateAggregateFunctionInfo info(fcn_set); + FunctionDescription desc; + desc.description = "Privacy-preserving SUM with per-level clipping and noising. Supports 128-bit."; + desc.examples = {"SELECT c_mktsegment, pac_noised_clip_sum(pac_hash(hash(c_custkey)), c_acctbal) FROM customer " + "GROUP BY c_mktsegment"}; + info.descriptions.push_back(std::move(desc)); + loader.RegisterFunction(std::move(info)); +} + +// ============================================================================ +// Registration: pac_noised_clip_sumcount (sum-of-counts, BIGINT → BIGINT) +// Used when count→sum conversion needs to preserve BIGINT return type. 
+// ============================================================================ +void RegisterPacNoisedClipSumCountFunctions(ExtensionLoader &loader) { + AggregateFunctionSet fcn_set("pac_noised_clip_sumcount"); + // Only BIGINT input → BIGINT output (counts are always BIGINT) + AddNoisedClipSumFcn(fcn_set, "pac_noised_clip_sumcount", LogicalType::BIGINT, LogicalType::BIGINT, + PacClipSumScatterUpdateBigInt, PacClipSumNoisedFinalizeBigInt, PacClipSumUpdateBigInt); + CreateAggregateFunctionInfo info(fcn_set); + loader.RegisterFunction(std::move(info)); +} + +} // namespace duckdb diff --git a/src/aggregates/pac_count.cpp b/src/aggregates/pac_count.cpp index 86e86de9..80376c27 100644 --- a/src/aggregates/pac_count.cpp +++ b/src/aggregates/pac_count.cpp @@ -341,4 +341,50 @@ void RegisterPacAvgFunctions(ExtensionLoader &loader) { loader.RegisterFunction(std::move(avg_counters_info)); } +// ============================================================================ +// Clip synonyms: pac_noised_clip_count = pac_noised_count, +// pac_clip_count = pac_count +// ============================================================================ +void RegisterPacNoisedClipCountFunctions(ExtensionLoader &loader) { + AggregateFunctionSet fcn_set("pac_noised_clip_count"); + + fcn_set.AddFunction(AggregateFunction("pac_noised_clip_count", {LogicalType::UBIGINT}, LogicalType::BIGINT, + PacCountStateSize, PacCountInitialize, PacCountScatterUpdate, PacCountCombine, + PacCountFinalize, FunctionNullHandling::SPECIAL_HANDLING, PacCountUpdate, + PacCountBind)); + fcn_set.AddFunction(AggregateFunction("pac_noised_clip_count", {LogicalType::UBIGINT, LogicalType::DOUBLE}, + LogicalType::BIGINT, PacCountStateSize, PacCountInitialize, + PacCountScatterUpdate, PacCountCombine, PacCountFinalize, + FunctionNullHandling::SPECIAL_HANDLING, PacCountUpdate, PacCountBind)); + fcn_set.AddFunction(AggregateFunction("pac_noised_clip_count", {LogicalType::UBIGINT, LogicalType::ANY}, + LogicalType::BIGINT, 
PacCountStateSize, PacCountInitialize, + PacCountColumnScatterUpdate, PacCountCombine, PacCountFinalize, + FunctionNullHandling::SPECIAL_HANDLING, PacCountColumnUpdate, PacCountBind)); + fcn_set.AddFunction(AggregateFunction( + "pac_noised_clip_count", {LogicalType::UBIGINT, LogicalType::ANY, LogicalType::DOUBLE}, LogicalType::BIGINT, + PacCountStateSize, PacCountInitialize, PacCountColumnScatterUpdate, PacCountCombine, PacCountFinalize, + FunctionNullHandling::SPECIAL_HANDLING, PacCountColumnUpdate, PacCountBind)); + + CreateAggregateFunctionInfo info(fcn_set); + loader.RegisterFunction(std::move(info)); +} + +void RegisterPacClipCountFunctions(ExtensionLoader &loader) { + auto list_double_type = LogicalType::LIST(PacFloatLogicalType()); + AggregateFunctionSet fcn_set("pac_clip_count"); + + fcn_set.AddFunction(AggregateFunction("pac_clip_count", {LogicalType::UBIGINT}, list_double_type, PacCountStateSize, + PacCountInitialize, PacCountScatterUpdate, PacCountCombine, + PacCountFinalizeCounters, FunctionNullHandling::DEFAULT_NULL_HANDLING, + PacCountUpdate, PacCountBind)); + fcn_set.AddFunction(AggregateFunction( + "pac_clip_count", {LogicalType::UBIGINT, LogicalType::ANY}, list_double_type, PacCountStateSize, + PacCountInitialize, PacCountColumnScatterUpdate, PacCountCombine, PacCountFinalizeCounters, + FunctionNullHandling::DEFAULT_NULL_HANDLING, PacCountColumnUpdate, PacCountBind)); + AddPacListAggregateOverload(fcn_set, "clip_count"); + + CreateAggregateFunctionInfo info(fcn_set); + loader.RegisterFunction(std::move(info)); +} + } // namespace duckdb diff --git a/src/aggregates/pac_min_max.cpp b/src/aggregates/pac_min_max.cpp index 8fdfec10..125eadf9 100644 --- a/src/aggregates/pac_min_max.cpp +++ b/src/aggregates/pac_min_max.cpp @@ -371,6 +371,9 @@ void RegisterPacMaxCountersFunctions(ExtensionLoader &loader) { loader.RegisterFunction(std::move(info)); } +// ============================================================================ +// Clip min/max 
registration moved to pac_clip_min_max.cpp + // Explicit template instantiations #define INST_ALL(T) \ template void PacMinMaxUpdate(Vector[], AggregateInputData &, idx_t, data_ptr_t, idx_t); \ diff --git a/src/compiler/pac_bitslice_compiler.cpp b/src/compiler/pac_bitslice_compiler.cpp index 6e93db68..e2e45784 100644 --- a/src/compiler/pac_bitslice_compiler.cpp +++ b/src/compiler/pac_bitslice_compiler.cpp @@ -880,6 +880,16 @@ void CompilePacBitsliceQuery(const PACCompatibilityResult &check, OptimizerExten // errors. The post-optimizer still handles user-written pac_avg() in SQL. RewritePacAvgToDiv(input, plan); + // Clip rewrite: when pac_clip_support is set, refine PAC aggregates to use + // clipping variants with per-PU pre-aggregation below. + { + Value clip_val; + if (input.context.TryGetCurrentSetting("pac_clip_support", clip_val) && !clip_val.IsNull()) { + auto &pu_names = (pu_present_in_tree && !pu_via_cte) ? check.scanned_pu_tables : privacy_units; + RewriteClipAggregates(input, plan, check, pu_names); + } + } + #if PAC_DEBUG PAC_DEBUG_PRINT("=== PAC-OPTIMIZED PLAN ==="); plan->Print(); diff --git a/src/core/pac_extension.cpp b/src/core/pac_extension.cpp index 2393fe80..40c2e584 100644 --- a/src/core/pac_extension.cpp +++ b/src/core/pac_extension.cpp @@ -18,7 +18,9 @@ #include "aggregates/pac_aggregate.hpp" #include "aggregates/pac_count.hpp" #include "aggregates/pac_sum.hpp" +#include "aggregates/pac_clip_sum.hpp" #include "aggregates/pac_min_max.hpp" +#include "aggregates/pac_clip_min_max.hpp" #include "categorical/pac_categorical.hpp" #include "parser/pac_parser.hpp" #include "diff/pac_utility_diff.hpp" @@ -247,17 +249,38 @@ static void LoadInternal(ExtensionLoader &loader) { db.config.AddExtensionOption("pac_ptracking", "[INTERNAL] Enable persistent secret p-tracking for query-level MIA", LogicalType::BOOLEAN, Value::BOOLEAN(true)); + db.config.AddExtensionOption("pac_clip_support", + "Dynamic outlier clipping threshold for pac_clip_sum. 
" + "Levels with fewer than this many estimated distinct contributors are zeroed out. " + "NULL (default) disables pac_clip_sum; set to e.g. 64 to enable.", + LogicalType::BIGINT, Value()); + + db.config.AddExtensionOption("pac_clip_scale", + "Scale unsupported outlier levels to nearest supported level instead of omitting. " + "Default false (omit).", + LogicalType::BOOLEAN, Value::BOOLEAN(false)); + // Register pac_sum aggregate functions RegisterPacSumFunctions(loader); RegisterPacSumCountersFunctions(loader); + RegisterPacClipSumFunctions(loader); + RegisterPacNoisedClipSumFunctions(loader); + RegisterPacNoisedClipSumCountFunctions(loader); RegisterPacCountFunctions(loader); RegisterPacCountCountersFunctions(loader); + RegisterPacClipCountFunctions(loader); + RegisterPacNoisedClipCountFunctions(loader); // Register pac_min/pac_max aggregate functions RegisterPacMinFunctions(loader); RegisterPacMaxFunctions(loader); // Register _counters variants for categorical queries RegisterPacMinCountersFunctions(loader); RegisterPacMaxCountersFunctions(loader); + // Register clip synonyms for min/max + RegisterPacClipMinFunctions(loader); + RegisterPacClipMaxFunctions(loader); + RegisterPacNoisedClipMinFunctions(loader); + RegisterPacNoisedClipMaxFunctions(loader); // Register dummy pac_noised_avg / pac_avg (replaced by RewritePacAvgToDiv before execution) RegisterPacAvgFunctions(loader); diff --git a/src/include/aggregates/pac_clip_aggr.hpp b/src/include/aggregates/pac_clip_aggr.hpp new file mode 100644 index 00000000..85c07fac --- /dev/null +++ b/src/include/aggregates/pac_clip_aggr.hpp @@ -0,0 +1,171 @@ +// +// pac_clip_aggr.hpp: Shared constants, helpers, and bind data for clip aggregates +// (pac_clip_sum, pac_clip_min, pac_clip_max) +// +#ifndef PAC_CLIP_AGGR_HPP +#define PAC_CLIP_AGGR_HPP + +#include "duckdb.hpp" +#include "pac_aggregate.hpp" +#include +#include + +namespace duckdb { + +// ============================================================================ 
+// Shared clip aggregate constants +// ============================================================================ +constexpr int CLIP_NUM_LEVELS = 62; // 62 levels × 2-bit bands covers full 128-bit +constexpr int CLIP_NUM_LEVELS_64 = 30; // 30 levels covers 64-bit (max_level = 29) +constexpr int CLIP_LEVEL_SHIFT = 2; // 2^2 = 4x per level + +// Float/double → int64 scale factors (powers of 2 for exact FP arithmetic) +constexpr int CLIP_FLOAT_SHIFT = 20; // 2^20 ≈ 1M +constexpr int CLIP_DOUBLE_SHIFT = 27; // 2^27 ≈ 100M +constexpr double CLIP_FLOAT_SCALE = static_cast(1 << CLIP_FLOAT_SHIFT); // 1048576.0 +constexpr double CLIP_DOUBLE_SCALE = static_cast(1 << CLIP_DOUBLE_SHIFT); // 134217728.0 + +// ============================================================================ +// Scale float/double to int64 with branchless clamping +// ============================================================================ +template +static inline int64_t ScaleFloatToInt64(FLOAT_TYPE value) { + FLOAT_TYPE scale = static_cast(1 << SHIFT); + FLOAT_TYPE scaled = value * scale; + scaled = std::max(scaled, static_cast(INT64_MIN)); + scaled = std::min(scaled, static_cast(INT64_MAX)); + return static_cast(scaled); +} + +// ============================================================================ +// Birthday-paradox distinct-count estimation from 64-bit bitmap +// ============================================================================ +static inline int ClipEstimateDistinct(uint64_t bitmap) { + int k = pac_popcount64(bitmap); + if (k >= 64) { + return 256; // saturated — could be any large number + } + if (k == 0) { + return 0; + } + // n ≈ -64 * ln(1 - k/64) + return static_cast(-64.0 * std::log(1.0 - k / 64.0)); +} + +// ============================================================================ +// Shared boundary helpers for clip outlier elimination +// ============================================================================ + +// Scan levels to find first and last 
with sufficient distinct-PU support. +// bitmap_offset: index of the bitmap uint64_t within a level buffer +// (17 for sum, PCMM_SWAR=8 for min_max). +static inline void ClipFindSupportedRange(uint64_t *const *levels, int max_level_used, int bitmap_offset, int threshold, + int &first_supported, int &last_supported) { + first_supported = -1; + last_supported = -1; + for (int k = 0; k <= max_level_used; k++) { + if (levels[k] && ClipEstimateDistinct(levels[k][bitmap_offset]) >= threshold) { + if (first_supported < 0) { + first_supported = k; + } + last_supported = k; + } + } +} + +// Given level k and the supported range, return the effective level to use +// for scale computation, or -1 to skip this level entirely. +// clip_scale=false (default): omit unsupported prefix/suffix levels. +// clip_scale=true: scale them to nearest supported boundary. +static inline int ClipEffectiveLevel(int k, int first_supported, int last_supported, bool clip_scale) { + if (first_supported < 0) { + return -1; // no supported levels at all + } + if (k < first_supported) { + return clip_scale ? first_supported : -1; // prefix: scale or omit + } + if (k > last_supported) { + return clip_scale ? 
last_supported : -1; // suffix: scale or omit + } + return k; // interior or supported: actual level +} + +// ============================================================================ +// Shared bind data for all clip aggregates +// ============================================================================ +struct PacClipBindData : public PacBindData { + int clip_support_threshold; // levels with fewer estimated distinct contributors are clipped + double float_scale; // scale factor for float/double→int64 conversion (1.0 for integer types) + bool clip_scale; // true: scale outlier levels to nearest supported; false: omit them + + PacClipBindData(ClientContext &ctx, double mi_val, double correction_val, int clip_support, + double float_scale_val = 1.0, bool clip_scale_val = false) + : PacBindData(ctx, mi_val, correction_val, 1.0), clip_support_threshold(clip_support), + float_scale(float_scale_val), clip_scale(clip_scale_val) { + } + + unique_ptr Copy() const override { + auto copy = make_uniq(*this); + copy->total_update_count = 0; + copy->suspicious_count = 0; + copy->nonsuspicious_count = 0; + return copy; + } + bool Equals(const FunctionData &other) const override { + if (!PacBindData::Equals(other)) { + return false; + } + auto *o = dynamic_cast(&other); + return o && clip_support_threshold == o->clip_support_threshold && float_scale == o->float_scale && + clip_scale == o->clip_scale; + } +}; + +// ============================================================================ +// Shared bind functions for clip aggregates +// ============================================================================ +static unique_ptr PacClipBindWithScale(ClientContext &ctx, vector> &args, + const string &func_name, double float_scale = 1.0) { + double mi = GetPacMiFromSetting(ctx); + double correction = 1.0; + if (2 < args.size()) { + if (!args[2]->IsFoldable()) { + throw InvalidInputException("%s: correction parameter must be a constant", func_name); + } + auto val = 
ExpressionExecutor::EvaluateScalar(ctx, *args[2]); + correction = val.GetValue(); + if (correction < 0.0) { + throw InvalidInputException("%s: correction must be >= 0", func_name); + } + } + int clip_support = 0; + Value dc_val; + if (ctx.TryGetCurrentSetting("pac_clip_support", dc_val) && !dc_val.IsNull()) { + clip_support = static_cast(dc_val.GetValue()); + } + bool clip_scale_val = false; + Value cs_val; + if (ctx.TryGetCurrentSetting("pac_clip_scale", cs_val) && !cs_val.IsNull()) { + clip_scale_val = cs_val.GetValue(); + } + return make_uniq(ctx, mi, correction, clip_support, float_scale, clip_scale_val); +} + +static unique_ptr PacClipBind(ClientContext &ctx, AggregateFunction &, + vector> &args) { + return PacClipBindWithScale(ctx, args, "pac_clip"); +} + +static unique_ptr PacClipBindFloat(ClientContext &ctx, AggregateFunction &, + vector> &args) { + return PacClipBindWithScale(ctx, args, "pac_clip", CLIP_FLOAT_SCALE); +} + +static unique_ptr PacClipBindDouble(ClientContext &ctx, AggregateFunction &, + vector> &args) { + return PacClipBindWithScale(ctx, args, "pac_clip", CLIP_DOUBLE_SCALE); +} + +} // namespace duckdb + +#endif // PAC_CLIP_AGGR_HPP diff --git a/src/include/aggregates/pac_clip_min_max.hpp b/src/include/aggregates/pac_clip_min_max.hpp new file mode 100644 index 00000000..f528334e --- /dev/null +++ b/src/include/aggregates/pac_clip_min_max.hpp @@ -0,0 +1,294 @@ +// +// pac_clip_min_max: Approximate min/max with per-level uint8_t extremes + distinct bitmaps +// Two-sided (unsigned pos/neg), NUM_LEVELS sized for input type width +// +#ifndef PAC_CLIP_MIN_MAX_HPP +#define PAC_CLIP_MIN_MAX_HPP + +#include "duckdb.hpp" +#include "pac_clip_aggr.hpp" +#include "pac_min_max.hpp" // for UpdateExtremesSIMD + +namespace duckdb { + +void RegisterPacClipMinFunctions(ExtensionLoader &loader); +void RegisterPacClipMaxFunctions(ExtensionLoader &loader); +void RegisterPacNoisedClipMinFunctions(ExtensionLoader &loader); +void 
RegisterPacNoisedClipMaxFunctions(ExtensionLoader &loader); + +// ============================================================================ +// Min/max-specific constants (shared constants in pac_clip_aggr.hpp) +// ============================================================================ +constexpr int PCMM_SWAR = 8; // 8 × uint64_t = 64 × uint8_t extremes (SWAR packed) +constexpr int PCMM_ELEMENTS = 9; // 8 SWAR + 1 bitmap + +// ============================================================================ +// PacClipMinMaxIntState: core state with uint8_t extremes per level +// Unsigned: stores absolute values only. Caller routes negatives to a +// separate state with !IS_MAX (two-sided approach, same as clip_sum). +// NUM_LEVELS: 30 for ≤64-bit types, 62 for 128-bit types +// ============================================================================ +template +struct PacClipMinMaxIntState { + static constexpr int INLINE_THRESHOLD = NUM_LEVELS - PCMM_ELEMENTS; + + uint64_t key_hash; + uint64_t update_count; + int8_t max_level_used; // -1 if none + int8_t inline_level_idx; // which level uses inline, -1 if none + uint8_t level_bounds[NUM_LEVELS]; // BOUNDOPT: worst-of-64 per level for early skip + + // Level pointers with inline optimization: last PCMM_ELEMENTS slots + // overlap with one inline level buffer, saving one arena allocation. 
+ union { + uint64_t *levels[NUM_LEVELS]; + struct { + uint64_t *_ptrs[INLINE_THRESHOLD]; + uint64_t inline_level[PCMM_ELEMENTS]; + }; + }; + + // ======================================================================== + // GetLevel: route value to lowest level where shifted value fits in uint8_t [0,255] + // ======================================================================== + static inline int GetLevel(uint64_t abs_val) { + if (abs_val < 256) { + return 0; + } + int bit_pos = 63 - pac_clzll(abs_val); + return std::min((bit_pos - 4) >> 1, NUM_LEVELS - 1); + } + + static inline int GetLevel128(uint64_t upper, uint64_t lower) { + if (upper == 0) { + return GetLevel(lower); + } + int bit_pos = 127 - pac_clzll(upper); + return std::min((bit_pos - 4) >> 1, NUM_LEVELS - 1); + } + + // ======================================================================== + // Level allocation + // ======================================================================== + inline void AllocateLevel(ArenaAllocator &allocator, int k) { + if (k >= INLINE_THRESHOLD && inline_level_idx >= 0) { + // Evict inline level to arena + auto *ext = reinterpret_cast(allocator.Allocate(PCMM_ELEMENTS * sizeof(uint64_t))); + memcpy(ext, inline_level, PCMM_ELEMENTS * sizeof(uint64_t)); + levels[inline_level_idx] = ext; + inline_level_idx = -1; + memset(inline_level, 0, PCMM_ELEMENTS * sizeof(uint64_t)); + } + uint64_t *buf; + if (k < INLINE_THRESHOLD && inline_level_idx < 0) { + buf = inline_level; + inline_level_idx = static_cast(k); + } else { + buf = reinterpret_cast(allocator.Allocate(PCMM_ELEMENTS * sizeof(uint64_t))); + } + // Unsigned uint8_t: IS_MAX init to 0x00 (worst max=0), IS_MIN init to 0xFF (worst min=255) + if (IS_MAX) { + memset(buf, 0x00, PCMM_SWAR * sizeof(uint64_t)); + } else { + memset(buf, 0xFF, PCMM_SWAR * sizeof(uint64_t)); + } + buf[PCMM_SWAR] = 0; // bitmap starts empty + levels[k] = buf; + level_bounds[k] = IS_MAX ? 
0 : UINT8_MAX; // init bound to worst case + } + + inline void EnsureLevelAllocated(ArenaAllocator &allocator, int k) { + if (DUCKDB_LIKELY(k <= max_level_used)) { + return; + } + for (int i = max_level_used + 1; i <= k; i++) { + AllocateLevel(allocator, i); + } + max_level_used = static_cast(k); + } + + // ======================================================================== + // BOUNDOPT: recompute worst-of-64 bound for level k + // ======================================================================== + void RecomputeBound(int k) { + auto *extremes = reinterpret_cast(levels[k]); + uint8_t worst = extremes[0]; + for (int i = 1; i < 64; i++) { + worst = PAC_WORSE(worst, extremes[i]); + } + level_bounds[k] = worst; + } + + // ======================================================================== + // UpdateExtreme: reuse the SIMD kernel from pac_min_max.hpp + // uint8_t: SHIFTS=8, MASK=0x0101..., SIGNED=false, FLOAT=false + // ======================================================================== + inline void UpdateExtreme(uint64_t *buf, uint8_t shifted_val, uint64_t kh) { + auto *extremes = reinterpret_cast(buf); + UpdateExtremesSIMD(extremes, kh, + shifted_val); + } + + // ======================================================================== + // GetTotals: non-mutating finalization — reconstruct unsigned extremes + // Uses same boundary logic as clip_sum (shared helpers in pac_clip_aggr.hpp) + // ======================================================================== + void GetTotals(PAC_FLOAT *dst, int clip_support_threshold = 0, bool clip_scale = false) const { + memset(dst, 0, 64 * sizeof(PAC_FLOAT)); + + // Pass 1: find first and last supported levels + int first_supported = -1, last_supported = -1; + if (clip_support_threshold > 0) { + ClipFindSupportedRange(levels, max_level_used, PCMM_SWAR, clip_support_threshold, first_supported, + last_supported); + } + + // Pass 2: accumulate extremes + for (int k = 0; k <= max_level_used; k++) { + if 
(!levels[k]) { + continue; + } + + int eff = + (clip_support_threshold > 0) ? ClipEffectiveLevel(k, first_supported, last_supported, clip_scale) : k; + if (eff < 0) { + continue; + } + + PAC_FLOAT scale = std::exp2(static_cast(CLIP_LEVEL_SHIFT * eff)); + auto *extremes = reinterpret_cast(levels[k]); + + // Undo SWAR interleaving from UpdateExtremesSIMD (uint8_t: ELEMS=8, SHIFTS=8) + for (int bit = 0; bit < 64; bit++) { + int swar_pos = (bit % 8) * 8 + bit / 8; + PAC_FLOAT reconstructed = static_cast(extremes[swar_pos]) * scale; + if (IS_MAX) { + if (reconstructed > dst[bit]) { + dst[bit] = reconstructed; + } + } else { + if (reconstructed < dst[bit] || dst[bit] == 0) { + dst[bit] = reconstructed; + } + } + } + } + } + + // ======================================================================== + // CombineFrom: merge another state into this one + // ======================================================================== + void CombineFrom(PacClipMinMaxIntState *src, ArenaAllocator &allocator) { + if (!src) { + return; + } + key_hash |= src->key_hash; + update_count += src->update_count; + + for (int k = 0; k <= src->max_level_used; k++) { + if (!src->levels[k]) { + continue; + } + + if (k > max_level_used || !levels[k]) { + EnsureLevelAllocated(allocator, k); + // Steal or copy src level + if (k != src->inline_level_idx) { + levels[k] = src->levels[k]; + src->levels[k] = nullptr; + } else { + memcpy(levels[k], src->levels[k], PCMM_ELEMENTS * sizeof(uint64_t)); + } + continue; + } + + // Both have this level: merge extremes element-wise + auto *dst_ext = reinterpret_cast(levels[k]); + auto *src_ext = reinterpret_cast(src->levels[k]); + for (int j = 0; j < 64; j++) { + if (IS_MAX) { + if (src_ext[j] > dst_ext[j]) { + dst_ext[j] = src_ext[j]; + } + } else { + if (src_ext[j] < dst_ext[j]) { + dst_ext[j] = src_ext[j]; + } + } + } + // OR bitmaps + levels[k][PCMM_SWAR] |= src->levels[k][PCMM_SWAR]; + } + } + + PacClipMinMaxIntState *GetState() { + return this; + } + 
PacClipMinMaxIntState *EnsureState(ArenaAllocator &) { + return this; + } +}; + +// ============================================================================ +// PacClipMinMaxStateWrapper: buffering wrapper with two-sided pos/neg +// neg_state uses !IS_MAX (opposite direction on absolute values) +// NUM_LEVELS: 30 for ≤64-bit types, 62 for 128-bit types +// ============================================================================ +template +struct PacClipMinMaxStateWrapper { + using State = PacClipMinMaxIntState; + using NegState = PacClipMinMaxIntState; + static constexpr int BUF_SIZE = 2; + static constexpr uint64_t BUF_MASK = 3ULL; + + int64_t val_buf[BUF_SIZE]; + uint64_t hash_buf[BUF_SIZE]; + union { + uint64_t n_buffered; // lower 2 bits: count, upper bits: state pointer + State *state; + }; + NegState *neg_state; // separate state for negatives (stores absolute values, opposite direction) + + State *GetState() const { + return reinterpret_cast(reinterpret_cast(state) & ~7ULL); + } + + State *EnsureState(ArenaAllocator &a) { + State *s = GetState(); + if (!s) { + s = reinterpret_cast(a.Allocate(sizeof(State))); + memset(s, 0, sizeof(State)); + s->max_level_used = -1; + s->inline_level_idx = -1; + state = s; + } + return s; + } + + NegState *GetNegState() const { + return neg_state; + } + + NegState *EnsureNegState(ArenaAllocator &a) { + if (!neg_state) { + neg_state = reinterpret_cast(a.Allocate(sizeof(NegState))); + memset(neg_state, 0, sizeof(NegState)); + neg_state->max_level_used = -1; + neg_state->inline_level_idx = -1; + } + return neg_state; + } + + // For unsigned types, neg_state is never used — report smaller size + template + static idx_t StateSize() { + if (!SIGNED) { + return sizeof(PacClipMinMaxStateWrapper) - sizeof(NegState *); + } + return sizeof(PacClipMinMaxStateWrapper); + } +}; + +} // namespace duckdb + +#endif // PAC_CLIP_MIN_MAX_HPP diff --git a/src/include/aggregates/pac_clip_sum.hpp b/src/include/aggregates/pac_clip_sum.hpp new 
file mode 100644 index 00000000..2eb43b7c --- /dev/null +++ b/src/include/aggregates/pac_clip_sum.hpp @@ -0,0 +1,371 @@ +// +// pac_clip_sum: Approximate sum with per-level overflow + distinct bitmaps +// Always: buffered, approximate, two-sided (unsigned pos/neg), 31 levels covering 128-bit +// +#ifndef PAC_CLIP_SUM_HPP +#define PAC_CLIP_SUM_HPP + +#include "duckdb.hpp" +#include "pac_clip_aggr.hpp" +#include + +namespace duckdb { + +void RegisterPacClipSumFunctions(ExtensionLoader &loader); +void RegisterPacNoisedClipSumFunctions(ExtensionLoader &loader); +void RegisterPacNoisedClipSumCountFunctions(ExtensionLoader &loader); + +// ============================================================================ +// Sum-specific constants (shared constants in pac_clip_aggr.hpp) +// ============================================================================ +constexpr int CLIP_NORMAL_SWAR = 16; // 16 x uint64_t = 64 x uint16_t SWAR counters +constexpr int CLIP_NORMAL_ELEMENTS = 18; // 16 SWAR + 1 packed ptr/ec + 1 bitmap +constexpr int CLIP_OVERFLOW_SWAR = 32; // 32 x uint64_t = 64 x uint32_t SWAR counters +constexpr int CLIP_OVERFLOW_ELEMENTS = 33; // 32 SWAR + 1 exact_count +constexpr uint64_t CLIP_SWAR_MASK_16 = 0x0001000100010001ULL; + +// ============================================================================ +// Packed pointer + exact_count helpers +// Normal level[16] stores: upper 16 bits = exact_count, lower 48 bits = overflow pointer +// ============================================================================ +static inline uint64_t *Pac2GetOverflowPtr(uint64_t packed) { + return reinterpret_cast(packed & 0x0000FFFFFFFFFFFFULL); +} +static inline uint16_t Pac2GetExactCount(uint64_t packed) { + return static_cast(packed >> 48); +} +static inline void Pac2SetExactCount(uint64_t &packed, uint16_t count) { + packed = (packed & 0x0000FFFFFFFFFFFFULL) | (static_cast(count) << 48); +} +static inline void Pac2SetOverflowPtr(uint64_t &packed, uint64_t *ptr) { 
+ packed = (packed & 0xFFFF000000000000ULL) | (reinterpret_cast(ptr) & 0x0000FFFFFFFFFFFFULL); +} + +// ============================================================================ +// SWAR kernel — identical to pac_sum's AddToTotalsSWAR for uint16_t +// ============================================================================ +AUTOVECTORIZE static inline void Pac2AddToTotalsSWAR16(uint64_t *PAC_RESTRICT total, uint64_t value, + uint64_t key_hash) { + uint64_t val_packed = static_cast(value) * CLIP_SWAR_MASK_16; + for (int i = 0; i < 16; i++) { + uint64_t bits = (key_hash >> i) & CLIP_SWAR_MASK_16; + uint64_t expanded = (bits << 16) - bits; + total[i] += val_packed & expanded; + } +} + +// ============================================================================ +// PacClipSumIntState — core state for one unsigned accumulator +// NUM_LEVELS: 30 for ≤64-bit types, 62 for 128-bit types +// ============================================================================ +template +struct PacClipSumIntState { + static constexpr int INLINE_THRESHOLD = NUM_LEVELS - CLIP_NORMAL_ELEMENTS; + + uint64_t key_hash; + uint64_t update_count; + int8_t max_level_used; // -1 if none + int8_t inline_level_idx; // which level uses inline, -1 if none + + // Level pointers with inline optimization: last CLIP_NORMAL_ELEMENTS slots + // overlap with one inline level buffer, saving one arena allocation. 
+ union { + uint64_t *levels[NUM_LEVELS]; + struct { + uint64_t *_ptrs[INLINE_THRESHOLD]; + uint64_t inline_level[CLIP_NORMAL_ELEMENTS]; + }; + }; + + // ======================================================================== + // GetLevel: route value to lowest level where shifted value fits in 8 bits + // ======================================================================== + static inline int GetLevel(uint64_t abs_val) { + if (abs_val < 256) { + return 0; + } + int bit_pos = 63 - pac_clzll(abs_val); + return std::min((bit_pos - 4) >> 1, NUM_LEVELS - 1); + } + + // For 128-bit (hugeint) values — clamps to max level for very large values + static inline int GetLevel128(uint64_t upper, uint64_t lower) { + if (upper == 0) { + return GetLevel(lower); + } + int bit_pos = 127 - pac_clzll(upper); + return std::min((bit_pos - 4) >> 1, NUM_LEVELS - 1); + } + + // ======================================================================== + // Level allocation + // ======================================================================== + inline void AllocateLevel(ArenaAllocator &allocator, int k) { + if (k >= INLINE_THRESHOLD && inline_level_idx >= 0) { + // Evict inline level to arena + auto *ext = reinterpret_cast(allocator.Allocate(CLIP_NORMAL_ELEMENTS * sizeof(uint64_t))); + memcpy(ext, inline_level, CLIP_NORMAL_ELEMENTS * sizeof(uint64_t)); + levels[inline_level_idx] = ext; + inline_level_idx = -1; + memset(inline_level, 0, CLIP_NORMAL_ELEMENTS * sizeof(uint64_t)); + } + if (k < INLINE_THRESHOLD && inline_level_idx < 0) { + // Use inline storage + levels[k] = inline_level; + memset(inline_level, 0, CLIP_NORMAL_ELEMENTS * sizeof(uint64_t)); + inline_level_idx = static_cast(k); + } else { + auto *buf = reinterpret_cast(allocator.Allocate(CLIP_NORMAL_ELEMENTS * sizeof(uint64_t))); + memset(buf, 0, CLIP_NORMAL_ELEMENTS * sizeof(uint64_t)); + levels[k] = buf; + } + } + + inline void EnsureLevelAllocated(ArenaAllocator &allocator, int k) { + if (DUCKDB_LIKELY(k <= 
max_level_used)) { + return; + } + for (int i = max_level_used + 1; i <= k; i++) { + AllocateLevel(allocator, i); + } + max_level_used = static_cast(k); + } + + // ======================================================================== + // CascadeTop4: extract top 4 bits of 16-bit SWAR → add to 32-bit overflow + // ======================================================================== + void CascadeTop4(uint64_t *normal_buf, ArenaAllocator &allocator) { + // 1. Ensure overflow level allocated + uint64_t *overflow = Pac2GetOverflowPtr(normal_buf[16]); + if (!overflow) { + overflow = reinterpret_cast(allocator.Allocate(CLIP_OVERFLOW_ELEMENTS * sizeof(uint64_t))); + memset(overflow, 0, CLIP_OVERFLOW_ELEMENTS * sizeof(uint64_t)); + Pac2SetOverflowPtr(normal_buf[16], overflow); + } + + // 2. Extract top 4 bits of each 16-bit counter → add to 32-bit overflow + // SWAR 16-bit element i holds bit positions: i, i+16, i+32, i+48 + // SWAR 32-bit element i holds: i, i+32; element i+16 holds: i+16, i+48 + for (int i = 0; i < 16; i++) { + uint64_t swar = normal_buf[i]; + uint64_t top4 = (swar >> 12) & 0x000F000F000F000FULL; + normal_buf[i] = swar & 0x0FFF0FFF0FFF0FFFULL; + + auto *t = reinterpret_cast(&top4); + auto *o1 = reinterpret_cast(&overflow[i]); // bits i, i+32 + auto *o2 = reinterpret_cast(&overflow[i + 16]); // bits i+16, i+48 + o1[0] += t[0]; // bit i + o1[1] += t[2]; // bit i+32 + o2[0] += t[1]; // bit i+16 + o2[1] += t[3]; // bit i+48 + } + + // 3. 
Cascade exact_count top 4 bits + uint16_t ec = Pac2GetExactCount(normal_buf[16]); + auto *overflow_ec = reinterpret_cast(&overflow[32]); + *overflow_ec += (ec >> 12); + Pac2SetExactCount(normal_buf[16], ec & 0x0FFF); + } + + // ======================================================================== + // AddValue: overflow-aware exact_count update + // ======================================================================== + inline void AddToExactCount(uint64_t *normal_buf, uint16_t shifted_val, ArenaAllocator &allocator) { + uint16_t ec = Pac2GetExactCount(normal_buf[16]); + uint32_t new_ec = static_cast(ec) + shifted_val; + if (DUCKDB_UNLIKELY(new_ec > 0xFFFF)) { + CascadeTop4(normal_buf, allocator); + ec = Pac2GetExactCount(normal_buf[16]); // now ≤ 0x0FFF + new_ec = static_cast(ec) + shifted_val; + } + Pac2SetExactCount(normal_buf[16], static_cast(new_ec)); + } + + // ======================================================================== + // GetTotals: non-mutating finalization — sums all levels + // clip_support_threshold: levels with fewer estimated distinct PUs are clipped + // clip_scale: false=omit unsupported prefix/suffix, true=scale to nearest supported + // Interior unsupported levels (between first and last supported) always contribute. + // ======================================================================== + void GetTotals(PAC_FLOAT *dst, int clip_support_threshold = 0, bool clip_scale = false) const { + memset(dst, 0, 64 * sizeof(PAC_FLOAT)); + + // Pass 1: find first and last supported levels + int first_supported = -1, last_supported = -1; + if (clip_support_threshold > 0) { + ClipFindSupportedRange(levels, max_level_used, 17, clip_support_threshold, first_supported, last_supported); + } + + // Pass 2: accumulate contributions + for (int k = 0; k <= max_level_used; k++) { + if (!levels[k]) { + continue; + } + + int eff = + (clip_support_threshold > 0) ? 
ClipEffectiveLevel(k, first_supported, last_supported, clip_scale) : k; + if (eff < 0) { + continue; + } + + PAC_FLOAT scale = std::exp2(static_cast(CLIP_LEVEL_SHIFT * eff)); + + // Add normal 16-bit SWAR contribution + auto *counters = reinterpret_cast(levels[k]); + for (int j = 0; j < 64; j++) { + int swar_idx = (j % 16) * 4 + (j / 16); + dst[j] += static_cast(counters[swar_idx]) * scale; + } + + // Add overflow 32-bit SWAR contribution (scaled by 2^12 relative to normal) + uint64_t *overflow = Pac2GetOverflowPtr(levels[k][16]); + if (overflow) { + PAC_FLOAT overflow_scale = scale * std::exp2(static_cast(12)); + auto *ocounters = reinterpret_cast(overflow); + for (int j = 0; j < 64; j++) { + int swar_idx = (j % 32) * 2 + (j / 32); + dst[j] += static_cast(ocounters[swar_idx]) * overflow_scale; + } + } + } + } + + // ======================================================================== + // CombineFrom: merge another state into this one + // ======================================================================== + void CombineFrom(PacClipSumIntState *src, ArenaAllocator &allocator) { + if (!src) { + return; + } + key_hash |= src->key_hash; + update_count += src->update_count; + + for (int k = 0; k <= src->max_level_used; k++) { + if (!src->levels[k]) { + continue; + } + + // If dst doesn't have this level: steal src's pointer + if (k > max_level_used || !levels[k]) { + EnsureLevelAllocated(allocator, k); // ensures max_level_used >= k, allocates if needed + // If we just allocated a fresh level, steal src's data over it + if (k != src->inline_level_idx) { + // src level is arena-allocated, can steal + levels[k] = src->levels[k]; + src->levels[k] = nullptr; + } else { + // src is using inline — copy instead + memcpy(levels[k], src->levels[k], CLIP_NORMAL_ELEMENTS * sizeof(uint64_t)); + } + continue; + } + + // Both have this level: merge + // Add SWAR counters + for (int i = 0; i < CLIP_NORMAL_SWAR; i++) { + levels[k][i] += src->levels[k][i]; + } + // OR bitmaps 
+ levels[k][17] |= src->levels[k][17]; + + // Merge exact_counts (check overflow) + uint16_t dst_ec = Pac2GetExactCount(levels[k][16]); + uint16_t src_ec = Pac2GetExactCount(src->levels[k][16]); + uint32_t sum_ec = static_cast(dst_ec) + src_ec; + if (sum_ec > 0xFFFF) { + CascadeTop4(levels[k], allocator); + } + dst_ec = Pac2GetExactCount(levels[k][16]); + Pac2SetExactCount(levels[k][16], dst_ec + src_ec); + + // Merge overflow levels + uint64_t *src_overflow = Pac2GetOverflowPtr(src->levels[k][16]); + uint64_t *dst_overflow = Pac2GetOverflowPtr(levels[k][16]); + if (src_overflow && !dst_overflow) { + // Steal overflow from src + Pac2SetOverflowPtr(levels[k][16], src_overflow); + Pac2SetOverflowPtr(src->levels[k][16], nullptr); + } else if (src_overflow && dst_overflow) { + // Merge overflow SWAR counters + for (int i = 0; i < CLIP_OVERFLOW_SWAR; i++) { + dst_overflow[i] += src_overflow[i]; + } + // Merge overflow exact_counts + auto *dec = reinterpret_cast(&dst_overflow[32]); + auto *sec = reinterpret_cast(&src_overflow[32]); + *dec += *sec; + } + } + } + + // Interface methods + PacClipSumIntState *GetState() { + return this; + } + PacClipSumIntState *EnsureState(ArenaAllocator &) { + return this; + } +}; + +// ============================================================================ +// PacClipSumStateWrapper: buffering wrapper with two-sided pos/neg +// NUM_LEVELS: 30 for ≤64-bit types, 62 for 128-bit types +// ============================================================================ +template +struct PacClipSumStateWrapper { + using State = PacClipSumIntState; + using Value = uint64_t; + static constexpr int BUF_SIZE = 2; + static constexpr uint64_t BUF_MASK = 3ULL; + + uint64_t val_buf[BUF_SIZE]; + uint64_t hash_buf[BUF_SIZE]; + union { + uint64_t n_buffered; // lower 2 bits: count, upper bits: state pointer + State *state; + }; + State *neg_state; // separate state for negatives (stores absolute values) + + State *GetState() const { + return 
reinterpret_cast(reinterpret_cast(state) & ~7ULL); + } + + State *EnsureState(ArenaAllocator &a) { + State *s = GetState(); + if (!s) { + s = reinterpret_cast(a.Allocate(sizeof(State))); + memset(s, 0, sizeof(State)); + s->max_level_used = -1; + s->inline_level_idx = -1; + state = s; + } + return s; + } + + State *GetNegState() const { + return neg_state; + } + + State *EnsureNegState(ArenaAllocator &a) { + if (!neg_state) { + neg_state = reinterpret_cast(a.Allocate(sizeof(State))); + memset(neg_state, 0, sizeof(State)); + neg_state->max_level_used = -1; + neg_state->inline_level_idx = -1; + } + return neg_state; + } + + // For unsigned types, neg_state is never used — report smaller size + template + static idx_t StateSize() { + if (!SIGNED) { + return sizeof(PacClipSumStateWrapper) - sizeof(State *); + } + return sizeof(PacClipSumStateWrapper); + } +}; + +} // namespace duckdb + +#endif // PAC_CLIP_SUM_HPP diff --git a/src/include/aggregates/pac_count.hpp b/src/include/aggregates/pac_count.hpp index ea7069da..aa0f34d0 100644 --- a/src/include/aggregates/pac_count.hpp +++ b/src/include/aggregates/pac_count.hpp @@ -25,6 +25,8 @@ namespace duckdb { void RegisterPacCountFunctions(ExtensionLoader &); void RegisterPacCountCountersFunctions(ExtensionLoader &); void RegisterPacAvgFunctions(ExtensionLoader &); +void RegisterPacNoisedClipCountFunctions(ExtensionLoader &); +void RegisterPacClipCountFunctions(ExtensionLoader &); // PAC_COUNT(key_hash) implements a COUNT aggregate that for each privacy-unit (identified by a key_hash) // computes 64 independent counts, where each independent count randomly (50% chance) includes a PU or not. 
diff --git a/src/include/aggregates/pac_min_max.hpp b/src/include/aggregates/pac_min_max.hpp index e6cf177e..a825f503 100644 --- a/src/include/aggregates/pac_min_max.hpp +++ b/src/include/aggregates/pac_min_max.hpp @@ -32,6 +32,10 @@ void RegisterPacMinFunctions(ExtensionLoader &loader); void RegisterPacMaxFunctions(ExtensionLoader &loader); void RegisterPacMinCountersFunctions(ExtensionLoader &loader); void RegisterPacMaxCountersFunctions(ExtensionLoader &loader); +void RegisterPacNoisedClipMinFunctions(ExtensionLoader &loader); +void RegisterPacNoisedClipMaxFunctions(ExtensionLoader &loader); +void RegisterPacClipMinFunctions(ExtensionLoader &loader); +void RegisterPacClipMaxFunctions(ExtensionLoader &loader); // ============================================================================ // PAC_MIN/PAC_MAX(hash_key, value) aggregate functions diff --git a/src/include/query_processing/pac_expression_builder.hpp b/src/include/query_processing/pac_expression_builder.hpp index 168fe183..d1ad9c48 100644 --- a/src/include/query_processing/pac_expression_builder.hpp +++ b/src/include/query_processing/pac_expression_builder.hpp @@ -10,6 +10,7 @@ #include "duckdb/planner/operator/logical_get.hpp" #include "duckdb/planner/operator/logical_aggregate.hpp" #include "duckdb/planner/operator/logical_cteref.hpp" +#include "metadata/pac_compatibility_check.hpp" namespace duckdb { @@ -67,6 +68,13 @@ void ModifyAggregatesWithPacFunctions(OptimizerExtensionInput &input, LogicalAgg unique_ptr &hash_input_expr, unique_ptr &plan, double correction = 1.0); +// Rewrite PAC aggregates to use clipping variants when pac_clip_support is set. +// Inserts a lower aggregate with plain DuckDB aggregates (GROUP BY groups + PU hash), +// and rewrites the top aggregate to use pac_noised_clip_* / pac_clip_* functions. +// Skips insertion if child already groups by PU key (Q13 exception). 
+void RewriteClipAggregates(OptimizerExtensionInput &input, unique_ptr &plan, + const PACCompatibilityResult &check, const vector &privacy_units); + } // namespace duckdb #endif // PAC_EXPRESSION_BUILDER_HPP diff --git a/src/include/query_processing/pac_plan_traversal.hpp b/src/include/query_processing/pac_plan_traversal.hpp index fbb778aa..a4be99e3 100644 --- a/src/include/query_processing/pac_plan_traversal.hpp +++ b/src/include/query_processing/pac_plan_traversal.hpp @@ -115,6 +115,9 @@ vector FilterTargetAggregatesWithPUKeyCheck(const vector &privacy_units); +// Find the first aggregate in a subtree (depth-first). +LogicalAggregate *FindFirstChildAggregate(LogicalOperator *op); + // Check if a target node is inside a DELIM_JOIN's subquery branch (children[1]). // This is important for correlated subqueries where nodes in the subquery branch // cannot directly access tables from the outer query. diff --git a/src/query_processing/pac_expression_builder.cpp b/src/query_processing/pac_expression_builder.cpp index c1c2740e..1cf09243 100644 --- a/src/query_processing/pac_expression_builder.cpp +++ b/src/query_processing/pac_expression_builder.cpp @@ -24,6 +24,7 @@ #include "duckdb/planner/operator/logical_comparison_join.hpp" #include "duckdb/planner/operator/logical_cross_product.hpp" #include "utils/pac_helpers.hpp" +#include "categorical/pac_categorical_detection.hpp" namespace duckdb { @@ -444,7 +445,7 @@ unique_ptr BindBitOrAggregate(OptimizerExtensionInput &input, unique } // Map aggregate function name to PAC function name -static string GetPacAggregateFunctionName(const string &function_name) { +static string GetPacAggregateFunctionName(const string &function_name, ClientContext *ctx = nullptr) { string pac_function_name; if (function_name == "sum" || function_name == "sum_no_overflow") { pac_function_name = "pac_noised_sum"; @@ -521,7 +522,7 @@ static void InsertDistinctPreAggregation(OptimizerExtensionInput &input, Logical for (idx_t i = 0; i < 
agg->expressions.size(); i++) { auto &old_aggr = agg->expressions[i]->Cast(); string function_name = old_aggr.function.name; - string pac_name = GetPacAggregateFunctionName(function_name); + string pac_name = GetPacAggregateFunctionName(function_name, &input.context); auto hash_ref = make_uniq(LogicalType::UBIGINT, combined_hash_binding); unique_ptr value_ref; @@ -594,7 +595,7 @@ BuildDistinctBranch(OptimizerExtensionInput &input, unique_ptr vector> outer_expressions; for (auto &spec : agg_specs) { - string pac_name = GetPacAggregateFunctionName(spec.second); + string pac_name = GetPacAggregateFunctionName(spec.second, &input.context); auto hash_ref = make_uniq(LogicalType::UBIGINT, combined_hash_binding); unique_ptr value_ref; if (spec.second == "count" || spec.second == "count_star") { @@ -633,7 +634,7 @@ static unique_ptr BuildNonDistinctBranch( for (auto &spec : agg_specs) { auto &old_aggr = *spec.second; string function_name = old_aggr.function.name; - string pac_name = GetPacAggregateFunctionName(function_name); + string pac_name = GetPacAggregateFunctionName(function_name, &input.context); unique_ptr value_child; if (old_aggr.children.empty()) { @@ -1110,7 +1111,7 @@ void ModifyAggregatesWithPacFunctions(OptimizerExtensionInput &input, LogicalAgg value_child = old_aggr.children[0]->Copy(); } - string pac_function_name = GetPacAggregateFunctionName(function_name); + string pac_function_name = GetPacAggregateFunctionName(function_name, &input.context); unique_ptr correction_expr; if (correction != 1.0) { correction_expr = make_uniq_base(Value::DOUBLE(correction)); @@ -1122,4 +1123,261 @@ void ModifyAggregatesWithPacFunctions(OptimizerExtensionInput &input, LogicalAgg agg->ResolveOperatorTypes(); } +// ============================================================================ +// Clip aggregate rewrite: pac_noised_* → pac_noised_clip_* / pac_clip_* +// with optional lower aggregate insertion for per-PU pre-aggregation +// 
============================================================================ + +// Map pac function names to their clip variants +static string GetClipVariant(const string &name) { + if (name == "pac_noised_sum") { + return "pac_noised_clip_sum"; + } + if (name == "pac_noised_count") { + return "pac_noised_clip_count"; + } + if (name == "pac_noised_min") { + return "pac_noised_clip_min"; + } + if (name == "pac_noised_max") { + return "pac_noised_clip_max"; + } + if (name == "pac_sum") { + return "pac_clip_sum"; + } + if (name == "pac_count") { + return "pac_clip_count"; + } + if (name == "pac_min") { + return "pac_clip_min"; + } + if (name == "pac_max") { + return "pac_clip_max"; + } + return ""; // not a pac aggregate +} + +// Map pac function names to their original DuckDB aggregate +static string GetOriginalAggregate(const string &name) { + if (name == "pac_noised_sum" || name == "pac_sum") { + return "sum"; + } + if (name == "pac_noised_count" || name == "pac_count") { + return "count"; + } + if (name == "pac_noised_min" || name == "pac_min") { + return "min"; + } + if (name == "pac_noised_max" || name == "pac_max") { + return "max"; + } + return ""; +} + +// Is this a noised (scalar) variant? 
If so, top aggregate uses pac_noised_clip_* +static bool IsNoisedVariant(const string &name) { + return name.find("pac_noised_") == 0; +} + +// Bind a plain DuckDB aggregate function (sum, count, min, max) +static unique_ptr BindPlainAggregate(OptimizerExtensionInput &input, const string &func_name, + vector> children) { + FunctionBinder function_binder(input.context); + ErrorData error; + vector arg_types; + for (auto &child : children) { + arg_types.push_back(child->return_type); + } + auto &entry = Catalog::GetSystemCatalog(input.context) + .GetEntry(input.context, DEFAULT_SCHEMA, func_name); + auto best = function_binder.BindFunction(entry.name, entry.functions, arg_types, error); + if (!best.IsValid()) { + throw InternalException("PAC clip rewrite: failed to bind " + func_name); + } + auto func = entry.functions.GetFunctionByOffset(best.GetIndex()); + return function_binder.BindAggregateFunction(func, std::move(children), nullptr, AggregateType::NON_DISTINCT); +} + +// Check if an aggregate contains pac_noised_* or pac_* (counters) expressions +static bool IsPacAggregate(LogicalAggregate *agg) { + for (auto &expr : agg->expressions) { + if (expr->GetExpressionClass() != ExpressionClass::BOUND_AGGREGATE) { + continue; + } + auto &aggr = expr->Cast(); + if (!GetClipVariant(aggr.function.name).empty()) { + return true; + } + } + return false; +} + +void RewriteClipAggregates(OptimizerExtensionInput &input, unique_ptr &plan, + const PACCompatibilityResult &check, const vector &privacy_units) { + // Find all aggregate nodes + vector all_aggregates; + FindAllAggregates(plan, all_aggregates); + + for (auto *agg : all_aggregates) { + if (!IsPacAggregate(agg)) { + continue; + } + + // Check Q13 exception: does the child aggregate already group by PU key? 
+ bool child_groups_by_pu = false; + for (auto &child : agg->children) { + auto *inner_agg = FindFirstChildAggregate(child.get()); + if (inner_agg && AggregateGroupsByPUKey(inner_agg, check, privacy_units)) { + child_groups_by_pu = true; + break; + } + } + + if (child_groups_by_pu) { + // Q13 exception: just rename pac_noised_* → pac_noised_clip_* in place + for (idx_t i = 0; i < agg->expressions.size(); i++) { + if (agg->expressions[i]->GetExpressionClass() != ExpressionClass::BOUND_AGGREGATE) { + continue; + } + auto &aggr = agg->expressions[i]->Cast(); + string clip_name = GetClipVariant(aggr.function.name); + if (clip_name.empty()) { + continue; + } + // Rebind with the clip variant name + vector> children; + for (auto &child : aggr.children) { + children.push_back(child->Copy()); + } + agg->expressions[i] = RebindAggregate(input.context, clip_name, std::move(children), false); + } + agg->ResolveOperatorTypes(); + continue; + } + + // Normal path: insert lower aggregate + auto &binder = input.optimizer.binder; + idx_t lower_group_index = binder.GenerateTableIndex(); + idx_t lower_agg_index = binder.GenerateTableIndex(); + + // Identify the PU hash expression from the first pac aggregate's first child (hash arg) + unique_ptr pu_hash_expr; + for (auto &expr : agg->expressions) { + if (expr->GetExpressionClass() != ExpressionClass::BOUND_AGGREGATE) { + continue; + } + auto &aggr = expr->Cast(); + if (!GetClipVariant(aggr.function.name).empty() && !aggr.children.empty()) { + pu_hash_expr = aggr.children[0]->Copy(); + break; + } + } + if (!pu_hash_expr) { + continue; // shouldn't happen + } + + idx_t num_original_groups = agg->groups.size(); + + // Build lower aggregate expressions (plain DuckDB aggregates) + vector> lower_expressions; + for (idx_t i = 0; i < agg->expressions.size(); i++) { + auto &aggr = agg->expressions[i]->Cast(); + string orig_name = GetOriginalAggregate(aggr.function.name); + if (orig_name.empty()) { + throw InternalException("PAC clip rewrite: 
unexpected aggregate " + aggr.function.name); + } + + vector> plain_children; + if (orig_name == "count" && (aggr.children.size() <= 1)) { + // pac_noised_count(hash) or pac_count(hash) → count_star() + // pac_noised_count(hash, col) → count(col) — but children[1] might be constant 1 + if (aggr.children.size() >= 2) { + auto &val_child = aggr.children[1]; + // Check if it's a constant 1 (from count_star rewrite) + if (val_child->type == ExpressionType::VALUE_CONSTANT) { + auto &const_expr = val_child->Cast(); + if (const_expr.value.IsNull() || const_expr.value == Value::BIGINT(1)) { + // count_star — no children + } else { + plain_children.push_back(val_child->Copy()); + } + } else { + plain_children.push_back(val_child->Copy()); + } + } + lower_expressions.push_back(BindPlainAggregate(input, "count_star", std::move(plain_children))); + } else if (orig_name == "count" && aggr.children.size() > 1) { + // count with column reference + plain_children.push_back(aggr.children[1]->Copy()); + lower_expressions.push_back(BindPlainAggregate(input, "count", std::move(plain_children))); + } else { + // sum, min, max — extract the value child (children[1]) + if (aggr.children.size() >= 2) { + plain_children.push_back(aggr.children[1]->Copy()); + } + lower_expressions.push_back(BindPlainAggregate(input, orig_name, std::move(plain_children))); + } + } + + // Create lower aggregate node + auto lower_agg = make_uniq(lower_group_index, lower_agg_index, std::move(lower_expressions)); + + // Copy original groups + add PU hash as extra group + for (auto &g : agg->groups) { + lower_agg->groups.push_back(g->Copy()); + } + lower_agg->groups.push_back(pu_hash_expr->Copy()); + + // Steal top's child → lower's child + lower_agg->children.push_back(std::move(agg->children[0])); + lower_agg->ResolveOperatorTypes(); + + // Rewrite top aggregate's groups to reference lower's group output + for (idx_t i = 0; i < num_original_groups; i++) { + auto gtype = agg->groups[i]->return_type; + 
agg->groups[i] = make_uniq(gtype, ColumnBinding(lower_group_index, i)); + } + + // PU hash ref from lower's group output + auto pu_hash_ref = make_uniq(pu_hash_expr->return_type, + ColumnBinding(lower_group_index, num_original_groups)); + + // Rewrite top aggregate's expressions to clip variants + for (idx_t i = 0; i < agg->expressions.size(); i++) { + auto &aggr = agg->expressions[i]->Cast(); + string pac_name = aggr.function.name; + bool noised = IsNoisedVariant(pac_name); + string orig = GetOriginalAggregate(pac_name); + + // Reference to lower aggregate's result + auto lower_type = lower_agg->types[num_original_groups + 1 + i]; + unique_ptr lower_ref = + make_uniq(lower_type, ColumnBinding(lower_agg_index, i)); + + // pac_clip_sum has integer + DECIMAL overloads but no FLOAT/DOUBLE. + // Cast FLOAT/DOUBLE to BIGINT so binding succeeds. + if ((orig == "sum" || orig == "count") && + (lower_type.id() == LogicalTypeId::FLOAT || lower_type.id() == LogicalTypeId::DOUBLE)) { + lower_ref = + BoundCastExpression::AddCastToType(input.context, std::move(lower_ref), LogicalType::BIGINT); + } + + // count → sumcount (preserves BIGINT return type), others → clip variant + string clip_func; + if (orig == "count") { + clip_func = noised ? "pac_noised_clip_sumcount" : "pac_clip_sum"; + } else { + clip_func = GetClipVariant(pac_name); + } + + agg->expressions[i] = + BindPacAggregate(input, clip_func, pu_hash_ref->Copy(), std::move(lower_ref), nullptr); + } + + // Set lower as top's child + agg->children[0] = std::move(lower_agg); + agg->ResolveOperatorTypes(); + } +} + } // namespace duckdb diff --git a/src/query_processing/pac_plan_traversal.cpp b/src/query_processing/pac_plan_traversal.cpp index 24bec2ac..fbbb3113 100644 --- a/src/query_processing/pac_plan_traversal.cpp +++ b/src/query_processing/pac_plan_traversal.cpp @@ -812,7 +812,7 @@ bool AggregateGroupsByPUKey(LogicalAggregate *agg, const PACCompatibilityResult } // Find the first aggregate in a subtree (depth-first). 
-static LogicalAggregate *FindFirstChildAggregate(LogicalOperator *op) { +LogicalAggregate *FindFirstChildAggregate(LogicalOperator *op) { if (!op) { return nullptr; } diff --git a/test/sql/pac_clip_min_max.test b/test/sql/pac_clip_min_max.test new file mode 100644 index 00000000..c8284047 --- /dev/null +++ b/test/sql/pac_clip_min_max.test @@ -0,0 +1,359 @@ +# name: test/sql/pac_clip_min_max.test +# description: Test pac_clip_min/max with level-based clipping +# group: [sql] + +require pac + +statement ok +PRAGMA clear_pac_metadata; + +statement ok +SET pac_seed = 42 + +statement ok +SET threads = 1 + +statement ok +SET pac_mi = 0 + +# ============================================================================ +# Basic type checks +# ============================================================================ + +query I +SELECT typeof(pac_clip_max(hash(i)::UBIGINT, i::INTEGER)) FROM range(1, 101) t(i) +---- +FLOAT[] + +query I +SELECT typeof(pac_noised_clip_max(hash(i)::UBIGINT, i::INTEGER)) FROM range(1, 101) t(i) +---- +BIGINT + +query I +SELECT typeof(pac_clip_min(hash(i)::UBIGINT, i::INTEGER)) FROM range(1, 101) t(i) +---- +FLOAT[] + +query I +SELECT typeof(pac_noised_clip_min(hash(i)::UBIGINT, i::INTEGER)) FROM range(1, 101) t(i) +---- +BIGINT + +query I +SELECT typeof(pac_noised_clip_max(hash(i)::UBIGINT, (i*0.5)::FLOAT)) FROM range(1, 101) t(i) +---- +FLOAT + +query I +SELECT typeof(pac_noised_clip_min(hash(i)::UBIGINT, (i*0.5)::DOUBLE)) FROM range(1, 101) t(i) +---- +DOUBLE + +# ============================================================================ +# MAX outlier clipping: 1000 normal rows (1-100), 1 outlier at 1000000 +# Without clipping the outlier should dominate (~999424) +# With clipping the outlier level has 1 contributor → zeroed out, max → ~100 +# ============================================================================ + +statement ok +CREATE TABLE max_outlier AS +SELECT i as id, + CASE WHEN i <= 1000 THEN (i % 100) + 1 + ELSE 1000000 + 
END as value +FROM range(1, 1002) t(i) + +statement ok +SET pac_clip_support = NULL + +query I +SELECT pac_noised_clip_max(hash(id)::UBIGINT, value) FROM max_outlier +---- +999424 + +statement ok +SET pac_clip_support = 5 + +query I +SELECT pac_noised_clip_max(hash(id)::UBIGINT, value) FROM max_outlier +---- +100 + +# ============================================================================ +# MIN outlier clipping: 1000 normal rows (10-60), 1 negative outlier at -999999 +# Without clipping: min ~ -1015808 (approximate -999999) +# With clipping: negative outlier zeroed, min → ~10 +# ============================================================================ + +statement ok +CREATE TABLE min_outlier AS +SELECT i as id, + CASE WHEN i <= 1000 THEN (i % 50) + 10 + ELSE -999999 + END as value +FROM range(1, 1002) t(i) + +statement ok +SET pac_clip_support = NULL + +query I +SELECT pac_noised_clip_min(hash(id)::UBIGINT, value) FROM min_outlier +---- +-999424 + +statement ok +SET pac_clip_support = 5 + +query I +SELECT pac_noised_clip_min(hash(id)::UBIGINT, value) FROM min_outlier +---- +10 + +# ============================================================================ +# Same-level values should NOT be clipped +# Level 0 covers [0, 255], so 50 and 120 are in the same level +# ============================================================================ + +statement ok +CREATE TABLE same_level AS +SELECT i as id, + CASE WHEN i <= 1000 THEN 50 + ELSE 120 + END as value +FROM range(1, 1002) t(i) + +statement ok +SET pac_clip_support = NULL + +query I +SELECT pac_noised_clip_max(hash(id)::UBIGINT, value) FROM same_level +---- +120 + +# With clipping: 120 is same level as 50, NOT clipped +statement ok +SET pac_clip_support = 5 + +query I +SELECT pac_noised_clip_max(hash(id)::UBIGINT, value) FROM same_level +---- +120 + +# ============================================================================ +# Cross-level outlier: value at level 3 (4096+) among level 0 values 
(1-100) +# ============================================================================ + +statement ok +CREATE TABLE cross_level AS +SELECT i as id, + CASE WHEN i <= 1000 THEN (i % 100) + 1 + ELSE 5000 + END as value +FROM range(1, 1002) t(i) + +statement ok +SET pac_clip_support = 5 + +query I +SELECT pac_noised_clip_max(hash(id)::UBIGINT, value) < 1000 FROM cross_level +---- +true + +# ============================================================================ +# Grouped aggregation +# ============================================================================ + +statement ok +SET pac_clip_support = NULL + +query I +SELECT count(*) FROM ( + SELECT id % 3 AS grp, pac_noised_clip_max(hash(id)::UBIGINT, value) AS m + FROM max_outlier GROUP BY grp +) t WHERE m IS NOT NULL +---- +3 + +query I +SELECT count(*) FROM ( + SELECT id % 3 AS grp, pac_noised_clip_min(hash(id)::UBIGINT, value) AS m + FROM min_outlier GROUP BY grp +) t WHERE m IS NOT NULL +---- +3 + +# ============================================================================ +# FLOAT max clipping +# ============================================================================ + +statement ok +CREATE TABLE float_mm AS +SELECT i as id, + CASE WHEN i <= 1000 THEN ((i % 100) * 0.5 + 0.5)::FLOAT + ELSE 50000.0::FLOAT + END as value +FROM range(1, 1002) t(i) + +statement ok +SET pac_clip_support = NULL + +query I +SELECT pac_noised_clip_max(hash(id)::UBIGINT, value) > 10000.0 FROM float_mm +---- +true + +statement ok +SET pac_clip_support = 5 + +query I +SELECT pac_noised_clip_max(hash(id)::UBIGINT, value) < 200.0 FROM float_mm +---- +true + +# ============================================================================ +# DOUBLE min clipping +# ============================================================================ + +statement ok +CREATE TABLE double_mm AS +SELECT i as id, + CASE WHEN i <= 1000 THEN ((i % 100) * 0.01 + 1.0)::DOUBLE + ELSE -99999.99::DOUBLE + END as value +FROM range(1, 1002) t(i) + 
+statement ok +SET pac_clip_support = NULL + +query I +SELECT pac_noised_clip_min(hash(id)::UBIGINT, value) < -50000.0 FROM double_mm +---- +true + +statement ok +SET pac_clip_support = 5 + +query I +SELECT pac_noised_clip_min(hash(id)::UBIGINT, value) BETWEEN 0.0 AND 5.0 FROM double_mm +---- +true + +# ============================================================================ +# NULL handling +# ============================================================================ + +statement ok +SET pac_clip_support = NULL + +query I +SELECT pac_noised_clip_max(hash(id)::UBIGINT, CASE WHEN id % 2 = 0 THEN value ELSE NULL END) IS NOT NULL +FROM max_outlier +---- +true + +# ============================================================================ +# Different integer types +# ============================================================================ + +query I +SELECT pac_noised_clip_max(hash(i)::UBIGINT, i::SMALLINT) BETWEEN 50 AND 128 FROM range(1, 100) t(i) +---- +true + +query I +SELECT pac_noised_clip_max(hash(i)::UBIGINT, i::BIGINT) BETWEEN 50 AND 128 FROM range(1, 100) t(i) +---- +true + +query I +SELECT pac_noised_clip_min(hash(i)::UBIGINT, i::TINYINT) BETWEEN 1 AND 10 FROM range(1, 100) t(i) +---- +true + +query I +SELECT pac_noised_clip_max(hash(i)::UBIGINT, i::UTINYINT) BETWEEN 50 AND 255 FROM range(1, 100) t(i) +---- +true + +query I +SELECT pac_noised_clip_min(hash(i)::UBIGINT, (i + 10)::UINTEGER) BETWEEN 1 AND 20 FROM range(1, 100) t(i) +---- +true + +# ============================================================================ +# Two-sided: negative-only values +# ============================================================================ + +# MAX of all-negative values: should be close to -1 (the largest) +query I +SELECT pac_noised_clip_max(hash(i)::UBIGINT, -i) BETWEEN -10 AND 0 FROM range(1, 1001) t(i) +---- +true + +# MIN of all-negative values: should be close to -1000 (the smallest) +query I +SELECT pac_noised_clip_min(hash(i)::UBIGINT, -i) 
BETWEEN -1100 AND -900 FROM range(1, 1001) t(i) +---- +true + +# ============================================================================ +# Two-sided: mixed positive and negative values +# ============================================================================ + +# MAX of mixed: positives 1..500, negatives -1..-500 → max should be ~500 +query I +SELECT pac_noised_clip_max(hash(i)::UBIGINT, + CASE WHEN i <= 500 THEN i ELSE -(i - 500) END +) BETWEEN 400 AND 600 FROM range(1, 1001) t(i) +---- +true + +# MIN of mixed: positives 1..500, negatives -1..-500 → min should be ~-500 +query I +SELECT pac_noised_clip_min(hash(i)::UBIGINT, + CASE WHEN i <= 500 THEN i ELSE -(i - 500) END +) BETWEEN -600 AND -400 FROM range(1, 1001) t(i) +---- +true + +# ============================================================================ +# Two-sided: positive-only should not allocate neg state (just verify correctness) +# ============================================================================ + +query I +SELECT pac_noised_clip_max(hash(i)::UBIGINT, i) BETWEEN 50 AND 128 FROM range(1, 100) t(i) +---- +true + +query I +SELECT pac_noised_clip_min(hash(i)::UBIGINT, i) BETWEEN 1 AND 10 FROM range(1, 100) t(i) +---- +true + +# ============================================================================ +# Two-sided: negative outlier clipped from mixed data +# ============================================================================ + +statement ok +CREATE TABLE neg_clip AS +SELECT i as id, + CASE WHEN i <= 1000 THEN 50 + (i % 20) + ELSE -999999 + END as value +FROM range(1, 1002) t(i) + +# Without clip: min dominated by outlier +query I +SELECT pac_noised_clip_min(hash(id)::UBIGINT, value) < -500000 FROM neg_clip +---- +true + +# With clip: outlier removed, min should be close to 50 +statement ok +SET pac_clip_support = 5 + +query I +SELECT pac_noised_clip_min(hash(id)::UBIGINT, value) BETWEEN 40 AND 70 FROM neg_clip +---- +true diff --git a/test/sql/pac_clip_sum.test 
b/test/sql/pac_clip_sum.test new file mode 100644 index 00000000..de1c2e1a --- /dev/null +++ b/test/sql/pac_clip_sum.test @@ -0,0 +1,485 @@ +# name: test/sql/pac_clip_sum.test +# description: Test pac_clip_sum and pac_noised_clip_sum aggregate functions with clipping +# group: [sql] + +require pac + +statement ok +PRAGMA clear_pac_metadata; + +statement ok +SET pac_seed = 42 + +statement ok +SET threads = 1 + +statement ok +SET pac_mi = 0 + +# ============================================================================ +# Basic pac_clip_sum correctness (returns LIST) +# ============================================================================ + +statement ok +CREATE TABLE test_data AS +SELECT i AS rowid, i % 3 AS grp, (i % 100) AS value +FROM range(4000) t(i) + +# pac_clip_sum returns LIST (64 counters) +query I +SELECT typeof(pac_clip_sum(hash(rowid)::UBIGINT, value::INTEGER)) FROM test_data +---- +FLOAT[] + +# pac_noised_clip_sum returns HUGEINT (fused noised scalar) +query I +SELECT typeof(pac_noised_clip_sum(hash(rowid)::UBIGINT, value::INTEGER)) FROM test_data +---- +HUGEINT + +# Works with different types +query I +SELECT pac_noised_clip_sum(hash(rowid)::UBIGINT, value::BIGINT) IS NOT NULL FROM test_data +---- +true + +query I +SELECT pac_noised_clip_sum(hash(rowid)::UBIGINT, value::SMALLINT) IS NOT NULL FROM test_data +---- +true + +# Grouped aggregation +query I +SELECT count(*) FROM ( + SELECT grp, pac_noised_clip_sum(hash(rowid)::UBIGINT, value::INTEGER) as s + FROM test_data GROUP BY grp +) t WHERE s IS NOT NULL +---- +3 + +# NULL handling +query I +SELECT pac_noised_clip_sum(hash(rowid)::UBIGINT, CASE WHEN rowid % 2 = 0 THEN value ELSE NULL END) IS NOT NULL +FROM test_data +---- +true + +# ============================================================================ +# Clipping: outlier elimination via pac_noised_clip_sum +# ============================================================================ + +# Create data with 1000 normal rows and 1 huge 
outlier +statement ok +CREATE TABLE outlier_test AS +SELECT i as id, + CASE WHEN i <= 1000 THEN (i % 10) + 1 + ELSE 1000000 + END as value +FROM range(1, 1002) t(i) + +# Without clip_support: result includes the outlier (expect ~1M+ range) +query I +SELECT pac_noised_clip_sum(hash(id)::UBIGINT, value) > 100000 FROM outlier_test +---- +true + +# With clip_support=5: outlier level has only 1 contributor, gets clipped +# Result should be close to sum without outlier = 5500 +statement ok +SET pac_clip_support = 5 + +query I +SELECT pac_noised_clip_sum(hash(id)::UBIGINT, value) < 100000 FROM outlier_test +---- +true + +# Verify the clipped result is in the right ballpark (within 5x of 5500) +query I +SELECT abs(pac_noised_clip_sum(hash(id)::UBIGINT, value) - 5500) < 5500 * 5 FROM outlier_test +---- +true + +# Hard-zero: outlier level is unsupported and contributes nothing, +# so result should be >= no-outlier baseline (equal or greater due to noise) +query I +SELECT (SELECT pac_noised_clip_sum(hash(id)::UBIGINT, value) FROM outlier_test) + >= (SELECT pac_noised_clip_sum(hash(i)::UBIGINT, (i % 10) + 1) FROM range(1, 1001) t(i)) +---- +true + +# ============================================================================ +# Clipping with grouped data: outlier in one group, normal in another +# ============================================================================ + +statement ok +CREATE TABLE grouped_outlier AS +SELECT i as id, i % 2 as grp, + CASE WHEN i <= 1000 THEN (i % 10) + 1 + WHEN i = 1001 THEN 10000000 -- massive outlier in group 1 + ELSE (i % 10) + 1 + END as value +FROM range(1, 1003) t(i) + +# Group 0 (even ids): ~500 normal values, no outlier +# Group 1 (odd ids): ~500 normal values + one 10M outlier +# With clip_support: the outlier's level should be clipped in group 1 +query IT +SELECT grp, pac_noised_clip_sum(hash(id)::UBIGINT, value) < 100000 as reasonable +FROM grouped_outlier +GROUP BY grp +ORDER BY grp +---- +0 true +1 true + +# 
============================================================================ +# Clipping with negative values (two-sided) +# ============================================================================ + +statement ok +CREATE TABLE neg_outlier AS +SELECT i as id, + CASE WHEN i <= 1000 THEN i - 500 -- normal: -499 to 500 + ELSE -5000000 -- single negative outlier + END as value +FROM range(1, 1002) t(i) + +# Without clip_support: outlier drags result very negative +statement ok +SET pac_clip_support = NULL + +query I +SELECT pac_noised_clip_sum(hash(id)::UBIGINT, value) < -100000 FROM neg_outlier +---- +true + +# With clip_support: outlier clipped, result near 500 (sum of 1..500 - sum of 1..499 = 500) +statement ok +SET pac_clip_support = 5 + +query I +SELECT pac_noised_clip_sum(hash(id)::UBIGINT, value) > -100000 FROM neg_outlier +---- +true + +# ============================================================================ +# HUGEINT support +# ============================================================================ + +statement ok +SET pac_clip_support = NULL + +query I +SELECT pac_noised_clip_sum(hash(i)::UBIGINT, i::HUGEINT) IS NOT NULL FROM range(1, 101) t(i) +---- +true + +# ============================================================================ +# Clip synonyms: pac_clip_count, pac_clip_min, pac_clip_max exist +# ============================================================================ + +query I +SELECT typeof(pac_clip_count(hash(i)::UBIGINT)) FROM range(1, 101) t(i) +---- +FLOAT[] + +query I +SELECT typeof(pac_noised_clip_count(hash(i)::UBIGINT)) FROM range(1, 101) t(i) +---- +BIGINT + +# ============================================================================ +# pac_clip_support setting via compiler (sum → pac_noised_clip_sum) +# ============================================================================ + +statement ok +SET pac_clip_support = 64 + +statement ok +CREATE TABLE compiler_test (id INTEGER, value INTEGER) + +statement ok +ALTER 
TABLE compiler_test ADD PAC_KEY (id) + +statement ok +ALTER TABLE compiler_test SET PU + +statement ok +INSERT INTO compiler_test SELECT i, i % 100 FROM range(1, 1001) t(i) + +# When pac_clip_support is set, regular SUM should go through clip rewrite +query I +SELECT sum(value) IS NOT NULL FROM compiler_test +---- +true + +# ============================================================================ +# Level boundary tests (4x bands with shift=2) +# Values at exact level boundaries should be correctly routed +# ============================================================================ + +statement ok +SET pac_clip_support = 5 + +# Level 0: 0-255, Level 1: 256-1023, Level 2: 1024-4095, Level 3: 4096-16383 +# 1000 rows at value=100 (level 0), 1 outlier at 4096 (level 3) → different level → clipped +statement ok +CREATE TABLE boundary_test AS +SELECT i as id, + CASE WHEN i <= 1000 THEN 100 + ELSE 4096 + END as value +FROM range(1, 1002) t(i) + +query I +SELECT pac_noised_clip_sum(hash(id)::UBIGINT, value) < 200000 FROM boundary_test +---- +true + +# Outlier at 255 (max level 0) — same level as value=100 → NOT clipped +statement ok +CREATE TABLE same_level_test AS +SELECT i as id, + CASE WHEN i <= 1000 THEN 100 + ELSE 255 + END as value +FROM range(1, 1002) t(i) + +# Without clipping: sum = 1000*100 + 255 = 100255 +statement ok +SET pac_clip_support = NULL + +query I +SELECT abs(pac_noised_clip_sum(hash(id)::UBIGINT, value) - 100255) < 50000 FROM same_level_test +---- +true + +# With clipping: 255 is same level as 100, so it's NOT clipped — result similar +statement ok +SET pac_clip_support = 5 + +query I +SELECT abs(pac_noised_clip_sum(hash(id)::UBIGINT, value) - 100255) < 50000 FROM same_level_test +---- +true + +# ============================================================================ +# HUGEINT outlier clipping (levels 30+, beyond int64 range) +# ============================================================================ + +statement ok +SET 
pac_clip_support = 5 + +# Normal values + one HUGEINT outlier at 2^70 +statement ok +CREATE TABLE hugeint_outlier AS +SELECT i as id, + CASE WHEN i <= 1000 THEN i::HUGEINT + ELSE (1::HUGEINT << 70) + END as value +FROM range(1, 1002) t(i) + +# Without clip_support: result includes the huge outlier +statement ok +SET pac_clip_support = NULL + +query I +SELECT pac_noised_clip_sum(hash(id)::UBIGINT, value) > 1000000000000 FROM hugeint_outlier +---- +true + +# With clip_support: outlier at high level gets hard-zeroed +statement ok +SET pac_clip_support = 5 + +query I +SELECT pac_noised_clip_sum(hash(id)::UBIGINT, value) < 1000000000000 FROM hugeint_outlier +---- +true + +# ============================================================================ +# Negative HUGEINT outlier +# ============================================================================ + +statement ok +CREATE TABLE neg_hugeint_outlier AS +SELECT i as id, + CASE WHEN i <= 1000 THEN (i * 100)::HUGEINT + ELSE -(1::HUGEINT << 70) + END as value +FROM range(1, 1002) t(i) + +# Without clip_support: huge negative outlier dominates +statement ok +SET pac_clip_support = NULL + +query I +SELECT pac_noised_clip_sum(hash(id)::UBIGINT, value) < -1000000000000 FROM neg_hugeint_outlier +---- +true + +# With clip_support: negative outlier clipped via neg_state +statement ok +SET pac_clip_support = 5 + +query I +SELECT pac_noised_clip_sum(hash(id)::UBIGINT, value) > -1000000000000 FROM neg_hugeint_outlier +---- +true + +# ============================================================================ +# FLOAT support (fractional values preserved via 2^20 scaling) +# ============================================================================ + +statement ok +SET pac_clip_support = NULL + +statement ok +SET pac_mi = 0 + +# Type checks +query I +SELECT typeof(pac_clip_sum(hash(i)::UBIGINT, (i * 0.5)::FLOAT)) FROM range(1, 101) t(i) +---- +FLOAT[] + +query I +SELECT typeof(pac_noised_clip_sum(hash(i)::UBIGINT, (i * 
0.5)::FLOAT)) FROM range(1, 101) t(i) +---- +FLOAT + +# Basic FLOAT sum: 4000 rows each 0.5 → true sum = 2000 +# pac is approximate, but should be in the right ballpark +query I +SELECT abs(pac_noised_clip_sum(hash(i)::UBIGINT, 0.5::FLOAT) - 2000.0) < 1000.0 FROM range(1, 4001) t(i) +---- +true + +# FLOAT with clipping: outlier at 100000.0 among 1000 values of 1.5 +# True sum = 1000*1.5 + 100000 = 101500 +statement ok +CREATE TABLE float_outlier AS +SELECT i as id, + CASE WHEN i <= 1000 THEN 1.5::FLOAT + ELSE 100000.0::FLOAT + END as value +FROM range(1, 1002) t(i) + +# Without clip: outlier included → ~198039 +query I +SELECT pac_noised_clip_sum(hash(id)::UBIGINT, value)::INTEGER FROM float_outlier +---- +198039 + +# With clip: outlier removed → ~1431 (close to 1000*1.5=1500) +statement ok +SET pac_clip_support = 5 + +query I +SELECT pac_noised_clip_sum(hash(id)::UBIGINT, value)::INTEGER FROM float_outlier +---- +1431 + +# ============================================================================ +# DOUBLE support (fractional values preserved via 2^27 scaling) +# ============================================================================ + +statement ok +SET pac_clip_support = NULL + +# Type checks +query I +SELECT typeof(pac_clip_sum(hash(i)::UBIGINT, (i * 0.001)::DOUBLE)) FROM range(1, 101) t(i) +---- +FLOAT[] + +query I +SELECT typeof(pac_noised_clip_sum(hash(i)::UBIGINT, (i * 0.001)::DOUBLE)) FROM range(1, 101) t(i) +---- +DOUBLE + +# Basic DOUBLE sum: 4000 rows each 0.25 → true sum = 1000 +query I +SELECT abs(pac_noised_clip_sum(hash(i)::UBIGINT, 0.25::DOUBLE) - 1000.0) < 500.0 FROM range(1, 4001) t(i) +---- +true + +# DOUBLE with clipping: true sum = 1000*2.718 + 999999.99 = 1002717.99 +statement ok +CREATE TABLE double_outlier AS +SELECT i as id, + CASE WHEN i <= 1000 THEN 2.718::DOUBLE + ELSE 999999.99::DOUBLE + END as value +FROM range(1, 1002) t(i) + +# Without clip: outlier included → ~1968584 +statement ok +SET pac_clip_support = NULL + +query I 
+SELECT pac_noised_clip_sum(hash(id)::UBIGINT, value)::INTEGER FROM double_outlier +---- +1968584 + +# With clip: outlier removed → ~2504 (close to 1000*2.718=2718) +statement ok +SET pac_clip_support = 5 + +query I +SELECT pac_noised_clip_sum(hash(id)::UBIGINT, value)::INTEGER FROM double_outlier +---- +2504 + +# Grouped float/double aggregation +statement ok +SET pac_clip_support = NULL + +query I +SELECT count(*) FROM ( + SELECT id % 3 AS grp, pac_noised_clip_sum(hash(id)::UBIGINT, value) AS s + FROM float_outlier GROUP BY grp +) t WHERE s IS NOT NULL +---- +3 + +# ============================================================================ +# Over-clipping: pac_clip_support higher than group size → zero result +# ============================================================================ + +statement ok +CREATE TABLE tiny_group AS SELECT i as id, i * 10 as value FROM range(1, 4) t(i) + +# 3 rows, clip_support=10 → no level has 10 contributors → all zeroed +statement ok +SET pac_clip_support = 10 + +query I +SELECT pac_noised_clip_sum(hash(id)::UBIGINT, value) FROM tiny_group +---- +0 + +# ============================================================================ +# Multi-group with outlier in one group only +# ============================================================================ + +statement ok +SET pac_clip_support = 5 + +statement ok +CREATE TABLE multigroup AS +SELECT i as id, i % 4 as grp, + CASE WHEN i = 997 THEN 9999999 -- outlier in group 1 (997 % 4 = 1) + ELSE (i % 50) + 1 + END as value +FROM range(1, 1001) t(i) + +# Group 1 has the outlier — should be clipped to reasonable range +# Other groups are normal +query I +SELECT count(*) FROM ( + SELECT grp, pac_noised_clip_sum(hash(id)::UBIGINT, value) as s + FROM multigroup GROUP BY grp +) t WHERE abs(s) < 500000 +---- +4