From 58ecd0fdd086431bb17fd782144aeb6c1086f9df Mon Sep 17 00:00:00 2001 From: George Talbot Date: Thu, 12 Mar 2026 14:07:28 -0400 Subject: [PATCH 1/4] docs: add CLAUDE.md and docs/internals architecture documentation Ports CLAUDE.md (development guide, coding standards, known pitfalls) and the full docs/internals tree including ADRs, gap analyses, TLA+ specs, verification guides, style references, and compaction architecture. Co-Authored-By: Claude Opus 4.6 (1M context) --- CLAUDE.md | 417 +++++++++++ docs/internals/BENCHMARKING.md | 277 +++++++ docs/internals/RUST_STYLE.md | 401 +++++++++++ docs/internals/SIMULATION_FIRST_WORKFLOW.md | 223 ++++++ docs/internals/VERIFICATION.md | 345 +++++++++ docs/internals/VERIFICATION_STACK.md | 372 ++++++++++ docs/internals/adr/000-template.md | 83 +++ docs/internals/adr/001-parquet-data-model.md | 183 +++++ .../adr/002-sort-schema-parquet-splits.md | 196 +++++ .../003-time-windowed-sorted-compaction.md | 305 ++++++++ ...04-cloud-native-storage-characteristics.md | 113 +++ docs/internals/adr/EVOLUTION.md | 143 ++++ docs/internals/adr/README.md | 94 +++ docs/internals/adr/deviations/README.md | 108 +++ .../adr/gaps/001-no-parquet-compaction.md | 71 ++ .../adr/gaps/002-fixed-sort-schema.md | 112 +++ .../gaps/003-no-time-window-partitioning.md | 101 +++ .../adr/gaps/004-incomplete-split-metadata.md | 133 ++++ .../gaps/005-no-per-point-deduplication.md | 72 ++ .../gaps/006-no-independent-auto-scaling.md | 52 ++ .../adr/gaps/007-no-multi-level-caching.md | 54 ++ .../008-no-high-query-rate-optimization.md | 58 ++ .../009-no-leading-edge-prioritization.md | 57 ++ .../010-no-data-caching-or-query-affinity.md | 65 ++ docs/internals/adr/gaps/README.md | 118 +++ .../supplements/000-supplement-template.md | 64 ++ docs/internals/adr/supplements/README.md | 52 ++ docs/internals/compaction-architecture.md | 582 +++++++++++++++ .../phase-1-sorted-splits.md | 512 +++++++++++++ docs/internals/specs/tla/CLAUDE.md | 94 +++ 
docs/internals/specs/tla/ParquetDataModel.cfg | 34 + docs/internals/specs/tla/ParquetDataModel.tla | 336 +++++++++ .../specs/tla/ParquetDataModel_small.cfg | 28 + docs/internals/specs/tla/README.md | 247 +++++++ docs/internals/specs/tla/SortSchema.cfg | 24 + docs/internals/specs/tla/SortSchema.tla | 427 +++++++++++ docs/internals/specs/tla/SortSchema_small.cfg | 24 + .../specs/tla/TimeWindowedCompaction.cfg | 34 + .../specs/tla/TimeWindowedCompaction.tla | 674 ++++++++++++++++++ .../tla/TimeWindowedCompaction_small.cfg | 34 + .../internals/tantivy-parquet-architecture.md | 210 ++++++ 41 files changed, 7529 insertions(+) create mode 100644 CLAUDE.md create mode 100644 docs/internals/BENCHMARKING.md create mode 100644 docs/internals/RUST_STYLE.md create mode 100644 docs/internals/SIMULATION_FIRST_WORKFLOW.md create mode 100644 docs/internals/VERIFICATION.md create mode 100644 docs/internals/VERIFICATION_STACK.md create mode 100644 docs/internals/adr/000-template.md create mode 100644 docs/internals/adr/001-parquet-data-model.md create mode 100644 docs/internals/adr/002-sort-schema-parquet-splits.md create mode 100644 docs/internals/adr/003-time-windowed-sorted-compaction.md create mode 100644 docs/internals/adr/004-cloud-native-storage-characteristics.md create mode 100644 docs/internals/adr/EVOLUTION.md create mode 100644 docs/internals/adr/README.md create mode 100644 docs/internals/adr/deviations/README.md create mode 100644 docs/internals/adr/gaps/001-no-parquet-compaction.md create mode 100644 docs/internals/adr/gaps/002-fixed-sort-schema.md create mode 100644 docs/internals/adr/gaps/003-no-time-window-partitioning.md create mode 100644 docs/internals/adr/gaps/004-incomplete-split-metadata.md create mode 100644 docs/internals/adr/gaps/005-no-per-point-deduplication.md create mode 100644 docs/internals/adr/gaps/006-no-independent-auto-scaling.md create mode 100644 docs/internals/adr/gaps/007-no-multi-level-caching.md create mode 100644 
docs/internals/adr/gaps/008-no-high-query-rate-optimization.md create mode 100644 docs/internals/adr/gaps/009-no-leading-edge-prioritization.md create mode 100644 docs/internals/adr/gaps/010-no-data-caching-or-query-affinity.md create mode 100644 docs/internals/adr/gaps/README.md create mode 100644 docs/internals/adr/supplements/000-supplement-template.md create mode 100644 docs/internals/adr/supplements/README.md create mode 100644 docs/internals/compaction-architecture.md create mode 100644 docs/internals/locality-compaction/phase-1-sorted-splits.md create mode 100644 docs/internals/specs/tla/CLAUDE.md create mode 100644 docs/internals/specs/tla/ParquetDataModel.cfg create mode 100644 docs/internals/specs/tla/ParquetDataModel.tla create mode 100644 docs/internals/specs/tla/ParquetDataModel_small.cfg create mode 100644 docs/internals/specs/tla/README.md create mode 100644 docs/internals/specs/tla/SortSchema.cfg create mode 100644 docs/internals/specs/tla/SortSchema.tla create mode 100644 docs/internals/specs/tla/SortSchema_small.cfg create mode 100644 docs/internals/specs/tla/TimeWindowedCompaction.cfg create mode 100644 docs/internals/specs/tla/TimeWindowedCompaction.tla create mode 100644 docs/internals/specs/tla/TimeWindowedCompaction_small.cfg create mode 100644 docs/internals/tantivy-parquet-architecture.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000000..bf117b36d64 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,417 @@ +# Quickhouse-Pomsky Development Guide + +## Before Writing Any Code (Plan Mode) + +**MUST** follow this sequence before implementation: + +1. **Define the plan**: What are you doing and why? What invariants must hold? +2. **Check ADR/roadmap**: `docs/internals/adr/README.md` → find relevant supplement +3. **Read the spec**: If touching state machines or protocols, read `docs/internals/specs/tla/*.tla` +4. **Write tests first**: DST tests define correctness, write them before code +5. 
**Only then**: Start implementation + +## Core Policies + +- Execute given tasks fully, avoid TODOs or stubs. +- If TODOs or stubs are absolutely necessary, ensure user is made aware and they are recorded in any resulting plans, phases, or specs. +- Produce code and make decisions that are consistent across metrics, traces, and logs. Metrics is the current priority, then traces, then logs — but decisions should generalize to all three. +- Tests should be holistic: do not work around broken implementations by manipulating tests. + +## Known Pitfalls (Update When Claude Misbehaves) + +**Add rules here when Claude makes mistakes. This is a living document.** + +| Mistake | Correct Behavior | Bug Reference | +|---------|------------------|---------------| +| Adds mock/fallback implementations | Use real dependencies, no error masking | User preference | +| Claims feature works without integration test | Run through the actual REST/gRPC stack | CLAUDE.md policy | +| Uses workarounds to avoid proper setup | **NEVER** workaround — follow the rigorous path (clone deps, fix env, run real tests) | User policy | +| Bypasses production path in tests | **MUST** test through HTTP/gRPC, not internal APIs | CLAUDE.md policy | +| Uses `Path::exists()` | Disallowed by `clippy.toml` — use fallible alternatives | clippy.toml | +| Uses `Option::is_some_and`, `is_none_or`, `xor`, `map_or`, `map_or_else` | Disallowed by `clippy.toml` — use explicit match/if-let instead | clippy.toml | +| Ignores clippy warnings | Run `cargo clippy --workspace --all-features --tests`. 
Fix warnings or add targeted `#[allow()]` with justification | Code quality | +| Uses `debug_assert` for user-facing validation | Use `Result` errors — debug_assert is silent in release | Code quality | +| Uses `unwrap()` in library code | Use `?` operator or proper error types | Quickwit style | +| File over 500 lines | Split into focused modules by responsibility | Code quality | +| Unnecessary `.clone()` in non-concurrent code | Return `&self` references or `Arc` — cloning is OK in actor/async code for simplicity | Code quality | +| Raw String for new domain types | Prefer existing type aliases (`IndexId`, `SplitId`, `SourceId` from `quickwit-proto`) | Quickwit style | +| Shadowing variable names within a function | Avoid reusing the same variable name (see CODE_STYLE.md) | Quickwit style | +| Uses chained iterators with complex error handling | Use procedural for-loops when chaining hurts readability | Quickwit style | +| Uses `tokio::sync::Mutex` | **FORBIDDEN** — causes data corruption on cancel. Use actor model with message passing | GAP-002 | +| Uses `JoinHandle::abort()` | **FORBIDDEN** — arbitrary cancellation violates invariants. Use `CancellationToken` | GAP-002 | +| Recreates futures in `select!` loops | Use `&mut fut` to resume, not recreate — dropping loses data | GAP-002 | +| Holds locks across await points | Invariant violations on cancel. Use message passing or synchronous critical sections | GAP-002 | + +## What is Quickhouse-Pomsky? + +**Fork of [Quickwit](https://github.com/quickwit-oss/quickwit)** — a cloud-native search engine for observability. 
This is the DataDog fork, adding: + +- **Metrics engine** (`quickwit-metrics-engine`): DataFusion/Parquet-based analytics pipeline (current priority) +- **Remote API** (`quickwit-remote-api`): gRPC/REST interface for remote operations +- **Document transforms** (`quickwit-doc-transforms`): Preprocessing pipeline +- **CloudPrem UI**: Datadog-specific frontend +- **Tantivy + Parquet hybrid**: Full-text search via Tantivy, columnar analytics via Parquet + +**Signal priority**: Metrics first, then traces, then logs. Architectural decisions must generalize across all three. + +## Three Engineering Pillars + +Every code change **MUST** respect all three: + +| Pillar | Location | Purpose | +|--------|----------|---------| +| **Code Quality** | [CODE_STYLE.md](CODE_STYLE.md) + this doc | Coding standards & reliability | +| **Formal Specs** | `docs/internals/specs/tla/`, `stateright_*.rs` | Protocol correctness | +| **DST** | DST crate (when created) | Fault tolerance | + +**Priority**: Safety > Performance > Developer Experience + +## Reliability Rules + +```rust +// 1. Use debug_assert! to document invariants +// (Quickwit CODE_STYLE.md endorses this — helps reviewers proofread) +debug_assert!(offset >= HEADER_SIZE, "offset must include header"); +debug_assert!(splits.is_sorted_by_key(|s| s.time_range.end)); + +// 2. Validate inputs at API boundaries (Result, not debug_assert) +if duration.as_nanos() == 0 { + return Err(Error::InvalidParameter("duration must be positive")); +} + +// 3. Define explicit limits as constants +const MAX_SEGMENT_SIZE: usize = 256 * 1024 * 1024; +if size > MAX_SEGMENT_SIZE { + return Err(Error::LimitExceeded(...)); +} + +// 4. 
No unwrap() in library code — propagate errors +let timestamp = DateTime::from_timestamp(secs, nsecs) + .ok_or_else(|| anyhow!("invalid timestamp: {}", nanos))?; +``` + +## The Verification Pyramid + +All verification layers share the same invariants: + +``` + TLA+ Specs (docs/internals/specs/tla/*.tla) + │ mirrors + Shared Invariants (invariants/) ← SINGLE SOURCE + │ used by + ┌───────────────┼───────────────┐ + ▼ ▼ ▼ +Stateright DST Tests Production Metrics +(exhaustive) (simulation) (Datadog) +``` + +## Testing Through Production Path (CRITICAL) + +**MUST NOT** claim a feature works unless tested through the actual network stack. + +```bash +# 1. Start quickwit +cargo run -p quickwit-cli -- run --config ../config/quickwit.yaml + +# 2. Ingest via OTLP +# (send logs/traces to localhost:4317) + +# 3. Query via REST API +curl http://localhost:7280/api/v1/<index-id>/search -d '{"query": "*"}' +``` + +**Bypasses to AVOID**: Testing indexing pipeline without the HTTP/gRPC server, testing search without the REST API layer. + +## Coding Style + +Follow [CODE_STYLE.md](CODE_STYLE.md) — the primary style reference. 
Key points: + +- **Readability over cleverness**: Optimize for "proofreadability" +- **Naming**: Long descriptive names preferred; standard Rust snake_case +- **Explanatory variables**: Introduce intermediary variables to convey semantics +- **No shadowing**: Do not reuse variable names within a function +- **Early returns**: Prefer early return over nested `else` chains +- **Invariants as `debug_assert`**: Use assertions to help reviewers proofread +- **Hidden contracts**: Avoid them; use types or `Result`/`Option` to enforce constraints +- **Generics/macros sparingly**: Only where necessary; they hurt readability and compile time +- **Async code**: Must not block for more than 500 microseconds; use `tokio::task::spawn_blocking` if unsure +- **No silent error ignoring** (`let _ =` without justification) + +### Error and Log Messages + +- Concise, lowercase (except proper names), no trailing punctuation +- Use `tracing` structured logging over string interpolation: + ```rust + // GOOD + warn!(remaining = remaining_attempts, "rpc retry failed"); + // BAD + warn!("rpc retry failed ({remaining_attempts} attempts remaining)"); + ``` + +### Enforced Clippy Rules (`quickwit/clippy.toml`) + +These methods are **disallowed** and will fail CI: +- `std::path::Path::exists` — not sound (no `Result`) +- `Option::is_some_and`, `is_none_or`, `xor`, `map_or`, `map_or_else` — hurt readability + +## Repository Layout + +``` +quickhouse-pomsky/ +├── quickwit/ # Main Rust workspace (all crates live here) +│ ├── Cargo.toml # Workspace root +│ ├── Makefile # Inner build targets (fmt, fix, test-all, build) +│ ├── clippy.toml # Disallowed methods (enforced) +│ ├── rustfmt.toml # Nightly formatter config +│ ├── rust-toolchain.toml # Pinned to Rust 1.91 +│ └── rest-api-tests/ # Python-based REST API integration tests +├── docs/ +│ └── internals/ # All architecture docs +│ ├── adr/ # Architecture Decision Records +│ │ ├── README.md # ADR index +│ │ ├── gaps/ # Design limitations from incidents +│ 
│ └── deviations/ # Intentional divergences from ADR intent +│ └── specs/ +│ └── tla/ # TLA+ specs for protocols and state machines +├── config/ # Runtime YAML configs (quickwit.yaml, etc.) +├── scripts/ # DD-specific operational scripts +├── Makefile # Outer orchestration (docker, k8s, delegates to quickwit/) +├── docker-compose.yml # Local services (localstack, postgres, kafka, jaeger, etc.) +└── k8s/ # Kubernetes local dev (kind cluster) +``` + +## Crate Map + +``` +# Core services +quickwit-cli/ # Main binary entry point — start here for E2E +quickwit-serve/ # HTTP/gRPC server, REST API handlers +quickwit-cluster/ # Cluster membership (chitchat protocol) +quickwit-control-plane/ # Scheduling, shard management +quickwit-config/ # Configuration types and parsing + +# Data path +quickwit-ingest/ # Ingestion pipeline, WAL, sharding +quickwit-indexing/ # Indexing actors, merge/compaction +quickwit-search/ # Search execution, distributed search +quickwit-query/ # Query parsing and AST +quickwit-doc-mapper/ # Schema, field mappings, doc-to-term +quickwit-doc-transforms/ # [DD] Log/trace preprocessing + +# Storage & metadata +quickwit-metastore/ # Split metadata, index metadata +quickwit-storage/ # Object storage abstraction (S3, Azure, GCS, local) +quickwit-directories/ # Tantivy directory implementations + +# Protocols & APIs +quickwit-proto/ # Protobuf definitions, generated gRPC code +quickwit-opentelemetry/ # OTLP ingest (logs, traces) +quickwit-jaeger/ # Jaeger-compatible trace API +quickwit-rest-client/ # HTTP client for Quickwit API +quickwit-remote-api/ # [DD] Remote gRPC/REST interface + +# Metrics (DD additions) +quickwit-metrics-engine/ # DataFusion/Parquet metrics pipeline + +# Infrastructure +quickwit-actors/ # Actor framework (mailbox, supervisor) +quickwit-common/ # Shared utilities +quickwit-datetime/ # Date/time parsing and formatting +quickwit-macros/ # Proc macros +quickwit-codegen/ # Code generation utilities +quickwit-aws/ # AWS SDK helpers + +# 
Housekeeping +quickwit-janitor/ # GC, retention, delete tasks +quickwit-index-management/ # Index CRUD operations + +# Testing +quickwit-integration-tests/ # Rust integration tests +rest-api-tests/ # Python REST API tests (Elasticsearch compat) +``` + +## Architecture Evolution + +Quickhouse-Pomsky tracks architectural change through three lenses. See `docs/internals/adr/EVOLUTION.md` for the full process. + +``` + Architecture Evolution + │ + ┌────────────────────┼────────────────────┐ + ▼ ▼ ▼ + Characteristics Gaps Deviations + (Proactive) (Reactive) (Pragmatic) + "What we need" "What we learned" "What we accepted" +``` + +| Lens | Location | When to Use | +|------|----------|-------------| +| **Characteristics** | `docs/internals/adr/` | Track cloud-native requirements | +| **Gaps** | `docs/internals/adr/gaps/` | Design limitation from incident/production | +| **Deviations** | `docs/internals/adr/deviations/` | Intentional divergence from ADR intent | + +**Before implementing, check for**: +- Open gaps (design limitations to be aware of) +- Deviations (intentional divergence from ADRs) +- Characteristic status (what's implemented vs planned) + +## Common Commands + +All Rust commands run from the `quickwit/` subdirectory. 
+ +```bash +# Build +cd quickwit && cargo build + +# Run all tests (requires Docker services) +# From repo root: +make docker-compose-up +make test-all +# Or from quickwit/: +cargo nextest run --all-features --retries 5 + +# Run tests for a specific crate +cargo nextest run -p quickwit-indexing --all-features + +# Run failpoint tests +cargo nextest run --test failpoints --features fail/failpoints + +# Clippy (must pass before commit) +cargo clippy --workspace --all-features --tests + +# Format (requires nightly) +cargo +nightly fmt + +# Auto-fix clippy + format +make fix # from quickwit/ + +# Check license headers +bash scripts/check_license_headers.sh + +# Check log format +bash scripts/check_log_format.sh + +# Spellcheck (from repo root) +make typos # or: typos + +# REST API integration tests (Python) +cd quickwit/rest-api-tests +pipenv shell && pipenv install +./run_tests.py --engine quickwit + +# Metrics E2E tests (requires Docker infra) +# From quickwit/: +make docker-metrics-up +make test-metrics-e2e + +# Docker build +make docker-build # from repo root + +# Local k8s (kind) +make k8s-up # start cluster +make k8s-status # check status +make k8s-logs # follow logs +make k8s-down # tear down +``` + +## Testing Strategy + +### Unit Tests +- Run fast, avoid IO when possible +- Testing private functions is encouraged +- Property-based tests (`proptest`) are welcome — narrow the search space +- Not always deterministic — proptests are fine + +### Integration Tests +- `quickwit-integration-tests/`: Rust integration tests exercising the full stack +- `rest-api-tests/`: Python YAML-driven tests for Elasticsearch API compatibility +- Metrics E2E: `make test-metrics-e2e` against Docker Compose (Minio + Postgres) + +### DST (Deterministic Simulation Testing) +- DST tests define correctness for stateful components +- Write DST tests before implementation for new state machines +- Shared invariants are the single source of truth across all verification layers + +### 
Required for CI +- `cargo nextest run --all-features --retries 5` (with Docker services running) +- Failpoint tests: `cargo nextest run --test failpoints --features fail/failpoints` +- `RUST_MIN_STACK=67108864` is set for test runs (64MB stack) + +## Docker Services for Testing + +```bash +# Start all services (localstack, postgres, kafka, jaeger, etc.) +make docker-compose-up + +# Start specific services +make docker-compose-up DOCKER_SERVICES='jaeger,localstack' + +# Tear down +make docker-compose-down + +# Metrics-specific infra (Minio + Postgres) +cd quickwit && make docker-metrics-up +``` + +Environment variables set during test-all: +- `AWS_ACCESS_KEY_ID=ignored`, `AWS_SECRET_ACCESS_KEY=ignored` +- `QW_S3_ENDPOINT=http://localhost:4566` (localstack) +- `QW_S3_FORCE_PATH_STYLE_ACCESS=1` +- `QW_TEST_DATABASE_URL=postgres://quickwit-dev:quickwit-dev@localhost:5432/quickwit-metastore-dev` + +## Key Entry Points + +| Port | Protocol | Purpose | +|------|----------|---------| +| 7280 | HTTP | Quickwit REST API | +| 7281 | gRPC | Quickwit gRPC services | +| 4317 | gRPC | OTLP ingest | + +## Checklist Before Committing + +**MUST** (required for merge): +- [ ] `cargo clippy --workspace --all-features --tests` passes with no warnings +- [ ] `cargo +nightly fmt -- --check` passes +- [ ] `debug_assert!` for non-obvious invariants +- [ ] No `unwrap()` in library code +- [ ] No silent error ignoring (`let _ =`) +- [ ] New files under 500 lines (split by responsibility if larger) +- [ ] No unnecessary `.clone()` (OK in actor/async code for clarity) +- [ ] Tests through production path (HTTP/gRPC) +- [ ] License headers present (run `bash quickwit/scripts/check_license_headers.sh`) +- [ ] Log format correct (run `bash quickwit/scripts/check_log_format.sh`) +- [ ] `typos` passes (spellcheck) +- [ ] Tests pass: `cargo nextest run --all-features` + +**SHOULD** (expected unless justified): +- [ ] Functions under 70 lines +- [ ] Explanatory variables for complex expressions +- [ 
] Documentation explains "why" +- [ ] ADR/roadmap updated if applicable +- [ ] DST test for new state transitions +- [ ] Integration test for new API endpoints + +## Detailed Documentation + +| Topic | Location | +|-------|----------| +| Code style (Quickwit) | [CODE_STYLE.md](CODE_STYLE.md) | +| Rust style patterns | [docs/internals/RUST_STYLE.md](docs/internals/RUST_STYLE.md) | +| Verification & DST | [docs/internals/VERIFICATION.md](docs/internals/VERIFICATION.md) | +| Verification philosophy | [docs/internals/VERIFICATION_STACK.md](docs/internals/VERIFICATION_STACK.md) | +| Simulation workflow | [docs/internals/SIMULATION_FIRST_WORKFLOW.md](docs/internals/SIMULATION_FIRST_WORKFLOW.md) | +| Benchmarking | [docs/internals/BENCHMARKING.md](docs/internals/BENCHMARKING.md) | +| Contributing guide | [CONTRIBUTING.md](CONTRIBUTING.md) | +| ADR index | [docs/internals/adr/README.md](docs/internals/adr/README.md) | +| Architecture evolution | [docs/internals/adr/EVOLUTION.md](docs/internals/adr/EVOLUTION.md) | +| Compaction architecture | [docs/internals/compaction-architecture.md](docs/internals/compaction-architecture.md) | +| Tantivy + Parquet design | [docs/internals/tantivy-parquet-architecture.md](docs/internals/tantivy-parquet-architecture.md) | +| Locality compaction | [docs/internals/locality-compaction/](docs/internals/locality-compaction/) | +| Runtime config | [config/quickwit.yaml](config/quickwit.yaml) | + +## References + +- [Quickwit upstream](https://github.com/quickwit-oss/quickwit) +- [Tantivy search engine](https://github.com/quickwit-oss/tantivy) +- [Apache DataFusion](https://datafusion.apache.org/) +- [PomChi dependency](https://github.com/DataDog/PomChi) (private) diff --git a/docs/internals/BENCHMARKING.md b/docs/internals/BENCHMARKING.md new file mode 100644 index 00000000000..f2f73fde5d1 --- /dev/null +++ b/docs/internals/BENCHMARKING.md @@ -0,0 +1,277 @@ +# Quickhouse-Pomsky Benchmarking Guide + +## Philosophy + +**Always measure before and 
after optimizations.** + +Reference: [Jeff Dean & Sanjay Ghemawat Performance Hints](https://abseil.io/fast/hints.html) + +## Performance Verification Pyramid + +Similar to the correctness Verification Pyramid, performance is verified across layers: + +``` + Local Benchmarks (cargo bench) + │ defines thresholds + Shared Baselines (performance/) ← SINGLE SOURCE + │ verified by + ┌───────────────┼───────────────┐ + ▼ ▼ ▼ +Production Datadog APM Profiles +Metrics Metrics (eBPF/ddprof) +``` + +## Microbenchmarks + +### Metrics Engine (current priority) + +The metrics engine has the richest benchmark suite. All commands run from `quickwit/`. + +```bash +# Ingestion profiling — breaks down time per pipeline stage +# (IPC deserialization, batch accumulation, sorting, Parquet writing, metadata extraction) +cargo bench -p quickwit-metrics-engine --bench ingestion_profile_bench + +# Configure row count: +BENCH_ROWS=1000000 cargo bench -p quickwit-metrics-engine --bench ingestion_profile_bench + +# High cardinality — end-to-end with realistic 30M series patterns +# (Arrow batch generation, Parquet split creation, DataFusion queries) +cargo bench -p quickwit-metrics-engine --bench high_cardinality_bench + +# Scale to 30M series (requires ~32GB RAM): +BENCH_SERIES=30000000 cargo bench -p quickwit-metrics-engine --bench high_cardinality_bench + +# Sort optimization — compares sorting strategies for metrics data +# (full lexsort, reduced columns, row-group-only, pre-sorted skip) +cargo bench -p quickwit-metrics-engine --bench sort_optimization_bench + +# Sustained ingestion — simulates real-world sustained metric ingestion +# (30M series, 10s emit interval, 900s simulation, 2.7B data points at full scale) +cargo bench -p quickwit-metrics-engine --bench sustained_ingestion_bench +``` + +### OTLP Metrics (ingestion path) + +```bash +# OTLP protobuf parsing, query parsing, filter evaluation, aggregation construction +cargo bench -p quickwit-opentelemetry +``` + +### Other Crates + 
+```bash +# Document transforms (log/trace preprocessing) +cargo bench -p quickwit-doc-transforms + +# Document mapper (routing expressions, doc-to-JSON) +cargo bench -p quickwit-doc-mapper + +# Query (tokenizers, multilang tokenizers) +cargo bench -p quickwit-query + +# Common utilities (serialized JSON size) +cargo bench -p quickwit-common + +# Actor framework (mailbox throughput) +cargo bench -p quickwit-actors + +# Compare against baseline +cargo bench -p quickwit-metrics-engine -- --save-baseline before +# ... make changes ... +cargo bench -p quickwit-metrics-engine -- --baseline before +``` + +### Key Metrics + +| Benchmark | What It Measures | Crate | +|-----------|------------------|-------| +| `ingestion_profile_bench` | Per-stage pipeline latency | `quickwit-metrics-engine` | +| `high_cardinality_bench` | End-to-end ingestion + query at 30M series | `quickwit-metrics-engine` | +| `sort_optimization_bench` | Sorting strategy comparison (7-col vs reduced) | `quickwit-metrics-engine` | +| `sustained_ingestion_bench` | Sustained throughput over 900s simulation | `quickwit-metrics-engine` | +| `metrics_bench` | OTLP parsing + filter evaluation | `quickwit-opentelemetry` | +| `processors_bench` | Document transform throughput | `quickwit-doc-transforms` | +| `tokenizers_bench` | Tokenizer performance | `quickwit-query` | + +## End-to-End Benchmarks + +```bash +# Start quickwit +cargo run --release -p quickwit-cli -- run --config ../config/quickwit.yaml + +# Send metrics via OTLP gRPC (port 4317) +# Then query via REST API +curl http://localhost:7280/api/v1/<index-id>/search -d '{"query": "*"}' + +# Timing +time curl -s "http://localhost:7280/api/v1/<index-id>/search" -d '{"query": "*"}' > /dev/null +``` + +## Performance Baselines + +Performance baselines should be defined in a shared location and checked in both benchmarks and production: + +```rust +pub const QUERY_LATENCY_P99_BASELINE: PerformanceBaseline = PerformanceBaseline { + name: "query_latency_p99", + target: 500.0, // 
500ms target + warning: 2000.0, // 2s warning + critical: 10000.0, // 10s critical +}; +``` + +### Key Baselines (targets) + +| Baseline | Target | Warning | Critical | +|----------|--------|---------|----------| +| `query_latency_p99` | 500ms | 2s | 10s | +| `ingest_bytes_throughput` | 100MB/s | 50MB/s | 10MB/s | +| `split_build_latency` | 500ms | 1s | 5s | +| `metrics_ingestion_throughput` | TBD | TBD | TBD | + +## Production Checking + +```rust +// After every query execution +let result = check_performance(&QUERY_LATENCY_P99_BASELINE, latency_ms); +pomsky_observability::record_performance(result.name, result.actual, result.status); +``` + +### Datadog Metrics + +| Metric | Purpose | +|--------|---------| +| `pomsky.performance.checks.total` | Total checks | +| `pomsky.performance.checks.warning` | Warnings | +| `pomsky.performance.checks.critical` | Critical (investigate now) | +| `pomsky.performance.health` | Health gauge (0=healthy, 2=critical) | + +## APM Correlation + +When performance degrades, baselines link to APM: + +``` +Performance critical - check APM profile path in Datadog +``` + +In Datadog APM -> Profiles -> search for the relevant code path to see CPU flamegraph. + +## Optimization Checklist + +Before making performance changes: + +1. [ ] Run relevant benchmarks, save baseline +2. [ ] Identify specific metric to improve +3. [ ] Make targeted change +4. [ ] Run benchmarks, compare against baseline +5. [ ] Verify no regression in other metrics +6. 
[ ] Document improvement in commit message + +## Common Optimizations + +### Memory Allocation + +```rust +// BAD: Allocates per-iteration +for item in items { + let s = format!("{}", item); +} + +// GOOD: Reuse buffer +let mut buf = String::new(); +for item in items { + buf.clear(); + write!(&mut buf, "{}", item)?; +} +``` + +### String Processing + +```rust +// BAD: Multiple allocations +let s = s.replace("foo", "bar").replace("baz", "qux"); + +// GOOD: Single pass +let s = MULTI_REPLACE_REGEX.replace_all(&s, |caps: &Captures| { + match &caps[0] { + "foo" => "bar", + "baz" => "qux", + _ => unreachable!(), + } +}); +``` + +### Batch Processing + +```rust +// BAD: Individual inserts +for item in items { + insert_one(item).await?; +} + +// GOOD: Batch insert +insert_batch(items).await?; +``` + +### Avoid Copies + +```rust +// BAD: Unnecessary clone +let data = source.clone(); +process(data); + +// GOOD: Borrow when possible +process(&source); + +// GOOD: Move when done +let data = source; // source no longer needed +process(data); +``` + +## Profiling + +### CPU Profiling + +```bash +# With perf (Linux) +perf record -g ./target/release/quickwit ... +perf report + +# With samply (cross-platform, good for macOS) +samply record ./target/release/quickwit ... + +# With Instruments (macOS) +xcrun xctrace record --template "Time Profiler" --launch ./target/release/quickwit ... +``` + +### Memory Profiling + +```bash +# With heaptrack (Linux) +heaptrack ./target/release/quickwit ... +heaptrack_gui heaptrack.quickwit.*.gz + +# With jemalloc profiling (Docker build) +# From repo root: +make docker-build-profiled +make k8s-deploy-profiled +make k8s-profile-control-plane +``` + +### Async Profiling + +```bash +# tokio-console for async debugging +# Build with tokio-console feature: +RUSTFLAGS="--cfg tokio_unstable" cargo install --path quickwit-cli --features tokio-console +QW_ENABLE_TOKIO_CONSOLE=1 quickwit run ... 
+tokio-console +``` + +## References + +- [Jeff Dean & Sanjay Ghemawat Performance Hints](https://abseil.io/fast/hints.html) +- [Criterion.rs User Guide](https://bheisler.github.io/criterion.rs/book/) +- [Binggan Benchmarking](https://github.com/PSeitz/binggan) +- [samply Profiler](https://github.com/mstange/samply) diff --git a/docs/internals/RUST_STYLE.md b/docs/internals/RUST_STYLE.md new file mode 100644 index 00000000000..ba8b1dd6cd4 --- /dev/null +++ b/docs/internals/RUST_STYLE.md @@ -0,0 +1,401 @@ +# Quickhouse-Pomsky Rust Style Guide + +Supplements [CODE_STYLE.md](../../CODE_STYLE.md) with additional reliability patterns for the Quickhouse-Pomsky project. CODE_STYLE.md is the **primary style reference** — this document adds patterns that improve reliability and error detection. + +Influences: +- [Quickwit CODE_STYLE.md](../../CODE_STYLE.md) - Primary: proofreadability, naming, hidden contracts +- [Apache DataFusion](https://github.com/apache/datafusion) - Query engine patterns +- [TigerBeetle](https://github.com/tigerbeetle/tigerbeetle) - Assertion discipline for reliability + +## Quick Reference + +| Pattern | Rule | Source | +|---------|------|--------| +| Error handling | Propagate via `?`, never `unwrap()` in library code | Quickwit | +| Assertions | Use `debug_assert!` to document invariants for proofreading | Quickwit CODE_STYLE + TigerBeetle | +| Iterator style | Prefer procedural loops when error handling makes chains unreadable | Quickwit | +| Naming | Long descriptive names, standard Rust snake_case | Quickwit | +| Hidden contracts | Avoid them; use types/Result/Option or add `debug_assert!` | Quickwit | +| File size | Maximum 500 lines per file; split by responsibility | Project policy | +| Clone avoidance | Prefer `&self` / `Arc`; cloning OK in actor/async code for clarity | Project policy | +| Async safety | No blocking > 500us; forbidden patterns listed below | Quickwit | +| Disallowed methods | `Path::exists`, `Option::is_some_and`, etc. 
(see `clippy.toml`) | Quickwit | + +--- + +## 1. Error Handling + +### Rule: Library code MUST propagate errors, NEVER panic + +```rust +// BAD - panics in library code +fn parse_timestamp(nanos: i64) -> NaiveDateTime { + DateTime::from_timestamp(secs, nsecs).unwrap() // PANIC! +} + +// GOOD - propagates errors +fn parse_timestamp(nanos: i64) -> anyhow::Result<NaiveDateTime> { + DateTime::from_timestamp(secs, nsecs) + .ok_or_else(|| anyhow::anyhow!("invalid timestamp: {}", nanos)) +} +``` + +### Error Style + +Quickwit uses `anyhow` extensively. Error messages follow the format from [CODE_STYLE.md](../../CODE_STYLE.md): +- Concise, lowercase (except proper names), no trailing punctuation +- Examples: `"failed to open split"`, `"unknown output format {:?}"` + +### Custom Error Types + +Use `thiserror` for domain error types with structured variants: + +```rust +#[derive(Debug, thiserror::Error)] +pub enum MetastoreError { + #[error("invalid metastore config: `{0}`")] + InvalidConfig(String), + #[error("split `{split_id}` not found")] + SplitNotFound { split_id: String }, +} +``` + +### Allowed Exceptions + +- `unwrap()` in tests is acceptable +- `expect()` only when the condition is provably impossible +- `unwrap_or()` / `unwrap_or_default()` are acceptable (have fallbacks) + +--- + +## 2.
Assertions for Reliability + +### Rule: Use `debug_assert!` to express invariants and help reviewers proofread + +From [CODE_STYLE.md](../../CODE_STYLE.md): *"A good idea to help reviewers proofread your code is to identify invariants and express them as `debug_assert`."* + +This is especially valuable for: +- Preconditions that aren't enforced by the type system +- Invariants that should hold after state transitions +- Hidden contracts that can't be eliminated + +```rust +pub fn publish_splits(&self, splits: &[SplitMetadata]) -> Result<()> { + // Assert invariant: all splits must be known before publishing + debug_assert!( + splits.iter().all(|s| self.known_splits.contains(&s.split_id)), + "publishing unknown split" + ); + + self.metastore.publish(splits)?; + Ok(()) +} + +fn merge_candidates(splits: &mut [SplitMetadata]) -> Vec { + splits.sort_by_key(|s| s.time_range.end); + // Assert postcondition: result is sorted + debug_assert!(splits.is_sorted_by_key(|s| s.time_range.end)); + // ... +} +``` + +### When to use `debug_assert!` vs `Result` + +| Scenario | Use | +|----------|-----| +| API boundary / user input | `Result` — always validate properly | +| Internal invariant | `debug_assert!` — documents the expectation | +| Should never happen in correct code | `debug_assert!` — catches bugs during testing | +| Could happen due to external state | `Result` — handle gracefully | + +--- + +## 3. Type Aliases and Domain Types + +### Rule: Use Quickwit's existing type aliases for domain concepts + +Quickwit defines key type aliases in `quickwit-proto`: + +```rust +use quickwit_proto::types::{ + IndexId, // Index identifier + IndexUid, // Unique index identifier + SplitId, // Split identifier + SourceId, // Source identifier + ShardId, // Shard identifier + NodeId, // Node identifier + PipelineUid, // Pipeline unique identifier +}; +``` + +When adding new domain concepts, prefer creating type aliases or newtypes over using raw strings. + +--- + +## 4. 
Iterator Patterns (Quickwit Style) + +### Rule: Choose readability over dogma + +Quickwit's [CODE_STYLE.md](../../CODE_STYLE.md) explicitly allows procedural loops when iterator chains become hard to read, especially with error handling: + +```rust +// GOOD - simple chain, easy to read +let results: Vec<_> = items + .iter() + .filter_map(|item| item.value()) + .map(|val| val.to_uppercase()) + .collect(); + +// ALSO GOOD - procedural loop when error handling makes chains unreadable +let mut results = Vec::new(); +for item in items { + let value = item.value().map_err(|e| { + warn!(error=%e, item_id=%item.id, "failed to extract value"); + e + })?; + if value.is_valid() { + results.push(transform(value)?); + } +} +``` + +### Disallowed Option Methods (`clippy.toml`) + +These are banned for readability reasons: +- `Option::is_some_and` — use `matches!` or `if let` +- `Option::is_none_or` — use explicit match +- `Option::xor` — use explicit logic +- `Option::map_or` — use `.map(..).unwrap_or(..)` +- `Option::map_or_else` — use `.map(..).unwrap_or_else(..)` or `let Some(..) else {..}` + +--- + +## 5. File Size Limits + +### Rule: Maximum 500 lines per new file + +Large files indicate mixed concerns. Split at logical boundaries. + +| File Type | Split Strategy | +|-----------|---------------| +| Functions | By category (extract, arithmetic, format) | +| Handlers | By protocol (HTTP, gRPC, native) | +| Types | By domain (query, ingest, storage) | + +### Example: large handler file -> module directory + +``` +serve/elasticsearch_api/ +├── mod.rs # Re-exports, router setup +├── search.rs # _search endpoint +├── bulk.rs # _bulk endpoint +├── scroll.rs # _scroll endpoint +└── model.rs # Request/response types +``` + +**Note**: Some existing Quickwit files exceed this limit. The 500-line rule applies to *new* code we write — don't refactor existing files just to hit the target. + +--- + +## 6. 
Clone Avoidance + +### Rule: Prefer references and Arc over cloning, except in concurrent code + +```rust +// BAD - clones entire collection +impl State { + fn splits(&self) -> Vec<String> { + self.splits.clone() // Allocates! + } +} + +// GOOD - returns reference +impl State { + fn splits(&self) -> &[String] { + &self.splits + } +} + +// GOOD - shared ownership when needed +impl State { + fn splits(&self) -> Arc<Vec<String>> { + Arc::clone(&self.splits) + } +} +``` + +### When Cloning is Acceptable + +- **Actor/async code**: Cloning for ownership transfer into actors, closures, or across `.await` points is expected and preferred over complex lifetime management +- Small types (< 64 bytes) and `Copy` types +- `Arc::clone()` (cheap reference count bump) +- In test code + +The goal is to avoid *unnecessary* allocations in hot paths, not to eliminate `.clone()` everywhere. When cloning makes concurrent code simpler and less error-prone, clone freely. + +--- + +## 7. From/Into Implementations + +### Rule: Implement From for natural conversions + +```rust +// BAD - manual conversion everywhere +let id = IndexId::new(string.clone()); + +// GOOD - From implementation +impl From<String> for IndexId { + fn from(s: String) -> Self { Self(s) } +} + +// Usage +let id: IndexId = string.into(); +``` + +### Naming Convention + +| Method | Returns | Use Case | +|--------|---------|----------| +| `as_str()` | `&str` | Borrowed view, no allocation | +| `to_string()` | `String` | Owned copy, allocates | +| `into_inner()` | Inner type | Consumes self | + +--- + +## 8. Module Organization + +### Rule: One responsibility per module + +Quickwit organizes larger crates into directory modules with `mod.rs`: + +``` +quickwit-indexing/src/actors/ +├── mod.rs # Re-exports +├── indexer.rs # Indexing actor +├── uploader.rs # Upload actor +├── packager.rs # Packaging actor +└── ... +``` + +Files can be long when they cover a single cohesive responsibility — Quickwit doesn't enforce a strict line limit.
If a file is growing unwieldy, split at logical boundaries (by handler, by protocol, by domain). + +--- + +## 9. Documentation (Quickwit Style) + +### Rule: Document "why", not "what" + +```rust +// BAD - restates the code +/// Returns the split id +fn split_id(&self) -> &str { &self.split_id } + +// GOOD - explains why/when +/// Returns the canonical split ID used for deduplication during merge. +/// Use this when comparing splits across nodes. +fn split_id(&self) -> &str { &self.split_id } +``` + +From [CODE_STYLE.md](../../CODE_STYLE.md): +- Comments should convey **intent**, **context** (links to issues, papers), and **hidden contracts** +- No rustdoc in Quickwit private API is OK +- Inline comments are encouraged for thorny code + +--- + +## 10. Structured Logging (Quickwit Style) + +### Rule: Use `tracing` structured fields over string interpolation + +```rust +// BAD - string interpolation +warn!("split {} failed to compact ({} attempts remaining)", split_id, remaining); + +// GOOD - structured fields +warn!(split_id=%split_id, remaining=remaining, "split compaction failed"); +``` + +Error and log messages: concise, lowercase, no trailing punctuation. + +--- + +## 11. Hidden Contracts (Quickwit Style) + +### Rule: Avoid hidden contracts; enforce constraints through types + +From [CODE_STYLE.md](../../CODE_STYLE.md): A "hidden contract" is a precondition not enforced by the type system. + +```rust +// BAD - hidden contract: splits must be sorted +fn merge_candidates(splits: &[SplitMetadata]) -> Vec { ... } + +// GOOD - internalize the sort (timsort is linear if already sorted) +fn merge_candidates(splits: &mut [SplitMetadata]) -> Vec { + splits.sort_by_key(|s| s.time_range.end); + // ... +} + +// ALSO GOOD - use types to prevent invalid states +fn min(values: &[usize]) -> Option { + // Returns None instead of panicking on empty input + values.iter().copied().min() +} +``` + +When a hidden contract is unavoidable, add a `debug_assert!` to check it. 
+ +--- + +## 12. Async Patterns (Quickwit Style) + +### Rule: Async code must not block for more than 500 microseconds + +From [CODE_STYLE.md](../../CODE_STYLE.md): + +```rust +// BAD - blocks the async runtime +async fn process() { + let result = expensive_computation(); // Blocks! + send(result).await; +} + +// GOOD - offload blocking work +async fn process() { + let result = tokio::task::spawn_blocking(|| expensive_computation()).await?; + send(result).await; +} +``` + +### Async Safety (from Known Pitfalls) + +| Forbidden | Use Instead | +|-----------|-------------| +| `tokio::sync::Mutex` | Actor model with message passing | +| `JoinHandle::abort()` | `CancellationToken` | +| Recreating futures in `select!` | `&mut fut` to resume | +| Holding locks across `.await` | Message passing or synchronous critical sections | + +--- + +## Checklist + +Before committing, verify: + +- [ ] No `unwrap()` in library code (use `?` or proper error types) +- [ ] `debug_assert!` for non-obvious invariants and hidden contracts +- [ ] New files under 500 lines (split by responsibility if larger) +- [ ] No unnecessary `.clone()` (OK in actor/async code for clarity) +- [ ] Readable iterator patterns (procedural loops for complex error handling) +- [ ] Structured logging with `tracing` fields +- [ ] No disallowed methods from `clippy.toml` +- [ ] Follows Quickwit naming conventions (standard Rust snake_case) +- [ ] Hidden contracts documented or eliminated + +--- + +## References + +- [CODE_STYLE.md](../../CODE_STYLE.md) - Quickwit coding style (proofreadability) — **primary reference** +- [Rust API Guidelines](https://rust-lang.github.io/api-guidelines/) - Official Rust guidelines +- [DataFusion Contributing](https://datafusion.apache.org/contributor-guide/) - Query engine patterns +- [TigerBeetle TIGER_STYLE.md](https://github.com/tigerbeetle/tigerbeetle/blob/main/docs/TIGER_STYLE.md) - Assertion discipline inspiration diff --git a/docs/internals/SIMULATION_FIRST_WORKFLOW.md 
b/docs/internals/SIMULATION_FIRST_WORKFLOW.md new file mode 100644 index 00000000000..e14251150ec --- /dev/null +++ b/docs/internals/SIMULATION_FIRST_WORKFLOW.md @@ -0,0 +1,223 @@ +# Simulation-First Development Workflow + +This document defines the mandatory workflow for all Quickhouse-Pomsky development, +following the verification pyramid philosophy. + +## The Verification Pyramid + +``` + ^ + /|\ + / | \ + / | \ TLA+ Specs (docs/internals/specs/tla/) + / | \ - Mathematical model + /----+----\ - Defines "what is correct" + / | \ + / | \ Stateright Models + / | \ - Rust-native model checking + /--------+--------\- Verifies state space + / | \ + / | \ DST Tests + / | \- Deterministic simulation + /------------+------------\- Fault injection + / | \ + / | \ Unit/Integration Tests + / | \- Fast feedback + /----------------+----------------\ + / | \ + / | \ Production Monitoring +/-------------------+-------------------\- Datadog invariant metrics +``` + +## Mandatory Workflow + +### Phase 1: Specification (BEFORE any code) + +1. **Check existing TLA+ spec** in `docs/internals/specs/tla/` + - If exists: Review invariants that apply to your change + - If not: Write one (for significant features) + +2. **Check existing Stateright model** + - If exists: Understand the state machine and properties + - If not: Consider if model checking is needed + +### Phase 2: Write Tests FIRST (still no implementation) + +3. **Write DST tests** + ```rust + #[test] + fn test_feature_invariant_holds() { + let config = SimConfig::new(SEED); + let mut sim = Simulation::new(config); + + sim.run(|env| async move { + // Setup + let component = create_component(); + + // Exercise (with fault injection) + for _ in 0..iterations { + perform_operation(&component).await?; + } + + // Verify invariants from TLA+ spec + verify_invariant_1(&component)?; // Maps to TLA+ line X + verify_invariant_2(&component)?; // Maps to TLA+ line Y + + Ok(()) + }); + } + ``` + +4. 
**Run tests - EXPECT FAILURE** + ```bash + cargo test -p quickwit-dst -- your_feature_tests + # Should fail: component doesn't exist yet + ``` + +### Phase 3: Implement (make tests pass) + +5. **Write minimal implementation** to make DST tests pass + - Add `debug_assert!` for invariants that match TLA+ properties + - Follow the coding style in [CODE_STYLE.md](../../CODE_STYLE.md) and [RUST_STYLE.md](RUST_STYLE.md) + +6. **Run tests - EXPECT PASS** + ```bash + cargo test -p quickwit-dst -- your_feature_tests + # Should pass now + ``` + +### Phase 4: Verify All Layers + +7. **Run Stateright model** (if applicable) + ```bash + cargo test -p quickwit-dst -- stateright_your_feature + ``` + +8. **Run unit tests** + ```bash + cargo nextest run -p your-crate -- your_feature + ``` + +9. **Run integration tests** + ```bash + # Rust integration tests + cargo nextest run -p quickwit-integration-tests + + # REST API tests (if touching API surface) + cd rest-api-tests && ./run_tests.py --engine quickwit + ``` + +10. **Full verification** + ```bash + cargo nextest run --all-features + cargo clippy --workspace --all-features --tests + ``` + +### Phase 5: PR (only after all verification passes) + +11. **Create PR** with evidence of verification + - DST test results + - Stateright exploration stats (if applicable) + - Link to TLA+ spec (if applicable) + - Integration test results + +--- + +## Example: Adding a New Feature + +### Bad (what NOT to do): +``` +1. Write implementation +2. Create PR +3. "Oh, should I write tests?" (asked by reviewer) +4. Write tests after the fact +``` + +### Good (simulation-first): +``` +1. Read TLA+ spec for invariants +2. Write DST test that verifies invariant +3. Run test -> FAILS (no implementation) +4. Write implementation +5. Run test -> PASSES +6. Run Stateright -> PASSES +7. Run unit + integration tests -> PASSES +8. 
Create PR with test evidence +``` + +--- + +## When Simulation-First Applies + +### MUST use simulation-first for: +- Stateful components (metastore, ingest pipeline, shard management) +- Concurrency protocols (locks, transactions, atomic operations) +- Distributed coordination (control plane, cluster membership) +- Data lifecycle (ingest -> index -> compact -> GC) +- Recovery paths (crash recovery, WAL replay) + +### MAY skip simulation-first for: +- Pure functions (parsing, formatting, serialization) +- Simple CRUD endpoints +- UI changes +- Configuration changes +- Documentation + +Even when skipping DST, still write tests before implementation when practical. + +--- + +## Adapting the Workflow for Quickwit's Actor Model + +Quickwit uses an actor framework (`quickwit-actors`) for concurrent components. When applying simulation-first to actors: + +1. **Actors are natural DST targets**: Each actor has a mailbox, message types, and state transitions — perfect for model checking +2. **Test through the mailbox**: Send messages, verify state after processing +3. **Fault inject at actor boundaries**: Simulate message drops, slow processing, actor crashes +4. 
**Verify supervisor behavior**: Test that supervisors correctly restart failed actors + +```rust +// Example: Testing an actor with DST +#[test] +fn test_indexer_actor_handles_storage_fault() { + let config = SimConfig::new(SEED); + let mut sim = Simulation::new(config) + .with_fault(FaultConfig::new(FaultType::StorageWriteFail, 0.1)); + + sim.run(|env| async move { + let indexer = IndexerActor::new(env.storage()); + + // Send messages through the mailbox + indexer.send(IndexMessage::IndexBatch(batch)).await?; + + // Verify invariants hold despite faults + assert!(indexer.state().splits_published >= expected_min); + assert!(indexer.state().no_data_loss()); + + Ok(()) + }); +} +``` + +--- + +## Checklist for Every PR + +- [ ] Identified relevant TLA+ invariants +- [ ] DST tests written BEFORE implementation (for stateful components) +- [ ] DST tests verify TLA+ properties +- [ ] Stateright model passes (if applicable) +- [ ] Unit + integration tests pass +- [ ] `cargo clippy --workspace --all-features --tests` passes +- [ ] `cargo +nightly fmt -- --check` passes +- [ ] PR includes verification evidence + +--- + +## References + +- [Verification Guide](./VERIFICATION.md) +- [Verification Stack](./VERIFICATION_STACK.md) +- [Rust Style Guide](./RUST_STYLE.md) +- [Pierre Zemb: Simulation-Driven Development](https://pierrezemb.fr/posts/simulation-driven-development/) +- [TigerBeetle: Simulation Testing](https://tigerbeetle.com/blog/2023-07-06-simulation-testing) diff --git a/docs/internals/VERIFICATION.md b/docs/internals/VERIFICATION.md new file mode 100644 index 00000000000..d717f339a15 --- /dev/null +++ b/docs/internals/VERIFICATION.md @@ -0,0 +1,345 @@ +# Quickhouse-Pomsky Verification Guide + +For the philosophical foundation and "why" behind this stack, see [VERIFICATION_STACK.md](VERIFICATION_STACK.md). 
+ +## The Verification Pyramid + +All verification layers share the same invariants defined once in a shared invariants module: + +``` + TLA+ Specs (docs/internals/specs/tla/*.tla) + │ mirrors + Shared Invariants (invariants/) ← SINGLE SOURCE OF TRUTH + │ used by + ┌───────────────┼───────────────┬─────────────────────┐ + ▼ ▼ ▼ ▼ +Stateright DST Tests Integration Production Metrics +(exhaustive) (simulation) Tests (Datadog) +``` + +## Simulation-First Development + +**The order is non-negotiable:** + +``` +1. Read TLA+ spec → Understand invariants +2. Write DST tests → Encode invariants as executable tests +3. Run tests → EXPECT FAILURE (no implementation yet) +4. Implement code → Make tests pass +5. Verify all layers → DST + Stateright + unit tests +6. Create PR → Include verification evidence +``` + +**NEVER implement code before writing DST tests.** + +## Deterministic Simulation Testing (DST) + +### Overview + +DST provides deterministic control over time, network, storage, and randomness. This enables fault injection testing that is fully reproducible via a seed value. 
+ +### Two Modes + +| Mode | Runtime | Time Control | Fault Injection | Use Case | +|------|---------|--------------|-----------------|----------| +| **In-Memory** | Standard Rust | `SimClock` (app-level) | `FaultInjector` | CI, quick iteration | +| **gVisor DST** | gVisor kernel | `VirtualClocks` (kernel) | Syscall-level | Finding subtle bugs | + +### Running DST Tests + +```bash +# Run all DST tests (once the DST crate exists) +cargo test -p quickwit-dst + +# Reproduce failure with specific seed +DST_SEED=12345 cargo test -p quickwit-dst + +# Verbose fault logging +RUST_LOG=quickwit_dst=debug cargo test -p quickwit-dst +``` + +### Writing DST Tests + +```rust +use quickwit_dst::{Simulation, SimConfig, FaultConfig, FaultType}; + +#[test] +fn test_with_faults() { + let config = SimConfig::new(12345); // Deterministic seed + let mut sim = Simulation::new(config) + .with_fault(FaultConfig::new(FaultType::StorageWriteFail, 0.1)); + + sim.run(|env| async move { + let storage = env.storage(); + // Test logic - fully deterministic + Ok(()) + }).unwrap(); +} +``` + +### DST Guidelines + +- **Always log the seed**: Failed tests must print `DST_SEED=X` for replay +- **Use `env.rng()`**: Never use `rand::thread_rng()` +- **Use `env.clock()`**: Never use `Instant::now()` +- **No time-dependent loops**: gVisor freezes time + +### Full Data Lifecycle DST Test + +The most comprehensive DST test exercises the **complete production data lifecycle**: + +``` +Ingest → Query → Compact → GC → Query Again +``` + +**Phases:** + +| Phase | Operation | Verification | +|-------|-----------|--------------| +| 1 | Ingest documents + query | Correct count returned | +| 2 | Add more batches | Running count accurate | +| 3 | Compact splits | Count unchanged (no data loss) | +| 4 | Time advance + GC | Old splits deleted after grace period | +| 5 | Query again | Data still consistent | + +### Runtime Trait Architecture (DST Compliance) + +Production code is parameterized over a `Runtime` implementation to enable DST: +
+``` + Runtime Trait + │ + ┌─────────────┼─────────────┐ + ▼ ▼ ▼ + SystemRuntime SimRuntime (future runtimes) + (production) (simulation) +``` + +**The Four Dimensions:** + +| Dimension | Production | Simulation | Abstraction | +|-----------|------------|------------|-------------| +| **Time** | `Instant::now()`, `Utc::now()` | `SimClock` | `runtime.clock()` | +| **Network** | `tokio::net::*` | `SimNetwork` | `runtime.network()` | +| **Storage** | `object_store` | `SimStorage` | Storage trait | +| **RNG** | `rand::thread_rng()` | `DeterministicRng` | `runtime.rng()` | + +**Forbidden in Production Code:** + +```rust +// WRONG: Bypasses Runtime, breaks DST +let now = Instant::now(); +let now = Utc::now(); +tokio::time::sleep(duration).await; +let rng = thread_rng(); + +// CORRECT: Goes through Runtime, DST-compliant +let now = runtime.clock().now_instant(); +let now = runtime.clock().now(); +runtime.clock().sleep(duration).await; +let val = runtime.rng().next_u64(); +``` + +### Documenting DST Bugs + +When DST finds a bug, create `docs/dst/DST_BUG_NNN_DESCRIPTION.md`: + +```markdown +# DST Bug #NNN: Description + +**Status**: Fixed/Open +**Discovered**: YYYY-MM-DD +**Seeds**: comma-separated failing seeds +**Component**: crate::module + +## Summary +## Reproduction +## Root Cause +## Fix +## Verification +## Lessons Learned +``` + +## TLA+ Specifications + +Human-readable formal specifications in `docs/internals/specs/tla/`: + +Specs should be written for: +- Concurrency protocols (transactions, locks) +- State machines (lifecycle, recovery) +- Consistency guarantees (exactly-once, ordering) +- Resource management (GC, caching) + +Each spec defines: +- **State variables**: What the system tracks +- **Actions**: State transitions +- **Invariants**: Properties that must always hold +- **Temporal properties**: Liveness guarantees + +### When to Write Specs + +**Add specs for:** +- Concurrency protocols (transactions, locks) +- State machines (lifecycle, recovery) +- 
Consistency guarantees (exactly-once, ordering) +- Resource management (GC, caching) + +**Skip specs for:** +- Simple CRUD +- Stateless transformations +- Well-understood algorithms + +### Key Areas Needing Specs + +| Area | Component | Key Invariants | +|------|-----------|----------------| +| Split lifecycle | `quickwit-metastore` | No lost splits, no premature visibility | +| Compaction | `quickwit-indexing` | Atomic split swap, no data loss | +| Ingest pipeline | `quickwit-ingest` | Backpressure, bounded buffers | +| Shard management | `quickwit-control-plane` | No split-brain, consistent assignment | +| Tantivy + Parquet | `quickwit-indexing` | Dual-write consistency | + +## Shared Invariants + +**Single source of truth**: Invariant definitions live in one place and are used by all verification layers. + +```rust +// Both DST and production code use the same invariant definitions +use quickwit_invariants::{SplitPropertyChecker, PropertyChecker}; + +let checker = SplitPropertyChecker::new(&state); +let result = checker.no_lost_splits(); + +if !result.holds { + println!("{}", result); + // no_lost_splits: Split abc123 not visible in metastore +} +``` + +### Invariant Modules (to build) + +| Module | Properties | +|--------|------------| +| `splits.rs` | `no_lost_splits`, `no_premature_visibility`, `no_zombie_splits` | +| `compaction.rs` | `compaction_atomicity`, `no_data_loss_during_compaction` | +| `ingest.rs` | `no_buffer_overflow`, `backpressure_correctness` | +| `shard.rs` | `no_split_brain`, `shard_assignment_consistency` | +| `tantivy_parquet.rs` | `tantivy_subset_of_parquet`, `idle_consistency` | + +## Stateright Model Checking + +Rust-native exhaustive state space exploration: + +```bash +# Run Stateright model checking +cargo test -p quickwit-dst stateright -- --ignored +``` + +Benefits over TLA+: +- Runs in CI with `cargo test` +- Uses Rust type system +- Uses same shared invariants as DST +- No separate TLC toolchain + +### Writing Stateright Models + 
+```rust +impl Model for SplitLifecycleModel { + type State = SplitState; + type Action = SplitAction; + + fn init_states(&self) -> Vec<Self::State> { ... } + fn actions(&self, state: &Self::State, actions: &mut Vec<Self::Action>) { ... } + fn next_state(&self, state: &Self::State, action: Self::Action) -> Option<Self::State> { ... } + + fn properties(&self) -> Vec<Property<Self>> { + vec![ + Property::always("no_lost_splits", |_, state| { + let checker = SplitPropertyChecker::new(state); + checker.no_lost_splits().holds + }), + ] + } +} +``` + +## Kani Bounded Model Checking + +Verifies that `debug_assert!` invariants hold for ALL inputs: + +```bash +# All proofs +cargo kani + +# Specific crate +cargo kani --package quickwit-metastore + +# Specific proof +cargo kani --package quickwit-metastore --harness verify_no_lost_splits +``` + +### Writing Kani Proofs + +```rust +#[cfg(kani)] +mod kani_proofs { + use super::*; + + #[kani::proof] + #[kani::unwind(10)] // Bound loops + fn verify_my_invariant() { + let input: u64 = kani::any(); + kani::assume(input > 0); + + let result = my_function(input); + + kani::assert(result > input, "Result must exceed input"); + } +} +``` + +**Platform note**: Kani works best on x86_64 Linux. Run in CI for reliable results. + +## Production Observability + +Closing the verification loop with Datadog: + +```rust +// Record invariant check in production +quickwit_observability::record_invariant("no_lost_splits", passed); +``` + +### Metrics + +| Metric | Purpose | +|--------|---------| +| `pomsky_invariant_checks.count` | Total checks | +| `pomsky_invariant_checks_passed.count` | Passed checks | +| `pomsky_invariant_checks_failed.count` | Failed checks (0 = healthy) | +| `pomsky_invariant_health` | Health gauge (1.0 = all passing) | + +### Adding Production Invariants + +```rust +// 1. Add verification method +impl SplitMetastore { + async fn verify_no_lost_splits(&self, ...) -> bool { + // Check condition + } +} + +// 2.
Call record_invariant after operation +let passed = self.verify_no_lost_splits(...).await; +quickwit_observability::record_invariant("no_lost_splits", passed); +``` + +## Verification Checklist + +Before merging code that affects correctness: + +- [ ] TLA+ spec reviewed (if exists) +- [ ] DST tests written and passing +- [ ] Stateright model updated (if applicable) +- [ ] Kani proofs added for new invariants +- [ ] Production invariant recording wired +- [ ] Seed logged in test output for reproducibility diff --git a/docs/internals/VERIFICATION_STACK.md b/docs/internals/VERIFICATION_STACK.md new file mode 100644 index 00000000000..582d3d6a977 --- /dev/null +++ b/docs/internals/VERIFICATION_STACK.md @@ -0,0 +1,372 @@ +# Quickhouse-Pomsky Verification Stack + +> **Author:** Claude (Anthropic) +> **Purpose:** This document explains what helps me generate correct code for Quickhouse-Pomsky. I wrote this to share how the verification stack works *for me* as an AI code generator—what I read, what I check, and how each layer gives me confidence that the code I produce is correct. + +--- + +## Motivating Questions + +This document answers questions I was asked: + +> **Q: How is all the formal verification connected to code verification, and how does it help generate correct code?** + +The verification stack creates a chain from abstract specifications (TLA+) down to production code (`debug_assert!` invariants). Invariants are defined **once** in shared modules and used across all layers—so when generating code, we don't guess what "correct" means; we read the exact definition and ensure code maintains it. + +> **Q: What can Datadog bring to this verification stack that will further improve the ability to produce correct code?** + +Formal verification proves properties hold *in theory*. Datadog proves they hold *in practice*. 
By emitting invariant metrics to Datadog, we close the production feedback loop—learning from real failures, actual hot paths, and emergent behaviors that formal models can't capture. This feedback improves specs, models, and future code generation. + +--- + +## The Verification Pyramid + +``` + ┌─────────────────────┐ + │ PRODUCTION │ + │ (Datadog) │ + │ │ + │ Real failures │ + │ Actual hot paths │ + │ Emergent behavior │ + └──────────┬──────────┘ + │ feedback + ┌────────────────┴────────────────┐ + │ PREVENTION │ + │ (debug_assert! Invariants) │ + │ │ + │ assert!(high > low) │ + │ Catch mistakes at runtime │ + └────────────────┬────────────────┘ + │ + ┌─────────────────────────┴─────────────────────────┐ + │ DETECTION │ + │ (DST + Stateright + Kani) │ + │ │ + │ Deterministic simulation with fault injection │ + │ Exhaustive state space exploration │ + │ Bounded model checking (all inputs) │ + └─────────────────────────┬─────────────────────────┘ + │ + ┌──────────────────────────────┴──────────────────────────┐ + │ DISCOVERY │ + │ (TLA+ + Bloodhound) │ + │ │ + │ Formal specs that define what MUST hold │ + │ VM-based simulation with time-travel debugging │ + │ Hunt for unknown unknowns │ + └──────────────────────────────────────────────────────────┘ +``` + +The pyramid flows **up** during development (we write specs first, then detect violations, then prevent them, then monitor in production) and flows **down** during incidents (production failure -> add to DST -> formalize in TLA+). 
+ +--- + +## Comparison with Pierre Zemb's Engineering Philosophy + +This section compares Quickhouse-Pomsky's approach with insights from Pierre Zemb's articles: +- [What if we embraced simulation-driven development?](https://pierrezemb.fr/posts/simulation-driven-development/) (Apr 2025) +- [What I Tell Colleagues About Using LLMs for Engineering](https://pierrezemb.fr/posts/llms-for-engineering/) (Jan 2026) +- Testing: prevention vs discovery (the paradigm shift from catching known bugs to finding unknown ones) + +### From "Simulation-Driven Development" + +| Pierre Zemb's Insight | Quickhouse-Pomsky Implementation | Approach | +|----------------------|----------------------------------|----------| +| **"Deterministic simulation is the killer feature"** | Seeded RNG ensures reproducible fault injection | `DST_SEED=12345 cargo test -p quickwit-dst` reproduces any failure | +| **"Control time, don't wait for it"** | `SimClock` provides deterministic time control | Tests complete in seconds, not hours | +| **"Inject faults systematically"** | `FaultInjector` with configurable fault types | Storage failures, network partitions, catalog conflicts | +| **"Make state space exploration exhaustive"** | Stateright model checker explores all interleavings | Exhaustive verification of concurrent operations | +| **"Bridge the development-production gap"** | Same invariants used in Stateright, DST, and production | `no_lost_splits()` defined once, used everywhere | + +### From "LLMs for Engineering" + +| Pierre Zemb's Insight | Quickhouse-Pomsky Implementation | Approach | +|----------------------|----------------------------------|----------| +| **"Plan First, Always"** | `CLAUDE.md` documents architecture, conventions, limits | Read it every session before writing code | +| **"Context is Everything"** | TLA+ specs in `docs/internals/specs/tla/` document protocol intent | Read specs before implementing stateful logic | +| **"Feedback Loops"** | Multiple verification layers 
with immediate feedback | Compiler -> Tests -> Benchmarks -> Production | + +### From "Testing: Prevention vs Discovery" + +The paradigm shift from "testing prevents known bugs" to "testing discovers unknown bugs": + +| Concept | Traditional Testing | DST | +|---------|--------------------|----| +| **Goal** | Prevent regressions | Discover unknowns | +| **Input generation** | Human-written cases | Randomized seeds | +| **Assertions** | Must always pass | "Sometimes assertions" catch rare bugs | +| **Failures** | Binary pass/fail | Percentage-based (e.g., "fails 2% of the time") | +| **Time travel** | Debug post-mortem | Replay exact seed to reproduce | +| **Fault injection** | Mocked at boundaries | Injected throughout execution | + +--- + +## The Complete Stack + +### Layer 1: Discovery (TLA+ + Bloodhound) + +**Purpose:** Find unknown unknowns. Define what MUST hold. + +**TLA+ Specs:** `docs/internals/specs/tla/` + +Key areas for formal specification in Quickhouse-Pomsky: +- Split lifecycle (publish, compact, delete) +- Shard management and assignment +- Compaction protocol (atomic swap) +- Ingest backpressure and WAL ordering +- Tantivy + Parquet dual-write consistency +- Garbage collection safety + +**How to run:** +```bash +# TLA+ model checking +tlc docs/internals/specs/tla/ParquetDataModel.tla + +# Bloodhound exploration (requires Docker) +bloodhound test --config bloodhound-test.yaml --seeds 20 +``` + +### Layer 2: Detection (DST + Stateright + Kani) + +**Purpose:** Systematically explore state space. Catch bugs before production.
+ +**DST Framework:** + +| Module | Purpose | +|--------|---------| +| `clock.rs` | Deterministic time control | +| `random.rs` | Seeded RNG reproducibility | +| `fault.rs` | Probabilistic fault injection | +| `storage.rs` | Simulated storage with faults | +| `network.rs` | Simulated network partitions | + +**Stateright Models:** +```rust +impl Model for SplitLifecycleModel { + fn invariant(&self, state: &State) -> bool { + no_lost_splits(&state.published, &state.metastore, &state.deleted) + } +} +``` + +**Kani Proofs:** (CI only, ARM Mac incompatible) +```rust +#[cfg(kani)] +#[kani::proof] +fn verify_no_lost_splits() { + let published: Vec<SplitId> = kani::any(); + // MUST hold for ALL possible inputs + kani::assert!(published.iter().all(|s| metastore.contains(s))); +} +``` + +**How to run:** +```bash +# DST tests with specific seed +DST_SEED=12345 cargo test -p quickwit-dst + +# Stateright model checking +cargo test -p quickwit-dst -- stateright --nocapture + +# Kani proofs (CI or x86_64 Linux) +cargo kani --package quickwit-metastore +``` + +### Layer 3: Prevention (debug_assert! Invariants) + +**Purpose:** Catch violations at runtime. Fail loudly. + +Quickwit's [CODE_STYLE.md](../../CODE_STYLE.md) explicitly endorses using `debug_assert` to express invariants, helping reviewers proofread code. These assertions are not present in release builds, so they add no runtime cost in production; violations are caught in debug and test builds. + +**Example from production code:** +```rust +pub fn push(&mut self, batch: RecordBatch) -> Result<()> { + // Assert precondition + debug_assert!( + self.current_size + batch.num_rows() <= self.config.max_size, + "Buffer overflow: {} + {} > {}", + self.current_size, batch.num_rows(), self.config.max_size + ); + // ...
implementation +} +``` + +**Invariant checking in split operations:** +```rust +pub fn publish_splits(&self, splits: &[SplitMetadata]) -> Result<()> { + // Assert invariant before critical operation + debug_assert!( + splits.iter().all(|s| self.known_splits.contains(&s.split_id)), + "invariant violation: publishing unknown split" + ); + + self.metastore.publish(splits)?; + Ok(()) +} +``` + +### Layer 4: Production (Datadog Observability) + +**Purpose:** Prove properties hold in the real world. Close the feedback loop. + +**Invariant Metrics:** +```rust +pub fn record_invariant(name: &str, passed: bool) { + statsd.count("pomsky.invariant.checked", 1, + &[&format!("name:{}", name)]); + + if !passed { + statsd.count("pomsky.invariant.violated", 1, + &[&format!("name:{}", name)]); + } +} +``` + +**Datadog Integration (What Each Feature Provides):** + +| Feature | What It Tells Me | How It Improves Code | +|---------|------------------|---------------------| +| **Invariant Metrics** | "Invariant X checked 1M times, violated 0" | Confirms verification works in production | +| **APM Traces** | "Request took 245ms: 73% in Tantivy, 20% in S3" | Shows *actual* hot paths | +| **Profiler Flame Graphs** | "Function Y uses 45% of CPU time" | Targets optimization accurately | +| **Error Tracking** | "Error Z: 47 times, correlated with high concurrency" | Reveals patterns to encode in DST | +| **CI Visibility** | "Test A flaky (3/10), Test B slow (45s)" | Shows where to add determinism | +| **Dashboards** | "Buffer at 67%, 3 backpressure events/hour" | Validates capacity planning | +| **Monitor Alerts** | "P99 latency +40% after commit abc123" | Catches regressions immediately | + +--- + +## The Datadog Advantage + +Why Datadog closes the loop that formal verification cannot: + +### 1. 
Formal Verification Limitations + +TLA+, Stateright, and Kani prove properties hold for *modeled* scenarios: +- TLA+ models are abstractions—they don't capture implementation bugs +- Stateright explores finite state spaces—production has infinite variety +- Kani bounds inputs—production sees unbounded diversity + +### 2. What Production Observability Adds + +| Formal Verification Says | Datadog Shows | +|-------------------------|---------------| +| "No lost splits is provable" | "No lost splits held for 30M real operations" | +| "Backpressure triggers at 80%" | "Backpressure triggered 47 times, all at 81-83%" | +| "Recovery completes in finite time" | "Recovery P99 is 2.3 seconds, P99.9 is 8.1 seconds" | +| "Concurrent operations are safe" | "12 optimistic retry conflicts per hour at peak" | + +### 3. The Complete Feedback Loop + +``` +┌──────────────────────────────────────────────────────────────────────┐ +│ VERIFICATION LIFECYCLE │ +│ │ +│ ┌─────────┐ ┌──────────┐ ┌───────┐ ┌──────────────────┐ │ +│ │ TLA+ │───>│Stateright│───>│ Kani │───>│ Production Code │ │ +│ └─────────┘ └──────────┘ └───────┘ └────────┬─────────┘ │ +│ │ │ +│ v │ +│ ┌────────────────┐ │ +│ │ PRODUCTION │ │ +│ │ (Datadog) │ │ +│ └────────┬───────┘ │ +│ │ │ +│ ┌─────────────────────────────────────────────────────┘ │ +│ │ │ +│ │ FEEDBACK TO CODE GENERATION: │ +│ │ ┌────────────────────────────────────────────────────────────┐ │ +│ │ │ 1. "Invariant X violated 3 times" -> Fix gap in proof │ │ +│ │ │ 2. "Hot path is Y, not Z" -> Optimize Y instead │ │ +│ │ │ 3. "Error pattern: A->B->C" -> Add DST scenario │ │ +│ │ │ 4. "P99 regressed after commit" -> Revert or fix │ │ +│ │ │ 5. 
"Scale limit hit at N" -> Implement ADR │ │ +│ │ └────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ └─────────────────────────> Improve specs, models, code ───────────┘ +│ │ +└──────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## What I Actually Use When Generating Code + +### Daily Workflow + +| Tool | Frequency | Example | +|------|-----------|---------| +| **CLAUDE.md** | Every session | Read architecture, conventions, limits | +| **TLA+ specs** | Before implementing protocols | Read spec for invariants | +| **Shared invariants** | Before writing state changes | Check property definitions | +| **Rust compiler** | Every edit | Type errors caught immediately | +| **cargo nextest** | Every commit | DST + unit tests validate changes | +| **cargo bench** | When optimizing | Baseline comparison | + +### What Each Layer Tells Me + +| Layer | What It Tells Me | +|-------|------------------| +| TLA+ | "This is the property I must preserve" | +| Stateright | "These edge cases were already explored" | +| Shared Invariants | "This is the exact check—copy this logic" | +| Kani | "My code is proven correct for all inputs" | +| DST | "My code survives these fault scenarios" | +| debug_assert! 
| "I'll catch mistakes at runtime" | +| Datadog | "Production confirms my verification" | + +--- + +## Summary + +| What | Pierre Zemb Principle | +|------|----------------------| +| **TLA+ specs** | "Plan First" - document intent before code | +| **Stateright** | "Exhaustive exploration" - all interleavings | +| **Shared Invariants** | "Single source of truth" - define once, use everywhere | +| **Kani** | "Prove for all inputs" - bounded model checking (all inputs within the configured bounds) | +| **DST** | "Deterministic simulation" - controlled randomness | +| **debug_assert!** | "Fail loudly" - assertions catch mistakes | +| **Bloodhound** | "Time-travel debugging" - replay any failure | +| **Benchmarks** | "Feedback loops" - measure before/after | +| **Datadog** | "Close the loop" - production validates theory | + +**The key insight:** Formal verification proves properties hold in theory. Datadog proves they hold in practice. Together, they create a complete feedback loop for generating correct code. + +--- + +## Final Thoughts + +This verification stack is what makes me confident when I write code for Quickhouse-Pomsky. + +Without it, I would be pattern-matching against examples and hoping for the best. With it, I have: +- **Formal specs** that tell me what must be true +- **Shared invariants** that give me the exact checks to implement +- **Model checkers** that explored edge cases I might miss +- **Proof tools** that verify my logic for all inputs within the checked bounds +- **Runtime assertions** that catch my mistakes +- **Production metrics** that confirm my verification works + +The pyramid works because each layer catches what the layer below misses: +- **Discovery** (TLA+) catches design flaws +- **Detection** (DST/Stateright) catches implementation bugs +- **Prevention** (debug_assert!) catches runtime violations +- **Production** (Datadog) catches emergent behavior + +When you ask me to write code, this is what I'm working with.
It's not magic—it's a carefully constructed verification chain that grounds my code generation in formal correctness. And with Datadog integrated, the feedback loop is complete: I learn from production what my formal models couldn't predict. + +--- + +*-- Claude (Anthropic)* +*Written: February 2026* +*For: Quickhouse-Pomsky Development Team* + +--- + +## References + +- Pierre Zemb: [What if we embraced simulation-driven development?](https://pierrezemb.fr/posts/simulation-driven-development/) +- Pierre Zemb: [What I Tell Colleagues About Using LLMs for Engineering](https://pierrezemb.fr/posts/llms-for-engineering/) +- TigerBeetle: [Simulation Testing](https://github.com/tigerbeetle/tigerbeetle/blob/main/docs/DESIGN.md#simulation-testing) +- FoundationDB: [Testing Distributed Systems](https://www.youtube.com/watch?v=4fFDFbi3toc) diff --git a/docs/internals/adr/000-template.md b/docs/internals/adr/000-template.md new file mode 100644 index 00000000000..c6cf112af30 --- /dev/null +++ b/docs/internals/adr/000-template.md @@ -0,0 +1,83 @@ +# ADR-NNN: Title + +## Metadata + +- **Status**: [Proposed | Accepted | Deprecated | Superseded] +- **Date**: YYYY-MM-DD +- **Tags**: [e.g., storage, query, protocol, testing, metrics, traces, logs] +- **Components**: [e.g., quickwit-metrics-engine, quickwit-indexing] +- **Authors**: [Names] + +## Context + +What is the issue that we're seeing that is motivating this decision or change? + +## Decision + +What is the change that we're proposing and/or doing? + +## Consequences + +What becomes easier or more difficult to do because of this change? + +### Positive + +- ... + +### Negative + +- ... + +### Risks + +- ... + +## Signal Generalization + +How does this decision apply across metrics, traces, and logs? If it only applies to one signal today, what would need to change for the others? + +## Decision Log + +This section tracks how this ADR evolves as we learn more and make new decisions. 
+Each entry should include the date, what was decided, and why. + +| Date | Decision | Rationale | +|------|----------|-----------| +| YYYY-MM-DD | Initial ADR created | [Why this decision was made] | + +### How to Add Entries + +When making an architectural decision related to this ADR: +1. Add a new row to the table above with today's date +2. Briefly describe the decision made +3. Explain the rationale (why this choice over alternatives) +4. If the decision significantly changes the original ADR, update the relevant sections above + +Examples of decisions to log: +- Changed implementation approach based on learnings +- Added constraints or requirements discovered during implementation +- Chose between alternative approaches during development +- Modified scope based on technical discoveries +- Deferred or rejected features with reasoning + +## Implementation Status + +### Implemented + +| Component | Location | Status | +|-----------|----------|--------| +| ... | ... | ... | + +### Validated + +- ... + +### Not Yet Implemented + +| Component | Notes | +|-----------|-------| +| ... | ... | + +## References + +- ... diff --git a/docs/internals/adr/001-parquet-data-model.md b/docs/internals/adr/001-parquet-data-model.md new file mode 100644 index 00000000000..2dee714454b --- /dev/null +++ b/docs/internals/adr/001-parquet-data-model.md @@ -0,0 +1,183 @@ +# ADR-001: Parquet Metrics Data Model + +## Metadata + +- **Status**: Proposed +- **Date**: 2026-02-19 +- **Tags**: storage, metrics, parquet, data-model +- **Components**: quickwit-parquet-engine +- **Authors**: gtt@ +- **Related**: [ADR-002](./002-sort-schema-parquet-splits.md), [ADR-003](./003-time-windowed-sorted-compaction.md), [Phase 1 Design](../locality-compaction/phase-1-sorted-splits.md) + +## Context + +Quickhouse-Pomsky's metrics pipeline stores data in Parquet files. A fundamental design question is how metrics data is represented at the row level: what does one row in a Parquet file correspond to? 
+ +Two models are under consideration: + +1. **Point-per-row**: Each row is a single data point — one metric value at one timestamp for one timeseries (identified by its metric name and tag set). +2. **Timeseries-per-row**: Each row represents an entire timeseries over some time range — the row contains arrays of timestamps and values for a single series, with the tag set stored once. + +This decision is foundational because it determines the shape of every downstream system: how compaction merges data, how sort schemas are defined, how DataFusion queries are structured, and what encodings are effective. + +## Decision + +### 1. Point-Per-Row + +Each row in a Parquet split represents a single data point: one metric value at one timestamp for one timeseries. + +Point-per-row is chosen because: + +- **Simpler compaction.** Sorted k-way merge operates directly on rows. Timeseries-per-row requires both row-level merge (interleaving rows from different splits) *and* intra-row series merge (combining the timestamp/value arrays of the same timeseries across splits). Point-per-row avoids this second level of merge entirely. +- **No last-write-wins (LWW).** We explicitly do not support LWW semantics, where a later write for the same timeseries and timestamp overwrites an earlier one. Without LWW, there is no need for sticky routing or series-level deduplication during compaction. This is a deliberate simplification that avoids the reliability challenges of sticky routing (single-partition overload, constant shuffles on rebalancing) that other systems have encountered. +- **No storage-level interpolation.** Interpolation across points in a timeseries is not performed at the storage layer. If needed in the future, it will operate at query time. This may be slower than storage-level interpolation but avoids coupling the storage format to query semantics.
+- **Performance equivalence with good encoding.** With sorted data, columnar encodings like RLE and dictionary encoding produce long runs of repeated values in the sort columns — the same runs that timeseries-per-row would capture by grouping values into arrays. When these encodings are preserved through query execution, point-per-row achieves comparable scan performance to timeseries-per-row without the implementation complexity. +- **Standard DataFusion operators.** Timeseries-per-row requires significant custom DataFusion operator support (nested array types, custom aggregation kernels). Point-per-row uses standard columnar operations, allowing us to contribute generic improvements to DataFusion rather than maintaining timeseries-specific extensions. + +### 2. No Last-Write-Wins Semantics + +We do not support LWW. If two data points arrive for the same timeseries at the same timestamp in separate ingest requests, both are stored. There is no per-point deduplication at the storage layer. This eliminates: + +- Sticky routing requirements (binding a timeseries to a specific shard/node) +- Series-level deduplication during compaction +- Ordering dependencies between ingestion nodes for the same series + +**Existing deduplication guarantees.** Quickwit provides deduplication at coarser granularities than individual points: + +- **WAL checkpoint exactly-once.** The indexing pipeline publishes each split atomically with a checkpoint delta that records which WAL positions the split covers. On crash recovery, the checkpoint prevents re-indexing the same WAL entries into duplicate splits. This guarantees that a given batch of ingested data produces exactly one set of splits, not that individual points within or across batches are deduplicated. See [compaction-architecture.md](../compaction-architecture.md) for details. 
+- **File-level deduplication for queue sources.** The SQS/S3 file source tracks ingested files via metastore shard checkpoints with a configurable deduplication window, preventing re-ingestion of the same file. + +**Per-point deduplication is not implemented.** If the same metric data point (identical metric name, tags, timestamp, and value) arrives in two separate ingest requests — due to client retries, overlapping sources, or upstream replay — both copies are stored. Per-point deduplication would require either sticky routing (binding a timeseries to a specific shard) or a dedup index (tracking recently-seen points), both of which add significant complexity and reliability risk. If per-point deduplication becomes a product requirement, it should be designed as a separate capability rather than baked into the storage data model. See [GAP-005](./gaps/005-no-per-point-deduplication.md). + +### 3. No Storage-Level Interpolation + +Interpolation (filling gaps in a timeseries, aligning timestamps across series) is not performed during ingestion or compaction. The storage layer stores raw points. Interpolation is a query-time operation. + +This decouples storage format from query semantics. Different query patterns may require different interpolation strategies (linear, last-value, none), and embedding one strategy in the storage format would constrain future flexibility. + +### 4. Timeseries ID (Optional Synthetic Column) + +When data is sorted by a sort schema (see [ADR-002](./002-sort-schema-parquet-splits.md)), the explicit sort columns may not be granular enough to distinguish individual point sources. For example, if the sort schema is `metric_name|env`, hundreds of hosts within the same environment produce interleaved points within each `(metric_name, env)` group. + +To improve locality, the data model includes an optional **`timeseries_id`** column: a hash of all tag names and values. 
When present, it is placed after explicit sort columns and before `timestamp` in the sort schema, acting as a tiebreaker that clusters points from the same tag combination. + +**Properties:** + +- **Synthetic column.** Not present in incoming data. Computed at ingestion by hashing the canonicalized (sorted by key name) set of all tag key/value pairs. +- **Hash function**: xxHash64 or SipHash, using a fixed seed/key shared by all nodes (a per-process random key, as in Rust's default `RandomState`, would violate DM-4). Deterministic and fast. +- **Persists through compaction.** Once computed and stored in the Parquet file, it does not need recomputation during merges. +- **Purely a physical layout optimization.** Not a semantic concept, not part of the query model. Nothing in the query path or system correctness depends on it. +- **Optional.** If explicit sort columns already provide sufficient granularity (e.g., include `host` or `container`), `timeseries_id` adds little value and can be omitted. It can be added or removed from the sort schema at any time — this is a schema change handled by the normal transition mechanism (new splits use the new schema, old splits age out via retention). + +**Limitations:** + +- **Hash collisions** (extremely unlikely with 64-bit hashes) would interleave two distinct point sources, affecting only physical layout, not correctness. +- **Tag flapping.** If a tag value intermittently changes (e.g., enrichment flapping between `NA` and an actual value), the hash changes and points from what a user would consider "the same source" get different `timeseries_id` values. This degrades locality but never correctness. If tag flapping is prevalent, omitting `timeseries_id` and relying on explicit sort columns alone may be preferable. +- **Min/max metadata is meaningless.** The hash value's min/max range has no query pruning utility (nobody filters by hash value). Implementations should emit null for `timeseries_id` in per-column min/max/regex metadata. + +### 5.
OTel Attribute Schema and Schema-on-Read + +**Current state: OTel map-based attributes.** The OpenTelemetry schema for metrics (as implemented in systems like [ClickStack](https://clickhouse.com/docs/use-cases/observability/clickstack/ingesting-data/schemas)) represents metric attributes as key-value maps — typically `Map(LowCardinality(String), String)` for resource attributes, scope attributes, and metric-level attributes. This is the schema our metrics pipeline currently ingests. + +Map columns are fundamentally non-columnar: all key-value pairs for a row are packed into a single column value. This has two consequences for Parquet storage: + +- **Poor compression.** A map column contains interleaved keys and values from many different attributes. Columnar encodings (RLE, dictionary) cannot exploit the structure of individual attributes because different attributes are mixed together in the same column. A dedicated `host` column with sorted data produces long runs of repeated values; a map column containing `host` alongside `env`, `region`, and dozens of other keys produces no useful runs. +- **No direct column access.** To evaluate a predicate like `attributes['host'] = 'web-01'`, the query engine must deserialize the entire map for each row and search for the key. There is no way to seek to `host` values specifically, and Parquet page-level statistics are meaningless for a map column (the min/max of a serialized map has no relationship to the values of individual keys within it). + +**Schema-on-read: attributes as columns.** A more effective storage representation is to extract each attribute into its own Parquet column at write time. Rather than storing `Map{"host": "web-01", "env": "prod", "region": "us-east-1"}` as one map value, we store three separate columns: `attr.host = "web-01"`, `attr.env = "prod"`, `attr.region = "us-east-1"`. 
Each column is independently typed, independently compressed, and independently accessible for predicate evaluation and page-level pruning. + +This is a **schema-on-read** approach: the storage layer stores data in whatever shape arrives, creating columns as needed, and any schema interpretation happens at query time. There is no requirement for the schema to be specified up front — new attribute keys that appear in incoming data produce new columns automatically. This is the approach taken by Datadog's internal systems (Husky, Metrics) and is identified as a key characteristic of cloud-native observability storage: "store all the data that the user sends, in whatever types they send it, resolving any ambiguity at query time." + +**Dense vs sparse columns.** Not every attribute needs its own column. Attributes that appear in <1% of rows produce extremely sparse columns that waste storage on null markers and add schema complexity. A practical threshold is to extract attributes as dedicated columns when they are **dense** (present in >1% of rows) and keep rare attributes in a residual map column. The density threshold is a tunable parameter. Over time, compaction could consolidate: an attribute that starts sparse (few sources report it) but becomes dense (adopted widely) can be promoted to its own column. + +**Implications for the sort schema.** Extracting attributes into columns is a prerequisite for effective sorting. The sort schema ([ADR-002](./002-sort-schema-parquet-splits.md)) references column names like `host`, `env`, `metric_name`. If these values are buried inside a map column, the sort is impossible — the writer cannot extract sort keys from a serialized map efficiently. Columnar attributes enable the sort schema to reference any attribute by name, and enable page-level statistics on sort columns that make intra-file pruning effective. + +**Transition.** The current OTel map-based ingestion format is the starting point. 
The indexing pipeline can extract attributes into columns at write time, presenting the original OTel map interface at the API boundary while storing columnar data internally. This is transparent to ingest clients — they continue sending OTel-format data. Queries can access attributes either by the original map path (for compatibility) or by direct column access (for performance). The storage representation is an internal optimization, not a change to the external data model. + +### 6. RLE/Dictionary Encoding and the Flurry Project + +The point-per-row model's performance depends on columnar encodings being preserved through the query pipeline. Currently, RLE and dictionary encoding are decoded to plain arrays early in DataFusion's execution. There is significant ongoing investment in **Flurry** (the metrics equivalent of Bolt) to preserve these encodings through more operators. + +As Flurry matures, the performance benefits of sorted point-per-row data increase: longer runs in sorted columns translate directly to better RLE compression ratios that are maintained through query execution. This makes point-per-row a bet that improves over time rather than a static trade-off. + +## Invariants + +These invariants must hold across all code paths (ingestion, compaction, query). + +| ID | Invariant | Rationale | +|----|-----------|-----------| +| **DM-1** | Each row in a Parquet split is exactly one data point: one metric value at one timestamp for one timeseries | Foundational data model. Enables row-level sorted merge without series-level merge logic | +| **DM-2** | No last-write-wins. If two data points with the same (metric name, tags, timestamp) arrive in separate ingest requests, both are stored | Eliminates sticky routing, series-level dedup, and ordering dependencies between nodes | +| **DM-3** | The storage layer does not perform interpolation. 
Points are stored as received; interpolation is a query-time operation | Decouples storage format from query semantics | +| **DM-4** | `timeseries_id`, if present, is deterministic: the same canonicalized tag set always produces the same hash value | Required for locality grouping to be consistent across ingestion and compaction | +| **DM-5** | `timeseries_id` persists through compaction without recomputation. The column is written once at ingestion and carried through all subsequent merges | Avoids recomputing hashes during merge (tags may not all be available as separate columns at merge time) | + +## Consequences + +### Positive + +- **Simple, composable storage format.** Standard Parquet rows with no nested types. Every tool in the Parquet/Arrow ecosystem works out of the box. +- **Straightforward compaction.** K-way merge is row-level only. No series-level merge logic. +- **No routing constraints.** Any node can ingest any point for any series. Load balancing is unconstrained. +- **Query flexibility.** No interpolation baked into storage. Batch-level dedup at ingest handles the common cases; storage and query layers are not coupled to a dedup strategy. +- **Encoding-friendly.** Sorted point-per-row produces long columnar runs that compress well and benefit from RLE/dictionary preservation. + +### Negative + +- **Tag redundancy.** Every row for the same timeseries repeats all tag values. In timeseries-per-row, tags are stored once per series. With good columnar encoding on sorted data, this redundancy compresses away, but it is still present in the uncompressed representation and affects memory usage during query execution until Flurry-style encoding preservation is complete. +- **OTel map attributes defeat columnar benefits.** The current OTel ingest schema stores attributes as key-value maps. Until schema-on-read column extraction is implemented, attributes cannot participate in sorting, page-level pruning, or efficient columnar compression. 
This is the most significant near-term limitation of the data model. +- **No intra-series locality guarantee.** Without `timeseries_id` in the sort schema, points from the same series may be interleaved with points from other series that share the same sort-column values. This is a configuration choice, not an inherent limitation. +- **Duplicate points are stored.** Without LWW or per-point dedup, retried ingestion or overlapping sources can produce duplicate points. Existing batch-level dedup (WAL checkpoints, file-level tracking) prevents most duplicates, but cross-request duplicates are possible. See [GAP-005](./gaps/005-no-per-point-deduplication.md). + +### Risks + +- **Flurry dependency for performance parity.** Until RLE/dictionary encoding is preserved through DataFusion, point-per-row may scan more data than timeseries-per-row for series-centric queries (e.g., "plot CPU for host X"). The magnitude depends on the encoding preservation timeline. +- **Wide tables (future research).** Metrics from the same source share nearly identical tags. Multiple metric names could be stored as separate value columns in a single wide row (e.g., `k8s.cpu.usage`, `k8s.cpu.limit`, `k8s.mem.usage` as columns sharing one tag set). This is the approach taken by TimescaleDB's hypertables. It would amortize tag storage further but requires significant compactor changes. Worth investigating as future research; it is compatible with point-per-row as an evolution, not a replacement. + +## Signal Generalization + +This ADR applies to **metrics** (Parquet pipeline). The data model decisions generalize as follows: + +- **Traces**: Point-per-row maps naturally to span-per-row (each span is one row). No LWW applies (spans are immutable). `timeseries_id` equivalent would be a hash of trace attributes for locality grouping. +- **Logs**: Point-per-row maps to log-entry-per-row (already the Quickwit model). No LWW. `timeseries_id` equivalent could group log entries by service/host. 
+ +The no-LWW and no-storage-interpolation decisions are universal across signals. The `timeseries_id` concept generalizes to any signal where grouping related records improves compression. + +## Decision Log + +| Date | Decision | Rationale | +|------|----------|-----------| +| 2026-02-19 | Initial ADR created | Establish foundational data model for Parquet metrics pipeline | +| 2026-02-19 | Point-per-row chosen over timeseries-per-row | Simpler compaction, no LWW, standard DataFusion operators. Performance parity via columnar encoding + Flurry | +| 2026-02-19 | No LWW semantics | Eliminates sticky routing and series-level dedup. Simplifies ingestion and compaction | +| 2026-02-19 | Dedup clarified: batch-level exists, per-point does not | WAL checkpoints provide exactly-once at the batch level. File-level dedup for queue sources. Per-point dedup not implemented; identified as GAP-005 if needed | +| 2026-02-19 | timeseries_id defined as optional synthetic column | Provides intra-group locality tiebreaker without adding complexity to the core data model | +| 2026-02-19 | Schema-on-read identified as target for attribute storage | OTel map-based attributes are non-columnar, defeating compression and sort/pruning. Extract dense attributes (>1% non-null) into individual columns at write time, keep rare attributes in residual map | + +## Implementation Status + +### Implemented + +| Component | Location | Status | +|-----------|----------|--------| +| Point-per-row Parquet schema | `quickwit-parquet-engine/src/schema/fields.rs` | Done. Each row is one metric data point | +| Tag columns in Parquet | `quickwit-parquet-engine/src/schema/fields.rs` | Done. 
Tags stored as dictionary-encoded columns per row | + +### Not Yet Implemented + +| Component | Notes | +|-----------|-------| +| timeseries_id computation | Hash of canonicalized tag key/value pairs, added as column at ingestion | +| timeseries_id persistence through compaction | Column must survive merge without recomputation | +| Schema-on-read attribute extraction | Extract dense attributes from OTel map columns into individual Parquet columns at write time | +| Dense/sparse column threshold | Determine density threshold (e.g., >1% non-null) for column extraction vs residual map | +| Residual map for sparse attributes | Keep rare attributes in a fallback map column alongside extracted dense columns | + +## References + +- [Phase 1: Sorted Splits for Parquet](../locality-compaction/phase-1-sorted-splits.md) — full design document +- [ADR-002: Configurable Sort Schema](./002-sort-schema-parquet-splits.md) — sort schema that operates on this data model +- [ADR-003: Time-Windowed Sorted Compaction](./003-time-windowed-sorted-compaction.md) — compaction that relies on this data model +- [Husky Storage Compaction Blog Post](https://www.datadoghq.com/blog/engineering/husky-storage-compaction/) +- [ClickStack OTel Schemas](https://clickhouse.com/docs/use-cases/observability/clickstack/ingesting-data/schemas) — OTel map-based attribute schema +- [Characteristics of Cloud Native Storage relevant to Quickhouse](https://docs.google.com/document/d/...) 
— schema-on-read and cloud-native storage characteristics diff --git a/docs/internals/adr/002-sort-schema-parquet-splits.md b/docs/internals/adr/002-sort-schema-parquet-splits.md new file mode 100644 index 00000000000..09c0feca993 --- /dev/null +++ b/docs/internals/adr/002-sort-schema-parquet-splits.md @@ -0,0 +1,196 @@ +# ADR-002: Configurable Sort Schema for Parquet Splits + +## Metadata + +- **Status**: Proposed +- **Date**: 2026-02-19 +- **Tags**: storage, metrics, compaction, parquet, sorting +- **Components**: quickwit-parquet-engine, quickwit-indexing +- **Authors**: gtt@ +- **Related**: [ADR-001](./001-parquet-data-model.md), [ADR-003](./003-time-windowed-sorted-compaction.md), [Phase 1 Design](../locality-compaction/phase-1-sorted-splits.md) + +## Context + +Metrics data arrives at Quickwit through load-balanced routing: an external load balancer distributes requests across nodes, each node's `IngestRouter` picks a shard via round-robin, and the indexing pipeline produces splits stamped with the producing node's identity. Points for any given timeseries are scattered across whichever nodes happened to receive them. + +Within each split, rows are stored in ingestion order (see [ADR-001](./001-parquet-data-model.md) for the point-per-row data model). There is no relationship between the physical layout of rows and the logical structure of the data. A query for a specific metric name must scan all rows in every split in the time range. + +Sorting rows within each split by a schema aligned with common query predicates produces two immediate benefits: + +1. **Compression improvement.** Columnar formats like Parquet compress data by encoding runs of similar values. When rows are sorted by metric name and tags, the columns for those fields contain long runs of identical or similar values, benefiting RLE, dictionary encoding, and general-purpose compression (ZSTD). In Husky Phase 1, this yielded ~33% size reduction for APM data and ~25% for Logs data. +2. 
**Query efficiency.** Parquet's column index (format v2) stores min/max statistics per page within each column chunk. When data is sorted, pages within each column naturally have non-overlapping value ranges for the sort columns. DataFusion supports page index pruning, allowing it to skip pages that cannot match a query predicate. + +Matthew Kim's implementation added a fixed sort on `(MetricName, TagService, TagEnv, TagDatacenter, TagRegion, TagHost, TimestampSecs)` in the Parquet writer (`quickwit-parquet-engine/src/storage/writer.rs`), demonstrating that sorting is feasible and inexpensive. However, this sort order is hardcoded in `ParquetField::sort_order()` and cannot be customized per index or deployment. Different workloads have different high-value columns; a metrics index tracking Kubernetes containers benefits from sorting by `pod` and `namespace`, while an infrastructure metrics index benefits from `host` and `datacenter`. + +This ADR formalizes the sort schema as a configurable, per-index property stored in the metastore. + +## Decision + +### 1. Sort Schema Format + +A sort schema is a per-index (per-table) property stored in the metastore. It specifies an ordered list of columns that determine row sort order within each split, and optionally additional columns for which metadata (min/max/regex) is emitted but which do not participate in sorting. + +The sort schema is **mutable at runtime**. When an operator changes the sort schema for an index in the metastore, the change is propagated to the indexing pipelines on the appropriate nodes so that newly-produced splits use the new schema. Already-written splits retain their original sort schema and are not rewritten — they age out via retention. The compaction scope includes `sort_schema` (see [ADR-003](./003-time-windowed-sorted-compaction.md)), so splits with different sort schemas are never merged together. 
+ +Format (following Husky convention): + +``` +[schema_name=]column[+/-]|...[&column[+/-]|...]/V2 +``` + +Components: + +- **Schema name** (optional): Labels the schema for identification. Example: `metrics_default=metric_name|...` +- **Sort columns** (pipe-delimited): Define the sort order. Each column may have `+` (ascending) or `-` (descending) suffix. Default direction is ascending, except `timestamp` which defaults to descending. +- **LSM cutoff** (`&`): Separates sort columns from metadata-only columns. Columns after `&` do not affect sort order, but min/max/regex metadata is emitted for them to enable future query pruning. +- **Version suffix** (`/V2`): Format version identifier. + +Each column has: + +| Property | Description | +|----------|-------------| +| **Name** | Column name as it appears in the Parquet schema | +| **Direction** | Ascending (`+`, default) or descending (`-`). `timestamp` defaults to descending | +| **Type** | Inferred from Parquet schema: string/binary (lexicographic), integer types (numeric), float types (numeric, NaN sorts after all values per IEEE 754 total order) | +| **Null handling** | Nulls sort **after** non-null values for ascending columns, **before** non-null values for descending columns | + +**Note on null handling:** The current implementation uses `nulls_first: true` for all columns. This must be changed to match the design: ascending columns should use `nulls_first: false` (nulls last), descending columns should use `nulls_first: true` (nulls first). This makes nulls behave as the largest value in both directions: they come last in ascending order and first in descending order. + +### 2. Schema Requirements + +- Sort columns should be a small subset (typically 3-5) corresponding to the most common query predicates, optionally followed by `timeseries_id` (see [ADR-001](./001-parquet-data-model.md)), followed by `timestamp`. +- Missing sort columns in a split (e.g., from schema evolution) are treated as null for all rows in that split.
This is not an error condition. +- The schema string must end with `/V2`. +- Metadata-only columns (after `&`) are optional. + +### 3. Sorting at Ingestion + +The Parquet writer is modified to sort accumulated RecordBatch data by the configured sort schema before writing. The steps for each split: + +1. **Accumulate rows** into RecordBatch arrays (as today). +2. **Compute timeseries_id** (if configured in the sort schema). See [ADR-001](./001-parquet-data-model.md) for computation details. +3. **Extract sort columns** from the accumulated rows. +4. **Compute sort indices** using Arrow's `lexsort_to_indices`, respecting direction and null ordering per the schema. +5. **Apply permutation** to all columns using Arrow's `take` kernel. +6. **Write Parquet file** with column index (page-level min/max) and offset index enabled. These are opt-in Parquet format v2 features required for DataFusion page-level predicate pushdown. +7. **Record metadata**: sort schema string, per-column min/max/regex. + +### 4. Sort Metadata Storage + +The sort schema and per-column statistics are stored in two places: + +**PostgreSQL (`MetricsSplitMetadata`)**: The schema string and min/max/regex vectors are stored alongside existing split metadata. This enables split-level query pruning without reading Parquet data. 
+ +**Parquet `key_value_metadata`**: The schema is embedded in the file, making it self-describing: + +| Key | Value | +|-----|-------| +| `sort_schema` | Full schema string (e.g., `metric_name\|host\|env\|timeseries_id\|timestamp&service/V2`) | +| `schema_column_min_values` | JSON array of min values, positional by schema column order | +| `schema_column_max_values` | JSON array of max values, positional by schema column order | +| `schema_column_regexes` | JSON array of regex strings, positional by schema column order | + +**Parquet `sorting_columns`**: Sort columns (before `&`) are declared using Parquet's native `sorting_columns` field, specifying column index, direction, and null ordering. This allows Parquet-native tooling and DataFusion to leverage sort order without understanding our custom format. + +### 5. Examples + +Metrics index with explicit sort on metric name, host, and env, with timeseries_id tiebreaker and service as metadata-only: + +``` +metric_name|host|env|timeseries_id|timestamp&service/V2 +``` + +Without timeseries_id (when host provides sufficient granularity): + +``` +metric_name|host|env|timestamp&service/V2 +``` + +Minimal schema: + +``` +metric_name|timestamp/V2 +``` + +## Invariants + +These invariants must hold across all code paths (ingestion, compaction, query). + +| ID | Invariant | Rationale | +|----|-----------|-----------| +| **SS-1** | All rows within a split are sorted according to the sort schema recorded in that split's metadata | Foundation for page-level pruning and sorted merge. Violated data produces incorrect merge results | +| **SS-2** | Nulls sort after non-null values for ascending columns and before non-null values for descending columns | Consistent null ordering across ingestion and merge. Matches Husky convention | +| **SS-3** | If a sort column is missing from a split, all rows in that split are treated as null for that column. 
This is not an error | Enables schema evolution — columns can be added to the sort schema without rewriting existing splits | +| **SS-4** | The sort schema stored in a split's metadata is the schema that was in effect when that split was written. Already-written splits are never re-sorted | Changes propagate forward only. Old splits age out via retention | +| **SS-5** | The sort schema string is the same in the metastore (per-split metadata), the Parquet `key_value_metadata`, and the Parquet `sorting_columns` field for a given split | Three representations of the same truth. Inconsistency between them would cause incorrect merge or pruning behavior | + +## Consequences + +### Positive + +- **20-35% compression improvement** for metrics data (based on Husky Phase 1 results for similar workloads). Sorted columnar layout compresses tag columns with high value repetition very efficiently. +- **Page-level query pruning** via Parquet column index. When data is sorted, pages within each column have non-overlapping value ranges for sort columns. DataFusion can skip irrelevant pages. +- **Customizable per workload.** Different indexes can use different sort schemas optimized for their query patterns. +- **Runtime mutability.** Sort schema changes propagate to indexing pipelines without restart or redeployment. Old-schema splits coexist safely via the compaction scope. +- **Self-describing files.** Sort metadata in the Parquet file enables debugging, offline analysis, and disaster recovery without metastore access. +- **Foundation for compaction.** Sorted splits are a prerequisite for sorted merge compaction ([ADR-003](./003-time-windowed-sorted-compaction.md)). + +### Negative + +- **~2% CPU overhead at ingestion** for sorting. Expected to be offset by reduced compression cost (ZSTD works less on better-organized data), resulting in net CPU neutral or positive. 
+- **Schema format complexity.** The pipe-delimited format with direction suffixes, LSM cutoff, and version suffix is non-trivial. Parsing and validation code must be thorough. +- **Metadata-only columns (after `&`) have zero Phase 1 benefit.** They add storage overhead with no payoff until split-level query pruning exists (Phase 3). This is a deliberate bet on future value. + +### Risks + +- **Compression improvement may differ from Husky.** Husky's 25-33% was measured on logs/APM data. Metrics data has different characteristics (lower cardinality metric names, higher cardinality tag values). The design doc recommends running a validation experiment (sort existing Parquet files by the proposed schema and compare sizes) before committing to the full implementation. + +## Signal Generalization + +This ADR applies to **metrics** (Parquet pipeline) in Phase 1. The sort schema concept generalizes to all three signals: + +- **Traces**: Sort by `service_name|operation_name|trace_id|timestamp` would co-locate spans from the same service and enable page-level pruning on service. +- **Logs**: Sort by `service_name|level|host|timestamp` would co-locate logs from the same service at the same severity level. + +Phase 4 of the locality compaction roadmap extends sorting to the Tantivy pipeline for logs/traces. The sort schema format, null handling, and metadata storage are designed to be signal-agnostic. The main adaptation required for Tantivy is integrating sort order with fast fields rather than Parquet columns. 
+ +## Decision Log + +| Date | Decision | Rationale | +|------|----------|-----------| +| 2026-02-19 | Initial ADR created | Formalize existing sort implementation and design configurable sort schema for Phase 1 locality compaction | +| 2026-02-19 | Husky-compatible sort schema format adopted | Enables knowledge transfer and tooling reuse from Husky locality project | +| 2026-02-19 | Null sort direction: nulls-last for ascending, nulls-first for descending | Matches Husky behavior, ensures nulls cluster at end of value range. Current implementation (nulls_first=true for all) must be corrected | +| 2026-02-19 | Sort schema stored in metastore per-index, mutable at runtime, propagated to pipelines | Schema is a table-level property, not static config. Changes distributed to indexing nodes without restart. Already-written splits keep old schema, age out via retention | + +## Implementation Status + +### Implemented + +| Component | Location | Status | +|-----------|----------|--------| +| Fixed sort at ingestion | `quickwit-parquet-engine/src/storage/writer.rs:84-109` | Done (Matthew Kim). Hardcoded sort on MetricName, TagService, TagEnv, TagDatacenter, TagRegion, TagHost, TimestampSecs | +| Sort column definition | `quickwit-parquet-engine/src/schema/fields.rs:146-158` | Done. `ParquetField::sort_order()` returns fixed column list | +| lexsort_to_indices usage | `quickwit-parquet-engine/src/storage/writer.rs` | Done. Arrow sort + take kernel applied in `sort_batch()` | + +### Not Yet Implemented + +| Component | Notes | Gap | +|-----------|-------|-----| +| Sort schema parser | Parse `column\|...\|timestamp&metadata/V2` format | [GAP-002](./gaps/002-fixed-sort-schema.md) | +| Sort schema in metastore | Schema stored per-index in metastore, mutable at runtime, propagated to pipelines on change | [GAP-002](./gaps/002-fixed-sort-schema.md) | +| Configurable sort directions | Currently all ascending. 
Need per-column `+`/`-` | [GAP-002](./gaps/002-fixed-sort-schema.md) | +| Correct null ordering | Currently `nulls_first: true` for all. Need nulls-last for ascending | [GAP-002](./gaps/002-fixed-sort-schema.md) | +| Parquet column index + offset index emission | Enable page-level min/max stats at write time | [GAP-004](./gaps/004-incomplete-split-metadata.md) | +| Sort metadata in PostgreSQL | sort_schema, per-column min/max/regex in MetricsSplitMetadata | [GAP-004](./gaps/004-incomplete-split-metadata.md) | +| Sort metadata in Parquet key_value_metadata | sort_schema, min/max/regex embedded in file | [GAP-004](./gaps/004-incomplete-split-metadata.md) | +| Parquet native sorting_columns field | Declare sort order in Parquet file metadata | [GAP-004](./gaps/004-incomplete-split-metadata.md) | + +## References + +- [Phase 1: Sorted Splits for Parquet](../locality-compaction/phase-1-sorted-splits.md) — full design document +- [Compaction Architecture](../compaction-architecture.md) — current compaction system description +- [ADR-001: Parquet Data Model](./001-parquet-data-model.md) — point-per-row data model and timeseries_id +- [ADR-003: Time-Windowed Sorted Compaction](./003-time-windowed-sorted-compaction.md) — compaction that depends on sort schema +- [Husky Phase 1: Locality of Reference](https://docs.google.com/document/d/1x9BO1muCTo1TmfhPYBdIxZ-59aU0ECSiEaGPUcDZkPs/edit) — prior art +- [Husky Storage Compaction Blog Post](https://www.datadoghq.com/blog/engineering/husky-storage-compaction/) diff --git a/docs/internals/adr/003-time-windowed-sorted-compaction.md b/docs/internals/adr/003-time-windowed-sorted-compaction.md new file mode 100644 index 00000000000..e9e254aa643 --- /dev/null +++ b/docs/internals/adr/003-time-windowed-sorted-compaction.md @@ -0,0 +1,305 @@ +# ADR-003: Time-Windowed Sorted Compaction for Parquet + +## Metadata + +- **Status**: Proposed +- **Date**: 2026-02-19 +- **Tags**: storage, metrics, compaction, parquet, time-windowing +- 
**Components**: quickwit-parquet-engine, quickwit-indexing, quickwit-metastore +- **Authors**: gtt@ +- **Related**: [ADR-001](./001-parquet-data-model.md), [ADR-002](./002-sort-schema-parquet-splits.md), [Phase 1 Design](../locality-compaction/phase-1-sorted-splits.md) + +## Context + +The metrics pipeline currently has **no compaction**. Splits accumulate without merging, relying on DataFusion to query many small Parquet files and on time-based retention to remove old data. This is documented in [compaction-architecture.md](../compaction-architecture.md) which notes: "Metrics splits accumulate without compaction. This is tolerable in the short term because DataFusion can query many small Parquet files, and time-based retention eventually removes old data. But it is not ideal." + +The consequences of no compaction are severe at scale: + +- **Unbounded split count within retention window.** At 10 GiB/s ingestion with 10 MiB splits, the system produces ~1,024 splits per second — ~921,600 splits per 15-minute window before compaction. +- **Query fan-out proportional to split count.** Every query must open and scan every split in the relevant time range. More splits means more I/O, more metadata lookups, and more DataFusion task scheduling overhead. +- **No intra-file pruning without sort order.** Without page-level column indexes on sorted data, DataFusion must scan the entire file even when only a small fraction of rows match the query predicate. + +The existing logs/traces compaction system (`StableLogMergePolicy` with Tantivy merge) is designed for a different storage format and does not apply to Parquet splits. Metrics compaction requires a purpose-built pipeline that understands Parquet, sort order, and time-based data organization. 
+ +This ADR introduces time-windowed sorted compaction: all data is partitioned into fixed-duration time windows, and compaction merges splits within each window using a k-way sorted merge that preserves the sort order established by [ADR-002](./002-sort-schema-parquet-splits.md), operating on the point-per-row data model defined in [ADR-001](./001-parquet-data-model.md). + +## Decision + +### 1. Time Windowing + +All data in the Parquet pipeline is organized into **time windows**: fixed-duration, non-overlapping intervals of wall-clock time aligned to the Unix epoch. Time windowing is enforced by both the **indexing pipeline** (which honors window boundaries — if a batch straddles a boundary, it is split so that each resulting split belongs to exactly one window) and the **compaction pipeline** (which only merges splits within the same window and never combines data across window boundaries). + +**Configuration:** + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `window_duration` | 15 minutes | Duration of each window. Must evenly divide one hour (valid: 1m, 2m, 3m, 4m, 5m, 6m, 10m, 12m, 15m, 20m, 30m, 60m) | +| `compaction_start_time` | (required) | Unix timestamp (seconds). Only windows at or after this time are eligible for compaction. Data before this time ages out via retention | +| `late_data_acceptance_window` | 1 hour | Maximum age of a data point accepted at ingestion. Points older than this are dropped. Bounds disturbance of compacted windows | + +**Window computation:** + +``` +window_start = t - (t % window_duration_seconds) +window_end = window_start + window_duration_seconds +``` + +Each window is identified by its `window_start` timestamp (seconds since Unix epoch). + +**Why time windowing:** + +1. **Bounds compaction scope.** Each window is an independent compaction unit. The total data eligible for a single merge is bounded by window duration and ingestion rate. +2. 
**Aligns with query patterns.** Observability queries always include a time range predicate. The query engine can discard all windows outside the query range without examining individual split metadata. +3. **Enables efficient retention.** Dropping old data becomes a window-level operation: all splits in expired windows can be deleted as a batch. +4. **Limits write amplification.** Old, fully-compacted windows are never disturbed by new data in newer windows. + +### 2. Split-to-Window Assignment at Ingestion + +Splits are assigned to a time window based on the timestamps of the rows they contain. When a split contains rows from multiple windows (common when a commit interval straddles a window boundary), the split is **partitioned by window** before writing: rows are grouped by window assignment, and a separate Parquet file is written for each window. + +**Invariant:** Every split in object storage belongs to exactly one time window. This invariant is established at ingestion (when a batch of data straddles a window boundary, the indexing pipeline splits it at the boundary, producing separate splits for each window) and preserved through compaction (the compaction pipeline includes `window_start` in the merge scope, preventing cross-window merges). A window will typically contain many small splits from ingestion; compaction reduces them over time. + +The window assignment uses the timestamp column referenced in the sort schema. Rows with null timestamps are assigned to a designated overflow window (`window_start = 0`), compacted separately. + +**Note on existing time partitioning:** The codebase has a `PartitionGranularity` enum in `quickwit-parquet-engine/src/split/partition.rs` with `Hour`, `Day`, `Week` variants. This does not match the design requirement for finer-grained, epoch-aligned windows (1-60 minutes). The time windowing implementation should either extend or replace the existing partitioning infrastructure. + +### 3. 
Compaction Scope + +The compaction scope for Parquet splits has two layers: a **compatibility scope** that determines which splits may be merged, and a **grouping dimension** (`window_start`) that the merge planner uses to select per-window merge candidates within the compatibility scope. + +**Compatibility scope** (6-part key): + +``` +(index_uid, source_id, partition_id, doc_mapping_uid, sort_schema, window_duration) +``` + +Only splits sharing all six components are merge-compatible. + +| Component | Purpose | Change from current | +|-----------|---------|---------------------| +| `index_uid` | Prevents cross-index merging | No change | +| `source_id` | Prevents cross-source merging | No change | +| `partition_id` | Tenant isolation | No change | +| `doc_mapping_uid` | Prevents incompatible schema merging | No change | +| `sort_schema` | Prevents merging splits with different sort orders | **New** | +| `window_duration` | Prevents merging splits from different window duration configurations | **New** | + +**Merge grouping** (within the compatibility scope): + +Within a compatibility scope, the merge planner groups splits by `window_start` and only merges splits within the same window. This is analogous to how the existing Tantivy merge planner groups by `(partition_id, doc_mapping_uid)` within a pipeline's filtered set. + +`window_duration` rather than `window_start` is the right compatibility dimension because different window durations can produce windows with the same start time. For example, a 5-minute window `[00:00, 00:05)` and a 15-minute window `[00:00, 00:15)` both have `window_start = 0`, but they contain data for different time ranges and must not be merged. Partitioning on `window_duration` prevents this; the merge planner then naturally groups by `window_start` within each duration. + +**`node_id` is intentionally excluded.** In Phase 1, each node compacts its own splits (the current behavior), but the scope definition does not require it. 
This is a forward-looking design choice — Phase 2 lifts the node constraint for cross-node compaction. + +**Window duration changes.** If the configured window duration changes, new splits use the new duration. The compatibility scope prevents merging across durations. Old-duration splits age out via retention. + +### 4. Sorted Merge + +The merge process combines N sorted input splits into one or more sorted output splits. Logically, the rows of all input splits are sorted together by their sort key — the lexicographic ordering of all sort column values for each row. The merge proceeds in three phases: determine the global sort order, stream all columns through that order, then emit split metadata for the output. + +**Phase 1: Determine global sort order.** + +Read the sort columns from each input split. Parquet's footer-based format allows seeking directly to any column. Compute a permutation that represents the sorted interleaving of all rows across all inputs, respecting the sort schema's comparison rules (lexicographic for strings, numeric for numbers, nulls-last for ascending, nulls-first for descending). + +The sort order is represented as a **run-length encoded** sequence of `(split_index, start_row, row_count)` triples. Because inputs are already sorted, the merge naturally produces long contiguous runs from the same input. This representation enables bulk operations (bulk `take`, bulk copy) during column streaming. + +**Sort order implementation — open question.** There are two candidate approaches for computing the global sort order, and both should be benchmarked on representative workloads: + +- **K-way merge.** Use a min-heap (priority queue) with one entry per input split, advancing through rows in sorted order. Complexity O(R log N) where R is total row count and N is number of input splits. +- **Stable sort.** Concatenate the sort columns from all inputs and perform a stable sort over the combined rows.
Complexity O(R log R), but stable sort implementations benefit from presorted runs (e.g., Timsort detects and exploits existing order). In Husky's Go implementation, stable sort was faster than k-way merge, likely because of better cache locality and lower per-comparison overhead for the common case of long sorted runs. + +**Row comparison — open question.** Two strategies for comparing rows during sorting: + +- **Composite key.** Encode all sort column values into a single byte-comparable key per row (e.g., using Google's [Ordered Code](https://github.com/google/orderedcode) encoding or Arrow's row format). Comparisons become a single `memcmp`. Amortizes multi-column comparison cost but requires encoding all sort column values upfront. +- **Column-at-a-time comparison.** Consult the individual sort column values at each row position during comparison, comparing column by column. Avoids the encoding step and may be faster when early columns (e.g., `metric_name`) distinguish most rows without needing to examine later columns. + +**Phase 2: Stream columns through the merge.** + +Once the global sort order is determined, each column is read from the input splits and written to the output in sorted order. Columns are processed one at a time (or in small groups) for memory efficiency. + +For large columns, it may be advantageous to operate at **page granularity** rather than loading an entire column from each input: read individual Parquet pages from inputs as needed and write individual pages to the output. This bounds memory usage for columns with large values (e.g., high-cardinality string tags, large attribute maps) and avoids materializing an entire column across all inputs simultaneously. + +**Phase 3: Emit split metadata.** The output split records sort_schema, window_start, window_duration_secs, and per-column min/max/regex. + +**Self-reinforcing feedback loop:** Sorted inputs produce longer contiguous runs in the merge order. 
Each compaction cycle produces better-sorted outputs, meaning the next merge has longer runs, smaller merge orders, and cheaper execution. The system gets cheaper to compact over time. + +### 5. Column Set Differences Across Inputs + +Schema evolution means input splits may have different column sets: + +- **Sort columns missing from an input:** All rows from that split are treated as null for the missing column. Nulls sort according to the schema rules. The sorted merge handles this naturally, whichever sort-order implementation is chosen. +- **Non-sort columns:** The merge computes the **union** of all column names across inputs. The output contains every column that appears in at least one input. Rows from inputs lacking a column are filled with nulls. If the same column name has different types across inputs, the merge fails with an error (schema evolution conflict requiring resolution at the index configuration level). + +### 6. Late-Arriving Data + +Points whose timestamps are more than `late_data_acceptance_window` in the past are **dropped at ingestion** rather than accepted. This bounds the window of time during which late data can disturb already-compacted windows. + +Within the acceptance window, late data is handled naturally: + +1. Late data is written to a new split assigned to the historical window (based on timestamp, not ingestion time). +2. The next compaction cycle for that window picks up the new split and merges it with existing compacted splits. +3. No special handling required; the window gains additional splits merged in the normal course. + +For windows already compacted to a single large split, a late-arriving small split triggers a merge of the large split with the small one. The acceptance window bounds how far back this happens. + +### 7. Pre-existing Unsorted Data + +Splits produced before Phase 1 have no sort schema and no window assignment. These splits are **not compacted** — they remain as-is until they expire via retention.
+ +`compaction_start_time` defines the boundary: only splits whose `window_start` >= this time are eligible for compaction. Splits with no window assignment or `window_start` before the cutoff are excluded from compaction planning entirely. + +Data arriving after Phase 1 enablement but with timestamps before `compaction_start_time` is still written as a sorted, windowed split (the indexer always applies windowing and sorting once Phase 1 is active). However, these splits are not eligible for compaction and age out alongside pre-existing unsorted splits. + +### 8. Compaction Policy + +Phase 1 adapts Quickwit's existing compaction scheduling and `StableLogMergePolicy` for Parquet splits. Within each time window, the merge policy uses the same maturity/age constraints to determine merge eligibility. + +Key parameters requiring experimental validation: + +| Parameter | Question | Approach | +|-----------|----------|----------| +| Target split size | How large should merge output be? | Sweep 64MB, 128MB, 256MB, 512MB on representative workload | +| Merge fanin | How many inputs per merge? | Sweep 4, 8, 16; measure duration, memory, write amplification | +| Window size interaction | How many splits accumulate per window? What is steady-state after compaction? | Measure at representative ingestion rates | + +**Recommended experiments before finalizing policy:** + +1. **Baseline:** Measure splits per 15-minute window, size of each split, total data volume per window. +2. **Merge fanin sweep:** Fixed target size, vary fanin (4, 8, 16). Measure merge duration, peak memory, write amplification. +3. **Target size sweep:** Fixed fanin, vary target size (64MB-512MB). Measure steady-state split count, query latency, compression ratio. +4. **Compression validation:** Compare sorted vs. unsorted Parquet files for same data. 
**This is the existential experiment** — if compression improvement is <10% for metrics data (vs Husky's 25-33% for logs/APM), the cost-benefit calculus changes significantly. + +### 9. Split Metadata Extensions + +The following fields are added to `MetricsSplitMetadata` and the `metrics_splits` PostgreSQL table: + +| Field | Type | Description | +|-------|------|-------------| +| `window_start` | `i64` | Unix timestamp (seconds) of the time window start | +| `window_duration_secs` | `u32` | Window duration in effect when split was produced | +| `sort_schema` | `String` | Full sort schema string including version suffix. Empty if pre-Phase-1 | +| `schema_column_min_values` | `Vec<SortColumnValue>` | Min value per schema column (sort + metadata-only), positional | +| `schema_column_max_values` | `Vec<SortColumnValue>` | Max value per schema column, positional | +| `schema_column_regexes` | `Vec<String>` | Regex matching any value per schema column, positional | + +`SortColumnValue` is a tagged union of string, i64, u64, f64, and null. + +These metadata fields are stored in **both** PostgreSQL (authoritative source for query planning) and Parquet `key_value_metadata` (making files self-describing). + +**PostgreSQL scalability note.** At high ingestion rates (~921K pre-compaction splits per 15-minute window), PostgreSQL metadata volume can exceed what a single OLTP database handles efficiently. The design mitigates this in two ways: (1) self-describing Parquet files mean the external store is an index, not the sole source of truth; (2) metadata fields are simple, typed, and portable — they can be stored in any system supporting efficient range queries. + +## Invariants + +These invariants must hold across all code paths (ingestion, compaction, query). + +### Window invariants + +| ID | Invariant | Rationale | +|----|-----------|-----------| +| **TW-1** | Every split in object storage belongs to exactly one time window.
Established at ingestion (batches that straddle a window boundary are split), preserved through compaction (merge scope includes `window_start` as a grouping dimension) | Enables window-level retention, query pruning by window, and bounded compaction scope | +| **TW-2** | `window_duration` must evenly divide one hour (valid: 1, 2, 3, 4, 5, 6, 10, 12, 15, 20, 30, 60 minutes) | Ensures window boundaries align across hours and days, preventing fragmented or offset windows | +| **TW-3** | Data is never merged across window boundaries | Each window is an independent compaction unit. Cross-window merges would violate TW-1 and make window-level retention impossible | + +### Compaction scope invariants + +| ID | Invariant | Rationale | +|----|-----------|-----------| +| **CS-1** | Only splits sharing all six compatibility scope components (`index_uid`, `source_id`, `partition_id`, `doc_mapping_uid`, `sort_schema`, `window_duration`) may be merged | Prevents merging incompatible data: different indexes, schemas, sort orders, or window durations | +| **CS-2** | Within a compatibility scope, only splits with the same `window_start` are merged | Enforces TW-3 (no cross-window merges). The merge planner groups by `window_start` | +| **CS-3** | Splits with no window assignment, or whose `window_start` is before `compaction_start_time`, are never compacted | Clean transition: no mixed sorted/unsorted merge path. Pre-Phase-1 data ages out via retention | + +### Merge correctness invariants + +| ID | Invariant | Rationale | +|----|-----------|-----------| +| **MC-1** | The set of rows does not change during compaction, only their order. The output of a merge contains exactly the same rows as the union of its inputs — no rows are added, removed, or duplicated | Compaction is a physical reorganization, not a logical transformation. Queries over a window must return the same results before and after compaction | +| **MC-2** | Row contents do not change during compaction.
The value of every column for every row is identical in the output as in the input, except for explicitly designated bookkeeping columns (e.g., `write_amplification_count`) that track compaction metadata | Data integrity through compaction. The storage layer does not transform, aggregate, or filter user data | +| **MC-3** | The output of a merge is sorted according to the sort schema of the inputs. Sort order is preserved, never degraded, through compaction | Enables sorted merge to be applied iteratively. Each compaction generation is at least as well-sorted as its inputs | +| **MC-4** | If inputs have different column sets (schema evolution), the output contains the union of all columns. Rows from inputs missing a column are filled with nulls. Type conflicts on the same column name are an error | Ensures no data loss during merge. Type conflicts require explicit resolution at the index configuration level | + +## Consequences + +### Positive + +- **Reduces split count per time window** from unbounded accumulation to a bounded steady-state after compaction converges. +- **Larger splits improve query throughput.** Fewer splits means less fan-out, less metadata overhead, less DataFusion scheduling cost. +- **Sort order preserved through merges.** Compaction never degrades the sort quality established at ingestion. +- **Time windows provide free query pruning.** A query for a specific time range can discard entire windows without consulting min/max metadata — this works immediately, before Phase 3 split-level pruning. +- **Clean transition.** `compaction_start_time` cutoff means no mixed sorted/unsorted merge path. Old data ages out via retention. +- **Foundation for Phase 2.** Cross-node compaction, m:n merges, and split-level query pruning all build on the sorted-split + time-window infrastructure. + +### Negative + +- **New actor pipeline required.** The Parquet merge pipeline (planner, downloader, merge executor, uploader, publisher) must be built from scratch. 
The Tantivy merge actors cannot be reused directly — they use `UnionDirectory` for Tantivy segments, not Parquet k-way merge. +- **Memory cost of sorted merge.** The sort-order computation phase must hold sort columns from all input splits in memory. For large merges (16 inputs x 500K rows x 5 sort columns), this could be significant. Page-level streaming for non-sort columns mitigates total memory, but the sort-order phase is unavoidable. +- **Compaction policy borrowed from a different workload.** `StableLogMergePolicy` was designed for Tantivy log/trace splits. Metrics have different characteristics (higher write rates, smaller events, time-series structure). The policy may need metrics-specific tuning or replacement. + +### Risks + +- **PostgreSQL metadata scalability.** At extreme ingestion rates, the per-split metadata volume may exceed PostgreSQL's capacity for efficient query planning lookups. The design explicitly acknowledges this and recommends the metadata architecture be prepared for a future migration to a dedicated metadata service or columnar store. +- **Late data volume at scale.** The design assumes late-arriving data is rare. At 10 GiB/s, even 0.1% late data is 10 MiB/s, triggering re-merges of compacted windows. The `late_data_acceptance_window` bounds this, but sustained late data from a source with systematic clock skew can cause compaction churn. +- **Window duration sensitivity.** Too-short windows relative to commit interval produce many tiny splits per window. Too-long windows at high ingestion rates make the per-window split count unmanageable. Operators must tune `window_duration` based on ingestion rate. +- **`doc_mapping_uid` vs `sort_schema` in compaction scope.** If a sort schema change also triggers a new `doc_mapping_uid`, both scope dimensions prevent merging on schema changes (redundant). If not, they serve complementary purposes (schema structure vs sort order). 
The relationship between these two should be clarified during implementation. + +## Signal Generalization + +This ADR applies to **metrics** (Parquet pipeline) in Phase 1. The compaction architecture generalizes to all three signals: + +- **Time windowing** is universal. Logs and traces are time-stamped data with time-range queries. Window-scoped compaction applies directly. +- **Sorted merge** applies to any signal with a sort schema. For Tantivy (logs/traces), sorted merge would operate on fast fields. The k-way merge algorithm is format-independent — the Parquet-specific part is reading/writing via arrow-rs rather than Tantivy segment APIs. +- **Compaction scope** generalizes with the addition of `sort_schema` and `window_duration`. The existing Tantivy compaction could adopt this scope if extended. + +Phase 4 of the locality compaction roadmap extends time-windowed sorted compaction to the Tantivy pipeline. The main adaptation is replacing the Parquet merge executor with a Tantivy-aware one that produces sorted fast fields. + +## Decision Log + +| Date | Decision | Rationale | +|------|----------|-----------| +| 2026-02-19 | Initial ADR created | Formalize compaction design for metrics Parquet pipeline, addressing the fundamental gap of no metrics compaction | +| 2026-02-19 | Time windows chosen over unbounded compaction | Bounds merge scope, aligns with query patterns, enables efficient retention, limits write amplification | +| 2026-02-19 | node_id excluded from compaction scope | Forward-looking for Phase 2 cross-node compaction. Merge operations do not interact with checkpoints, so this is safe | +| 2026-02-19 | StableLogMergePolicy adapted for initial compaction policy | Reuse existing, proven merge planning logic. May need metrics-specific tuning after experiments | +| 2026-02-19 | compaction_start_time cutoff for clean transition | Avoids complexity of merging sorted/unsorted inputs. 
Old data ages out via retention | +| 2026-02-19 | RLE merge order representation | Sorted inputs produce long contiguous runs, enabling bulk operations and creating a positive feedback loop across compaction generations | +| 2026-02-19 | Compaction scope uses window_duration, not window_start | window_start is a merge planner grouping dimension, not a compatibility dimension. Different durations can produce windows with the same start time (e.g., 5m and 15m windows both start at :00), so duration must be in the scope to prevent cross-duration merges | +| 2026-02-19 | Sorted merge strategy is an open question: k-way merge vs stable sort | In Husky's Go impl, stable sort was faster than k-way merge due to cache locality on presorted runs. Both should be benchmarked | +| 2026-02-19 | Row comparison strategy is an open question: composite key vs column-at-a-time | Composite key (Ordered Code, Arrow row format) enables single memcmp; column-at-a-time avoids encoding cost and may short-circuit on leading columns | +| 2026-02-19 | Page-level streaming for column merge phase | Loading/writing individual Parquet pages instead of whole columns bounds memory for large columns | +| 2026-02-20 | Merge correctness invariants MC-1 through MC-4 formalized | Compaction must not change the set of rows or their contents (except bookkeeping columns). Sort order must be preserved. 
Column set is the union of inputs | + +## Implementation Status + +### Implemented + +| Component | Location | Status | +|-----------|----------|--------| +| (none) | - | No Parquet compaction infrastructure exists yet | + +### Not Yet Implemented + +| Component | Notes | Gap | +|-----------|-------|-----| +| Time-window partitioning at ingestion | Splits must be partitioned by window before writing | [GAP-003](./gaps/003-no-time-window-partitioning.md) | +| Late data acceptance window (drop at ingestion) | Points older than threshold dropped | [GAP-003](./gaps/003-no-time-window-partitioning.md) | +| Parquet merge planner | Selects merge candidates per window, respecting 6-part scope | [GAP-001](./gaps/001-no-parquet-compaction.md) | +| Parquet merge split downloader | Downloads source splits from object storage | [GAP-001](./gaps/001-no-parquet-compaction.md) | +| Parquet sorted merge executor | K-way merge with RLE merge order, column streaming | [GAP-001](./gaps/001-no-parquet-compaction.md) | +| Parquet merge uploader | Uploads merged split to object storage | [GAP-001](./gaps/001-no-parquet-compaction.md) | +| Parquet merge publisher | Atomically updates PostgreSQL metadata | [GAP-001](./gaps/001-no-parquet-compaction.md) | +| StableLogMergePolicy adaptation for metrics | Size-tiered merge policy within time windows | [GAP-001](./gaps/001-no-parquet-compaction.md) | +| Split metadata extensions | window_start, window_duration_secs, sort_schema, min/max/regex fields | [GAP-004](./gaps/004-incomplete-split-metadata.md) | +| PostgreSQL schema migration | Add new columns to metrics_splits table | [GAP-004](./gaps/004-incomplete-split-metadata.md) | +| compaction_start_time configuration | Index-level config for transition boundary | [GAP-003](./gaps/003-no-time-window-partitioning.md) | +| Compaction policy experiments | Fanin sweep, target size sweep, compression validation | Pre-implementation | + +## References + +- [Phase 1: Sorted Splits for 
Parquet](../locality-compaction/phase-1-sorted-splits.md) — full design document +- [Compaction Architecture](../compaction-architecture.md) — current compaction system description +- [ADR-001: Parquet Data Model](./001-parquet-data-model.md) — point-per-row data model +- [ADR-002: Sort Schema for Parquet Splits](./002-sort-schema-parquet-splits.md) — sort schema that compaction preserves +- [StableLogMergePolicy](../../quickwit/quickwit-indexing/src/merge_policy/stable_log_merge_policy.rs) — existing merge policy +- [Merge Planner](../../quickwit/quickwit-indexing/src/actors/merge_planner.rs) — existing merge planner (Tantivy) +- [Husky Storage Compaction Blog Post](https://www.datadoghq.com/blog/engineering/husky-storage-compaction/) +- [Husky Phase 2: Locality of Reference](https://docs.google.com/document/d/1vax-vv0wbhfddo4n5obhlVJxsmUa9N_62tKs5ZmYC6k/edit) diff --git a/docs/internals/adr/004-cloud-native-storage-characteristics.md b/docs/internals/adr/004-cloud-native-storage-characteristics.md new file mode 100644 index 00000000000..32ac87e9dad --- /dev/null +++ b/docs/internals/adr/004-cloud-native-storage-characteristics.md @@ -0,0 +1,113 @@ +# ADR-004: Cloud-Native Storage Characteristics + +## Metadata + +- **Status**: Proposed +- **Date**: 2026-02-19 +- **Tags**: architecture, storage, cloud-native, observability +- **Components**: all +- **Authors**: gtt@ +- **Related**: [ADR-001](./001-parquet-data-model.md), [ADR-002](./002-sort-schema-parquet-splits.md), [ADR-003](./003-time-windowed-sorted-compaction.md) + +## Context + +To compete with ClickHouse Commercial (SharedMergeTree) in a cloud-native observability context, Quickhouse-Pomsky must exhibit the characteristics that Datadog's internal systems (Husky, Metrics) have demonstrated at scale. 
These characteristics are derived from operational experience with cloud-native storage for observability workloads and are documented in [Characteristics of Cloud Native Storage relevant to Quickhouse](https://docs.google.com/document/d/...). + +This ADR catalogs those characteristics, evaluates Quickwit's current status for each, and identifies gaps that must be closed. + +## Decision + +We adopt the following characteristics as the target architecture for Quickhouse-Pomsky. Each characteristic is evaluated against Quickwit's current implementation. + +### Cloud-Native Storage Characteristics + +| # | Characteristic | Quickwit Status | Notes | Gap | +|---|---------------|----------------|-------|-----| +| C1 | **Independent auto-scaling of query, ingest, compaction** | Not yet | Quickwit does not independently auto-scale these components | [GAP-006](./gaps/006-no-independent-auto-scaling.md) | +| C2 | **Object-store-aware file format** | Implemented | Parquet/Tantivy files designed for object storage access patterns | — | +| C3 | **Object storage throughput** | Implemented | Parallelized access to files and columns | — | +| C4 | **Multi-level caching** | Not implemented | No columnar data cache, column header cache, predicate result cache, or query result cache | [GAP-007](./gaps/007-no-multi-level-caching.md) | +| C5 | **Distributed query execution** | Implemented | Two-stage planning + execution with parallel fan-out to searcher nodes | — | +| C6 | **Query affinity** | Partial | Rendezvous hashing on `split_id` assigns splits to searchers. But without exactly-once semantics, dedup may force scatter-gather that undermines affinity | [GAP-007](./gaps/007-no-multi-level-caching.md) | +| C7 | **Query-aware data layout** | Partial | Ingest sharding supports a form of locality. 
ADR-002/003 introduce sorted splits and compaction to achieve global data layout | [GAP-001](./gaps/001-no-parquet-compaction.md) | +| C8 | **Storage layout decoupled from ingest sharding** | Not yet | Current compaction is node-local (`node_id` in merge scope). ADR-003 excludes `node_id` from scope, enabling future cross-node compaction | [GAP-001](./gaps/001-no-parquet-compaction.md) | +| C9 | **Exactly-once semantics** | Partial | WAL checkpoints provide batch-level exactly-once. No per-point dedup | [GAP-005](./gaps/005-no-per-point-deduplication.md) | +| C10 | **Query pruning before scanning** | Partial | Tag-based and time-range pruning on existing split metadata. Full split-level pruning on sort column min/max/regex requires metadata from ADR-003 | [GAP-004](./gaps/004-incomplete-split-metadata.md) | + +### Observability-Specific Characteristics + +| # | Characteristic | Quickwit Status | Notes | Gap | +|---|---------------|----------------|-------|-----| +| C11 | **Schema-on-read** | Partial | Quickwit supports dynamic fields. OTel map-based attributes need column extraction (ADR-001 section 5) | ADR-001 §5 | +| C12 | **Minimal ingest-to-query latency** | Implemented | Small splits published quickly; commit timeout bounds latency | — | +| C13 | **High query rate for monitors** | Not implemented | PostgreSQL metadata not optimized for ~800k QPS. No query/predicate result caching | [GAP-008](./gaps/008-no-high-query-rate-optimization.md) | +| C14 | **New data prioritized over updates** | Not implemented | No priority scheduling between fresh ingestion and compaction/backfill | [GAP-009](./gaps/009-no-leading-edge-prioritization.md) | +| C15 | **Time is first-class** | Partial | Splits have time ranges. 
ADR-003 introduces formal time windowing | [GAP-003](./gaps/003-no-time-window-partitioning.md) | + +### KTLO Characteristics + +| # | Characteristic | Quickwit Status | Notes | Gap | +|---|---------------|----------------|-------|-----| +| C16 | **Prioritize leading edge** | Not implemented | No compaction priority for recent windows. Too many small files at the leading edge degrade query performance | [GAP-009](./gaps/009-no-leading-edge-prioritization.md) | +| C17 | **Traffic burst handling** | Not implemented | No burst lane or overflow buffering mechanism | [GAP-006](./gaps/006-no-independent-auto-scaling.md) | + +### ClickStack Characteristics + +| # | Characteristic | Quickwit Status | Notes | Gap | +|---|---------------|----------------|-------|-----| +| C18 | **Materialized views** | Not implemented | No pre-aggregation or materialized view support | — (future) | +| C19 | **SQL** | Not implemented | Planned via ClickHouse integration (remote storage engine) | — (future) | +| C20 | **Available on-prem** | Implemented | Open-source Quickwit is deployable on-prem | — | + +### Summary + +| Status | Count | Characteristics | +|--------|-------|----------------| +| Implemented | 5 | C2, C3, C5, C12, C20 | +| Partial | 6 | C6, C7, C9, C10, C11, C15 | +| Not yet | 9 | C1, C4, C8, C13, C14, C16, C17, C18, C19 | + +The existing ADRs (001-003) and gaps (001-005) address: C7, C8, C9, C10, C11, C15. + +New gaps created by this ADR address: C1/C17, C4/C6, C13, C14/C16. + +## Consequences + +### Positive + +- **Clear target architecture.** The characteristics provide a concrete checklist for what Quickhouse-Pomsky must achieve to compete with ClickHouse Commercial and match Datadog's internal systems. +- **Prioritization framework.** Characteristics can be prioritized by impact: C7 (query-aware layout) and C10 (pruning) have the highest query performance impact; C1 (auto-scaling) and C16 (leading edge) have the highest operational impact.
+- **Gap tracking.** Each missing characteristic has a corresponding gap document with potential solutions and next steps. + +### Negative + +- **Large surface area.** 9 characteristics are not yet implemented. Achieving all of them is a multi-quarter effort. +- **Interdependencies.** Some characteristics depend on others (C10 pruning requires C7 data layout; C6 affinity benefits from C9 exactly-once). The implementation order matters. + +### Risks + +- **PostgreSQL as bottleneck.** Multiple characteristics (C10 pruning, C13 high query rate) require fast metadata access. PostgreSQL may not scale to the required QPS, as noted in ADR-003. +- **ClickHouse Commercial is a moving target.** SharedMergeTree continues to improve. The characteristics identified here represent the current gap, not a static finish line. + +## Signal Generalization + +All characteristics apply across metrics, traces, and logs. The evaluation in this ADR focuses on the metrics Parquet pipeline (current priority), but the characteristics are signal-agnostic. Query-aware data layout, pruning, caching, and auto-scaling benefit all three signals equally. + +## Decision Log + +| Date | Decision | Rationale | +|------|----------|-----------| +| 2026-02-19 | Initial ADR created | Catalog cloud-native storage characteristics as target architecture, evaluate Quickwit status, identify gaps | +| 2026-02-19 | 20 characteristics adopted from internal experience | Based on operational lessons from Husky, Metrics, and competitive analysis with ClickHouse Commercial | + +## Implementation Status + +See the status table in the Decision section above. Implementation is tracked through the referenced ADRs and gaps. + +## References + +- [Characteristics of Cloud Native Storage relevant to Quickhouse](https://docs.google.com/document/d/...) 
— source document +- [ADR-001: Parquet Data Model](./001-parquet-data-model.md) — schema-on-read (C11) +- [ADR-002: Sort Schema](./002-sort-schema-parquet-splits.md) — query-aware layout foundation (C7) +- [ADR-003: Time-Windowed Sorted Compaction](./003-time-windowed-sorted-compaction.md) — compaction, time windowing, storage decoupling (C7, C8, C15) +- [Husky Storage Compaction Blog Post](https://www.datadoghq.com/blog/engineering/husky-storage-compaction/) — query-aware data layout reference diff --git a/docs/internals/adr/EVOLUTION.md b/docs/internals/adr/EVOLUTION.md new file mode 100644 index 00000000000..957402f4b85 --- /dev/null +++ b/docs/internals/adr/EVOLUTION.md @@ -0,0 +1,143 @@ +# Architecture Evolution + +This document defines how Quickhouse-Pomsky tracks architectural change through three complementary lenses. + +## The Three Lenses + +``` + Architecture Evolution + │ + ┌────────────────────────┼────────────────────────┐ + │ │ │ + ▼ ▼ ▼ + Characteristics Gaps Deviations + (Proactive) (Reactive) (Pragmatic) + │ │ │ + "What we need" "What we learned" "What we accepted" + │ │ │ + From requirements From production From trade-offs +``` + +| Lens | Question | Trigger | Outcome | +|------|----------|---------|---------| +| **Characteristics** | What capabilities must we have? | Product requirements, competitive analysis | Feature roadmap, ADR targets | +| **Gaps** | What limitations have we discovered? | Incidents, scale tests, code reviews | Potential ADRs, system improvements | +| **Deviations** | Where did we intentionally diverge? | Implementation trade-offs, PoC scope | Documented tech debt, migration plans | + +## Signal Priority + +Quickhouse-Pomsky handles three observability signals: **metrics** (current priority), **traces**, and **logs**. Architectural decisions should generalize across all three, but metrics drives the initial implementation. + +## Characteristics (Proactive) + +**Location:** Tracked in ADRs as they are created. 
+ +**Purpose:** Track implementation status of cloud-native storage and query capabilities required for production observability at scale. + +**Status Legend:** +- Implemented - Production ready +- Partial - Some aspects implemented, gaps remain +- Implicit - Achieved as side effect +- Proposed - ADR exists but not implemented +- Deviation/Not Planned - Gap or intentional omission + +## Gaps (Reactive) + +**Location:** [gaps/](./gaps/) + +**Purpose:** Capture design limitations discovered through production behavior, incidents, or research. Lightweight pre-ADR documents that may evolve into formal ADRs. + +**Lifecycle:** +``` +Discovered → Open → Investigating → ADR-Drafted → Closed + ↓ + (Won't Fix) → Closed +``` + +**When to Create:** +- Design limitation exposed by production behavior +- Pattern used by other systems that we're missing +- Recurring problem needing architectural attention + +**Current Gaps:** + +| Gap | Title | Status | Severity | +|-----|-------|--------|----------| +| [001](./gaps/001-no-parquet-compaction.md) | No Parquet Split Compaction | Open | High | +| [002](./gaps/002-fixed-sort-schema.md) | Fixed Hardcoded Sort Schema | Open | Medium | +| [003](./gaps/003-no-time-window-partitioning.md) | No Time-Window Partitioning at Ingestion | Open | High | +| [004](./gaps/004-incomplete-split-metadata.md) | Incomplete Split Metadata for Compaction | Open | High | +| [005](./gaps/005-no-per-point-deduplication.md) | No Per-Point Deduplication | Open | Medium | +| [006](./gaps/006-no-independent-auto-scaling.md) | No Independent Auto-Scaling | Open | High | +| [007](./gaps/007-no-multi-level-caching.md) | No Multi-Level Caching | Open | High | +| [008](./gaps/008-no-high-query-rate-optimization.md) | No High Query Rate Optimization | Open | High | +| [009](./gaps/009-no-leading-edge-prioritization.md) | No Leading Edge Prioritization | Open | High | + +## Deviations (Pragmatic) + +**Location:** [deviations/](./deviations/) + +**Purpose:** Document 
where actual implementation intentionally differs from ADR intent. These are known, accepted trade-offs - not bugs. + +**When to Create:** +- Implementation takes a different approach than ADR described +- PoC simplification that will need future work +- Architectural compromise due to time/resource constraints + +**Current Deviations:** + +| Deviation | Title | Related ADR | Priority | +|-----------|-------|-------------|----------| + +*No deviations recorded yet.* + +## Relationships + +### Gaps → Characteristics +A gap may provide evidence for a characteristic's partial status. + +### Gaps → ADRs +A gap may evolve into a formal ADR when the solution is designed. + +### Gaps → Deviations +A gap may become a deviation if we decide to accept the limitation. + +### Characteristics → Deviations +A characteristic marked as not planned should have a corresponding deviation explaining why. + +## Decision Flow + +``` +Problem Discovered + │ + ▼ + Is it a known requirement? + │ + ┌────┴────┐ + │ Yes │ No + ▼ ▼ +Update Create Gap +Characteristic │ +Status ▼ + Can we fix it? + │ + ┌─────┴─────┐ + │ Yes │ No (or not now) + ▼ ▼ + Draft ADR Create Deviation + (close gap) (document trade-off) +``` + +## Maintenance + +| Document Type | Review Cadence | Owner | +|---------------|----------------|-------| +| Characteristics | Quarterly (roadmap sync) | Product/Architecture | +| Gaps | After incidents, scale tests | Engineering | +| Deviations | Before major releases | Tech Lead | + +## References + +- [ADR Index](./README.md) +- [Gaps Directory](./gaps/README.md) +- [Deviations Directory](./deviations/README.md) diff --git a/docs/internals/adr/README.md b/docs/internals/adr/README.md new file mode 100644 index 00000000000..752539d4b86 --- /dev/null +++ b/docs/internals/adr/README.md @@ -0,0 +1,94 @@ +# Architecture Decision Records (ADR) Index + +This directory serves as the **central knowledge base** for Quickhouse-Pomsky architecture. 
+ +## Knowledge Map (Agent Context) + +For AI agents and developers, here is how the system is organized by domain: + +### Core Architecture + +ADRs will be created here as we implement new systems. Start with the metrics pipeline and work outward. + +### Signal Priority + +**Metrics first**, then traces, then logs. Architectural decisions must generalize across all three signals. + +--- + +## Master Index + +| ADR | Title | Status | Tags | Key Components | +|-----|-------|--------|------|----------------| +| [000](./000-template.md) | Template | - | `meta` | - | +| [001](./001-parquet-data-model.md) | Parquet Metrics Data Model | Proposed | `storage`, `metrics`, `parquet`, `data-model` | quickwit-parquet-engine | +| [002](./002-sort-schema-parquet-splits.md) | Configurable Sort Schema for Parquet Splits | Proposed | `storage`, `metrics`, `compaction`, `parquet`, `sorting` | quickwit-parquet-engine, quickwit-indexing | +| [003](./003-time-windowed-sorted-compaction.md) | Time-Windowed Sorted Compaction for Parquet | Proposed | `storage`, `metrics`, `compaction`, `parquet`, `time-windowing` | quickwit-parquet-engine, quickwit-indexing, quickwit-metastore | +| [004](./004-cloud-native-storage-characteristics.md) | Cloud-Native Storage Characteristics | Proposed | `architecture`, `storage`, `cloud-native`, `observability` | all | + +## Supplements & Roadmaps + +Detailed implementation plans and reports linked to ADRs. + +| Parent ADR | Supplement | Description | +|------------|------------|-------------| +| [000](./000-template.md) | [Supplement Template](./supplements/000-supplement-template.md) | Template for new supplements | + +## Architecture Evolution + +Quickhouse-Pomsky tracks architectural change through three lenses. See **[EVOLUTION.md](./EVOLUTION.md)** for the full process. 
+ +``` + Architecture Evolution + │ + ┌────────────────────┼────────────────────┐ + ▼ ▼ ▼ + Characteristics Gaps Deviations + (Proactive) (Reactive) (Pragmatic) +``` + +### Characteristics (What we need) + +Product requirements and capabilities we must have. See [ADR-004](./004-cloud-native-storage-characteristics.md) for the full characteristic status matrix. + +### Gaps (What we learned) + +| Gap | Title | Status | Severity | +|-----|-------|--------|----------| +| [001](./gaps/001-no-parquet-compaction.md) | No Parquet Split Compaction | Open | High | +| [002](./gaps/002-fixed-sort-schema.md) | Fixed Hardcoded Sort Schema | Open | Medium | +| [003](./gaps/003-no-time-window-partitioning.md) | No Time-Window Partitioning at Ingestion | Open | High | +| [004](./gaps/004-incomplete-split-metadata.md) | Incomplete Split Metadata for Compaction | Open | High | +| [005](./gaps/005-no-per-point-deduplication.md) | No Per-Point Deduplication | Open | Medium | +| [006](./gaps/006-no-independent-auto-scaling.md) | No Independent Auto-Scaling | Open | High | +| [007](./gaps/007-no-multi-level-caching.md) | No Parquet Metadata Caching | Open | High | +| [008](./gaps/008-no-high-query-rate-optimization.md) | No High Query Rate Optimization | Open | High | +| [009](./gaps/009-no-leading-edge-prioritization.md) | No Leading Edge Prioritization | Open | High | +| [010](./gaps/010-no-data-caching-or-query-affinity.md) | No Multi-Level Data Caching or Query Affinity Optimization | Open | High | + +**Create a gap** when you discover a design limitation from production, incidents, or research. See [gaps/README.md](./gaps/README.md). + +### Deviations (What we accepted) + +| Deviation | Title | Related ADR | Priority | +|-----------|-------|-------------|----------| + +*No deviations recorded yet.* + +**Create a deviation** when implementation intentionally differs from ADR intent. See [deviations/README.md](./deviations/README.md). 
+ +## Decision Logs (How to use) + +We do not have a separate "Decision Log" file. **Decision Logs are embedded in each ADR.** + +When you need to understand *why* a decision was made: +1. Find the relevant ADR in the Master Index above. +2. Scroll to the **Decision Log** section at the bottom of that ADR. +3. If making a NEW decision, update that table. + +## Status Definitions + +- **Proposed**: Under discussion, awaiting prototype or review. +- **Accepted**: Approved plan of record. Implementation should follow this. +- **Deprecated**: Replaced or abandoned. Kept for history. +- **Superseded**: Replaced by a newer ADR (see link). diff --git a/docs/internals/adr/deviations/README.md b/docs/internals/adr/deviations/README.md new file mode 100644 index 00000000000..000c2e5e0e5 --- /dev/null +++ b/docs/internals/adr/deviations/README.md @@ -0,0 +1,108 @@ +# Architecture Deviations + +> **Part of [Architecture Evolution](../EVOLUTION.md)** - the pragmatic lens for documenting intentional divergence from ADR intent. + +This directory tracks **architecture deviations** - places where actual implementation intentionally differs from what an ADR describes. These are known, accepted trade-offs, not bugs.
+ +## Role in Architecture Evolution + +| Lens | This Directory | +|------|----------------| +| **Characteristics** (Proactive) | "What we need" - see ADRs | +| **Gaps** (Reactive) | "What we learned" - see [gaps/](../gaps/) | +| **Deviations** (Pragmatic) | **You are here** - "What we accepted" | + +A deviation: +- Documents why a **Characteristic** is marked as not planned +- May originate from a **Gap** we decided to accept +- Requires a migration plan for eventual resolution + +## When to Create a Deviation + +Create a deviation document when: +- Implementation takes a different approach than ADR described +- PoC simplification that will need future work +- Architectural compromise due to time/resource constraints +- Intentional scope reduction with documented rationale + +**Don't create a deviation for**: bugs (use issues), unfinished features (use roadmap), or design limitations not yet decided (use gaps). + +## Template + +```markdown +# Deviation XXX: [Title] + +## Summary + +[1-2 sentences describing the divergence] + +## Related ADR + +- **ADR**: [ADR-NNN](../NNN-title.md) +- **Section**: [Which part of the ADR this deviates from] + +## ADR States + +> [Quote the relevant section from the ADR] + +## Current Implementation + +[Describe what was actually built and why] + +## Signal Impact + +Which signals are affected (metrics, traces, logs)? Does the deviation apply to all three or just one? + +## Impact + +| Aspect | ADR Target | Current Reality | +|--------|------------|-----------------| +| ... | ... | ... | + +## Why This Exists + +[Explain the trade-off decision] + +## Priority Assessment + +[When should this be resolved? Is it acceptable for PoC/MVP/Production?] + +## Work Required to Match ADR + +| Change | Difficulty | Description | +|--------|------------|-------------| +| ... | ... | ... | + +## Recommendation + +[Accept for now? Fix before X milestone?] 
+ +## References + +- [Related Gap](../gaps/NNN-*.md) (if applicable) + +## Date + +YYYY-MM-DD +``` + +## Naming Convention + +Deviation files use sequential numbering: `001-short-description.md` + +## Index + +| Deviation | Title | Related ADR | Priority | +|-----------|-------|-------------|----------| + +*No deviations recorded yet.* + +## Lifecycle + +``` +Identified → Documented → Accepted → (Eventually) Resolved + ↓ + Permanent (rare) +``` + +Unlike gaps which may become ADRs, deviations are typically resolved by updating the implementation to match the ADR, or by updating the ADR to match reality. diff --git a/docs/internals/adr/gaps/001-no-parquet-compaction.md b/docs/internals/adr/gaps/001-no-parquet-compaction.md new file mode 100644 index 00000000000..aac8958dc3e --- /dev/null +++ b/docs/internals/adr/gaps/001-no-parquet-compaction.md @@ -0,0 +1,71 @@ +# GAP-001: No Parquet Split Compaction + +**Status**: Open +**Discovered**: 2026-02-19 +**Context**: Codebase analysis during Phase 1 locality compaction design. Confirmed by [compaction-architecture.md](../../compaction-architecture.md) section "Why No Compaction (Yet)?" + +## Problem + +The Parquet pipeline has no compaction. Splits accumulate without merging, and the only mechanism for reducing split count is time-based retention that removes expired data. Between ingestion and retention expiry, every split produced remains in its original form. + +While metrics is the immediate priority and the first product to use the Parquet pipeline, this is not a metrics-specific problem. Any product that stores data as Parquet splits — including future Parquet-based traces, logs, or other observability signals — will face the same unbounded split accumulation and query fan-out degradation. The compaction pipeline should be designed as generic Parquet infrastructure, not as a metrics-only feature. 
Metrics drives the initial implementation, but the architecture must not bake in metrics-specific assumptions that would need to be reworked when other products adopt Parquet storage. + +At current ingestion rates, each indexer node produces Parquet splits of approximately 600 KiB (the result of 128 MiB in-memory / 1M row batching thresholds combined with high compression ratios). At 10 GiB/s aggregate ingestion, the system produces ~1,024 splits per second, accumulating ~921,600 splits per 15-minute window before any compaction. Every query within that window's time range must fan out to all splits, creating O(split_count) metadata lookups, file opens, and DataFusion task scheduling overhead. + +The existing logs/traces compaction system (`StableLogMergePolicy` with Tantivy merge actors) cannot be applied to Parquet splits. The merge executor (`merge_executor.rs`) uses Tantivy's `UnionDirectory` to combine segments, which does not apply to Parquet files. A purpose-built Parquet merge pipeline is needed. + +## Evidence + +**No metrics-specific merge code exists.** Searching the codebase for `MetricsMerge*`, `ParquetMerge*`, `MetricsCompact*`, or any Parquet-specific merge executor returns no results. The metrics indexing pipeline consists of a source followed by four Parquet actors (Source -> ParquetDocProcessor -> ParquetIndexer -> ParquetUploader -> ParquetPublisher) with no merge counterparts. + +**Metrics pipeline explicitly skipped by merge scheduler.** The `MergePlanner` in `quickwit-indexing/src/actors/merge_planner.rs` operates on `SplitMetadata` (the Tantivy split type). `MetricsSplitMetadata` is a separate type in `quickwit-parquet-engine/src/split/metadata.rs` with no merge planner integration. + +**Compaction architecture doc confirms absence:** +> "Metrics splits accumulate without compaction. This is tolerable in the short term because DataFusion can query many small Parquet files, and time-based retention eventually removes old data.
But it is not ideal, and metrics compaction is a planned goal." + +## State of the Art + +- **ClickHouse**: MergeTree engine performs background merges of parts (equivalent to splits) using a sorted merge. Parts are organized by partition key (typically time) and merged within partitions. +- **Apache Iceberg**: Compaction rewrites small data files into fewer larger files. Sort-order-aware compaction produces files with non-overlapping key ranges. +- **Husky (Datadog)**: Size-tiered compaction within time buckets. Sort columns read first to determine merge order, then columns streamed through merge. Achieved 25-33% compression improvement and reduced query latency. +- **Prometheus/Mimir**: Head block compaction produces sorted, time-bounded blocks. Vertical compaction merges blocks with overlapping time ranges. + +All of these systems treat compaction as essential infrastructure, not optional optimization. + +## Potential Solutions + +- **Option A (Proposed by ADR-003)**: Build a dedicated Parquet merge pipeline with sorted k-way merge, time-windowed scope, and StableLogMergePolicy adaptation. This is the approach described in [ADR-003: Time-Windowed Sorted Compaction](../003-time-windowed-sorted-compaction.md). + +- **Option B**: Enable the existing Tantivy merge pipeline on Parquet splits by writing a Parquet-aware merge executor that plugs into the existing merge planner/scheduler/downloader/uploader/publisher actors. This reuses more infrastructure but requires the merge executor to handle format differences and does not address sort-order preservation. + +- **Option C**: External compaction via a separate service (e.g., a Spark/DataFusion job). Decouples compaction from the Quickwit process but adds operational complexity and latency. + +**Recommended**: Option A. It builds on the existing actor framework while being purpose-built for the Parquet format and sorted merge requirements. + +## Signal Impact + +**Metrics**: The immediate priority. 
Metrics is the first product on the Parquet pipeline and suffers from no compaction today. + +**All Parquet-based products**: This gap affects any product that adopts Parquet storage. The compaction pipeline — time windowing, sorted merge, split metadata, compaction scope — is generic to the Parquet format, not to metrics semantics. The sort schema, window duration, and scope components are configurable per index, so the same infrastructure serves different products with different sort orders and time characteristics. Traces and logs on Parquet would use the same compaction pipeline with different sort schemas (e.g., `service_name|trace_id|timestamp` for traces). + +## Impact + +- **Severity**: High +- **Frequency**: Constant (every query pays the cost of no compaction) +- **Affected Areas**: `quickwit-parquet-engine`, `quickwit-indexing` (merge pipeline), `quickwit-metastore` (merge publication), query performance + +## Next Steps + +- [ ] Validate compression improvement: sort existing metrics Parquet files by proposed schema, compare sizes (existential experiment) +- [ ] Design Parquet merge actor pipeline (planner, downloader, executor, uploader, publisher) +- [ ] Implement sorted k-way merge executor using Arrow/Parquet APIs +- [ ] Adapt StableLogMergePolicy for metrics (or design metrics-specific merge policy) +- [ ] Run compaction policy experiments (fanin sweep, target size sweep) +- [ ] Integration test: end-to-end compaction through the full pipeline + +## References + +- [ADR-003: Time-Windowed Sorted Compaction](../003-time-windowed-sorted-compaction.md) +- [Compaction Architecture](../../compaction-architecture.md) +- [Phase 1: Sorted Splits for Parquet](../../locality-compaction/phase-1-sorted-splits.md) +- [StableLogMergePolicy](../../../quickwit/quickwit-indexing/src/merge_policy/stable_log_merge_policy.rs) diff --git a/docs/internals/adr/gaps/002-fixed-sort-schema.md b/docs/internals/adr/gaps/002-fixed-sort-schema.md new file mode 100644 index 
00000000000..f0ac90ab901 --- /dev/null +++ b/docs/internals/adr/gaps/002-fixed-sort-schema.md @@ -0,0 +1,112 @@ +# GAP-002: Fixed Hardcoded Sort Schema + +**Status**: Open +**Discovered**: 2026-02-19 +**Context**: Codebase analysis during Phase 1 locality compaction design. Sort implementation by Matthew Kim provides the foundation but is not configurable. + +## Problem + +The sort order for Parquet splits is hardcoded in `ParquetField::sort_order()` (`quickwit-parquet-engine/src/schema/fields.rs:146-158`). It returns a fixed list of seven columns: + +```rust +pub fn sort_order() -> &'static [ParquetField] { + &[ + Self::MetricName, + Self::TagService, + Self::TagEnv, + Self::TagDatacenter, + Self::TagRegion, + Self::TagHost, + Self::TimestampSecs, + ] +} +``` + +This sort order cannot be changed per index, per deployment, or at runtime. All metrics indexes use the same sort columns regardless of their query patterns. Different workloads have different high-value columns -- a Kubernetes metrics index benefits from `pod` and `namespace`, while an infrastructure metrics index benefits from `host` and `datacenter`. The fixed schema prevents workload-specific optimization. + +Additionally, the current sort implementation has three behavioral mismatches with the target design: + +1. **Null ordering**: The writer uses `nulls_first: true` for all columns (`writer.rs:95`). The design specifies nulls-last for ascending columns and nulls-first for descending columns, so that nulls cluster at the end of each column's value range in both directions. + +2. **No sort direction control**: All columns are sorted ascending (`descending: false`). The design specifies that `timestamp` should default to descending, and other columns should support configurable direction. + +3. **No stable sort**: The current sort using `lexsort_to_indices` is not guaranteed to be stable — rows with equal sort keys may be reordered arbitrarily. 
A stable sort preserves the relative order of rows with equal keys, which matters for compaction: when merging sorted inputs, a stable sort ensures that the merge is deterministic and that rows which compare equal are not needlessly shuffled across compaction generations. + +## Evidence + +**Hardcoded sort columns in `fields.rs`:** +```rust +// quickwit-parquet-engine/src/schema/fields.rs:146-158 +pub fn sort_order() -> &'static [ParquetField] { + &[ + Self::MetricName, // Always first + Self::TagService, // Always second + Self::TagEnv, // etc. + Self::TagDatacenter, + Self::TagRegion, + Self::TagHost, + Self::TimestampSecs, // Always last + ] +} +``` + +**Fixed sort options in `writer.rs`:** +```rust +// quickwit-parquet-engine/src/storage/writer.rs:93-96 +SortColumn { + values: Arc::clone(batch.column(col_idx)), + options: Some(SortOptions { + descending: false, // All ascending + nulls_first: true, // All nulls-first (design says nulls-last for ascending) + }), +} +``` + +**No sort schema in metastore.** The sort order is not part of the index metadata in the metastore, MetricsSplitMetadata, or the Parquet file metadata. There is no mechanism to specify, store, evolve, or propagate the sort schema to indexing pipelines at runtime. + +**No timeseries_id computation.** The optional `timeseries_id` tiebreaker column (hash of canonicalized tag key/value pairs) is not computed anywhere in the pipeline. + +## State of the Art + +- **Husky (Datadog)**: Sort schema is defined per table/track as a configuration string (e.g., `service__s|status__s|tag.env__s|timestamp|tiebreaker__i/V2`). Different tables use different sort schemas optimized for their query patterns. +- **ClickHouse**: `ORDER BY` clause in table definition specifies the sort key. Different tables have different sort keys. The sort key is stored in table metadata and enforced at write time and merge time. +- **Apache Iceberg**: Sort order is a table property that can be changed over time. 
Different sort orders coexist in the same table; the metadata tracks which files use which sort order. + +All of these systems treat sort order as a configurable, per-table property. + +## Potential Solutions + +- **Option A (Proposed by ADR-002)**: Implement the configurable sort schema format (`column|...|timestamp&metadata/V2`) as described in [ADR-002](../002-sort-schema-parquet-splits.md). Store the schema as a per-index property in the metastore, mutable at runtime. When the schema is changed, propagate the update to the indexing pipelines on the appropriate nodes so that new splits use the new schema. Already-written splits retain their original schema and are not rewritten; they age out via retention. Record the sort schema string in each split's MetricsSplitMetadata and Parquet file metadata so that the compaction scope can group splits by schema. + +- **Option B**: Extend the current `ParquetField::sort_order()` to accept a configuration parameter. Simpler than full schema parsing but doesn't support direction control, metadata-only columns, or the timeseries_id tiebreaker, and has no runtime change propagation. + +**Recommended**: Option A. The full schema format is needed for compaction (merges must know the sort order of inputs) and for future query pruning (metadata-only columns). Runtime mutability is essential for operational flexibility -- changing the sort schema should not require a pipeline restart or redeployment. + +## Signal Impact + +**Metrics**: Directly affected. All metrics indexes use the same fixed sort order. + +**Traces and logs**: Not affected today (Tantivy pipeline has no sort). Phase 4 would need a similar configurable sort schema for Tantivy fast fields. 
+ +## Impact + +- **Severity**: Medium +- **Frequency**: Constant (every index is constrained to the same sort order) +- **Affected Areas**: `quickwit-parquet-engine/src/schema/fields.rs`, `quickwit-parquet-engine/src/storage/writer.rs`, index configuration, `MetricsSplitMetadata` + +## Next Steps + +- [ ] Define sort schema parser for `column|...|timestamp&metadata/V2` format +- [ ] Store sort schema as per-index property in the metastore (mutable at runtime) +- [ ] Propagate sort schema changes from metastore to indexing pipelines on appropriate nodes +- [ ] Replace `ParquetField::sort_order()` with schema-driven column selection +- [ ] Fix null ordering: ascending columns use `nulls_first: false`, descending columns use `nulls_first: true` +- [ ] Support per-column sort direction (`+`/`-` suffix) +- [ ] Implement optional timeseries_id computation (xxHash64 or SipHash over canonicalized tags) +- [ ] Store sort schema in MetricsSplitMetadata and Parquet file key_value_metadata + +## References + +- [ADR-002: Sort Schema for Parquet Splits](../002-sort-schema-parquet-splits.md) +- [Phase 1: Sorted Splits for Parquet](../../locality-compaction/phase-1-sorted-splits.md) +- Current implementation: `quickwit-parquet-engine/src/schema/fields.rs:146-158`, `quickwit-parquet-engine/src/storage/writer.rs:84-109` diff --git a/docs/internals/adr/gaps/003-no-time-window-partitioning.md b/docs/internals/adr/gaps/003-no-time-window-partitioning.md new file mode 100644 index 00000000000..13cadd8f719 --- /dev/null +++ b/docs/internals/adr/gaps/003-no-time-window-partitioning.md @@ -0,0 +1,101 @@ +# GAP-003: No Time-Window Partitioning at Ingestion + +**Status**: Open +**Discovered**: 2026-02-19 +**Context**: Codebase analysis during Phase 1 locality compaction design + +## Problem + +The metrics ingestion pipeline does not partition splits by time window. 
When the `ParquetIndexer` accumulates rows over a commit interval (default 60 seconds), the resulting split may contain rows whose timestamps span multiple time windows. There is no mechanism to ensure that each split belongs to exactly one time window, and no `window_start` or `window_duration_secs` is recorded in split metadata. + +Without time-window partitioning: + +1. **Compaction scope is unbounded by time.** The merge planner cannot scope compaction to individual windows, meaning there is no natural partitioning dimension to bound merge working sets. +2. **Retention is per-split, not per-window.** Deleting old data requires inspecting individual split time ranges rather than dropping entire windows. +3. **Queries cannot prune by window.** Without window assignment, the query planner must consult each split's time range individually rather than discarding entire windows. +4. **Late data has no acceptance boundary.** Without a `late_data_acceptance_window`, arbitrarily late data can be ingested, potentially triggering expensive operations on old data. + +The codebase has a `PartitionGranularity` enum in `quickwit-parquet-engine/src/split/partition.rs` with `Hour`, `Day`, `Week` variants. This does not match the design requirement: the Phase 1 design calls for finer-grained, epoch-aligned windows (1-60 minutes, default 15 minutes) that evenly divide one hour. The existing partitioning is too coarse for compaction scoping and is not aligned to the compaction scope model. 
+ +## Evidence + +**No window assignment in MetricsSplitMetadata:** +```rust +// quickwit-parquet-engine/src/split/metadata.rs +pub struct MetricsSplitMetadata { + pub split_id: SplitId, + pub index_id: String, + pub time_range: TimeRange, // Coarse time range, not window assignment + pub num_rows: u64, + pub size_bytes: u64, + pub metric_names: HashSet<String>, + pub low_cardinality_tags: HashMap<String, HashSet<String>>, + pub high_cardinality_tag_keys: HashSet<String>, + pub created_at: SystemTime, + pub parquet_files: Vec<String>, + // No window_start, no window_duration_secs, no sort_schema +} +``` + +**No window partitioning in ParquetIndexer:** The `ParquetBatchAccumulator` in `quickwit-parquet-engine/src/ingest/accumulator.rs` concatenates all pending batches into a single combined batch and writes one split. There is no grouping of rows by time window before writing. + +**No late data rejection at ingestion.** There is no check that compares a data point's timestamp against a configurable maximum age. All data points are accepted regardless of timestamp. + +**Existing PartitionGranularity is too coarse:** +```rust +// quickwit-parquet-engine/src/split/partition.rs +pub enum PartitionGranularity { + Hour, // 3600 seconds + Day, // 86400 seconds + Week, // 604800 seconds +} +``` + +The design requires granularities from 1 minute to 60 minutes, with a default of 15 minutes. + +## State of the Art + +- **Husky (Datadog)**: Fragments are bucketed by fixed-duration time windows. Each fragment belongs to exactly one window. Compaction is scoped per window. +- **ClickHouse**: Partitioning by `toYYYYMM(timestamp)` or finer granularity. Parts belong to a single partition. Merges only combine parts within the same partition. +- **Apache Iceberg**: Partition specs define time-based partitioning (hours, days, months). Each data file belongs to a single partition. Compaction operates within partitions. +- **Prometheus/Mimir**: Blocks cover fixed time ranges (2 hours by default).
Compaction combines blocks with overlapping or adjacent time ranges into larger blocks. + +Time-based partitioning is universal in observability storage systems. + +## Potential Solutions + +- **Option A (Proposed by ADR-003)**: Implement epoch-aligned time-window partitioning at ingestion. Before writing, group rows by window assignment, produce a separate Parquet file per window. Add `window_start`, `window_duration_secs`, and `compaction_start_time` to configuration and metadata. Implement `late_data_acceptance_window` to drop points older than a configurable threshold. + +- **Option B**: Extend the existing `PartitionGranularity` enum to support finer granularities (1m, 5m, 15m, etc.) and use it for window assignment. This reuses existing code but may require significant refactoring of the partition logic to match the epoch-aligned design. + +**Recommended**: Option A, with the possibility of refactoring `PartitionGranularity` to support the required granularities if the existing code is close enough. + +## Signal Impact + +**Metrics**: Directly affected. No time-window partitioning exists for the Parquet pipeline. + +**Traces and logs**: Not directly affected (Tantivy pipeline has time ranges on splits but no formal window partitioning). Phase 4 would benefit from formal time windowing for the same reasons. 
+ +## Impact + +- **Severity**: High +- **Frequency**: Constant (every split is affected by the lack of window assignment) +- **Affected Areas**: `quickwit-parquet-engine/src/ingest/accumulator.rs`, `quickwit-parquet-engine/src/split/metadata.rs`, `quickwit-indexing/src/actors/parquet_indexer.rs`, index configuration + +## Next Steps + +- [ ] Define epoch-aligned time-window computation: `window_start = t - (t % window_duration_seconds)` +- [ ] Implement row partitioning by window in `ParquetBatchAccumulator` (group rows by timestamp window before concatenation) +- [ ] Produce one split per window when commit interval straddles a window boundary +- [ ] Add `window_duration` configuration to index settings (default 15 minutes, valid: 1m-60m, must evenly divide 1 hour) +- [ ] Add `compaction_start_time` configuration (required for Phase 1 enablement) +- [ ] Implement `late_data_acceptance_window` at ingestion (drop points older than threshold) +- [ ] Add `window_start` and `window_duration_secs` to MetricsSplitMetadata +- [ ] Add window columns to `metrics_splits` PostgreSQL table +- [ ] Handle null-timestamp rows: assign to overflow window (window_start = 0) + +## References + +- [ADR-003: Time-Windowed Sorted Compaction](../003-time-windowed-sorted-compaction.md) +- [Phase 1: Sorted Splits for Parquet](../../locality-compaction/phase-1-sorted-splits.md) +- Current implementation: `quickwit-parquet-engine/src/split/partition.rs`, `quickwit-parquet-engine/src/ingest/accumulator.rs` diff --git a/docs/internals/adr/gaps/004-incomplete-split-metadata.md b/docs/internals/adr/gaps/004-incomplete-split-metadata.md new file mode 100644 index 00000000000..3232ada6c14 --- /dev/null +++ b/docs/internals/adr/gaps/004-incomplete-split-metadata.md @@ -0,0 +1,133 @@ +# GAP-004: Incomplete Split Metadata for Compaction and Query Pruning + +**Status**: Open +**Discovered**: 2026-02-19 +**Context**: Codebase analysis during Phase 1 locality compaction design + +## Problem + 
+`MetricsSplitMetadata` and the `metrics_splits` PostgreSQL table lack the fields needed for sorted compaction (ADR-003) and future split-level query pruning (Phase 3). Additionally, the Parquet writer does not emit page-level column indexes, preventing intra-file query pruning on sorted data. + +Specifically, the following are missing: + +### Missing metadata fields + +| Field | Purpose | Blocked capability | +|-------|---------|-------------------| +| `sort_schema` | Identifies the sort order of rows in the split | Compaction scope grouping (cannot determine if two splits are merge-compatible) | +| `window_start` | Identifies the time window the split belongs to | Time-windowed compaction scoping | +| `window_duration_secs` | Records window duration in effect at split creation | Detecting window duration changes (incompatible splits) | +| `schema_column_min_values` | Per-column minimum values for sort and metadata columns | Split-level query pruning (Phase 3) | +| `schema_column_max_values` | Per-column maximum values | Split-level query pruning (Phase 3) | +| `schema_column_regexes` | Per-column regex matching any value in the split | Split-level query pruning (Phase 3) | + +### Missing Parquet file features + +| Feature | Purpose | Current state | +|---------|---------|---------------| +| Page-level column index | Min/max statistics per page within each column chunk | Not enabled at write time | +| Offset index | Page byte offsets and row counts | Not enabled at write time | +| `sorting_columns` file metadata | Declares sort order for Parquet-native tooling | Not set | +| `key_value_metadata` sort entries | Sort schema, min/max/regex embedded in file | Not set | + +Without page-level column indexes, DataFusion cannot perform page-level predicate pushdown even when data is sorted -- the sort order provides no query benefit within the file. 
+ +## Evidence + +**MetricsSplitMetadata lacks compaction fields:** +```rust +// quickwit-parquet-engine/src/split/metadata.rs +pub struct MetricsSplitMetadata { + pub split_id: SplitId, + pub index_id: String, + pub time_range: TimeRange, + pub num_rows: u64, + pub size_bytes: u64, + pub metric_names: HashSet<String>, + pub low_cardinality_tags: HashMap<String, HashSet<String>>, + pub high_cardinality_tag_keys: HashSet<String>, + pub created_at: SystemTime, + pub parquet_files: Vec<String>, + // Missing: sort_schema, window_start, window_duration_secs + // Missing: schema_column_min_values, schema_column_max_values, schema_column_regexes +} +``` + +**PostgreSQL `metrics_splits` table lacks compaction columns:** +```sql +-- quickwit-metastore/migrations/postgresql/25_create-metrics-splits.up.sql +CREATE TABLE metrics_splits ( + split_id VARCHAR(50) PRIMARY KEY, + split_state VARCHAR(30) NOT NULL, + index_id VARCHAR(50) NOT NULL, + time_range_start BIGINT NOT NULL, + time_range_end BIGINT NOT NULL, + metric_names TEXT[] NOT NULL, + tag_service TEXT[], + tag_env TEXT[], + -- ... other tag columns ... + num_rows BIGINT NOT NULL, + size_bytes BIGINT NOT NULL, + split_metadata_json TEXT NOT NULL, + -- Missing: window_start, window_duration_secs, sort_schema + -- Missing: schema_column_min_values, schema_column_max_values, schema_column_regexes +); +``` + +**Parquet writer does not enable column index or offset index.** The writer in `quickwit-parquet-engine/src/storage/writer.rs` writes Parquet files using default writer properties. Parquet format v2 column indexes and offset indexes are opt-in features that must be explicitly enabled via `WriterProperties::builder().set_column_index_truncate_length()` and related settings. Without these, page-level statistics are not emitted. + +## State of the Art + +- **Husky (Datadog)**: Fragment metadata includes sort schema, per-column min/max/regex, time bucket assignment. Used for fragment-level query pruning.
+- **Apache Iceberg**: Manifest files store per-column lower/upper bounds for each data file. Used for file-level pruning during query planning. +- **Delta Lake**: Transaction log entries include per-column statistics (min, max, null count). Used by the query optimizer to skip files. +- **ClickHouse**: Part metadata includes primary key min/max per granule. Used for index-level pruning within parts. + +Rich per-split (per-file) metadata for query pruning is standard in modern columnar storage systems. + +## Potential Solutions + +- **Option A (Proposed by ADR-002 and ADR-003)**: + 1. Extend `MetricsSplitMetadata` with `sort_schema`, `window_start`, `window_duration_secs`, and per-column min/max/regex fields. + 2. Add corresponding columns to the `metrics_splits` PostgreSQL table via a new migration. + 3. Enable Parquet column index and offset index at write time. + 4. Set `sorting_columns` in Parquet file metadata. + 5. Write sort schema, min/max/regex to Parquet `key_value_metadata`. + +- **Option B**: Store extended metadata only in `split_metadata_json` (the existing JSON blob column). This avoids a schema migration but prevents efficient SQL queries on the new fields (no window-based filtering, no sort-schema grouping in queries). + +**Recommended**: Option A. SQL-queryable metadata fields are essential for the merge planner (which queries PostgreSQL to find merge candidates) and for future query planning. + +## Signal Impact + +**Metrics**: Directly affected. The Parquet pipeline needs these metadata fields for compaction and pruning. + +**Traces and logs**: Not directly affected. Tantivy splits have their own metadata in the `splits` table. If Phase 4 extends sorted compaction to Tantivy, similar metadata extensions would be needed for `SplitMetadata`. 
+ +## Impact + +- **Severity**: High +- **Frequency**: Constant (every split is missing the metadata; every potential merge and pruning operation is blocked) +- **Affected Areas**: `quickwit-parquet-engine/src/split/metadata.rs`, `quickwit-parquet-engine/src/split/postgres.rs`, `quickwit-parquet-engine/src/storage/writer.rs`, `quickwit-metastore/migrations/`, query planner (future) + +## Next Steps + +- [ ] Add `sort_schema`, `window_start`, `window_duration_secs` to `MetricsSplitMetadata` +- [ ] Add `SortColumnValue` tagged union type (string, i64, u64, f64, null) +- [ ] Add `schema_column_min_values`, `schema_column_max_values`, `schema_column_regexes` to `MetricsSplitMetadata` +- [ ] Create PostgreSQL migration adding new columns to `metrics_splits` +- [ ] Update `PgMetricsSplit` and `InsertableMetricsSplit` in `postgres.rs` +- [ ] Enable Parquet column index and offset index in writer properties +- [ ] Set `sorting_columns` in Parquet file metadata based on sort schema +- [ ] Write `sort_schema`, `schema_column_min_values`, `schema_column_max_values`, `schema_column_regexes`, `window_start`, `window_duration_secs` to Parquet `key_value_metadata` +- [ ] Compute per-column min/max during split writing (scan sort + metadata-only columns) +- [ ] Compute per-column regex during split writing (follow Husky implementation) + +## References + +- [ADR-001: Parquet Data Model](../001-parquet-data-model.md) +- [ADR-002: Sort Schema for Parquet Splits](../002-sort-schema-parquet-splits.md) +- [ADR-003: Time-Windowed Sorted Compaction](../003-time-windowed-sorted-compaction.md) +- [Phase 1: Sorted Splits for Parquet](../../locality-compaction/phase-1-sorted-splits.md) +- Current metadata: `quickwit-parquet-engine/src/split/metadata.rs` +- Current PostgreSQL schema: `quickwit-metastore/migrations/postgresql/25_create-metrics-splits.up.sql` diff --git a/docs/internals/adr/gaps/005-no-per-point-deduplication.md b/docs/internals/adr/gaps/005-no-per-point-deduplication.md new 
file mode 100644 index 00000000000..5ef4feaedaa --- /dev/null +++ b/docs/internals/adr/gaps/005-no-per-point-deduplication.md @@ -0,0 +1,72 @@ +# GAP-005: No Per-Point Deduplication + +**Status**: Open +**Discovered**: 2026-02-19 +**Context**: Data model analysis during Phase 1 locality compaction design ([ADR-001](../001-parquet-data-model.md)) + +## Problem + +Quickwit provides deduplication at coarse granularities — WAL checkpoint exactly-once (prevents re-indexing the same batch on crash recovery) and file-level dedup for queue sources (prevents re-ingesting the same S3 file). However, there is no per-point deduplication: if the same metric data point (identical metric name, tags, timestamp, and value) arrives in two separate ingest requests, both copies are stored. + +This can occur due to: + +- **Client retries.** A client retries an ingest request after a timeout, but the original request succeeded. Both copies are ingested. +- **Overlapping sources.** Two data sources (e.g., two Kafka consumers with overlapping offsets, or a primary and failover pipeline) submit the same points. +- **Upstream replay.** An upstream system replays a window of data (e.g., for backfill or correction), producing duplicates of the existing data. + +The impact depends on the query semantics. For `SUM` aggregations, duplicates inflate the result. For `MAX`/`MIN`, exact duplicates leave the result unchanged; for `AVG`, they weight the duplicated points more heavily and can skew the result. For `COUNT`, duplicates overcount. For queries that reconstruct individual timeseries (e.g., "plot CPU for host X"), duplicates produce repeated values at the same timestamp, which may or may not be visible depending on the visualization. + +## Evidence + +**No per-point dedup in ingest path.** The `IngestRouter` and `Ingester` in `quickwit-ingest/src/ingest_v2/` store all received documents without checking for duplicates against existing data. The `subrequest_id` field tracks request identity for response correlation, not for deduplication.
+ +**WAL checkpoint dedup is batch-level.** The checkpoint mechanism in the indexing pipeline (`quickwit-indexing/src/actors/indexing_pipeline.rs`) provides exactly-once at the WAL position level — it prevents the same WAL segment from being indexed twice on crash recovery. It does not prevent the same data from being submitted in two different WAL writes. + +**File-level dedup is source-specific.** The queue source coordinator (`quickwit-indexing/src/source/queue_sources/coordinator.rs`) tracks ingested files via `PartitionId` derived from file URI. This prevents re-ingesting the same file but does not detect duplicate points within or across files. + +## State of the Art + +- **Prometheus**: Accepts the latest sample per timeseries. Effectively LWW within the head block. TSDB compaction deduplicates samples with identical timestamps during vertical compaction. +- **Mimir/Thanos**: Deduplicate at query time using replica labels. Each replica stores its own copy; the query frontend selects one replica's data per series. +- **InfluxDB**: Supports upsert semantics — writing a point with the same measurement, tag set, and timestamp overwrites the previous value (LWW). +- **ClickHouse**: `ReplacingMergeTree` engine deduplicates rows by sort key during merge (eventual dedup, not at ingest). Standard `MergeTree` stores all duplicates. +- **Husky (Datadog)**: Does not deduplicate individual points. Dedup is handled upstream in the intake pipeline before data reaches storage. + +There is no consensus in the industry. Some systems dedup at ingest (Prometheus, InfluxDB), some at query time (Mimir/Thanos), some at compaction (ClickHouse), and some not at all (Husky). + +## Potential Solutions + +- **Option A: Upstream dedup (Husky model).** Deduplication is the responsibility of the intake pipeline before data reaches Quickwit. This keeps the storage layer simple and moves complexity to a layer that already understands the data semantics. 
This is the current implicit approach. + +- **Option B: Query-time dedup.** Store all duplicates, deduplicate during query execution (e.g., `DISTINCT ON (metric_name, tags, timestamp)` or selecting one value per series per timestamp). Adds query cost proportional to the duplication rate. Similar to Mimir/Thanos. + +- **Option C: Compaction-time dedup.** During sorted merge, detect adjacent rows with identical (metric_name, tags, timestamp) and keep only one. This is cheap once data is sorted (duplicates are adjacent) but provides only eventual consistency — duplicates exist until the next compaction cycle. Similar to ClickHouse's `ReplacingMergeTree`. + +- **Option D: Ingest-time dedup with a bloom filter or dedup index.** Maintain a probabilistic (bloom filter) or exact index of recently-seen points, and drop duplicates at ingest. This adds memory and CPU overhead at ingest and introduces a new stateful component that must be consistent across nodes (or accept per-node dedup only). + +## Signal Impact + +**Metrics**: Most affected. Aggregation queries (SUM, COUNT) are sensitive to duplicates. The product may require per-point dedup guarantees for correctness. + +**Traces**: Less affected. Spans are typically idempotent (same trace_id + span_id is the same span). Trace storage systems commonly deduplicate by span ID. + +**Logs**: Less affected. Log entries are generally append-only without dedup expectations. Duplicate log lines are tolerable in most use cases. 
+ +## Impact + +- **Severity**: Medium (depends on whether the product requires per-point dedup guarantees) +- **Frequency**: Depends on upstream behavior (client retries, source overlap, replay frequency) +- **Affected Areas**: `quickwit-ingest`, `quickwit-parquet-engine` (if compaction-time dedup), query engine (if query-time dedup) + +## Next Steps + +- [ ] Determine whether the product requires per-point deduplication guarantees for metrics +- [ ] If yes, evaluate Options A-D based on the expected duplication rate and acceptable latency for dedup consistency +- [ ] If compaction-time dedup (Option C) is chosen, design the adjacent-row dedup logic for the sorted merge executor + +## References + +- [ADR-001: Parquet Data Model](../001-parquet-data-model.md) — no-LWW decision and dedup discussion +- [Compaction Architecture](../../compaction-architecture.md) — WAL checkpoint exactly-once mechanism +- WAL checkpoint code: `quickwit-indexing/src/actors/indexing_pipeline.rs` +- File-level dedup: `quickwit-indexing/src/source/queue_sources/coordinator.rs` diff --git a/docs/internals/adr/gaps/006-no-independent-auto-scaling.md b/docs/internals/adr/gaps/006-no-independent-auto-scaling.md new file mode 100644 index 00000000000..b0b8bdec8fd --- /dev/null +++ b/docs/internals/adr/gaps/006-no-independent-auto-scaling.md @@ -0,0 +1,52 @@ +# GAP-006: No Independent Auto-Scaling of Query, Ingest, and Compaction + +**Status**: Open +**Discovered**: 2026-02-19 +**Context**: Cloud-native storage characteristics analysis ([ADR-004](../004-cloud-native-storage-characteristics.md), characteristics C1, C17) + +## Problem + +Quickwit does not independently auto-scale its query, ingest, and compaction workloads. All three share the same node pool and resource allocation. When ingest load spikes, query performance degrades; when query load spikes, compaction falls behind, accumulating small files that further degrade queries. 
+ +Cloud-native observability systems require independent scaling because these workloads have fundamentally different resource profiles and burstiness patterns: + +- **Ingest** is I/O-bound and can be bursty (traffic spikes, batch uploads, replay). +- **Query** is CPU-bound and driven by user activity and monitor evaluation. +- **Compaction** is I/O-and-CPU-bound and should run as background work that doesn't interfere with ingest or query, but can scale up when there is a backlog of small files. + +Without independent scaling, there is no burst handling mechanism (characteristic C17). When ingest traffic exceeds the capacity of the shared node pool, there is no overflow buffer or burst lane to absorb the excess. + +## Evidence + +Quickwit nodes serve all roles simultaneously. The `quickwit run` command starts a node that handles ingest, indexing, searching, and compaction. While Kubernetes deployments can separate indexer and searcher roles, the compaction (merge) workload always runs co-located with the indexer and cannot be independently scaled. + +## State of the Art + +- **Husky (Datadog)**: Writers, compactors, and leaf readers are independently auto-scaled services. Compactor fleet scales based on file backlog; writer fleet scales based on ingest QPS; leaf readers scale based on query QPS. +- **ClickHouse Commercial (SharedMergeTree)**: Compute/storage separation allows query nodes to scale independently. Merges run on separate resources. +- **Mimir/Cortex**: Ingesters, compactors, and queriers are separate Kubernetes deployments with independent HPA policies. + +## Potential Solutions + +- **Option A**: Separate Quickwit into distinct indexer, compactor, and searcher Kubernetes deployments with independent auto-scaling policies. This requires the compaction (merge) pipeline to be runnable as a standalone service. +- **Option B**: Implement resource isolation within a shared node (CPU/memory limits per workload type). Less flexible but simpler to deploy. 
+ +## Signal Impact + +All signals equally affected. Independent scaling is signal-agnostic. + +## Impact + +- **Severity**: High +- **Frequency**: Constant (under production load) +- **Affected Areas**: Deployment architecture, Kubernetes configuration, merge pipeline decoupling + +## Next Steps + +- [ ] Evaluate separating the merge pipeline into a standalone compactor service +- [ ] Design auto-scaling policies for each workload type (ingest QPS, query QPS, file backlog) +- [ ] Investigate burst handling for ingest (overflow buffer, backpressure, burst lane) + +## References + +- [ADR-004: Cloud-Native Storage Characteristics](../004-cloud-native-storage-characteristics.md) diff --git a/docs/internals/adr/gaps/007-no-multi-level-caching.md b/docs/internals/adr/gaps/007-no-multi-level-caching.md new file mode 100644 index 00000000000..307e8cf02b1 --- /dev/null +++ b/docs/internals/adr/gaps/007-no-multi-level-caching.md @@ -0,0 +1,54 @@ +# GAP-007: No Parquet Metadata Caching + +**Status**: Open +**Discovered**: 2026-02-19 +**Context**: Cloud-native storage characteristics analysis ([ADR-004](../004-cloud-native-storage-characteristics.md), characteristic C4). Split from original multi-level caching gap; broader caching concerns tracked in [GAP-010](./010-no-data-caching-or-query-affinity.md). + +## Problem + +Quickwit does not cache Parquet file footers or column chunk metadata. Every file access pays the full latency-to-first-byte penalty to read the footer from object storage before any column data can be located and fetched. + +Parquet footers contain the schema, row group metadata, column chunk offsets, min/max statistics, and encoding information. Column chunk headers contain page-level metadata (data page offsets, dictionary page locations, compression codec). These are small (typically a few KB per file) but are read on every access to every split. 
+ +At production scale with thousands of splits per time range, the aggregate cost of repeated footer fetches is significant — both in latency (sequential S3 round-trips) and in dollars (S3 GET request pricing). Storage tiering (e.g., S3 Intelligent Tiering) makes this worse: reading even one byte from a cold file promotes it to hot tier for 30 days at higher cost. + +## Evidence + +Quickwit's search path (`quickwit-search`) downloads split data from object storage on each query. There is a split cache for warming (`report_splits()`), but no dedicated cache for Parquet footers or column chunk metadata. Each query that touches a split must fetch the footer before it can locate and read any column data. + +A single S3 GET request has ~50-100ms latency-to-first-byte. For a query spanning 100 splits, footer fetches alone contribute 5-10 seconds of serial latency (or significant parallelism overhead). + +## State of the Art + +- **Husky (Datadog)**: Caches file metadata separately from data, enabling fast query planning without data fetches. +- **ClickHouse**: Mark cache stores index granule metadata in memory; file headers are cached independently of data pages. +- **Mimir/Cortex**: Index cache (backed by Memcached) caches block-level metadata separately from chunk data. +- **Apache Spark**: Parquet footer caching is a built-in feature (`spark.sql.parquet.footerCache`). + +## Potential Solutions + +- **Option A**: Local in-memory LRU cache on each searcher node, keyed by `(split_id, footer_offset)`. Footers are immutable (splits are immutable once written), so cache invalidation is trivial — evict on LRU pressure only. +- **Option B**: Pre-fetch and store footer bytes in the metastore alongside split metadata. Adds metastore storage cost but eliminates the S3 round-trip entirely for query planning. 
+- **Option C**: Embed essential footer statistics (row count, column min/max, size) in split metadata at ingestion time, reducing the need to read the full footer for query planning. Full footer still fetched on demand. + +## Signal Impact + +All signals equally affected. Parquet footer caching benefits metrics queries directly. For logs/traces using Tantivy, the analogous concern is segment metadata caching — the same pattern applies. + +## Impact + +- **Severity**: High +- **Frequency**: Every query against every split +- **Affected Areas**: `quickwit-search`, `quickwit-storage` + +## Next Steps + +- [ ] Measure footer fetch frequency and latency contribution for representative query workloads +- [ ] Design local LRU footer cache (sizing, eviction, key schema) +- [ ] Evaluate embedding footer statistics in split metadata at ingestion time +- [ ] Prototype and benchmark against baseline (no cache) + +## References + +- [ADR-004: Cloud-Native Storage Characteristics](../004-cloud-native-storage-characteristics.md) +- [GAP-010: No Multi-Level Data Caching or Query Affinity Optimization](./010-no-data-caching-or-query-affinity.md) diff --git a/docs/internals/adr/gaps/008-no-high-query-rate-optimization.md b/docs/internals/adr/gaps/008-no-high-query-rate-optimization.md new file mode 100644 index 00000000000..ec0bd83f7d5 --- /dev/null +++ b/docs/internals/adr/gaps/008-no-high-query-rate-optimization.md @@ -0,0 +1,58 @@ +# GAP-008: No High Query Rate Optimization + +**Status**: Open +**Discovered**: 2026-02-19 +**Context**: Cloud-native storage characteristics analysis ([ADR-004](../004-cloud-native-storage-characteristics.md), characteristic C13) + +## Problem + +Quickwit's metadata and query infrastructure is not optimized for the high query rates required by monitor evaluation. Datadog's global metrics query rate is approximately 800k QPS. 
At this scale, the metadata service must respond to split-listing queries with sub-millisecond latency, and query results for repeated predicates must be cacheable to avoid redundant computation. + +The current metadata service is PostgreSQL. While PostgreSQL handles moderate query planning loads, it is not designed for 800k QPS of metadata lookups with the per-split min/max/regex filtering that split-level pruning requires (see [GAP-004](./004-incomplete-split-metadata.md)). Each monitor evaluation cycle queries the metastore for relevant splits, evaluates the query, and discards the result. The next cycle repeats the same work. + +## Evidence + +The metrics pipeline stores split metadata in PostgreSQL (`metrics_splits` table). Query planning queries this table to find splits matching a time range and tag predicates. PostgreSQL handles this at current scale (low query rate, limited metadata), but the design does not address: + +- Metadata volume at high ingestion rates (~921K splits per 15-minute window before compaction) +- High QPS metadata lookups (monitor evaluation at 800k QPS) +- Repeated identical queries (monitors re-evaluate the same predicate every cycle) + +ADR-003 explicitly acknowledges the PostgreSQL scalability concern: "At high ingestion rates, PostgreSQL metadata volume can exceed what a single OLTP database handles efficiently." + +## State of the Art + +- **Husky (Datadog)**: Dedicated metadata service optimized for high-rate pruning queries. Predicate and query result caching for monitor evaluation. +- **Mimir/Cortex**: Index cache (memcached-backed) for label/series lookups. Query result cache for repeated queries. +- **ClickHouse Commercial**: SharedMergeTree metadata in Keeper (ZooKeeper), with in-memory sparse primary key indexes on query nodes for fast granule-level pruning. + +## Potential Solutions + +- **Option A**: Query result cache. Cache the results of repeated monitor queries, keyed by (query_hash, time_range). 
Invalidate when new splits are published in the relevant time range. This is the highest-leverage optimization for monitor workloads. +- **Option B**: Dedicated metadata service. Replace PostgreSQL for split-level metadata with a purpose-built service optimized for range queries and high QPS. The self-describing Parquet files (ADR-003) make this migration feasible — PostgreSQL is an index, not the sole source of truth. +- **Option C**: In-memory metadata index on query nodes. Each searcher node maintains an in-memory copy of split metadata for its assigned splits (via rendezvous hashing), updated via a change feed from the metastore. Eliminates per-query metadata round-trips. + +## Signal Impact + +**Metrics**: Most affected. Monitor evaluation drives the highest query rate. + +**Traces and logs**: Less affected for monitors, but dashboard queries at scale face the same metadata bottleneck. + +## Impact + +- **Severity**: High (for production monitor workloads) +- **Frequency**: Proportional to monitor count and evaluation frequency +- **Affected Areas**: `quickwit-metastore`, `quickwit-search` (query planning), metadata infrastructure + +## Next Steps + +- [ ] Measure current metadata query latency and throughput at representative scale +- [ ] Design query result cache for monitor-style repeated queries +- [ ] Evaluate in-memory metadata index on query nodes vs dedicated metadata service +- [ ] Prototype split-level pruning using existing PostgreSQL metadata to measure pruning effectiveness before optimizing throughput + +## References + +- [ADR-004: Cloud-Native Storage Characteristics](../004-cloud-native-storage-characteristics.md) +- [GAP-004: Incomplete Split Metadata](./004-incomplete-split-metadata.md) +- [ADR-003: Time-Windowed Sorted Compaction](../003-time-windowed-sorted-compaction.md) — PostgreSQL scalability note diff --git a/docs/internals/adr/gaps/009-no-leading-edge-prioritization.md b/docs/internals/adr/gaps/009-no-leading-edge-prioritization.md 
new file mode 100644 index 00000000000..7a8e563d09e --- /dev/null +++ b/docs/internals/adr/gaps/009-no-leading-edge-prioritization.md @@ -0,0 +1,57 @@ +# GAP-009: No Leading Edge Prioritization or New Data Prioritization + +**Status**: Open +**Discovered**: 2026-02-19 +**Context**: Cloud-native storage characteristics analysis ([ADR-004](../004-cloud-native-storage-characteristics.md), characteristics C14, C16) + +## Problem + +Quickwit's compaction does not prioritize recent time windows over older ones. The merge planner treats all eligible windows equally, processing them in whatever order it discovers merge candidates. This means that the **leading edge** — the most recent time windows where small files accumulate fastest and queries are most frequent — competes for compaction resources with older windows that are less query-hot. + +At high ingestion rates, the leading edge accumulates hundreds of thousands of small splits per 15-minute window (see ADR-003). If compaction doesn't keep up with this accumulation, query performance on recent data degrades because every query must fan out to all those small splits. This is the most visible and impactful degradation, because observability queries overwhelmingly target recent data (dashboards, alerts, incident investigation). + +**New data visibility** is also affected. The system does not prioritize making freshly-ingested data queryable over performing compaction on older data. In extremis, compaction of old windows should yield resources to ensure new data is visible within the ingest-to-query latency SLO (e.g., 30s p99.9). + +## Evidence + +The `StableLogMergePolicy` (which ADR-003 proposes adapting for metrics) does not have a concept of window priority. It evaluates merge candidates based on maturity and document count, not based on the age of the time window or the urgency of compaction for query performance. 
+ +There is no mechanism to: +- Prioritize compaction of recent windows over old windows +- Back off compaction of old windows when the leading edge has a backlog +- Signal that a particular window needs urgent compaction (e.g., many small files degrading queries) + +## State of the Art + +- **Husky (Datadog)**: Compactor prioritizes the leading edge. Recent time buckets are compacted first. Compaction auto-scales based on the backlog of uncompacted files at the leading edge. +- **Prometheus/Mimir**: Head block compaction runs on a tight schedule (every 2 hours). Vertical compaction of overlapping blocks is lower priority. +- **ClickHouse**: Merge scheduling prioritizes parts that would most improve query performance (many small parts in a partition). + +## Potential Solutions + +- **Option A**: Priority queue for compaction scheduling. Assign priority based on window recency and split count. Recent windows with high split counts get compacted first. Older, already-compacted windows get lower priority. +- **Option B**: Separate leading-edge compaction from background compaction. Dedicate a portion of compaction resources to the most recent N windows (e.g., last 1 hour), with remaining resources for background compaction of older windows. +- **Option C**: Event-driven compaction hints. When a window's split count exceeds a threshold (e.g., affecting query latency), emit a compaction hint that bumps that window's priority. Similar to Husky's hinting mechanism mentioned in the Phase 1 design doc. + +## Signal Impact + +All signals equally affected. Leading edge prioritization is signal-agnostic — any signal with high write rates and time-range queries benefits. 
+ +## Impact + +- **Severity**: High (directly affects query latency on recent data) +- **Frequency**: Constant under production load +- **Affected Areas**: Merge planner, compaction scheduler, resource allocation + +## Next Steps + +- [ ] Measure split count accumulation rate at the leading edge for representative ingestion rates +- [ ] Design priority-based compaction scheduling (window recency + split count) +- [ ] Define leading edge compaction SLO (e.g., max split count per window, max age before first compaction) +- [ ] Evaluate event-driven compaction hints vs polling-based priority + +## References + +- [ADR-004: Cloud-Native Storage Characteristics](../004-cloud-native-storage-characteristics.md) +- [ADR-003: Time-Windowed Sorted Compaction](../003-time-windowed-sorted-compaction.md) +- [Phase 1: Sorted Splits for Parquet](../../locality-compaction/phase-1-sorted-splits.md) — mentions hinting mechanism for compaction diff --git a/docs/internals/adr/gaps/010-no-data-caching-or-query-affinity.md b/docs/internals/adr/gaps/010-no-data-caching-or-query-affinity.md new file mode 100644 index 00000000000..18cabc4e15d --- /dev/null +++ b/docs/internals/adr/gaps/010-no-data-caching-or-query-affinity.md @@ -0,0 +1,65 @@ +# GAP-010: No Multi-Level Data Caching or Query Affinity Optimization + +**Status**: Open +**Discovered**: 2026-02-19 +**Context**: Cloud-native storage characteristics analysis ([ADR-004](../004-cloud-native-storage-characteristics.md), characteristics C4, C6). Split from [GAP-007](./007-no-multi-level-caching.md), which now focuses on metadata caching only. + +## Problem + +Beyond metadata caching (GAP-007), Quickwit lacks the higher-level caching infrastructure that production observability systems use to hide object storage latency and reduce repeated computation: + +- **Columnar data cache**: Recently-read column data from Parquet files, avoiding repeated S3 fetches for hot splits. 
Without this, every query re-fetches column pages from object storage even when the same splits were just queried moments ago. +- **Predicate evaluation cache**: Results of evaluating common predicates against splits. Monitor workloads re-evaluate the same predicate every evaluation cycle (typically 15s-5min). Caching predicate results avoids redundant computation. +- **Query result cache**: Full or partial query results for repeated queries. Dashboard refresh and monitor evaluation patterns produce highly repetitive queries. + +**Query affinity** (characteristic C6) exists partially — Quickwit uses rendezvous hashing on `split_id` to assign splits to searcher nodes, which promotes cache locality. However, without the caches themselves, affinity provides no benefit. And without exactly-once semantics (GAP-005), query-time deduplication may require scatter-gather across nodes, undermining the affinity pattern. + +## Evidence + +Quickwit's search path (`quickwit-search`) downloads split data from object storage on each query. There is a split cache for warming (`report_splits()`), but no columnar data cache, no predicate cache, and no query result cache. + +At production scale, a single S3 download thread delivers ~90 MiB/s. Without caching and parallelization, query latency is dominated by object storage round-trips. For monitor workloads that re-evaluate the same query every 15 seconds, the lack of any result or predicate caching means the full cost is paid on every cycle. + +## State of the Art + +- **Husky (Datadog)**: Multi-level caches (columnar data, predicate results) with query affinity via consistent hashing. Cache hit rates >90% for typical monitor workloads. +- **ClickHouse**: Uncompressed data cache, query result cache. Configurable per-table. Mark cache for index metadata. +- **Mimir/Cortex**: Chunk cache, query result cache. Memcached-backed. Query-frontend handles result caching and splitting. 
+ +## Potential Solutions + +### Columnar Data Cache +- **Option A**: Local LRU cache on each searcher node, keyed by `(split_id, column_name, page_range)`. Combined with existing rendezvous hashing for affinity. +- **Option B**: Distributed cache layer (Redis/Memcached) for column data. Higher complexity but enables cache sharing across searcher nodes. + +### Predicate / Query Result Cache +- **Option C**: Predicate result cache — cache the set of split IDs that match a given predicate, keyed by predicate hash + time range. Lightweight, high-value for monitor workloads. +- **Option D**: Query result cache at the query-frontend level, keyed by query hash + time range. Can serve dashboard refreshes and repeated monitor evaluations without touching searchers. + +### Query Affinity +- **Option E**: Enhance existing rendezvous hashing to be affinity-aware — consider cache state when routing queries. Requires cache hit rate metrics per node. + +## Signal Impact + +All signals equally affected. Columnar data caching benefits metrics (Parquet) and logs/traces (Tantivy segments) alike. Predicate and query result caching is signal-agnostic. 
+ +## Impact + +- **Severity**: High +- **Frequency**: Every query pays the cost; monitor workloads pay it repeatedly +- **Affected Areas**: `quickwit-search`, `quickwit-storage`, query planner + +## Next Steps + +- [ ] Resolve GAP-007 (metadata caching) first — prerequisite for effective data caching +- [ ] Measure cache hit rates and object storage fetch frequency for representative query workloads +- [ ] Design columnar data cache (local LRU per searcher node) +- [ ] Evaluate predicate result caching for monitor-style repeated queries +- [ ] Assess query result caching at the query-frontend level +- [ ] Assess impact of storage tiering on cold file access cost + +## References + +- [ADR-004: Cloud-Native Storage Characteristics](../004-cloud-native-storage-characteristics.md) +- [GAP-007: No Parquet Metadata Caching](./007-no-multi-level-caching.md) +- [GAP-005: No Per-Point Deduplication](./005-no-per-point-deduplication.md) diff --git a/docs/internals/adr/gaps/README.md b/docs/internals/adr/gaps/README.md new file mode 100644 index 00000000000..475b276f6e1 --- /dev/null +++ b/docs/internals/adr/gaps/README.md @@ -0,0 +1,118 @@ +# Architecture Gaps + +> **Part of [Architecture Evolution](../EVOLUTION.md)** - the reactive lens for tracking design limitations discovered through production. + +This directory tracks **architecture gaps** - observations and learnings that may warrant future ADRs but aren't yet ready for formal treatment. 
+ +## Role in Architecture Evolution + +| Lens | This Directory | +|------|----------------| +| **Characteristics** (Proactive) | "What we need" - see ADRs | +| **Gaps** (Reactive) | **You are here** - "What we learned" | +| **Deviations** (Pragmatic) | "What we accepted" - see [deviations/](../deviations/) | + +A gap may: +- Explain why a **Characteristic** is partially implemented +- Evolve into an **ADR** when a solution is designed +- Become a **Deviation** if we accept the limitation + +## Purpose + +Gaps capture: +- Problems discovered during incidents, scale tests, or code reviews +- Comparisons with state-of-the-art systems +- Potential solutions worth investigating +- Impact assessments to prioritize work + +## When to Create a Gap + +Create a gap document when you observe: +- A design limitation exposed by production behavior +- A pattern used by other systems that we're missing +- Technical debt that affects reliability or performance +- A recurring problem that needs architectural attention + +**Don't create a gap for**: bugs (use issues), feature requests (use roadmap), or decisions already made (use ADRs). + +## Gap Lifecycle + +``` +Discovered → Open → Investigating → ADR-Drafted → Closed + ↓ + (Won't Fix) → Closed +``` + +| Status | Meaning | +|--------|---------| +| **Open** | Problem identified, not yet investigated | +| **Investigating** | Actively researching solutions | +| **ADR-Drafted** | Solution chosen, ADR written | +| **Closed** | Resolved (via ADR) or Won't Fix | + +## Template + +```markdown +# GAP-XXX: [Title] + +**Status**: Open | Investigating | ADR-Drafted | Closed +**Discovered**: YYYY-MM-DD +**Context**: [Incident/Scale test/Review that surfaced this] + +## Problem + +[1-2 paragraphs describing the issue] + +## Evidence + +[Metrics, logs, commands, observations that demonstrate the problem] + +## State of the Art + +[How other systems (ClickHouse, Prometheus, Mimir, etc.) 
solve this] + +## Potential Solutions + +- **Option A**: [Description] +- **Option B**: [Description] +- **Option C**: [Description] + +## Signal Impact + +Which signals are affected (metrics, traces, logs)? Does the gap affect all three or just one? + +## Impact + +- **Severity**: Low | Medium | High | Critical +- **Frequency**: Rare | Occasional | Common +- **Affected Areas**: [Components] + +## Next Steps + +- [ ] Action item 1 +- [ ] Action item 2 + +## References + +- [Related ADR](../NNN-title.md) +- [External link](https://...) +``` + +## Naming Convention + +Gap files use sequential numbering: `001-short-description.md` + +## Index + +| Gap | Title | Status | Severity | +|-----|-------|--------|----------| +| [001](./001-no-parquet-compaction.md) | No Parquet Split Compaction | Open | High | +| [002](./002-fixed-sort-schema.md) | Fixed Hardcoded Sort Schema | Open | Medium | +| [003](./003-no-time-window-partitioning.md) | No Time-Window Partitioning at Ingestion | Open | High | +| [004](./004-incomplete-split-metadata.md) | Incomplete Split Metadata for Compaction | Open | High | +| [005](./005-no-per-point-deduplication.md) | No Per-Point Deduplication | Open | Medium | +| [006](./006-no-independent-auto-scaling.md) | No Independent Auto-Scaling | Open | High | +| [007](./007-no-multi-level-caching.md) | No Parquet Metadata Caching | Open | High | +| [008](./008-no-high-query-rate-optimization.md) | No High Query Rate Optimization | Open | High | +| [009](./009-no-leading-edge-prioritization.md) | No Leading Edge Prioritization | Open | High | +| [010](./010-no-data-caching-or-query-affinity.md) | No Multi-Level Data Caching or Query Affinity Optimization | Open | High | diff --git a/docs/internals/adr/supplements/000-supplement-template.md b/docs/internals/adr/supplements/000-supplement-template.md new file mode 100644 index 00000000000..1403a6e94ff --- /dev/null +++ b/docs/internals/adr/supplements/000-supplement-template.md @@ -0,0 +1,64 @@ +# 
Supplement NNN: Title + +> **Philosophy**: [Optional: A guiding principle for this roadmap/report] +> **Parent ADR**: [Link to ADR] + +**Last Updated**: YYYY-MM-DD +**Status**: [e.g. In Progress, Complete, Blocked] + +--- + +## Status Summary + +[Optional: High-level metrics or status table] + +| Metric | Value | Target | Notes | +|--------|-------|--------|-------| +| Pass Rate | 0% | 100% | ... | + +--- + +## Roadmap & Progress + +Use checkbox tasks (`- [ ]`) to track progress. Mark completed items with `- [x]`. + +### Phase 1: [Phase Name] (Status: [e.g. Done]) + +- [ ] **Task Name** - [Description] + - Context: [Link to issue/context] +- [ ] **Task Name** + - [ ] Sub-task 1 + - [ ] Sub-task 2 + +### Phase 2: [Phase Name] (Status: [e.g. In Progress]) + +- [ ] **Task Name** +- [ ] **Task Name** + +--- + +## Detailed Reports / Metrics + +[Optional: Detailed tables, gap analysis, or technical notes] + +| Category | Status | Notes | +|----------|--------|-------| +| ... | ... | ... | + +--- + +## Implementation Notes + +[Optional: Notes on how to implement, key files, or architecture decisions specific to this roadmap] + +**Key Files**: +- `path/to/file.rs` - [Description] + +--- + +## How to Update + +1. **Mark Progress**: Change `- [ ]` to `- [x]` when a task is complete. +2. **Add Context**: Add notes or links to PRs/commits for completed items. +3. **Update Metrics**: Update the Status Summary table at the top. +4. **Add New Items**: Add new tasks to the appropriate phase as they are discovered. diff --git a/docs/internals/adr/supplements/README.md b/docs/internals/adr/supplements/README.md new file mode 100644 index 00000000000..2a5fdfb6e28 --- /dev/null +++ b/docs/internals/adr/supplements/README.md @@ -0,0 +1,52 @@ +# ADR Supplements + +This directory contains supplementary documentation that supports the main ADRs but is too detailed or dynamic to include in the ADR itself. 
+ +## Naming Convention + +Supplements are named after the ADR they support: +- `NNN-adr-title-supplement-name.md` + +For example: +- `001-metrics-pipeline-implementation-roadmap.md` - Detailed roadmap for ADR-001 + +## Agent Workflow + +Agents (and humans) should use supplements to track progress without losing context. + +### 1. Finding Work +- Check the **Knowledge Map** in `../README.md` to find the relevant ADR. +- Look for linked **Supplements** (Roadmaps/Reports) in that ADR or the index below. + +### 2. Tracking Progress +- **Read**: Supplements use the [Template](./000-supplement-template.md) format. +- **Update**: + - Change `- [ ]` to `- [x]` when tasks are completed. + - Update "Last Updated" date. + - Add notes/context to completed items (e.g., "Fixed in PR #123"). + - Update status tables/metrics if applicable. + +### 3. Creating New Supplements +- Copy `000-supplement-template.md`. +- Name it `NNN-adr-title-supplement-name.md` (see Naming Convention above). +- Link it in the table below and in the parent ADR. + +--- + +## Supplement Index + +| File | Related ADR | Description | Status | +|------|-------------|-------------|--------| +| [000-supplement-template.md](000-supplement-template.md) | - | Template for new supplements | - | + +*Supplements will be added as ADRs are created and implementation progresses.* + +## When to Use Supplements + +Create a supplement when the ADR needs any of the following: +- Detailed implementation plans that change frequently (Roadmaps) +- Test results or metrics that get updated over time (Reports) +- Function/feature catalogs that grow incrementally +- Migration guides or compatibility matrices + +The main ADR should remain stable and capture the architectural decision, while supplements can be updated as implementation progresses. 
diff --git a/docs/internals/compaction-architecture.md b/docs/internals/compaction-architecture.md new file mode 100644 index 00000000000..4bc98c31d12 --- /dev/null +++ b/docs/internals/compaction-architecture.md @@ -0,0 +1,582 @@ +# Split Compaction Architecture in Quickwit + +**Last Updated:** 2026-02-11 + +## Table of Contents + +1. [Overview](#overview) +2. [The Data Path: From Document to Split](#the-data-path-from-document-to-split) +3. [Compaction: How Splits Get Merged](#compaction-how-splits-get-merged) +4. [Data Ownership and Scaling](#data-ownership-and-scaling) +5. [How Queries Find Splits](#how-queries-find-splits) +6. [Document Routing Detail](#document-routing-detail) +7. [Partition IDs: Tenant Isolation](#partition-ids-tenant-isolation) +8. [The Metrics Pipeline](#the-metrics-pipeline) +9. [Configuration Reference](#configuration-reference) + +--- + +## Overview + +Quickwit ingests documents into small, immutable units called **splits**. As splits accumulate, background **compaction** merges small splits into larger ones. Larger splits improve query throughput by reducing the number of units a searcher must open and scan. + +This document follows the path of data from ingestion through compaction and into queries, then covers the scaling properties that emerge from the architecture. + +### Vocabulary + +- **Split** -- a self-contained unit of indexed data (a Tantivy segment for logs/traces, or a Parquet file for metrics). Documents within a split are stored in **ingestion order** -- no sort is applied at index time or during merges. Once published, a split is immutably associated with the `node_id` of the node that created it. +- **Shard** -- a logical WAL stream for a particular (index, source) pair, hosted on an ingester node. Ephemeral: can be closed and replaced on a different node at any time. +- **Merge scope** -- the 5-part key `(node_id, index_uid, source_id, partition_id, doc_mapping_uid)`. 
Each unique combination gets an independent compaction hierarchy with no cross-key coordination. +- **Maturity** -- a split's eligibility for further merging. Immature splits can be merged; mature splits (by size or age) will not be merged again. + +--- + +## The Data Path: From Document to Split + +### End-to-End Diagram + +``` +Client + │ + ▼ +External Load Balancer (K8s Service, nginx, etc.) + │ distributes across Quickwit nodes + ▼ +IngestRouter (on any node -- every node is a router) + │ picks a shard (round-robin, local preferred) + ▼ +Ingester Node (shard leader) + │ appends to shard's WAL (local disk) + ▼ +Indexing Pipeline (usually same node) + │ reads WAL → builds Tantivy segment + │ stamps split with (node_id, index_uid, source_id, partition_id, doc_mapping_uid) + ▼ +Object Storage (S3/GCS) + PostgreSQL (metadata) +``` + +### a) Client to Router + +Every Quickwit node runs an `IngestRouter` and exposes REST and gRPC ingest endpoints -- there are no dedicated "router" nodes. Clients reach Quickwit through an **external load balancer** (a Kubernetes Service, an nginx proxy, etc.) that distributes requests across the available nodes. Quickwit itself has no client-side routing protocol; it relies on the deployment infrastructure to spread traffic. + +Available ingest endpoints include: +- **Native API**: `POST /api/v1/{index_id}/ingest` (NDJSON) +- **Elasticsearch-compatible bulk**: `POST /api/v1/_elastic/_bulk` +- **OTLP**: `POST /api/v1/otlp/v1/{logs,traces,metrics}` (protobuf) and gRPC equivalents + +**Location:** `quickwit/quickwit-serve/src/lib.rs:940` -- every node instantiates an `IngestRouter`. + +### b) Router to Shard + +Once a request lands on a node, its `IngestRouter` picks a shard for the document's (index, source) pair. Shard selection happens **before** the WAL write -- the shard determines which node's WAL receives the data. + +A single ingest request's documents all go to **one shard** (and therefore one leader node). 
Across requests, the router distributes documents across shards via round-robin, preferring shards hosted locally on the router's own node. See [Document Routing Detail](#document-routing-detail) for the full algorithm. + +### c) The WAL + +Each shard has its own append-only WAL stream, stored on the leader node's local disk via `mrecordlog`. Multiple shards can coexist on one node. Optional replication sends writes to a follower node. + +**Physical location:** `{data_dir}/queues/` on the ingester node. + +**Persistence:** By default, the WAL flushes every 5 seconds without fsync (`PersistPolicy::OnDelay`). For metrics pipelines, the WAL uses persistent volumes to survive pod restarts. + +### d) WAL to Split + +An indexing pipeline reads batches from the shard's WAL. Documents are parsed, validated, and written into a Tantivy segment in arrival order. When the batch hits a threshold (`commit_timeout_secs` or size), the split is finalized: + +1. The split is stamped with the 5-part key from its pipeline identity and partition +2. The split file is uploaded to object storage (`{split_id}.split`) +3. Split metadata is published atomically to PostgreSQL **in the same transaction as a checkpoint delta** that records which WAL positions the split covers + +The checkpoint delta is critical for exactly-once semantics: it ties "these documents are now in a published split" to "the WAL has been consumed up to this position" in a single atomic operation. If the node crashes before publication, the in-progress split is abandoned, but the checkpoint hasn't advanced, so the documents will be re-read from the WAL and re-indexed into a new split on restart. Recovery works by fetching the checkpoint from PostgreSQL on startup -- a small metadata read, not a scan of splits or history. Published splits are write-once artifacts that are never reopened or appended to. + +Note that checkpoints are purely a concern of the initial indexing pipeline. 
Merge (compaction) operations do not interact with checkpoints at all -- the merge executor publishes the merged split with no checkpoint delta, and the metastore skips the checkpoint update. + +Initial splits are small -- typically 100K-500K documents. Compaction happens later. + +### e) Where Things Physically Live + +| Data | Location | +|------|----------| +| WAL | Local disk on ingester nodes (ephemeral unless persistent volume) | +| Split files | Object storage (`{split_id}.split` or `.parquet`) | +| Split metadata | PostgreSQL (`splits` table) | + +### Pipeline Actors (detail) + +
<details> +<summary>Click to expand actor-level detail</summary> + +The indexing pipeline is a chain of actors: + +1. **Source** (e.g., `IngestV2Source`) -- polls the shard's WAL, fetches document batches +2. **DocProcessor** -- parses and validates JSON documents, computes partition keys +3. **Indexer** -- creates Tantivy segments from document batches +4. **Serializer** -- serializes the segment +5. **Packager** -- packages the split with metadata +6. **Uploader** -- uploads the split file to object storage +7. **Sequencer** -- ensures ordering +8. **Publisher** -- atomically publishes split metadata to PostgreSQL + +**Location:** `quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs` + +</details>
+ +--- + +## Compaction: How Splits Get Merged + +Small splits accumulate from ingestion. A **merge planner** on each node periodically examines published splits and combines eligible ones into larger splits. The merged split replaces the originals: it is uploaded to storage, old splits are marked for deletion, and PostgreSQL is updated atomically. + +### The Merge Scope + +Each unique combination of the 5-part key gets an independent compaction hierarchy: + +``` +(node_id, index_uid, source_id, partition_id, doc_mapping_uid) + ╰──────── pipeline identity ────────╯ ╰──── merge partition ────╯ +``` + +**Compaction is entirely node-local.** There is no cross-node merge coordination. If N nodes produce data for the same partition, you get N independent hierarchies. + +The key is enforced in two stages: + +**Stage 1 -- Pipeline identity filtering** (first 3 components). Each node's merge planner filters to only the splits matching its own `(node_id, index_uid, source_id)`: + +```rust +// quickwit/quickwit-indexing/src/actors/merge_planner.rs:350-355 +fn belongs_to_pipeline(pipeline_id: &MergePipelineId, split: &SplitMetadata) -> bool { + pipeline_id.node_id == split.node_id + && pipeline_id.index_uid == split.index_uid + && pipeline_id.source_id == split.source_id +} +``` + +**Stage 2 -- Partition grouping** (last 2 components). After pipeline filtering, remaining splits are grouped by `(partition_id, doc_mapping_uid)`. Each group gets its own independent merge policy evaluation. + +### Why Each Key Component Matters + +**`node_id`** -- The `node_id` constraint exists because of **how the current merge planner is implemented**, not because of a fundamental data integrity requirement. Each node runs its own independent merge planner, and each planner only processes splits stamped with its own `node_id`. This avoids any need for cross-node coordination during compaction. + +Importantly, **checkpoints are not a reason for this constraint**. 
Checkpoints are part of the *initial indexing pipeline* -- they track WAL consumption and are updated atomically when a split is first published (see [WAL to Split](#d-wal-to-split)). But merge operations do not interact with checkpoints at all: the merge executor passes no checkpoint delta (`checkpoint_delta_opt: Default::default()`), and the metastore skips checkpoint updates when no delta is provided. Published splits are write-once artifacts; they don't carry WAL position information and are never read back during recovery. Merging splits from different nodes would not corrupt any checkpoint state. + +This means the `node_id` constraint is an **implementation choice for simplicity**, not an architectural invariant. Cross-node compaction is feasible without checkpoint changes -- it's a coordination problem, not a data integrity problem. See [Future: Locality-Aware Compaction](#future-locality-aware-compaction) and [Data Ownership and Scaling](#data-ownership-and-scaling) for the implications. + +**`index_uid`** -- Includes the logical index name plus an incarnation ID (ULID). Prevents merging across different indexes or across delete-and-recreate cycles. Different indexes have different schemas, storage prefixes, and merge policies. + +**`source_id`** -- A **source** is a configured data ingestion channel for an index. Each source has a type (Kafka, Kinesis, file, IngestV2, etc.), a user-defined `source_id` string, and type-specific parameters (e.g., a Kafka topic and broker list). A single index can have **multiple sources** -- for example, `kafka-us-east` and `kafka-eu-west` feeding the same index, or an `_ingest-source` for real-time API ingestion alongside a `kafka-batch` source for batch processing. + +Like `node_id`, the `source_id` constraint in the merge scope is an **implementation choice**, not a checkpoint integrity requirement. Merge operations do not update checkpoints, so merging splits from different sources would not corrupt checkpoint tracking. 
The constraint exists because each node's merge planner is scoped to a single `(node_id, index_uid, source_id)` pipeline, keeping the implementation simple and coordination-free. In practice, merging across sources would rarely be useful anyway -- splits from different sources typically represent different data feeds (e.g., different Kafka topics or regions). + +**Location:** `quickwit/quickwit-config/src/source_config/mod.rs` (source configuration and types) + +**`partition_id`** -- Isolates tenants or other logical groupings. Keeping partitions separate enables query-time split pruning. See [Partition IDs](#partition-ids-tenant-isolation). + +**`doc_mapping_uid`** -- Identifies the schema version. Prevents merging splits built with incompatible schemas after a mapping change. + +### The StableLogMergePolicy + +**Location:** `quickwit/quickwit-indexing/src/merge_policy/stable_log_merge_policy.rs` + +The default merge policy implements a **logarithmic level-based strategy** inspired by LSM trees. Within each merge scope: + +**Step 1: Sort splits by recency.** Most-recent-first by `time_range.end`, then by doc count, then by split ID for determinism. + +**Step 2: Build logarithmic levels.** Splits are organized into levels based on document count. Each level is ~3x larger than the previous: + +``` +Level 0: [0 ... 300K docs) (min_level_num_docs * 3) +Level 1: [300K ... 900K) (3x previous) +Level 2: [900K ... 2.7M) (3x again) +... +Level N: [X ... 
10M) (split_num_docs_target) +``` + +**Step 3: Find merge candidates per level.** For each level (oldest-first), the policy accumulates splits until a threshold is hit: + +| Condition | Threshold | Effect | +|-----------|-----------|--------| +| Too few splits | `merge_factor` (default: 10) | Won't merge yet | +| Too many splits | `max_merge_factor` (default: 12) | Stop accumulating, merge what we have | +| Result too large | `split_num_docs_target` (default: 10M) | Stop accumulating, merge what we have | + +**Last-merge rule:** If adding one more split would exceed `split_num_docs_target`, merge with fewer than `merge_factor` splits. This prevents orphan splits that can never find enough merge partners at their level. + +**Split maturity exits:** Splits with >= 10M docs (size maturity) or older than 48h (time maturity) graduate out of merge eligibility entirely. + +### Physical Merge Process + +The merge scheduler coordinates execution. Source splits are downloaded from object storage, merged using Tantivy's `UnionDirectory`, and the result is uploaded as a new split. The merge publisher atomically updates PostgreSQL: publishes the new merged split and marks the old splits as `MarkedForDeletion`. + +
<details> +<summary>Merge pipeline actors (detail)</summary> + +1. **MergePlanner** -- queries PostgreSQL for immature splits, applies merge policy +2. **MergeScheduler** -- coordinates merge operations across indexes +3. **MergeSplitDownloader** -- downloads source splits from storage +4. **MergeExecutor** -- merges using Tantivy `UnionDirectory` +5. **MergePackager** -- packages the merged split +6. **MergeUploader** -- uploads the new split +7. **MergePublisher** -- atomically updates PostgreSQL + +**Location:** `quickwit/quickwit-indexing/src/actors/merge_planner.rs` + +</details>
+ +### Example + +**Setup:** 3 indexer nodes, all ingesting into the same index and source, `partition_id=1`. Each node has produced 13 splits of 100K docs. + +Each node's merge planner operates independently on its own 13 splits: + +1. All 13 splits are in Level 0 (all < 300K) +2. Walk backwards: 12 splits accumulated (Split 2 through 13) +3. 12 >= `merge_factor` (10), 12 <= `max_merge_factor` (12), total 1.2M < 10M target +4. Merge Splits 2-13 into one 1.2M-doc split (now Level 1) +5. Split 1 remains, will merge with future splits + +**Result across the cluster:** + +``` +Node 1: Split 1 (100K) + Split NEW-1 (1.2M) ← independent hierarchy +Node 2: Split 1 (100K) + Split NEW-2 (1.2M) ← independent hierarchy +Node 3: Split 1 (100K) + Split NEW-3 (1.2M) ← independent hierarchy +``` + +The same partition has 6 splits across the cluster (3 leftover + 3 merged), not 1 merged split. Each node compacts at its own pace. + +--- + +## Data Ownership and Scaling + +This section describes the most important architectural consequence of node-local compaction: **shards are ephemeral, but splits are permanent.** + +### The Core Asymmetry + +A **shard** can be closed and replaced by a new shard on a different node at any time -- during rebalancing, node departure, or scale-up. The control plane does this routinely. + +But once a split is published, its `node_id` is **immutable**. It is baked into the metadata in PostgreSQL. No mechanism exists to reassign a split's `node_id` after publication. The `belongs_to_pipeline()` function in the merge planner performs a strict equality check on `node_id`, so only a merge planner running on the *original* node can ever pick up that split for compaction. + +### What Happens When a Node Leaves + +1. The control plane detects the departure (via chitchat membership protocol) and triggers `rebalance_shards()`. +2. Old shards on the departed node are closed; new shards are opened on remaining nodes. +3. 
**New incoming data** flows to the new shards on surviving nodes. +4. **WAL data** on the departed node that was not yet converted to splits is **lost** (unless replication was configured, in which case the follower can take over). +5. **Published splits** from the departed node remain in PostgreSQL and object storage. They are still **fully queryable**. +6. **But they can never be compacted again.** No running merge planner has the departed node's ID, so `belongs_to_pipeline()` will never match those splits. + +``` +Before: Node A (alive) Node B (alive) + ┌──────────────┐ ┌──────────────┐ + │ Shard 1 (WAL)│ │ Shard 2 (WAL)│ + │ Splits: S1-S5│ │ Splits: S6-S9│ + │ MergePlanner ─┤ │ MergePlanner ─┤ + │ (node_id = A) │ │ (node_id = B) │ + └──────────────┘ └──────────────┘ + +After: Node A (alive) Node B (departed) + ┌──────────────┐ + │ Shard 1 (WAL)│ Splits S6-S9 still in + │ Shard 3 (new)│ object storage + Postgres + │ Splits: S1-S5│ ✓ Queryable + │ MergePlanner ─┤ ✗ Will never be compacted + │ (node_id = A) │ + └──────────────┘ +``` + +### What Happens When a Node Joins + +1. The control plane triggers rebalancing. +2. New shards may be allocated to the new node. +3. **No existing data migrates.** The new node only receives new incoming data through its new shards. +4. The new node's merge planner only compacts splits it creates itself. + +### Queries Are NOT Bound by node_id + +This is critical: while `node_id` controls which merge planner owns a split, **queries ignore `node_id` entirely**. + +Search uses **rendezvous hashing on `split_id`** to assign splits to searcher nodes, with load-aware balancing: + +```rust +// quickwit/quickwit-search/src/search_job_placer.rs:212 +sort_by_rendez_vous_hash(&mut candidate_nodes, job.split_id()); +``` + +The `node_id` field is not referenced anywhere in the `quickwit-search` module. Any searcher node can read any split from object storage regardless of which node created it. 
Orphaned splits from departed nodes are fully queryable -- the only impact is on split count. + +### Implications for Long-Running Clusters + +Over time, a cluster that experiences node churn accumulates small orphaned splits from departed nodes: + +- **They are still queryable** -- in object storage, with metadata in PostgreSQL. +- **They will never be compacted** -- no merge planner will claim them. +- **Query performance degrades proportionally to split count.** More splits means more units per query to open and scan. This is not a routing problem (queries find them fine) but a fan-out cost. +- **Retention policies still apply.** If time-based deletion is configured, orphaned splits will be cleaned up when they expire. + +``` +Time ──────────────────────────────────────────────► + +Node A: ████ ████ ████████ ████████████████████ (compacting normally) +Node B: ████ ████ ██── departed ── (4 small orphaned splits remain) +Node C: ████ ████ ████████ ████████████████ (joined later, compacting normally) +Node D: ████ ██── departed ── (2 small orphaned splits remain) + +Total orphaned small splits grows with each departure. +``` + +--- + +## How Queries Find Splits + +Queries are completely decoupled from `node_id`. The query path: + +1. **Coordinator** (root search) queries PostgreSQL for relevant splits, filtered by time range, tags, index, etc. The `node_id` field is not part of any query filter. +2. **Job placement**: each split becomes a search job. Jobs are assigned to searcher nodes via **rendezvous hashing on `split_id`** combined with load-aware balancing. The hasher sorts candidate nodes by affinity, then the placer assigns jobs to the first node under a ~5% load disparity target. +3. **Execution**: each searcher reads its assigned splits directly from object storage. No node needs to "own" the split to read it. +4. **Cache warming**: indexers can optionally notify searchers of newly published splits via `report_splits()` so the split cache can pre-warm. 
This uses `split_id` and `storage_uri`, not `node_id`. + +**Key point:** `node_id` is purely a compaction concept. It has no role in query routing or execution. + +--- + +## Document Routing Detail + +How documents get routed to specific nodes determines the `node_id` that gets stamped on splits, so routing directly shapes compaction topology. + +### Three Layers of Routing + +#### Layer 1: Control Plane Shard Allocation (strategic) + +The control plane decides which ingesters host which shards. When a router needs shards for an (index, source) pair, it requests them from the control plane. + +The `allocate_shards()` function selects ingesters using **least-loaded-first**: + +1. Count open shards per available ingester +2. Pick the ingester with the fewest open shards +3. Break ties randomly +4. If replication is enabled, pick two different nodes (leader + follower) + +**Location:** `quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs` + +#### Layer 2: Router Shard Selection (per-request) + +When an `IngestRouter` receives documents, it picks a shard from its routing table. + +The `next_open_shard_round_robin()` function: + +1. **Tries local shards first** -- shards where the router's own node is the leader +2. **Falls back to remote shards** -- shards on other nodes +3. **Round-robins within each category** -- uses an atomic counter +4. Skips closed or rate-limited shards + +**Location:** `quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs:141-165` + +A single request's documents all go to one shard. Distribution across shards happens across requests. + +#### Layer 3: Indexing Pipeline Scheduling (where splits get created) + +The control plane decides which node runs the indexing pipeline that consumes from each shard. The `build_physical_indexing_plan()` function: + +1. Accounts for CPU capacity of each indexer node +2. Uses **shard affinity** -- prefers scheduling indexing on the node that hosts the shard +3. 
Inflates capacities to 120% of total load for headroom +4. Works iteratively from the previous solution for stability + +**Location:** `quickwit/quickwit-control-plane/src/indexing_scheduler/scheduling/mod.rs` + +### How `node_id` Gets Stamped + +The split's `node_id` comes from the `IndexingPipelineId` of the node running the indexing pipeline: + +```rust +// quickwit/quickwit-indexing/src/models/indexed_split.rs:100-103 +split_attrs: SplitAttrs { + node_id: pipeline_id.node_id, + index_uid: pipeline_id.index_uid, + source_id: pipeline_id.source_id, + ... +} +``` + +### Can a Split's `node_id` Differ From Its Shard's `leader_id`? + +Yes, but rarely. The control plane prefers co-locating the indexing pipeline with the shard leader. However, if the leader node lacks CPU capacity, the indexing pipeline may run on a different node, reading from the shard remotely. In that case, the split's `node_id` is the indexer's node, not the shard leader. + +--- + +## Partition IDs: Tenant Isolation + +### What is partition_id? + +`partition_id` is a **u64 hash** used to isolate documents based on configurable field(s). It's one of the five components of the merge scope key, so splits with different `partition_id` values get independent compaction hierarchies and are never merged together. + +### How partition_id Is Computed + +The `partition_key` expression in the index config determines partitioning: + +```yaml +# Simple field-based +partition_key: service_name + +# Hash with modulo (limits to 100 distinct partitions) +partition_key: hash_mod(service_name, 100) + +# Composite fields +partition_key: hash_mod((service,division,city), 50) +``` + +**Computation flow:** + +1. Extract field values from the JSON document +2. Hash using SipHasher (salted with the expression tree) for deterministic partitioning +3. Apply modulo if configured +4. Result: u64 `partition_id` + +If no `partition_key` is configured, all documents get `partition_id = 0`. 
+ +**Location:** `quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs:501` + +### The OTHER_PARTITION_ID Overflow + +```rust +// quickwit/quickwit-indexing/src/actors/indexer.rs:60 +const OTHER_PARTITION_ID: u64 = 3264326757911759461u64; +``` + +When a single indexing workbench exceeds `max_num_partitions`, additional documents are mapped to this special catch-all partition to prevent memory explosion from tracking too many concurrent splits. + +--- + +## The Metrics Pipeline + +The metrics pipeline uses a **completely different implementation** from logs/traces. + +### Key Differences + +| Aspect | Logs/Traces (Tantivy) | Metrics (Parquet) | +|--------|----------------------|-------------------| +| **Storage format** | Tantivy segments (`.split`) | Parquet files (`.parquet`) | +| **Pipeline actors** | 8 (indexing) + 7 (merge) | 4 (no merge pipeline) | +| **Compaction** | StableLogMergePolicy | Not implemented | +| **WAL** | IngestV2 (ephemeral by default) | IngestV2 only (persistent volume) | +| **Metadata** | `SplitMetadata` (Postgres) | `MetricsSplitMetadata` (Postgres) | +| **Query engine** | Tantivy + custom code | DataFusion + Arrow | + +### Pipeline Architecture + +``` +Source → MetricsDocProcessor → MetricsIndexer → MetricsUploader → MetricsPublisher +``` + +- **MetricsDocProcessor** -- converts Arrow IPC to RecordBatch +- **MetricsIndexer** -- accumulates batches, writes Parquet splits +- **MetricsUploader** -- stages and uploads Parquet files to storage +- **MetricsPublisher** -- publishes metadata to PostgreSQL + +**Location:** `quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs:600-728` + +### Metrics Skip IngestV1 + +Metrics indexes are filtered out of IngestV1 scheduling: + +```rust +// quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs:219-222 +if is_metrics_index(&source_uid.index_uid.index_id) { + continue; // Skip IngestV1 source for metrics +} +``` + +### Persistent WAL for Metrics + +Metrics use persistent volumes 
for the WAL to survive pod restarts, preventing data loss during failures: + +```yaml +# k8s/eks/metrics.quickwit.dev.yaml +indexer: + persistentVolume: + enabled: true + storage: "10Gi" + storageClass: "gp3" +``` + +### Why No Compaction (Yet)? + +Metrics splits accumulate without compaction. This is tolerable in the short term because DataFusion can query many small Parquet files, and time-based retention eventually removes old data. But it is not ideal, and metrics compaction is a planned goal. + +### The Problem With the Current Architecture for Metrics + +The existing log/trace compaction system (StableLogMergePolicy) is a poor fit for metrics even if it were enabled on Parquet splits. The core issue is **data locality**. + +Metrics time series emit points on a periodic schedule (e.g., every 10 seconds). With load-balanced routing across nodes and shards, the points for any given time series are scattered across whichever nodes happened to receive them. Each node produces its own splits independently, so a single time series' data ends up fragmented across many small splits on many nodes. The node-local compaction model (merge scope bound by `node_id`) means these fragments can never be merged together -- each node only compacts its own portion. + +Logs and traces suffer from the same fundamental scattering -- data is load-balanced across nodes with no content-aware placement, so documents for a given service, trace ID, or tag combination are spread across splits on every node. The current compaction model can *get away with* this because log/trace queries typically scan by time range and filter by tags, so a full scan across all matching splits still produces correct results. But "correct" is not "efficient": every query must fan out to every split in the time range, with no ability to prune splits based on data content. The system works, but it leaves significant query performance on the table. + +For metrics the problem is more acute. 
Metrics queries often need to reconstruct a **single time series** across a time window (e.g., "plot CPU usage for host X over the last hour"), which means reading a point or two from each of many splits. The fan-out cost per query grows with the number of splits, and the lack of cross-node compaction means this never improves. + +### Future: Locality-Aware Compaction + +The eventual goal is a compaction system designed around data locality, drawing on approaches similar to [Husky's storage compaction](https://www.datadoghq.com/blog/engineering/husky-storage-compaction/). While the immediate motivation is metrics, the same approach would benefit logs and traces by enabling split pruning based on data content rather than requiring full scans. The key ideas include: + +- **Cross-node compaction** -- unlike the current model, locality-aware compaction must merge data regardless of which node produced it, since load balancing inherently distributes related data across nodes. This applies equally to metrics time series, log streams from a service, and spans from a trace. As discussed in [Why Each Key Component Matters](#why-each-key-component-matters), the current `node_id` constraint is an implementation choice for simplicity, not a data integrity requirement -- merge operations don't interact with checkpoints, so there is no fundamental obstacle to cross-node merging. The challenge is coordination, not correctness. +- **Sort-key-aware merging** -- reorganizing data by a sort schema (e.g., metric name + tags + timestamp, or service name + timestamp) so that related data is physically co-located within splits +- **Locality compaction** -- an LSM-inspired approach that progressively narrows each split's coverage of the sort-key space, creating non-overlapping segments. This enables **query pruning**: a query for a specific service or metric can skip entire splits whose key range doesn't overlap, rather than scanning everything in the time window. 
+- **Time bucketing** -- partitioning compaction by time windows, since queries use time as a primary filter and observability data is ephemeral + +A detailed design will be covered in a forthcoming document. + +--- + +## Configuration Reference + +### Merge Policy + +**File:** `quickwit-config/src/merge_policy_config.rs` + +```yaml +indexing_settings: + merge_policy: + type: stable_log + min_level_num_docs: 100000 # Minimum docs in Level 0 (default: 100K) + merge_factor: 10 # Minimum splits to trigger merge (default: 10) + max_merge_factor: 12 # Maximum splits per merge (default: 12) + maturation_period: 48h # Time until split becomes mature (default: 48h) + + split_num_docs_target: 10000000 # Target size for mature splits (default: 10M) + commit_timeout_secs: 60 # Commit timeout for indexer (default: 60s) +``` + +### Partition Key + +```yaml +doc_mapping: + partition_key: hash_mod(service_name, 100) +``` + +### IngestV2 + +```yaml +ingest_api: + max_queue_memory_usage: 2GB + max_queue_disk_usage: 100GB + +indexer: + enable_otlp_endpoint: true + data_dir: /quickwit/data # Must be on persistent volume for metrics +``` + +### Kubernetes Persistent Volume (Metrics WAL) + +```yaml +indexer: + persistentVolume: + enabled: true + storage: "10Gi" + storageClass: "gp3" +``` + +--- + +**Document Version:** 2.1 +**Last Updated:** 2026-02-11 +**Maintainer:** Engineering Team diff --git a/docs/internals/locality-compaction/phase-1-sorted-splits.md b/docs/internals/locality-compaction/phase-1-sorted-splits.md new file mode 100644 index 00000000000..fb540d27399 --- /dev/null +++ b/docs/internals/locality-compaction/phase-1-sorted-splits.md @@ -0,0 +1,512 @@ +# Phase 1: Sorted Splits for Parquet + +**Authors:** gtt@ **Date:** 2026-02-19 **Status:** draft **Scope:** Parquet splits only + +## Overview + +This document describes Phase 1 of locality-aware compaction for Quickwit, focused on the Parquet-format data in the metrics pipeline. 
Phase 1 introduces a configurable **sort schema** for Parquet-format indexes and ensures that all Parquet splits \-- whether produced at ingestion or by compaction \-- contain rows in sorted order according to that schema. It also introduces **time windowing**: all data is partitioned into fixed-duration, epoch-aligned time windows (default 15 minutes), and compaction is scoped to individual windows so that data is never merged across window boundaries. + +This is directly analogous to [Phase 1 of the Husky locality project](https://docs.google.com/document/d/1x9BO1muCTo1TmfhPYBdIxZ-59aU0ECSiEaGPUcDZkPs/edit), where sorting individual fragment files by a subset of columns achieved 25-33% compression improvement and measurable reductions in query latency and network bandwidth. + +Phase 1 does not introduce a new compaction planning algorithm or cross-node coordination. It changes how individual splits are written, so that the data within each split is physically organized in an order aligned with the most common query predicates. The Parquet pipeline, which has no compaction today, adopts the same size-tiered merging (m:1) used by the existing logs/traces pipeline, with the merge process modified to produce sorted output from sorted inputs via k-way merge. + +### Goals + +Phase 1 succeeds if: (a) the Parquet pipeline has size-tiered compaction within time windows, reducing split count per window over time; (b) all newly-written splits contain rows sorted according to the configured sort schema; (c) sort order is preserved through compaction merges; and (d) sorted splits achieve measurably better compression than unsorted splits (target: 20%+ reduction in Parquet file size). + +## Background and Motivation + +### The Data Scattering Problem + +Metrics data arrives at Quickwit through load-balanced routing: an external load balancer distributes requests across nodes, each node's `IngestRouter` picks a shard via round-robin, and the indexing pipeline produces splits stamped with the producing QW indexing node's identity.
This means that points for any given time series \-- identified by metric name, tags, and timestamp \-- are scattered across whichever nodes happened to receive them. + +Within each split, rows are stored in **ingestion order**: the order in which they arrived at the indexing pipeline. There is no relationship between the physical layout of rows and the logical structure of the data. A query for a specific metric name must scan all rows in every split in the time range. + +### Why Sorting Helps + +Sorting rows within each split by a schema aligned with common query predicates produces two immediate benefits: + +1. **Compression improvement.** Columnar formats like Parquet compress data by encoding runs of similar values. When rows are sorted by metric name and tags, the columns for those fields contain long runs of identical or similar values. This benefits multiple encoding layers: Parquet's native RLE and dictionary encoding produce compact representations for columns with repeated values, and general-purpose compression (ZSTD) compresses the encoded output further. In Husky Phase 1, this yielded \~33% size reduction for APM data and \~25% for Logs data. Analysis of single-file locality confirms that datacenter, service, and host columns provide strong locality benefits. + +2. **Query efficiency.** Parquet files typically contain a single row group, so row-group-level statistics provide no intra-file pruning. However, Parquet's **column index** (format v2) stores min/max statistics per page within each column chunk. When data is sorted, pages within each column naturally have non-overlapping value ranges for the sort columns. DataFusion supports page index pruning, allowing it to skip pages that cannot match a query predicate \-- for example, jumping directly to the pages containing `metric_name = "cpu.usage"` and skipping the rest of the file. 
+ +These benefits are achieved without any change to the compaction planning algorithm, query routing, or cluster coordination. They require only that the split writing path sorts rows before writing and that the merge path preserves sort order. + +### Prior Art: Husky Phase 1 + +In Husky, Phase 1 was implemented as follows: + +- A **sort schema** was defined per table/track, specifying a short list of columns to sort by, with an optional LSM cutoff and version suffix. Example for logs: `service__s|status__s|tag.env__s|timestamp|tiebreaker__i/V2` +- **Writers** sort rows within each fragment file by the sort schema before writing to object storage. +- **Compactors** perform sorted m:1 merges, reading the sort columns from each input fragment first, determining the global sort order, then streaming all columns through the merge in that order. +- **Sort columns are written first** in the output fragment, followed by remaining columns in lexicographic order by name. This is necessary in Husky's custom columnar format because columns are laid out sequentially with headers, so the sort columns must be physically first to avoid seeking past other columns during merge reads. +- **Null handling:** nulls sort after non-null values. + +The result was compression improvements of 25-33%, reduced compactor CPU (ZSTD compresses sorted data more easily), reduced network bandwidth, and reduced query latency at leaf reader nodes. + +## Data Model: Point Per Row + +Each row in a Parquet split represents a single data point: one metric value at one timestamp for one timeseries. This is in contrast to a "timeseries per row" model where each row would contain an array of timestamps and values for an entire series. + +Point-per-row is the right starting point for several reasons: + +- **Simpler compaction semantics.** Sorted k-way merge operates directly on rows. 
With timeseries-per-row, merging requires both row-level merge (interleaving rows from different splits) *and* intra-row series merge (combining the timestamp/value arrays of the same timeseries across splits). Point-per-row avoids this second level of merge entirely. +- **No last-write-wins (LWW).** We explicitly do not support LWW semantics, where a later write for the same timeseries and timestamp overwrites an earlier one. Without LWW, there is no need for sticky routing or series-level deduplication during compaction. This is a deliberate simplification that avoids the reliability challenges of sticky routing (single-partition overload, constant shuffles on rebalancing) that other systems have encountered. +- **No timeseries-level interpolation.** Interpolation across points in a timeseries is not performed at the storage layer. If needed in the future, it will operate at query time. This may be slower than storage-level interpolation but avoids coupling the storage format to query semantics. +- **Performance equivalence with good encoding.** With sorted data, columnar encodings like RLE (run-length encoding) and dictionary encoding produce long runs of repeated values in the sort columns -- the same runs that timeseries-per-row would capture by grouping values into arrays. When these encodings are preserved through query execution, point-per-row achieves comparable scan performance to timeseries-per-row without the implementation complexity. +- **Generic DataFusion improvements over custom code.** Timeseries-per-row requires significant custom DataFusion operator support (nested array types, custom aggregation kernels). Point-per-row uses standard columnar operations, allowing us to contribute generic improvements to DataFusion rather than maintaining timeseries-specific extensions. 
+ +**RLE and dictionary encoding in DataFusion.** Currently, RLE and dictionary encoding are lost relatively quickly through generic DataFusion operators -- decoded to plain arrays early in the query pipeline. There is significant ongoing investment in the **Flurry** project (the metrics equivalent of Bolt) to preserve these encodings through more of the execution pipeline. As Flurry matures, the performance benefits of sorted point-per-row data will increase, since longer runs in sorted columns translate directly to better RLE compression ratios that are maintained through query execution. + +## Sort Schema Definition + +### Configuration + +A sort schema is defined as part of the index configuration. It specifies an ordered list of columns that determine the sort order of rows within each split, and optionally additional columns for which metadata (min/max/regex) is emitted but which do not participate in sorting. + +Using the same shorthand as Husky, a sort schema uses the following format: + +``` +[schema_name=]column[+/-]|...[&column[+/-]|...]/V2 +``` + +The components are: + +- **Schema name** (optional): A name for the schema, followed by `=`. When present, it labels the schema for identification. Example: `metrics_default=metric_name|...` +- **Sort columns**: Pipe-delimited column names. These columns define the sort order. Each column may have a `+` (ascending) or `-` (descending) suffix. If omitted, the default direction is ascending, except for `timestamp` which defaults to descending. +- **LSM cutoff** (`&`): Separates sort columns from metadata-only columns. Columns listed after `&` are **not** used for sort ordering, but min/max/regex metadata is still emitted for them in the split metadata. This allows future query pruning on dimensions that don't participate in the physical sort order. +- **Version suffix** (`/V2`): Indicates version 2 of the sort schema format (the current version). 
+ +Each column in the schema has: + +- **Name:** The column name as it appears in the Parquet schema. +- **Sort direction:** Ascending (`+`, default for most columns) or descending (`-`, default for `timestamp`). Indicated by a suffix on the column name: `timestamp-` means sort timestamp descending (redundant with the default, but explicit). `timestamp+` would override the default to sort ascending. +- **Type:** Inferred from the Parquet schema. Supported types: + - **String/binary:** Sorted lexicographically by byte value. + - **Integer types** (i8, i16, i32, i64, u8, u16, u32, u64): Sorted numerically. + - **Float types** (f32, f64): Sorted numerically, with NaN handling matching IEEE 754 total order (positive NaN sorts after all other values, including +infinity; negative NaN sorts before all other values, including -infinity). +- **Null handling:** Null values sort **after** all non-null values for ascending columns, and **before** all non-null values for descending columns. In other words, null is treated as greater than any non-null value, so nulls always cluster at one end of each column's range: the end for ascending columns and the start for descending columns. This matches Husky's behavior. + +### Timeseries ID (Optional Locality Tiebreaker) + +The sort schema typically includes only a few high-value tag columns (e.g., `host`, `env`). When the explicit sort columns are not granular enough to distinguish individual point sources, points from different sources that share the same sort column values will be interleaved in the sorted output. For example, if the sort schema is `metric_name|env` and a single environment has hundreds of hosts, points from all those hosts are interleaved within each `(metric_name, env)` group. + +To improve locality in this case, the sort schema may optionally include a **`timeseries_id`** column: a hash of all tag names and values, placed after the explicit sort columns and before `timestamp`. This acts as a **tiebreaker** -- within each group defined by the explicit sort columns, it further clusters points that come from the same combination of tags before ordering by time.
This is purely a physical layout optimization for compression and scan efficiency. It is not a semantic concept, not part of the data model, and nothing in the query path or correctness of the system depends on it. + +**`timeseries_id` is optional.** If the explicit sort columns are already granular enough to distinguish individual point sources (e.g., the schema already includes `host` or `container`), `timeseries_id` adds little value and can be omitted from the sort schema. It is most useful when the sort schema is coarse (few columns, high-cardinality tags not included) and the operator wants better intra-group locality without adding more explicit sort columns. It can be added or removed from the sort schema at any time -- this is a schema change handled by the normal transition mechanism (new splits use the new schema, old splits age out via retention). + +When used, the hash function should be deterministic, fast, and produce good distribution. A suitable choice is xxHash64 or SipHash over the canonicalized (sorted by key name) tag key/value pairs. The exact hash function is an implementation detail; what matters is that the same set of tags always produces the same `timeseries_id`. + +**Limitations.** `timeseries_id` provides best-effort grouping, not a guarantee. Hash collisions (extremely unlikely with 64-bit hashes) would interleave two distinct point sources, but this affects only physical layout, not correctness. More practically, if a tag value flaps (e.g., a tag alternates between `NA` and an actual value due to intermittent enrichment), the hash changes and points from what a user would consider "the same source" end up with different `timeseries_id` values. This is inherent to any hash-of-all-tags approach and is acceptable because it only degrades locality, never correctness. If tag flapping is prevalent in a workload, omitting `timeseries_id` and relying on the explicit sort columns alone may be preferable. 
+ +### Schema Requirements + +- The sort columns (before `&`) should be a small subset of columns \-- typically 3-5 columns that correspond to the most common query predicates, optionally followed by `timeseries_id`, followed by `timestamp`. +- Columns referenced in the sort schema do not need to exist in the Parquet schema of every split. If a sort column is missing from a particular split (e.g., because the data predates a schema addition, or because a column is being introduced incrementally), all rows in that split are treated as having null values for that column. This is not an error condition. +- If used, `timeseries_id` is a synthetic column computed by the indexer, not present in the incoming data. It should appear immediately before `timestamp` in the sort columns. It is optional and may be omitted if the explicit sort columns provide sufficient granularity. +- The schema string must end with `/V2`. +- Metadata-only columns (after `&`) are optional. They do not affect sort order but have min/max/regex metadata emitted for future query pruning. + +### Example + +For a metrics index with data points identified by metric name, host tag, and environment tag, with `service` tracked as a metadata-only column: + +``` +metric_name|host|env|timeseries_id|timestamp&service/V2 +``` + +This sorts rows first by `metric_name` (ascending), then `host` (ascending), then `env` (ascending), then `timeseries_id` (ascending, clustering points from the same tag combination), then `timestamp` (descending, by default). The `service` column after `&` does not participate in sorting, but min/max/regex metadata is emitted for it to enable future query pruning. + +Within a split, all points for the source `cpu.usage{host="host-01", env="prod", region="us-east-1", instance="i-abc123"}` will be physically contiguous and ordered by timestamp, even though `region` and `instance` are not explicit sort columns -- the `timeseries_id` hash groups them together. 
+ +A schema without `timeseries_id`, relying on the explicit sort columns for all grouping: + +``` +metric_name|host|env|timestamp&service/V2 +``` + +Here, points are grouped by `(metric_name, host, env)` and then ordered by timestamp. Points from different sources that share the same `(metric_name, host, env)` but differ in other tags (e.g., `instance`) will be interleaved within each group. This is acceptable when `host` provides sufficient granularity. + +A minimal schema without metadata-only columns, a schema name, or `timeseries_id`: + +``` +metric_name|timestamp/V2 +``` + +### Storage + +The sort schema is stored in two places: + +**PostgreSQL (`MetricsSplitMetadata`)**. The schema string is stored alongside other split metadata so that: + +1. The compaction merge process knows the sort order of input splits. +2. Future phases can use the schema for query pruning and locality-aware compaction planning. +3. Schema changes can be detected \-- splits with different sort schemas are not merged together without re-sorting. + +**Parquet file metadata**. The schema is also embedded in the Parquet file itself, making each file self-describing: + +1. **`key_value_metadata`**: The full sort schema string is stored as a key-value pair in the Parquet file-level metadata (key: `sort_schema`, value: the schema string, e.g., `metric_name|host|env|timeseries_id|timestamp&service/V2`). This preserves the complete schema including LSM cutoff, metadata-only columns, and version suffix. + +2. **`sorting_columns`**: The sort columns (those before `&`) are also declared using Parquet's native `sorting_columns` field in the file metadata. Each entry specifies the column index, ascending/descending, and nulls-first/nulls-last. This allows Parquet-native tooling and DataFusion to leverage the sort order without understanding our custom schema format. 
+ +## Time Windowing + +### Concept + +All compaction in the Parquet pipeline is scoped to **time windows**: fixed-duration, non-overlapping intervals of wall-clock time aligned to the Unix epoch. Splits are assigned to a time window based on the timestamps of the data they contain, and compaction only merges splits within the same window. Data is never merged across window boundaries. + +This is directly analogous to Husky's time bucketing, where each fragment belongs to a single time window and queries use time as a primary filter to restrict the set of fragments that must be examined. Time windowing provides a natural partitioning dimension that: + +1. **Bounds compaction scope.** Each window is an independent compaction unit. The compactor processes windows independently, and the total amount of data eligible for a single merge operation is bounded by the window duration and ingestion rate. +2. **Aligns with query patterns.** Observability queries always include a time range predicate. When splits are organized by time window, the query engine can immediately discard all windows outside the query range without examining individual split metadata. +3. **Enables retention.** Dropping old data becomes a window-level operation: all splits in windows older than the retention period can be deleted as a batch, without needing to inspect individual rows. The retention period is user-configured per index (e.g., 15 days, 90 days) and is not specified by this design -- it is an existing Quickwit capability that time windowing makes more efficient. +4. **Limits compaction write amplification.** Because windows are independent, data is only rewritten within its window. Old, fully-compacted windows are never disturbed by new data arriving in newer windows. + +### Window Configuration + +| Parameter | Default | Description | +| :---- | :---- | :---- | +| `window_duration` | 15 minutes | The duration of each time window. 
Must evenly divide one hour (valid values: 1m, 2m, 3m, 4m, 5m, 6m, 10m, 12m, 15m, 20m, 30m, 60m). | +| `compaction_start_time` | (required) | Unix timestamp (seconds). Only data in time windows at or after this time is eligible for sorted compaction. Data before this time is left as-is and expires via retention. Should be set to the time Phase 1 is enabled (or the start of the next window boundary after enablement). | +| `late_data_acceptance_window` | 1 hour | Maximum age of a data point (wall-clock time minus point timestamp) accepted at ingestion. Points older than this are dropped. Bounds the window of time during which late data can disturb already-compacted windows. Should be set based on product lateness guarantees (e.g., 1h for metrics, 3h for HSI-style use cases). | + +Windows are aligned to the Unix epoch. A 15-minute window duration produces windows at `[00:00, 00:15)`, `[00:15, 00:30)`, `[00:30, 00:45)`, `[00:45, 01:00)`, and so on. The window containing a given Unix timestamp `t` is computed as: + +``` +window_start = t - (t % window_duration_seconds) +window_end = window_start + window_duration_seconds +``` + +Each window is identified by its `window_start` timestamp (seconds since Unix epoch). + +### Split-to-Window Assignment + +At ingestion time, a split is assigned to a time window based on the timestamps of the rows it contains. Because the indexing pipeline accumulates rows over a commit interval before flushing a split, a single split may contain rows spanning more than one time window. + +When a split contains rows from multiple windows, the split is **partitioned by window** before writing: rows are grouped by their window assignment, and a separate Parquet file is written for each window. Each output split belongs to exactly one time window. This ensures the invariant that every split in object storage is associated with exactly one window. 
+ +The window assignment uses the same timestamp column referenced in the sort schema (typically `timestamp`). If a row has a null timestamp, it is assigned to a designated overflow window (window\_start \= 0), which is compacted separately. + +### Compaction Scope + +The full compaction scope for splits becomes: + +``` +(index_uid, source_id, partition_id, doc_mapping_uid, sort_schema, window_start) +``` + +The components are: + +- **`index_uid`**: Unique identifier for the Quickwit index (e.g., a metrics index). Each index has its own configuration, schema, and retention policy. +- **`source_id`**: Identifies the data source feeding the index (e.g., a Kafka topic or push API endpoint). Different sources may produce data with different characteristics or schemas. +- **`partition_id`**: A partition within the source (e.g., a Kafka partition). Splits from different partitions are kept separate to preserve ordering guarantees within a partition. +- **`doc_mapping_uid`**: A unique identifier for the document mapping (schema) version. When the index schema changes (columns added/removed/retyped), a new `doc_mapping_uid` is assigned. This prevents merging splits with incompatible schemas. +- **`sort_schema`**: The sort schema string (as defined above). Prevents merging splits sorted with different schemas. +- **`window_start`**: The time window start timestamp. Prevents merging data from different time windows. + +Only splits sharing all components of this scope key may be merged together. + +Note: `node_id` is intentionally excluded from the compaction scope. In Phase 1, this is a forward-looking design choice \-- initially, each node compacts its own splits (the current behavior), but the scope definition does not require it. Phase 2 lifts the node constraint to enable cross-node compaction. + +### Late-Arriving Data + +Data may arrive late \-- a data point with a timestamp in a past window may be ingested after that window has already been compacted. 
+ +**Late data acceptance window.** Points with timestamps older than a configurable maximum age are **dropped at ingestion time** rather than accepted into the storage layer. This bounds the window of time during which late data can disturb already-compacted windows. The acceptance window should be set based on the product's lateness guarantees (e.g., Datadog Metrics advertises 1 hour after the point's timestamp; HSI accepts points up to 3 hours late). Without this cutoff, arbitrarily late data -- driven by customer behavior such as delayed batch uploads or misconfigured clocks -- can trigger expensive re-merges of fully-compacted windows indefinitely. + +Within the acceptance window, late-arriving data is handled naturally: + +- The late data is written to a new split assigned to the historical window (based on timestamp, not ingestion time). +- The next compaction cycle for that window picks up the new split and merges it with existing compacted splits. +- There is no special handling required; the time window simply gains additional splits that are merged in the normal course. + +For windows that have already been fully compacted into a single large split, a late-arriving small split triggers a merge of the large existing split with the small new one. The late data acceptance window bounds how far back this can happen, keeping the number of affected windows small and predictable. + +## Sorting at Ingestion + +### Current Metrics Ingestion Pipeline + +``` +Source -> MetricsDocProcessor -> MetricsIndexer -> MetricsUploader -> MetricsPublisher +``` + +The `MetricsIndexer` accumulates `RecordBatch` batches and writes them as Parquet splits. The current batching thresholds are 128 MiB of in-memory data or 1M rows, whichever is reached first. Due to the high compression ratio of metrics data, this produces very small Parquet files -- currently ~600 KiB on disk. 
These thresholds will likely need to be revisited to produce larger ingestion-time splits and reduce the split count that compaction must handle. The indexer already sorts rows within each batch using `lexsort_to_indices` on a fixed set of columns (`MetricName`, `TagService`, `TagEnv`, `TagDatacenter`, `TagRegion`, `TagHost`, `TimestampSecs`). Phase 1 makes this sort schema configurable, adds time window partitioning, and optionally adds the `timeseries_id` column (if present in the sort schema). + +**Location:** `quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs:600-728` + +### Modified Pipeline + +The `MetricsIndexer` (or, if more appropriate, the split writer at `quickwit/quickwit-metrics-engine/src/storage/split_writer.rs`) is modified to sort the accumulated `RecordBatch` data by the sort schema before writing the Parquet file. + +The windowing, optional timeseries ID computation, and sorting happen in-memory before the Parquet writer begins. The steps are: + +1. **Accumulate rows** into one or more Arrow `RecordBatch` arrays as today. +2. **Compute timeseries ID (if configured).** If the sort schema includes `timeseries_id`, compute it for each row by hashing the canonicalized (sorted by key name) set of all tag names and values \-- both explicit tag columns (`tag_service`, `tag_env`, `tag_host`, etc.) and dynamic attributes. Add `timeseries_id` as a new column in the `RecordBatch`. This column persists through compaction and does not need to be recomputed during merges. If the sort schema does not include `timeseries_id`, this step is skipped. +3. **Partition by time window.** Group rows by their time window assignment based on the timestamp column. Each group contains only rows whose timestamps fall within a single window `[window_start, window_start + window_duration_seconds)`. If the accumulated batch spans multiple windows (common when the commit interval straddles a window boundary), separate groups are produced for each window. +4. 
**For each window group:** + 1. **Extract sort columns** from the group's rows. These are the columns named in the sort schema (including `timeseries_id` if configured). + 2. **Compute sort indices.** Using Arrow's `lexsort_to_indices` (or equivalent), compute a permutation array that represents the sorted order of all rows across the sort columns, respecting the direction and null ordering specified in the schema. + 3. **Apply permutation.** Reorder all columns (not just sort columns) according to the computed permutation using Arrow's `take` kernel. + 4. **Write Parquet file.** Column ordering within the Parquet file does not matter \-- unlike Husky's sequential columnar format, Parquet stores column chunk offsets in the footer and the reader can seek directly to any column. Columns may be written in any order. The Parquet writer must be configured to produce the **column index** (page-level min/max statistics) and the **offset index** (page byte offsets and row counts). These are opt-in features of Parquet format v2 that must be explicitly enabled at write time; without them, DataFusion cannot perform page-level predicate pushdown on the sorted data. + 5. **Record metadata.** The split's metadata records the sort schema, `window_start`, and `window_duration_secs`, so that merges and queries can rely on the sort order and window assignment. + +In the common case where all accumulated rows fall within a single 15-minute window, this produces one split \-- the same as today. Only when a commit straddles a window boundary are multiple splits produced for a single commit. + +### Cost + +Sorting is O(n log n) in the number of rows per split. For typical metrics splits (100K-500K rows), this is inexpensive relative to the cost of Parquet encoding, compression, and upload. 
In Husky Phase 1, the sorting overhead was approximately 2% additional CPU at the compactor, but this was more than offset by reduced ZSTD compression cost on the better-organized data, resulting in a net CPU reduction. + +## Sorted Merge at Compaction + +### Current State + +The metrics pipeline currently has **no compaction**. Splits accumulate without merging, relying on DataFusion to query many small Parquet files and on time-based retention to eventually remove old data. + +### Introducing Basic Compaction with Sorted Merge + +Phase 1 introduces size-tiered compaction for the metrics pipeline, producing sorted output. This is the same basic approach as the existing logs/traces `StableLogMergePolicy` adapted for Parquet splits, with the addition that merges maintain sort order. + +The merge process for combining N sorted input splits into one or more sorted output splits works as follows: + +1. **Read sort columns** from each input split. Only the columns named in the sort schema need to be read initially. Parquet's footer-based format allows seeking directly to any column regardless of its physical position in the file. + +2. **Compute global sort order.** Perform a k-way merge across the sort columns of all input splits. This produces a merge order indicating how rows from each input should appear in the output. This is the same approach Husky uses: the sort order is determined first and stored in an array, then columns are streamed through the merge. + + The k-way merge uses a min-heap (priority queue) with one entry per input split, comparing rows using the sort schema's comparison rules (lexicographic for strings, numeric for numbers, nulls-last for ascending, nulls-first for descending). For N input splits, each row comparison is O(c) where c is the number of sort columns, and advancing through all rows takes O(R log N) row comparisons where R is the total row count across all inputs.
+ + The merge order is represented as a run-length encoded sequence of `(split_index, start_row, row_count)` triples rather than individual `(split_index, row_index)` pairs. Because input splits are sorted, the k-way merge naturally produces long contiguous runs from the same input \-- particularly as locality compaction matures and inputs contain increasingly well-sorted data. This representation compresses the merge order significantly and enables bulk operations (bulk `take`, bulk copy) when streaming columns through the merge, rather than processing rows individually. + +3. **Stream columns through the merge.** Once the global sort order is determined, each column is read from the input splits and written to the output in the sorted order. Columns are processed one at a time (or in small groups for memory efficiency), reading from all input splits and writing to the output according to the sort order array. This keeps memory usage proportional to the number of input splits times the size of one column's data, rather than requiring all data in memory simultaneously. + +4. **Emit split metadata.** The output split's metadata records the sort schema, `window_start`, `window_duration_secs`, and min/max/regex values for all columns in the schema (both sort and metadata-only). + +### Column Set Differences Across Inputs + +The Parquet reader always reads the file footer first \-- the footer contains the full schema and row group metadata, and is the entry point for reading any Parquet file. Discovering each input split's column set is therefore inherent to opening the file, not an extra step. + +Input splits may not have identical column sets. Schema evolution (adding or removing columns over time) means that splits from different time periods may have different columns. The merge handles this as follows: + +- **Sort columns.** If a sort column is missing from an input split, all rows from that split are treated as having null values for that column. 
Nulls sort after non-null values (ascending) or before non-null values (descending), as specified in the sort schema. The k-way merge handles this naturally. + +- **Non-sort columns.** The merge computes the **union** of all column names across all input splits. The output split contains every column that appears in at least one input. When streaming a column through the merge, rows originating from inputs that lack that column are filled with nulls. The output Parquet schema uses the type from whichever input(s) contain the column; if multiple inputs have the same column name with different types, the merge fails with an error (this indicates a schema evolution conflict that must be resolved at the index configuration level). + +### Compaction Scope Mismatches + +Splits with different sort schemas must not be merged together. The merge planner groups splits by the full compaction scope key \-- `(index_uid, source_id, partition_id, doc_mapping_uid, sort_schema, window_start)` \-- and only merges splits within the same group. The `window_start` constraint ensures that data from different time windows is never combined, even if all other scope components match. + +### Pre-existing Unsorted Data + +Splits produced before Phase 1 is enabled have no sort schema and no window assignment. These splits are **not compacted** \-- they remain as-is until they expire via retention. There is no attempt to sort or merge pre-existing data. + +A configurable cutoff time (`compaction_start_time`) defines the boundary: only splits whose `window_start` is at or after this time are eligible for compaction. Splits with no window assignment (pre-Phase-1) or with `window_start` before the cutoff are excluded from compaction planning entirely. + +This avoids the complexity of merging sorted and unsorted inputs. The transition is clean: once Phase 1 is enabled with a cutoff time, all new data is windowed and sorted from that point forward. 
Old data ages out via retention without ever being rewritten. + +Data that arrives after Phase 1 is enabled but has timestamps before `compaction_start_time` is still written as a sorted, windowed split (the indexer always applies windowing and sorting once Phase 1 is active). However, these splits are not eligible for compaction and will age out via retention alongside pre-existing unsorted splits. + +### Comparison with Husky + +This is the same approach used by Husky's compactor: + +- Sort columns are read first to determine merge order. +- An index array captures which rows of each input go to which positions in the output. +- Columns are then streamed through the merge one at a time. + +The differences from Husky are: the storage format (Parquet vs. Husky's custom columnar format), the merge planning algorithm (Quickwit's StableLogMergePolicy adapted for metrics vs. Husky's size-tiered \+ LSM composite planner), and the absence of column ordering constraints (Parquet's footer-based layout makes physical column order irrelevant, unlike Husky's sequential format where sort columns must be written first). Phase 1 does not change the merge planning algorithm \-- it only changes how the merge *executes* (sorted output instead of arbitrary order). + +### Compaction Policy + +Phase 1 uses Quickwit's existing compaction scheduling and the `StableLogMergePolicy` adapted for Parquet splits. The compactor runs on the same schedule and with the same triggering logic as for the logs/traces Tantivy pipeline. Within each time window, the merge policy uses the same maturity/age constraints as `StableLogMergePolicy` to determine when splits are eligible for merging. + +The key parameters that need to be determined experimentally are: + +- **Target split size after compaction.** How large should the output of a merge be? This determines when the compactor stops merging within a window. 
Too small and we still have many splits per window; too large and individual merges are expensive. +- **Merge fanin.** How many input splits per merge operation? Higher fanin reduces total write amplification but increases per-merge memory and CPU cost. +- **Interaction with window size.** For a 15-minute window at a given ingestion rate, how many splits accumulate before compaction, and what is the steady-state split count after compaction converges? + +These parameters should be determined via experiments on representative metrics workloads before finalizing the merge policy configuration. Suggested experiments: + +1. **Baseline measurement.** For a representative metrics index, measure the number of splits produced per 15-minute window, the size of each split, and the total data volume per window. +2. **Merge fanin sweep.** For a fixed target split size, vary the merge fanin (e.g., 4, 8, 16 input splits) and measure merge duration, peak memory usage, and write amplification. +3. **Target size sweep.** For a fixed fanin, vary the target output split size (e.g., 64MB, 128MB, 256MB, 512MB) and measure steady-state split count per window, query latency, and compression ratio. +4. **Compression improvement.** Compare sorted vs. unsorted Parquet file sizes for the same data to validate the expected 20-35% compression improvement. + +In the future, the compaction planner may benefit from a hinting mechanism similar to Husky's, where the system can signal that a particular window needs compaction (e.g., due to late-arriving data or a schema change). This would replace the current polling-based approach with event-driven compaction for specific windows. + +--- + +## Split Metadata Changes + +The following fields are added to `MetricsSplitMetadata`: + +| Field | Type | Description | +| :---- | :---- | :---- | +| `window_start` | `i64` | The Unix timestamp (seconds) of the start of the time window this split belongs to. 
Computed as `timestamp - (timestamp % window_duration_secs)`. All rows in the split have timestamps within `[window_start, window_start + window_duration_secs)`. | +| `window_duration_secs` | `u32` | The time window duration in seconds that was in effect when this split was produced. Stored per-split to detect configuration changes. | +| `sort_schema` | `String` | The full sort schema string including version suffix (e.g., `metric_name\|host\|env\|timeseries_id\|timestamp&service/V2`). Empty string if the split was produced before Phase 1\. Parsed internally in Rust code, but stored and compared as a string. | +| `schema_column_min_values` | `Vec<SortColumnValue>` | The minimum value of each column in the schema (both sort and metadata-only columns, in order of appearance in the schema string). | +| `schema_column_max_values` | `Vec<SortColumnValue>` | The maximum value of each column in the schema, in the same positional order. | +| `schema_column_regexes` | `Vec<String>` | A regex pattern for each column in the schema that matches any value present in this split. The computation method follows the existing Husky implementation. | + +The `SortColumnValue` type is a tagged union supporting string, i64, u64, f64, and null. All three vectors are positional: element 0 corresponds to the first column in the sort schema string, element 1 to the second, and so on, covering both sort columns and metadata-only columns (those after `&`). + +These metadata fields are recorded in **both** PostgreSQL and the Parquet file itself: + +**PostgreSQL (`MetricsSplitMetadata`)**: All fields above are stored in PostgreSQL alongside the existing split metadata. They are populated at split publication time \-- both for initial ingestion splits and for merge output splits. PostgreSQL storage enables split-level query pruning without reading any Parquet data. + +**Parquet `key_value_metadata`**: The min/max/regex values are also embedded in the Parquet file's `key_value_metadata`, making each file self-describing.
The following keys are written: + +| Key | Value | +| :---- | :---- | +| `sort_schema` | The full sort schema string (as described in [Storage](#storage) above). | +| `schema_column_min_values` | JSON-serialized array of min values, positional by schema column order. | +| `schema_column_max_values` | JSON-serialized array of max values, positional by schema column order. | +| `schema_column_regexes` | JSON-serialized array of regex strings, positional by schema column order. | +| `window_start` | The window start timestamp as a decimal string. | +| `window_duration_secs` | The window duration in seconds as a decimal string. | + +Storing metadata in the Parquet file ensures that files remain interpretable without access to the metastore \-- useful for debugging, offline analysis, and disaster recovery. The PostgreSQL copy is the authoritative source for query planning and compaction. + +Note: Parquet also stores min/max statistics natively \-- per row group in the column chunk metadata, and per page in the column index (format v2). Since metrics splits are typically single-row-group files, the per-page column index is the relevant mechanism for intra-file pruning. The split-level min/max/regex metadata described here enables coarser-grained pruning across splits \-- skipping entire splits at query planning time \-- which will be leveraged in future phases. + +**Note on window\_duration\_secs:** If the configured window duration changes, splits produced under the old duration have a different `window_duration_secs` value. The merge planner treats splits with different window durations as incompatible for merging (they have different `window_start` alignment). Old-duration splits age out via retention. No rewrite is needed. + +## Expected Benefits + +Based on Husky Phase 1 results: + +### Compression + +- **Estimated 20-35% reduction in Parquet file size** for metrics data when sorted by metric name \+ tags \+ timestamp. 
Metrics data has high tag-value repetition (many points for the same metric/host/env combination), which sorted columnar layout compresses very efficiently. +- The actual improvement depends on the cardinality of the sort columns and the distribution of values. Metrics data, with its regular time-series structure, is likely to see compression gains at the high end of this range. + +### CPU + +- Sorting adds approximately 2% CPU overhead at the indexer/compactor. +- This is expected to be offset by reduced compression cost (ZSTD/Snappy work less on better-organized data) and smaller output files requiring less S3 upload bandwidth. +- Net CPU effect: approximately neutral or slightly positive (net reduction), as seen in Husky. + +### Query Latency + +- Since metrics splits are typically single-row-group files, intra-file pruning relies on Parquet's page-level column index rather than row-group statistics. When data is sorted, pages within each column have non-overlapping value ranges, and DataFusion's page index pruning can skip pages that don't match query predicates on sort columns. +- For queries with predicates on `metric_name` or other leading sort columns, this reduces the volume of data scanned per split. +- This benefit increases in subsequent phases when split-level min/max/regex metadata enables pruning entire splits at query planning time. + +### Storage Cost + +- Direct cost reduction proportional to compression improvement. +- For metrics workloads with long retention, storage cost dominates, so a 25-30% compression improvement translates to a comparable cost reduction. + +## Operational Considerations + +### Rollout + +- **Decoupled from queries.** Sorted splits are fully compatible with the existing query engine. DataFusion can read Parquet files regardless of row order. The sort order is a performance optimization, not a correctness requirement. +- **Clean cutoff.** The `compaction_start_time` parameter defines a hard boundary. 
Data before this time is never compacted or rewritten \-- it remains queryable as-is and expires via retention. Data from this time forward is windowed, sorted, and compacted. There is no mixed sorted/unsorted merge path. +- **Sort schema changes.** If the sort schema is changed, new splits use the new schema. The merge planner prevents merging splits with different schemas. Old-schema splits age out via retention. No rewrite or backfill is needed. +- **Window duration changes.** If the window duration is changed, new splits use the new duration and alignment. Splits with different `window_duration_secs` are not merged together. Old-duration splits age out via retention. + +### Monitoring + +Key metrics to track: + +| Metric | What It Measures | Threshold | +| :---- | :---- | :---- | +| `split_size_bytes` (before/after) | Compression improvement | Expect 20-35% reduction | +| `indexer_cpu_usage` | CPU cost of sorting | Expect \< 5% increase | +| `compaction_cpu_usage` | CPU cost of sorted merge | Expect neutral or decrease | +| `compaction_duration` | Time to complete a merge | Should not increase significantly | +| `parquet_pages_scanned` | Query-time page index pruning | Expect reduction for predicate queries on sort columns | + +### Scale Considerations + +The split counts at high ingestion rates are significant. As a concrete example: + +- At 10 GiB/s ingestion with 10 MiB splits (the current batch threshold), the system produces ~1,024 splits per second. +- Over a 15-minute window, this accumulates up to ~921,600 splits before compaction. +- After compaction to ~1 GiB files with a ~10x compression ratio (typical for Husky), a 15-minute window still contains ~4,500 Parquet files. + +This has two implications: + +**Variable window duration.** At the highest scale, 15-minute windows may be too coarse. Shorter windows (down to 1 minute, the minimum supported) reduce the number of splits per window and bound the compaction working set. 
The `window_duration` parameter is configurable precisely for this reason. Operators should tune it based on ingestion rate: higher throughput warrants shorter windows. + +**Metadata scalability.** The current design stores split metadata in PostgreSQL. At ~921K pre-compaction splits per window, or ~4,500 post-compaction splits per window across many windows and indexes, the metadata volume can exceed what a single OLTP database handles efficiently for query planning lookups. Phase 1 uses PostgreSQL as the initial implementation, but the metadata architecture must be prepared for a shift away from "one OLTP DBMS for everything." + +The design mitigates this in two ways: + +1. **Self-describing Parquet files.** All metadata (sort schema, min/max/regex, window assignment) is embedded in the Parquet file's `key_value_metadata`. The external metadata store is an index for query planning, not the sole source of truth. This means the metadata layer can be replaced or supplemented without re-writing data files. +2. **Structured metadata.** The metadata fields are simple, typed, and positional. They can be stored in any system that supports efficient range queries and filtering -- a dedicated metadata service, a columnar store, or a distributed key-value store. The PostgreSQL schema should be designed with this future migration in mind, keeping the metadata representation portable rather than relying on PostgreSQL-specific features. + +### Failure Modes + +- **Clock skew within the acceptance window.** Data arriving within the late data acceptance window but for past windows can trigger re-merges of already-compacted windows. The acceptance window bounds the blast radius, but sustained late data (e.g., a source with systematic clock skew) can cause repeated compaction churn for recent windows. Monitoring should track the age distribution of late-arriving data and alert on sources that consistently submit near the acceptance window boundary. 
+- **Many small windows.** If the window duration is set too small relative to the commit interval, each commit may produce splits for many windows, each containing very few rows. This increases split count and metadata overhead. The window duration should be significantly larger than the commit interval (the default 15 minutes is appropriate for typical commit intervals of 30-60 seconds). + +## Future Phases + +Phase 1 establishes the foundation for locality-aware compaction. Subsequent phases will build on the sorted-split infrastructure: + +- **Phase 1.5: Affinity-based shard routing.** Currently, the `IngestRouter` assigns shards via round-robin, scattering data for the same metric across all nodes. An intermediate optimization is to use consistent hashing on a prefix of the sort key (e.g., metric name) to bias shard selection, so that data for the same metric tends to land on the same shard. This is not strict partitioning (each shard still receives a mix of metrics, avoiding tiny files for low-throughput metrics), but probabilistic affinity that improves locality within each node's splits before full cross-node compaction exists. This is orthogonal to the sort/compaction design in Phase 1 and can be implemented independently. Its benefit compounds with sorting: data that is already co-located by metric within a shard produces longer contiguous runs when sorted, improving both compression and merge efficiency. + +- **Phase 2: Cross-node compaction and m:n sorted merge.** Lift the `node_id` constraint from the merge scope, enabling compaction across all splits in a time range regardless of producing node. Introduce m:n merges that spread rows across the sort-key space into non-overlapping output splits, enabling split-level query pruning. This is analogous to Husky Phase 2. 
Affinity routing from Phase 1.5 remains valuable even after Phase 2: if data is already partially co-located by metric on the same shard, cross-node compaction moves less data, has lower write amplification, and produces longer contiguous runs in the merge order. The two are complementary -- routing reduces the cost of compaction, and compaction achieves the degree of locality that routing alone cannot guarantee (tunable via overlap/cost knobs as in Husky). + +- **Phase 3: Query pruning.** Use the per-split min/max/regex metadata (recorded in Phase 1\) to skip entire splits at query planning time when the query predicates fall outside a split's value range, following the approach used in Husky. + +- **Phase 4: Logs/traces extension.** Apply the same sort schema and sorted merge approach to the Tantivy pipeline (logs/traces), where the benefits are primarily in split pruning (reduce full fan-out across all splits in a time range) and improved compression from sorted Tantivy fast fields. + +- **Research: Wide tables.** Metrics from the same source (host, container) share nearly identical tags, so multiple metric names could be stored as separate value columns in a single wide row rather than as individual point-per-row entries (e.g., `k8s.cpu.usage`, `k8s.cpu.limit`, `k8s.mem.usage`, `k8s.mem.limit` as four columns sharing one tag set). This is the approach taken by TimescaleDB's hypertables. It would amortize tag storage across metrics and improve compression further, but requires significant compactor changes: merging splits with disjoint value column sets and reassembling wide rows from narrow inputs across files. Worth investigating as a future research project. 
+ +## References + +- [Quickwit Split Compaction Architecture](https://docs.google.com/document/d/110XhPgBYyDVpmVtbUhYFFEIvbgFRTNjEfVy7DF0doBg/edit?pli=1&tab=t.0#heading=h.l153qiq0ul1j) +- [Husky Phase 1: Locality of Reference](https://docs.google.com/document/d/1x9BO1muCTo1TmfhPYBdIxZ-59aU0ECSiEaGPUcDZkPs/edit) +- [Husky Phase 2: Locality of Reference (Summary)](https://docs.google.com/document/d/1vax-vv0wbhfddo4n5obhlVJxsmUa9N_62tKs5ZmYC6k/edit?tab=t.0#heading=h.o5gstqo08gu5) +- [Husky Phase 2: Locality of Reference (RFC)](https://docs.google.com/document/d/1FTiF2BNUjBMZ0tc2-vokXRwMH7J_rWm6Ikjf6S-ZJSg/edit?tab=t.0#heading=h.43mgnu7vcd3h) +- [Husky Storage Compaction Blog Post](https://www.datadoghq.com/blog/engineering/husky-storage-compaction/) +- [Single-File Locality Analysis](https://docs.google.com/document/d/1XaKsBCL7hcZSrJFck2GU9tBYFXGDUsePx_iWRiPJB-8/edit?tab=t.0#heading=h.cfmyc1w1736s) -- Adam's analysis showing datacenter, service, host locality works well within single files; relative scores should transfer to global locality. + +--- + +## Appendix: Critical Analysis + +*The following analysis was produced by asking Claude to re-read the document and respond to this prompt:* + +> 1. Identify the 3-5 non-obvious insights -- things that aren't stated explicitly but can be inferred from the content. Skip anything the author already highlights as a key point. +> 2. Find the tensions or contradictions. Where does the argument conflict with itself, or with conventional wisdom? What's left unresolved? +> 3. Extract the "so what." If a smart, busy person could only take away one actionable implication from this, what would it be and why? +> 4. Name what's missing. What question does this document raise but never answer? What would you want to know next? 
+ +### Non-obvious insights + +**The optional timeseries_id tiebreaker has a hidden compression benefit beyond tag columns.** When `timeseries_id` clusters points from the same source, the value columns (the actual metric values) also become more compressible -- you get runs of values from the same source, which often have temporal coherence (slowly-changing values, predictable patterns). The doc frames the compression benefit as coming from sorted tag columns having long runs, but the bigger win may be what happens to the value columns when sources are grouped. This is worth measuring: compare compression with and without `timeseries_id` in the sort schema to see if the tiebreaker justifies its cost. + +**The run-length encoded merge order creates a feedback loop.** The doc notes that sorted inputs produce longer contiguous runs. But it doesn't make explicit that this is self-reinforcing across compaction generations: each compaction cycle produces better-sorted outputs, which means the next merge has longer runs, which means the merge order is smaller and cheaper, which makes it practical to do larger merges. The system gets cheaper to compact over time, not more expensive. This is the opposite of naive size-tiered compaction where write amplification grows. + +**Time windowing implicitly caps query fan-out, not just compaction scope.** The doc frames windowing as a compaction concern (bounding merge scope, limiting write amplification). But for queries, it also means the query planner knows that any split in a window outside the query's time range is irrelevant *without consulting min/max metadata*. This is free pruning that works today, before Phase 3, and it comes from the window assignment alone. The doc mentions this in passing ("discard all windows outside the query range") but doesn't call out that this is a material query performance improvement independent of sort order. 
+ +**The "clean cutoff" transition strategy is actually a bet on retention being shorter than the migration period.** If retention is, say, 90 days, and Phase 1 is enabled on day 0, then for 90 days you have a mixed estate of unsorted (pre-cutoff) and sorted (post-cutoff) splits. Queries spanning the boundary hit both. The doc implicitly assumes this is acceptable, but for long-retention use cases (years), the unsorted tail could persist for a long time and the compression/query benefits don't fully materialize until it's gone. + +**The metadata-only columns after `&` are really a bet on Phase 3.** They add complexity and storage overhead in Phase 1 (computing and storing min/max/regex for columns that don't affect sort order) with zero benefit until split-level query pruning exists. If Phase 3 is deprioritized or the pruning approach changes, this is wasted work. The doc doesn't frame this as a calculated bet. + +### Tensions and contradictions + +**PostgreSQL is simultaneously "the authoritative source for query planning" and acknowledged to not scale.** The Scale Considerations section says PostgreSQL can't handle ~921K splits per window, and the design must prepare for a shift away from it. But the rest of the document treats PostgreSQL as the authoritative metadata store that query planning depends on. There's no intermediate plan -- at what scale does PostgreSQL break, and what happens between "Phase 1 ships" and "we build a replacement metadata service"? + +**The compaction policy is borrowed but the workload is fundamentally different.** The doc says "use StableLogMergePolicy adapted for Parquet splits" but StableLogMergePolicy was designed for Tantivy splits in a logs/traces pipeline. Metrics have very different characteristics: higher write rates, smaller individual events, more predictable schemas, time-series structure. 
The doc acknowledges the need for experiments but doesn't discuss *why* the existing policy might be wrong for metrics, which makes it hard to know what the experiments should be testing for. + +**Late-arriving data for old windows is "acceptable because rare" but the scale section suggests otherwise.** At 10 GiB/s, even a small percentage of late data is a lot of data. If 0.1% of data arrives late, that's 10 MiB/s of late data triggering merges of already-compacted windows. The doc assumes this is negligible, but at Alexey's scale numbers, it may not be. + +### The "so what" + +**The single most actionable implication:** Run experiment #4 (sorted vs. unsorted compression comparison) first, on real metrics data, before building anything else. Everything in this document -- the compaction policy, the time windowing, the merge implementation, the metadata infrastructure -- is justified by the assumption that sorting produces 20-35% compression improvement for metrics data. If the actual improvement is, say, 8% (because metrics data has different characteristics than logs/APM data), the cost-benefit calculus changes significantly. The Husky numbers are from logs and APM, not metrics. One afternoon with a script that takes existing metrics Parquet files, sorts them by the proposed schema, and re-encodes them would either validate the entire project or redirect it. Everything else can be designed in parallel, but this is the existential risk. + +### What's missing + +**What is the min/max/regex for `timeseries_id`?** The timeseries_id is a hash. Its min/max is a range of hash values, which is meaningless for query pruning (nobody queries by hash value). Its regex is similarly useless. But the metadata vectors are positional across all schema columns, so timeseries_id will have a slot. Is it populated with dummy values? Skipped? This is a small thing, but it exposes a design question: should the metadata vectors skip columns where the metadata is meaningless? 
+ +**How does the query engine actually discover and use time windows?** The doc describes how splits are assigned to windows and how compaction is scoped to windows, but doesn't describe how the query planner maps a query's time range to windows. Does it scan PostgreSQL for all splits in a time range and filter by `window_start`? Does it compute the relevant window set from the query's time bounds and look up splits per window? The query path is implied but never specified. + +**What happens to the `timeseries_id` column when tags change?** If a tag value flaps or a tag is added/removed, the `timeseries_id` hash changes and points from the "same" logical source get different hash values. *Update: this is now addressed in the Timeseries ID section -- tag flapping degrades locality but never affects correctness, and `timeseries_id` can be omitted entirely if the explicit sort columns provide sufficient granularity.* + +**What's the interaction between `doc_mapping_uid` in the compaction scope and sort schema changes?** If the sort schema changes, does `doc_mapping_uid` also change? If not, you have two scope dimensions (sort_schema and doc_mapping_uid) that both prevent merging on schema changes, which is redundant. If yes, sort_schema in the scope key is redundant with doc_mapping_uid. The relationship between these two isn't explained. + diff --git a/docs/internals/specs/tla/CLAUDE.md b/docs/internals/specs/tla/CLAUDE.md new file mode 100644 index 00000000000..96a9e80655a --- /dev/null +++ b/docs/internals/specs/tla/CLAUDE.md @@ -0,0 +1,94 @@ +# TLA+ Specifications + +This directory contains formal TLA+ specifications for Quickhouse-Pomsky protocols. 
+ +## Setup (One-Time) + +Install TLA+ tools to a standard location (NOT in this directory): + +```bash +# Option 1: Homebrew (recommended on macOS) +brew install tlaplus + +# Option 2: Download to ~/.local/lib +mkdir -p ~/.local/lib +curl -L -o ~/.local/lib/tla2tools.jar \ + "https://github.com/tlaplus/tlaplus/releases/download/v1.8.0/tla2tools.jar" + +# Option 3: VS Code extension (for interactive use) +# Install: alygin.vscode-tlaplus +``` + +## Running Model Checker + +From the repository root: + +```bash +# If installed via Homebrew: +tlc -config docs/internals/specs/tla/ExampleProtocol.cfg docs/internals/specs/tla/ExampleProtocol.tla + +# If using downloaded jar: +java -XX:+UseParallelGC -Xmx4g -jar ~/.local/lib/tla2tools.jar \ + -workers 4 \ + -config docs/internals/specs/tla/ExampleProtocol.cfg \ + docs/internals/specs/tla/ExampleProtocol.tla +``` + +### Quick vs Full Verification + +Some specs have `_small.cfg` variants for faster iteration: + +```bash +# Quick (~1s, hundreds of states) +tlc -config docs/internals/specs/tla/ExampleProtocol_small.cfg docs/internals/specs/tla/ExampleProtocol.tla + +# Full (~minutes, millions of states) +tlc -workers 4 -config docs/internals/specs/tla/ExampleProtocol.cfg docs/internals/specs/tla/ExampleProtocol.tla +``` + +## Available Specifications + +| Spec | Config | Purpose | +|------|--------|---------| +| `ParquetDataModel.tla` | `ParquetDataModel.cfg`, `ParquetDataModel_small.cfg` | Parquet metrics data model invariants (ADR-001) | +| `SortSchema.tla` | `SortSchema.cfg`, `SortSchema_small.cfg` | Sort schema for Parquet splits (ADR-002) | +| `TimeWindowedCompaction.tla` | `TimeWindowedCompaction.cfg`, `TimeWindowedCompaction_small.cfg` | Time-windowed sorted compaction (ADR-003) | + +## Creating New Specs + +1. Create `NewProtocol.tla` with the specification +2. Create `NewProtocol.cfg` with constants and properties to check +3. Add entry to `README.md` mapping table +4. Run model checker to verify + +### Config File Template + +```tla +\* TLC Configuration for NewProtocol.tla + +CONSTANTS + Nodes = {n1, n2} + MaxItems = 3 + +SPECIFICATION Spec + +INVARIANTS + TypeInvariant + SafetyProperty + +PROPERTIES + LivenessProperty +``` + +## Cleanup + +TLC generates trace files and state directories on errors.
Clean them up with: + +```bash +rm -rf docs/internals/specs/tla/*_TTrace_*.tla docs/internals/specs/tla/*.bin docs/internals/specs/tla/states +``` + +## References + +- [TLA+ Home](https://lamport.azurewebsites.net/tla/tla.html) +- [TLC Model Checker](https://github.com/tlaplus/tlaplus) +- [Learn TLA+](https://learntla.com/) diff --git a/docs/internals/specs/tla/ParquetDataModel.cfg b/docs/internals/specs/tla/ParquetDataModel.cfg new file mode 100644 index 00000000000..64e9ca4d5ae --- /dev/null +++ b/docs/internals/specs/tla/ParquetDataModel.cfg @@ -0,0 +1,34 @@ +\* TLC Configuration for ParquetDataModel.tla +\* +\* Full configuration: 2 nodes, 2 metrics, 2 tag sets, 2 timestamps, +\* up to 6 ingest requests. Explores the full state space including +\* multiple compaction rounds and cross-node duplicate ingestion. +\* +\* Expected runtime: minutes (millions of states) with 4 workers. +\* +\* Run: +\* tlc -workers 4 \ +\* -config docs/internals/specs/tla/ParquetDataModel.cfg \ +\* docs/internals/specs/tla/ParquetDataModel.tla + +CONSTANTS + Nodes = {n1, n2} + MetricNames = {m1, m2} + TagSets = {tags1, tags2} + Timestamps = {1, 2} + RequestCountMax = 6 + +CHECK_DEADLOCK FALSE + +SPECIFICATION Spec + +INVARIANTS + TypeInvariant + DM1_PointPerRow + DM2_NoLWW + DM3_NoInterpolation + DM4_DeterministicTSID + DM5_TSIDPersistence + +PROPERTIES + Liveness diff --git a/docs/internals/specs/tla/ParquetDataModel.tla b/docs/internals/specs/tla/ParquetDataModel.tla new file mode 100644 index 00000000000..ef16bfa2858 --- /dev/null +++ b/docs/internals/specs/tla/ParquetDataModel.tla @@ -0,0 +1,336 @@ +---- MODULE ParquetDataModel ---- +\* Formal specification for Parquet Metrics Data Model Invariants (ADR-001) +\* +\* Models: +\* - Ingestion of metric data points into pending batches on multiple nodes +\* - Flushing pending batches into immutable Parquet splits in object storage +\* - Compaction of splits: merging multiple splits into one without data loss +\* - Deterministic 
timeseries_id computation from canonicalized tag sets +\* - Preservation of timeseries_id through compaction (no recomputation) +\* +\* Key Invariants (from ADR-001): +\* - DM-1: Each row in a Parquet split is exactly one data point +\* - DM-2: No last-write-wins; duplicate (metric, tags, ts) from separate +\* ingests both survive +\* - DM-3: No interpolation; storage contains only ingested points +\* - DM-4: timeseries_id is deterministic for a given tag set +\* - DM-5: timeseries_id persists through compaction without recomputation +\* +\* Signal Applicability: +\* - Metrics: Primary target. Each row is one data point +\* (metric_name, tags, timestamp, value) +\* - Traces: Same model applies. Each row is one span. No LWW. +\* timeseries_id equivalent is a hash of span attributes +\* - Logs: Same model applies. Each row is one log entry. No LWW. +\* timeseries_id equivalent groups by service/host +\* +\* TLA+-to-Rust Mapping: +\* | TLA+ Concept | Rust Implementation | +\* |----------------------|-------------------------------------------------------| +\* | Nodes | Indexing pipeline nodes in quickwit-indexing | +\* | DataPoint | Row in RecordBatch (quickwit-parquet-engine) | +\* | pending[n] | MetricsIndexer accumulation buffer | +\* | splits | Published Parquet files in object storage | +\* | request_id | Ingest request identity (WAL position / batch ID) | +\* | IngestPoint | MetricsDocProcessor receives Arrow IPC batch | +\* | FlushSplit | MetricsUploader + MetricsPublisher commit split | +\* | CompactSplits | MergeExecutor merges Parquet files (future) | +\* | TSIDHash(tags) | xxHash64 / SipHash of canonicalized tag key-value set | +\* | DM1_PointPerRow | debug_assert! in schema validation | +\* | DM2_NoLWW | No dedup logic in ingest or compact paths | +\* | DM3_NoInterpolation | debug_assert! 
no synthetic points in split writer                  |
+\* | DM4_DeterministicTSID| Deterministic hash fn in timeseries_id computation    |
+\* | DM5_TSIDPersistence  | Merge carries timeseries_id column without recompute  |
+
+EXTENDS Integers, Sequences, FiniteSets, TLC
+
+CONSTANTS
+    \* @type: Set(NODE);
+    \* The set of ingestion nodes (indexing pipeline instances).
+    Nodes,
+
+    \* @type: Set(Str);
+    \* The set of metric names in the model.
+    MetricNames,
+
+    \* @type: Set(Str);
+    \* The set of possible tag sets (each represented as an opaque string
+    \* for model simplicity; in the implementation these are sorted
+    \* key-value maps).
+    TagSets,
+
+    \* @type: Set(Int);
+    \* The set of possible timestamps.
+    Timestamps,
+
+    \* @type: Int;
+    \* Bound on the ingest requests the model explores.
+    \* Bounds the state space for model checking.
+    \* IngestPoint is enabled only while next_request_id < RequestCountMax,
+    \* so this is an exclusive upper bound on request ids.
+    RequestCountMax
+
+VARIABLES
+    \* @type: NODE -> Set(DATAPOINT);
+    \* Per-node pending batch of data points waiting to be flushed.
+    \* Modeled as a set: Init starts every node at {} and IngestPoint
+    \* adds points with \union.
+    pending,
+
+    \* @type: Set(SPLIT);
+    \* The set of published splits in object storage.
+    \* Each split is a record with fields:
+    \*   split_id : Nat
+    \*   rows     : Set of data-point records
+    splits,
+
+    \* @type: Set(DATAPOINT);
+    \* The complete history of all points that have ever been ingested.
+    \* Used to check DM-3 (no interpolation): storage is a subset of this.
+    all_ingested_points,
+
+    \* @type: Int;
+    \* Monotonically increasing counter for split IDs.
+    next_split_id,
+
+    \* @type: Int;
+    \* Monotonically increasing counter for request IDs.
+    \* Each IngestPoint action increments this so that separate
+    \* ingest requests produce distinct request_id values,
+    \* enabling DM-2 verification.
+    next_request_id
+
+\* Tuple of every variable, used for stuttering ([][Next]_vars) and fairness.
+vars == <<pending, splits, all_ingested_points, next_split_id, next_request_id>>
+
+----
+\* ---- Helper: timeseries_id hash function ----
+\*
+\* TSIDHash models the deterministic hash from tag sets to timeseries IDs.
+\* We model TSIDHash as a deterministic function from tag sets to integers.
+\* The key property is that equal inputs always produce equal outputs (DM-4).
+\* We use CHOOSE to pick one fixed arbitrary integer —
+\* this is deterministic in TLA+ (CHOOSE always returns the same value
+\* for the same predicate).
+\* Note: the predicate below does not mention `tags`, so every tag set maps
+\* to the same id. Determinism (DM-4) holds, but the model does not
+\* distinguish series by their id.
+\*
+\* In the implementation this is xxHash64 or SipHash of the canonicalized
+\* (sorted by key name) set of all tag key-value pairs.
+
+TSIDHash(tags) == CHOOSE n \in 0..100 : TRUE
+
+----
+\* ---- Data Point record constructor ----
+\*
+\* A DataPoint is a record with:
+\*   metric_name  : Str
+\*   tags         : Str  (represents the full tag set)
+\*   timestamp    : Int
+\*   value        : Int  (abstract; real values are f64)
+\*   request_id   : Int  (distinguishes separate ingest requests)
+\*   timeseries_id: Int  (deterministic hash of tags)
+\*
+\* request_id is NOT part of the "logical identity" of a point. Two points
+\* with the same (metric_name, tags, timestamp, value) but different
+\* request_id represent the DM-2 scenario: duplicate points from separate
+\* ingests that must both be stored.
+\*
+\* Arguments: mn = metric name, ts_set = tag set, t = timestamp,
+\* v = value, rid = request id. timeseries_id is derived from ts_set.
+
+MakePoint(mn, ts_set, t, v, rid) ==
+    [metric_name   |-> mn,
+     tags          |-> ts_set,
+     timestamp     |-> t,
+     value         |-> v,
+     request_id    |-> rid,
+     timeseries_id |-> TSIDHash(ts_set)]
+
+----
+\* ---- Type Invariant ----
+
+\* Every variable holds a value of the expected shape. Besides the pending
+\* batches and the published splits, this also constrains the ingest
+\* history and both id counters.
+TypeInvariant ==
+    LET \* Shape shared by pending, stored, and historical data points.
+        PointType == [metric_name: MetricNames,
+                      tags: TagSets,
+                      timestamp: Timestamps,
+                      value: {1},
+                      request_id: Int,
+                      timeseries_id: Int]
+    IN  /\ \A n \in Nodes: pending[n] \in SUBSET PointType
+        /\ \A s \in splits:
+            /\ s.split_id \in Int
+            /\ s.rows \in SUBSET PointType
+        /\ all_ingested_points \in SUBSET PointType
+        /\ next_split_id \in Nat
+        /\ next_request_id \in Nat
+
+----
+\* ---- Derived state: all rows currently in storage ----
+
+AllStoredRows == UNION {s.rows : s \in splits}
+
+----
+\* ---- Safety Properties (ADR-001 Invariants) ----
+
+\* DM-1: Each row in a Parquet split is exactly one data point.
+\*
+\* In this model, each element of s.rows is already a single DataPoint
+\* record by construction.
The invariant asserts that every row has
+\* all required fields populated (no partial / multi-point rows).
+DM1_PointPerRow ==
+    \A row \in AllStoredRows:
+        /\ row.metric_name \in MetricNames
+        /\ row.tags \in TagSets
+        /\ row.timestamp \in Timestamps
+        /\ row.timeseries_id = TSIDHash(row.tags)
+
+\* DM-2: No last-write-wins.
+\*
+\* Take any two ingested points that agree on (metric_name, tags,
+\* timestamp) but came from different requests (different request_id).
+\* Once neither of them sits in a pending batch any more, both must be
+\* present in storage.
+\*
+\* Points that are still pending are not yet "lost" -- they will appear
+\* in storage when their batch is flushed -- so the invariant only
+\* constrains pairs where both points have left the pending batches.
+
+\* Every point currently waiting in some node's pending batch.
+AllPendingRows == UNION {pending[n] : n \in Nodes}
+
+DM2_NoLWW ==
+    LET \* Same logical identity: metric, tag set, and timestamp all agree.
+        SameIdentity(a, b) ==
+            /\ a.metric_name = b.metric_name
+            /\ a.tags = b.tags
+            /\ a.timestamp = b.timestamp
+        \* The point is no longer held in any pending batch.
+        Flushed(p) == p \notin AllPendingRows
+    IN  \A p1, p2 \in all_ingested_points:
+            (/\ SameIdentity(p1, p2)
+             /\ p1.request_id # p2.request_id
+             /\ Flushed(p1)
+             /\ Flushed(p2))
+            => (p1 \in AllStoredRows /\ p2 \in AllStoredRows)
+
+\* DM-3: No interpolation.
+\*
+\* Every stored row is a point that was actually ingested; storage never
+\* contains a synthetic point.
+DM3_NoInterpolation ==
+    \A row \in AllStoredRows: row \in all_ingested_points
+
+\* DM-4: Deterministic timeseries_id.
+\*
+\* Any two rows in the system (stored or pending) that carry the same
+\* tag set also carry the same timeseries_id.
+DM4_DeterministicTSID ==
+    LET live_rows == AllStoredRows \union AllPendingRows
+    IN  \A r1, r2 \in live_rows:
+            (r1.tags = r2.tags) => (r1.timeseries_id = r2.timeseries_id)
+
+\* DM-5: timeseries_id persists through compaction without recomputation.
+\*
+\* After compaction, every row in the output split has the same
+\* timeseries_id it had in the input splits. Since CompactSplits
+\* unions the row sets without modifying them, this holds by
+\* construction. We still state it explicitly as a checkable
+\* invariant: every stored row's timeseries_id equals TSIDHash
+\* of its tags (which is what was assigned at ingestion).
+DM5_TSIDPersistence ==
+    \A row \in AllStoredRows:
+        row.timeseries_id = TSIDHash(row.tags)
+
+\* Combined safety invariant.
+Safety ==
+    /\ DM1_PointPerRow
+    /\ DM2_NoLWW
+    /\ DM3_NoInterpolation
+    /\ DM4_DeterministicTSID
+    /\ DM5_TSIDPersistence
+
+----
+\* ---- Actions ----
+
+\* IngestPoint: A node receives a data point and adds it to its pending batch.
+\*
+\* Models: MetricsDocProcessor receives a data point from an ingest request.
+\* Each invocation uses a fresh request_id to model separate ingest requests.
+\* The value field is fixed to 1 (abstract; real values vary).
+\*
+\* Guard: request ids start at 1 (see Init) and the comparison is strict,
+\* so a behavior ingests at most RequestCountMax - 1 points.
+\*
+\* Changes pending, all_ingested_points, and next_request_id; the split
+\* state is left untouched.
+IngestPoint(n, mn, ts_set, t) ==
+    /\ next_request_id < RequestCountMax
+    /\ LET point == MakePoint(mn, ts_set, t, 1, next_request_id)
+       IN  /\ pending' = [pending EXCEPT ![n] = pending[n] \union {point}]
+           /\ all_ingested_points' = all_ingested_points \union {point}
+    /\ next_request_id' = next_request_id + 1
+    /\ UNCHANGED <<splits, next_split_id>>
+
+\* FlushSplit: A node writes its entire pending batch as a new split.
+\*
+\* Models: MetricsIndexer commits a Parquet split via MetricsUploader
+\* and MetricsPublisher. The batch becomes an immutable split in object
+\* storage. The pending buffer is cleared.
+\*
+\* Precondition: the node has at least one pending point.
+\*
+\* Changes splits, next_split_id, and pending; the ingest history and the
+\* request counter are left untouched.
+FlushSplit(n) ==
+    /\ pending[n] # {}
+    /\ LET new_split == [split_id |-> next_split_id, rows |-> pending[n]]
+       IN  /\ splits' = splits \union {new_split}
+           /\ next_split_id' = next_split_id + 1
+    /\ pending' = [pending EXCEPT ![n] = {}]
+    /\ UNCHANGED <<all_ingested_points, next_request_id>>
+
+\* CompactSplits: The compactor selects a subset of two or more splits and
+\* merges them into one new split.
The input splits are removed from
+\* storage and replaced by the merged split.
+\*
+\* Models: MergeExecutor performs a k-way merge of Parquet files.
+\*
+\* Critical properties enforced by this action:
+\*   - All rows from input splits appear in the output (no data loss)
+\*   - No new rows are created (no interpolation -- DM-3)
+\*   - timeseries_id values are carried through unchanged (DM-5)
+\*   - Rows from separate requests are not deduplicated (DM-2)
+\*
+\* The action takes the union of all rows from the selected splits.
+\* This union preserves every row (including duplicates from separate
+\* requests, since they have different request_id and are therefore
+\* distinct set elements).
+\*
+\* Changes splits and next_split_id only; pending batches, the ingest
+\* history, and the request counter are left untouched.
+CompactSplits ==
+    /\ Cardinality(splits) >= 2
+    /\ \E selected \in SUBSET splits:
+        /\ Cardinality(selected) >= 2
+        /\ LET merged_rows == UNION {s.rows : s \in selected}
+               new_split   == [split_id |-> next_split_id, rows |-> merged_rows]
+           IN  /\ splits' = (splits \ selected) \union {new_split}
+               /\ next_split_id' = next_split_id + 1
+    /\ UNCHANGED <<pending, all_ingested_points, next_request_id>>
+
+----
+\* ---- Init and Next ----
+
+\* Initial state: nothing pending, nothing stored, both counters at 1.
+Init ==
+    /\ pending = [n \in Nodes |-> {}]
+    /\ splits = {}
+    /\ all_ingested_points = {}
+    /\ next_split_id = 1
+    /\ next_request_id = 1
+
+\* One step is an ingest on some node, a flush on some node, or a compaction.
+Next ==
+    \/ \E n \in Nodes, mn \in MetricNames, ts \in TagSets, t \in Timestamps:
+        IngestPoint(n, mn, ts, t)
+    \/ \E n \in Nodes:
+        FlushSplit(n)
+    \/ CompactSplits
+
+----
+\* ---- Specification ----
+
+\* Weak fairness on Next ensures the system eventually makes progress
+\* (points are eventually flushed, splits are eventually compacted).
+Spec == Init /\ [][Next]_vars /\ WF_vars(Next)
+
+----
+\* ---- Liveness Properties ----
+
+\* Eventually, if a point is ingested, it reaches storage.
+\* (Fairness is on Next as a whole, not on FlushSplit. That is enough here
+\* because ingests are bounded by RequestCountMax and every compaction
+\* shrinks the split set, so only flushes can remain enabled in the end.)
+Liveness ==
+    \A n \in Nodes:
+        [](pending[n] # {} => <>(pending[n] = {}))
+
+====
diff --git a/docs/internals/specs/tla/ParquetDataModel_small.cfg b/docs/internals/specs/tla/ParquetDataModel_small.cfg
new file mode 100644
index 00000000000..eb916184998
--- /dev/null
+++ b/docs/internals/specs/tla/ParquetDataModel_small.cfg
@@ -0,0 +1,28 @@
+\* TLC Configuration for ParquetDataModel.tla (small / quick iteration)
+\*
+\* Minimal configuration: 1 node, 1 metric, 1 tag set, 1 timestamp,
+\* up to 3 ingest requests. Designed for fast feedback during
+\* development (~1s, hundreds of states).
+\*
+\* Run:
+\*   tlc -config docs/internals/specs/tla/ParquetDataModel_small.cfg \
+\*       docs/internals/specs/tla/ParquetDataModel.tla
+
+CONSTANTS
+    Nodes = {n1}
+    MetricNames = {m1}
+    TagSets = {tags1}
+    Timestamps = {1}
+    RequestCountMax = 3
+
+CHECK_DEADLOCK FALSE
+
+SPECIFICATION Spec
+
+INVARIANTS
+    TypeInvariant
+    DM1_PointPerRow
+    DM2_NoLWW
+    DM3_NoInterpolation
+    DM4_DeterministicTSID
+    DM5_TSIDPersistence
diff --git a/docs/internals/specs/tla/README.md b/docs/internals/specs/tla/README.md
new file mode 100644
index 00000000000..ca51af0030b
--- /dev/null
+++ b/docs/internals/specs/tla/README.md
@@ -0,0 +1,247 @@
+# TLA+ Specifications for Quickhouse-Pomsky
+
+This directory contains formal TLA+ specifications for critical Quickhouse-Pomsky protocols.
+
+Specifications are written before implementation (see [Simulation-First Workflow](../../SIMULATION_FIRST_WORKFLOW.md)). Each spec defines the invariants that DST tests and production code must preserve.
+
+## Signal Priority
+
+Metrics first, then traces, then logs. Specs should be written to generalize across all three signals where possible, but initial specs will focus on the metrics pipeline.
+ +## Setup (One-Time) + +Install the TLA+ Toolbox (includes TLC model checker): + +```bash +# Option 1: TLA+ Toolbox app (tested, recommended) +# Download from https://github.com/tlaplus/tlaplus/releases +# The jar is at: /Applications/TLA+ Toolbox.app/Contents/Eclipse/tla2tools.jar + +# Option 2: Homebrew +brew install tlaplus + +# Option 3: Download jar directly +mkdir -p ~/.local/lib +curl -L -o ~/.local/lib/tla2tools.jar \ + "https://github.com/tlaplus/tlaplus/releases/download/v1.8.0/tla2tools.jar" +``` + +Requires Java (e.g., `brew install openjdk` if not already installed). + +## Running Model Checker + +All commands run from the repository root. The TLA+ Toolbox jar path is: + +```bash +TLA_JAR="/Applications/TLA+ Toolbox.app/Contents/Eclipse/tla2tools.jar" +``` + +### Quick Verification (small configs, ~1 second each) + +```bash +# ParquetDataModel (ADR-001): 8 states +java -XX:+UseParallelGC -jar "$TLA_JAR" \ + -config docs/internals/specs/tla/ParquetDataModel_small.cfg \ + docs/internals/specs/tla/ParquetDataModel.tla + +# SortSchema (ADR-002): ~49K states +java -XX:+UseParallelGC -jar "$TLA_JAR" \ + -config docs/internals/specs/tla/SortSchema_small.cfg \ + docs/internals/specs/tla/SortSchema.tla + +# TimeWindowedCompaction (ADR-003): ~938 states +java -XX:+UseParallelGC -jar "$TLA_JAR" \ + -config docs/internals/specs/tla/TimeWindowedCompaction_small.cfg \ + docs/internals/specs/tla/TimeWindowedCompaction.tla +``` + +### Full Verification (full configs, minutes to hours) + +```bash +# ParquetDataModel: millions of states +java -XX:+UseParallelGC -Xmx4g -jar "$TLA_JAR" -workers 4 \ + -config docs/internals/specs/tla/ParquetDataModel.cfg \ + docs/internals/specs/tla/ParquetDataModel.tla + +# SortSchema: millions of states +java -XX:+UseParallelGC -Xmx4g -jar "$TLA_JAR" -workers 4 \ + -config docs/internals/specs/tla/SortSchema.cfg \ + docs/internals/specs/tla/SortSchema.tla + +# TimeWindowedCompaction: very large state space — use with caution +java 
-XX:+UseParallelGC -Xmx4g -jar "$TLA_JAR" -workers 4 \ + -config docs/internals/specs/tla/TimeWindowedCompaction.cfg \ + docs/internals/specs/tla/TimeWindowedCompaction.tla +``` + +### Verified Results (2026-02-20) + +All small configs pass with no invariant violations: + +| Spec | Config | States | Time | Result | +|------|--------|--------|------|--------| +| ParquetDataModel | small | 8 | <1s | Pass | +| SortSchema | small | 49,490 | 1s | Pass | +| TimeWindowedCompaction | small | 938 | <1s | Pass | + +Mutation testing (9 mutations, 9 caught) confirms invariants detect: +- LWW dedup, synthetic point injection, TSID corruption (ADR-001) +- Unsorted writes, schema mutation of existing splits, inconsistent schema copies (ADR-002) +- Dropped rows, skipped sorting, cross-window merges (ADR-003) + +## Available Specifications + +| Spec | Config | Purpose | +|------|--------|---------| +| [ParquetDataModel](ParquetDataModel.tla) | [Full](ParquetDataModel.cfg), [Small](ParquetDataModel_small.cfg) | ADR-001 data model invariants: point-per-row, no LWW, no interpolation, deterministic timeseries_id, TSID persistence through compaction | +| [SortSchema](SortSchema.tla) | [Full](SortSchema.cfg), [Small](SortSchema_small.cfg) | ADR-002 sort schema invariants: rows sorted per split schema, null ordering, missing columns as null, schema immutability, three-copy consistency | +| [TimeWindowedCompaction](TimeWindowedCompaction.tla) | [Full](TimeWindowedCompaction.cfg), [Small](TimeWindowedCompaction_small.cfg) | ADR-003 time-windowed sorted compaction invariants: one window per split, duration divides hour, no cross-window merge, scope compatibility, row set/content preservation, sort order, column union | + +### Candidate Areas for Specification + +These areas are likely to need formal specs as the system evolves: + +| Area | Component | Key Invariants | +|------|-----------|----------------| +| Split lifecycle | `quickwit-metastore` | No lost splits, no premature visibility, 
atomic publish |
+| Compaction protocol | `quickwit-indexing` | Atomic split swap, no data loss during merge |
+| Ingest backpressure | `quickwit-ingest` | Bounded buffers, no overflow, WAL ordering |
+| Shard management | `quickwit-control-plane` | No split-brain, consistent shard assignment |
+| Tantivy + Parquet consistency | `quickwit-indexing` | Dual-write atomicity, index-to-data consistency |
+| Metrics ingestion | `quickwit-metrics-engine` | No lost data points, correct aggregation |
+| Garbage collection | `quickwit-janitor` | Safe deletion, no deletion of live data |
+
+## Creating New Specs
+
+1. Create `NewProtocol.tla` with the specification
+2. Create `NewProtocol.cfg` with constants and properties to check
+3. Optionally create `NewProtocol_small.cfg` for quick iteration
+4. Add entry to the table in this README
+5. Run model checker to verify
+6. Link from the corresponding ADR
+
+### Spec File Template
+
+```tla
+---- MODULE NewProtocol ----
+\* Formal specification for [protocol description]
+\*
+\* Models:
+\*   - [what this spec covers]
+\*
+\* Key Invariants:
+\*   - [invariant 1]
+\*   - [invariant 2]
+\*
+\* Signal Applicability:
+\*   - Metrics: [how this applies to metrics]
+\*   - Traces: [how this applies to traces, or "same as metrics"]
+\*   - Logs: [how this applies to logs, or "same as metrics"]
+
+EXTENDS Integers, Sequences, FiniteSets, TLC
+
+CONSTANTS
+    \* @type: Set(NODE);
+    Nodes,
+    \* @type: Int;
+    MaxItems
+
+VARIABLES
+    \* @type: NODE -> STATE;
+    state
+
+vars == <<state>>
+
+\* ---- Type Invariant ----
+
+TypeInvariant ==
+    TRUE  \* Define type constraints
+
+\* ---- Safety Properties ----
+
+Safety ==
+    TRUE  \* Define safety invariants
+
+\* ---- Actions ----
+
+Init ==
+    state = [n \in Nodes |-> "idle"]
+
+Next ==
+    \E n \in Nodes:
+        \/ \* Action 1
+           TRUE
+        \/ \* Action 2
+           TRUE
+
+\* ---- Specification ----
+
+Spec == Init /\ [][Next]_vars /\ WF_vars(Next)
+
+\* ---- Liveness ----
+
+Liveness ==
+    TRUE  \* Define liveness properties
+
+====
+``` + +### Config File Template + +```tla +\* TLC Configuration for NewProtocol.tla + +CONSTANTS + Nodes = {n1, n2} + MaxItems = 3 + +\* Disable deadlock detection — bounded models naturally terminate +CHECK_DEADLOCK FALSE + +SPECIFICATION Spec + +INVARIANTS + TypeInvariant + Safety + +PROPERTIES + Liveness +``` + +## Mapping Specs to Code + +Each spec should document how TLA+ concepts map to Rust implementation: + +``` +| TLA+ Concept | Rust Implementation | +|------------------------|----------------------------------------| +| `Nodes` | Cluster nodes in `quickwit-cluster` | +| `state` | Component state struct | +| `SafetyProperty` | `debug_assert!` + shared invariant | +| `CommitAction` | `metastore.publish_splits()` | +``` + +### Keeping Spec and Code in Sync + +When modifying a protocol: + +1. Update TLA+ spec first +2. Run TLC to verify safety preserved +3. Update Rust implementation +4. Run DST tests to verify implementation matches spec + +## Cleanup + +TLC generates trace files and state directories on errors. 
Clean them up with: + +```bash +rm -rf docs/internals/specs/tla/*_TTrace_*.tla docs/internals/specs/tla/*.bin docs/internals/specs/tla/states +``` + +## References + +- [TLA+ Home](https://lamport.azurewebsites.net/tla/tla.html) +- [TLC Model Checker](https://github.com/tlaplus/tlaplus) +- [Learn TLA+](https://learntla.com/) +- [Verification Guide](../../VERIFICATION.md) +- [Simulation-First Workflow](../../SIMULATION_FIRST_WORKFLOW.md) diff --git a/docs/internals/specs/tla/SortSchema.cfg b/docs/internals/specs/tla/SortSchema.cfg new file mode 100644 index 00000000000..fc0b3dec621 --- /dev/null +++ b/docs/internals/specs/tla/SortSchema.cfg @@ -0,0 +1,24 @@ +\* TLC Configuration for SortSchema.tla +\* Full verification: 2 columns, up to 3 rows per split, up to 4 splits, 2 schema changes +\* +\* Expected runtime: minutes (depending on hardware) +\* Run: +\* tlc -workers 4 -config docs/internals/specs/tla/SortSchema.cfg docs/internals/specs/tla/SortSchema.tla + +CONSTANTS + Columns = {c1, c2} + RowsPerSplitMax = 3 + SplitsMax = 4 + SchemaChangesMax = 2 + +CHECK_DEADLOCK FALSE + +SPECIFICATION Spec + +INVARIANTS + TypeInvariant + SS1_RowsSorted + SS2_NullOrdering + SS3_MissingColumnsAreNull + SS4_SchemaImmutable + SS5_SchemaConsistency diff --git a/docs/internals/specs/tla/SortSchema.tla b/docs/internals/specs/tla/SortSchema.tla new file mode 100644 index 00000000000..f317bd2b925 --- /dev/null +++ b/docs/internals/specs/tla/SortSchema.tla @@ -0,0 +1,427 @@ +---- MODULE SortSchema ---- +\* Formal specification for Sort Schema invariants (ADR-002) +\* +\* Models: +\* - A metastore holding the current sort schema per index (mutable at runtime) +\* - An indexing pipeline that reads the current schema, sorts rows, and writes splits +\* - Splits in object storage with rows, metadata sort schema, Parquet KV sort schema, +\* and Parquet sorting_columns sort schema +\* - Schema change action that updates the metastore schema +\* - Compaction that merges splits only when sort 
schemas match +\* +\* Key Invariants (from ADR-002): +\* - SS-1: All rows within a split are sorted according to its recorded sort schema +\* - SS-2: Nulls sort after non-null for ascending, before non-null for descending +\* - SS-3: Missing sort columns in a split are treated as null (not an error) +\* - SS-4: A split's sort schema never changes after it is written +\* - SS-5: The three copies of sort schema (metastore, KV metadata, sorting_columns) +\* are identical for each split +\* +\* Signal Applicability: +\* - Metrics: Primary target. Parquet splits sorted by metric_name|host|...|timestamp +\* - Traces: Same model. Sort by service_name|operation_name|trace_id|timestamp +\* - Logs: Same model. Sort by service_name|level|host|timestamp +\* +\* TLA+-to-Rust Mapping: +\* | TLA+ Concept | Rust Implementation | +\* |------------------------------|--------------------------------------------------------------| +\* | MetastoreSchema | Metastore per-index sort_schema field (PostgreSQL) | +\* | SortSchema (sequence) | Sort schema string "col+|col-|.../V2" parsed into Vec | +\* | SortColumn (record) | SortColumn { name, direction } in sort schema parser | +\* | Split (record) | MetricsSplitMetadata in quickwit-parquet-engine | +\* | split.rows | RecordBatch rows in a Parquet file | +\* | split.metadata_sort_schema | sort_schema field in MetricsSplitMetadata (PostgreSQL) | +\* | split.kv_sort_schema | sort_schema key in Parquet key_value_metadata | +\* | split.sorting_columns_schema | Parquet native sorting_columns field | +\* | IngestBatch action | ParquetWriter::sort_batch() + write in writer.rs | +\* | ChangeSchema action | Metastore update_index() changing sort_schema | +\* | CompactSplits action | Parquet merge executor (sorted k-way merge, ADR-003) | +\* | RowsSorted property | debug_assert! 
after lexsort_to_indices in sort_batch()      |
+\* | NullOrdering property        | SortColumn nulls_first field per direction                   |
+\* | Columns                      | ParquetField enum variants in schema/fields.rs               |
+
+EXTENDS Integers, Sequences, FiniteSets, TLC
+
+CONSTANTS
+    \* @type: Set(Str);
+    \* Column names that can appear in sort schemas and row data
+    Columns,
+
+    \* @type: Int;
+    \* Maximum number of rows per split (kept small for model checking)
+    RowsPerSplitMax,
+
+    \* @type: Int;
+    \* Maximum number of splits in the system
+    SplitsMax,
+
+    \* @type: Int;
+    \* Maximum number of schema changes allowed
+    SchemaChangesMax
+
+VARIABLES
+    \* @type: Seq(<<Str, Str>>);
+    \* The current sort schema in the metastore: a sequence of <<column, direction>>
+    \* direction is "asc" or "desc"
+    metastore_schema,
+
+    \* @type: Set([
+    \*   id: Int,
+    \*   rows: Seq([col: Str -> Int | Str]),
+    \*   sort_schema: Seq(<<Str, Str>>),
+    \*   metadata_sort_schema: Seq(<<Str, Str>>),
+    \*   kv_sort_schema: Seq(<<Str, Str>>),
+    \*   sorting_columns_schema: Seq(<<Str, Str>>),
+    \*   columns_present: Set(Str)
+    \* ]);
+    \* The set of splits in object storage
+    splits,
+
+    \* @type: Int;
+    \* Counter for generating unique split IDs
+    next_split_id,
+
+    \* @type: Int;
+    \* Counter for schema changes (bounded)
+    schema_change_count,
+
+    \* @type: Int -> Seq(<<Str, Str>>);
+    \* Historical record: maps split ID to the sort schema at time of write
+    \* Used to verify SS-4 (immutability)
+    split_schema_history
+
+\* Tuple of every variable, in declaration order.
+vars == <<metastore_schema, splits, next_split_id, schema_change_count, split_schema_history>>
+
+\* ============================================================================
+\* Constants and Value Domains
+\* ============================================================================
+
+\* Possible directions for sort columns
+Directions == {"asc", "desc"}
+
+\* A special sentinel value representing NULL.
+\* Must be an integer outside the normal value range so that
+\* TLC can compare it with < and > without type errors.
+NULL == -999
+
+\* Non-null values are modeled as integers from a small domain
+\* This keeps the state space finite and manageable
+Values == {1, 2, 3}
+
+\* The full value domain including NULL
+ValuesWithNull == Values \cup {NULL}
+
+\* All possible sort schemas: sequences of <<column, direction>> pairs
+\* We limit to schemas of length 0, 1, or 2 to keep state space bounded
+\* (the empty schema <<>> means "no sort order required").
+AllSortSchemas ==
+    { <<>> } \cup
+    { <<<<c, d>>>> : c \in Columns, d \in Directions } \cup
+    { <<<<c1, d1>>, <<c2, d2>>>> : c1 \in Columns, c2 \in Columns, d1 \in Directions, d2 \in Directions }
+
+\* ============================================================================
+\* Helper Operators
+\* ============================================================================
+
+\* Get the value of a column from a row, treating missing columns as NULL
+\* This implements SS-3: missing columns are NULL
+\* row[col] is read only when col is in columns_present.
+GetValue(row, col, columns_present) ==
+    IF col \in columns_present
+    THEN row[col]
+    ELSE NULL
+
+\* Compare two values for a single column with null ordering rules (SS-2)
+\* Returns: -1 (less), 0 (equal), 1 (greater)
+\*
+\* For ascending: nulls sort AFTER non-null (nulls are "greater")
+\* For descending: nulls sort BEFORE non-null (nulls are "lesser")
+\* Two NULLs compare equal. The < and > comparisons are applied only
+\* when both values are non-null.
+CompareValues(v1, v2, direction) ==
+    CASE v1 = NULL /\ v2 = NULL -> 0
+      [] v1 = NULL /\ v2 /= NULL ->
+            IF direction = "asc" THEN 1   \* null after non-null for ascending
+            ELSE -1                       \* null before non-null for descending
+      [] v1 /= NULL /\ v2 = NULL ->
+            IF direction = "asc" THEN -1  \* non-null before null for ascending
+            ELSE 1                        \* non-null after null for descending
+      [] v1 /= NULL /\ v2 /= NULL ->
+            IF direction = "asc"
+            THEN (CASE v1 < v2 -> -1 [] v1 = v2 -> 0 [] v1 > v2 -> 1)
+            ELSE (CASE v1 > v2 -> -1 [] v1 = v2 -> 0 [] v1 < v2 -> 1)
+
+\* Compare two rows lexicographically according to a sort schema
+\* Returns TRUE if row1 should come before or equal to row2
+\* columns_present is the set of columns that exist in the split
+RECURSIVE RowLEQ(_, _, _, _)
+RowLEQ(row1, row2, schema, columns_present) ==
+    \* No sort columns left to compare: the rows tie, so any order is valid.
+    IF Len(schema) = 0
+    THEN TRUE
+    ELSE
+        LET leading == Head(schema)   \* <<column, direction>> of the first sort key
+            outcome == CompareValues(
+                           GetValue(row1, leading[1], columns_present),
+                           GetValue(row2, leading[1], columns_present),
+                           leading[2])
+        IN  CASE outcome = -1 -> TRUE    \* row1 strictly before row2 on this key
+              [] outcome = 1  -> FALSE   \* row1 strictly after row2 on this key
+              \* Tie on this key: the remaining keys decide.
+              [] OTHER        -> RowLEQ(row1, row2, Tail(schema), columns_present)
+
+\* A sequence of rows is sorted under a schema when every adjacent pair is
+\* in order. Sequences with fewer than two rows are trivially sorted.
+IsSorted(rows, schema, columns_present) ==
+    \A i \in 1..Len(rows) :
+        (i < Len(rows)) => RowLEQ(rows[i], rows[i + 1], schema, columns_present)
+
+\* Sort a sequence of rows by the schema (specification-level sort)
+\* We define this as: there EXISTS a permutation of the input that is sorted
+\* For model checking, we verify the result IS sorted rather than computing it
+\*
+\* Instead of computing a sort, we non-deterministically choose rows that form
+\* a valid sorted sequence. The key insight: we check that IngestBatch produces
+\* splits whose rows ARE sorted; we don't need to compute the sort ourselves.
+
+\* Two sequences are permutations of each other when they have the same
+\* length and every element of s1 occurs equally often in both.
+IsPermutation(s1, s2) ==
+    LET \* Number of positions of seq that hold the value x.
+        Occurrences(seq, x) == Cardinality({j \in 1..Len(seq) : seq[j] = x})
+    IN  /\ Len(s1) = Len(s2)
+        /\ \A i \in 1..Len(s1) :
+               Occurrences(s1, s1[i]) = Occurrences(s2, s1[i])
+
+\* ============================================================================
+\* Type Invariant
+\* ============================================================================
+
+TypeInvariant ==
+    LET IsSchema(x) == x \in AllSortSchemas
+        \* Shape of one split record: id, the four schema fields, the
+        \* column set, and the row-count bound.
+        WellTypedSplit(s) ==
+            /\ s.id \in Nat
+            /\ IsSchema(s.sort_schema)
+            /\ IsSchema(s.metadata_sort_schema)
+            /\ IsSchema(s.kv_sort_schema)
+            /\ IsSchema(s.sorting_columns_schema)
+            /\ s.columns_present \subseteq Columns
+            /\ Len(s.rows) <= RowsPerSplitMax
+    IN  /\ IsSchema(metastore_schema)
+        /\ \A s \in splits : WellTypedSplit(s)
+        /\ next_split_id \in Nat
+        /\ schema_change_count \in 0..SchemaChangesMax
+
+\* ============================================================================
+\* Safety Properties (SS-1 through SS-5)
+\* ============================================================================
+
+\* SS-1: No split holds rows that are out of order with respect to the
+\*       sort schema recorded on that split.
+SS1_RowsSorted ==
+    ~ \E s \in splits :
+        ~ IsSorted(s.rows, s.sort_schema, s.columns_present)
+
+\* SS-2: Null values are ordered correctly per column direction
+\*       This is enforced by the CompareValues function used in IsSorted.
+\*       We verify it explicitly: for every adjacent pair of rows, when all
+\*       earlier sort columns are equal, if one value is NULL and the other
+\*       is not, the NULL must be in the correct position:
+\*       - Ascending: NULL comes AFTER non-null (nulls last)
+\*       - Descending: NULL comes BEFORE non-null (nulls first)
+SS2_NullOrdering ==
+    \A s \in splits :
+        LET schema == s.sort_schema
+            \* Value of column c in row r, with absent columns read as NULL.
+            Cell(r, c) == GetValue(s.rows[r], c, s.columns_present)
+            \* Rows r and r + 1 agree on every sort column before position k.
+            PrefixTied(r, k) ==
+                \A j \in 1..(k - 1) :
+                    Cell(r, schema[j][1]) = Cell(r + 1, schema[j][1])
+        IN  \A i \in 1..(Len(s.rows) - 1), k \in 1..Len(schema) :
+                PrefixTied(i, k) =>
+                    \* Ascending: a NULL may only be followed by another NULL.
+                    /\ ((schema[k][2] = "asc" /\ Cell(i, schema[k][1]) = NULL) => (Cell(i + 1, schema[k][1]) = NULL))
+                    \* Descending: a NULL may only be preceded by another NULL.
+                    /\ ((schema[k][2] = "desc" /\ Cell(i + 1, schema[k][1]) = NULL) => (Cell(i, schema[k][1]) = NULL))
+
+\* SS-3: A sort column that is absent from a split's data reads as NULL in
+\*       every row of that split; absence is not an error.
+\*       Checked by asking GetValue for each such column in each row.
+SS3_MissingColumnsAreNull ==
+    \A s \in splits :
+        \A k \in DOMAIN s.sort_schema :
+            (s.sort_schema[k][1] \notin s.columns_present) =>
+                \A r \in DOMAIN s.rows :
+                    GetValue(s.rows[r], s.sort_schema[k][1], s.columns_present) = NULL
+
+\* SS-4: A split's sort schema never changes after it is written.
+\*       Verified by comparing current schema to the historical record.
+SS4_SchemaImmutable ==
+    \A s \in splits :
+        \* split_schema_history maps a split id to the schema recorded when
+        \* that split was created; a split whose id is absent is not checked.
+        s.id \in DOMAIN split_schema_history =>
+            split_schema_history[s.id] = s.sort_schema
+
+\* SS-5: For every split, the three copies of sort schema are identical:
+\*       metadata_sort_schema, kv_sort_schema, sorting_columns_schema
+SS5_SchemaConsistency ==
+    \A s \in splits :
+        /\ s.sort_schema = s.metadata_sort_schema
+        /\ s.sort_schema = s.kv_sort_schema
+        /\ s.sort_schema = s.sorting_columns_schema
+
+\* Combined safety property: the conjunction of SS-1 through SS-5.
+\* SS-2 is included so that checking `Safety` alone covers every property
+\* that the .cfg files list individually.
+Safety ==
+    /\ SS1_RowsSorted
+    /\ SS2_NullOrdering
+    /\ SS3_MissingColumnsAreNull
+    /\ SS4_SchemaImmutable
+    /\ SS5_SchemaConsistency
+
+\* ============================================================================
+\* Actions
+\* ============================================================================
+
+\* --- IngestBatch ---
+\* The indexing pipeline reads the current schema from the metastore,
+\* receives a batch of rows, sorts them according to the schema, and
+\* writes a split with the schema recorded in all three locations.
+\*
+\* We model this by non-deterministically choosing:
+\*   - A subset of columns present in this batch (may not include all sort columns)
+\*   - A set of rows (values for each present column, NULL for absent columns)
+\*   - The rows are sorted according to the current metastore schema
+\*
+\* Enabled only while fewer than SplitsMax splits exist. Leaves the metastore
+\* schema and the schema-change counter untouched.
+IngestBatch ==
+    /\ Cardinality(splits) < SplitsMax
+    /\ \E columns_present \in SUBSET Columns :
+        \E n \in 1..RowsPerSplitMax :
+            \* Non-deterministically choose a sorted sequence of rows
+            \E rows \in [1..n -> [columns_present -> ValuesWithNull]] :
+                LET current_schema == metastore_schema
+                    new_id == next_split_id
+                    \* Build full rows that include all columns (present ones from data,
+                    \* absent ones implicitly NULL via GetValue)
+                    new_split == [
+                        id |-> new_id,
+                        rows |-> rows,
+                        sort_schema |-> current_schema,
+                        metadata_sort_schema |-> current_schema,
+                        kv_sort_schema |-> current_schema,
+                        sorting_columns_schema |-> current_schema,
+                        columns_present |-> columns_present
+                    ]
+                IN
+                    \* The rows must be sorted according to the schema
+                    /\ IsSorted(rows, current_schema, columns_present)
+                    /\ splits' = splits \cup {new_split}
+                    /\ next_split_id' = next_split_id + 1
+                    \* new_id is fresh, so @@ only adds a key and never overrides one
+                    /\ split_schema_history' = split_schema_history @@ (new_id :> current_schema)
+                    /\ UNCHANGED <<metastore_schema, schema_change_count>>
+
+\* --- ChangeSchema ---
+\* An operator updates the sort schema in the metastore.
+\* Already-written splits are NOT affected (SS-4).
+\* Bounded by SchemaChangesMax; the new schema must differ from the current one.
+ChangeSchema ==
+    /\ schema_change_count < SchemaChangesMax
+    /\ \E new_schema \in AllSortSchemas :
+        /\ new_schema /= metastore_schema
+        /\ metastore_schema' = new_schema
+        /\ schema_change_count' = schema_change_count + 1
+        /\ UNCHANGED <<splits, next_split_id, split_schema_history>>
+
+\* --- CompactSplits ---
+\* Merges two or more splits that share the same sort_schema.
+\* The output split contains the union of all rows, re-sorted by the schema.
+\* Column sets may differ (SS-3, MC-4 from ADR-003): output has union of columns.
+\*
+\* For model checking tractability, we merge exactly two splits.
+CompactSplits ==
+    /\ \E s1 \in splits :
+        \E s2 \in splits :
+            /\ s1 /= s2
+            \* Only merge splits with the same sort schema (CS-1 from ADR-003)
+            /\ s1.sort_schema = s2.sort_schema
+            /\ LET merged_schema == s1.sort_schema
+                   merged_columns == s1.columns_present \cup s2.columns_present
+                   n1 == Len(s1.rows)
+                   all_row_count == n1 + Len(s2.rows)
+                   \* Canonical input rows: s1's rows followed by s2's rows, each
+                   \* widened to merged_columns. GetValue supplies the value for a
+                   \* column the source split did not carry.
+                   input_rows == [i \in 1..all_row_count |->
+                                     [c \in merged_columns |->
+                                         IF i <= n1
+                                         THEN GetValue(s1.rows[i], c, s1.columns_present)
+                                         ELSE GetValue(s2.rows[i - n1], c, s2.columns_present)]]
+               IN
+                   /\ all_row_count <= RowsPerSplitMax
+                   /\ \E merged_rows \in [1..all_row_count -> [merged_columns -> ValuesWithNull]] :
+                       LET new_id == next_split_id
+                           new_split == [
+                               id |-> new_id,
+                               rows |-> merged_rows,
+                               sort_schema |-> merged_schema,
+                               metadata_sort_schema |-> merged_schema,
+                               kv_sort_schema |-> merged_schema,
+                               sorting_columns_schema |-> merged_schema,
+                               columns_present |-> merged_columns
+                           ]
+                       IN
+                           \* The merged rows must be sorted
+                           /\ IsSorted(merged_rows, merged_schema, merged_columns)
+                           \* MC-1: the output is a permutation of the canonical input
+                           \* rows, so no row is added, removed, or duplicated. Checking
+                           \* only that each output row occurs in some input would still
+                           \* allow one row to be dropped while another is repeated.
+                           /\ IsPermutation(merged_rows, input_rows)
+                           \* Remove old splits, add new one
+                           /\ splits' = (splits \ {s1, s2}) \cup {new_split}
+                           /\ next_split_id' = next_split_id + 1
+                           /\ split_schema_history' = split_schema_history @@ (new_id :> merged_schema)
+                           /\ UNCHANGED <<metastore_schema, schema_change_count>>
+
+\* ============================================================================
+\* Initial State
+\* ============================================================================
+
+\* The initial sort schema is chosen non-deterministically from AllSortSchemas,
+\* so the model checker explores every possible starting schema.
+Init ==
+    /\ metastore_schema \in AllSortSchemas
+    /\ splits = {}
+    /\ next_split_id = 1
+    /\ schema_change_count = 0
+    /\ split_schema_history = <<>>
+
+\* ============================================================================
+\* Next-State Relation
+\* ============================================================================
+
+Next ==
+    \/ IngestBatch
+    \/ ChangeSchema
+    \/ CompactSplits
+
+\* ============================================================================
+\* Specification
+\* ============================================================================
+
+Spec == Init /\ [][Next]_vars
+
+\* ============================================================================
+\* Liveness Properties
+\* ============================================================================
+
+\* If there are mergeable splits (same schema), compaction is eventually possible.
+\* This is a weak liveness property: we only require that the system does not
+\* get permanently stuck — compaction CAN happen, not that it MUST.
+\* (In practice, a compaction scheduler drives this.)
+\* NOTE(review): CompactSplits is also guarded by
+\* all_row_count <= RowsPerSplitMax, so two same-schema splits whose combined
+\* row count exceeds that bound never enable it. This property is not listed in
+\* the small .cfg below (INVARIANTS only) - confirm it holds before enabling it.
+Liveness ==
+    []<>(\A s1 \in splits : \A s2 \in splits :
+        (s1 /= s2 /\ s1.sort_schema = s2.sort_schema) =>
+            ENABLED CompactSplits)
+
+====
diff --git a/docs/internals/specs/tla/SortSchema_small.cfg b/docs/internals/specs/tla/SortSchema_small.cfg
new file mode 100644
index 00000000000..73c4e17daea
--- /dev/null
+++ b/docs/internals/specs/tla/SortSchema_small.cfg
@@ -0,0 +1,24 @@
+\* TLC Configuration for SortSchema.tla (small model for quick iteration)
+\* Reduced constants: 1 column, up to 2 rows per split, up to 2 splits, 1 schema change
+\*
+\* Expected runtime: seconds
+\* Run:
+\*   tlc -config docs/internals/specs/tla/SortSchema_small.cfg docs/internals/specs/tla/SortSchema.tla
+
+CONSTANTS
+    Columns = {c1}
+    RowsPerSplitMax = 2
+    SplitsMax = 2
+    SchemaChangesMax = 1
+
+CHECK_DEADLOCK FALSE
+
+SPECIFICATION Spec
+
+INVARIANTS
+    TypeInvariant
+    SS1_RowsSorted
+    SS2_NullOrdering
+    SS3_MissingColumnsAreNull
+    SS4_SchemaImmutable
+    SS5_SchemaConsistency
diff --git a/docs/internals/specs/tla/TimeWindowedCompaction.cfg b/docs/internals/specs/tla/TimeWindowedCompaction.cfg
new file mode 100644
index 00000000000..f2ba00e04b7
--- /dev/null
+++ b/docs/internals/specs/tla/TimeWindowedCompaction.cfg
@@ -0,0 +1,34 @@
+\* TLC Configuration for TimeWindowedCompaction.tla
+\* Full verification: explores the complete state space.
+\*
+\* Expected runtime: ~minutes with 4 workers.
+ +CONSTANTS + Timestamps = {0, 1, 2, 3} + AllColumns = {m, v, h} + Scopes = {s1, s2} + WindowDuration = 2 + HourSeconds = 4 + CompactionStartTime = 0 + LateDataAcceptanceWindow = 3 + MaxTime = 3 + MaxPoints = 4 + MaxCompactions = 3 + SortKeys = {1, 2, 3} + +CHECK_DEADLOCK FALSE + +SPECIFICATION Spec + +INVARIANTS + TypeInvariant + TW1_OneWindowPerSplit + TW2_DurationDividesHour + TW3_NoCrossWindowMerge + CS1_ScopeCompatibility + CS2_SameWindowStart + CS3_CompactionStartTime + MC1_RowSetPreserved + MC2_RowContentsPreserved + MC3_SortOrderPreserved + MC4_ColumnUnion diff --git a/docs/internals/specs/tla/TimeWindowedCompaction.tla b/docs/internals/specs/tla/TimeWindowedCompaction.tla new file mode 100644 index 00000000000..8f5b1266b20 --- /dev/null +++ b/docs/internals/specs/tla/TimeWindowedCompaction.tla @@ -0,0 +1,674 @@ +---- MODULE TimeWindowedCompaction ---- +\* Formal specification for time-windowed sorted compaction of Parquet splits. +\* +\* Models the invariants from ADR-003: Time-Windowed Sorted Compaction for Parquet. +\* The system ingests data points with timestamps, assigns them to time windows, +\* produces sorted splits, and compacts splits within the same window while +\* preserving row sets, row contents, sort order, and column unions. 
+\* +\* Key Invariants: +\* TW-1: Every split belongs to exactly one time window +\* TW-2: window_duration evenly divides one hour (3600 seconds) +\* TW-3: Data is never merged across window boundaries +\* CS-1: Only splits sharing all six scope components may be merged +\* CS-2: Within a scope, only splits with same window_start are merged +\* CS-3: Splits before compaction_start_time are never compacted +\* MC-1: Row multiset preserved through compaction (no add/remove/duplicate) +\* MC-2: Row contents unchanged through compaction (except bookkeeping) +\* MC-3: Output is sorted according to the sort schema +\* MC-4: Column set is the union of input column sets (nulls fill gaps) +\* +\* Signal Applicability: +\* - Metrics: Primary target. Parquet splits, DataFusion queries, time-series data. +\* - Traces: Same invariants apply. Tantivy splits would use same windowing. +\* - Logs: Same invariants apply. Time-windowed compaction generalizes. +\* +\* TLA+-to-Rust Mapping: +\* | TLA+ Concept | Rust Implementation | +\* |-----------------------------|----------------------------------------------------------| +\* | `ObjectStorage` | S3/GCS split files + `metrics_splits` PostgreSQL table | +\* | `Split.scope` | `(index_uid, source_id, partition_id, | +\* | | doc_mapping_uid, sort_schema, window_duration)` | +\* | `Split.window_start` | `MetricsSplitMetadata.window_start` | +\* | `Split.rows` | RecordBatch rows in Parquet file | +\* | `Split.columns` | Parquet column names (Arrow schema) | +\* | `Split.sorted` | Result of `lexsort_to_indices` applied to sort columns | +\* | `IngestPoint` | OTLP ingest -> MetricsDocProcessor -> MetricsIndexer | +\* | `FlushSplit` | MetricsIndexer -> MetricsUploader -> MetricsPublisher | +\* | `CompactWindow` | Parquet merge planner -> merge executor -> publisher | +\* | `WindowDuration` | Index-level config `window_duration_secs` | +\* | `CompactionStartTime` | Index-level config `compaction_start_time` | +\* | `LateDataAcceptanceWindow` 
| Index-level config `late_data_acceptance_window` | +\* | `SortOrder` | Sort schema string, e.g. "metric_name|tag_host|ts-/V2" | +\* | `IsSorted(seq)` | Arrow `lexsort_to_indices` produces identity permutation | +\* | `BagUnion` | k-way merge output == union of input RecordBatches | + +EXTENDS Integers, Sequences, FiniteSets, TLC + +\* ============================================================================ +\* CONSTANTS +\* ============================================================================ + +CONSTANTS + \* Set of possible timestamps (abstract time units). + \* Example: {0, 1, 2, 3} + Timestamps, + + \* Set of possible column names in data points. + \* Example: {"metric_name", "value", "host"} + AllColumns, + + \* Set of possible scope values. + \* Each scope is a record representing the 6-part compatibility key. + \* For model checking, we use a small set of distinct scopes. + \* Example: {<<"idx1", "src1", "p1", "dm1", "sort1", 2>>} + Scopes, + + \* The active window duration (in abstract time units). + \* Must evenly divide HourSeconds. + WindowDuration, + + \* Abstract representation of "one hour" in model time units. + \* WindowDuration must evenly divide this. + HourSeconds, + + \* The compaction start time. Splits with window_start < this are + \* not eligible for compaction. + CompactionStartTime, + + \* The late data acceptance window. Points with timestamp < (CurrentTime - this) + \* are dropped at ingestion. + LateDataAcceptanceWindow, + + \* The current time (abstract). Advances via the model. + \* In the small model this is bounded to keep the state space finite. + MaxTime, + + \* Maximum number of data points that can be ingested (bounds state space). + MaxPoints, + + \* Maximum number of compaction steps (bounds state space). + MaxCompactions, + + \* A sort key extraction function constant. + \* In the model, each data point has a "sort_key" field used for ordering. + \* This is implicit in the row structure. 
+
+    \* Set of possible sort key values (for abstract ordering).
+    SortKeys
+
+\* ============================================================================
+\* VARIABLES
+\* ============================================================================
+
+VARIABLES
+    \* The current time in the system (monotonically non-decreasing).
+    currentTime,
+
+    \* Object storage: a set of split records.
+    \* Each split is a record:
+    \*   [ id : Nat,
+    \*     scope : Scope,
+    \*     window_start : Int,
+    \*     rows : Sequence of Row records,
+    \*     columns : Set of column names,
+    \*     sorted : BOOLEAN ]
+    \*
+    \* Each Row is:
+    \*   [ point_id : Nat,       \* unique identity for MC-2 tracking
+    \*     timestamp : Int,
+    \*     sort_key : SortKey,   \* abstract sort key for MC-3
+    \*     columns : Set of column names this row has values for,
+    \*     values : column name -> value mapping (for MC-2) ]
+    objectStorage,
+
+    \* Ingestion buffer: accumulates points per (scope, window_start) before flush.
+    \* A function from <<scope, window_start>> to a sequence of Row records.
+    ingestBuffer,
+
+    \* Counter for generating unique split IDs.
+    nextSplitId,
+
+    \* Counter for generating unique point IDs.
+    nextPointId,
+
+    \* The total number of points ingested so far (for bounding).
+    pointsIngested,
+
+    \* The total number of compaction steps performed (for bounding).
+    compactionsPerformed,
+
+    \* History: tracks all rows ever created (by point_id) for MC-2 verification.
+    \* Maps point_id -> original row record.
+    rowHistory,
+
+    \* Ghost variable: records each compaction's input/output for invariant checking.
+    \* Set of records, as built by the compaction actions:
+    \*   [ inputSplitIds : Set of split IDs,
+    \*     outputSplitId : split ID,
+    \*     inputBag : point_id -> count over the input rows,
+    \*     outputBag : point_id -> count over the output rows,
+    \*     inputScopes : input split ID -> scope,
+    \*     inputWindowStarts : input split ID -> window_start,
+    \*     outputColumns : Set of column names,
+    \*     inputColumnUnion : Set of column names ]
+    compactionLog
+
+\* Tuple of all state variables, used as the stuttering subscript in Spec.
+vars == <<currentTime, objectStorage, ingestBuffer, nextSplitId, nextPointId,
+          pointsIngested, compactionsPerformed, rowHistory, compactionLog>>
+
+\* ============================================================================
+\* HELPERS
+\* ============================================================================
+
+\* Compute the window start for a given timestamp.
+WindowStart(t) == t - (t % WindowDuration)
+
+\* The finite set of valid window starts derived from the timestamp domain.
+\* Used in quantifiers instead of Int to keep TLC enumerable.
+ValidWindowStarts == {WindowStart(t) : t \in Timestamps}
+
+\* Check whether a sequence is sorted by sort_key (ascending).
+\* An empty or single-element sequence is sorted.
+IsSorted(seq) ==
+    \A i \in 1..(Len(seq) - 1) : seq[i].sort_key <= seq[i + 1].sort_key
+
+\* Bags (multisets) of rows.
+\* A bag is represented as a function point_id -> count, whose domain is the
+\* set of point_ids that occur at least once. point_ids are intended to be
+\* unique, so every count should be 1, but counts are computed explicitly so
+\* that a duplicated row shows up as a count of 2 and fails the bag comparison.
+\*
+\* The standard Bags module is deliberately not used; bags are modeled
+\* manually as functions over point_id domains.
+
+\* Extract the set of point_ids from a sequence of rows.
+PointIdSet(rows) == {rows[i].point_id : i \in 1..Len(rows)}
+
+\* Count occurrences of a point_id in a sequence of rows.
+CountInSeq(pid, rows) ==
+    LET indices == {i \in 1..Len(rows) : rows[i].point_id = pid}
+    IN Cardinality(indices)
+
+\* Build a bag (point_id -> count) from a sequence of rows.
+BagOfSeq(rows) ==
+    [pid \in PointIdSet(rows) |-> CountInSeq(pid, rows)]
+
+\* Check if two bags are equal (same domain, same counts).
+\* dom1 and dom2 are the domains of bag1 and bag2, passed in by the caller.
+BagsEqual(bag1, dom1, bag2, dom2) ==
+    /\ dom1 = dom2
+    /\ \A pid \in dom1 : bag1[pid] = bag2[pid]
+
+\* Concatenate two sequences.
+SeqConcat(s1, s2) ==
+    [i \in 1..(Len(s1) + Len(s2)) |->
+        IF i <= Len(s1) THEN s1[i] ELSE s2[i - Len(s1)]]
+
+\* Merge sort two sorted sequences into one sorted sequence.
+\* This is a recursive definition; TLC handles it for small sequences.
+\* Repeatedly emits the smaller head; on equal sort_keys the head of s1 goes first.
+RECURSIVE MergeSort(_, _)
+MergeSort(s1, s2) ==
+    IF Len(s1) = 0 THEN s2
+    ELSE IF Len(s2) = 0 THEN s1
+    ELSE IF Head(s1).sort_key <= Head(s2).sort_key
+         THEN <<Head(s1)>> \o MergeSort(Tail(s1), s2)
+         ELSE <<Head(s2)>> \o MergeSort(s1, Tail(s2))
+
+\* Multi-way merge: merge a set of sorted sequences into one sorted sequence.
+\* We do this by folding pairwise merges.
+\* The argument is a SET of sequences, so two identical input sequences would
+\* collapse into one; callers pass row sequences that differ by point_id.
+RECURSIVE MultiMerge(_)
+MultiMerge(seqSet) ==
+    IF seqSet = {} THEN <<>>
+    ELSE LET s == CHOOSE s \in seqSet : TRUE
+         IN MergeSort(s, MultiMerge(seqSet \ {s}))
+
+\* Insert an element into a sorted sequence at the correct position.
+\* The element is placed before the first entry whose sort_key is >= its own.
+RECURSIVE SortedInsert(_, _)
+SortedInsert(elem, seq) ==
+    IF Len(seq) = 0 THEN <<elem>>
+    ELSE IF elem.sort_key <= Head(seq).sort_key
+         THEN <<elem>> \o seq
+         ELSE <<Head(seq)>> \o SortedInsert(elem, Tail(seq))
+
+\* Insert-sort a sequence by sort_key (for initial split creation).
+RECURSIVE InsertionSort(_)
+InsertionSort(seq) ==
+    IF Len(seq) <= 1 THEN seq
+    ELSE SortedInsert(Head(seq), InsertionSort(Tail(seq)))
+
+\* Compute the union of column sets across a set of splits.
+ColumnUnion(splits) ==
+    UNION {s.columns : s \in splits}
+
+\* Concatenate the rows from a set of splits into one sequence.
+\* Used for building the input bag independently of the merge output.
+\* The order in which splits are visited is whatever CHOOSE picks; the result
+\* is only used to build a bag, where order does not matter.
+RECURSIVE ConcatSplitRows(_)
+ConcatSplitRows(splitSet) ==
+    IF splitSet = {} THEN <<>>
+    ELSE LET sp == CHOOSE sp \in splitSet : TRUE
+         IN SeqConcat(sp.rows, ConcatSplitRows(splitSet \ {sp}))
+
+\* ============================================================================
+\* TYPE INVARIANT
+\* ============================================================================
+
+\* Bounds the counters and the metadata fields of every stored split.
+\* ingestBuffer, rowHistory and compactionLog are not constrained here.
+TypeInvariant ==
+    /\ currentTime \in 0..MaxTime
+    /\ nextSplitId \in Nat
+    /\ nextPointId \in Nat
+    /\ pointsIngested \in 0..MaxPoints
+    /\ compactionsPerformed \in 0..MaxCompactions
+    /\ \A split \in objectStorage :
+        /\ split.id \in Nat
+        /\ split.scope \in Scopes
+        /\ split.window_start \in Int
+        /\ split.columns \subseteq AllColumns
+        /\ split.sorted \in BOOLEAN
+        \* a stored split is never empty
+        /\ Len(split.rows) >= 1
+
+\* ============================================================================
+\* SAFETY PROPERTIES (INVARIANTS)
+\* ============================================================================
+
+\* ---------------------------------------------------------------------------
+\* TW-1: Every split in object storage belongs to exactly one time window.
+\*       All rows in a split have the same window_start as the split metadata.
+\* ---------------------------------------------------------------------------
+TW1_OneWindowPerSplit ==
+    \A split \in objectStorage :
+        \A i \in 1..Len(split.rows) :
+            WindowStart(split.rows[i].timestamp) = split.window_start
+
+\* ---------------------------------------------------------------------------
+\* TW-2: window_duration must evenly divide one hour (3600 seconds).
+\*       This is a configuration constraint checked as an invariant.
+\*       It involves only the constants HourSeconds and WindowDuration, so it
+\*       has the same truth value in every state.
+\* ---------------------------------------------------------------------------
+TW2_DurationDividesHour ==
+    HourSeconds % WindowDuration = 0
+
+\* ---------------------------------------------------------------------------
+\* TW-3: Data is never merged across window boundaries.
+\*       For every compaction log entry, all input splits have the same
+\*       window_start, and the output split has that same window_start.
+\*       Checked via the compaction log's recorded window_starts, and
+\*       verified that the output split in storage (if still present)
+\*       matches. This is implied by CS-2 but checked explicitly as a
+\*       separate invariant per the ADR.
+\*       entry.inputWindowStarts is a function from input split id to the
+\*       window_start that split had when it was compacted.
+\* ---------------------------------------------------------------------------
+TW3_NoCrossWindowMerge ==
+    \A entry \in compactionLog :
+        \* All input window_starts are identical (same check as CS-2,
+        \* but stated in terms of "no cross-window merge").
+        /\ \A id1, id2 \in entry.inputSplitIds :
+            entry.inputWindowStarts[id1] = entry.inputWindowStarts[id2]
+        \* The output split (if still in storage) has the same window_start.
+        /\ \A s \in objectStorage :
+            s.id = entry.outputSplitId =>
+                \A id \in entry.inputSplitIds :
+                    s.window_start = entry.inputWindowStarts[id]
+
+\* ---------------------------------------------------------------------------
+\* CS-1: Only splits sharing all six scope components may be merged.
+\*       Every compaction log entry's input splits all share the same scope.
+\*       entry.inputScopes is a function from input split id to that split's
+\*       scope; scopes are compared as whole values.
+\* ---------------------------------------------------------------------------
+CS1_ScopeCompatibility ==
+    \A entry \in compactionLog :
+        \A id1, id2 \in entry.inputSplitIds :
+            entry.inputScopes[id1] = entry.inputScopes[id2]
+
+\* ---------------------------------------------------------------------------
+\* CS-2: Within a compatibility scope, only splits with the same window_start
+\*       are merged. Every compaction log entry's input splits share window_start.
+\* ---------------------------------------------------------------------------
+\* Same formula as the first conjunct of TW3_NoCrossWindowMerge.
+CS2_SameWindowStart ==
+    \A entry \in compactionLog :
+        \A id1, id2 \in entry.inputSplitIds :
+            entry.inputWindowStarts[id1] = entry.inputWindowStarts[id2]
+
+\* ---------------------------------------------------------------------------
+\* CS-3: Splits produced before compaction_start_time are never compacted.
+\*       No compaction log entry includes a split with window_start < CompactionStartTime.
+\* ---------------------------------------------------------------------------
+CS3_CompactionStartTime ==
+    \A entry \in compactionLog :
+        \A id \in entry.inputSplitIds :
+            entry.inputWindowStarts[id] >= CompactionStartTime
+
+\* ---------------------------------------------------------------------------
+\* MC-1: The multiset (bag) of rows is identical before and after compaction.
+\*       The bag of point_ids in the output equals the bag-union of point_ids
+\*       from the inputs. No rows added, removed, or duplicated.
+\*       Both bags are read from the log entry written by the compaction
+\*       action; each bag's own DOMAIN is passed as its domain argument.
+\* ---------------------------------------------------------------------------
+MC1_RowSetPreserved ==
+    \A entry \in compactionLog :
+        BagsEqual(
+            entry.inputBag, DOMAIN entry.inputBag,
+            entry.outputBag, DOMAIN entry.outputBag
+        )
+
+\* ---------------------------------------------------------------------------
+\* MC-2: Row contents do not change during compaction.
+\*       For every point_id in a compaction output, its row values match
+\*       the original row as recorded in rowHistory.
+\* ---------------------------------------------------------------------------
+\* Quantifies over every split currently in object storage, not only over
+\* compaction outputs, and requires each row's point_id to be in rowHistory.
+MC2_RowContentsPreserved ==
+    \A split \in objectStorage :
+        \A i \in 1..Len(split.rows) :
+            LET row == split.rows[i]
+                pid == row.point_id
+            IN /\ pid \in DOMAIN rowHistory
+               /\ row.timestamp = rowHistory[pid].timestamp
+               /\ row.sort_key = rowHistory[pid].sort_key
+               /\ row.columns = rowHistory[pid].columns
+               /\ row.values = rowHistory[pid].values
+
+\* ---------------------------------------------------------------------------
+\* MC-3: The output of a merge is sorted according to the sort schema.
+\*       Every split marked as sorted has rows in non-decreasing sort_key order.
+\*       A split whose sorted flag is FALSE is not checked.
+\* ---------------------------------------------------------------------------
+MC3_SortOrderPreserved ==
+    \A split \in objectStorage :
+        split.sorted => IsSorted(split.rows)
+
+\* ---------------------------------------------------------------------------
+\* MC-4: If inputs have different column sets, the output contains the union
+\*       of all columns. Type conflicts are an error (modeled as the action
+\*       being disabled). Rows from inputs missing a column are filled with nulls.
+\*       We verify: output.columns = union of input columns.
+\* ---------------------------------------------------------------------------
+\* Compares the two column sets recorded in each compaction log entry.
+MC4_ColumnUnion ==
+    \A entry \in compactionLog :
+        entry.outputColumns = entry.inputColumnUnion
+
+\* ============================================================================
+\* INITIAL STATE
+\* ============================================================================
+
+\* Time starts at 0, storage and logs are empty, and the id counters start at 1.
+Init ==
+    /\ currentTime = 0
+    /\ objectStorage = {}
+    /\ ingestBuffer = [key \in {} |-> <<>>]  \* empty function
+    /\ nextSplitId = 1
+    /\ nextPointId = 1
+    /\ pointsIngested = 0
+    /\ compactionsPerformed = 0
+    /\ rowHistory = [pid \in {} |-> <<>>]  \* empty function
+    /\ compactionLog = {}
+
+\* ============================================================================
+\* ACTIONS
+\* ============================================================================
+
+\* ---------------------------------------------------------------------------
+\* AdvanceTime: Time progresses. Each step moves currentTime to a strictly
+\*   larger value, up to MaxTime; no other variable changes.
+\* ---------------------------------------------------------------------------
+AdvanceTime ==
+    /\ currentTime < MaxTime
+    /\ \E t \in (currentTime + 1)..MaxTime :
+        /\ currentTime' = t
+        /\ UNCHANGED <<objectStorage, ingestBuffer, nextSplitId, nextPointId,
+                       pointsIngested, compactionsPerformed, rowHistory,
+                       compactionLog>>
+
+\* ---------------------------------------------------------------------------
+\* IngestPoint: A data point arrives with a timestamp, sort key, column set,
+\*   and values. It is assigned to a window. If too old, it is dropped.
+\*
+\* Guards:
+\*   - pointsIngested < MaxPoints (bound state space)
+\*   - the point has at least one column
+\*   - timestamp is not in the future (ts <= currentTime)
+\*   - timestamp is not older than late_data_acceptance_window
+\*
+\* Effects:
+\*   - Adds the point to the ingest buffer for (scope, window_start)
+\*   - Records the row in rowHistory for MC-2 checking
+\* ---------------------------------------------------------------------------
+IngestPoint ==
+    /\ pointsIngested < MaxPoints
+    /\ \E ts \in Timestamps, sk \in SortKeys, scope \in Scopes, cols \in SUBSET AllColumns :
+        /\ cols # {}                                     \* must have at least one column
+        /\ ts <= currentTime                             \* cannot ingest future data
+        /\ ts >= currentTime - LateDataAcceptanceWindow  \* drop too-old data
+        /\ LET ws == WindowStart(ts)
+               \* Buffer key; FlushSplit reads it back as key[1] = scope, key[2] = ws.
+               key == <<scope, ws>>
+               pid == nextPointId
+               row == [point_id |-> pid,
+                       timestamp |-> ts,
+                       sort_key |-> sk,
+                       columns |-> cols,
+                       values |-> [c \in cols |-> <<pid, c>>]]  \* unique value per (point, col)
+               oldBuf == IF key \in DOMAIN ingestBuffer
+                         THEN ingestBuffer[key]
+                         ELSE <<>>
+           IN /\ ingestBuffer' = [k \in DOMAIN ingestBuffer \union {key} |->
+                                     IF k = key THEN Append(oldBuf, row)
+                                     ELSE ingestBuffer[k]]
+              /\ nextPointId' = nextPointId + 1
+              /\ pointsIngested' = pointsIngested + 1
+              /\ rowHistory' = [p \in DOMAIN rowHistory \union {pid} |->
+                                   IF p = pid THEN row ELSE rowHistory[p]]
+              /\ UNCHANGED <<currentTime, objectStorage, nextSplitId,
+                             compactionsPerformed, compactionLog>>
+
+\* ---------------------------------------------------------------------------
+\* FlushSplit: Write a split for one (scope, window_start) from the ingest
+\*   buffer. The split contains sorted rows belonging to exactly one
+\*   window.
+\*
+\* Guards:
+\*   - The ingest buffer for (scope, window_start) is non-empty
+\*
+\* Effects:
+\*   - Creates a new split in object storage with sorted rows
+\*   - Clears the ingest buffer for that key
+\* ---------------------------------------------------------------------------
+FlushSplit ==
+    \E key \in DOMAIN ingestBuffer :
+        /\ Len(ingestBuffer[key]) > 0
+        /\ LET scope == key[1]
+               ws == key[2]
+               rows == ingestBuffer[key]
+               sortedRows == InsertionSort(rows)
+               \* The split's column set is the union over its rows' column sets.
+               allCols == UNION {rows[i].columns : i \in 1..Len(rows)}
+               newSplit == [id |-> nextSplitId,
+                            scope |-> scope,
+                            window_start |-> ws,
+                            rows |-> sortedRows,
+                            columns |-> allCols,
+                            sorted |-> TRUE]
+           IN /\ objectStorage' = objectStorage \union {newSplit}
+              /\ nextSplitId' = nextSplitId + 1
+              \* Drop the key entirely rather than mapping it to an empty buffer.
+              /\ ingestBuffer' = [k \in DOMAIN ingestBuffer \ {key} |->
+                                     ingestBuffer[k]]
+              /\ UNCHANGED <<currentTime, nextPointId, pointsIngested,
+                             compactionsPerformed, rowHistory, compactionLog>>
+
+\* ---------------------------------------------------------------------------
+\* CompactWindow: Select two or more compatible splits in the same window,
+\*   merge them into one split.
+\*
+\* Guards:
+\*   - At least 2 splits share the same scope and window_start
+\*   - All selected splits have window_start >= CompactionStartTime (CS-3)
+\*   - All selected splits have the same scope (CS-1)
+\*   - All selected splits have the same window_start (CS-2)
+\*   - compactionsPerformed < MaxCompactions (bound state space)
+\*   - No type conflicts on column names (simplified: always compatible)
+\*
+\* Effects:
+\*   - Removes input splits from object storage
+\*   - Adds one output split with:
+\*     * rows = sorted merge of input rows (MC-1, MC-3)
+\*     * columns = union of input columns (MC-4)
+\*     * row contents unchanged (MC-2)
+\*   - Records the compaction in compactionLog
+\*
+\* This variant logs the SAME bag for input and output, so MC-1 holds trivially
+\* for its log entries. It is not part of Next; CompactWindowChecked is the
+\* action the specification uses.
+\* ---------------------------------------------------------------------------
+CompactWindow ==
+    /\ compactionsPerformed < MaxCompactions
+    /\ \E scope \in Scopes, ws \in ValidWindowStarts :
+        /\ ws >= CompactionStartTime  \* CS-3: skip pre-start splits
+        /\ LET candidates == {s \in objectStorage :
+                                 /\ s.scope = scope
+                                 /\ s.window_start = ws}
+           IN /\ Cardinality(candidates) >= 2
+              /\ \E mergeSplits \in SUBSET candidates :
+                  /\ Cardinality(mergeSplits) >= 2
+                  /\ LET \* Collect all input rows as sorted sequences
+                         inputSeqs == {s.rows : s \in mergeSplits}
+                         \* Perform multi-way sorted merge
+                         mergedRows == MultiMerge(inputSeqs)
+                         \* Compute column union (MC-4)
+                         allCols == ColumnUnion(mergeSplits)
+                         \* Build new split
+                         outputSplit == [id |-> nextSplitId,
+                                         scope |-> scope,
+                                         window_start |-> ws,
+                                         rows |-> mergedRows,
+                                         columns |-> allCols,
+                                         sorted |-> TRUE]
+                         \* Record compaction metadata
+                         mergeIds == {s.id : s \in mergeSplits}
+                         logEntry == [
+                             inputSplitIds |-> mergeIds,
+                             outputSplitId |-> nextSplitId,
+                             \* Both bags come from the merge output (see header).
+                             inputBag |-> BagOfSeq(mergedRows),
+                             outputBag |-> BagOfSeq(mergedRows),
+                             inputScopes |-> [id \in mergeIds |->
+                                 (CHOOSE s \in mergeSplits : s.id = id).scope],
+                             inputWindowStarts |-> [id \in mergeIds |->
+                                 (CHOOSE s \in mergeSplits : s.id = id).window_start],
+                             outputColumns |-> allCols,
+                             inputColumnUnion |-> allCols
+                         ]
+                     IN /\ objectStorage' = (objectStorage \ mergeSplits) \union {outputSplit}
+                        /\ nextSplitId' = nextSplitId + 1
+                        /\ compactionsPerformed' = compactionsPerformed + 1
+                        /\ compactionLog' = compactionLog \union {logEntry}
+                        /\ UNCHANGED <<currentTime, ingestBuffer, nextPointId,
+                                       pointsIngested, rowHistory>>
+
+\* Note on MC-1 bag verification in CompactWindow:
+\* The merged rows come from MultiMerge which only reorders (mergesorts) the
+\* input rows. It does not add, remove, or duplicate any row. The BagOfSeq
+\* for the merged output and the BagOfSeq for all input rows must be identical.
+\* Since MultiMerge is a pure reordering of the union of input sequences,
+\* the bags are structurally equal by construction. The compactionLog records
+\* both for the MC1_RowSetPreserved invariant to verify.
+\*
+\* To make the MC-1 check non-trivial (catching implementation bugs where
+\* MultiMerge might be incorrect), we compute input and output bags
+\* independently:
+
+\* ---------------------------------------------------------------------------
+\* CompactWindowChecked: Same as CompactWindow but builds input bag
+\*   by concatenating raw input sequences (not the merge output).
+\*   This is the version used for model checking.
+\* --------------------------------------------------------------------------- +CompactWindowChecked == + /\ compactionsPerformed < MaxCompactions + /\ \E scope \in Scopes, ws \in ValidWindowStarts : + /\ ws >= CompactionStartTime + /\ LET candidates == {s \in objectStorage : + /\ s.scope = scope + /\ s.window_start = ws} + IN /\ Cardinality(candidates) >= 2 + /\ \E mergeSplits \in SUBSET candidates : + /\ Cardinality(mergeSplits) >= 2 + /\ LET inputSeqs == {s.rows : s \in mergeSplits} + mergedRows == MultiMerge(inputSeqs) + allCols == ColumnUnion(mergeSplits) + outputSplit == [id |-> nextSplitId, + scope |-> scope, + window_start |-> ws, + rows |-> mergedRows, + columns |-> allCols, + sorted |-> TRUE] + mergeIds == {s.id : s \in mergeSplits} + \* Build input bag by concatenating raw input rows + \* (independently of the merge output, for MC-1) + inputRowsConcat == ConcatSplitRows(mergeSplits) + inputBag == BagOfSeq(inputRowsConcat) + outputBag == BagOfSeq(mergedRows) + logEntry == [ + inputSplitIds |-> mergeIds, + outputSplitId |-> nextSplitId, + inputBag |-> inputBag, + outputBag |-> outputBag, + inputScopes |-> [id \in mergeIds |-> + (CHOOSE s \in mergeSplits : s.id = id).scope], + inputWindowStarts |-> [id \in mergeIds |-> + (CHOOSE s \in mergeSplits : s.id = id).window_start], + outputColumns |-> allCols, + inputColumnUnion |-> allCols + ] + IN /\ objectStorage' = (objectStorage \ mergeSplits) \union {outputSplit} + /\ nextSplitId' = nextSplitId + 1 + /\ compactionsPerformed' = compactionsPerformed + 1 + /\ compactionLog' = compactionLog \union {logEntry} + /\ UNCHANGED <> + +\* --------------------------------------------------------------------------- +\* ChangeWindowDuration: Update the configured window duration. +\* In the model, WindowDuration is a CONSTANT, so we cannot change it. +\* Instead, we model window duration changes by having multiple scopes +\* with different window_duration components. 
Splits retain their original +\* scope (including window_duration), and the compatibility scope prevents +\* cross-duration merges. +\* +\* This action is a no-op in the TLA+ model because window_duration is +\* encoded in the scope constant. The invariant TW-2 verifies the +\* constraint for all scopes. +\* --------------------------------------------------------------------------- +\* ChangeWindowDuration is modeled implicitly through multiple scopes. +\* The scope tuple includes window_duration as the 6th component. +\* Different scopes with different window_duration values coexist. + +\* ============================================================================ +\* NEXT-STATE RELATION +\* ============================================================================ + +Next == + \/ AdvanceTime + \/ IngestPoint + \/ FlushSplit + \/ CompactWindowChecked + +\* ============================================================================ +\* SPECIFICATION +\* ============================================================================ + +Spec == Init /\ [][Next]_vars + +\* Fairness: ensure progress (for liveness, not safety). WF_vars(Next) only requires that some Next step eventually occurs while Next stays enabled; it does not single out FlushSplit. NOTE(review): confirm that is strong enough for the liveness properties, or add per-action fairness. +FairSpec == Init /\ [][Next]_vars /\ WF_vars(Next) + +\* ============================================================================ +\* ALL INVARIANTS (referenced from .cfg files) +\* ============================================================================ + +\* Combined safety invariant for convenience.
+Safety == + /\ TW1_OneWindowPerSplit + /\ TW2_DurationDividesHour + /\ TW3_NoCrossWindowMerge + /\ CS1_ScopeCompatibility + /\ CS2_SameWindowStart + /\ CS3_CompactionStartTime + /\ MC1_RowSetPreserved + /\ MC2_RowContentsPreserved + /\ MC3_SortOrderPreserved + /\ MC4_ColumnUnion + +\* ============================================================================ +\* LIVENESS PROPERTIES (optional, for checking progress) +\* ============================================================================ + +\* Eventually, if there are buffered points, they get flushed. +\* (Requires fairness.) NOTE(review): the quantifier bound DOMAIN ingestBuffer is a state-level expression; TLA+ defines bounded quantification over a temporal formula only for a constant set, so this presumably has to be rewritten over a constant key set before TLC can check it. TODO confirm. +EventualFlush == + \A key \in DOMAIN ingestBuffer : + Len(ingestBuffer[key]) > 0 ~> key \notin DOMAIN ingestBuffer + +==== diff --git a/docs/internals/specs/tla/TimeWindowedCompaction_small.cfg b/docs/internals/specs/tla/TimeWindowedCompaction_small.cfg new file mode 100644 index 00000000000..e6b6ebc5c92 --- /dev/null +++ b/docs/internals/specs/tla/TimeWindowedCompaction_small.cfg @@ -0,0 +1,34 @@ +\* TLC Configuration for TimeWindowedCompaction.tla (small model) +\* Quick iteration: minimal state space for fast feedback. +\* +\* Expected runtime: ~seconds, hundreds of states.
+ +CONSTANTS + Timestamps = {0, 1} + AllColumns = {m, v} + Scopes = {s1} + WindowDuration = 2 + HourSeconds = 4 + CompactionStartTime = 0 + LateDataAcceptanceWindow = 2 + MaxTime = 1 + MaxPoints = 2 + MaxCompactions = 1 + SortKeys = {1, 2} + +CHECK_DEADLOCK FALSE + +SPECIFICATION Spec + +INVARIANTS + TypeInvariant + TW1_OneWindowPerSplit + TW2_DurationDividesHour + TW3_NoCrossWindowMerge + CS1_ScopeCompatibility + CS2_SameWindowStart + CS3_CompactionStartTime + MC1_RowSetPreserved + MC2_RowContentsPreserved + MC3_SortOrderPreserved + MC4_ColumnUnion diff --git a/docs/internals/tantivy-parquet-architecture.md b/docs/internals/tantivy-parquet-architecture.md new file mode 100644 index 00000000000..aa1d6bdbd96 --- /dev/null +++ b/docs/internals/tantivy-parquet-architecture.md @@ -0,0 +1,210 @@ +# Combining Tantivy & Parquet for Logs & Traces + +**Date:** 2026-01-27 +**Updated:** 2026-01-29 (with experimental validation) +**Purpose:** Architectural design for replacing Tantivy fast fields with Parquet for logs and traces + +## Context + +Quickhouse-Pomsky currently uses Tantivy (full-text search engine) for logs and traces. We have a proven DataFusion/Parquet analytics engine at scale for metrics. This document proposes **replacing Tantivy's fast fields with Parquet** to enable unified analytics across all Quickhouse products (metrics, logs, and traces) while maintaining full-text search capabilities. + +**Key question:** What is the storage cost of replacing Tantivy fast fields with Parquet, and what benefits do we gain from DataFusion ecosystem integration? + +## Executive Summary + +**Proposal:** Replace Tantivy's fast fields (columnar indexes) with Parquet to unify logs/traces analytics with our proven metrics engine, while keeping Tantivy for full-text search and document storage. 
+ +**Architecture:** +- **Tantivy** (without fast fields): Full-text search indices + document store for `SELECT *` queries +- **Parquet**: Columnar analytics on structured fields (timestamps, service names, attributes, IDs) — replacing fast fields + +**Storage impact:** Experimentally validated at **+10.3% total storage overhead** for production-scale workloads. Parquet VARIANT (Map type) costs ~39% more than fast fields alone, but enables queryable semi-structured data and the entire DataFusion ecosystem. + +**Key benefits:** +- **Unified engine**: Single DataFusion query engine for metrics, logs, and traces +- **Ecosystem access**: Arrow Flight, Parquet modular encryption, Delta Lake, Iceberg compatibility +- **Proven at scale**: Metrics engine already handles PB-scale deployments +- **Query optimization**: Better pushdown, predicate reordering, join strategies than custom Tantivy code + +## 1. Architecture: Replacing Fast Fields with Parquet + +**Design principle:** Separate full-text search from columnar analytics: + +| System | What It Stores | Purpose | +|--------|---------------|---------| +| **Tantivy (no fast fields)** | Inverted indices + doc store + body text | Full-text search + `SELECT *` retrieval | +| **Parquet** | Structured columns only (replaces fast fields) | Columnar analytics + filtering via DataFusion | + +**Storage breakdown** (experimental, production-scale attributes): + +**Current: Tantivy with fast fields (691 MB)** +- Inverted indices + term dicts: 185 MB (27%) - Full-text search +- Fast fields: 185 MB (27%) - **[TO BE REPLACED]** +- JSON doc store: 321 MB (46%) - Full document retrieval + +**Proposed: Tantivy without fast fields + Parquet (762 MB)** +- Tantivy (no fast fields): 506 MB (66%) + - Inverted indices + term dicts: 185 MB + - JSON doc store: 321 MB (includes body field) +- Parquet VARIANT (replaces fast fields): 256 MB (34%) + - Map type for attributes: queryable key-value pairs + - Dictionary-encoded strings: service_name, 
severity_text + - Timestamps + IDs: timestamp_nanos, trace_id, span_id + - Bloom filters + page statistics + +**Net cost:** +71 MB (+10.3%) to replace fast fields with Parquet VARIANT and gain DataFusion ecosystem. + +## 2. What Goes in Parquet + +**Included (logs):** 16 of 17 OpenTelemetry fields +- Timestamps, IDs, service/scope names, severity +- Attributes as VARIANT (filterable JSON) +- Metadata counters + +**Excluded (logs):** Body field only +- Body is free-form text (100-1000+ bytes/log) +- Only used for full-text search (`WHERE body LIKE '%error%'`) +- Keeping in Tantivy avoids duplicating 30-40% of storage + +**Traces:** All 25 fields included in Parquet +- No single "body" field dominates storage like logs +- Events/links are structured arrays useful for analytics + +## 3. Query Patterns + +**Pure Parquet analytics** (no Tantivy needed): +```sql +-- Logs: Error rates by service +SELECT service_name, COUNT(*) FROM logs +WHERE severity >= 17 AND timestamp > '2025-01-01' +GROUP BY service_name; + +-- Traces: P95 latency +SELECT service_name, PERCENTILE_CONT(0.95, span_duration_millis) +FROM traces GROUP BY service_name; +``` + +**Hybrid queries** (Parquet + Tantivy): +```sql +-- Full-text search with structured filters +SELECT * FROM logs +WHERE service_name = 'api' -- Parquet filter (fast) + AND timestamp > '2025-01-01' -- Parquet filter (fast) + AND body LIKE '%connection timeout%' -- Tantivy FTS (slow) +``` + +**The Parquet-first optimization:** Apply structured filters in Parquet first to get candidate doc IDs (filters 90-99% of data), then run Tantivy FTS only on the filtered set. This provides **10-100x speedup** compared to pure Tantivy search. + +**Example:** 1M logs → Parquet filters to 10K docs (~50ms) → Tantivy FTS on 10K docs (~100ms) instead of 1M docs (~10 seconds). + +## 4. 
Storage Impact at Scale + +Based on experimental validation with production-scale attributes (see Appendix A): + +| Current (with fast fields) | Proposed (no fast fields + Parquet VARIANT) | Total Storage | Net Cost | +|----------------------------|---------------------------------------------|---------------|----------| +| **100 GB** | 73.2 GB + 37.1 GB | **110.3 GB** | +10.3 GB (+10.3%) | +| **1 TB** | 732 GB + 371 GB | **1.103 TB** | +103 GB (+10.3%) | +| **10 TB** | 7.32 TB + 3.71 TB | **11.03 TB** | +1.03 TB (+10.3%) | + +**Cost example** (AWS S3 Standard at $0.023/GB/month): +- 10 TB Tantivy (current): ~$2,760/year +- 11.03 TB (proposed): ~$3,045/year +- **Net increase: ~$285/year (+10.3%)** + +**Value proposition:** For ~10% storage cost, gain unified DataFusion analytics engine with queryable VARIANT attributes across metrics, logs, and traces. + +## 5. Recommendation + +**Approach:** Remove Tantivy fast fields, dual-write structured columns to Parquet, leverage DataFusion for all analytics. 
+ +**Benefits:** +- **Unified platform**: Single query engine (DataFusion) for metrics, logs, and traces +- **Queryable VARIANT**: Proper semi-structured queries on attributes (WHERE attributes.key = 'value') +- **Ecosystem maturity**: Arrow Flight, PME (Parquet Modular Encryption), Iceberg/Delta Lake, proven optimizers +- **Reduced complexity**: Remove custom columnar code from Tantivy integration +- **Reasonable cost**: +10.3% total storage for VARIANT queryability and DataFusion ecosystem +- **Query optimal**: DataFusion's advanced query planning + Parquet-first pruning for hybrid FTS + +**Trade-offs:** +- Cannot filter/aggregate on body field without Tantivy (acceptable - body is for FTS only) +- Queries selecting body field require Tantivy doc store access (already planned) +- Slightly more complex query routing (detect body field references) + +**Implementation:** +- Logs: 16 of 17 OTEL fields in Parquet (exclude body) +- Traces: All 25 OTEL fields in Parquet +- Tantivy schema: Remove `FAST` flag from all fields except those needed for sorting search results + +--- + +## Appendix A: Experimental Validation + +**Methodology:** Benchmark tool comparing storage sizes with 7,500 synthesized OTEL logs to validate architectural assumptions. Tool available at `scripts/storage-benchmark/`. 
+ +### Test Setup +- Input: 7,500 OTEL JSON logs (representative of production after host tag resolution) +- Attributes: ~200 tags per log (matches production with host/container metadata) +- Tantivy schema: OTEL logs schema with all standard fields +- Parquet schema: All structured fields (timestamps, service_name, severity, attributes) excluding body +- **Attributes storage**: Arrow Map type (true VARIANT) for queryable key-value pairs +- Parquet features: ZSTD compression, dictionary encoding, bloom filters on service_name and severity_text + +### Results + +**Complete Storage Breakdown:** + +| Configuration | Total Size | Doc Store | Fast Fields | Inverted + Terms | Parquet VARIANT | +|--------------|-----------|-----------|-------------|------------------|-----------------| +| Current (Tantivy with fast fields) | 691.16 MB | 321 MB (46%) | 185 MB (27%) | 185 MB (27%) | — | +| Proposed (Tantivy no fast + Parquet) | 762.39 MB | 321 MB (42%) | — | 185 MB (24%) | 256 MB (34%) | +| **Difference** | **+71.23 MB** | 0 MB | **-185 MB** | 0 MB | **+256 MB** | + +**Key Findings:** + +1. **Parquet VARIANT costs 38.5% more than fast fields** (256 MB vs 185 MB) + - Map type overhead: Each attribute becomes a struct with key/value pairs + - Enables queries: `WHERE attributes.service_id = 'xyz'` + - Bloom filters (~2 MB), page statistics, column chunk metadata + - Dictionary encoding on keys and string values + +2. **Net impact: +10.3% total storage** (71 MB overhead on 691 MB baseline) + - Reasonable cost for queryable VARIANT + DataFusion ecosystem + - Scales linearly: +10.3% at all production sizes + +3. **Doc store unchanged** (321 MB) + - Full document retrieval still requires Tantivy + - Body field + complete JSON preserved for `SELECT *` + +4. **Inverted indices unchanged** (185 MB) + - Core FTS capability fully preserved + - No fast fields means simpler index structure + +5. 
**Map type vs JSON strings:** Map type is properly queryable VARIANT + - JSON strings would be 227 MB (11% smaller) but not queryable + - Map type enables `WHERE attributes.key = value` without JSON parsing + +### Additional Test: Small Attribute Sets + +With 20 attributes per log (minimal production logs before host tag resolution): +- Current: 4.21 MB (Tantivy with fast fields) +- Proposed: 5.74 MB (Tantivy no fast + Parquet VARIANT) +- Overhead: +36.3% + +**Interpretation:** Parquet's fixed overhead (bloom filters, metadata, Map structure) doesn't scale down well for small datasets. Production workloads with host tag resolution will match the 200+ attribute case (+10.3%), not the minimal case. + +### Tool Reusability + +The benchmark tool (`scripts/storage-benchmark/`) accepts any OTEL JSON log file: + +```bash +cargo run --release -- --input /path/to/logs.json --output ./results +cargo run --release -- --input /path/to/logs.json --max-attributes 20 # Simulate minimal logs +cargo run --release -- --input /path/to/logs.json --no-fast-fields # Test without fast fields +``` + +Run on actual production data to validate assumptions before deployment. 
+ +--- + +**Last updated:** 2026-01-29 From 1d608e35c64d4a0124fa9364cd32bb3773f5da6a Mon Sep 17 00:00:00 2001 From: Verdonk Lucas Date: Wed, 18 Mar 2026 13:17:11 +0100 Subject: [PATCH 2/4] docs: split CLAUDE.md into repo context + opt-in /sesh-mode skill - Move verification-first workflow (TLA+, DST, formal specs) to /sesh-mode skill - Keep repo knowledge in CLAUDE.md (pitfalls, reliability rules, testing, docker, commands) - Remove Crate Map (derivable from filesystem) - Remove Coding Style bullet summary (CODE_STYLE.md is linked) - Fix relative links in SKILL.md for .claude/skills/sesh-mode/ path Co-Authored-By: Claude --- .claude/skills/sesh-mode/SKILL.md | 119 +++++++++++++++++++++++ CLAUDE.md | 154 ++++-------------------------- 2 files changed, 136 insertions(+), 137 deletions(-) create mode 100644 .claude/skills/sesh-mode/SKILL.md diff --git a/.claude/skills/sesh-mode/SKILL.md b/.claude/skills/sesh-mode/SKILL.md new file mode 100644 index 00000000000..05590a87ff0 --- /dev/null +++ b/.claude/skills/sesh-mode/SKILL.md @@ -0,0 +1,119 @@ +--- +description: "Verification-first development workflow with TLA+ specs, DST, and formal methods. Use when you want Claude to follow the rigorous plan→spec→test→implement sequence." +user-invocable: true +--- + +# Sesh Mode — Verification-First Development + +Activate this mode when working on features that touch state machines, protocols, or critical data paths. This adds formal verification requirements on top of the base CLAUDE.md. + +## Before Writing Any Code + +**MUST** follow this sequence before implementation: + +1. **Define the plan**: What are you doing and why? What invariants must hold? +2. **Check ADR/roadmap**: `docs/internals/adr/README.md` → find relevant supplement +3. **Read the spec**: If touching state machines or protocols, read `docs/internals/specs/tla/*.tla` +4. **Write tests first**: DST tests define correctness, write them before code +5. 
**Only then**: Start implementation + +## Three Engineering Pillars + +Every code change **MUST** respect all three: + +| Pillar | Location | Purpose | +|--------|----------|---------| +| **Code Quality** | [CODE_STYLE.md](../../../CODE_STYLE.md) + CLAUDE.md | Coding standards & reliability | +| **Formal Specs** | `docs/internals/specs/tla/`, `stateright_*.rs` | Protocol correctness | +| **DST** | DST crate (when created) | Fault tolerance | + +**Priority**: Safety > Performance > Developer Experience + +## The Verification Pyramid + +All verification layers share the same invariants: + +``` + TLA+ Specs (docs/internals/specs/tla/*.tla) + │ mirrors + Shared Invariants (invariants/) ← SINGLE SOURCE + │ used by + ┌───────────────┼───────────────┐ + ▼ ▼ ▼ +Stateright DST Tests Production Metrics +(exhaustive) (simulation) (Datadog) +``` + +## Testing Through Production Path + +**MUST NOT** claim a feature works unless tested through the actual network stack. + +```bash +# 1. Start quickwit +cargo run -p quickwit-cli -- run --config ../config/quickwit.yaml + +# 2. Ingest via OTLP +# (send logs/traces to localhost:4317) + +# 3. Query via REST API +curl http://localhost:7280/api/v1/<index-id>/search -d '{"query": "*"}' +``` + +**Bypasses to AVOID**: Testing indexing pipeline without the HTTP/gRPC server, testing search without the REST API layer. + +## DST (Deterministic Simulation Testing) + +- DST tests define correctness for stateful components +- Write DST tests before implementation for new state machines +- Shared invariants are the single source of truth across all verification layers + +## Architecture Evolution + +Pomsky tracks architectural change through three lenses. See `docs/internals/adr/EVOLUTION.md` for the full process.
+ +``` + Architecture Evolution + │ + ┌────────────────────┼────────────────────┐ + ▼ ▼ ▼ + Characteristics Gaps Deviations + (Proactive) (Reactive) (Pragmatic) + "What we need" "What we learned" "What we accepted" +``` + +| Lens | Location | When to Use | +|------|----------|-------------| +| **Characteristics** | `docs/internals/adr/` | Track cloud-native requirements | +| **Gaps** | `docs/internals/adr/gaps/` | Design limitation from incident/production | +| **Deviations** | `docs/internals/adr/deviations/` | Intentional divergence from ADR intent | + +**Before implementing, check for**: +- Open gaps (design limitations to be aware of) +- Deviations (intentional divergence from ADRs) +- Characteristic status (what's implemented vs planned) + +## Additional Commit Checklist (on top of CLAUDE.md MUST items) + +These are expected unless justified: +- [ ] Functions under 70 lines +- [ ] Explanatory variables for complex expressions +- [ ] Documentation explains "why" +- [ ] ADR/roadmap updated if applicable +- [ ] DST test for new state transitions +- [ ] Integration test for new API endpoints +- [ ] Tests through production path (HTTP/gRPC) + +## Reference Documentation + +| Topic | Location | +|-------|----------| +| Verification & DST | [docs/internals/VERIFICATION.md](../../../docs/internals/VERIFICATION.md) | +| Verification philosophy | [docs/internals/VERIFICATION_STACK.md](../../../docs/internals/VERIFICATION_STACK.md) | +| Simulation workflow | [docs/internals/SIMULATION_FIRST_WORKFLOW.md](../../../docs/internals/SIMULATION_FIRST_WORKFLOW.md) | +| Benchmarking | [docs/internals/BENCHMARKING.md](../../../docs/internals/BENCHMARKING.md) | +| Rust style patterns | [docs/internals/RUST_STYLE.md](../../../docs/internals/RUST_STYLE.md) | +| ADR index | [docs/internals/adr/README.md](../../../docs/internals/adr/README.md) | +| Architecture evolution | [docs/internals/adr/EVOLUTION.md](../../../docs/internals/adr/EVOLUTION.md) | +| Compaction architecture | 
[docs/internals/compaction-architecture.md](../../../docs/internals/compaction-architecture.md) | +| Tantivy + Parquet design | [docs/internals/tantivy-parquet-architecture.md](../../../docs/internals/tantivy-parquet-architecture.md) | +| Locality compaction | [docs/internals/locality-compaction/](../../../docs/internals/locality-compaction/) | diff --git a/CLAUDE.md b/CLAUDE.md index bf117b36d64..5e8fd066b2f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,14 +1,16 @@ -# Quickhouse-Pomsky Development Guide +# Pomsky Development Guide -## Before Writing Any Code (Plan Mode) +## What is Pomsky? -**MUST** follow this sequence before implementation: +**Fork of [Quickwit](https://github.com/quickwit-oss/quickwit)** — a cloud-native search engine for observability. This is the Datadog fork, adding: -1. **Define the plan**: What are you doing and why? What invariants must hold? -2. **Check ADR/roadmap**: `docs/internals/adr/README.md` → find relevant supplement -3. **Read the spec**: If touching state machines or protocols, read `docs/internals/specs/tla/*.tla` -4. **Write tests first**: DST tests define correctness, write them before code -5. **Only then**: Start implementation +- **Metrics engine** (`quickwit-metrics-engine`): DataFusion/Parquet-based analytics pipeline (current priority) +- **Remote API** (`quickwit-remote-api`): gRPC/REST interface for remote operations +- **Document transforms** (`quickwit-doc-transforms`): Preprocessing pipeline +- **CloudPrem UI**: Datadog-specific frontend +- **Tantivy + Parquet hybrid**: Full-text search via Tantivy, columnar analytics via Parquet + +**Signal priority**: Metrics first, then traces, then logs. Architectural decisions must generalize across all three. ## Core Policies @@ -16,6 +18,7 @@ - If TODOs or stubs are absolutely necessary, ensure user is made aware and they are recorded in any resulting plans, phases, or specs. - Produce code and make decisions that are consistent across metrics, traces, and logs. 
Metrics is the current priority, then traces, then logs — but decisions should generalize to all three. - Tests should be holistic: do not work around broken implementations by manipulating tests. +- Follow [CODE_STYLE.md](CODE_STYLE.md) for all coding conventions. ## Known Pitfalls (Update When Claude Misbehaves) @@ -42,29 +45,15 @@ | Recreates futures in `select!` loops | Use `&mut fut` to resume, not recreate — dropping loses data | GAP-002 | | Holds locks across await points | Invariant violations on cancel. Use message passing or synchronous critical sections | GAP-002 | -## What is Quickhouse-Pomsky? - -**Fork of [Quickwit](https://github.com/quickwit-oss/quickwit)** — a cloud-native search engine for observability. This is the DataDog fork, adding: - -- **Metrics engine** (`quickwit-metrics-engine`): DataFusion/Parquet-based analytics pipeline (current priority) -- **Remote API** (`quickwit-remote-api`): gRPC/REST interface for remote operations -- **Document transforms** (`quickwit-doc-transforms`): Preprocessing pipeline -- **CloudPrem UI**: Datadog-specific frontend -- **Tantivy + Parquet hybrid**: Full-text search via Tantivy, columnar analytics via Parquet +## Engineering Priority -**Signal priority**: Metrics first, then traces, then logs. Architectural decisions must generalize across all three. - -## Three Engineering Pillars - -Every code change **MUST** respect all three: +**Safety > Performance > Developer Experience** | Pillar | Location | Purpose | |--------|----------|---------| | **Code Quality** | [CODE_STYLE.md](CODE_STYLE.md) + this doc | Coding standards & reliability | -| **Formal Specs** | `docs/internals/specs/tla/`, `stateright_*.rs` | Protocol correctness | -| **DST** | DST crate (when created) | Fault tolerance | -**Priority**: Safety > Performance > Developer Experience +> For formal specs (TLA+, Stateright) and DST pillars, activate `/sesh-mode`. 
## Reliability Rules @@ -90,22 +79,7 @@ let timestamp = DateTime::from_timestamp(secs, nsecs) .ok_or_else(|| anyhow!("invalid timestamp: {}", nanos))?; ``` -## The Verification Pyramid - -All verification layers share the same invariants: - -``` - TLA+ Specs (docs/internals/specs/tla/*.tla) - │ mirrors - Shared Invariants (invariants/) ← SINGLE SOURCE - │ used by - ┌───────────────┼───────────────┐ - ▼ ▼ ▼ -Stateright DST Tests Production Metrics -(exhaustive) (simulation) (Datadog) -``` - -## Testing Through Production Path (CRITICAL) +## Testing Through Production Path **MUST NOT** claim a feature works unless tested through the actual network stack. @@ -122,42 +96,10 @@ curl http://localhost:7280/api/v1//search -d '{"query": "*"}' **Bypasses to AVOID**: Testing indexing pipeline without the HTTP/gRPC server, testing search without the REST API layer. -## Coding Style - -Follow [CODE_STYLE.md](CODE_STYLE.md) — the primary style reference. Key points: - -- **Readability over cleverness**: Optimize for "proofreadability" -- **Naming**: Long descriptive names preferred; standard Rust snake_case -- **Explanatory variables**: Introduce intermediary variables to convey semantics -- **No shadowing**: Do not reuse variable names within a function -- **Early returns**: Prefer early return over nested `else` chains -- **Invariants as `debug_assert`**: Use assertions to help reviewers proofread -- **Hidden contracts**: Avoid them; use types or `Result`/`Option` to enforce constraints -- **Generics/macros sparingly**: Only where necessary; they hurt readability and compile time -- **Async code**: Must not block for more than 500 microseconds; use `tokio::spawn_blocking` if unsure -- **No silent error ignoring** (`let _ =` without justification) - -### Error and Log Messages - -- Concise, lowercase (except proper names), no trailing punctuation -- Use `tracing` structured logging over string interpolation: - ```rust - // GOOD - warn!(remaining = remaining_attempts, "rpc retry 
failed"); - // BAD - warn!("rpc retry failed ({remaining_attempts} attempts remaining)"); - ``` - -### Enforced Clippy Rules (`quickwit/clippy.toml`) - -These methods are **disallowed** and will fail CI: -- `std::path::Path::exists` — not sound (no `Result`) -- `Option::is_some_and`, `is_none_or`, `xor`, `map_or`, `map_or_else` — hurt readability - ## Repository Layout ``` -quickhouse-pomsky/ +quickwit/ ├── quickwit/ # Main Rust workspace (all crates live here) │ ├── Cargo.toml # Workspace root │ ├── Makefile # Inner build targets (fmt, fix, test-all, build) @@ -180,59 +122,9 @@ quickhouse-pomsky/ └── k8s/ # Kubernetes local dev (kind cluster) ``` -## Crate Map - -``` -# Core services -quickwit-cli/ # Main binary entry point — start here for E2E -quickwit-serve/ # HTTP/gRPC server, REST API handlers -quickwit-cluster/ # Cluster membership (chitchat protocol) -quickwit-control-plane/ # Scheduling, shard management -quickwit-config/ # Configuration types and parsing - -# Data path -quickwit-ingest/ # Ingestion pipeline, WAL, sharding -quickwit-indexing/ # Indexing actors, merge/compaction -quickwit-search/ # Search execution, distributed search -quickwit-query/ # Query parsing and AST -quickwit-doc-mapper/ # Schema, field mappings, doc-to-term -quickwit-doc-transforms/ # [DD] Log/trace preprocessing - -# Storage & metadata -quickwit-metastore/ # Split metadata, index metadata -quickwit-storage/ # Object storage abstraction (S3, Azure, GCS, local) -quickwit-directories/ # Tantivy directory implementations - -# Protocols & APIs -quickwit-proto/ # Protobuf definitions, generated gRPC code -quickwit-opentelemetry/ # OTLP ingest (logs, traces) -quickwit-jaeger/ # Jaeger-compatible trace API -quickwit-rest-client/ # HTTP client for Quickwit API -quickwit-remote-api/ # [DD] Remote gRPC/REST interface - -# Metrics (DD additions) -quickwit-metrics-engine/ # DataFusion/Parquet metrics pipeline - -# Infrastructure -quickwit-actors/ # Actor framework (mailbox, supervisor) 
-quickwit-common/ # Shared utilities -quickwit-datetime/ # Date/time parsing and formatting -quickwit-macros/ # Proc macros -quickwit-codegen/ # Code generation utilities -quickwit-aws/ # AWS SDK helpers - -# Housekeeping -quickwit-janitor/ # GC, retention, delete tasks -quickwit-index-management/ # Index CRUD operations - -# Testing -quickwit-integration-tests/ # Rust integration tests -rest-api-tests/ # Python REST API tests (Elasticsearch compat) -``` - ## Architecture Evolution -Quickhouse-Pomsky tracks architectural change through three lenses. See `docs/internals/adr/EVOLUTION.md` for the full process. +Pomsky tracks architectural change through three lenses. See `docs/internals/adr/EVOLUTION.md` for the full process. ``` Architecture Evolution @@ -250,11 +142,6 @@ Quickhouse-Pomsky tracks architectural change through three lenses. See `docs/in | **Gaps** | `docs/internals/adr/gaps/` | Design limitation from incident/production | | **Deviations** | `docs/internals/adr/deviations/` | Intentional divergence from ADR intent | -**Before implementing, check for**: -- Open gaps (design limitations to be aware of) -- Deviations (intentional divergence from ADRs) -- Characteristic status (what's implemented vs planned) - ## Common Commands All Rust commands run from the `quickwit/` subdirectory. 
@@ -327,11 +214,6 @@ make k8s-down # tear down - `rest-api-tests/`: Python YAML-driven tests for Elasticsearch API compatibility - Metrics E2E: `make test-metrics-e2e` against Docker Compose (Minio + Postgres) -### DST (Deterministic Simulation Testing) -- DST tests define correctness for stateful components -- Write DST tests before implementation for new state machines -- Shared invariants are the single source of truth across all verification layers - ### Required for CI - `cargo nextest run --all-features --retries 5` (with Docker services running) - Failpoint tests: `cargo nextest run --test failpoints --features fail/failpoints` @@ -387,8 +269,6 @@ Environment variables set during test-all: - [ ] Functions under 70 lines - [ ] Explanatory variables for complex expressions - [ ] Documentation explains "why" -- [ ] ADR/roadmap updated if applicable -- [ ] DST test for new state transitions - [ ] Integration test for new API endpoints ## Detailed Documentation From 28cc5706a4f0ffe552883666ce9d4a5d83345d07 Mon Sep 17 00:00:00 2001 From: George Talbot Date: Tue, 31 Mar 2026 17:49:52 -0400 Subject: [PATCH 3/4] docs: add machete, cargo doc, and fmt details to CI checklist in CLAUDE.md --- CLAUDE.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 5e8fd066b2f..5277b0aaf4c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -253,16 +253,18 @@ Environment variables set during test-all: **MUST** (required for merge): - [ ] `cargo clippy --workspace --all-features --tests` passes with no warnings -- [ ] `cargo +nightly fmt -- --check` passes +- [ ] `cargo +nightly fmt --all -- --check` passes (run `cargo +nightly fmt --all` to fix; applies to **all** changed `.rs` files including tests — CI checks every file, not just lib code) - [ ] `debug_assert!` for non-obvious invariants - [ ] No `unwrap()` in library code - [ ] No silent error ignoring (`let _ =`) - [ ] New files under 500 lines (split by responsibility if larger) - [ ] No 
unnecessary `.clone()` (OK in actor/async code for clarity) - [ ] Tests through production path (HTTP/gRPC) -- [ ] License headers present (run `bash quickwit/scripts/check_license_headers.sh`) +- [ ] License headers present (run `bash quickwit/scripts/check_license_headers.sh` — every `.rs`, `.proto`, and `.py` file needs the Apache 2.0 header) - [ ] Log format correct (run `bash quickwit/scripts/check_log_format.sh`) - [ ] `typos` passes (spellcheck) +- [ ] `cargo machete` passes (no unused dependencies in Cargo.toml) +- [ ] `cargo doc --no-deps` passes (each PR must compile independently, not just the final stack) - [ ] Tests pass: `cargo nextest run --all-features` **SHOULD** (expected unless justified): From b6da4361e0d9f9f58ada43e213a7f53ad9ad1cb9 Mon Sep 17 00:00:00 2001 From: George Talbot Date: Wed, 1 Apr 2026 10:16:58 -0400 Subject: [PATCH 4/4] review: parquet_file singular, proto doc link, fix metastore accessor --- .../src/metastore/postgres/metastore.rs | 87 +++++++++++++------ .../src/split/metadata.rs | 3 +- 2 files changed, 63 insertions(+), 27 deletions(-) diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs index 3bbbe3c2f27..c07c1c3420f 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs @@ -2440,45 +2440,80 @@ impl MetastoreService for PostgresqlMetastore { ); // Only delete splits that are marked for deletion + // Match the non-metrics delete_splits pattern: distinguish + // "not found" (warn + succeed) from "not deletable" (FailedPrecondition). 
const DELETE_SPLITS_QUERY: &str = r#" - DELETE FROM metrics_splits - WHERE - index_uid = $1 - AND split_id = ANY($2) - AND split_state = 'MarkedForDeletion' - RETURNING split_id + WITH input_splits AS ( + SELECT input_splits.split_id, metrics_splits.split_state + FROM UNNEST($2::text[]) AS input_splits(split_id) + LEFT JOIN metrics_splits + ON metrics_splits.index_uid = $1 + AND metrics_splits.split_id = input_splits.split_id + ), + deleted AS ( + DELETE FROM metrics_splits + USING input_splits + WHERE + metrics_splits.index_uid = $1 + AND metrics_splits.split_id = input_splits.split_id + AND NOT EXISTS ( + SELECT 1 FROM input_splits + WHERE split_state IN ('Staged', 'Published') + ) + RETURNING metrics_splits.split_id + ) + SELECT + (SELECT COUNT(*) FROM input_splits WHERE split_state IS NOT NULL) as num_found, + (SELECT COUNT(*) FROM deleted) as num_deleted, + COALESCE( + (SELECT ARRAY_AGG(split_id) FROM input_splits + WHERE split_state IN ('Staged', 'Published')), + ARRAY[]::text[] + ) as not_deletable, + COALESCE( + (SELECT ARRAY_AGG(split_id) FROM input_splits + WHERE split_state IS NULL), + ARRAY[]::text[] + ) as not_found "#; - let deleted_split_ids: Vec<String> = sqlx::query_scalar(DELETE_SPLITS_QUERY) + let (num_found, num_deleted, not_deletable_ids, not_found_ids): ( + i64, + i64, + Vec<String>, + Vec<String>, + ) = sqlx::query_as(DELETE_SPLITS_QUERY) .bind(request.index_uid()) .bind(&request.split_ids) - .fetch_all(&self.connection_pool) + .fetch_one(&self.connection_pool) .await .map_err(|sqlx_error| convert_sqlx_err(&request.index_uid().index_id, sqlx_error))?; - // Log if some splits were not deleted (either non-existent or not - // in MarkedForDeletion state). Delete is idempotent — we don't error - // for missing splits. 
- if deleted_split_ids.len() != request.split_ids.len() { - let not_deleted: Vec<String> = request - .split_ids - .iter() - .filter(|id| !deleted_split_ids.contains(id)) - .cloned() - .collect(); + if !not_deletable_ids.is_empty() { + let message = format!( + "splits `{}` are not deletable", + not_deletable_ids.join(", ") + ); + let entity = EntityKind::Splits { + split_ids: not_deletable_ids, + }; + return Err(MetastoreError::FailedPrecondition { entity, message }); + } - if !not_deleted.is_empty() { - warn!( - index_uid = %request.index_uid(), - not_deleted = ?not_deleted, - "some metrics splits were not deleted (non-existent or not marked for deletion)" - ); - } + if !not_found_ids.is_empty() { + warn!( + index_uid = %request.index_uid(), + not_found = ?not_found_ids, + "{} metrics splits were not found and could not be deleted", + not_found_ids.len() + ); } + let _ = (num_found, num_deleted); // used by the CTE logic + info!( index_uid = %request.index_uid(), - deleted_count = deleted_split_ids.len(), + num_deleted, "deleted metrics splits successfully" ); Ok(EmptyResponse {}) diff --git a/quickwit/quickwit-parquet-engine/src/split/metadata.rs b/quickwit/quickwit-parquet-engine/src/split/metadata.rs index d280d846d8e..4f71733f024 100644 --- a/quickwit/quickwit-parquet-engine/src/split/metadata.rs +++ b/quickwit/quickwit-parquet-engine/src/split/metadata.rs @@ -175,7 +175,8 @@ pub struct MetricsSplitMetadata { /// 0 for newly ingested splits. pub num_merge_ops: u32, - /// RowKeys (sort-key min/max boundaries) as proto bytes. + /// RowKeys (sort-key min/max boundaries) as serialized proto bytes + /// ([`sortschema::RowKeys`](../../quickwit-proto/protos/event_store_sortschema/event_store_sortschema.proto)). /// None for pre-Phase-31 splits or splits without sort schema. pub row_keys_proto: Option<Vec<u8>>,