From 10de4c2bd31d83a5af43be9c4aa9cc8838ed7a5c Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Sun, 17 May 2026 21:47:39 +0200 Subject: [PATCH 001/127] doc/semantics: add error handling design + Lean 4 skeleton Add design doc proposing per-cell `Datum::Error`, row-scoped errors via the existing `DataflowError`, and a diff-semiring sketch for collection-scoped errors. Define the four-valued AND/OR truth tables ({TRUE, FALSE, NULL, ERROR}) that the rest of the spec rests on. Add a v1 Lean 4 mechanization of the boolean fragment at doc/developer/semantics/. The skeleton models `Datum`, `Expr`, `evalAnd`/`evalOr`, and proves all 32 cells of the AND/OR truth tables. The pattern order in `evalAnd`/`evalOr` matches the current Rust runtime in src/expr/src/scalar/func/variadic.rs; deviating from that runtime requires a corresponding diff in Boolean.lean. CI runs lake build in a self-built Docker image (ubuntu:26.04 + elan + lean-toolchain pin), gated on changes under doc/developer/semantics/. No Mathlib dependency. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- ci/test/lean-semantics.sh | 41 +++ ci/test/pipeline.template.yml | 13 + .../20260517_error_handling_semantics.md | 236 ++++++++++++++++++ doc/developer/semantics/.gitignore | 2 + doc/developer/semantics/Dockerfile | 44 ++++ doc/developer/semantics/Mz.lean | 4 + doc/developer/semantics/Mz/Boolean.lean | 76 ++++++ doc/developer/semantics/Mz/Datum.lean | 40 +++ doc/developer/semantics/Mz/Eval.lean | 76 ++++++ doc/developer/semantics/Mz/Expr.lean | 32 +++ doc/developer/semantics/README.md | 66 +++++ doc/developer/semantics/lake-manifest.json | 5 + doc/developer/semantics/lakefile.lean | 9 + doc/developer/semantics/lean-toolchain | 1 + 14 files changed, 645 insertions(+) create mode 100755 ci/test/lean-semantics.sh create mode 100644 doc/developer/design/20260517_error_handling_semantics.md create mode 100644 doc/developer/semantics/.gitignore create mode 100644 doc/developer/semantics/Dockerfile create mode 100644 doc/developer/semantics/Mz.lean create mode 100644 doc/developer/semantics/Mz/Boolean.lean create mode 100644 doc/developer/semantics/Mz/Datum.lean create mode 100644 doc/developer/semantics/Mz/Eval.lean create mode 100644 doc/developer/semantics/Mz/Expr.lean create mode 100644 doc/developer/semantics/README.md create mode 100644 doc/developer/semantics/lake-manifest.json create mode 100644 doc/developer/semantics/lakefile.lean create mode 100644 doc/developer/semantics/lean-toolchain diff --git a/ci/test/lean-semantics.sh b/ci/test/lean-semantics.sh new file mode 100755 index 0000000000000..a52605a781d63 --- /dev/null +++ b/ci/test/lean-semantics.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. 
+# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. +# +# lean-semantics.sh — build the Lean 4 semantics model in +# `doc/developer/semantics`. The Lean toolchain version is read from +# `lean-toolchain` and forwarded to the Dockerfile in the same +# directory, so the elan toolchain pin used by local developers and the +# CI image stay in lockstep. +# +# The Docker image is built locally on the agent and reused across CI +# runs by Docker's layer cache. There is no registry push; the image is +# cheap to rebuild from cold (≈ apt + elan + one Lean toolchain). + +set -euo pipefail + +cd "$(dirname "$0")/../.." + +semantics_dir="doc/developer/semantics" +lean_toolchain="$(tr -d '[:space:]' < "$semantics_dir/lean-toolchain")" +image_tag="mz-lean-semantics:latest" + +docker build \ + --build-arg "LEAN_TOOLCHAIN=$lean_toolchain" \ + --tag "$image_tag" \ + -f "$semantics_dir/Dockerfile" \ + "$semantics_dir" + +docker run --rm \ + --user "$(id -u):$(id -g)" \ + -v "$PWD/$semantics_dir:/workspace" \ + -w /workspace \ + "$image_tag" \ + lake build diff --git a/ci/test/pipeline.template.yml b/ci/test/pipeline.template.yml index 5a24a620d9940..f2de2388e4635 100644 --- a/ci/test/pipeline.template.yml +++ b/ci/test/pipeline.template.yml @@ -193,6 +193,19 @@ steps: coverage: skip sanitizer: skip + - id: lean-semantics + label: ":lean: Lean semantics" + command: ci/test/lean-semantics.sh + inputs: + - doc/developer/semantics + - ci/test/lean-semantics.sh + depends_on: [] + timeout_in_minutes: 15 + agents: + queue: hetzner-x86-64-4cpu-8gb + coverage: skip + sanitizer: skip + - id: lint-macos label: ":rust: macOS Clippy" # Running on a manually installed macOS agent, so make sure we don't go out of disk diff --git a/doc/developer/design/20260517_error_handling_semantics.md b/doc/developer/design/20260517_error_handling_semantics.md new file mode 100644 
index 0000000000000..f7edcb1060c9f --- /dev/null +++ b/doc/developer/design/20260517_error_handling_semantics.md @@ -0,0 +1,236 @@ +# Error handling semantics + +* Associated: TBD (no open issue yet; this doc establishes the model) + +## The problem + +Materialize today has two error pathways and a gap between them. +`DataflowError` propagates row-level failures through a parallel error collection, allowing dataflows to surface failed records without halting computation. +`EvalError` describes scalar failures but, when raised, escalates the whole row to `DataflowError::EvalError` and removes the row from the output collection. +There is no first-class representation of a cell-level error inside a `Row`, and there is no representation of a collection-level error attached to a logical operator output. + +The absence of cell-level errors forces ingestion and casting paths to either reject records, coerce values to types they cannot represent, or route columns through `text` and defer parsing. +Concrete cases this hurts include MySQL/TiDB `0000-00-00` zero-dates that cannot be represented as `Datum::Date`, JSON casts that fail mid-row, decimal overflow inside a `SELECT` list, and any user-defined coercion that may fail on a subset of inputs. +A user who wants the rest of the row preserved must today either filter the data upstream or model the column as `text` and reparse it at query time. +This is the same friction that motivated `try_cast`, but applied at the storage layer rather than at the cast site. + +Globally scoped errors — failures whose blast radius is the whole collection rather than a single row — also lack a uniform representation. +A `WHERE` predicate that errors on a single row is a row-level error today, but the same predicate evaluated as part of a join condition or aggregation can produce semantics that escape any single row. 
+Differential dataflow's natural locus for "this collection is invalid at time `t`" is the `diff` field, which Materialize currently does not use this way. +A spec for global errors is needed so that future work has a consistent target rather than an ad-hoc encoding per operator. + +## Success criteria + +A solution is successful when the following hold. + +* Cell-level failures can be represented inside a `Row` without forcing the row out of the output collection. +* Row-level failures continue to be represented by the existing error collection without behavior change for in-place dataflows. +* Collection-level failures (global errors) have a defined semantics and a defined encoding, even if the encoding is not yet implemented. +* SQL evaluation rules for `NULL`, errors, and short-circuiting are written down in one place and match PostgreSQL where reasonable, with deviations called out explicitly. +* Existing data and existing dataflows continue to read and run after the new variant is added. + +## Out of scope + +The following are intentionally not addressed here. + +* The wire-format migration plan for adding a new `Datum` tag. +This is implementation work whose shape depends on which encoding option is chosen. +* The exact set of operators that should be made error-aware in the first iteration. +That is a planning concern, not a semantic one. +* User-facing SQL syntax for introspecting or filtering on errors (`is_error`, `error_message`, `try_*`). +The semantics defined here support such syntax, but choosing the surface is a separate design. +* Cancellation of in-flight queries due to global errors. +A dataflow exposing a global error is a steady-state concept; cancellation is orthogonal. + +## Solution proposal + +The proposal introduces three error scopes and assigns each a representation. + +### Error scopes + +Errors are classified by the smallest unit of output they invalidate. 
+ +* **Cell-scoped**: the error invalidates a single `Datum` within a single `Row`. +The rest of the row is still well-defined. +Example: cast overflow on a single column inside a `SELECT` list. +* **Row-scoped**: the error invalidates a single row. +No `Datum` in the row is well-defined, but the rest of the collection is. +Example: a decoding error on a Kafka record, or a key-conflict in an upsert envelope. +* **Global-scoped**: the error invalidates the entire output collection at some time `t`. +No row at that time is well-defined. +Example: a `WHERE` clause whose evaluation depends on collection-wide state that has become invalid, or a sink whose downstream contract has been violated. + +Classification is a property of the operator producing the error, not of the underlying `EvalError`. +The same arithmetic overflow is cell-scoped when raised inside a `SELECT` projection and row-scoped when raised inside a `WHERE` predicate. +Operators are responsible for choosing the smallest scope that faithfully represents their semantics. + +### Cell-scoped errors: `Datum::Error` + +A new `Datum::Error(Box<EvalError>)` variant is added. +The variant participates in all `Row` encoding paths and is propagated by expressions according to the rules in the SQL semantics section. +Operators that produce a row may produce `Datum::Error` in any position where a value of any type is expected. +Operators that consume a row must either propagate the error, trap it via an explicit operator (`try_*`, `coalesce`-style), or escalate it to a row-scoped error. + +The type system treats `Datum::Error` as inhabiting every `ScalarType`. +This mirrors the way `NULL` inhabits every nullable type. +The variant carries an `EvalError`, not a string, so that error introspection functions can be added later without a format break. + +### Row-scoped errors: `DataflowError` (unchanged) + +`DataflowError` continues to carry row-scoped errors through the existing error collection. +The semantics are unchanged. 
+An operator that wishes to escalate a `Datum::Error` to row scope does so by emitting a `DataflowError::EvalError` and dropping the row from the data collection. + +### Global-scoped errors: diff-field encoding (specification only) + +A global error at time `t` is encoded as a distinguished record in the error collection whose `diff` field carries a special marker. +The intent is that any downstream operator observing such a record at time `t` treats the entire input collection at `t` as invalid, propagating the global error to its own output. +The natural encoding in differential dataflow uses the `diff` field because the data field is per-row and the time field is per-update. +A monoid extension of the `diff` semiring that adds an absorbing "error" element captures the propagation rule: any sum involving the absorbing element is itself the absorbing element, which is exactly the semantics required. + +Implementation is out of scope. +The spec exists so that future operator work targets this encoding rather than inventing alternates. + +### SQL error semantics + +The rules below define how `NULL` and `Datum::Error` interact in expression evaluation. +The intent is to match PostgreSQL behavior where PostgreSQL has behavior, and to extend it where PostgreSQL has none because PostgreSQL has no first-class cell error. + +**Scalar function evaluation.** +A strict function applied to any `Datum::Error` argument returns `Datum::Error`. +A strict function applied to `NULL` returns `NULL`, as today. +If a strict function receives both `NULL` and `Datum::Error` arguments, it returns `Datum::Error`. +This rule matches the principle that errors are stronger than `NULL`: `NULL` denotes "unknown value", error denotes "the value cannot exist". + +**Non-strict functions.** +`coalesce(a, b)` returns the first non-`NULL`, non-error argument, evaluating left to right. 
+If all arguments are `NULL` or error, the result is `NULL` if any argument was `NULL` and all errors were unreached, otherwise the first error. +This generalizes PostgreSQL `coalesce` so that a fallback can rescue an error in the same way it rescues a `NULL`. +Short-circuit boolean operators evaluate per the truth table below. + +**Boolean three-valued logic, extended.** +`AND` and `OR` are extended from PostgreSQL's three-valued logic to four values: `TRUE`, `FALSE`, `NULL`, `ERROR`. +The extension is conservative: any cell that PostgreSQL would have produced as `NULL` is still `NULL`, and `ERROR` participates only when an operand is an actual error. + +| `AND` | TRUE | FALSE | NULL | ERROR | +|---------|-------|-------|-------|-------| +| TRUE | TRUE | FALSE | NULL | ERROR | +| FALSE | FALSE | FALSE | FALSE | FALSE | +| NULL | NULL | FALSE | NULL | NULL | +| ERROR | ERROR | FALSE | NULL | ERROR | + +| `OR` | TRUE | FALSE | NULL | ERROR | +|---------|------|-------|-------|-------| +| TRUE | TRUE | TRUE | TRUE | TRUE | +| FALSE | TRUE | FALSE | NULL | ERROR | +| NULL | TRUE | NULL | NULL | NULL | +| ERROR | TRUE | ERROR | NULL | ERROR | + +`FALSE AND ERROR` is `FALSE`, and `TRUE OR ERROR` is `TRUE`, because the result is determined without inspecting the erroring operand. +`NULL AND ERROR` and `NULL OR ERROR` collapse to `NULL`, preserving PostgreSQL's bias toward `NULL` when ignorance subsumes the question. + +**Predicates.** +A `WHERE` clause emits a row when its predicate evaluates to `TRUE`. +It drops the row when the predicate is `FALSE` or `NULL`, as today. +When the predicate is `ERROR`, the row is escalated to a row-scoped error and surfaced via `DataflowError`. +This preserves "predicates are total" externally — the user sees either matching rows or row errors, never silently dropped errors. + +**Comparison.** +`=`, `<`, `>`, etc., applied to `Datum::Error` return `Datum::Error`. 
+`IS DISTINCT FROM` treats `Datum::Error` as distinct from any other value including another `Datum::Error` carrying the same inner error, on the grounds that the equality of two errors is itself ill-defined. +`IS NULL` returns `FALSE` on `Datum::Error`, mirroring the rule that error and `NULL` are distinct. +A future `IS ERROR` predicate would return `TRUE` on `Datum::Error` and `FALSE` otherwise. + +**Aggregates.** +`COUNT(*)` counts rows regardless of cell contents; `COUNT(expr)` counts rows where `expr` is neither `NULL` nor error. +`SUM`, `AVG`, `MIN`, `MAX`, and similar reductions return `Datum::Error` if any input cell is `Datum::Error`, with the inner `EvalError` chosen by an operator-defined rule (typically the first error in scan order; the rule must be deterministic given a fixed input). +This matches the principle that errors are stronger than `NULL` and the principle that aggregates should not silently hide failures. +An explicit opt-out is provided by future `try_sum`-style aggregates. + +**Grouping.** +`GROUP BY` treats `Datum::Error` as a distinct group key, with the same equality semantics as `IS DISTINCT FROM`: every error is its own group. +This avoids accidentally collapsing unrelated failures into a single aggregate output. + +**Joins.** +A join predicate evaluating to `ERROR` escalates the candidate pair to a row-scoped error. +This is symmetric with the `WHERE` rule. +Join keys containing `Datum::Error` do not match any other key, including identical `Datum::Error` values, mirroring the grouping rule. + +**Casts and `try_cast`.** +A cast that would today raise `EvalError` now also has the option of producing `Datum::Error` when invoked from a context that has opted in to cell-scoped failures. +`try_cast` continues to return `NULL` on failure for backward compatibility. +A new variant such as `try_cast_error` could return `Datum::Error`; choosing the surface is out of scope. 
+ +### Operator obligations + +Each operator falls into one of three categories. + +* **Error-transparent**: passes `Datum::Error` through unchanged in the cells where it appears. +Most projection-style operators are transparent. +* **Error-aware**: inspects `Datum::Error` and produces a defined result. +Examples: `coalesce`, `IS NULL`, future `IS ERROR`, `try_*`. +* **Error-escalating**: converts a cell-scoped error to a row-scoped error. +Examples: `WHERE`, join predicates, sink output (a sink cannot emit a row containing `Datum::Error` to a downstream system, so it must escalate). + +Operators document which category they fall into. +Default for a new operator is transparent unless it has a reason to be aware or escalating. + +### Worked example: TiDB zero-date + +MySQL and TiDB allow `0000-00-00` as a fallback when permissive `sql_mode` rejects an invalid date. +The value is not equivalent to `NULL`: a `NOT NULL` column can contain `0000-00-00` alongside `NULL` columns elsewhere. +Materialize's `Date` type cannot represent `0000-00-00` (no year 0, no month 0, no day 0). + +Under the proposed model the MySQL source decodes `Value::Date(0, 0, 0, ...)` into `Datum::Error(EvalError::InvalidDate)` for `SqlScalarType::Date` columns. +The row is emitted intact; downstream queries that touch only other columns succeed. +A query that projects the date column sees `Datum::Error`, which propagates per the rules above. +A user who wants to coalesce can write `coalesce(d, DATE '1970-01-01')` if they have opted into error-aware coalesce, preserving the distinction between zero-date and `NULL` while still producing a usable timestamp downstream. +The TEXT COLUMN escape hatch remains available but is no longer the only correct ingestion path. + +## Minimal viable prototype + +A prototype consists of three steps, none of which require the full migration to land. 
+ +* Add `Datum::Error(Box<EvalError>)` behind a feature flag, wired through `Row` packing and `RowArena` allocation, gated so that existing rows cannot contain the variant. +* Implement the strict-function propagation rule and the extended boolean truth tables in `MirScalarExpr` evaluation, with unit tests covering each cell of each table. +* Wire the MySQL source decoder to emit `Datum::Error` on `Value::Date(0, 0, 0, ...)` for `SqlScalarType::Date` columns, and verify end-to-end that a `SELECT` over an unaffected column returns the row while a `SELECT` over the date column returns the error. + +The prototype intentionally omits global errors, sinks, and aggregates; those land in subsequent PRs with their own tests. + +## Alternatives + +**Status quo plus TEXT COLUMN guidance.** +Document that columns with permissive upstream `sql_mode` must be ingested as `text`. +Cheap, but pushes parsing to query time forever and does not generalize to other cell-failure sources (overflow, JSON cast, decimal precision). + +**Datum-level `NULL` overload.** +Coerce zero-dates to `NULL` at ingestion. +Loses the distinction between user-intended `NULL` and ingestion-rejected value. +Violates the spec rule that error is stronger than `NULL`. +Rejected on correctness grounds. + +**String error wrapper instead of `EvalError`.** +Store `Datum::Error(Box<str>)` rather than `Datum::Error(Box<EvalError>)`. +Simpler to encode but loses structured error information; introspection functions would parse a string. +Rejected on extensibility grounds. + +**Error rows in the data collection rather than a separate variant.** +Tag the row itself with a per-column error bitmask, leaving `Row` as today. +Saves a `Datum` variant but introduces an out-of-band channel that operators must thread through every transformation. +Rejected on uniformity grounds; the `Datum` variant is the same place every existing operator already inspects. 
+ +**Global errors via a sidecar collection rather than the `diff` field.** +Carry global errors in a separate timely stream. +Works, but requires every operator to be aware of two error inputs. +The `diff`-field encoding leverages differential dataflow's existing fan-in and is the natural extension of the semiring. + +## Open questions + +* What is the exact set of `EvalError` payloads that operators may produce as `Datum::Error`? +Some `EvalError` variants (out-of-memory, environment errors) make no sense at cell scope. +* How does `Datum::Error` interact with persisted state and version skew? +A reader on an older binary that encounters `Datum::Error` must have a defined behavior; the default proposal is to surface the row as a row-scoped error in the read path. +* Should ordering operators have a defined sort position for `Datum::Error`? +PostgreSQL has no precedent; candidates are "errors sort last", "errors sort like `NULL`", and "errors are unordered and produce a sort-key error". +* What is the storage cost of widening `Row` encoding to include the new tag, and how does it compare to the current cost of routing failures through the error collection? +* For global errors, what is the precise specification of the absorbing element in the `diff` semiring, and how does it interact with consolidation and arrangement? diff --git a/doc/developer/semantics/.gitignore b/doc/developer/semantics/.gitignore new file mode 100644 index 0000000000000..b914de79dca12 --- /dev/null +++ b/doc/developer/semantics/.gitignore @@ -0,0 +1,2 @@ +# Lake build cache. +.lake/ diff --git a/doc/developer/semantics/Dockerfile b/doc/developer/semantics/Dockerfile new file mode 100644 index 0000000000000..a895b5128b5bc --- /dev/null +++ b/doc/developer/semantics/Dockerfile @@ -0,0 +1,44 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. +# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. 
+# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. +# +# Minimal Lean 4 toolchain image for building the Materialize semantics +# model in this directory. The Lean version is supplied via build arg, +# which `ci/test/lean-semantics.sh` reads from `lean-toolchain`, so the +# elan toolchain pin used by local developers and the CI image stay in +# lockstep. +# +# The image is intentionally small. It installs elan, a single Lean +# toolchain, and the system packages elan needs to download. No Mathlib, +# no editor, no test runners. Adding those is a deliberate future +# decision, not a passive consequence of inheriting a heavier base. + +FROM ubuntu:26.04 + +ARG LEAN_TOOLCHAIN + +# Install elan system-wide so the toolchain is usable by any uid that +# the container may be launched with (CI runs with the agent's uid via +# `--user`). Writes go to `/workspace/.lake/`, which is bind-mounted. 
+ENV ELAN_HOME=/opt/elan +ENV PATH=$ELAN_HOME/bin:$PATH + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + git \ + && rm -rf /var/lib/apt/lists/* + +RUN curl --proto '=https' --tlsv1.2 -sSf \ + https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh \ + | sh -s -- --default-toolchain "$LEAN_TOOLCHAIN" -y --no-modify-path \ + && chmod -R a+rX "$ELAN_HOME" \ + && lean --version + +WORKDIR /workspace diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean new file mode 100644 index 0000000000000..60d695d85b388 --- /dev/null +++ b/doc/developer/semantics/Mz.lean @@ -0,0 +1,4 @@ +import Mz.Datum +import Mz.Expr +import Mz.Eval +import Mz.Boolean diff --git a/doc/developer/semantics/Mz/Boolean.lean b/doc/developer/semantics/Mz/Boolean.lean new file mode 100644 index 0000000000000..0a4f0bcd9f3b2 --- /dev/null +++ b/doc/developer/semantics/Mz/Boolean.lean @@ -0,0 +1,76 @@ +import Mz.Eval + +/-! +# Boolean truth tables + +Cell-by-cell proofs of the truth tables realized by `evalAnd` and +`evalOr`. They agree with the tables stated in +`doc/developer/design/20260517_error_handling_semantics.md` except in the four `NULL`/`ERROR` cells, where `err` wins to match the current runtime (see `Mz/Eval.lean`). + +Each cell is its own theorem. The redundancy is deliberate: if a +future semantic change touches one cell, exactly one theorem breaks, +making the change reviewable in isolation. + +All proofs reduce to `rfl` because `evalAnd` and `evalOr` are defined +by pattern matching and the cases are exhaustive constructor +applications. The `cases d <;> rfl` form is used where a single cell +quantifies over an arbitrary `Datum` (e.g. `false` absorbs everything). +-/ + +namespace Mz + +/-! 
## AND -/ + +theorem and_false_left (d : Datum) : evalAnd (.bool false) d = .bool false := by + cases d <;> rfl + +theorem and_false_right (d : Datum) : evalAnd d (.bool false) = .bool false := by + cases d with + | bool b => cases b <;> rfl + | null => rfl + | err _ => rfl + +theorem and_true_true : evalAnd (.bool true) (.bool true) = .bool true := rfl +theorem and_true_null : evalAnd (.bool true) .null = .null := rfl +theorem and_null_true : evalAnd .null (.bool true) = .null := rfl +theorem and_null_null : evalAnd .null .null = .null := rfl + +theorem and_true_err (e : EvalError) : + evalAnd (.bool true) (.err e) = .err e := rfl +theorem and_err_true (e : EvalError) : + evalAnd (.err e) (.bool true) = .err e := rfl +theorem and_null_err (e : EvalError) : + evalAnd .null (.err e) = .err e := rfl +theorem and_err_null (e : EvalError) : + evalAnd (.err e) .null = .err e := rfl +theorem and_err_err (e₁ e₂ : EvalError) : + evalAnd (.err e₁) (.err e₂) = .err e₁ := rfl + +/-! ## OR -/ + +theorem or_true_left (d : Datum) : evalOr (.bool true) d = .bool true := by + cases d <;> rfl + +theorem or_true_right (d : Datum) : evalOr d (.bool true) = .bool true := by + cases d with + | bool b => cases b <;> rfl + | null => rfl + | err _ => rfl + +theorem or_false_false : evalOr (.bool false) (.bool false) = .bool false := rfl +theorem or_false_null : evalOr (.bool false) .null = .null := rfl +theorem or_null_false : evalOr .null (.bool false) = .null := rfl +theorem or_null_null : evalOr .null .null = .null := rfl + +theorem or_false_err (e : EvalError) : + evalOr (.bool false) (.err e) = .err e := rfl +theorem or_err_false (e : EvalError) : + evalOr (.err e) (.bool false) = .err e := rfl +theorem or_null_err (e : EvalError) : + evalOr .null (.err e) = .err e := rfl +theorem or_err_null (e : EvalError) : + evalOr (.err e) .null = .err e := rfl +theorem or_err_err (e₁ e₂ : EvalError) : + evalOr (.err e₁) (.err e₂) = .err e₁ := rfl + +end Mz diff --git 
a/doc/developer/semantics/Mz/Datum.lean b/doc/developer/semantics/Mz/Datum.lean new file mode 100644 index 0000000000000..0a5e7e400958d --- /dev/null +++ b/doc/developer/semantics/Mz/Datum.lean @@ -0,0 +1,40 @@ +/-! +# `Datum` + +The subset of Materialize's `Datum` modeled by the semantics skeleton. + +Only the variants required to state the boolean truth tables are +present: booleans, `null`, and the proposed `err` variant carrying an +opaque `EvalError` payload. Numeric, string, and temporal types are +intentionally omitted from this skeleton; adding them later does not +affect the proofs in `Mz/Boolean.lean`. + +The Rust counterpart lives in `src/repr/src/row.rs` (`Datum`) and +`src/expr/src/scalar.rs` (`EvalError`). +-/ + +namespace Mz + +/-- Opaque payload for cell-scoped errors. + +The skeleton does not enumerate the variants of the Rust +`EvalError`. A single placeholder constructor keeps the type +inhabited so that proofs that need a concrete value can supply one, +without committing to a wire format. Later refinements will replace +this with the real variant set. -/ +inductive EvalError + | placeholder + deriving DecidableEq, Inhabited + +/-- A modeled scalar value. + +`bool b` is a boolean literal; `null` is the SQL `NULL` value; `err e` +is the proposed cell-scoped error variant whose payload is the +`EvalError` raised at the cell. -/ +inductive Datum + | bool (b : Bool) + | null + | err (e : EvalError) + deriving Inhabited + +end Mz diff --git a/doc/developer/semantics/Mz/Eval.lean b/doc/developer/semantics/Mz/Eval.lean new file mode 100644 index 0000000000000..28acfa226c70e --- /dev/null +++ b/doc/developer/semantics/Mz/Eval.lean @@ -0,0 +1,76 @@ +import Mz.Datum +import Mz.Expr + +/-! +# `eval` + +Operational semantics for `Expr`. + +`evalAnd` and `evalOr` are pattern-ordered to match Materialize's +current runtime behavior in `src/expr/src/scalar/func/variadic.rs` +(`And::eval`, `Or::eval`). 
Specifically: + +* `FALSE` in either operand absorbs every other value, including + `null` and `err`. Symmetric for `TRUE` in `OR`. +* Otherwise, `err` is the result whenever it appears, including when + the other operand is `null`. This deliberately matches the current + Mz behavior rather than the "NULL absorbs ERROR" rule discussed in + `doc/developer/design/20260517_error_handling_semantics.md`. + +A change to either rule should produce a corresponding diff in +`Mz/Boolean.lean`. That diff is the spec change. +-/ + +namespace Mz + +/-- AND evaluation table. + +The pattern order encodes the absorption hierarchy: +`FALSE > ERROR > NULL > TRUE`. -/ +def evalAnd : Datum → Datum → Datum + | .bool false, _ => .bool false + | _, .bool false => .bool false + | .err e, _ => .err e + | _, .err e => .err e + | .null, _ => .null + | _, .null => .null + | .bool true, .bool true => .bool true + +/-- OR evaluation table. + +Mirror of `evalAnd` with `TRUE` as the dominant absorber: +`TRUE > ERROR > NULL > FALSE`. -/ +def evalOr : Datum → Datum → Datum + | .bool true, _ => .bool true + | _, .bool true => .bool true + | .err e, _ => .err e + | _, .err e => .err e + | .null, _ => .null + | _, .null => .null + | .bool false, .bool false => .bool false + +/-- Environment: a positional list of bindings for `Expr.col`. -/ +abbrev Env := List Datum + +/-- Reading an out-of-bounds column yields `NULL`. + +This is a modeling choice for the skeleton. The real evaluator +expects callers to provide a well-typed row of the correct width; +the skeleton avoids that obligation by defaulting to `NULL`. -/ +def Env.get (env : Env) (i : Nat) : Datum := + env.getD i .null + +/-- Big-step evaluation. 
-/ +def eval (env : Env) : Expr → Datum + | .lit d => d + | .col i => Env.get env i + | .and a b => evalAnd (eval env a) (eval env b) + | .or a b => evalOr (eval env a) (eval env b) + | .ifThen c t e => + match eval env c with + | .bool true => eval env t + | .bool false => eval env e + | .null => .null + | .err err => .err err + +end Mz diff --git a/doc/developer/semantics/Mz/Expr.lean b/doc/developer/semantics/Mz/Expr.lean new file mode 100644 index 0000000000000..11f4a1005b9aa --- /dev/null +++ b/doc/developer/semantics/Mz/Expr.lean @@ -0,0 +1,32 @@ +import Mz.Datum + +/-! +# `Expr` + +A minimal subset of `MirScalarExpr` (see `src/expr/src/scalar.rs`). + +The skeleton uses binary `and` and `or` rather than the variadic form +used in production. The boolean truth tables are pairwise, so binary +operators are sufficient to state and prove them. Generalizing to the +variadic form is straightforward via fold and is deferred to a later +iteration. +-/ + +namespace Mz + +/-- Scalar expression syntax. + +* `lit d`: literal datum. +* `col i`: reference to column `i` in the surrounding environment. +* `and a b`, `or a b`: logical conjunction and disjunction. +* `ifThen c t e`: PostgreSQL-style `CASE` / `If` — the only + user-controllable short-circuit in `MirScalarExpr`. -/ +inductive Expr + | lit (d : Datum) + | col (i : Nat) + | and (a b : Expr) + | or (a b : Expr) + | ifThen (c t e : Expr) + deriving Inhabited + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md new file mode 100644 index 0000000000000..e49d23299d562 --- /dev/null +++ b/doc/developer/semantics/README.md @@ -0,0 +1,66 @@ +# Lean 4 semantics skeleton + +A mechanized model of Materialize's scalar evaluation semantics. + +This directory contains the v1 skeleton accompanying the error-handling design document at `../design/20260517_error_handling_semantics.md`. +The goal of the skeleton is not to mechanize all of `MirScalarExpr`. 
+The goal is to lock in the boolean truth tables for `AND` and `OR` over the four-valued logic `{TRUE, FALSE, NULL, ERROR}` and provide a place to grow from. + +## What is here + +* `Mz/Datum.lean`: `Datum` and `EvalError` types. +* `Mz/Expr.lean`: a minimal `Expr` inductive (literals, columns, binary `and`/`or`, `ifThen`). +* `Mz/Eval.lean`: `evalAnd`, `evalOr`, and `eval` matching the runtime in `src/expr/src/scalar/func/variadic.rs`. +* `Mz/Boolean.lean`: per-cell truth-table proofs. + +## What is not here + +* No bag semantics, joins, aggregates, or relational operators. +* No diff-semiring extension for global errors (see the design doc). +* No bridge to the Rust evaluator. + The model and the runtime are independent; divergences are caught by review, not by tooling. +* No Mathlib dependency. + The skeleton is pure core Lean 4 to keep build time small and bootstrap simple. + +## Build + +``` +cd doc/developer/semantics +lake build +``` + +Toolchain is pinned in `lean-toolchain`. +CI uses the local `Dockerfile` in this directory, which installs elan and reads the same pin via the `LEAN_TOOLCHAIN` build arg. +The elan toolchain used by local developers and the toolchain baked into the CI image therefore stay in lockstep. +Buildkite runs `ci/test/lean-semantics.sh` on every PR that touches `doc/developer/semantics/` (see the `lean-semantics` step in `ci/test/pipeline.template.yml`). + +To reproduce the CI build locally: + +``` +./ci/test/lean-semantics.sh +``` + +To run `lake build` directly outside Docker, install elan via the standard instructions at `https://lean-lang.org/` and run `lake build` in this directory. + +## Workflow + +When a semantic rule changes, the change must land in two places. + +1. Update `Mz/Eval.lean` to reflect the new operational behavior. +2. Update the corresponding theorem in `Mz/Boolean.lean` (or add a new one). + +A semantic change without a Lean diff is incomplete. 
+A Lean diff without a corresponding Rust diff in `src/expr/` is a spec change that has not yet shipped. +Reviewers should expect both sides of the change in the same PR. + +## Next steps + +The roadmap in priority order: + +* Strict-propagation theorem: `f` strict implies `f(.., err, ..) = err`. +* Coalesce error-rescue law: `coalesce(err, x) = x`. +* Reorder-safety conditions: `(¬might_error a ∧ ¬might_error b) → eval (And [a, b]) = eval (And [b, a])`. +* Variadic `And` and `Or` via fold, with equivalence to the binary form. +* Lift to bag semantics for predicate / projection rewrites. + +The diff-semiring extension for global errors is a separate v2 effort. diff --git a/doc/developer/semantics/lake-manifest.json b/doc/developer/semantics/lake-manifest.json new file mode 100644 index 0000000000000..08f9ce69b1e10 --- /dev/null +++ b/doc/developer/semantics/lake-manifest.json @@ -0,0 +1,5 @@ +{"version": "1.1.0", + "packagesDir": ".lake/packages", + "packages": [], + "name": "«mz-semantics»", + "lakeDir": ".lake"} diff --git a/doc/developer/semantics/lakefile.lean b/doc/developer/semantics/lakefile.lean new file mode 100644 index 0000000000000..9df38b0726e1e --- /dev/null +++ b/doc/developer/semantics/lakefile.lean @@ -0,0 +1,9 @@ +import Lake +open Lake DSL + +package «mz-semantics» where + -- No options needed yet. + +@[default_target] +lean_lib «Mz» where + -- Library auto-discovers files under `Mz/`. 
diff --git a/doc/developer/semantics/lean-toolchain b/doc/developer/semantics/lean-toolchain new file mode 100644 index 0000000000000..33e0c088939ad --- /dev/null +++ b/doc/developer/semantics/lean-toolchain @@ -0,0 +1 @@ +leanprover/lean4:v4.29.1 From bf26c0e2084d312f337b58e564cf9d3f3d88bf24 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Sun, 17 May 2026 21:51:33 +0200 Subject: [PATCH 002/127] doc/semantics: add NOT, IsErr, idempotence, conditional commutativity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expand the Lean skeleton with a small set of follow-up theorems that the optimizer roadmap depends on. * `Datum.IsErr` propositional predicate plus `DecidablePred` instance, for use as a hypothesis in algebraic laws. * `Expr.not` constructor and `evalNot`, with the 4-cell `NOT` truth table and `not_not` (involution on the boolean fragment, no-op on `null` and `err`). * New `Mz/Laws.lean`: - `evalAnd_idem`, `evalOr_idem`: idempotence holds unconditionally, including on `err` (same error preserved). - `evalAnd_comm_of_no_err`, `evalOr_comm_of_no_err`: commutativity conditional on neither operand being an error. This is the law an optimizer that has run `might_error` analysis can use to justify conjunct reordering. Unconditional commutativity fails over `(err e₁, err e₂)` with distinct payloads. Associativity and `might_error` soundness are deferred. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/Boolean.lean | 15 +++++ doc/developer/semantics/Mz/Datum.lean | 14 ++++ doc/developer/semantics/Mz/Eval.lean | 10 +++ doc/developer/semantics/Mz/Expr.lean | 2 + doc/developer/semantics/Mz/Laws.lean | 87 +++++++++++++++++++++++++ doc/developer/semantics/README.md | 14 ++-- 7 files changed, 137 insertions(+), 6 deletions(-) create mode 100644 doc/developer/semantics/Mz/Laws.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 60d695d85b388..bdbdc81191e61 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -2,3 +2,4 @@ import Mz.Datum import Mz.Expr import Mz.Eval import Mz.Boolean +import Mz.Laws diff --git a/doc/developer/semantics/Mz/Boolean.lean b/doc/developer/semantics/Mz/Boolean.lean index 0a4f0bcd9f3b2..f5ed69bc9f92d 100644 --- a/doc/developer/semantics/Mz/Boolean.lean +++ b/doc/developer/semantics/Mz/Boolean.lean @@ -73,4 +73,19 @@ theorem or_err_null (e : EvalError) : theorem or_err_err (e₁ e₂ : EvalError) : evalOr (.err e₁) (.err e₂) = .err e₁ := rfl +/-! ## NOT -/ + +theorem not_true : evalNot (.bool true) = .bool false := rfl +theorem not_false : evalNot (.bool false) = .bool true := rfl +theorem not_null : evalNot .null = .null := rfl +theorem not_err (e : EvalError) : evalNot (.err e) = .err e := rfl + +/-- `Not` is involutive on the boolean fragment and a no-op on `null` +and `err`. The latter mirrors the strict propagation rule. 
-/ +theorem not_not (d : Datum) : evalNot (evalNot d) = d := by + cases d with + | bool b => cases b <;> rfl + | null => rfl + | err _ => rfl + end Mz diff --git a/doc/developer/semantics/Mz/Datum.lean b/doc/developer/semantics/Mz/Datum.lean index 0a5e7e400958d..16f2f8841c9e6 100644 --- a/doc/developer/semantics/Mz/Datum.lean +++ b/doc/developer/semantics/Mz/Datum.lean @@ -37,4 +37,18 @@ inductive Datum | err (e : EvalError) deriving Inhabited +/-- Propositional predicate "this datum is an error". + +Stated as `Prop` rather than `Bool` so it composes with `Not` and is +usable as a hypothesis in proofs without first lifting through +`= true`. The recursor over `Datum` collapses each branch to either +`True` or `False`, so `decide` (and `simp`) handles `IsErr` cleanly. -/ +def Datum.IsErr : Datum → Prop + | .err _ => True + | _ => False + +instance : DecidablePred Datum.IsErr := by + intro d + cases d <;> unfold Datum.IsErr <;> infer_instance + end Mz diff --git a/doc/developer/semantics/Mz/Eval.lean b/doc/developer/semantics/Mz/Eval.lean index 28acfa226c70e..dccfd4c3b8bbe 100644 --- a/doc/developer/semantics/Mz/Eval.lean +++ b/doc/developer/semantics/Mz/Eval.lean @@ -49,6 +49,15 @@ def evalOr : Datum → Datum → Datum | _, .null => .null | .bool false, .bool false => .bool false +/-- NOT evaluation table. + +`Not` is strict in the SQL sense: it propagates `null` and `err` while +flipping `true ↔ false`. -/ +def evalNot : Datum → Datum + | .bool b => .bool (!b) + | .null => .null + | .err e => .err e + /-- Environment: a positional list of bindings for `Expr.col`. 
-/ abbrev Env := List Datum @@ -66,6 +75,7 @@ def eval (env : Env) : Expr → Datum | .col i => Env.get env i | .and a b => evalAnd (eval env a) (eval env b) | .or a b => evalOr (eval env a) (eval env b) + | .not a => evalNot (eval env a) | .ifThen c t e => match eval env c with | .bool true => eval env t diff --git a/doc/developer/semantics/Mz/Expr.lean b/doc/developer/semantics/Mz/Expr.lean index 11f4a1005b9aa..8e2ba049f201f 100644 --- a/doc/developer/semantics/Mz/Expr.lean +++ b/doc/developer/semantics/Mz/Expr.lean @@ -19,6 +19,7 @@ namespace Mz * `lit d`: literal datum. * `col i`: reference to column `i` in the surrounding environment. * `and a b`, `or a b`: logical conjunction and disjunction. +* `not a`: logical negation. * `ifThen c t e`: PostgreSQL-style `CASE` / `If` — the only user-controllable short-circuit in `MirScalarExpr`. -/ inductive Expr @@ -26,6 +27,7 @@ inductive Expr | col (i : Nat) | and (a b : Expr) | or (a b : Expr) + | not (a : Expr) | ifThen (c t e : Expr) deriving Inhabited diff --git a/doc/developer/semantics/Mz/Laws.lean b/doc/developer/semantics/Mz/Laws.lean new file mode 100644 index 0000000000000..7deda0ba0acd0 --- /dev/null +++ b/doc/developer/semantics/Mz/Laws.lean @@ -0,0 +1,87 @@ +import Mz.Eval + +/-! +# Algebraic laws + +Laws over `evalAnd` and `evalOr` that constrain optimizer rewrites. + +The laws here are deliberately weaker than the unconditional laws of +two-valued boolean algebra. The current Materialize runtime +(`src/expr/src/scalar/func/variadic.rs`) lets `FALSE` absorb `ERR` in +`AND` and `TRUE` absorb `ERR` in `OR`, but does not let either of +`NULL` or another `ERR` absorb `ERR`. Consequently: + +* **Idempotence** holds unconditionally for every cell of the value + lattice, including `err`. The result is *the same* error, not an + arbitrary one — this matters for rewrites that fold `x AND x` to + `x`. 
+* **Commutativity** holds unless both operands are errors with + distinct payloads, because `evalAnd (.err e₁) (.err e₂)` selects + `e₁` while the swapped form selects `e₂`. The conditional form + guards on `¬IsErr` for both operands, which is what an + optimizer that has run `might_error` analysis can prove. + +Associativity is not stated here. It is expected to hold, since under +either bracketing the dominant boolean absorbs and otherwise the +leftmost `err` wins, but it is not yet mechanized; it is left for a +later iteration (possibly alongside a partial order on errors). +-/ + +namespace Mz + +/-! ## Idempotence -/ + +theorem evalAnd_idem (d : Datum) : evalAnd d d = d := by + cases d with + | bool b => cases b <;> rfl + | null => rfl + | err _ => rfl + +theorem evalOr_idem (d : Datum) : evalOr d d = d := by + cases d with + | bool b => cases b <;> rfl + | null => rfl + | err _ => rfl + +/-! ## Conditional commutativity + +`evalAnd` and `evalOr` commute whenever neither operand is an error. +The premise is stronger than strictly necessary — commutativity also +holds when exactly one operand is an error — but the symmetric form +matches the shape an optimizer typically carries (a `might_error` +flag per operand). A weaker premise can be added later if a transform +demands it. 
-/ + +theorem evalAnd_comm_of_no_err + {d₁ d₂ : Datum} (h₁ : ¬d₁.IsErr) (h₂ : ¬d₂.IsErr) : + evalAnd d₁ d₂ = evalAnd d₂ d₁ := by + cases d₁ with + | bool b₁ => + cases d₂ with + | bool b₂ => cases b₁ <;> cases b₂ <;> rfl + | null => cases b₁ <;> rfl + | err _ => exact (h₂ trivial).elim + | null => + cases d₂ with + | bool b₂ => cases b₂ <;> rfl + | null => rfl + | err _ => exact (h₂ trivial).elim + | err _ => exact (h₁ trivial).elim + +theorem evalOr_comm_of_no_err + {d₁ d₂ : Datum} (h₁ : ¬d₁.IsErr) (h₂ : ¬d₂.IsErr) : + evalOr d₁ d₂ = evalOr d₂ d₁ := by + cases d₁ with + | bool b₁ => + cases d₂ with + | bool b₂ => cases b₁ <;> cases b₂ <;> rfl + | null => cases b₁ <;> rfl + | err _ => exact (h₂ trivial).elim + | null => + cases d₂ with + | bool b₂ => cases b₂ <;> rfl + | null => rfl + | err _ => exact (h₂ trivial).elim + | err _ => exact (h₁ trivial).elim + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index e49d23299d562..5eb0187e0b617 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -8,10 +8,11 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four ## What is here -* `Mz/Datum.lean`: `Datum` and `EvalError` types. -* `Mz/Expr.lean`: a minimal `Expr` inductive (literals, columns, binary `and`/`or`, `ifThen`). -* `Mz/Eval.lean`: `evalAnd`, `evalOr`, and `eval` matching the runtime in `src/expr/src/scalar/func/variadic.rs`. -* `Mz/Boolean.lean`: per-cell truth-table proofs. +* `Mz/Datum.lean`: `Datum`, `EvalError`, and the `Datum.IsErr` predicate. +* `Mz/Expr.lean`: a minimal `Expr` inductive (literals, columns, binary `and`/`or`, `not`, `ifThen`). +* `Mz/Eval.lean`: `evalAnd`, `evalOr`, `evalNot`, and `eval` matching the runtime in `src/expr/src/scalar/func/variadic.rs`. +* `Mz/Boolean.lean`: per-cell truth-table proofs for `AND`, `OR`, and `NOT`, plus involutivity of `NOT`. 
+* `Mz/Laws.lean`: algebraic laws — idempotence (unconditional) and commutativity (conditional on error-freedom). ## What is not here @@ -57,9 +58,10 @@ Reviewers should expect both sides of the change in the same PR. The roadmap in priority order: -* Strict-propagation theorem: `f` strict implies `f(.., err, ..) = err`. +* `Expr.might_error` predicate plus soundness: `¬might_error e → ∀ env err-free, ¬(eval env e).IsErr`. +* Reorder-safety conditions on `Expr`: lift `evalAnd_comm_of_no_err` to `eval env (And a b) = eval env (And b a)` when `¬a.might_error ∧ ¬b.might_error`. +* Strict-propagation theorem: any strict function applied to `err` returns `err`. * Coalesce error-rescue law: `coalesce(err, x) = x`. -* Reorder-safety conditions: `(¬might_error a ∧ ¬might_error b) → eval (And [a, b]) = eval (And [b, a])`. * Variadic `And` and `Or` via fold, with equivalence to the binary form. * Lift to bag semantics for predicate / projection rewrites. From 21673bd44529d455d6e9fdfc89e62f7db2ac7949 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Sun, 17 May 2026 22:02:40 +0200 Subject: [PATCH 003/127] doc/semantics: add might_error analyzer + soundness + reorder safety MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Land the optimizer-facing theorem the previous commit's roadmap called out as priority one. * Switch `eval` for `.ifThen` to a strict `evalIfThen` helper. The observable `Datum` is identical to the lazy form in this total skeleton (both branches are total functions); strictness collapses the ifThen proof case to a single `cases` on the condition. * Redefine `Env.get` by primitive recursion so inductive proofs can `cases` on the defining equations directly without going through `List.getD`. * New `Mz/MightError.lean`: - Per-operator helper lemmas: `evalAnd_not_err`, `evalOr_not_err`, `evalNot_not_err`, `evalIfThen_not_err`. 
- `Expr.might_error` conservative analyzer (literal err taints every ancestor; columns assumed err-free). - `Env.ErrFree` predicate and `Env.get_not_err` lemma. - `might_error_sound`: if `¬e.might_error` and `env.ErrFree` then `¬(eval env e).IsErr`. Structural induction on `e`, one helper per Expr constructor. * `Mz/Laws.lean`: lift `evalAnd_comm_of_no_err` and `evalOr_comm_of_no_err` through `eval` via `might_error_sound`, giving `Expr`-level reorder-safety theorems `eval_and_comm_of_no_might_error` and `eval_or_comm_of_no_might_error`. These are the laws an optimizer would cite when reordering conjuncts in the boolean fragment. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/Eval.lean | 37 +++-- doc/developer/semantics/Mz/Laws.lean | 27 ++++ doc/developer/semantics/Mz/MightError.lean | 176 +++++++++++++++++++++ doc/developer/semantics/README.md | 14 +- 5 files changed, 237 insertions(+), 18 deletions(-) create mode 100644 doc/developer/semantics/Mz/MightError.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index bdbdc81191e61..f4170aba6ea51 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -2,4 +2,5 @@ import Mz.Datum import Mz.Expr import Mz.Eval import Mz.Boolean +import Mz.MightError import Mz.Laws diff --git a/doc/developer/semantics/Mz/Eval.lean b/doc/developer/semantics/Mz/Eval.lean index dccfd4c3b8bbe..5aa4bca6e4bde 100644 --- a/doc/developer/semantics/Mz/Eval.lean +++ b/doc/developer/semantics/Mz/Eval.lean @@ -58,16 +58,36 @@ def evalNot : Datum → Datum | .null => .null | .err e => .err e +/-- `IfThen` evaluation table. + +In SQL, the runtime form is lazy: only the selected branch is +evaluated, so a literal error inside the un-selected branch is never +raised. In this total skeleton every `Expr` evaluates to a `Datum` +regardless, so we model `IfThen` as a strict function of three values. 
+The observable output (a `Datum`) coincides with the lazy version +whenever the lazy version is defined, because both branches are total +functions of `Datum`. A future iteration that introduces effects or +partiality will have to reintroduce the laziness explicitly. -/ +def evalIfThen : Datum → Datum → Datum → Datum + | .bool true, dt, _ => dt + | .bool false, _, de => de + | .null, _, _ => .null + | .err e, _, _ => .err e + /-- Environment: a positional list of bindings for `Expr.col`. -/ abbrev Env := List Datum /-- Reading an out-of-bounds column yields `NULL`. -This is a modeling choice for the skeleton. The real evaluator -expects callers to provide a well-typed row of the correct width; -the skeleton avoids that obligation by defaulting to `NULL`. -/ -def Env.get (env : Env) (i : Nat) : Datum := - env.getD i .null +This is a modeling choice for the skeleton. The real evaluator expects +callers to provide a well-typed row of the correct width; the skeleton +avoids that obligation by defaulting to `NULL`. Defined by primitive +recursion on the list so that inductive proofs can `cases` on the +defining equations directly rather than going through `List.getD`. -/ +def Env.get : Env → Nat → Datum + | [], _ => .null + | d :: _, 0 => d + | _ :: rest, n + 1 => Env.get rest n /-- Big-step evaluation. 
-/ def eval (env : Env) : Expr → Datum @@ -76,11 +96,6 @@ def eval (env : Env) : Expr → Datum | .and a b => evalAnd (eval env a) (eval env b) | .or a b => evalOr (eval env a) (eval env b) | .not a => evalNot (eval env a) - | .ifThen c t e => - match eval env c with - | .bool true => eval env t - | .bool false => eval env e - | .null => .null - | .err err => .err err + | .ifThen c t e => evalIfThen (eval env c) (eval env t) (eval env e) end Mz diff --git a/doc/developer/semantics/Mz/Laws.lean b/doc/developer/semantics/Mz/Laws.lean index 7deda0ba0acd0..066f09e5e279a 100644 --- a/doc/developer/semantics/Mz/Laws.lean +++ b/doc/developer/semantics/Mz/Laws.lean @@ -1,4 +1,5 @@ import Mz.Eval +import Mz.MightError /-! # Algebraic laws @@ -84,4 +85,30 @@ theorem evalOr_comm_of_no_err | err _ => exact (h₂ trivial).elim | err _ => exact (h₁ trivial).elim +/-! ## Reorder safety on `Expr` + +These corollaries lift the conditional commutativity laws above +through `eval`. They are the precondition an optimizer must check +before swapping conjuncts: both operands must be statically proved +error-free by `might_error`, and the surrounding environment must be +error-free. 
-/ + +theorem eval_and_comm_of_no_might_error + {a b : Expr} {env : Env} + (ha : ¬(a.might_error = true)) (hb : ¬(b.might_error = true)) + (hEnv : env.ErrFree) : + eval env (.and a b) = eval env (.and b a) := by + have hae := might_error_sound ha hEnv + have hbe := might_error_sound hb hEnv + exact evalAnd_comm_of_no_err hae hbe + +theorem eval_or_comm_of_no_might_error + {a b : Expr} {env : Env} + (ha : ¬(a.might_error = true)) (hb : ¬(b.might_error = true)) + (hEnv : env.ErrFree) : + eval env (.or a b) = eval env (.or b a) := by + have hae := might_error_sound ha hEnv + have hbe := might_error_sound hb hEnv + exact evalOr_comm_of_no_err hae hbe + end Mz diff --git a/doc/developer/semantics/Mz/MightError.lean b/doc/developer/semantics/Mz/MightError.lean new file mode 100644 index 0000000000000..cba31df923dd4 --- /dev/null +++ b/doc/developer/semantics/Mz/MightError.lean @@ -0,0 +1,176 @@ +import Mz.Eval + +/-! +# `might_error` static analyzer and soundness + +A conservative analyzer that returns `true` when an `Expr` might +evaluate to an `err`, plus the soundness theorem that justifies its +use by the optimizer: if `might_error e` is `false` and the +surrounding environment carries no errors, then `eval env e` is not +an error. + +The analyzer in `src/expr/src/scalar.rs::might_error` is more refined +(it knows about `ErrorIfNull` and literal errors). The skeleton +version is purely structural; tightening it later is additive work +that does not change the soundness statement. +-/ + +namespace Mz + +/-! 
## Helper lemmas: error-free inputs yield an error-free output -/ + +theorem evalAnd_not_err + {d₁ d₂ : Datum} (h₁ : ¬d₁.IsErr) (h₂ : ¬d₂.IsErr) : + ¬(evalAnd d₁ d₂).IsErr := by + cases d₁ with + | bool b₁ => + cases d₂ with + | bool b₂ => cases b₁ <;> cases b₂ <;> decide + | null => cases b₁ <;> decide + | err _ => exact (h₂ trivial).elim + | null => + cases d₂ with + | bool b₂ => cases b₂ <;> decide + | null => decide + | err _ => exact (h₂ trivial).elim + | err _ => exact (h₁ trivial).elim + +theorem evalOr_not_err + {d₁ d₂ : Datum} (h₁ : ¬d₁.IsErr) (h₂ : ¬d₂.IsErr) : + ¬(evalOr d₁ d₂).IsErr := by + cases d₁ with + | bool b₁ => + cases d₂ with + | bool b₂ => cases b₁ <;> cases b₂ <;> decide + | null => cases b₁ <;> decide + | err _ => exact (h₂ trivial).elim + | null => + cases d₂ with + | bool b₂ => cases b₂ <;> decide + | null => decide + | err _ => exact (h₂ trivial).elim + | err _ => exact (h₁ trivial).elim + +theorem evalNot_not_err + {d : Datum} (h : ¬d.IsErr) : ¬(evalNot d).IsErr := by + cases d with + | bool b => cases b <;> decide + | null => decide + | err _ => exact (h trivial).elim + +theorem evalIfThen_not_err + {dc dt de : Datum} + (hc : ¬dc.IsErr) (ht : ¬dt.IsErr) (he : ¬de.IsErr) : + ¬(evalIfThen dc dt de).IsErr := by + cases dc with + | bool b => + cases b + · -- false branch: evalIfThen reduces to `de` + simp only [evalIfThen]; exact he + · -- true branch: evalIfThen reduces to `dt` + simp only [evalIfThen]; exact ht + | null => + simp only [evalIfThen]; decide + | err _ => exact (hc trivial).elim + +/-! ## Static analyzer -/ + +/-- Returns `true` when `e` might evaluate to an `err`. The current +implementation is purely structural and conservative: any literal +`err` taints every ancestor. Columns are assumed not to contain errors +(see `Env.ErrFree`). 
-/ +def Expr.might_error : Expr → Bool + | .lit (.err _) => true + | .lit _ => false + | .col _ => false + | .and a b => a.might_error || b.might_error + | .or a b => a.might_error || b.might_error + | .not a => a.might_error + | .ifThen c t e => c.might_error || t.might_error || e.might_error + +/-! ## Error-free environments -/ + +/-- An environment is error-free when every bound value is not an `err`. -/ +def Env.ErrFree (env : Env) : Prop := + ∀ d ∈ env, ¬d.IsErr + +theorem Env.get_not_err {env : Env} (hErr : env.ErrFree) (i : Nat) : + ¬(Env.get env i).IsErr := by + induction env generalizing i with + | nil => + -- `Env.get [] i = .null` by definition; `.null` is not an error. + intro h + cases h + | cons hd tl ih => + cases i with + | zero => + -- `Env.get (hd :: tl) 0 = hd`. `hd ∈ hd :: tl`, so `ErrFree` rules out err. + apply hErr hd + exact List.Mem.head tl + | succ n => + -- Reduce to the tail and apply the IH. + apply ih + intro d hd_mem + apply hErr d + exact List.Mem.tail hd hd_mem + +/-! ## Soundness -/ + +/-- If `might_error e` is `false` and the environment carries no +errors, then `eval env e` is not an error. + +The proof is structural induction on `e`. For each compound case the +hypothesis `¬e.might_error = true` decomposes into per-subexpression +hypotheses via `Bool` distribution. The matching helper lemma +(`evalAnd_not_err`, etc.) then concludes. -/ +theorem might_error_sound {e : Expr} : + ∀ {env : Env}, + ¬(e.might_error = true) → env.ErrFree → ¬(eval env e).IsErr := by + induction e with + | lit d => + intro _ hMe _ hRes + -- `eval env (lit d) = d`. If `d` were `err`, `might_error (lit d) = true`, + -- contradicting `hMe`. The other cases reduce `IsErr d` to `False`. + cases d with + | bool _ => cases hRes + | null => cases hRes + | err _ => exact hMe rfl + | col i => + intro env _ hEnv + -- `eval env (col i) = Env.get env i`, error-freedom of the env carries. 
+ show ¬(Env.get env i).IsErr + exact Env.get_not_err hEnv i + | and a b ih_a ih_b => + intro env hMe hEnv + have ha : ¬(a.might_error = true) := by + intro h; apply hMe; simp [Expr.might_error, h] + have hb : ¬(b.might_error = true) := by + intro h; apply hMe; simp [Expr.might_error, h] + show ¬(evalAnd (eval env a) (eval env b)).IsErr + exact evalAnd_not_err (ih_a ha hEnv) (ih_b hb hEnv) + | or a b ih_a ih_b => + intro env hMe hEnv + have ha : ¬(a.might_error = true) := by + intro h; apply hMe; simp [Expr.might_error, h] + have hb : ¬(b.might_error = true) := by + intro h; apply hMe; simp [Expr.might_error, h] + show ¬(evalOr (eval env a) (eval env b)).IsErr + exact evalOr_not_err (ih_a ha hEnv) (ih_b hb hEnv) + | not a ih_a => + intro env hMe hEnv + have ha : ¬(a.might_error = true) := by + intro h; apply hMe; simp [Expr.might_error, h] + show ¬(evalNot (eval env a)).IsErr + exact evalNot_not_err (ih_a ha hEnv) + | ifThen c t e ih_c ih_t ih_e => + intro env hMe hEnv + have hc : ¬(c.might_error = true) := by + intro h; apply hMe; simp [Expr.might_error, h] + have ht : ¬(t.might_error = true) := by + intro h; apply hMe; simp [Expr.might_error, h] + have he : ¬(e.might_error = true) := by + intro h; apply hMe; simp [Expr.might_error, h] + show ¬(evalIfThen (eval env c) (eval env t) (eval env e)).IsErr + exact evalIfThen_not_err (ih_c hc hEnv) (ih_t ht hEnv) (ih_e he hEnv) + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 5eb0187e0b617..1c9a268245c86 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -10,9 +10,10 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Datum.lean`: `Datum`, `EvalError`, and the `Datum.IsErr` predicate. * `Mz/Expr.lean`: a minimal `Expr` inductive (literals, columns, binary `and`/`or`, `not`, `ifThen`). 
-* `Mz/Eval.lean`: `evalAnd`, `evalOr`, `evalNot`, and `eval` matching the runtime in `src/expr/src/scalar/func/variadic.rs`. +* `Mz/Eval.lean`: `evalAnd`, `evalOr`, `evalNot`, `evalIfThen`, and `eval` matching the runtime in `src/expr/src/scalar/func/variadic.rs`. `Env.get` is defined by primitive recursion to keep inductive proofs simple. * `Mz/Boolean.lean`: per-cell truth-table proofs for `AND`, `OR`, and `NOT`, plus involutivity of `NOT`. -* `Mz/Laws.lean`: algebraic laws — idempotence (unconditional) and commutativity (conditional on error-freedom). +* `Mz/MightError.lean`: the conservative `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem that the optimizer needs in order to trust the analyzer's verdict. +* `Mz/Laws.lean`: algebraic laws — idempotence (unconditional), commutativity (conditional on error-freedom of operands), and `Expr`-level reorder safety as a corollary of soundness. ## What is not here @@ -58,11 +59,10 @@ Reviewers should expect both sides of the change in the same PR. The roadmap in priority order: -* `Expr.might_error` predicate plus soundness: `¬might_error e → ∀ env err-free, ¬(eval env e).IsErr`. -* Reorder-safety conditions on `Expr`: lift `evalAnd_comm_of_no_err` to `eval env (And a b) = eval env (And b a)` when `¬a.might_error ∧ ¬b.might_error`. -* Strict-propagation theorem: any strict function applied to `err` returns `err`. -* Coalesce error-rescue law: `coalesce(err, x) = x`. -* Variadic `And` and `Or` via fold, with equivalence to the binary form. +* Strict-propagation theorem: any strict function applied to `err` returns `err`. Requires introducing a `Strict` predicate on functions and proving closure under composition. +* Coalesce error-rescue law: `coalesce(err, x) = x`. Requires an `n`-ary `coalesce` operator on the `Expr` side. +* Variadic `And` and `Or` via fold, with equivalence to the binary form. 
The binary truth tables and laws transport to the variadic form by induction on the operand list. +* Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. The diff-semiring extension for global errors is a separate v2 effort. From 0a76308223be304a7dda22f3c5c6b133c6b2dd24 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Sun, 17 May 2026 22:05:50 +0200 Subject: [PATCH 004/127] doc/semantics: add strictness predicates and propagation theorems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * `ErrStrictUnary`, `ErrStrictBinary`, `NullStrictUnary` predicates capturing per-position strictness. The cell-scoped analogue of PostgreSQL's `STRICT` qualifier on `NULL`, applied to `err`. * Positive instances: `evalNot` is both err-strict and null-strict; the condition slot of `evalIfThen` is err-strict. * Closure: `ErrStrictUnary.comp` proves strictness is preserved under function composition, which is the property an optimizer relies on when fusing chains of strict scalar functions into a single MFP expression. * Negative results: `evalAnd` and `evalOr` are not err-strict in either argument position. The counterexamples (`FALSE AND ERR`, `TRUE OR ERR`) are also canonical regression tests — a future change to AND/OR that promoted these cells to `ERR` would break exactly these proofs. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/Strict.lean | 105 +++++++++++++++++++++++++ doc/developer/semantics/README.md | 4 +- 3 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 doc/developer/semantics/Mz/Strict.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index f4170aba6ea51..3e89104feb31d 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -3,4 +3,5 @@ import Mz.Expr import Mz.Eval import Mz.Boolean import Mz.MightError +import Mz.Strict import Mz.Laws diff --git a/doc/developer/semantics/Mz/Strict.lean b/doc/developer/semantics/Mz/Strict.lean new file mode 100644 index 0000000000000..85820b3dabe6e --- /dev/null +++ b/doc/developer/semantics/Mz/Strict.lean @@ -0,0 +1,105 @@ +import Mz.Eval + +/-! +# Strict propagation + +A scalar function is *err-strict in position k* when supplying `.err` +at that position forces the output to be the same `.err`, regardless +of the other arguments. This is the cell-scoped analogue of +PostgreSQL's `STRICT` qualifier on `NULL`: in the four-valued lattice +of this skeleton, `err` plays the role `NULL` plays in PostgreSQL for +strict functions. + +The boolean fragment exposes exactly one fully strict function +(`evalNot`) and one position-strict function (`evalIfThen`, strict +only in its condition). `evalAnd` and `evalOr` are *not* err-strict in +either position because `FALSE`/`TRUE` short-circuit absorbs the other +operand, including `err`. The corresponding negative results are +stated below; they ensure that the spec does not silently regress to +an "errors > everything" model. + +`NULL`-strictness is captured by a separate predicate. The two +strictness flavors agree for most arithmetic / comparison operators +but disagree for `AND`, `OR`, `IfThen`, and `COALESCE`, which the +spec singles out for special treatment. +-/ + +namespace Mz + +/-! 
## Strictness predicates -/ + +/-- `f` is err-strict: an `err` argument forces an `err` output with +the same payload. -/ +def ErrStrictUnary (f : Datum → Datum) : Prop := + ∀ e, f (.err e) = .err e + +/-- `f` is err-strict in both argument positions; constructing the +structure needs both fields. NOTE(review): on `(err e₁, err e₂)` the +fields force `e₁ = e₂`, so confirm `EvalError` permits an instance. -/ +structure ErrStrictBinary (f : Datum → Datum → Datum) : Prop where + left : ∀ d e, f (.err e) d = .err e + right : ∀ d e, f d (.err e) = .err e + +/-- `f` is null-strict: a `null` argument forces a `null` output. -/ +def NullStrictUnary (f : Datum → Datum) : Prop := + f .null = .null + +/-! ## Concrete instances on the boolean fragment -/ + +theorem evalNot_errStrict : ErrStrictUnary evalNot := fun _ => rfl + +theorem evalNot_nullStrict : NullStrictUnary evalNot := rfl + +/-- The condition slot of `IfThen` is err-strict: an `err` condition +forces an `err` output, with the same payload, no matter what the +branches contain. The branch slots are *not* err-strict: when the +condition selects the other branch, the error in the unselected +branch has no effect on the output. -/ +theorem evalIfThen_errStrict_condition (e : EvalError) (dt de : Datum) : + evalIfThen (.err e) dt de = .err e := rfl + +/-! ## Closure under composition + +If both `f` and `g` are err-strict, so is `f ∘ g`. This is the +property an optimizer uses when it fuses a chain of strict scalar +functions into a single MFP expression: strict-in-strict is strict. -/ + +theorem ErrStrictUnary.comp {f g : Datum → Datum} + (hf : ErrStrictUnary f) (hg : ErrStrictUnary g) : + ErrStrictUnary (f ∘ g) := by + intro e + show f (g (.err e)) = .err e + rw [hg e, hf e] + +/-! ## Negative results + +`AND` and `OR` are not err-strict in either position. The short +counterexamples below also serve as canonical regression tests: a +future change to `evalAnd` that accidentally promotes +`FALSE AND ERROR` to `ERROR` would break exactly these proofs. 
-/ + +theorem evalAnd_not_errStrict_left : + ¬ ∀ d e, evalAnd (.err e) d = .err e := by + intro h + have hh := h (.bool false) .placeholder + simp [evalAnd] at hh + +theorem evalAnd_not_errStrict_right : + ¬ ∀ d e, evalAnd d (.err e) = .err e := by + intro h + have hh := h (.bool false) .placeholder + simp [evalAnd] at hh + +theorem evalOr_not_errStrict_left : + ¬ ∀ d e, evalOr (.err e) d = .err e := by + intro h + have hh := h (.bool true) .placeholder + simp [evalOr] at hh + +theorem evalOr_not_errStrict_right : + ¬ ∀ d e, evalOr d (.err e) = .err e := by + intro h + have hh := h (.bool true) .placeholder + simp [evalOr] at hh + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 1c9a268245c86..044f586059a59 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -13,6 +13,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Eval.lean`: `evalAnd`, `evalOr`, `evalNot`, `evalIfThen`, and `eval` matching the runtime in `src/expr/src/scalar/func/variadic.rs`. `Env.get` is defined by primitive recursion to keep inductive proofs simple. * `Mz/Boolean.lean`: per-cell truth-table proofs for `AND`, `OR`, and `NOT`, plus involutivity of `NOT`. * `Mz/MightError.lean`: the conservative `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem that the optimizer needs in order to trust the analyzer's verdict. +* `Mz/Strict.lean`: strictness predicates (`ErrStrictUnary`, `ErrStrictBinary`, `NullStrictUnary`), positive instances for `evalNot` and the condition slot of `evalIfThen`, closure under composition, and negative results witnessing that `AND` and `OR` are *not* err-strict in either position. * `Mz/Laws.lean`: algebraic laws — idempotence (unconditional), commutativity (conditional on error-freedom of operands), and `Expr`-level reorder safety as a corollary of soundness. 
## What is not here @@ -59,8 +60,7 @@ Reviewers should expect both sides of the change in the same PR. The roadmap in priority order: -* Strict-propagation theorem: any strict function applied to `err` returns `err`. Requires introducing a `Strict` predicate on functions and proving closure under composition. -* Coalesce error-rescue law: `coalesce(err, x) = x`. Requires an `n`-ary `coalesce` operator on the `Expr` side. +* Coalesce error-rescue law: `coalesce(err, x) = x`. Requires an `n`-ary `coalesce` operator on the `Expr` side, plus the corresponding fold-based evaluator. * Variadic `And` and `Or` via fold, with equivalence to the binary form. The binary truth tables and laws transport to the variadic form by induction on the operand list. * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. From 7867c8bf931d64ff7db5647688a214eb4878c17c Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Sun, 17 May 2026 22:09:22 +0200 Subject: [PATCH 005/127] doc/semantics: add evalCoalesce and the error-rescue laws MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `evalCoalesce : List Datum → Datum` and prove the laws that characterize the proposed cell-scoped coalesce. Semantics: walk operands left to right; return the first concrete value. When none exists, apply a `null`-beats-`err` tiebreak — if any operand was `null`, return `null`; otherwise, return the first `err`. The state machine in `Coalesce.go` tracks a `seenNull` sticky bit and a first-`err` payload. The defining error-rescue property (`coalesce_err_rescue_bool`) and its symmetric null-rescue (`coalesce_null_rescue_bool`) together establish that a later concrete operand rescues an earlier `err` exactly as it rescues an earlier `null`. 
This is the explicit, user-controllable error trap referenced in the design doc. The `null`-beats-`err` tiebreak (`coalesce_null_then_err`, `coalesce_err_then_null`) preserves the PostgreSQL behavior users expect when all operands are NULL, and is dual to the strict-function rule documented in `Mz/Strict.lean`: strict functions promote `err` above `null` in per-cell results; `coalesce` is non-strict and demotes `err` below `null` in the tiebreak. Defer wiring `evalCoalesce` into `Expr` as a `.coalesce` constructor until the variadic `And`/`Or` ctors land — the termination story is shared between them. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/Coalesce.lean | 115 +++++++++++++++++++++++ doc/developer/semantics/README.md | 3 +- 3 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 doc/developer/semantics/Mz/Coalesce.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 3e89104feb31d..20b132b1d9509 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -4,4 +4,5 @@ import Mz.Eval import Mz.Boolean import Mz.MightError import Mz.Strict +import Mz.Coalesce import Mz.Laws diff --git a/doc/developer/semantics/Mz/Coalesce.lean b/doc/developer/semantics/Mz/Coalesce.lean new file mode 100644 index 0000000000000..5d14afd34cbe7 --- /dev/null +++ b/doc/developer/semantics/Mz/Coalesce.lean @@ -0,0 +1,115 @@ +import Mz.Eval + +/-! +# `coalesce` and the error-rescue law + +`coalesce(d₁, …, dₙ)` returns the first operand that is neither +`null` nor `err`, evaluating left to right. The proposed extension +over PostgreSQL is that `err` is rescuable in the same way `null` +is: a later non-error operand can substitute for an earlier one, +whether that earlier one was `null`, `err`, or any combination. + +When no concrete value is found, the result follows a `null`-beats- +`err` rule: + +* If any operand was `null`, return `null`. 
+* Otherwise, if any operand was `err`, return the first such `err`. +* Otherwise (the empty list), return `null`. + +The "`null` beats `err`" tiebreaker preserves PostgreSQL's familiar +`coalesce` behavior for the all-`null` case while extending it +cleanly. It is the dual of the strict-function rule documented in +`Mz/Strict.lean`: strict functions promote `err` above `null` in the +result of a per-cell computation; `coalesce` is non-strict and +demotes `err` below `null`. +-/ + +namespace Mz + +/-- Tail-recursive state machine for `evalCoalesce`. State carries: + +* `seenNull`: whether any preceding operand was `null`. Sticky bit. +* `firstErr`: the payload of the earliest `err` operand encountered, + if any. + +Encountering a concrete `.bool b` returns immediately and bypasses +the state. -/ +private def Coalesce.go (seenNull : Bool) (firstErr : Option EvalError) : + List Datum → Datum + | [] => + if seenNull then .null + else + match firstErr with + | some e => .err e + | none => .null + | .bool b :: _ => .bool b + | .null :: rest => Coalesce.go true firstErr rest + | .err e :: rest => + match firstErr with + | some _ => Coalesce.go seenNull firstErr rest + | none => Coalesce.go seenNull (some e) rest + +/-- The `coalesce` evaluator. Walks operands left to right and returns +the first concrete value, with the `null`-beats-`err` tiebreak when no +concrete value is found. -/ +def evalCoalesce : List Datum → Datum := + Coalesce.go false none + +/-! ## Base cases -/ + +theorem coalesce_nil : evalCoalesce [] = .null := rfl + +theorem coalesce_singleton_bool (b : Bool) : + evalCoalesce [.bool b] = .bool b := rfl + +theorem coalesce_singleton_null : + evalCoalesce [.null] = .null := rfl + +theorem coalesce_singleton_err (e : EvalError) : + evalCoalesce [.err e] = .err e := rfl + +/-! ## Error-rescue laws + +The defining property of the proposed `coalesce`: a later non-error, +non-null operand rescues an earlier `err` exactly as it rescues an +earlier `null`. 
-/ + +theorem coalesce_err_rescue_bool (e : EvalError) (b : Bool) : + evalCoalesce [.err e, .bool b] = .bool b := rfl + +theorem coalesce_null_rescue_bool (b : Bool) : + evalCoalesce [.null, .bool b] = .bool b := rfl + +/-! ## `null` beats `err` -/ + +theorem coalesce_null_then_err (e : EvalError) : + evalCoalesce [.null, .err e] = .null := rfl + +theorem coalesce_err_then_null (e : EvalError) : + evalCoalesce [.err e, .null] = .null := rfl + +/-! ## First error wins among errors -/ + +theorem coalesce_first_err_wins (e₁ e₂ : EvalError) : + evalCoalesce [.err e₁, .err e₂] = .err e₁ := rfl + +/-! ## Three-operand examples + +These nail down the interaction between several `err`s, a `null`, +and a concrete value. They are intentionally stated as concrete +equations rather than universal laws so that a regression in +`Coalesce.go` breaks the offending equation in isolation. -/ + +theorem coalesce_err_err_bool (e₁ e₂ : EvalError) (b : Bool) : + evalCoalesce [.err e₁, .err e₂, .bool b] = .bool b := rfl + +theorem coalesce_err_err_null (e₁ e₂ : EvalError) : + evalCoalesce [.err e₁, .err e₂, .null] = .null := rfl + +theorem coalesce_err_null_err (e₁ e₂ : EvalError) : + evalCoalesce [.err e₁, .null, .err e₂] = .null := rfl + +theorem coalesce_null_err_bool (e : EvalError) (b : Bool) : + evalCoalesce [.null, .err e, .bool b] = .bool b := rfl + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 044f586059a59..f5ce4bbe0d22b 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -14,6 +14,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Boolean.lean`: per-cell truth-table proofs for `AND`, `OR`, and `NOT`, plus involutivity of `NOT`. * `Mz/MightError.lean`: the conservative `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem that the optimizer needs in order to trust the analyzer's verdict. 
* `Mz/Strict.lean`: strictness predicates (`ErrStrictUnary`, `ErrStrictBinary`, `NullStrictUnary`), positive instances for `evalNot` and the condition slot of `evalIfThen`, closure under composition, and negative results witnessing that `AND` and `OR` are *not* err-strict in either position. +* `Mz/Coalesce.lean`: the proposed `coalesce` evaluator and the error-rescue laws. A later non-error, non-null operand rescues an earlier `err` exactly as it rescues a `null`. The `null`-beats-`err` tiebreak preserves the all-`null` behavior PostgreSQL users expect. * `Mz/Laws.lean`: algebraic laws — idempotence (unconditional), commutativity (conditional on error-freedom of operands), and `Expr`-level reorder safety as a corollary of soundness. ## What is not here @@ -60,7 +61,7 @@ Reviewers should expect both sides of the change in the same PR. The roadmap in priority order: -* Coalesce error-rescue law: `coalesce(err, x) = x`. Requires an `n`-ary `coalesce` operator on the `Expr` side, plus the corresponding fold-based evaluator. +* Wire `evalCoalesce` into `Expr` as `.coalesce (args : List Expr)`, with the eval clause and the lifted error-rescue theorems. Defer until the variadic `And`/`Or` ctors land in the same commit (the termination story is shared between them). * Variadic `And` and `Or` via fold, with equivalence to the binary form. The binary truth tables and laws transport to the variadic form by induction on the operand list. * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. 
From f0b00c4dc2825dd85302e763c723d2ebd2efd031 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Sun, 17 May 2026 22:15:19 +0200 Subject: [PATCH 006/127] doc/semantics: add identity laws and variadic And/Or with absorption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the missing identity laws and the variadic-fold counterparts of the binary AND/OR evaluators. * `Mz/Laws.lean`: `TRUE` is the two-sided identity for `evalAnd` and `FALSE` is the two-sided identity for `evalOr`. The proofs verify each cell of the non-identity argument. These identities are also the seed values used by the variadic folds. * `Mz/Variadic.lean`: `evalAndN`, `evalOrN : List Datum → Datum` defined by right-fold. Right-fold gives the cons recurrence by `rfl` and avoids the associativity argument a left-fold would require. - Cons recurrence, nil, singleton, and binary equivalence with `evalAnd` / `evalOr`. The binary equivalence is the bridge that transports every binary truth-table cell and algebraic law to the variadic form on lists of length two. - Absorption theorems: a single `FALSE` anywhere in the operand list forces `evalAndN` to `FALSE`; symmetric statement for `TRUE` and `evalOrN`. These justify the runtime short-circuit optimization. Wiring `evalAndN`, `evalOrN`, and `evalCoalesce` into `Expr` as list-carrying constructors is deferred — the termination story for the nested inductive is shared between them and is best landed in a single commit once it's been validated. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/Laws.lean | 31 +++++ doc/developer/semantics/Mz/Variadic.lean | 138 +++++++++++++++++++++++ doc/developer/semantics/README.md | 6 +- 4 files changed, 173 insertions(+), 3 deletions(-) create mode 100644 doc/developer/semantics/Mz/Variadic.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 20b132b1d9509..9b8038e84bceb 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -6,3 +6,4 @@ import Mz.MightError import Mz.Strict import Mz.Coalesce import Mz.Laws +import Mz.Variadic diff --git a/doc/developer/semantics/Mz/Laws.lean b/doc/developer/semantics/Mz/Laws.lean index 066f09e5e279a..d42f03b5a00ce 100644 --- a/doc/developer/semantics/Mz/Laws.lean +++ b/doc/developer/semantics/Mz/Laws.lean @@ -30,6 +30,37 @@ introduces a partial-order on errors. namespace Mz +/-! ## Identity laws + +`TRUE` is the two-sided identity for `evalAnd`; `FALSE` is the +two-sided identity for `evalOr`. The proofs verify each cell of the +non-identity argument. Identities are the seed values used by the +variadic fold in `Mz/Variadic.lean`. -/ + +theorem evalAnd_true_left (d : Datum) : evalAnd (.bool true) d = d := by + cases d with + | bool b => cases b <;> rfl + | null => rfl + | err _ => rfl + +theorem evalAnd_true_right (d : Datum) : evalAnd d (.bool true) = d := by + cases d with + | bool b => cases b <;> rfl + | null => rfl + | err _ => rfl + +theorem evalOr_false_left (d : Datum) : evalOr (.bool false) d = d := by + cases d with + | bool b => cases b <;> rfl + | null => rfl + | err _ => rfl + +theorem evalOr_false_right (d : Datum) : evalOr d (.bool false) = d := by + cases d with + | bool b => cases b <;> rfl + | null => rfl + | err _ => rfl + /-! 
## Idempotence -/ theorem evalAnd_idem (d : Datum) : evalAnd d d = d := by diff --git a/doc/developer/semantics/Mz/Variadic.lean b/doc/developer/semantics/Mz/Variadic.lean new file mode 100644 index 0000000000000..a226b8a3e0ee6 --- /dev/null +++ b/doc/developer/semantics/Mz/Variadic.lean @@ -0,0 +1,138 @@ +import Mz.Eval +import Mz.Laws + +/-! +# Variadic `AND` and `OR` + +`MirScalarExpr::VariadicFunc::And` and `Or` take an arbitrary number +of operands. This file defines their semantic counterparts over +`List Datum` and shows that the binary `evalAnd` / `evalOr` are the +two-operand specializations of the variadic forms. + +The variadic forms are defined by right-fold so the cons recurrence +holds by `rfl`. With a left-fold the recurrence would require a +separate associativity argument; the right-fold gives the same final +value because `TRUE`/`FALSE` are two-sided identities for +`evalAnd`/`evalOr` (see `Mz/Laws.lean`). + +A key spec property of `MirScalarExpr` `And`/`Or` is that `FALSE` +(resp. `TRUE`) absorbs every other operand including `err`. Lifted +to the variadic form: if `FALSE` appears anywhere in the list, +`evalAndN` is `FALSE`. The corresponding `evalOrN` theorem holds for +`TRUE`. These absorption theorems justify the runtime's short-circuit +optimization and are what an optimizer cites when it folds +`x AND false` to `false` regardless of `x`'s side effects (which in +this skeleton are limited to producing `err`s). +-/ + +namespace Mz + +/-! ## Definitions -/ + +/-- Right-fold evaluator for variadic `AND`. + +The seed value is `TRUE`, the identity for `evalAnd`. The right-fold +gives `evalAndN (d :: ds) = evalAnd d (evalAndN ds)` by definition, +which makes inductive proofs trivial. -/ +def evalAndN : List Datum → Datum + | [] => .bool true + | d :: rest => evalAnd d (evalAndN rest) + +/-- Right-fold evaluator for variadic `OR`. Dual of `evalAndN`. 
-/ +def evalOrN : List Datum → Datum + | [] => .bool false + | d :: rest => evalOr d (evalOrN rest) + +/-! ## Cons recurrence -/ + +theorem evalAndN_cons (d : Datum) (ds : List Datum) : + evalAndN (d :: ds) = evalAnd d (evalAndN ds) := rfl + +theorem evalOrN_cons (d : Datum) (ds : List Datum) : + evalOrN (d :: ds) = evalOr d (evalOrN ds) := rfl + +/-! ## Identity cases -/ + +theorem evalAndN_nil : evalAndN [] = .bool true := rfl +theorem evalOrN_nil : evalOrN [] = .bool false := rfl + +theorem evalAndN_singleton (d : Datum) : evalAndN [d] = d := by + show evalAnd d (.bool true) = d + exact evalAnd_true_right d + +theorem evalOrN_singleton (d : Datum) : evalOrN [d] = d := by + show evalOr d (.bool false) = d + exact evalOr_false_right d + +/-! ## Binary equivalence + +The two-operand variadic forms agree with the binary evaluators. +This is the bridge that lets every binary theorem in +`Mz/Boolean.lean` and `Mz/Laws.lean` carry over to the variadic +operators on lists of length two. -/ + +theorem evalAndN_binary (a b : Datum) : + evalAndN [a, b] = evalAnd a b := by + show evalAnd a (evalAnd b (.bool true)) = evalAnd a b + rw [evalAnd_true_right] + +theorem evalOrN_binary (a b : Datum) : + evalOrN [a, b] = evalOr a b := by + show evalOr a (evalOr b (.bool false)) = evalOr a b + rw [evalOr_false_right] + +/-! ## Absorption + +A single `FALSE` anywhere in the operand list collapses `evalAndN` to +`FALSE`. Symmetric statement for `TRUE` and `evalOrN`. The induction +is on the operand list, using the cons recurrence and the fact that +`evalAnd` returns `FALSE` whenever its right argument is `FALSE`. 
-/ + +private theorem evalAnd_false_right_any (d : Datum) : + evalAnd d (.bool false) = .bool false := by + cases d with + | bool b => cases b <;> rfl + | null => rfl + | err _ => rfl + +private theorem evalOr_true_right_any (d : Datum) : + evalOr d (.bool true) = .bool true := by + cases d with + | bool b => cases b <;> rfl + | null => rfl + | err _ => rfl + +theorem evalAndN_false_absorbs : + ∀ {ds : List Datum}, Datum.bool false ∈ ds → evalAndN ds = .bool false := by + intro ds hmem + induction ds with + | nil => cases hmem + | cons hd tl ih => + cases hmem with + | head _ => + -- `hd` was unified with `.bool false`. Cons recurrence reduces. + show evalAnd (.bool false) (evalAndN tl) = .bool false + rfl + | tail _ hmem' => + have htl : evalAndN tl = .bool false := ih hmem' + show evalAnd hd (evalAndN tl) = .bool false + rw [htl] + exact evalAnd_false_right_any hd + +theorem evalOrN_true_absorbs : + ∀ {ds : List Datum}, Datum.bool true ∈ ds → evalOrN ds = .bool true := by + intro ds hmem + induction ds with + | nil => cases hmem + | cons hd tl ih => + cases hmem with + | head _ => + show evalOr (.bool true) (evalOrN tl) = .bool true + rfl + | tail _ hmem' => + have htl : evalOrN tl = .bool true := ih hmem' + show evalOr hd (evalOrN tl) = .bool true + rw [htl] + exact evalOr_true_right_any hd + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index f5ce4bbe0d22b..87cb8a9515d24 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -15,7 +15,8 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/MightError.lean`: the conservative `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem that the optimizer needs in order to trust the analyzer's verdict. 
* `Mz/Strict.lean`: strictness predicates (`ErrStrictUnary`, `ErrStrictBinary`, `NullStrictUnary`), positive instances for `evalNot` and the condition slot of `evalIfThen`, closure under composition, and negative results witnessing that `AND` and `OR` are *not* err-strict in either position. * `Mz/Coalesce.lean`: the proposed `coalesce` evaluator and the error-rescue laws. A later non-error, non-null operand rescues an earlier `err` exactly as it rescues a `null`. The `null`-beats-`err` tiebreak preserves the all-`null` behavior PostgreSQL users expect. -* `Mz/Laws.lean`: algebraic laws — idempotence (unconditional), commutativity (conditional on error-freedom of operands), and `Expr`-level reorder safety as a corollary of soundness. +* `Mz/Laws.lean`: algebraic laws — two-sided identity (`TRUE` for `AND`, `FALSE` for `OR`), idempotence (unconditional), commutativity (conditional on error-freedom of operands), and `Expr`-level reorder safety as a corollary of soundness. +* `Mz/Variadic.lean`: variadic `evalAndN` and `evalOrN` over `List Datum` as right-folds, the binary-equivalence theorems that bridge them to `evalAnd`/`evalOr`, and the `FALSE`/`TRUE` absorption theorems that justify short-circuit evaluation. ## What is not here @@ -61,8 +62,7 @@ Reviewers should expect both sides of the change in the same PR. The roadmap in priority order: -* Wire `evalCoalesce` into `Expr` as `.coalesce (args : List Expr)`, with the eval clause and the lifted error-rescue theorems. Defer until the variadic `And`/`Or` ctors land in the same commit (the termination story is shared between them). -* Variadic `And` and `Or` via fold, with equivalence to the binary form. The binary truth tables and laws transport to the variadic form by induction on the operand list. +* Wire `evalAndN`, `evalOrN`, and `evalCoalesce` into `Expr` as `.andN`, `.orN`, and `.coalesce` constructors taking `List Expr`. 
Termination for the eval clause is the shared engineering effort — once one of these lands, the others follow trivially. * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. From 3f534a7fa794b75eecf2808ae55c450ae6329822 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Sun, 17 May 2026 23:07:09 +0200 Subject: [PATCH 007/127] doc/semantics: wire variadic ops into Expr as list-carrying ctors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `.andN`, `.orN`, and `.coalesce` to `Expr` as variadic constructors carrying `List Expr`, with the corresponding `eval` clauses and Expr-level reduction lemmas. The variadic primitives (`evalAndN`, `evalOrN`, `evalCoalesce`) move to a new `Mz/PrimEval.lean` to avoid the circular dependency that would otherwise arise between `Mz/Eval.lean` and the variadic-law files. File reorganization: * New `Mz/PrimEval.lean`: every evaluator that operates on `Datum` or `List Datum` without referring to `Expr` — the binary boolean primitives, the environment, and the variadic primitives. * `Mz/Eval.lean` shrinks to the `eval : Env → Expr → Datum` definition. * `Mz/Variadic.lean` and `Mz/Coalesce.lean` drop their now-moved evaluator definitions and keep only the theorems. * `Mz/Boolean.lean` and `Mz/Strict.lean` switch their imports to `Mz.PrimEval` since they never used the Expr-level evaluator. `might_error_sound`: * `induction` cannot be used on the now-nested-inductive `Expr`, so the soundness proof is rewritten as a recursive `theorem` that pattern-matches the constructor and recurses on subexpressions. The signature also makes `env` explicit so the recursive calls in compound cases stay readable. 
* The list-carrying constructors are handled vacuously: the conservative `might_error` for them returns `true` unconditionally, so the soundness premise is absurd in those cases. A future refinement will tighten the analyzer to inspect operands. New `Mz/ExprVariadic.lean`: * `eval_andN`, `eval_orN`, `eval_coalesce` connect the Expr-level evaluator to the variadic primitives. * Empty / singleton cases for all three. * Binary equivalence: `eval env (.andN [a, b]) = eval env (.and a b)` and the dual for `.orN`, transporting `Mz/Variadic.lean`'s binary laws through `eval`. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/Boolean.lean | 2 +- doc/developer/semantics/Mz/Coalesce.lean | 34 +----- doc/developer/semantics/Mz/Eval.lean | 104 ++++------------ doc/developer/semantics/Mz/Expr.lean | 13 +- doc/developer/semantics/Mz/ExprVariadic.lean | 98 +++++++++++++++ doc/developer/semantics/Mz/Laws.lean | 10 +- doc/developer/semantics/Mz/MightError.lean | 117 ++++++++++-------- doc/developer/semantics/Mz/PrimEval.lean | 121 +++++++++++++++++++ doc/developer/semantics/Mz/Strict.lean | 2 +- doc/developer/semantics/Mz/Variadic.lean | 25 +--- doc/developer/semantics/README.md | 15 ++- 12 files changed, 346 insertions(+), 196 deletions(-) create mode 100644 doc/developer/semantics/Mz/ExprVariadic.lean create mode 100644 doc/developer/semantics/Mz/PrimEval.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 9b8038e84bceb..02485f311e937 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -7,3 +7,4 @@ import Mz.Strict import Mz.Coalesce import Mz.Laws import Mz.Variadic +import Mz.ExprVariadic diff --git a/doc/developer/semantics/Mz/Boolean.lean b/doc/developer/semantics/Mz/Boolean.lean index f5ed69bc9f92d..d8c9a32d68812 100644 --- a/doc/developer/semantics/Mz/Boolean.lean +++ b/doc/developer/semantics/Mz/Boolean.lean @@ -1,4 +1,4 @@ -import Mz.Eval 
+import Mz.PrimEval /-! # Boolean truth tables diff --git a/doc/developer/semantics/Mz/Coalesce.lean b/doc/developer/semantics/Mz/Coalesce.lean index 5d14afd34cbe7..e51acd97ac57d 100644 --- a/doc/developer/semantics/Mz/Coalesce.lean +++ b/doc/developer/semantics/Mz/Coalesce.lean @@ -1,4 +1,4 @@ -import Mz.Eval +import Mz.PrimEval /-! # `coalesce` and the error-rescue law @@ -9,6 +9,9 @@ over PostgreSQL is that `err` is rescuable in the same way `null` is: a later non-error operand can substitute for an earlier one, whether that earlier one was `null`, `err`, or any combination. +The evaluator (`evalCoalesce`) and its state-machine helper live in +`Mz/PrimEval.lean`. This file collects the laws. + When no concrete value is found, the result follows a `null`-beats- `err` rule: @@ -26,35 +29,6 @@ demotes `err` below `null`. namespace Mz -/-- Tail-recursive state machine for `evalCoalesce`. State carries: - -* `seenNull`: whether any preceding operand was `null`. Sticky bit. -* `firstErr`: the payload of the earliest `err` operand encountered, - if any. - -Encountering a concrete `.bool b` returns immediately and bypasses -the state. -/ -private def Coalesce.go (seenNull : Bool) (firstErr : Option EvalError) : - List Datum → Datum - | [] => - if seenNull then .null - else - match firstErr with - | some e => .err e - | none => .null - | .bool b :: _ => .bool b - | .null :: rest => Coalesce.go true firstErr rest - | .err e :: rest => - match firstErr with - | some _ => Coalesce.go seenNull firstErr rest - | none => Coalesce.go seenNull (some e) rest - -/-- The `coalesce` evaluator. Walks operands left to right and returns -the first concrete value, with the `null`-beats-`err` tiebreak when no -concrete value is found. -/ -def evalCoalesce : List Datum → Datum := - Coalesce.go false none - /-! 
## Base cases -/ theorem coalesce_nil : evalCoalesce [] = .null := rfl diff --git a/doc/developer/semantics/Mz/Eval.lean b/doc/developer/semantics/Mz/Eval.lean index 5aa4bca6e4bde..f9acc4376fa69 100644 --- a/doc/developer/semantics/Mz/Eval.lean +++ b/doc/developer/semantics/Mz/Eval.lean @@ -1,94 +1,31 @@ -import Mz.Datum import Mz.Expr +import Mz.PrimEval /-! # `eval` -Operational semantics for `Expr`. - -`evalAnd` and `evalOr` are pattern-ordered to match Materialize's -current runtime behavior in `src/expr/src/scalar/func/variadic.rs` -(`And::eval`, `Or::eval`). Specifically: - -* `FALSE` in either operand absorbs every other value, including - `null` and `err`. Symmetric for `TRUE` in `OR`. -* Otherwise, `err` is the result whenever it appears, including when - the other operand is `null`. This deliberately matches the current - Mz behavior rather than the "NULL absorbs ERROR" rule discussed in - `doc/developer/design/20260517_error_handling_semantics.md`. - -A change to either rule should produce a corresponding diff in -`Mz/Boolean.lean`. That diff is the spec change. +Big-step evaluator. The primitive operations on `Datum` and +`List Datum` live in `Mz/PrimEval.lean`; this file only defines the +`Expr` → `Datum` translation. + +For the variadic constructors `andN`, `orN`, and `coalesce`, the +evaluator first evaluates every operand and then hands the resulting +`List Datum` to the variadic primitive. Operand evaluation is written +as `args.map (eval env)`; termination relies on Lean recognizing +each element of `args` as smaller than the enclosing `Expr`. + +Modeling note on laziness: in the runtime, the boolean fragment short- +circuits — once a `FALSE` is seen, the rest of the `AND` operands are +not evaluated. In this total skeleton, every `eval env e` is a total +function of its inputs, and the absorption theorems in +`Mz/Variadic.lean` guarantee that strict evaluation produces the same +`Datum` as the lazy runtime would.
A future iteration that introduces +effects (resource usage, partiality, observability) will need to +reintroduce the laziness explicitly. -/ namespace Mz -/-- AND evaluation table. - -The pattern order encodes the absorption hierarchy: -`FALSE > ERROR > NULL > TRUE`. -/ -def evalAnd : Datum → Datum → Datum - | .bool false, _ => .bool false - | _, .bool false => .bool false - | .err e, _ => .err e - | _, .err e => .err e - | .null, _ => .null - | _, .null => .null - | .bool true, .bool true => .bool true - -/-- OR evaluation table. - -Mirror of `evalAnd` with `TRUE` as the dominant absorber: -`TRUE > ERROR > NULL > FALSE`. -/ -def evalOr : Datum → Datum → Datum - | .bool true, _ => .bool true - | _, .bool true => .bool true - | .err e, _ => .err e - | _, .err e => .err e - | .null, _ => .null - | _, .null => .null - | .bool false, .bool false => .bool false - -/-- NOT evaluation table. - -`Not` is strict in the SQL sense: it propagates `null` and `err` while -flipping `true ↔ false`. -/ -def evalNot : Datum → Datum - | .bool b => .bool (!b) - | .null => .null - | .err e => .err e - -/-- `IfThen` evaluation table. - -In SQL, the runtime form is lazy: only the selected branch is -evaluated, so a literal error inside the un-selected branch is never -raised. In this total skeleton every `Expr` evaluates to a `Datum` -regardless, so we model `IfThen` as a strict function of three values. -The observable output (a `Datum`) coincides with the lazy version -whenever the lazy version is defined, because both branches are total -functions of `Datum`. A future iteration that introduces effects or -partiality will have to reintroduce the laziness explicitly. -/ -def evalIfThen : Datum → Datum → Datum → Datum - | .bool true, dt, _ => dt - | .bool false, _, de => de - | .null, _, _ => .null - | .err e, _, _ => .err e - -/-- Environment: a positional list of bindings for `Expr.col`. -/ -abbrev Env := List Datum - -/-- Reading an out-of-bounds column yields `NULL`. 
- -This is a modeling choice for the skeleton. The real evaluator expects -callers to provide a well-typed row of the correct width; the skeleton -avoids that obligation by defaulting to `NULL`. Defined by primitive -recursion on the list so that inductive proofs can `cases` on the -defining equations directly rather than going through `List.getD`. -/ -def Env.get : Env → Nat → Datum - | [], _ => .null - | d :: _, 0 => d - | _ :: rest, n + 1 => Env.get rest n - /-- Big-step evaluation. -/ def eval (env : Env) : Expr → Datum | .lit d => d @@ -97,5 +34,8 @@ def eval (env : Env) : Expr → Datum | .or a b => evalOr (eval env a) (eval env b) | .not a => evalNot (eval env a) | .ifThen c t e => evalIfThen (eval env c) (eval env t) (eval env e) + | .andN args => evalAndN (args.map (eval env)) + | .orN args => evalOrN (args.map (eval env)) + | .coalesce args => evalCoalesce (args.map (eval env)) end Mz diff --git a/doc/developer/semantics/Mz/Expr.lean b/doc/developer/semantics/Mz/Expr.lean index 8e2ba049f201f..b2af7186cdbbc 100644 --- a/doc/developer/semantics/Mz/Expr.lean +++ b/doc/developer/semantics/Mz/Expr.lean @@ -18,10 +18,16 @@ namespace Mz * `lit d`: literal datum. * `col i`: reference to column `i` in the surrounding environment. -* `and a b`, `or a b`: logical conjunction and disjunction. +* `and a b`, `or a b`: binary logical conjunction and disjunction. * `not a`: logical negation. * `ifThen c t e`: PostgreSQL-style `CASE` / `If` — the only - user-controllable short-circuit in `MirScalarExpr`. -/ + user-controllable short-circuit in `MirScalarExpr`. +* `andN args`, `orN args`: variadic logical conjunction and + disjunction. These match `MirScalarExpr::VariadicFunc::{And, Or}` + in the runtime; the binary `and` / `or` constructors are kept + alongside for proof convenience. +* `coalesce args`: the error-rescuing variant of `COALESCE` proposed + in `doc/developer/design/20260517_error_handling_semantics.md`. 
-/ inductive Expr | lit (d : Datum) | col (i : Nat) @@ -29,6 +35,9 @@ inductive Expr | or (a b : Expr) | not (a : Expr) | ifThen (c t e : Expr) + | andN (args : List Expr) + | orN (args : List Expr) + | coalesce (args : List Expr) deriving Inhabited end Mz diff --git a/doc/developer/semantics/Mz/ExprVariadic.lean b/doc/developer/semantics/Mz/ExprVariadic.lean new file mode 100644 index 0000000000000..82721b351c0ff --- /dev/null +++ b/doc/developer/semantics/Mz/ExprVariadic.lean @@ -0,0 +1,98 @@ +import Mz.Eval +import Mz.Variadic +import Mz.Coalesce + +/-! +# `Expr`-level reduction lemmas for the variadic constructors + +The variadic `Expr` constructors (`andN`, `orN`, `coalesce`) evaluate +by mapping `eval env` across the operand list and then handing the +result to the corresponding variadic primitive. These reduction +lemmas state that explicitly so downstream proofs can rewrite without +having to unfold `eval`. + +Each lemma is a one-line `simp only [eval]` because the matching clause +of `eval` is structurally identical to the lemma's right-hand side. + +Concrete consequences derived from `Mz/Variadic.lean` follow at the +end of the file: the empty / singleton / two-operand cases of the +variadic `Expr` operators, plus the empty / singleton cases of `coalesce`. +Absorption by a literal `FALSE` (resp. `TRUE`) operand is not lifted to the `Expr` level here yet. +-/ + +namespace Mz + +/-! ## eval reduction lemmas -/ + +theorem eval_andN (env : Env) (args : List Expr) : + eval env (.andN args) = evalAndN (args.map (eval env)) := by + simp only [eval] + +theorem eval_orN (env : Env) (args : List Expr) : + eval env (.orN args) = evalOrN (args.map (eval env)) := by + simp only [eval] + +theorem eval_coalesce (env : Env) (args : List Expr) : + eval env (.coalesce args) = evalCoalesce (args.map (eval env)) := by + simp only [eval] + +/-! ## Identity, singleton, and binary equivalence at the `Expr` level + +`andN []`, `orN []`, and `coalesce []` are constants. `andN [a]` and +`orN [a]` reduce to `eval env a`.
`andN [a, b]` agrees with `and a b`, +and similarly for `or`. These transport the corresponding `Datum`- +level laws in `Mz/Variadic.lean` through `eval`. -/ + +theorem eval_andN_nil (env : Env) : + eval env (.andN []) = .bool true := by + rw [eval_andN]; rfl + +theorem eval_orN_nil (env : Env) : + eval env (.orN []) = .bool false := by + rw [eval_orN]; rfl + +theorem eval_andN_singleton (env : Env) (a : Expr) : + eval env (.andN [a]) = eval env a := by + rw [eval_andN] + show evalAndN [eval env a] = eval env a + exact evalAndN_singleton (eval env a) + +theorem eval_orN_singleton (env : Env) (a : Expr) : + eval env (.orN [a]) = eval env a := by + rw [eval_orN] + show evalOrN [eval env a] = eval env a + exact evalOrN_singleton (eval env a) + +theorem eval_andN_binary (env : Env) (a b : Expr) : + eval env (.andN [a, b]) = eval env (.and a b) := by + rw [eval_andN] + show evalAndN [eval env a, eval env b] = eval env (.and a b) + rw [evalAndN_binary] + -- Goal: evalAnd (eval env a) (eval env b) = eval env (.and a b) + simp only [eval] + +theorem eval_orN_binary (env : Env) (a b : Expr) : + eval env (.orN [a, b]) = eval env (.or a b) := by + rw [eval_orN] + show evalOrN [eval env a, eval env b] = eval env (.or a b) + rw [evalOrN_binary] + simp only [eval] + +/-! ## Coalesce base cases at the `Expr` level -/ + +theorem eval_coalesce_nil (env : Env) : + eval env (.coalesce []) = .null := by + rw [eval_coalesce]; rfl + +theorem eval_coalesce_singleton (env : Env) (a : Expr) : + eval env (.coalesce [a]) = eval env a := by + rw [eval_coalesce] + -- `args.map (eval env) = [eval env a]`; result depends on `eval env a`'s shape. + show evalCoalesce [eval env a] = eval env a + -- Case analysis on the underlying datum. 
+ cases h : eval env a with + | bool b => rw [show evalCoalesce [Datum.bool b] = Datum.bool b from rfl] + | null => rw [show evalCoalesce [Datum.null] = Datum.null from rfl] + | err e => rw [show evalCoalesce [Datum.err e] = Datum.err e from rfl] + +end Mz diff --git a/doc/developer/semantics/Mz/Laws.lean b/doc/developer/semantics/Mz/Laws.lean index d42f03b5a00ce..c97e31e159bd5 100644 --- a/doc/developer/semantics/Mz/Laws.lean +++ b/doc/developer/semantics/Mz/Laws.lean @@ -129,8 +129,9 @@ theorem eval_and_comm_of_no_might_error (ha : ¬(a.might_error = true)) (hb : ¬(b.might_error = true)) (hEnv : env.ErrFree) : eval env (.and a b) = eval env (.and b a) := by - have hae := might_error_sound ha hEnv - have hbe := might_error_sound hb hEnv + have hae := might_error_sound a env ha hEnv + have hbe := might_error_sound b env hb hEnv + simp only [eval] exact evalAnd_comm_of_no_err hae hbe theorem eval_or_comm_of_no_might_error @@ -138,8 +139,9 @@ theorem eval_or_comm_of_no_might_error (ha : ¬(a.might_error = true)) (hb : ¬(b.might_error = true)) (hEnv : env.ErrFree) : eval env (.or a b) = eval env (.or b a) := by - have hae := might_error_sound ha hEnv - have hbe := might_error_sound hb hEnv + have hae := might_error_sound a env ha hEnv + have hbe := might_error_sound b env hb hEnv + simp only [eval] exact evalOr_comm_of_no_err hae hbe end Mz diff --git a/doc/developer/semantics/Mz/MightError.lean b/doc/developer/semantics/Mz/MightError.lean index cba31df923dd4..21315753054b2 100644 --- a/doc/developer/semantics/Mz/MightError.lean +++ b/doc/developer/semantics/Mz/MightError.lean @@ -78,7 +78,13 @@ theorem evalIfThen_not_err /-- Returns `true` when `e` might evaluate to an `err`. The current implementation is purely structural and conservative: any literal `err` taints every ancestor. Columns are assumed not to contain errors -(see `Env.ErrFree`). -/ +(see `Env.ErrFree`). 
+ +For the list-carrying constructors (`andN`, `orN`, `coalesce`) the +skeleton is maximally conservative — they always taint. A future +refinement would recurse into `args` and (for `coalesce`) reason +about the rescue rule, but the present version keeps the soundness +proof trivial for those cases. -/ def Expr.might_error : Expr → Bool | .lit (.err _) => true | .lit _ => false @@ -87,6 +93,9 @@ def Expr.might_error : Expr → Bool | .or a b => a.might_error || b.might_error | .not a => a.might_error | .ifThen c t e => c.might_error || t.might_error || e.might_error + | .andN _ => true + | .orN _ => true + | .coalesce _ => true /-! ## Error-free environments -/ @@ -119,58 +128,66 @@ theorem Env.get_not_err {env : Env} (hErr : env.ErrFree) (i : Nat) : /-- If `might_error e` is `false` and the environment carries no errors, then `eval env e` is not an error. -The proof is structural induction on `e`. For each compound case the -hypothesis `¬e.might_error = true` decomposes into per-subexpression -hypotheses via `Bool` distribution. The matching helper lemma -(`evalAnd_not_err`, etc.) then concludes. -/ -theorem might_error_sound {e : Expr} : - ∀ {env : Env}, - ¬(e.might_error = true) → env.ErrFree → ¬(eval env e).IsErr := by - induction e with - | lit d => - intro _ hMe _ hRes - -- `eval env (lit d) = d`. If `d` were `err`, `might_error (lit d) = true`, - -- contradicting `hMe`. The other cases reduce `IsErr d` to `False`. +The proof is structural recursion on `e`. The `induction` tactic +cannot be used on `Expr` because it is a nested inductive type (the +`andN` / `orN` / `coalesce` constructors carry `List Expr`), so the +proof is written as a recursive `theorem` that pattern-matches the +constructor and recurses on subexpressions. For each compound case +the hypothesis `¬e.might_error = true` decomposes into +per-subexpression hypotheses via `Bool` distribution. The matching +helper lemma (`evalAnd_not_err`, etc.) then concludes. 
+ +The list-carrying constructors are handled vacuously: `might_error` +returns `true` on them unconditionally, so the soundness premise is +absurd. A future refinement will tighten `might_error` to inspect +operands and add real proofs for those cases. -/ +theorem might_error_sound : + ∀ (e : Expr) (env : Env), + ¬(e.might_error = true) → env.ErrFree → ¬(eval env e).IsErr + | .lit d, _, hMe, _ => by + intro hRes + simp only [eval] at hRes cases d with | bool _ => cases hRes | null => cases hRes | err _ => exact hMe rfl - | col i => - intro env _ hEnv - -- `eval env (col i) = Env.get env i`, error-freedom of the env carries. - show ¬(Env.get env i).IsErr - exact Env.get_not_err hEnv i - | and a b ih_a ih_b => - intro env hMe hEnv - have ha : ¬(a.might_error = true) := by - intro h; apply hMe; simp [Expr.might_error, h] - have hb : ¬(b.might_error = true) := by - intro h; apply hMe; simp [Expr.might_error, h] - show ¬(evalAnd (eval env a) (eval env b)).IsErr - exact evalAnd_not_err (ih_a ha hEnv) (ih_b hb hEnv) - | or a b ih_a ih_b => - intro env hMe hEnv - have ha : ¬(a.might_error = true) := by - intro h; apply hMe; simp [Expr.might_error, h] - have hb : ¬(b.might_error = true) := by - intro h; apply hMe; simp [Expr.might_error, h] - show ¬(evalOr (eval env a) (eval env b)).IsErr - exact evalOr_not_err (ih_a ha hEnv) (ih_b hb hEnv) - | not a ih_a => - intro env hMe hEnv - have ha : ¬(a.might_error = true) := by - intro h; apply hMe; simp [Expr.might_error, h] - show ¬(evalNot (eval env a)).IsErr - exact evalNot_not_err (ih_a ha hEnv) - | ifThen c t e ih_c ih_t ih_e => - intro env hMe hEnv - have hc : ¬(c.might_error = true) := by - intro h; apply hMe; simp [Expr.might_error, h] - have ht : ¬(t.might_error = true) := by - intro h; apply hMe; simp [Expr.might_error, h] - have he : ¬(e.might_error = true) := by - intro h; apply hMe; simp [Expr.might_error, h] - show ¬(evalIfThen (eval env c) (eval env t) (eval env e)).IsErr - exact evalIfThen_not_err (ih_c hc hEnv) 
(ih_t ht hEnv) (ih_e he hEnv) + | .col i, env, _, hEnv => by + intro hRes + simp only [eval] at hRes + exact Env.get_not_err hEnv i hRes + | .and a b, env, hMe, hEnv => by + intro hRes + simp only [eval] at hRes + have ha : ¬(a.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) + have hb : ¬(b.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) + exact evalAnd_not_err + (might_error_sound a env ha hEnv) + (might_error_sound b env hb hEnv) hRes + | .or a b, env, hMe, hEnv => by + intro hRes + simp only [eval] at hRes + have ha : ¬(a.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) + have hb : ¬(b.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) + exact evalOr_not_err + (might_error_sound a env ha hEnv) + (might_error_sound b env hb hEnv) hRes + | .not a, env, hMe, hEnv => by + intro hRes + simp only [eval] at hRes + have ha : ¬(a.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) + exact evalNot_not_err (might_error_sound a env ha hEnv) hRes + | .ifThen c t e, env, hMe, hEnv => by + intro hRes + simp only [eval] at hRes + have hc : ¬(c.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) + have ht : ¬(t.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) + have he : ¬(e.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) + exact evalIfThen_not_err + (might_error_sound c env hc hEnv) + (might_error_sound t env ht hEnv) + (might_error_sound e env he hEnv) hRes + | .andN _, _, hMe, _ => by intro _; exact hMe rfl + | .orN _, _, hMe, _ => by intro _; exact hMe rfl + | .coalesce _, _, hMe, _ => by intro _; exact hMe rfl end Mz diff --git a/doc/developer/semantics/Mz/PrimEval.lean b/doc/developer/semantics/Mz/PrimEval.lean new file mode 100644 index 0000000000000..12d7dcf43b0ba --- /dev/null +++ b/doc/developer/semantics/Mz/PrimEval.lean @@ -0,0 +1,121 @@ +import Mz.Datum + +/-! 
+# Primitive scalar evaluators + +This file collects every evaluator that operates on `Datum` (or +`List Datum`) without reference to `Expr`. The split keeps the +defining equations available to both the algebraic-law files +(`Boolean.lean`, `Laws.lean`, `Strict.lean`, etc.) and the +expression-level evaluator (`Eval.lean`) without circular imports. + +The primitives split into three groups: + +* **Binary boolean and ternary if-then**: `evalAnd`, `evalOr`, + `evalNot`, `evalIfThen`. Match the runtime in + `src/expr/src/scalar/func/variadic.rs`. +* **Environment**: `Env`, `Env.get`. Indexed lookups for `Expr.col`. +* **Variadic primitives**: `evalAndN`, `evalOrN`, `evalCoalesce`. + Used directly by `Expr.andN`, `Expr.orN`, `Expr.coalesce` + evaluation in `Eval.lean`. +-/ + +namespace Mz + +/-! ## Binary and ternary boolean evaluators -/ + +/-- AND evaluation table. Pattern order encodes the absorption +hierarchy `FALSE > ERROR > NULL > TRUE`. -/ +def evalAnd : Datum → Datum → Datum + | .bool false, _ => .bool false + | _, .bool false => .bool false + | .err e, _ => .err e + | _, .err e => .err e + | .null, _ => .null + | _, .null => .null + | .bool true, .bool true => .bool true + +/-- OR evaluation table. Mirror of `evalAnd` with `TRUE` as the +dominant absorber: `TRUE > ERROR > NULL > FALSE`. -/ +def evalOr : Datum → Datum → Datum + | .bool true, _ => .bool true + | _, .bool true => .bool true + | .err e, _ => .err e + | _, .err e => .err e + | .null, _ => .null + | _, .null => .null + | .bool false, .bool false => .bool false + +/-- NOT evaluation table. Strict on `null` and `err`. -/ +def evalNot : Datum → Datum + | .bool b => .bool (!b) + | .null => .null + | .err e => .err e + +/-- `IfThen` evaluation table. Modeled strictly; see `Mz/Eval.lean` +for the discussion of lazy vs strict in a total skeleton. 
-/ +def evalIfThen : Datum → Datum → Datum → Datum + | .bool true, dt, _ => dt + | .bool false, _, de => de + | .null, _, _ => .null + | .err e, _, _ => .err e + +/-! ## Environment -/ + +/-- Environment: a positional list of bindings for `Expr.col`. -/ +abbrev Env := List Datum + +/-- Reading an out-of-bounds column yields `NULL`. Defined by +primitive recursion to keep inductive proofs simple. -/ +def Env.get : Env → Nat → Datum + | [], _ => .null + | d :: _, 0 => d + | _ :: rest, n + 1 => Env.get rest n + +/-! ## Variadic primitives + +Variadic `AND`, `OR`, and `COALESCE` evaluators over `List Datum`. +The fold-style definitions are exposed here so `eval` in +`Mz/Eval.lean` can refer to them by name without a forward +dependency. The theorems for these evaluators live in +`Mz/Variadic.lean` and `Mz/Coalesce.lean`. -/ + +/-- Right-fold variadic AND. Seed value `TRUE` is the identity for +`evalAnd`, giving the cons recurrence by `rfl`. -/ +def evalAndN : List Datum → Datum + | [] => .bool true + | d :: rest => evalAnd d (evalAndN rest) + +/-- Right-fold variadic OR. Dual of `evalAndN`. -/ +def evalOrN : List Datum → Datum + | [] => .bool false + | d :: rest => evalOr d (evalOrN rest) + +/-! ### Coalesce state machine + +`Coalesce.go` carries the `seenNull` sticky bit and the earliest +`err` payload while walking operands. The first concrete value +(`.bool _`) short-circuits the walk. -/ + +private def Coalesce.go (seenNull : Bool) (firstErr : Option EvalError) : + List Datum → Datum + | [] => + if seenNull then .null + else + match firstErr with + | some e => .err e + | none => .null + | .bool b :: _ => .bool b + | .null :: rest => Coalesce.go true firstErr rest + | .err e :: rest => + match firstErr with + | some _ => Coalesce.go seenNull firstErr rest + | none => Coalesce.go seenNull (some e) rest + +/-- `coalesce` returns the first concrete operand, with a +`null`-beats-`err` tiebreak when none exists. See `Mz/Coalesce.lean` +for the laws. 
-/ +def evalCoalesce : List Datum → Datum := + Coalesce.go false none + +end Mz diff --git a/doc/developer/semantics/Mz/Strict.lean b/doc/developer/semantics/Mz/Strict.lean index 85820b3dabe6e..4f66d5f863b83 100644 --- a/doc/developer/semantics/Mz/Strict.lean +++ b/doc/developer/semantics/Mz/Strict.lean @@ -1,4 +1,4 @@ -import Mz.Eval +import Mz.PrimEval /-! # Strict propagation diff --git a/doc/developer/semantics/Mz/Variadic.lean b/doc/developer/semantics/Mz/Variadic.lean index a226b8a3e0ee6..3d5c7946c6d69 100644 --- a/doc/developer/semantics/Mz/Variadic.lean +++ b/doc/developer/semantics/Mz/Variadic.lean @@ -1,13 +1,14 @@ -import Mz.Eval +import Mz.PrimEval import Mz.Laws /-! # Variadic `AND` and `OR` `MirScalarExpr::VariadicFunc::And` and `Or` take an arbitrary number -of operands. This file defines their semantic counterparts over -`List Datum` and shows that the binary `evalAnd` / `evalOr` are the -two-operand specializations of the variadic forms. +of operands. This file proves laws about the corresponding +`List Datum → Datum` evaluators (`evalAndN`, `evalOrN`) defined in +`Mz/PrimEval.lean`, and shows that the binary `evalAnd` / `evalOr` +are the two-operand specializations of the variadic forms. The variadic forms are defined by right-fold so the cons recurrence holds by `rfl`. With a left-fold the recurrence would require a @@ -27,22 +28,6 @@ this skeleton are limited to producing `err`s). namespace Mz -/-! ## Definitions -/ - -/-- Right-fold evaluator for variadic `AND`. - -The seed value is `TRUE`, the identity for `evalAnd`. The right-fold -gives `evalAndN (d :: ds) = evalAnd d (evalAndN ds)` by definition, -which makes inductive proofs trivial. -/ -def evalAndN : List Datum → Datum - | [] => .bool true - | d :: rest => evalAnd d (evalAndN rest) - -/-- Right-fold evaluator for variadic `OR`. Dual of `evalAndN`. -/ -def evalOrN : List Datum → Datum - | [] => .bool false - | d :: rest => evalOr d (evalOrN rest) - /-! 
## Cons recurrence -/ theorem evalAndN_cons (d : Datum) (ds : List Datum) : diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 87cb8a9515d24..20857fb584e46 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -9,14 +9,16 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four ## What is here * `Mz/Datum.lean`: `Datum`, `EvalError`, and the `Datum.IsErr` predicate. -* `Mz/Expr.lean`: a minimal `Expr` inductive (literals, columns, binary `and`/`or`, `not`, `ifThen`). -* `Mz/Eval.lean`: `evalAnd`, `evalOr`, `evalNot`, `evalIfThen`, and `eval` matching the runtime in `src/expr/src/scalar/func/variadic.rs`. `Env.get` is defined by primitive recursion to keep inductive proofs simple. +* `Mz/Expr.lean`: `Expr` inductive — literals, columns, binary `and`/`or`, `not`, `ifThen`, plus the list-carrying constructors `andN`, `orN`, and `coalesce`. +* `Mz/PrimEval.lean`: primitive evaluators on `Datum` and `List Datum` — `evalAnd`, `evalOr`, `evalNot`, `evalIfThen`, `Env`, `Env.get`, `evalAndN`, `evalOrN`, `evalCoalesce`. Split out so the algebraic-law files and the expression-level evaluator can both import them without circular dependencies. +* `Mz/Eval.lean`: the big-step `eval : Env → Expr → Datum`. List-carrying constructors evaluate each operand and hand the result list to the matching primitive. * `Mz/Boolean.lean`: per-cell truth-table proofs for `AND`, `OR`, and `NOT`, plus involutivity of `NOT`. -* `Mz/MightError.lean`: the conservative `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem that the optimizer needs in order to trust the analyzer's verdict. +* `Mz/MightError.lean`: the conservative `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem. 
The list-carrying constructors are currently tainted unconditionally; soundness extends trivially for them and tightens in a future iteration. * `Mz/Strict.lean`: strictness predicates (`ErrStrictUnary`, `ErrStrictBinary`, `NullStrictUnary`), positive instances for `evalNot` and the condition slot of `evalIfThen`, closure under composition, and negative results witnessing that `AND` and `OR` are *not* err-strict in either position. -* `Mz/Coalesce.lean`: the proposed `coalesce` evaluator and the error-rescue laws. A later non-error, non-null operand rescues an earlier `err` exactly as it rescues a `null`. The `null`-beats-`err` tiebreak preserves the all-`null` behavior PostgreSQL users expect. +* `Mz/Coalesce.lean`: laws for `evalCoalesce` — error-rescue, null-beats-err tiebreak, first-error stickiness. * `Mz/Laws.lean`: algebraic laws — two-sided identity (`TRUE` for `AND`, `FALSE` for `OR`), idempotence (unconditional), commutativity (conditional on error-freedom of operands), and `Expr`-level reorder safety as a corollary of soundness. -* `Mz/Variadic.lean`: variadic `evalAndN` and `evalOrN` over `List Datum` as right-folds, the binary-equivalence theorems that bridge them to `evalAnd`/`evalOr`, and the `FALSE`/`TRUE` absorption theorems that justify short-circuit evaluation. +* `Mz/Variadic.lean`: laws for `evalAndN` and `evalOrN` over `List Datum` — cons recurrence, nil, singleton, binary equivalence with the binary evaluators, and `FALSE`/`TRUE` absorption. +* `Mz/ExprVariadic.lean`: `Expr`-level reduction lemmas connecting `eval env (.andN args)` / `.orN` / `.coalesce` to their primitive counterparts, plus identity / singleton / binary-equivalence corollaries lifted through `eval`. ## What is not here @@ -62,7 +64,8 @@ Reviewers should expect both sides of the change in the same PR. The roadmap in priority order: -* Wire `evalAndN`, `evalOrN`, and `evalCoalesce` into `Expr` as `.andN`, `.orN`, and `.coalesce` constructors taking `List Expr`. 
Termination for the eval clause is the shared engineering effort — once one of these lands, the others follow trivially. +* Tighten `Expr.might_error` for the list-carrying constructors. The current placeholder always taints `andN` / `orN` / `coalesce`, which makes the soundness theorem vacuous for those constructors. Real analysis is a list-induction over operands. +* Variadic absorption at the `Expr` level: `FALSE ∈ args → eval env (.andN args) = .bool false` for a closed term. Mirrors `evalAndN_false_absorbs`, lifted through `eval`. * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. From 280f400498ea91398e4adf607ed806456627f8c5 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Sun, 17 May 2026 23:20:55 +0200 Subject: [PATCH 008/127] doc/semantics: tighten might_error for andN/orN with structural soundness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refine `Expr.might_error` for the `andN` and `orN` list-carrying constructors so the analyzer is no longer maximally conservative for them. The analyzer now recurses into the operand list via `Expr.argsMightError`, declared mutually with `Expr.might_error` so Lean's structural-recursion checker accepts both sides without an explicit termination measure. `Expr.argsMightError_of_mem` is the membership-driven introduction form needed by the soundness proof — it turns "some operand might error" into the bool fold. Its contrapositive is what extracts a per-operand non-erroring hypothesis from the analyzer's verdict on the whole list. `might_error_sound` for the `.andN` and `.orN` constructors now performs real work: * Reduce `eval env (.andN args)` to `evalAndN (args.map (eval env))` via the existing `simp only [eval]` pattern. 
* Apply the list-level `evalAndN_not_err` (also added in this commit) with a per-operand non-erroring hypothesis. * Extract that per-operand hypothesis by destructuring `List.mem_map` and applying `Expr.argsMightError_of_mem` contrapositively against `hMe`. * Recurse via `might_error_sound` on the individual operand. `.coalesce` is still tainted unconditionally — the rescue analysis is a separate follow-up since the helper `evalCoalesce_not_err` needs a different list induction (state-machine over `Coalesce.go`). `Expr.might_error` and `Expr.argsMightError` are now mutually recursive, which forces well-founded compilation. Existing `simp only [Expr.might_error]` patterns in the proof are unaffected because they target equation lemmas, not definitional reduction. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/MightError.lean | 125 ++++++++++++++++----- doc/developer/semantics/README.md | 4 +- 2 files changed, 101 insertions(+), 28 deletions(-) diff --git a/doc/developer/semantics/Mz/MightError.lean b/doc/developer/semantics/Mz/MightError.lean index 21315753054b2..96b9ebe3ff491 100644 --- a/doc/developer/semantics/Mz/MightError.lean +++ b/doc/developer/semantics/Mz/MightError.lean @@ -73,18 +73,46 @@ theorem evalIfThen_not_err simp only [evalIfThen]; decide | err _ => exact (hc trivial).elim -/-! ## Static analyzer -/ +/-- List-level analogue of `evalAnd_not_err`: if every operand is +error-free, the variadic AND is error-free. -/ +theorem evalAndN_not_err : + ∀ {ds : List Datum}, (∀ d ∈ ds, ¬d.IsErr) → ¬(evalAndN ds).IsErr + | [], _ => by intro hRes; cases hRes + | hd :: tl, h => by + show ¬(evalAnd hd (evalAndN tl)).IsErr + apply evalAnd_not_err + · exact h hd (List.Mem.head tl) + · exact evalAndN_not_err (fun d hd_mem => h d (List.Mem.tail hd hd_mem)) -/-- Returns `true` when `e` might evaluate to an `err`. The current +/-- Dual: every error-free operand makes the variadic OR error-free. 
-/ +theorem evalOrN_not_err : + ∀ {ds : List Datum}, (∀ d ∈ ds, ¬d.IsErr) → ¬(evalOrN ds).IsErr + | [], _ => by intro hRes; cases hRes + | hd :: tl, h => by + show ¬(evalOr hd (evalOrN tl)).IsErr + apply evalOr_not_err + · exact h hd (List.Mem.head tl) + · exact evalOrN_not_err (fun d hd_mem => h d (List.Mem.tail hd hd_mem)) + +/-! ## Static analyzer + +Returns `true` when `e` might evaluate to an `err`. The current implementation is purely structural and conservative: any literal `err` taints every ancestor. Columns are assumed not to contain errors (see `Env.ErrFree`). -For the list-carrying constructors (`andN`, `orN`, `coalesce`) the -skeleton is maximally conservative — they always taint. A future -refinement would recurse into `args` and (for `coalesce`) reason -about the rescue rule, but the present version keeps the soundness -proof trivial for those cases. -/ +For `andN` and `orN`, the analyzer recurses into the operand list via +`Expr.argsMightError` and returns `true` if any operand might error. +The mutual recursion across `Expr.might_error` and +`Expr.argsMightError` keeps Lean's structural-recursion checker +satisfied without an explicit termination measure. + +`coalesce` is still tainted unconditionally. A precise analyzer +would reason about the rescue rule (`coalesce(err, x) = x` when `x` +is concrete), which requires tracking which operands are statically +*safe* rather than merely *not erroring*. Tightening it is a separate +follow-up. 
-/ +mutual def Expr.might_error : Expr → Bool | .lit (.err _) => true | .lit _ => false @@ -93,10 +121,32 @@ def Expr.might_error : Expr → Bool | .or a b => a.might_error || b.might_error | .not a => a.might_error | .ifThen c t e => c.might_error || t.might_error || e.might_error - | .andN _ => true - | .orN _ => true + | .andN args => Expr.argsMightError args + | .orN args => Expr.argsMightError args | .coalesce _ => true +def Expr.argsMightError : List Expr → Bool + | [] => false + | e :: rest => e.might_error || Expr.argsMightError rest +end + +/-- Membership-driven introduction for `argsMightError`. If some +operand in the list might error, the whole list folds to `true`. The +contrapositive is what the soundness proof uses to extract a +per-operand non-erroring hypothesis from the analyzer's verdict on +`andN` / `orN`. -/ +theorem Expr.argsMightError_of_mem + {args : List Expr} {e : Expr} + (h_mem : e ∈ args) (h_err : e.might_error = true) : + Expr.argsMightError args = true := by + induction args with + | nil => cases h_mem + | cons hd tl ih => + show (hd.might_error || Expr.argsMightError tl) = true + cases h_mem with + | head _ => simp [h_err] + | tail _ h' => simp [ih h'] + /-! ## Error-free environments -/ /-- An environment is error-free when every bound value is not an `err`. -/ @@ -128,19 +178,19 @@ theorem Env.get_not_err {env : Env} (hErr : env.ErrFree) (i : Nat) : /-- If `might_error e` is `false` and the environment carries no errors, then `eval env e` is not an error. -The proof is structural recursion on `e`. The `induction` tactic -cannot be used on `Expr` because it is a nested inductive type (the -`andN` / `orN` / `coalesce` constructors carry `List Expr`), so the -proof is written as a recursive `theorem` that pattern-matches the -constructor and recurses on subexpressions. For each compound case -the hypothesis `¬e.might_error = true` decomposes into -per-subexpression hypotheses via `Bool` distribution. 
The matching -helper lemma (`evalAnd_not_err`, etc.) then concludes. - -The list-carrying constructors are handled vacuously: `might_error` -returns `true` on them unconditionally, so the soundness premise is -absurd. A future refinement will tighten `might_error` to inspect -operands and add real proofs for those cases. -/ +The proof is structural recursion on `e`. `induction` cannot be used +on `Expr` because it is a nested inductive type, so the proof is +written as a recursive `theorem` that pattern-matches the constructor +and recurses on subexpressions. For each compound case the hypothesis +`¬e.might_error = true` decomposes into per-subexpression hypotheses, +and the matching helper lemma (`evalAnd_not_err` etc.) concludes. + +The `andN` / `orN` cases extract a per-operand non-erroring witness +through `Expr.argsMightError_of_mem` and then recurse via +`might_error_sound` on the individual operand. The `coalesce` case +is currently vacuous — `might_error` always returns `true` for +`.coalesce`, so the soundness premise is absurd. A future refinement +will tighten that case alongside the analyzer. 
-/ theorem might_error_sound : ∀ (e : Expr) (env : Env), ¬(e.might_error = true) → env.ErrFree → ¬(eval env e).IsErr @@ -150,7 +200,9 @@ theorem might_error_sound : cases d with | bool _ => cases hRes | null => cases hRes - | err _ => exact hMe rfl + | err _ => + apply hMe + simp only [Expr.might_error] | .col i, env, _, hEnv => by intro hRes simp only [eval] at hRes @@ -186,8 +238,29 @@ theorem might_error_sound : (might_error_sound c env hc hEnv) (might_error_sound t env ht hEnv) (might_error_sound e env he hEnv) hRes - | .andN _, _, hMe, _ => by intro _; exact hMe rfl - | .orN _, _, hMe, _ => by intro _; exact hMe rfl - | .coalesce _, _, hMe, _ => by intro _; exact hMe rfl + | .andN args, env, hMe, hEnv => by + intro hRes + simp only [eval] at hRes + apply evalAndN_not_err (ds := args.map (eval env)) ?_ hRes + intro d hd_mem + obtain ⟨e, e_mem, h_eq⟩ := List.mem_map.mp hd_mem + subst h_eq + have he : ¬(e.might_error = true) := fun h => hMe + (Expr.argsMightError_of_mem e_mem h) + exact might_error_sound e env he hEnv + | .orN args, env, hMe, hEnv => by + intro hRes + simp only [eval] at hRes + apply evalOrN_not_err (ds := args.map (eval env)) ?_ hRes + intro d hd_mem + obtain ⟨e, e_mem, h_eq⟩ := List.mem_map.mp hd_mem + subst h_eq + have he : ¬(e.might_error = true) := fun h => hMe + (Expr.argsMightError_of_mem e_mem h) + exact might_error_sound e env he hEnv + | .coalesce _, _, hMe, _ => by + intro _ + apply hMe + simp only [Expr.might_error] end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 20857fb584e46..5e4bc89aaef28 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -13,7 +13,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/PrimEval.lean`: primitive evaluators on `Datum` and `List Datum` — `evalAnd`, `evalOr`, `evalNot`, `evalIfThen`, `Env`, `Env.get`, `evalAndN`, `evalOrN`, `evalCoalesce`. 
Split out so the algebraic-law files and the expression-level evaluator can both import them without circular dependencies. * `Mz/Eval.lean`: the big-step `eval : Env → Expr → Datum`. List-carrying constructors evaluate each operand and hand the result list to the matching primitive. * `Mz/Boolean.lean`: per-cell truth-table proofs for `AND`, `OR`, and `NOT`, plus involutivity of `NOT`. -* `Mz/MightError.lean`: the conservative `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem. The list-carrying constructors are currently tainted unconditionally; soundness extends trivially for them and tightens in a future iteration. +* `Mz/MightError.lean`: the `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem. `andN` and `orN` recurse into their operand list via the mutually-recursive `Expr.argsMightError`; soundness for those cases extracts a per-operand non-erroring witness through `Expr.argsMightError_of_mem` and recurses. `coalesce` is still tainted unconditionally and soundness for it is vacuous. * `Mz/Strict.lean`: strictness predicates (`ErrStrictUnary`, `ErrStrictBinary`, `NullStrictUnary`), positive instances for `evalNot` and the condition slot of `evalIfThen`, closure under composition, and negative results witnessing that `AND` and `OR` are *not* err-strict in either position. * `Mz/Coalesce.lean`: laws for `evalCoalesce` — error-rescue, null-beats-err tiebreak, first-error stickiness. * `Mz/Laws.lean`: algebraic laws — two-sided identity (`TRUE` for `AND`, `FALSE` for `OR`), idempotence (unconditional), commutativity (conditional on error-freedom of operands), and `Expr`-level reorder safety as a corollary of soundness. @@ -64,7 +64,7 @@ Reviewers should expect both sides of the change in the same PR. The roadmap in priority order: -* Tighten `Expr.might_error` for the list-carrying constructors. 
The current placeholder always taints `andN` / `orN` / `coalesce`, which makes the soundness theorem vacuous for those constructors. Real analysis is a list-induction over operands. +* Tighten `Expr.might_error` for `.coalesce`. The current placeholder taints unconditionally; a precise analyzer would reason about the rescue rule (one statically-safe operand makes the whole coalesce safe). Requires a list-induction analogous to `Expr.argsMightError_of_mem` plus an `evalCoalesce_not_err` helper. * Variadic absorption at the `Expr` level: `FALSE ∈ args → eval env (.andN args) = .bool false` for a closed term. Mirrors `evalAndN_false_absorbs`, lifted through `eval`. * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. From 39eda4230f5bbd3f6f37c58f8baca31b955f5e3d Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 07:22:04 +0200 Subject: [PATCH 009/127] doc/semantics: tighten might_error for coalesce via argsAllMightError Refine `Expr.might_error` for the `.coalesce` constructor so the analyzer captures the rescue rule: `coalesce` might error only when every operand might error (and the operand list is non-empty). A single statically-safe operand makes the whole expression safe. Analyzer changes (mutual block): * `Expr.argsAllMightError`: companion fold to `argsMightError`, empty-list base `true`, cons case `e.might_error && rest...`. * `.coalesce []` returns `false` (empty coalesce is `.null`, never errors). * `.coalesce (a :: rest)` returns `a.might_error && argsAllMightError rest`, which is equal by definition to `argsAllMightError (a :: rest)`. Soundness chain for the `.coalesce` case: 1. `Expr.exists_safe_of_not_argsAllMightError` extracts a statically-safe operand from the negation of the analyzer. 2. 
`might_error_sound` recurses on that operand and produces a not-`IsErr` witness on its evaluated value. 3. `Coalesce.go_not_err` (new) is the state-machine lemma: any `Coalesce.go` invocation whose remaining list contains at least one non-erroring datum (or whose `seenNull` is already set) does not return an error. Proof is structural recursion on the list, plumbing the safety witness through the `.null` and `.err` arms. 4. `evalCoalesce_not_err_of_some_safe` packages the above into the surface lemma soundness invokes. `Coalesce.go` is made non-private in `Mz/PrimEval.lean` so the state-machine lemma can reference it. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/MightError.lean | 157 +++++++++++++++++++-- doc/developer/semantics/Mz/PrimEval.lean | 2 +- doc/developer/semantics/README.md | 3 +- 3 files changed, 145 insertions(+), 17 deletions(-) diff --git a/doc/developer/semantics/Mz/MightError.lean b/doc/developer/semantics/Mz/MightError.lean index 96b9ebe3ff491..a1052c700a23e 100644 --- a/doc/developer/semantics/Mz/MightError.lean +++ b/doc/developer/semantics/Mz/MightError.lean @@ -114,20 +114,33 @@ is concrete), which requires tracking which operands are statically follow-up. 
-/ mutual def Expr.might_error : Expr → Bool - | .lit (.err _) => true - | .lit _ => false - | .col _ => false - | .and a b => a.might_error || b.might_error - | .or a b => a.might_error || b.might_error - | .not a => a.might_error - | .ifThen c t e => c.might_error || t.might_error || e.might_error - | .andN args => Expr.argsMightError args - | .orN args => Expr.argsMightError args - | .coalesce _ => true + | .lit (.err _) => true + | .lit _ => false + | .col _ => false + | .and a b => a.might_error || b.might_error + | .or a b => a.might_error || b.might_error + | .not a => a.might_error + | .ifThen c t e => c.might_error || t.might_error || e.might_error + | .andN args => Expr.argsMightError args + | .orN args => Expr.argsMightError args + | .coalesce [] => false + | .coalesce (a :: rest) => a.might_error && Expr.argsAllMightError rest +/-- Bool fold of `might_error` over a list of operands ("does any +operand might-error"), declared mutually with `might_error` so +structural recursion accepts both sides. -/ def Expr.argsMightError : List Expr → Bool | [] => false | e :: rest => e.might_error || Expr.argsMightError rest + +/-- Companion fold for `coalesce`: "do all operands might-error". +The empty-list base case is `true` so that the cons case +`a.might_error && argsAllMightError rest` gives the right answer +for any non-empty list. The pattern match in `Expr.might_error`'s +`.coalesce` arms handles the empty case separately. -/ +def Expr.argsAllMightError : List Expr → Bool + | [] => true + | e :: rest => e.might_error && Expr.argsAllMightError rest end /-- Membership-driven introduction for `argsMightError`. If some @@ -147,6 +160,100 @@ theorem Expr.argsMightError_of_mem | head _ => simp [h_err] | tail _ h' => simp [ih h'] +/-- If `argsAllMightError args` is not `true`, there is at least one +operand whose `might_error` is not `true`. This is the +"some safe operand exists" extraction used by the `coalesce` case of +soundness. 
-/ +theorem Expr.exists_safe_of_not_argsAllMightError + {args : List Expr} (h : ¬(Expr.argsAllMightError args = true)) : + ∃ e ∈ args, ¬(e.might_error = true) := by + induction args with + | nil => + -- argsAllMightError [] = true, contradicts h + exact (h rfl).elim + | cons hd tl ih => + -- argsAllMightError (hd :: tl) = hd.might_error && argsAllMightError tl + by_cases hd_me : hd.might_error = true + · -- hd is not safe; the safe one must be in tl. + have htl : ¬(Expr.argsAllMightError tl = true) := by + intro h_tl + apply h + show (hd.might_error && Expr.argsAllMightError tl) = true + simp [hd_me, h_tl] + obtain ⟨e, e_mem, he⟩ := ih htl + exact ⟨e, List.Mem.tail hd e_mem, he⟩ + · -- hd is the safe operand. + exact ⟨hd, List.Mem.head tl, hd_me⟩ + +/-! ## Coalesce safety + +Once `Coalesce.go` is invoked with `seenNull = true`, the result is +never an error: the empty-list base returns `.null`, and every cons +case either short-circuits to a `.bool b`, recurses with +`seenNull = true` unchanged, or updates `firstErr` without ever +flipping `seenNull` back to `false`. + +The combined lemma `Coalesce.go_not_err` strengthens that +observation: if the starting state has either `seenNull = true` or +at least one not-err element in the remaining list, the result is +not an error. -/ + +theorem Coalesce.go_not_err : + ∀ (seenNull : Bool) (firstErr : Option EvalError) (ds : List Datum), + seenNull = true ∨ (∃ d ∈ ds, ¬d.IsErr) → + ¬(Coalesce.go seenNull firstErr ds).IsErr + | true, _, [], _ => by + intro hRes + -- Coalesce.go true _ [] = .null + show False + simp only [Coalesce.go, if_true] at hRes + cases hRes + | false, _, [], h => by + -- Empty + seenNull=false: only the disjunct ∃ d ∈ [] survives, which is False. 
+ cases h with + | inl h_true => cases h_true + | inr h_ex => + obtain ⟨_, hmem, _⟩ := h_ex + cases hmem + | _, _, .bool b :: _, _ => by + intro hRes + -- Coalesce.go _ _ (.bool b :: _) = .bool b + show False + simp only [Coalesce.go] at hRes + cases hRes + | _, firstErr, .null :: rest, _ => by + -- Recurse with seenNull=true. + show ¬(Coalesce.go true firstErr rest).IsErr + exact Coalesce.go_not_err true firstErr rest (Or.inl rfl) + | seenNull, firstErr, .err e :: rest, h => by + -- Push the witness from (.err e :: rest) into rest, since .err e cannot be the witness. + have h_rest : seenNull = true ∨ ∃ d ∈ rest, ¬d.IsErr := by + cases h with + | inl h_true => exact Or.inl h_true + | inr h_ex => + obtain ⟨d, hmem, hsafe⟩ := h_ex + cases hmem with + | head _ => exact (hsafe trivial).elim + | tail _ h_tl => exact Or.inr ⟨d, h_tl, hsafe⟩ + -- Two cases on firstErr; the recursion shape is the same modulo argument. + cases firstErr with + | some firstErr' => + show ¬(Coalesce.go seenNull (some firstErr') rest).IsErr + exact Coalesce.go_not_err seenNull (some firstErr') rest h_rest + | none => + show ¬(Coalesce.go seenNull (some e) rest).IsErr + exact Coalesce.go_not_err seenNull (some e) rest h_rest + +/-- The headline lemma: an `evalCoalesce` call cannot return an +error when at least one operand evaluates to something that is not +an error. The `null`-beats-`err` tiebreak inside `Coalesce.go` does +the work. -/ +theorem evalCoalesce_not_err_of_some_safe + {ds : List Datum} (h : ∃ d ∈ ds, ¬d.IsErr) : + ¬(evalCoalesce ds).IsErr := by + show ¬(Coalesce.go false none ds).IsErr + exact Coalesce.go_not_err false none ds (Or.inr h) + /-! ## Error-free environments -/ /-- An environment is error-free when every bound value is not an `err`. 
-/ @@ -258,9 +365,31 @@ theorem might_error_sound : have he : ¬(e.might_error = true) := fun h => hMe (Expr.argsMightError_of_mem e_mem h) exact might_error_sound e env he hEnv - | .coalesce _, _, hMe, _ => by - intro _ - apply hMe - simp only [Expr.might_error] + | .coalesce args, env, hMe, hEnv => by + intro hRes + simp only [eval] at hRes + -- Empty list: `evalCoalesce [] = .null`, immediately not an error. + match args, hMe, hRes with + | [], _, hRes' => + simp [evalCoalesce, Coalesce.go] at hRes' + cases hRes' + | a :: rest, hMe', hRes' => + -- Non-empty case. `might_error (.coalesce (a :: rest))` reduces to + -- `a.might_error && argsAllMightError rest`, which is exactly + -- `argsAllMightError (a :: rest)`. Negate and extract a safe operand. + have hAll : ¬(Expr.argsAllMightError (a :: rest) = true) := by + intro hAll + apply hMe' + show (a.might_error && Expr.argsAllMightError rest) = true + -- `argsAllMightError (a :: rest)` reduces by definition to the + -- conjunction we need. + exact hAll + obtain ⟨e, e_mem, he_safe⟩ := + Expr.exists_safe_of_not_argsAllMightError hAll + have he : ¬(eval env e).IsErr := might_error_sound e env he_safe hEnv + apply evalCoalesce_not_err_of_some_safe + (ds := (a :: rest).map (eval env)) + ?_ hRes' + exact ⟨eval env e, List.mem_map.mpr ⟨e, e_mem, rfl⟩, he⟩ end Mz diff --git a/doc/developer/semantics/Mz/PrimEval.lean b/doc/developer/semantics/Mz/PrimEval.lean index 12d7dcf43b0ba..b52a05cfd420f 100644 --- a/doc/developer/semantics/Mz/PrimEval.lean +++ b/doc/developer/semantics/Mz/PrimEval.lean @@ -97,7 +97,7 @@ def evalOrN : List Datum → Datum `err` payload while walking operands. The first concrete value (`.bool _`) short-circuits the walk. 
-/ -private def Coalesce.go (seenNull : Bool) (firstErr : Option EvalError) : +def Coalesce.go (seenNull : Bool) (firstErr : Option EvalError) : List Datum → Datum | [] => if seenNull then .null diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 5e4bc89aaef28..99f44c773fbdb 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -13,7 +13,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/PrimEval.lean`: primitive evaluators on `Datum` and `List Datum` — `evalAnd`, `evalOr`, `evalNot`, `evalIfThen`, `Env`, `Env.get`, `evalAndN`, `evalOrN`, `evalCoalesce`. Split out so the algebraic-law files and the expression-level evaluator can both import them without circular dependencies. * `Mz/Eval.lean`: the big-step `eval : Env → Expr → Datum`. List-carrying constructors evaluate each operand and hand the result list to the matching primitive. * `Mz/Boolean.lean`: per-cell truth-table proofs for `AND`, `OR`, and `NOT`, plus involutivity of `NOT`. -* `Mz/MightError.lean`: the `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem. `andN` and `orN` recurse into their operand list via the mutually-recursive `Expr.argsMightError`; soundness for those cases extracts a per-operand non-erroring witness through `Expr.argsMightError_of_mem` and recurses. `coalesce` is still tainted unconditionally and soundness for it is vacuous. +* `Mz/MightError.lean`: the `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem. `andN` and `orN` recurse via `Expr.argsMightError` ("any operand might error"); `coalesce` recurses via `Expr.argsAllMightError` ("every operand might error"), special-casing the empty list as safe. 
Soundness for `coalesce` extracts a statically-safe operand through `Expr.exists_safe_of_not_argsAllMightError` and applies `evalCoalesce_not_err_of_some_safe`, which in turn rests on `Coalesce.go_not_err` — the state-machine lemma that "once one safe operand is in the remaining list, the walk cannot return an error". * `Mz/Strict.lean`: strictness predicates (`ErrStrictUnary`, `ErrStrictBinary`, `NullStrictUnary`), positive instances for `evalNot` and the condition slot of `evalIfThen`, closure under composition, and negative results witnessing that `AND` and `OR` are *not* err-strict in either position. * `Mz/Coalesce.lean`: laws for `evalCoalesce` — error-rescue, null-beats-err tiebreak, first-error stickiness. * `Mz/Laws.lean`: algebraic laws — two-sided identity (`TRUE` for `AND`, `FALSE` for `OR`), idempotence (unconditional), commutativity (conditional on error-freedom of operands), and `Expr`-level reorder safety as a corollary of soundness. @@ -64,7 +64,6 @@ Reviewers should expect both sides of the change in the same PR. The roadmap in priority order: -* Tighten `Expr.might_error` for `.coalesce`. The current placeholder taints unconditionally; a precise analyzer would reason about the rescue rule (one statically-safe operand makes the whole coalesce safe). Requires a list-induction analogous to `Expr.argsMightError_of_mem` plus an `evalCoalesce_not_err` helper. * Variadic absorption at the `Expr` level: `FALSE ∈ args → eval env (.andN args) = .bool false` for a closed term. Mirrors `evalAndN_false_absorbs`, lifted through `eval`. * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. 
From 4cf032a00214118b565380a2af8682cbc17d73bc Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 07:34:38 +0200 Subject: [PATCH 010/127] doc/semantics: lift variadic absorption through eval MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the `Expr`-level absorption theorems that an optimizer cites when folding a variadic `AND` containing a `FALSE` operand to `FALSE` (symmetric for `OR` with `TRUE`). Two forms each: * Semantic premise: `(∃ e ∈ args, eval env e = .bool false) → eval env (.andN args) = .bool false`. Sufficient for any operand the optimizer has reduced to a constant boolean. * Syntactic premise: `Expr.lit (.bool false) ∈ args → eval env (.andN args) = .bool false`. The corollary specialized to the case where a literal `.bool false` is syntactically present in the operand list. Useful for simple constant folding. Each proof reduces `eval env (.andN args)` to `evalAndN (args.map (eval env))` via `eval_andN`, then witnesses membership of `.bool false` in the mapped list through `List.mem_map`, then invokes `evalAndN_false_absorbs` from `Mz/Variadic.lean`. The `Or`/`True` direction is symmetric. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/ExprVariadic.lean | 42 ++++++++++++++++++++ doc/developer/semantics/README.md | 5 ++- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/doc/developer/semantics/Mz/ExprVariadic.lean b/doc/developer/semantics/Mz/ExprVariadic.lean index 82721b351c0ff..10577c66d2369 100644 --- a/doc/developer/semantics/Mz/ExprVariadic.lean +++ b/doc/developer/semantics/Mz/ExprVariadic.lean @@ -95,4 +95,46 @@ theorem eval_coalesce_singleton (env : Env) (a : Expr) : | null => rw [show evalCoalesce [Datum.null] = Datum.null from rfl] | err e => rw [show evalCoalesce [Datum.err e] = Datum.err e from rfl] +/-! ## Variadic absorption at the `Expr` level + +A single operand that evaluates to `FALSE` (resp. 
`TRUE`) makes the +whole variadic `AND` (resp. `OR`) evaluate to `FALSE` (resp. `TRUE`), +regardless of the other operands — including those that produce +`err`. These theorems transport `evalAndN_false_absorbs` / +`evalOrN_true_absorbs` (in `Mz/Variadic.lean`) through `eval`, and +are what an optimizer cites when folding `e₁ AND … AND falseᵢ AND …` +to `false`. The `lit` corollaries specialize the semantic premise to +the syntactic case where one of the operands is the literal `.bool +false` / `.bool true`. -/ + +theorem eval_andN_false_absorbs {env : Env} {args : List Expr} + (h : ∃ e ∈ args, eval env e = .bool false) : + eval env (.andN args) = .bool false := by + rw [eval_andN] + apply evalAndN_false_absorbs (ds := args.map (eval env)) + obtain ⟨e, he_mem, he_eq⟩ := h + exact List.mem_map.mpr ⟨e, he_mem, he_eq⟩ + +theorem eval_orN_true_absorbs {env : Env} {args : List Expr} + (h : ∃ e ∈ args, eval env e = .bool true) : + eval env (.orN args) = .bool true := by + rw [eval_orN] + apply evalOrN_true_absorbs (ds := args.map (eval env)) + obtain ⟨e, he_mem, he_eq⟩ := h + exact List.mem_map.mpr ⟨e, he_mem, he_eq⟩ + +theorem eval_andN_lit_false_absorbs {env : Env} {args : List Expr} + (h : Expr.lit (.bool false) ∈ args) : + eval env (.andN args) = .bool false := by + apply eval_andN_false_absorbs + refine ⟨Expr.lit (.bool false), h, ?_⟩ + simp only [eval] + +theorem eval_orN_lit_true_absorbs {env : Env} {args : List Expr} + (h : Expr.lit (.bool true) ∈ args) : + eval env (.orN args) = .bool true := by + apply eval_orN_true_absorbs + refine ⟨Expr.lit (.bool true), h, ?_⟩ + simp only [eval] + end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 99f44c773fbdb..75a726bd7abfd 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -18,7 +18,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Coalesce.lean`: laws for `evalCoalesce` — error-rescue,
null-beats-err tiebreak, first-error stickiness. * `Mz/Laws.lean`: algebraic laws — two-sided identity (`TRUE` for `AND`, `FALSE` for `OR`), idempotence (unconditional), commutativity (conditional on error-freedom of operands), and `Expr`-level reorder safety as a corollary of soundness. * `Mz/Variadic.lean`: laws for `evalAndN` and `evalOrN` over `List Datum` — cons recurrence, nil, singleton, binary equivalence with the binary evaluators, and `FALSE`/`TRUE` absorption. -* `Mz/ExprVariadic.lean`: `Expr`-level reduction lemmas connecting `eval env (.andN args)` / `.orN` / `.coalesce` to their primitive counterparts, plus identity / singleton / binary-equivalence corollaries lifted through `eval`. +* `Mz/ExprVariadic.lean`: `Expr`-level reduction lemmas connecting `eval env (.andN args)` / `.orN` / `.coalesce` to their primitive counterparts, identity / singleton / binary-equivalence corollaries lifted through `eval`, and variadic-absorption theorems — a single operand evaluating to `FALSE` (resp. `TRUE`) makes the whole `andN` (resp. `orN`) evaluate to `FALSE` (resp. `TRUE`). ## What is not here @@ -64,7 +64,8 @@ Reviewers should expect both sides of the change in the same PR. The roadmap in priority order: -* Variadic absorption at the `Expr` level: `FALSE ∈ args → eval env (.andN args) = .bool false` for a closed term. Mirrors `evalAndN_false_absorbs`, lifted through `eval`. +* Lift to bag semantics: predicate / projection rewrites over `List Row`. +* Diff-semiring extension for global errors (v2). * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. 
From cb27ccb5280bebe359d08040f0dd8535bfb21ff0 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 08:10:56 +0200 Subject: [PATCH 011/127] =?UTF-8?q?doc/semantics:=20add=20bag=20semantics?= =?UTF-8?q?=20=E2=80=94=20filterRel,=20project,=20basic=20laws?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model relations as `List Row` and define the two basic relational operators on top of the per-row evaluator. * `filterRel pred rel` keeps rows whose predicate evaluates to `.bool true`. Rows evaluating to `.bool false`, `.null`, or `.err` are dropped. The skeleton silently drops `err` rows; a real implementation would route them to a separate error collection. This is noted in the docstring as a known modelling gap. * `project es rel` evaluates each scalar expression in `es` against every row and produces a relation whose row width is `es.length`. Laws: * `filterRel_idem`: applying the same predicate twice is the same as applying it once. Proof: `List.filter_filter` + `Bool.and_self`. * `filterRel_comm`: filter commutes with filter. Proof: `List.filter_filter` twice + `Bool.and_comm`. * `project_length`: projection preserves cardinality. Direct consequence of `List.length_map`. * `project_nil`: the empty projection collapses every row to `[]`. All proofs are short (a handful of tactic steps each) and rest on Lean core's list lemmas; the laws themselves are the first relational-level rewrites an optimizer cites. Predicate pushdown across projection, joins, and aggregates remain follow-ups.
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/Bag.lean | 88 +++++++++++++++++++++++++++++ doc/developer/semantics/README.md | 4 +- 3 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 doc/developer/semantics/Mz/Bag.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 02485f311e937..be4139177d4f9 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -8,3 +8,4 @@ import Mz.Coalesce import Mz.Laws import Mz.Variadic import Mz.ExprVariadic +import Mz.Bag diff --git a/doc/developer/semantics/Mz/Bag.lean b/doc/developer/semantics/Mz/Bag.lean new file mode 100644 index 0000000000000..c32aeddfddbc7 --- /dev/null +++ b/doc/developer/semantics/Mz/Bag.lean @@ -0,0 +1,88 @@ +import Mz.Eval + +/-! +# Bag semantics: filter and project + +A first sliver of relational semantics on top of the per-`Row` +evaluator. A `Relation` is modeled as a `List Row` — a bag of rows +in execution order. The skeleton supports two relational +operators: + +* `filterRel pred rel` keeps the rows whose predicate evaluates to + `.bool true`. Rows that evaluate to `.bool false`, `.null`, or + `.err` are dropped. A real implementation would route `err` rows + to a separate error collection; the skeleton silently drops them + so the laws can be stated without modelling the error stream yet. +* `project es rel` evaluates each expression in `es` against every + row and emits a list of resulting rows. The output schema width + is `es.length`. + +The two filter laws stated here — filter idempotence and filter +commutativity — are the simplest optimizer-relevant rewrites over +the bag. Both reduce to `List.filter_filter` plus a `Bool` +identity (`and_self`, `and_comm`); the proofs lean entirely on +existing core-library lemmas. More substantial rewrites (predicate +pushdown across projection, predicate combination, etc.)
are +follow-ups that this file is intended to grow into. +-/ + +namespace Mz + +/-- A row is a positional list of bound values. Reuses `Env`. -/ +abbrev Row := Env + +/-- A relation is a list of rows. Bag (multiset) semantics; row +order does not matter for filter or project laws. -/ +abbrev Relation := List Row + +/-- Membership-style predicate evaluator used by `filterRel`. Rows +that evaluate to `.bool true` are kept; everything else (including +`.err`) is dropped. -/ +@[inline] def rowPredicate (pred : Expr) (row : Row) : Bool := + match eval row pred with + | .bool true => true + | _ => false + +/-- Filter a relation by a scalar predicate. -/ +def filterRel (pred : Expr) (rel : Relation) : Relation := + rel.filter (rowPredicate pred) + +/-- Project a relation through a list of scalar expressions. +Each output row has width `es.length`. -/ +def project (es : List Expr) (rel : Relation) : Relation := + rel.map (fun row => es.map (eval row)) + +/-! ## Filter laws -/ + +theorem filterRel_idem (pred : Expr) (rel : Relation) : + filterRel pred (filterRel pred rel) = filterRel pred rel := by + unfold filterRel + rw [List.filter_filter] + congr 1 + funext row + exact Bool.and_self _ + +theorem filterRel_comm (p q : Expr) (rel : Relation) : + filterRel p (filterRel q rel) = filterRel q (filterRel p rel) := by + unfold filterRel + rw [List.filter_filter, List.filter_filter] + congr 1 + funext row + exact Bool.and_comm _ _ + +/-! ## Project laws -/ + +theorem project_length (es : List Expr) (rel : Relation) : + (project es rel).length = rel.length := by + unfold project + exact List.length_map _ + +/-- The empty projection collapses every row to the empty row of +width zero, so the relation becomes a list of empty rows whose +length equals the input length. 
-/ +theorem project_nil (rel : Relation) : + project [] rel = rel.map (fun _ => []) := by + show rel.map (fun row => ([] : List Expr).map (eval row)) = rel.map (fun _ => []) + congr 1 + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 75a726bd7abfd..5962bac5b2f18 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -19,6 +19,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Laws.lean`: algebraic laws — two-sided identity (`TRUE` for `AND`, `FALSE` for `OR`), idempotence (unconditional), commutativity (conditional on error-freedom of operands), and `Expr`-level reorder safety as a corollary of soundness. * `Mz/Variadic.lean`: laws for `evalAndN` and `evalOrN` over `List Datum` — cons recurrence, nil, singleton, binary equivalence with the binary evaluators, and `FALSE`/`TRUE` absorption. * `Mz/ExprVariadic.lean`: `Expr`-level reduction lemmas connecting `eval env (.andN args)` / `.orN` / `.coalesce` to their primitive counterparts, identity / singleton / binary-equivalence corollaries lifted through `eval`, and variadic-absorption theorems — a single operand evaluating to `FALSE` (resp. `TRUE`) makes the whole `andN` (resp. `orN`) evaluate to `FALSE` (resp. `TRUE`). +* `Mz/Bag.lean`: bag semantics on `List Row`. Defines `filterRel` and `project`, with filter idempotence, filter commutativity, projection length-preservation, and the empty-projection equation. The skeleton silently drops `err` rows from `filterRel` output; a real implementation routes them to the error collection. ## What is not here @@ -64,7 +65,8 @@ Reviewers should expect both sides of the change in the same PR. The roadmap in priority order: -* Lift to bag semantics: predicate / projection rewrites over `List Row`. +* Route `err` rows from `filterRel` to a separate error collection rather than silently dropping. Model the dataflow's data/error stream pair explicitly. 
+* Predicate pushdown across projection: `filterRel p (project es rel) = project es (filterRel (p[es]) rel)` under suitable substitution conditions. * Diff-semiring extension for global errors (v2). * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. From a1adb598fd879ff2874e181013a280075869caf1 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 08:30:47 +0200 Subject: [PATCH 012/127] =?UTF-8?q?doc/semantics:=20add=20BagStream=20?= =?UTF-8?q?=E2=80=94=20explicit=20data/error=20stream=20pair?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model the dataflow's data and error collections as a pair carried together through operators, mirroring the structure of the real Materialize dataflow where erroring rows are routed to a separate collection rather than silently dropped. * `BagStream` is a structure with `data : Relation` and `errors : List EvalError`. * `BagStream.ofRelation` injects a plain relation with no accumulated errors. * `BagStream.filter pred s` evaluates `pred` on every row of `s.data`. Survivors (`.bool true`) stay in the data field; erroring rows contribute their payloads to the error field via `errorRows`, appended after the errors already carried by `s`; everything else is dropped. Two helper lemmas carry the structural fact "predicate evaluated to `.bool true` on every survivor of `filterRel`": * `rows_in_filterRel_eval_to_true` unfolds the survival condition. * `errorRows_eq_nil_of_all_true` says a relation in which every row evaluates the predicate to `.bool true` contributes no errors. `errorRows_filterRel` combines them and is the key fact behind `BagStream.filter_idem` — the second pass of a repeated filter sees only survivors of the first, so it contributes no new errors.
The overall stream is therefore unchanged on the second filter, at both the data and the error level. `BagStream.project` and stream-level filter commutativity follow in later iterations; commutativity at the error level requires multiset equality on `List EvalError` since list order differs across permutations. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/ErrStream.lean | 117 ++++++++++++++++++++++ doc/developer/semantics/README.md | 6 +- 3 files changed, 122 insertions(+), 2 deletions(-) create mode 100644 doc/developer/semantics/Mz/ErrStream.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index be4139177d4f9..ef3285ad5c7ac 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -9,3 +9,4 @@ import Mz.Laws import Mz.Variadic import Mz.ExprVariadic import Mz.Bag +import Mz.ErrStream diff --git a/doc/developer/semantics/Mz/ErrStream.lean b/doc/developer/semantics/Mz/ErrStream.lean new file mode 100644 index 0000000000000..75f17573e7f3e --- /dev/null +++ b/doc/developer/semantics/Mz/ErrStream.lean @@ -0,0 +1,117 @@ +import Mz.Eval +import Mz.Bag + +/-! +# Data / error stream pair + +The bag semantics in `Mz/Bag.lean` silently drops rows whose +predicate evaluates to `.err`. The real Materialize dataflow runs a +data stream alongside an error stream: an erroring row is removed +from the data collection and emitted into the error collection +instead, where downstream operators forward it unchanged. This file +makes that structure explicit and proves the basic laws of the +error-aware filter. + +`BagStream` is a pair `(data, errors)`. Operators consume a +`BagStream` and produce a `BagStream`. Existing errors from the +input propagate to the output; new errors from the operator are +appended. + +The skeleton models only `filter`. 
Adding `project` follows the same +pattern: each expression in the projection list contributes its own +error rows. +-/ + +namespace Mz + +/-- A dataflow stream: row collection plus accompanying error +collection. Operators below take a `BagStream` and return a +`BagStream`. -/ +structure BagStream where + data : Relation + errors : List EvalError + +/-- Inject a plain `Relation` into a `BagStream` with no accumulated +errors. The natural starting point for a source. -/ +def BagStream.ofRelation (rel : Relation) : BagStream := + { data := rel, errors := [] } + +/-- Collect every `err` payload produced by evaluating `pred` on the +rows of `rel`. Order matches `rel`. -/ +def errorRows (pred : Expr) (rel : Relation) : List EvalError := + rel.foldr (fun row acc => + match eval row pred with + | .err e => e :: acc + | _ => acc) [] + +/-- Error-aware filter. Rows whose predicate evaluates to `.bool +true` stay in the data collection; rows whose predicate evaluates +to `.err` contribute their payload to the error collection; +everything else is dropped. -/ +def BagStream.filter (pred : Expr) (s : BagStream) : BagStream := + { data := filterRel pred s.data + , errors := s.errors ++ errorRows pred s.data } + +/-! ## Per-field reduction lemmas -/ + +theorem BagStream.filter_data (pred : Expr) (s : BagStream) : + (BagStream.filter pred s).data = filterRel pred s.data := rfl + +theorem BagStream.filter_errors (pred : Expr) (s : BagStream) : + (BagStream.filter pred s).errors = s.errors ++ errorRows pred s.data := rfl + +/-! ## Helper lemmas -/ + +/-- Every row that survives `filterRel pred rel` evaluates the +predicate to `.bool true`. This is what the filter "kept" means +unfolded against `rowPredicate`. 
-/ +theorem rows_in_filterRel_eval_to_true (pred : Expr) (rel : Relation) : + ∀ row ∈ filterRel pred rel, eval row pred = .bool true := by + intro row h_mem + unfold filterRel at h_mem + have h_pred : rowPredicate pred row = true := (List.mem_filter.mp h_mem).2 + unfold rowPredicate at h_pred + cases h_eval : eval row pred with + | bool b => cases b + · rw [h_eval] at h_pred; cases h_pred + · rfl + | null => rw [h_eval] at h_pred; cases h_pred + | err _ => rw [h_eval] at h_pred; cases h_pred + +/-- If every row evaluates the predicate to `.bool true`, the error +collection is empty. -/ +theorem errorRows_eq_nil_of_all_true (pred : Expr) (rel : Relation) + (h : ∀ row ∈ rel, eval row pred = .bool true) : + errorRows pred rel = [] := by + induction rel with + | nil => rfl + | cons hd tl ih => + have hd_eval : eval hd pred = .bool true := h hd (List.Mem.head tl) + have htl : ∀ row ∈ tl, eval row pred = .bool true := + fun row h_mem => h row (List.Mem.tail hd h_mem) + show (match eval hd pred with + | .err e => e :: errorRows pred tl + | _ => errorRows pred tl) = [] + rw [hd_eval] + exact ih htl + +/-- `errorRows` of a filtered relation is empty: the survivors all +evaluated to `.bool true`, none produced an `err`. -/ +theorem errorRows_filterRel (pred : Expr) (rel : Relation) : + errorRows pred (filterRel pred rel) = [] := + errorRows_eq_nil_of_all_true pred _ + (rows_in_filterRel_eval_to_true pred rel) + +/-! ## Stream laws -/ + +/-- Idempotence of `BagStream.filter`. Applying the same predicate +twice produces the same data *and* the same errors as applying it +once: the second pass observes only the survivors of the first, +which by construction evaluate to `.bool true` and thus contribute +nothing new to the error collection. 
-/ +theorem BagStream.filter_idem (pred : Expr) (s : BagStream) : + BagStream.filter pred (BagStream.filter pred s) = + BagStream.filter pred s := by + simp [BagStream.filter, filterRel_idem, errorRows_filterRel] + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 5962bac5b2f18..ffc0629add0fb 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -19,7 +19,8 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Laws.lean`: algebraic laws — two-sided identity (`TRUE` for `AND`, `FALSE` for `OR`), idempotence (unconditional), commutativity (conditional on error-freedom of operands), and `Expr`-level reorder safety as a corollary of soundness. * `Mz/Variadic.lean`: laws for `evalAndN` and `evalOrN` over `List Datum` — cons recurrence, nil, singleton, binary equivalence with the binary evaluators, and `FALSE`/`TRUE` absorption. * `Mz/ExprVariadic.lean`: `Expr`-level reduction lemmas connecting `eval env (.andN args)` / `.orN` / `.coalesce` to their primitive counterparts, identity / singleton / binary-equivalence corollaries lifted through `eval`, and variadic-absorption theorems — a single operand evaluating to `FALSE` (resp. `TRUE`) makes the whole `andN` (resp. `orN`) evaluate to `FALSE` (resp. `TRUE`). -* `Mz/Bag.lean`: bag semantics on `List Row`. Defines `filterRel` and `project`, with filter idempotence, filter commutativity, projection length-preservation, and the empty-projection equation. The skeleton silently drops `err` rows from `filterRel` output; a real implementation routes them to the error collection. +* `Mz/Bag.lean`: bag semantics on `List Row`. Defines `filterRel` and `project`, with filter idempotence, filter commutativity, projection length-preservation, and the empty-projection equation. Plain `filterRel` silently drops `err` rows; `Mz/ErrStream.lean` adds the explicit data/error stream pair. 
+* `Mz/ErrStream.lean`: the dataflow-style `BagStream = (data, errors)` pair. `BagStream.filter` routes erroring rows into the error collection instead of dropping them, with idempotence proved at both the data and the error level. ## What is not here @@ -65,7 +66,8 @@ Reviewers should expect both sides of the change in the same PR. The roadmap in priority order: -* Route `err` rows from `filterRel` to a separate error collection rather than silently dropping. Model the dataflow's data/error stream pair explicitly. +* `BagStream.project` analogous to `BagStream.filter`: each scalar in the projection list can produce its own error rows; aggregate them into the error collection. +* `BagStream.filter` commutativity. Data field commutes by `filterRel_comm`; the error field requires a notion of multiset equality on `List EvalError` since list-order differs across permutations. * Predicate pushdown across projection: `filterRel p (project es rel) = project es (filterRel (p[es]) rel)` under suitable substitution conditions. * Diff-semiring extension for global errors (v2). * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. From b684e16f627c3a6e1172437d895779066d75af0a Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 08:35:40 +0200 Subject: [PATCH 013/127] doc/semantics: predicate pushdown across projection via Expr.subst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the classical relational rewrite: filterRel p (project es rel) = project es (filterRel (p.subst es) rel) The rewrite lets an optimizer move a `WHERE` clause through a `SELECT` clause by substituting the projection's source scalars into the predicate's column references. 
* `Expr.subst es e` replaces each `Expr.col i` in `e` with the i-th scalar of `es`, with out-of-bounds references defaulting to `.lit .null` so the result still evaluates to `.null`. Defined mutually with `Expr.substArgs` for the nested-list constructors (`andN`, `orN`, `coalesce`). * `Expr.substArgs_eq_map` bridges the recursive helper to the more ergonomic `args.map (·.subst es)` form for use in proofs. * `Env.get_map_eval` is the column-lookup compatibility lemma: reading column `i` from the projected row equals evaluating the i-th projection scalar against the original row. * `eval_subst` is the headline correctness theorem: substituting and then evaluating against the original row equals evaluating the unsubstituted expression against the projected row. Proof is structural pattern recursion mirroring `Expr.subst`. * `filterRel_pushdown_project` packages `eval_subst` plus `List.filter_map` + `List.filter_congr` into the relational form an optimizer cites. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/Pushdown.lean | 160 +++++++++++++++++++++++ doc/developer/semantics/README.md | 2 +- 3 files changed, 162 insertions(+), 1 deletion(-) create mode 100644 doc/developer/semantics/Mz/Pushdown.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index ef3285ad5c7ac..c378d9286160d 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -10,3 +10,4 @@ import Mz.Variadic import Mz.ExprVariadic import Mz.Bag import Mz.ErrStream +import Mz.Pushdown diff --git a/doc/developer/semantics/Mz/Pushdown.lean b/doc/developer/semantics/Mz/Pushdown.lean new file mode 100644 index 0000000000000..3a220a3d3e293 --- /dev/null +++ b/doc/developer/semantics/Mz/Pushdown.lean @@ -0,0 +1,160 @@ +import Mz.Eval +import Mz.Bag + +/-! 
+# Predicate pushdown across projection + +The classical relational rewrite: + + `filterRel p (project es rel) = project es (filterRel (p[es]) rel)` + +where `p[es]` is the substitution that replaces each `Expr.col i` in +`p` with `es.get i` (the i-th scalar of the projection). The +rewrite is the basis for moving a `WHERE` clause through a `SELECT` +clause whenever the predicate's column references can be expressed +in terms of the projection's source. + +This file gives the substitution machinery (`Expr.subst`, +`Expr.substArgs`) and the two theorems an optimizer cites: + +* `eval_subst`: substituting and then evaluating against the + original row equals evaluating against the projected row. +* `filterRel_pushdown_project`: the relational pushdown rewrite. + +`Expr.subst` is mutually recursive with `Expr.substArgs` so Lean's +structural-recursion checker handles the nested-list constructors +(`andN`, `orN`, `coalesce`) without an explicit termination measure. +-/ + +namespace Mz + +/-! ## Substitution -/ + +mutual +/-- Substitute column references in `e` with the i-th scalar of +`es`. Out-of-bounds references are replaced by `.lit .null` so that +the resulting expression evaluates to `.null`, matching `Env.get`'s +fallback. -/ +def Expr.subst (es : List Expr) : Expr → Expr + | .lit d => .lit d + | .col i => es.getD i (.lit .null) + | .and a b => .and (a.subst es) (b.subst es) + | .or a b => .or (a.subst es) (b.subst es) + | .not a => .not (a.subst es) + | .ifThen c t e => .ifThen (c.subst es) (t.subst es) (e.subst es) + | .andN args => .andN (Expr.substArgs es args) + | .orN args => .orN (Expr.substArgs es args) + | .coalesce args => .coalesce (Expr.substArgs es args) + +/-- Pointwise application of `subst` to a list of operands. -/ +def Expr.substArgs (es : List Expr) : List Expr → List Expr + | [] => [] + | e :: rest => e.subst es :: Expr.substArgs es rest +end + +/-! 
## Helpers for substitution / map agreement + +`substArgs es args` and `args.map (·.subst es)` produce the same +list. The skeleton uses the explicit recursive `substArgs` form in +the definition so structural recursion is accepted; the proofs +benefit from being able to switch to `List.map` when needed. -/ + +theorem Expr.substArgs_eq_map (es args : List Expr) : + Expr.substArgs es args = args.map (·.subst es) := by + induction args with + | nil => rfl + | cons hd tl ih => simp [Expr.substArgs, ih] + +/-! ## Substitution preserves evaluation -/ + +/-- Reading column `i` from the projected row equals evaluating the +i-th projection scalar on the original row. The proof case-splits on +whether `i` is in bounds. -/ +private theorem Env.get_map_eval (env : Env) (es : List Expr) (i : Nat) : + Env.get (es.map (eval env)) i = eval env (es.getD i (.lit .null)) := by + induction es generalizing i with + | nil => + -- both sides reduce to `.null` + show Env.get [] i = eval env (.lit .null) + simp [Env.get, eval] + | cons hd tl ih => + cases i with + | zero => + -- LHS: Env.get (eval env hd :: tl.map (eval env)) 0 = eval env hd + -- RHS: eval env ((hd :: tl).getD 0 (.lit .null)) = eval env hd + show Env.get ((eval env hd) :: tl.map (eval env)) 0 = eval env hd + rfl + | succ n => + -- recurse on tl + show Env.get (eval env hd :: tl.map (eval env)) (n + 1) + = eval env ((hd :: tl).getD (n + 1) (.lit .null)) + show Env.get (tl.map (eval env)) n = eval env (tl.getD n (.lit .null)) + exact ih n + +/-- The headline theorem: substituting into `e` and evaluating +against the original row equals evaluating the original `e` against +the projected row. + +The proof is structural recursion on `e`, mirroring the structure of +`Expr.subst`. The nested-list constructors recurse through +`Expr.substArgs` and are handled by `eval_substArgs` below. 
-/ +theorem eval_subst : + ∀ (env : Env) (es : List Expr) (e : Expr), + eval env (e.subst es) = eval (es.map (eval env)) e + | env, es, .lit d => by + simp only [Expr.subst, eval] + | env, es, .col i => by + simp only [Expr.subst, eval] + exact (Env.get_map_eval env es i).symm + | env, es, .and a b => by + simp only [Expr.subst, eval] + rw [eval_subst env es a, eval_subst env es b] + | env, es, .or a b => by + simp only [Expr.subst, eval] + rw [eval_subst env es a, eval_subst env es b] + | env, es, .not a => by + simp only [Expr.subst, eval] + rw [eval_subst env es a] + | env, es, .ifThen c t e => by + simp only [Expr.subst, eval] + rw [eval_subst env es c, eval_subst env es t, eval_subst env es e] + | env, es, .andN args => by + simp only [Expr.subst, eval, Expr.substArgs_eq_map] + rw [List.map_map] + congr 1 + apply List.map_congr_left + intro e _ + exact eval_subst env es e + | env, es, .orN args => by + simp only [Expr.subst, eval, Expr.substArgs_eq_map] + rw [List.map_map] + congr 1 + apply List.map_congr_left + intro e _ + exact eval_subst env es e + | env, es, .coalesce args => by + simp only [Expr.subst, eval, Expr.substArgs_eq_map] + rw [List.map_map] + congr 1 + apply List.map_congr_left + intro e _ + exact eval_subst env es e + +/-! ## Predicate pushdown -/ + +/-- The classical predicate-pushdown rewrite. Filtering after +projecting agrees with substituting the projection scalars into the +predicate and filtering before projecting. 
-/ +theorem filterRel_pushdown_project (p : Expr) (es : List Expr) (rel : Relation) : + filterRel p (project es rel) = project es (filterRel (p.subst es) rel) := by + unfold filterRel project + rw [List.filter_map] + congr 1 + apply List.filter_congr + intro row _ + -- Goal: (rowPredicate p ∘ (fun row => es.map (eval row))) row = rowPredicate (p.subst es) row + show rowPredicate p (es.map (eval row)) = rowPredicate (p.subst es) row + unfold rowPredicate + rw [eval_subst row es p] + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index ffc0629add0fb..af9f70e86d5e4 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -21,6 +21,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/ExprVariadic.lean`: `Expr`-level reduction lemmas connecting `eval env (.andN args)` / `.orN` / `.coalesce` to their primitive counterparts, identity / singleton / binary-equivalence corollaries lifted through `eval`, and variadic-absorption theorems — a single operand evaluating to `FALSE` (resp. `TRUE`) makes the whole `andN` (resp. `orN`) evaluate to `FALSE` (resp. `TRUE`). * `Mz/Bag.lean`: bag semantics on `List Row`. Defines `filterRel` and `project`, with filter idempotence, filter commutativity, projection length-preservation, and the empty-projection equation. Plain `filterRel` silently drops `err` rows; `Mz/ErrStream.lean` adds the explicit data/error stream pair. * `Mz/ErrStream.lean`: the dataflow-style `BagStream = (data, errors)` pair. `BagStream.filter` routes erroring rows into the error collection instead of dropping them, with idempotence proved at both the data and the error level. 
+* `Mz/Pushdown.lean`: substitution (`Expr.subst`) plus the headline `eval_subst` theorem (substituting then evaluating against the original row equals evaluating against the projected row), and the relational predicate-pushdown rewrite `filterRel p (project es rel) = project es (filterRel (p.subst es) rel)`. ## What is not here @@ -68,7 +69,6 @@ The roadmap in priority order: * `BagStream.project` analogous to `BagStream.filter`: each scalar in the projection list can produce its own error rows; aggregate them into the error collection. * `BagStream.filter` commutativity. Data field commutes by `filterRel_comm`; the error field requires a notion of multiset equality on `List EvalError` since list-order differs across permutations. -* Predicate pushdown across projection: `filterRel p (project es rel) = project es (filterRel (p[es]) rel)` under suitable substitution conditions. * Diff-semiring extension for global errors (v2). * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. From 21d65a3630377006d33851e62f4b2032e4923ca3 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 08:39:23 +0200 Subject: [PATCH 014/127] =?UTF-8?q?doc/semantics:=20add=20DiffWithError=20?= =?UTF-8?q?=E2=80=94=20semiring=20extension=20for=20global=20errors?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model the diff-field encoding of collection-scoped (global) errors proposed in the design doc. The standard differential dataflow diff type (typically `ℤ` for multiset counts) is extended with an absorbing `error` element; any sum or product involving `error` yields `error`, encoding "this collection is invalid at time `t`". * `DiffWithError α` is a two-constructor inductive: `val (x : α)` for ordinary diffs, `error` for the absorbing marker. 
* `Add`, `Mul`, `Zero`, `One` instances lift the underlying operations and respect the absorbing behavior. * Absorption laws (`error_add_left`, `error_add_right`, `error_mul_left`, `error_mul_right`) are the defining property. * Commutativity and associativity of `+`, and left-distributivity of `*` over `+`, are proved parametrically over the underlying `α`: each lemma takes the corresponding law on `α` as a hypothesis and discharges the `DiffWithError` version by case analysis. The lemmas are not packaged as `Semiring`/`CommRing` typeclass instances because the skeleton avoids depending on Mathlib for build-time reasons. Adding the typeclass wiring is straightforward once Mathlib is on the dependency list. Tying `DiffWithError` to an actual `(Row, Time, Diff)` triple stream and proving the operator-level propagation theorems is the next step; the present file lays the algebraic groundwork those theorems rely on. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/DiffSemiring.lean | 175 +++++++++++++++++++ doc/developer/semantics/README.md | 5 +- 3 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 doc/developer/semantics/Mz/DiffSemiring.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index c378d9286160d..33d6dc11a0326 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -11,3 +11,4 @@ import Mz.ExprVariadic import Mz.Bag import Mz.ErrStream import Mz.Pushdown +import Mz.DiffSemiring diff --git a/doc/developer/semantics/Mz/DiffSemiring.lean b/doc/developer/semantics/Mz/DiffSemiring.lean new file mode 100644 index 0000000000000..203b39641c46c --- /dev/null +++ b/doc/developer/semantics/Mz/DiffSemiring.lean @@ -0,0 +1,175 @@ +import Mz.Datum + +/-! 
+# Diff-semiring extension for global errors + +The design doc proposes that collection-scoped (global) errors be +encoded in the `diff` field of differential dataflow records: a +special marker in the diff that, when seen at time `t`, signals +"this collection is invalid at `t`". The marker must absorb the +ordinary multiset-count arithmetic — adding it to any other diff +gives the marker back, and likewise for multiplication. + +This file defines `DiffWithError α`, the simplest extension of an +arbitrary diff type `α` (typically `ℤ` for multiset counts) with an +absorbing `error` element, and proves the absorption / commutativity +/ associativity laws an operator over the extended semiring relies +on. + +The skeleton models only the algebraic structure; tying it to a +concrete dataflow operator is future work that requires modeling +collections-with-times. The laws here are however independent of +that — they are the proof obligations any such operator must +discharge. +-/ + +namespace Mz + +/-- `α`-valued diff augmented with an absorbing `error` marker. The +`error` element is the algebraic encoding of "this collection is +globally invalid". -/ +inductive DiffWithError (α : Type) where + | val (x : α) + | error + deriving Inhabited + +namespace DiffWithError + +variable {α : Type} + +/-- Lifted addition. `error` absorbs from either side; `val`s add +pointwise via the underlying `α`'s `+`. -/ +def add [Add α] : DiffWithError α → DiffWithError α → DiffWithError α + | .error, _ => .error + | _, .error => .error + | .val x, .val y => .val (x + y) + +/-- Lifted multiplication, with the same absorbing behavior. Joins +in differential dataflow are multiplicative on diffs. -/ +def mul [Mul α] : DiffWithError α → DiffWithError α → DiffWithError α + | .error, _ => .error + | _, .error => .error + | .val x, .val y => .val (x * y) + +instance [Add α] : Add (DiffWithError α) := ⟨add⟩ +instance [Mul α] : Mul (DiffWithError α) := ⟨mul⟩ + +/-- Lifted zero (identity for `+`). 
-/ +instance [Zero α] : Zero (DiffWithError α) := ⟨.val 0⟩ + +/-- Lifted one (identity for `*`). -/ +instance [One α] : One (DiffWithError α) := ⟨.val 1⟩ + +/-! ## Absorption laws + +The defining property of the `error` marker: any sum or product +involving it is itself `error`. -/ + +theorem error_add_left [Add α] (y : DiffWithError α) : + (error : DiffWithError α) + y = error := rfl + +theorem error_add_right [Add α] (x : DiffWithError α) : + x + (error : DiffWithError α) = error := by + cases x with + | val _ => rfl + | error => rfl + +theorem error_mul_left [Mul α] (y : DiffWithError α) : + (error : DiffWithError α) * y = error := rfl + +theorem error_mul_right [Mul α] (x : DiffWithError α) : + x * (error : DiffWithError α) = error := by + cases x with + | val _ => rfl + | error => rfl + +/-! ## Commutativity / associativity of `+` (when the base has them) -/ + +theorem add_comm [Add α] (h_comm : ∀ x y : α, x + y = y + x) + (a b : DiffWithError α) : a + b = b + a := by + cases a with + | val x => + cases b with + | val y => show (val (x + y) : DiffWithError α) = val (y + x); rw [h_comm] + | error => rfl + | error => + cases b with + | val _ => rfl + | error => rfl + +theorem add_assoc [Add α] (h_assoc : ∀ x y z : α, (x + y) + z = x + (y + z)) + (a b c : DiffWithError α) : (a + b) + c = a + (b + c) := by + cases a with + | val x => + cases b with + | val y => + cases c with + | val z => + show (val ((x + y) + z) : DiffWithError α) = val (x + (y + z)) + rw [h_assoc] + | error => rfl + | error => + cases c with + | val _ => rfl + | error => rfl + | error => + cases b with + | val _ => + cases c with + | val _ => rfl + | error => rfl + | error => + cases c with + | val _ => rfl + | error => rfl + +/-! 
## Zero identity (when the base has it) -/ + +theorem zero_add_val [Add α] [Zero α] (h : ∀ x : α, 0 + x = x) (x : α) : + (0 : DiffWithError α) + val x = val x := by + show (val (0 + x) : DiffWithError α) = val x + rw [h] + +theorem val_add_zero [Add α] [Zero α] (h : ∀ x : α, x + 0 = x) (x : α) : + (val x : DiffWithError α) + 0 = val x := by + show (val (x + 0) : DiffWithError α) = val x + rw [h] + +/-! ## Distributivity (when the base has it) + +Left distributivity says `a * (b + c) = a * b + a * c`. With the +absorbing `error`, the law holds unconditionally on `DiffWithError` +provided it holds on `α`: any `error` in the inputs forces every +sub-expression containing it to `error`, and `error + error = error` +restores the equality on the right. -/ + +theorem mul_add [Mul α] [Add α] + (h_distrib : ∀ x y z : α, x * (y + z) = x * y + x * z) + (a b c : DiffWithError α) : a * (b + c) = a * b + a * c := by + cases a with + | val x => + cases b with + | val y => + cases c with + | val z => + show (val (x * (y + z)) : DiffWithError α) = val (x * y + x * z) + rw [h_distrib] + | error => rfl + | error => + cases c with + | val _ => rfl + | error => rfl + | error => + cases b with + | val _ => + cases c with + | val _ => rfl + | error => rfl + | error => + cases c with + | val _ => rfl + | error => rfl + +end DiffWithError + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index af9f70e86d5e4..aee71a2d0880c 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -22,6 +22,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Bag.lean`: bag semantics on `List Row`. Defines `filterRel` and `project`, with filter idempotence, filter commutativity, projection length-preservation, and the empty-projection equation. Plain `filterRel` silently drops `err` rows; `Mz/ErrStream.lean` adds the explicit data/error stream pair. 
* `Mz/ErrStream.lean`: the dataflow-style `BagStream = (data, errors)` pair. `BagStream.filter` routes erroring rows into the error collection instead of dropping them, with idempotence proved at both the data and the error level. * `Mz/Pushdown.lean`: substitution (`Expr.subst`) plus the headline `eval_subst` theorem (substituting then evaluating against the original row equals evaluating against the projected row), and the relational predicate-pushdown rewrite `filterRel p (project es rel) = project es (filterRel (p.subst es) rel)`. +* `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity laws that downstream operators must respect. ## What is not here @@ -69,7 +70,9 @@ The roadmap in priority order: * `BagStream.project` analogous to `BagStream.filter`: each scalar in the projection list can produce its own error rows; aggregate them into the error collection. * `BagStream.filter` commutativity. Data field commutes by `filterRel_comm`; the error field requires a notion of multiset equality on `List EvalError` since list-order differs across permutations. -* Diff-semiring extension for global errors (v2). +* Tie `DiffWithError` to a concrete dataflow operator: model a `(Row, Time, DiffWithError ℤ)` triple stream and prove that an `error` diff at time `t` propagates to every downstream consolidation. +* Joins on `BagStream` with explicit error propagation. +* Aggregates: variadic reductions (`SUM`, `MIN`, etc.) with strict-on-err semantics and `try_*` non-strict variants. * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. 
From 00e63e5a0ba702d7d36959b626f3e2519682997e Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 09:00:11 +0200 Subject: [PATCH 015/127] =?UTF-8?q?doc/semantics:=20add=20UnifiedStream=20?= =?UTF-8?q?=E2=80=94=20single-carrier=20data=20+=20error=20model?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model the spec's preferred encoding where data rows and errors flow through one collection rather than the `(data, errors)` pair carried by `BagStream`. The split form in `Mz/ErrStream.lean` mirrors the current Materialize runtime; the unified form mirrors what the diff-semiring extension naturally produces (any record with an `error` diff is a row-scoped error, and the same encoding extends to collection-scoped errors). * `UnifiedRow` is a sum type — `row (r : Row)` or `err (e : EvalError)`. The bare `err` variant preserves the current property that errors carry no row context. * `UnifiedStream := List UnifiedRow`. * `UnifiedStream.ofBag` packs a `BagStream` (data rows first, errors second). `UnifiedStream.split` is its left inverse, splitting the carrier back via `filterMap pickRow` / `filterMap pickErr`. * `UnifiedStream.filter` runs the predicate per record, routing erroring rows to `.err` records in place. Existing `err` records pass through unchanged — the carrier handles error propagation without per-operator boilerplate. Round-trip theorem `UnifiedStream.split_ofBag : split (ofBag s) = s` proved at both the data and error field level, using four private filterMap-over-tagged-list helpers and structural induction. The cross-direction `filter ∘ ofBag ≈ ofBag ∘ filter` is left for future work — equivalence is only up to multiset equality on errors because `ofBag` fixes a concat order between data and errors, while filters that interleave the two would produce a different list order. `@[ext]` added to `BagStream` so the round-trip proof can use `apply BagStream.ext`. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/ErrStream.lean | 2 +- doc/developer/semantics/Mz/UnifiedStream.lean | 147 ++++++++++++++++++ doc/developer/semantics/README.md | 1 + 4 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 doc/developer/semantics/Mz/UnifiedStream.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 33d6dc11a0326..5529fad4a7a0f 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -12,3 +12,4 @@ import Mz.Bag import Mz.ErrStream import Mz.Pushdown import Mz.DiffSemiring +import Mz.UnifiedStream diff --git a/doc/developer/semantics/Mz/ErrStream.lean b/doc/developer/semantics/Mz/ErrStream.lean index 75f17573e7f3e..10551d62269ff 100644 --- a/doc/developer/semantics/Mz/ErrStream.lean +++ b/doc/developer/semantics/Mz/ErrStream.lean @@ -27,7 +27,7 @@ namespace Mz /-- A dataflow stream: row collection plus accompanying error collection. Operators below take a `BagStream` and return a `BagStream`. -/ -structure BagStream where +@[ext] structure BagStream where data : Relation errors : List EvalError diff --git a/doc/developer/semantics/Mz/UnifiedStream.lean b/doc/developer/semantics/Mz/UnifiedStream.lean new file mode 100644 index 0000000000000..e0c8d2293715e --- /dev/null +++ b/doc/developer/semantics/Mz/UnifiedStream.lean @@ -0,0 +1,147 @@ +import Mz.Eval +import Mz.Bag +import Mz.ErrStream + +/-! +# Unified data / error stream + +`BagStream` (`Mz/ErrStream.lean`) carries a `(data, errors)` pair — +two collections, threaded through every operator. That split +mirrors Materialize's current runtime but is a *pragmatic* choice +rather than a semantic one. The spec target is a single unified +stream where data rows and errors flow through the same carrier. + +This file gives the unified model and the conversion to / from the +split form. 
+ +## Encoding + +`UnifiedRow` is a sum type: either an honest `row` or a bare `err` +without a row. The absence of a row in the `err` variant preserves +the current property that errors carry no row context (the runtime +`DataflowError` is the same way). A future refinement could attach +optional row provenance. + +`UnifiedStream := List UnifiedRow`. Operators consume and produce +`UnifiedStream`s; errors propagate through the carrier +automatically. + +## Diff-aware view + +The encoding here uses a plain list. The next refinement attaches a +`DiffWithError ℤ` to each record (see `Mz/DiffSemiring.lean`) so the +absorbing `error` diff captures collection-scoped global errors +alongside row-scoped errors. The conversion lemmas below transport +to that refinement when it lands. + +## Semantic differences with the split form + +`UnifiedStream.ofBag` concatenates data rows first and errors +second, fixing an order. Operators that process records left-to- +right will see data before errors. The split form makes no such +commitment between data and errors. Equivalence between unified +and split is therefore exact on the round trip (`split (ofBag s) = +s`) but only up to multiset equality on the cross-direction +`(filter ∘ ofBag) ≈ (ofBag ∘ filter)`. The skeleton states the +round trip; the cross-equivalence is left for a future iteration +that introduces multiset machinery on `List EvalError`. +-/ + +namespace Mz + +inductive UnifiedRow where + | row (r : Row) + | err (e : EvalError) + deriving Inhabited + +abbrev UnifiedStream := List UnifiedRow + +/-- Pick the row payload of a `UnifiedRow`, or `none` for errors. -/ +@[inline] private def pickRow : UnifiedRow → Option Row + | .row r => some r + | .err _ => none + +/-- Pick the error payload of a `UnifiedRow`, or `none` for rows. 
-/ +@[inline] private def pickErr : UnifiedRow → Option EvalError + | .row _ => none + | .err e => some e + +/-- Pack a `BagStream` into a single unified stream: data rows +first, error payloads second. -/ +def UnifiedStream.ofBag (s : BagStream) : UnifiedStream := + s.data.map UnifiedRow.row ++ s.errors.map UnifiedRow.err + +/-- Split a unified stream back into the `(data, errors)` pair. -/ +def UnifiedStream.split (us : UnifiedStream) : BagStream := + { data := us.filterMap pickRow + , errors := us.filterMap pickErr } + +/-- Filter on the unified stream. Predicate is evaluated on every +real `row`; survivors stay, erroring rows become `err` records, +non-true / non-error results are dropped. Existing `err` records +pass through unchanged. -/ +def UnifiedStream.filter (pred : Expr) (us : UnifiedStream) : UnifiedStream := + us.flatMap fun u => match u with + | .row r => + match eval r pred with + | .bool true => [.row r] + | .err e => [.err e] + | _ => [] + | .err e => [.err e] + +/-! ## Helper lemmas for filterMap over the packed concatenation -/ + +private theorem filterMap_pickRow_rowMap (rs : List Row) : + (rs.map UnifiedRow.row).filterMap pickRow = rs := by + induction rs with + | nil => rfl + | cons hd tl ih => simp [List.map, List.filterMap, pickRow, ih] + +private theorem filterMap_pickRow_errMap (es : List EvalError) : + (es.map UnifiedRow.err).filterMap pickRow = ([] : Relation) := by + induction es with + | nil => rfl + | cons _ tl ih => simp [List.map, List.filterMap, pickRow, ih] + +private theorem filterMap_pickErr_rowMap (rs : List Row) : + (rs.map UnifiedRow.row).filterMap pickErr = ([] : List EvalError) := by + induction rs with + | nil => rfl + | cons _ tl ih => simp [List.map, List.filterMap, pickErr, ih] + +private theorem filterMap_pickErr_errMap (es : List EvalError) : + (es.map UnifiedRow.err).filterMap pickErr = es := by + induction es with + | nil => rfl + | cons hd tl ih => simp [List.map, List.filterMap, pickErr, ih] + +/-! 
## Round-trip lemmas -/ + +theorem UnifiedStream.split_data_ofBag (s : BagStream) : + (UnifiedStream.split (UnifiedStream.ofBag s)).data = s.data := by + show (s.data.map UnifiedRow.row ++ s.errors.map UnifiedRow.err).filterMap pickRow = s.data + induction s.data with + | nil => + simp only [List.map_nil, List.nil_append] + exact filterMap_pickRow_errMap s.errors + | cons hd tl ih => + simp [List.map, List.filterMap_cons, pickRow, ih] + +theorem UnifiedStream.split_errors_ofBag (s : BagStream) : + (UnifiedStream.split (UnifiedStream.ofBag s)).errors = s.errors := by + show (s.data.map UnifiedRow.row ++ s.errors.map UnifiedRow.err).filterMap pickErr = s.errors + induction s.data with + | nil => + simp only [List.map_nil, List.nil_append] + exact filterMap_pickErr_errMap s.errors + | cons _ tl ih => + simp [List.map, List.filterMap_cons, pickErr, ih] + +/-- Full round trip on the structure level. -/ +theorem UnifiedStream.split_ofBag (s : BagStream) : + UnifiedStream.split (UnifiedStream.ofBag s) = s := by + apply BagStream.ext + · exact UnifiedStream.split_data_ofBag s + · exact UnifiedStream.split_errors_ofBag s + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index aee71a2d0880c..6921f99f0b9ff 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -23,6 +23,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/ErrStream.lean`: the dataflow-style `BagStream = (data, errors)` pair. `BagStream.filter` routes erroring rows into the error collection instead of dropping them, with idempotence proved at both the data and the error level. * `Mz/Pushdown.lean`: substitution (`Expr.subst`) plus the headline `eval_subst` theorem (substituting then evaluating against the original row equals evaluating against the projected row), and the relational predicate-pushdown rewrite `filterRel p (project es rel) = project es (filterRel (p.subst es) rel)`. 
* `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity laws that downstream operators must respect. +* `Mz/UnifiedStream.lean`: unified single-collection alternative to `BagStream`. `UnifiedRow` is `row ⊕ err`, so errors flow through the same carrier as data rows. `ofBag` / `split` conversions, with the round-trip theorem `split (ofBag s) = s`. The unified form matches the spec's diff-semiring target; the split `BagStream` is a runtime concession the conversion reconciles. ## What is not here From 86939a51c70fb80f06f4aee56172c7bcaa31eb5d Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 09:07:28 +0200 Subject: [PATCH 016/127] doc/semantics: clean up unused simp args in UnifiedStream helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `List.filterMap` was passed to four `simp` invocations in the filterMap-of-mapped-list helpers but never actually used in the rewriting — the cons recurrence and the `pickRow` / `pickErr` arms combined with the IH are enough. Drop the unused arg, along with the equally unused `List.filterMap_cons` in the two round-trip lemmas; the linter is now clean.
--- doc/developer/semantics/Mz/UnifiedStream.lean | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/developer/semantics/Mz/UnifiedStream.lean b/doc/developer/semantics/Mz/UnifiedStream.lean index e0c8d2293715e..82280a64d4b98 100644 --- a/doc/developer/semantics/Mz/UnifiedStream.lean +++ b/doc/developer/semantics/Mz/UnifiedStream.lean @@ -95,25 +95,25 @@ private theorem filterMap_pickRow_rowMap (rs : List Row) : (rs.map UnifiedRow.row).filterMap pickRow = rs := by induction rs with | nil => rfl - | cons hd tl ih => simp [List.map, List.filterMap, pickRow, ih] + | cons hd tl ih => simp [List.map, pickRow, ih] private theorem filterMap_pickRow_errMap (es : List EvalError) : (es.map UnifiedRow.err).filterMap pickRow = ([] : Relation) := by induction es with | nil => rfl - | cons _ tl ih => simp [List.map, List.filterMap, pickRow, ih] + | cons _ tl ih => simp [List.map, pickRow, ih] private theorem filterMap_pickErr_rowMap (rs : List Row) : (rs.map UnifiedRow.row).filterMap pickErr = ([] : List EvalError) := by induction rs with | nil => rfl - | cons _ tl ih => simp [List.map, List.filterMap, pickErr, ih] + | cons _ tl ih => simp [List.map, pickErr, ih] private theorem filterMap_pickErr_errMap (es : List EvalError) : (es.map UnifiedRow.err).filterMap pickErr = es := by induction es with | nil => rfl - | cons hd tl ih => simp [List.map, List.filterMap, pickErr, ih] + | cons hd tl ih => simp [List.map, pickErr, ih] /-! 
## Round-trip lemmas -/ @@ -125,7 +125,7 @@ theorem UnifiedStream.split_data_ofBag (s : BagStream) : simp only [List.map_nil, List.nil_append] exact filterMap_pickRow_errMap s.errors | cons hd tl ih => - simp [List.map, List.filterMap_cons, pickRow, ih] + simp [List.map, pickRow, ih] theorem UnifiedStream.split_errors_ofBag (s : BagStream) : (UnifiedStream.split (UnifiedStream.ofBag s)).errors = s.errors := by @@ -135,7 +135,7 @@ theorem UnifiedStream.split_errors_ofBag (s : BagStream) : simp only [List.map_nil, List.nil_append] exact filterMap_pickErr_errMap s.errors | cons _ tl ih => - simp [List.map, List.filterMap_cons, pickErr, ih] + simp [List.map, pickErr, ih] /-- Full round trip on the structure level. -/ theorem UnifiedStream.split_ofBag (s : BagStream) : From 800a6690178a3910c8c6e4d3c98e63c8c2f68ad7 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 09:19:37 +0200 Subject: [PATCH 017/127] =?UTF-8?q?doc/semantics:=20add=20aggregates=20?= =?UTF-8?q?=E2=80=94=20strict=20reduction=20over=20List=20Datum?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model the spec rule that SQL aggregates such as `SUM`, `MIN`, `MAX`, and `AVG` propagate `err` like strict scalar functions. NULLs are skipped (matching PostgreSQL's "ignore NULLs" reading); errors are propagated, with the first one in scan order winning. * `aggCountNonNull`: `COUNT(expr)` — counts rows whose value is neither `NULL` nor `err`. Matches PostgreSQL. * `aggStrict f`: variadic strict reduction parametric over the combiner `f`. Empty list (or list of only `NULL`s) returns `NULL`; any `err` returns the first `err`; the remaining values (neither `NULL` nor `err`) are combined via `f`. Two propagation theorems: * `aggStrict_err`: any `err` input forces an `err` output. Proof by structural recursion on the list; the `.bool` case branches on `aggStrict f rest` and shows the `err` arm of the inner match is the one that fires.
* `aggStrict_no_err`: dual statement under the additional hypothesis that the combiner preserves "no-err". Captures the precondition an aggregate operator can use to guarantee its output is not an error when its column is error-free. `try_*` aggregates (the non-strict variants that swallow `err` into `NULL`) are future work; they satisfy a coalesce-style law rather than strict propagation and warrant their own file once the spec side is clearer. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/Aggregate.lean | 126 ++++++++++++++++++++++ doc/developer/semantics/README.md | 4 +- 3 files changed, 130 insertions(+), 1 deletion(-) create mode 100644 doc/developer/semantics/Mz/Aggregate.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 5529fad4a7a0f..89ae564c77da5 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -13,3 +13,4 @@ import Mz.ErrStream import Mz.Pushdown import Mz.DiffSemiring import Mz.UnifiedStream +import Mz.Aggregate diff --git a/doc/developer/semantics/Mz/Aggregate.lean b/doc/developer/semantics/Mz/Aggregate.lean new file mode 100644 index 0000000000000..73c6b27a44772 --- /dev/null +++ b/doc/developer/semantics/Mz/Aggregate.lean @@ -0,0 +1,126 @@ +import Mz.Eval + +/-! +# Aggregates + +Strict aggregate reductions over `List Datum`. The spec rule is that +SQL aggregates such as `SUM`, `MIN`, `MAX`, and `AVG` propagate +`err` like strict scalar functions: any `err` input produces an +`err` output, with a deterministic choice of payload. `NULL` values +are skipped, mirroring PostgreSQL's `COUNT(expr)` and the +"`IGNORE NULLS`" reading of the other aggregates. + +`COUNT` itself is `COUNT(*)` (count all rows) or `COUNT(expr)` +(count rows where `expr` is neither `NULL` nor `err`). The skeleton +models the latter via `aggCountNonNull`. 
+ +`try_sum`, `try_avg`, and friends — the non-strict variants that +swallow `err` into `NULL` instead of propagating — are future work. +They satisfy a *coalesce*-style law rather than a strict-propagation +law. +-/ + +namespace Mz + +/-- `COUNT(expr)`: count rows whose value is neither `NULL` nor an +`err`. Matches the PostgreSQL aggregate. -/ +def aggCountNonNull : List Datum → Nat + | [] => 0 + | .bool _ :: rest => 1 + aggCountNonNull rest + | .null :: rest => aggCountNonNull rest + | .err _ :: rest => aggCountNonNull rest + +/-- Strict aggregate reduction. `err` propagates: the first `err` +encountered is returned. `NULL`s are skipped. The reducer `f` is +applied to the surviving values; an empty list (or a list of only +`NULL`s) returns `NULL`. -/ +def aggStrict (f : Datum → Datum → Datum) : List Datum → Datum + | [] => .null + | .err e :: _ => .err e + | .null :: rest => aggStrict f rest + | x@(.bool _) :: rest => + match aggStrict f rest with + | .err e => .err e + | .null => x + | r => f x r + +/-! ## Strict propagation laws -/ + +/-- If any input is an `err`, the aggregate result is an `err`. The +exact payload is whichever `err` `aggStrict` selects (the first one +in scan order under this definition). 
-/ +theorem aggStrict_err : + ∀ {xs : List Datum} (f : Datum → Datum → Datum), + (∃ d ∈ xs, d.IsErr) → (aggStrict f xs).IsErr + | [], _, h => by + obtain ⟨_, hmem, _⟩ := h + cases hmem + | Datum.err e :: _, _, _ => by + show (Datum.err e).IsErr + trivial + | Datum.null :: rest, f, h => by + obtain ⟨d, hmem, hsafe⟩ := h + cases hmem with + | head _ => exact hsafe.elim + | tail _ h' => + show (aggStrict f rest).IsErr + exact aggStrict_err f ⟨d, h', hsafe⟩ + | Datum.bool b :: rest, f, h => by + obtain ⟨d, hmem, hsafe⟩ := h + cases hmem with + | head _ => exact hsafe.elim + | tail _ h' => + have h_rest : (aggStrict f rest).IsErr := + aggStrict_err f ⟨d, h', hsafe⟩ + cases h_eval : aggStrict f rest with + | err e' => + show (match aggStrict f rest with + | Datum.err e => Datum.err e + | Datum.null => Datum.bool b + | r => f (Datum.bool b) r).IsErr + rw [h_eval]; trivial + | null => rw [h_eval] at h_rest; exact h_rest.elim + | bool _ => rw [h_eval] at h_rest; exact h_rest.elim + +/-- Dual: if no input is an `err`, the aggregate result is not an +`err`. The reducer `f` is assumed to preserve "no-err": applied to +two non-`err` values it produces a non-`err` value. 
-/ +theorem aggStrict_no_err + (f : Datum → Datum → Datum) + (f_safe : ∀ x y, ¬x.IsErr → ¬y.IsErr → ¬(f x y).IsErr) : + ∀ {xs : List Datum}, (∀ d ∈ xs, ¬d.IsErr) → ¬(aggStrict f xs).IsErr + | [], _ => by + show ¬(Datum.null).IsErr + intro h; cases h + | Datum.err _ :: _, h => by + exact (h _ (List.Mem.head _) trivial).elim + | Datum.null :: rest, h => by + show ¬(aggStrict f rest).IsErr + apply aggStrict_no_err f f_safe + intro d hmem; exact h d (List.Mem.tail _ hmem) + | Datum.bool b :: rest, h => by + have h_rest : ¬(aggStrict f rest).IsErr := by + apply aggStrict_no_err f f_safe + intro d hmem; exact h d (List.Mem.tail _ hmem) + have hb : ¬(Datum.bool b).IsErr := h _ (List.Mem.head _) + cases h_eval : aggStrict f rest with + | err e => + rw [h_eval] at h_rest + exact absurd trivial h_rest + | null => + show ¬(match aggStrict f rest with + | Datum.err e => Datum.err e + | Datum.null => Datum.bool b + | r => f (Datum.bool b) r).IsErr + rw [h_eval]; exact hb + | bool b' => + show ¬(match aggStrict f rest with + | Datum.err e => Datum.err e + | Datum.null => Datum.bool b + | r => f (Datum.bool b) r).IsErr + rw [h_eval] + apply f_safe + · exact hb + · intro h_eq; cases h_eq + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 6921f99f0b9ff..fb8ee4b250537 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -24,6 +24,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Pushdown.lean`: substitution (`Expr.subst`) plus the headline `eval_subst` theorem (substituting then evaluating against the original row equals evaluating against the projected row), and the relational predicate-pushdown rewrite `filterRel p (project es rel) = project es (filterRel (p.subst es) rel)`. * `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. 
Provides `+`, `*`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity laws that downstream operators must respect. * `Mz/UnifiedStream.lean`: unified single-collection alternative to `BagStream`. `UnifiedRow` is `row ⊕ err`, so errors flow through the same carrier as data rows. `ofBag` / `split` conversions, with the round-trip theorem `split (ofBag s) = s`. The unified form matches the spec's diff-semiring target; the split `BagStream` is a runtime concession the conversion reconciles. +* `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. Two theorems: `aggStrict_err` (any `err` input → `err` output) and `aggStrict_no_err` (no-err inputs and a no-err-preserving reducer → no-err output). ## What is not here @@ -73,7 +74,8 @@ The roadmap in priority order: * `BagStream.filter` commutativity. Data field commutes by `filterRel_comm`; the error field requires a notion of multiset equality on `List EvalError` since list-order differs across permutations. * Tie `DiffWithError` to a concrete dataflow operator: model a `(Row, Time, DiffWithError ℤ)` triple stream and prove that an `error` diff at time `t` propagates to every downstream consolidation. * Joins on `BagStream` with explicit error propagation. -* Aggregates: variadic reductions (`SUM`, `MIN`, etc.) with strict-on-err semantics and `try_*` non-strict variants. +* `try_*` non-strict aggregate variants: swallow `err` into `NULL` instead of propagating. Coalesce-style law rather than strict. +* `GROUP BY` semantics: partition rows by key, run `aggStrict` per group. `Datum.err` keys form their own group (per the spec's grouping rule). * Tightening `Expr.might_error`. 
The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. From aeba83058602288c63c1d62a6fd09216d3e04b51 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 09:23:44 +0200 Subject: [PATCH 018/127] =?UTF-8?q?doc/semantics:=20add=20aggTry=20?= =?UTF-8?q?=E2=80=94=20non-strict=20(try=5F*)=20aggregate=20variants?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model the proposed `try_sum`, `try_min`, etc. that swallow `err` into `NULL` instead of propagating. Defined as a post-pass coalesce on `aggStrict`: aggTry f xs = match aggStrict f xs with | err _ => null | r => r Equivalent reading via the existing `evalCoalesce`: aggTry f xs = evalCoalesce [aggStrict f xs, .null] The defining property `aggTry_no_err` says the result is never an `err`. The companion `aggTry_eq_aggStrict_of_no_err` says the non-strict variant agrees with the strict one whenever the strict one would not have erred — so an optimizer that has already proved the inputs error-free can swap `aggTry` for `aggStrict` (and vice versa). Both lemmas reduce to case analysis on `aggStrict f xs`. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/Aggregate.lean | 42 +++++++++++++++++++++++ doc/developer/semantics/README.md | 3 +- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/doc/developer/semantics/Mz/Aggregate.lean b/doc/developer/semantics/Mz/Aggregate.lean index 73c6b27a44772..000ff9fe701e2 100644 --- a/doc/developer/semantics/Mz/Aggregate.lean +++ b/doc/developer/semantics/Mz/Aggregate.lean @@ -123,4 +123,46 @@ theorem aggStrict_no_err · exact hb · intro h_eq; cases h_eq +/-! ## Non-strict (`try_*`) variants + +The proposed `try_sum`, `try_min`, etc. swallow `err` into `NULL` +instead of propagating. 
Defined here as a post-pass coalesce on +`aggStrict`'s output: an `err` result becomes `.null`; anything +else is unchanged. The skeleton's `evalCoalesce` exhibits the same +"`null` beats `err`" tiebreak, so an equivalent reading is + + aggTry f xs = evalCoalesce [aggStrict f xs, .null] + +The defining property is `aggTry_no_err`: the result is never an +`err`. The companion `aggTry_eq_aggStrict_of_no_err` says the +non-strict variant agrees with the strict one whenever the strict +one would not have erred — so an optimizer that has proved the +inputs error-free can swap `aggTry` for `aggStrict`. -/ + +def aggTry (f : Datum → Datum → Datum) (xs : List Datum) : Datum := + match aggStrict f xs with + | Datum.err _ => Datum.null + | r => r + +theorem aggTry_no_err (f : Datum → Datum → Datum) (xs : List Datum) : + ¬(aggTry f xs).IsErr := by + show ¬(match aggStrict f xs with + | Datum.err _ => Datum.null + | r => r).IsErr + cases aggStrict f xs <;> intro h <;> cases h + +theorem aggTry_eq_aggStrict_of_no_err + (f : Datum → Datum → Datum) + (f_safe : ∀ x y, ¬x.IsErr → ¬y.IsErr → ¬(f x y).IsErr) + {xs : List Datum} (h : ∀ d ∈ xs, ¬d.IsErr) : + aggTry f xs = aggStrict f xs := by + have h_safe : ¬(aggStrict f xs).IsErr := aggStrict_no_err f f_safe h + show (match aggStrict f xs with + | Datum.err _ => Datum.null + | r => r) = aggStrict f xs + cases h_eval : aggStrict f xs with + | err _ => exact absurd (h_eval ▸ h_safe) (fun h' => h' trivial) + | null => rfl + | bool _ => rfl + end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index fb8ee4b250537..9d9cd658ee206 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -24,7 +24,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Pushdown.lean`: substitution (`Expr.subst`) plus the headline `eval_subst` theorem (substituting then evaluating against the original row equals evaluating against the projected row), and 
the relational predicate-pushdown rewrite `filterRel p (project es rel) = project es (filterRel (p.subst es) rel)`. * `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity laws that downstream operators must respect. * `Mz/UnifiedStream.lean`: unified single-collection alternative to `BagStream`. `UnifiedRow` is `row ⊕ err`, so errors flow through the same carrier as data rows. `ofBag` / `split` conversions, with the round-trip theorem `split (ofBag s) = s`. The unified form matches the spec's diff-semiring target; the split `BagStream` is a runtime concession the conversion reconciles. -* `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. Two theorems: `aggStrict_err` (any `err` input → `err` output) and `aggStrict_no_err` (no-err inputs and a no-err-preserving reducer → no-err output). +* `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. `aggTry` for the proposed `try_sum`/`try_min`/`try_max` variants that swallow `err` into `NULL` instead of propagating, defined as a post-pass on `aggStrict`. Theorems: `aggStrict_err` (any `err` input → `err` output), `aggStrict_no_err` (no-err inputs + no-err reducer → no-err output), `aggTry_no_err` (the non-strict variant never errors), and `aggTry_eq_aggStrict_of_no_err` (strict and non-strict agree on error-free inputs). ## What is not here @@ -74,7 +74,6 @@ The roadmap in priority order: * `BagStream.filter` commutativity. 
Data field commutes by `filterRel_comm`; the error field requires a notion of multiset equality on `List EvalError` since list-order differs across permutations. * Tie `DiffWithError` to a concrete dataflow operator: model a `(Row, Time, DiffWithError ℤ)` triple stream and prove that an `error` diff at time `t` propagates to every downstream consolidation. * Joins on `BagStream` with explicit error propagation. -* `try_*` non-strict aggregate variants: swallow `err` into `NULL` instead of propagating. Coalesce-style law rather than strict. * `GROUP BY` semantics: partition rows by key, run `aggStrict` per group. `Datum.err` keys form their own group (per the spec's grouping rule). * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. From 773a180563712599c1b284796cfe9b605145f26b Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 09:26:53 +0200 Subject: [PATCH 019/127] =?UTF-8?q?doc/semantics:=20add=20Consolidate=20?= =?UTF-8?q?=E2=80=94=20diff-summation=20absorption=20laws?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model the diff-only slice of differential dataflow's `compact` / consolidation operation: summing a list of `DiffWithError α` values and showing that an `error` diff absorbs the consolidated sum. * `DiffWithError.sumAll` sums a list of diffs starting from `0`, right-associative. The result lives in `DiffWithError α` for any base `α` with `Zero` and `Add` instances. * `sumAll_eq_error_of_mem`: the headline absorption theorem. If `error` appears anywhere in the list, the consolidated sum is `error`. Proof walks the list and uses the absorption laws from `Mz/DiffSemiring.lean` at the matching cons. * `sumAll_val_of_all_val`: companion no-error-preservation theorem. 
An all-`val` list sums to `val` of some `α`. The full `compact` operator also buckets records by `(row, time)`; that bucketing is orthogonal to the absorption argument and would require `DecidableEq Row` plus a time model, both follow-ups. The per-bucket inner sum modeled here is what an operator-level proof would invoke once those pieces are in place. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/Consolidate.lean | 87 +++++++++++++++++++++ doc/developer/semantics/README.md | 1 + 3 files changed, 89 insertions(+) create mode 100644 doc/developer/semantics/Mz/Consolidate.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 89ae564c77da5..a797f7b242cfe 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -14,3 +14,4 @@ import Mz.Pushdown import Mz.DiffSemiring import Mz.UnifiedStream import Mz.Aggregate +import Mz.Consolidate diff --git a/doc/developer/semantics/Mz/Consolidate.lean b/doc/developer/semantics/Mz/Consolidate.lean new file mode 100644 index 0000000000000..b957353a06909 --- /dev/null +++ b/doc/developer/semantics/Mz/Consolidate.lean @@ -0,0 +1,87 @@ +import Mz.DiffSemiring + +/-! +# Consolidation + +Differential dataflow's `compact` operation sums the diffs of +records that share the same `(row, time)` key. The skeleton models +the diff-only slice of that operation: summing a list of +`DiffWithError α` values. The interesting property is that an +`error` diff in the list absorbs the whole sum to `error`, +regardless of what the other diffs are. + +This is the algebraic statement an actual `compact` operator would +cite when arguing that global errors propagate through +consolidation: as soon as any record's diff is `error`, the +consolidated diff for that key is `error`, so downstream operators +treat the key's contribution as invalid. 
+ +The full operator would also bucket records by `(row, time)`, +which requires `DecidableEq` on `Row` and a notion of time. That +bucketing is orthogonal to the absorption argument; the skeleton +gives the single-bucket version, which is the per-key inner sum. +-/ + +namespace Mz + +namespace DiffWithError + +variable {α : Type} + +/-- Sum a list of diffs, starting from `0`. The fold is right- +associative; with `+` being commutative on the base type, the +result is order-independent. -/ +def sumAll [Zero α] [Add α] : List (DiffWithError α) → DiffWithError α + | [] => 0 + | d :: rest => d + sumAll rest + +/-! ## Absorption -/ + +/-- If `error` appears anywhere in the diff list, the consolidated +sum is `error`. The proof walks the list and uses the absorption +laws from `Mz/DiffSemiring.lean` at the matching cons. -/ +theorem sumAll_eq_error_of_mem [Zero α] [Add α] + {ds : List (DiffWithError α)} (h : (error : DiffWithError α) ∈ ds) : + sumAll ds = error := by + induction ds with + | nil => cases h + | cons hd tl ih => + cases h with + | head _ => + -- hd = error; first cons step is `error + sumAll tl`, which is `error`. + show (error : DiffWithError α) + sumAll tl = error + exact error_add_left _ + | tail _ h_tl => + -- error is in tl. By IH `sumAll tl = error`, then `hd + error = error`. + show hd + sumAll tl = error + rw [ih h_tl] + exact error_add_right hd + +/-! ## No-error preservation + +If every diff in the list is an honest `val x`, the consolidated +sum is also `val` of *some* `α`. The exact value depends on the +base addition, which the skeleton does not commit to here. 
-/ + +theorem sumAll_val_of_all_val [Zero α] [Add α] + {ds : List (DiffWithError α)} + (h : ∀ d ∈ ds, ∃ x : α, d = val x) : + ∃ x : α, sumAll ds = val x := by + induction ds with + | nil => + show ∃ x : α, (0 : DiffWithError α) = val x + refine ⟨0, ?_⟩ + rfl + | cons hd tl ih => + obtain ⟨xh, hh_eq⟩ := h hd (List.Mem.head _) + have htl : ∀ d ∈ tl, ∃ x : α, d = val x := + fun d hd_mem => h d (List.Mem.tail _ hd_mem) + obtain ⟨xt, ht_eq⟩ := ih htl + refine ⟨xh + xt, ?_⟩ + show hd + sumAll tl = val (xh + xt) + rw [hh_eq, ht_eq] + rfl + +end DiffWithError + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 9d9cd658ee206..12889e29ada97 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -25,6 +25,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity laws that downstream operators must respect. * `Mz/UnifiedStream.lean`: unified single-collection alternative to `BagStream`. `UnifiedRow` is `row ⊕ err`, so errors flow through the same carrier as data rows. `ofBag` / `split` conversions, with the round-trip theorem `split (ofBag s) = s`. The unified form matches the spec's diff-semiring target; the split `BagStream` is a runtime concession the conversion reconciles. * `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. `aggTry` for the proposed `try_sum`/`try_min`/`try_max` variants that swallow `err` into `NULL` instead of propagating, defined as a post-pass on `aggStrict`. 
Theorems: `aggStrict_err` (any `err` input → `err` output), `aggStrict_no_err` (no-err inputs + no-err reducer → no-err output), `aggTry_no_err` (the non-strict variant never errors), and `aggTry_eq_aggStrict_of_no_err` (strict and non-strict agree on error-free inputs). +* `Mz/Consolidate.lean`: per-key diff summation over `List (DiffWithError α)`. The headline `sumAll_eq_error_of_mem` proves that an `error` diff anywhere in the list absorbs the consolidated sum to `error`, which is the property a differential dataflow `compact` operator cites when propagating global errors through consolidation. Companion `sumAll_val_of_all_val` says an all-`val` list sums to `val` of some base value. ## What is not here From 9fb971b12da83b0ad078c65362943ab324b4ed02 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 09:37:16 +0200 Subject: [PATCH 020/127] doc/semantics: tie DiffWithError to (Row, Time, Diff) triple stream MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `TimedRecord` and `TimedStream` to model differential dataflow's record format with errors. `Time := Nat`; the diff is `DiffWithError α` parametric in the base diff type. Operations: * `TimedStream.consolidateAll` sums every diff in the stream, ignoring row and time. The collection-wide diff. * `TimedStream.consolidateAt t` sums every diff at time `t`, ignoring row. The per-time collection diff. Both reduce to `DiffWithError.sumAll`, so the absorption laws from `Mz/Consolidate.lean` transport: * `consolidateAll_eq_error_of_mem`: any `error`-carrying record forces the all-stream consolidation to `error`. * `consolidateAt_eq_error_of_mem`: any `error`-carrying record at time `t` forces the per-time consolidation at `t` to `error`. Per-`(row, time)` bucketing — the form `compact` actually uses in the runtime — is the next refinement and requires `DecidableEq` on `Row`. 
The current sums are the collection-global diffs at each time slice, which is what operator-level proofs invoke once the bucketing is layered on. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/Triple.lean | 80 ++++++++++++++++++++++++++ doc/developer/semantics/README.md | 1 + 3 files changed, 82 insertions(+) create mode 100644 doc/developer/semantics/Mz/Triple.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index a797f7b242cfe..cc9d114089069 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -15,3 +15,4 @@ import Mz.DiffSemiring import Mz.UnifiedStream import Mz.Aggregate import Mz.Consolidate +import Mz.Triple diff --git a/doc/developer/semantics/Mz/Triple.lean b/doc/developer/semantics/Mz/Triple.lean new file mode 100644 index 0000000000000..5e9971e216c4f --- /dev/null +++ b/doc/developer/semantics/Mz/Triple.lean @@ -0,0 +1,80 @@ +import Mz.Bag +import Mz.DiffSemiring +import Mz.Consolidate + +/-! +# Timed records — `(Row, Time, Diff)` triple stream + +A first sketch of differential dataflow's record format: a stream +of `(row, time, diff)` triples, where the diff lives in +`DiffWithError α`. The skeleton uses `Nat` for time and parametrizes +over the base diff type. + +The operations modeled are consolidation by time (sum diffs across +all rows at a given time) and consolidation across the whole stream +(sum every diff). Both reduce to `DiffWithError.sumAll`, so the +absorption laws from `Mz/Consolidate.lean` transport directly: if +any record in the consolidated range carries an `error` diff, the +consolidated total is `error`. + +Per-`(row, time)` bucketing is the next refinement and requires +`DecidableEq` on `Row`; the present file does the simpler "sum +everything in the time slice" version, which is the per-time +collection-global diff. 
+-/ + +namespace Mz + +/-- A timed record: row, time, and a diff value possibly carrying +the absorbing `error` marker. -/ +structure TimedRecord (α : Type) where + row : Row + time : Nat + diff : DiffWithError α + deriving Inhabited + +/-- A stream of timed records. Order does not matter; the operations +below are insensitive to order whenever the base `Add` on `α` is +commutative. -/ +abbrev TimedStream (α : Type) := List (TimedRecord α) + +/-- Sum every diff in the stream, ignoring row and time. The +collection-wide diff. -/ +def TimedStream.consolidateAll [Zero α] [Add α] (s : TimedStream α) : + DiffWithError α := + DiffWithError.sumAll (s.map (·.diff)) + +/-- Sum every diff at a given time, ignoring row. The per-time +collection diff. -/ +def TimedStream.consolidateAt [Zero α] [Add α] (t : Nat) (s : TimedStream α) : + DiffWithError α := + DiffWithError.sumAll ((s.filter (·.time = t)).map (·.diff)) + +/-! ## Absorption -/ + +/-- If any record carries an `error` diff, the all-stream consolidation +is `error`. -/ +theorem TimedStream.consolidateAll_eq_error_of_mem [Zero α] [Add α] + {s : TimedStream α} (r : TimedRecord α) + (h_mem : r ∈ s) (h_err : r.diff = DiffWithError.error) : + TimedStream.consolidateAll s = DiffWithError.error := by + unfold TimedStream.consolidateAll + apply DiffWithError.sumAll_eq_error_of_mem + -- Need: error ∈ s.map (·.diff). Since r ∈ s and r.diff = error, by List.mem_map. + refine List.mem_map.mpr ⟨r, h_mem, ?_⟩ + exact h_err + +/-- Same statement restricted to a single time slice: an `error` +record at time `t` forces the per-time consolidation at `t` to +`error`. 
-/ +theorem TimedStream.consolidateAt_eq_error_of_mem [Zero α] [Add α] + {s : TimedStream α} (t : Nat) (r : TimedRecord α) + (h_mem : r ∈ s) (h_time : r.time = t) + (h_err : r.diff = DiffWithError.error) : + TimedStream.consolidateAt t s = DiffWithError.error := by + unfold TimedStream.consolidateAt + apply DiffWithError.sumAll_eq_error_of_mem + refine List.mem_map.mpr ⟨r, ?_, h_err⟩ + exact List.mem_filter.mpr ⟨h_mem, by simp [h_time]⟩ + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 12889e29ada97..08d31174bd274 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -26,6 +26,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/UnifiedStream.lean`: unified single-collection alternative to `BagStream`. `UnifiedRow` is `row ⊕ err`, so errors flow through the same carrier as data rows. `ofBag` / `split` conversions, with the round-trip theorem `split (ofBag s) = s`. The unified form matches the spec's diff-semiring target; the split `BagStream` is a runtime concession the conversion reconciles. * `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. `aggTry` for the proposed `try_sum`/`try_min`/`try_max` variants that swallow `err` into `NULL` instead of propagating, defined as a post-pass on `aggStrict`. Theorems: `aggStrict_err` (any `err` input → `err` output), `aggStrict_no_err` (no-err inputs + no-err reducer → no-err output), `aggTry_no_err` (the non-strict variant never errors), and `aggTry_eq_aggStrict_of_no_err` (strict and non-strict agree on error-free inputs). * `Mz/Consolidate.lean`: per-key diff summation over `List (DiffWithError α)`. 
The headline `sumAll_eq_error_of_mem` proves that an `error` diff anywhere in the list absorbs the consolidated sum to `error`, which is the property a differential dataflow `compact` operator cites when propagating global errors through consolidation. Companion `sumAll_val_of_all_val` says an all-`val` list sums to `val` of some base value. +* `Mz/Triple.lean`: `TimedRecord = (row, time, diff)` triple stream tying `DiffWithError` to differential dataflow's record format. Defines `consolidateAll` and `consolidateAt t`; lifts `Consolidate`'s absorption to per-time and stream-wide statements. Per-`(row, time)` bucketing is a follow-up — needs `DecidableEq` on `Row`. ## What is not here From 43fd98a63f67b64dec390d0be8d338f8a41fc5d3 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 09:50:48 +0200 Subject: [PATCH 021/127] =?UTF-8?q?doc/semantics:=20add=20Join=20=E2=80=94?= =?UTF-8?q?=20cross=20product=20and=20predicate=20join=20on=20UnifiedStrea?= =?UTF-8?q?m?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two-input relational join modeled on the unified single-collection stream. Cross product is the building block; predicate join is cross followed by `UnifiedStream.filter`. * `UnifiedStream.cross l r` produces one output record per `(lu, ru)` pair: both rows ⇒ concatenated row; either side `err` ⇒ that side's `err` (left wins on conflict, matching the binary AND's first-error rule). * `UnifiedStream.join pred l r := (cross l r).filter pred`. Error propagation falls out of the carrier: an `err` record on either side contributes one `err` to the output per record on the other side, matching the diff-semiring's `error * diff = error` intuition. Theorems: `cross_nil_left` and `cross_nil_right` cover the empty cases (joining anything with the empty stream is empty). 
The cardinality theorem `(cross l r).length = l.length * r.length` is deferred — needs `List.length_flatMap` + Nat arithmetic that the current skeleton's lemma toolkit handles awkwardly. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/Join.lean | 59 ++++++++++++++++++++++++++++ doc/developer/semantics/README.md | 5 ++- 3 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 doc/developer/semantics/Mz/Join.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index cc9d114089069..f9ff98954d89e 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -16,3 +16,4 @@ import Mz.UnifiedStream import Mz.Aggregate import Mz.Consolidate import Mz.Triple +import Mz.Join diff --git a/doc/developer/semantics/Mz/Join.lean b/doc/developer/semantics/Mz/Join.lean new file mode 100644 index 0000000000000..f81f53ac0be61 --- /dev/null +++ b/doc/developer/semantics/Mz/Join.lean @@ -0,0 +1,59 @@ +import Mz.Eval +import Mz.Bag +import Mz.ErrStream +import Mz.UnifiedStream + +/-! +# Joins on `UnifiedStream` + +Two-input relational join modeled on the unified single-collection +stream. The cartesian product `cross l r` is the building block; +`join pred l r` filters the product through a join predicate. + +Error propagation follows naturally from the carrier: every +`(lu, ru)` pair contributes one output, and that output is an +`err` whenever either side of the pair is an `err`. The +multiplicity matches the semiring intuition that +`error * diff = error`: an `err` record in `l` produces one `err` +in the output for every record in `r`, and vice versa. + +`cross` makes no commitment to row schema beyond list +concatenation. Schema-aware joins (equi-joins on named columns) +would lift to this with a column-substitution layer. +-/ + +namespace Mz + +/-- Cartesian product of two unified streams. 
+ +For each pair `(lu, ru)`, produce one output: +* both real rows ⇒ concatenated row; +* either side is `err` ⇒ that side's `err` payload (left wins on + conflict, matching `evalAnd`'s first-error rule). -/ +def UnifiedStream.cross (l r : UnifiedStream) : UnifiedStream := + l.flatMap fun lu => + r.map fun ru => + match lu, ru with + | .row la, .row rb => .row (la ++ rb) + | .err e, _ => .err e + | _, .err e => .err e + +/-- Equi-join or theta-join: cross product filtered by a predicate. +The predicate evaluates against the concatenated row; existing +`UnifiedStream.filter` semantics apply (predicate `.err` routes the +row's error into the carrier). -/ +def UnifiedStream.join (pred : Expr) (l r : UnifiedStream) : UnifiedStream := + (UnifiedStream.cross l r).filter pred + +/-! ## Empty cases -/ + +theorem UnifiedStream.cross_nil_left (r : UnifiedStream) : + UnifiedStream.cross [] r = [] := rfl + +theorem UnifiedStream.cross_nil_right (l : UnifiedStream) : + UnifiedStream.cross l [] = [] := by + induction l with + | nil => rfl + | cons _ tl _ih => simp [UnifiedStream.cross, List.map_nil, List.flatMap_cons] + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 08d31174bd274..2037d7031de4b 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -27,6 +27,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. `aggTry` for the proposed `try_sum`/`try_min`/`try_max` variants that swallow `err` into `NULL` instead of propagating, defined as a post-pass on `aggStrict`. 
Theorems: `aggStrict_err` (any `err` input → `err` output), `aggStrict_no_err` (no-err inputs + no-err reducer → no-err output), `aggTry_no_err` (the non-strict variant never errors), and `aggTry_eq_aggStrict_of_no_err` (strict and non-strict agree on error-free inputs). * `Mz/Consolidate.lean`: per-key diff summation over `List (DiffWithError α)`. The headline `sumAll_eq_error_of_mem` proves that an `error` diff anywhere in the list absorbs the consolidated sum to `error`, which is the property a differential dataflow `compact` operator cites when propagating global errors through consolidation. Companion `sumAll_val_of_all_val` says an all-`val` list sums to `val` of some base value. * `Mz/Triple.lean`: `TimedRecord = (row, time, diff)` triple stream tying `DiffWithError` to differential dataflow's record format. Defines `consolidateAll` and `consolidateAt t`; lifts `Consolidate`'s absorption to per-time and stream-wide statements. Per-`(row, time)` bucketing is a follow-up — needs `DecidableEq` on `Row`. +* `Mz/Join.lean`: relational joins on `UnifiedStream`. `cross` is the cartesian product; `join pred l r` filters the product through a join predicate. Errors propagate through the carrier — an `err` record on either side contributes one `err` to the output for every record on the other side, matching the diff-semiring's `error * diff = error`. Theorems cover the empty cases. ## What is not here @@ -76,7 +77,9 @@ The roadmap in priority order: * `BagStream.filter` commutativity. Data field commutes by `filterRel_comm`; the error field requires a notion of multiset equality on `List EvalError` since list-order differs across permutations. * Tie `DiffWithError` to a concrete dataflow operator: model a `(Row, Time, DiffWithError ℤ)` triple stream and prove that an `error` diff at time `t` propagates to every downstream consolidation. * Joins on `BagStream` with explicit error propagation. -* `GROUP BY` semantics: partition rows by key, run `aggStrict` per group. 
`Datum.err` keys form their own group (per the spec's grouping rule). +* `GROUP BY` semantics: partition rows by key, run `aggStrict` per group. `Datum.err` keys form their own group (per the spec's grouping rule). Requires `DecidableEq Row`. +* Join cardinality theorem: `(cross l r).length = l.length * r.length`. Skipped in the current skeleton; the proof needs `List.length_flatMap` + Nat arithmetic. +* Sketch a proof of `cross_assoc` modulo row concatenation associativity. * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. From 648b1d404cd63526e660362da498576b916b2fb0 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 09:53:55 +0200 Subject: [PATCH 022/127] =?UTF-8?q?doc/semantics:=20add=20GROUP=20BY=20?= =?UTF-8?q?=E2=80=94=20groupBy=20+=20aggregateBy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Partition a relation by an evaluated key expression and aggregate per group. Closes the SQL `SELECT k, AGG(v) FROM r GROUP BY k` shape the rest of the algebra implies. * `Datum` derives `DecidableEq` (via the already-decidable `EvalError` payload) so keys can be compared at runtime. * `groupBy keyExpr rel` walks rows, evaluates the key on each, and inserts into the existing group with that key or starts a new one. Output is `List (Datum × Relation)`. * `aggregateBy keyExpr valExpr f rel` applies `aggStrict` to each group's evaluated value column, producing `List (Datum × Datum)` — one aggregated value per group. Spec divergence on `err` keys documented inline: the skeleton uses standard `DecidableEq Datum`, so two `Datum.err e` values with the same payload collapse into one group. The spec requires every err key to be its own group. 
The natural refinement is a custom `Datum.groupKeyEq` that returns `false` whenever either side is an err; left for future work. Theorems for now cover the trivial cases (empty, singleton). Cardinality (`sum of group sizes = rel.length`) is the next concrete law and is deferred until the lemma toolkit picks up the needed Nat / List arithmetic helpers. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/Datum.lean | 2 +- doc/developer/semantics/Mz/GroupBy.lean | 66 +++++++++++++++++++++++++ doc/developer/semantics/README.md | 4 +- 4 files changed, 71 insertions(+), 2 deletions(-) create mode 100644 doc/developer/semantics/Mz/GroupBy.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index f9ff98954d89e..0be2f074b3aa7 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -17,3 +17,4 @@ import Mz.Aggregate import Mz.Consolidate import Mz.Triple import Mz.Join +import Mz.GroupBy diff --git a/doc/developer/semantics/Mz/Datum.lean b/doc/developer/semantics/Mz/Datum.lean index 16f2f8841c9e6..49fe205e0a7b7 100644 --- a/doc/developer/semantics/Mz/Datum.lean +++ b/doc/developer/semantics/Mz/Datum.lean @@ -35,7 +35,7 @@ inductive Datum | bool (b : Bool) | null | err (e : EvalError) - deriving Inhabited + deriving DecidableEq, Inhabited /-- Propositional predicate "this datum is an error". diff --git a/doc/developer/semantics/Mz/GroupBy.lean b/doc/developer/semantics/Mz/GroupBy.lean new file mode 100644 index 0000000000000..fdd37532347e2 --- /dev/null +++ b/doc/developer/semantics/Mz/GroupBy.lean @@ -0,0 +1,66 @@ +import Mz.Eval +import Mz.Bag +import Mz.Aggregate + +/-! +# `GROUP BY` + +Partition a relation by the value of a key expression, evaluated on +each row. The skeleton uses Lean's derived `DecidableEq Datum` to +compare keys. 
+ +## Spec divergence on `err` keys + +The error-handling spec states that `Datum.err` keys form their own +group — every error is its own bucket, so distinct failures never +collapse into the same aggregate. The skeleton's `groupBy` uses +standard `DecidableEq`, which treats two `Datum.err e` values with +the same `e` as equal. A spec-faithful variant would special-case +`err` keys to always create a new group; the spec divergence is +called out here and is the natural next refinement. + +The companion `aggregateBy` runs `aggStrict` per group, producing +`(key, value)` pairs. +-/ + +namespace Mz + +/-- Insert `row` into the group keyed by `k`. If no group with key +`k` exists yet, create one; otherwise prepend `row` to the existing +group's row list. -/ +private def insertInto (k : Datum) (row : Row) : + List (Datum × Relation) → List (Datum × Relation) + | [] => [(k, [row])] + | (k', rows) :: rest => + if k = k' then (k', row :: rows) :: rest + else (k', rows) :: insertInto k row rest + +/-- `GROUP BY keyExpr`: partition `rel` by the value of `keyExpr` +on each row. Output is a list of `(key, rows)` pairs, one per +distinct key, in encounter order. -/ +def groupBy (keyExpr : Expr) (rel : Relation) : List (Datum × Relation) := + rel.foldr (fun row acc => insertInto (eval row keyExpr) row acc) [] + +/-- Aggregate per group: run `aggStrict` over each group's evaluated +column. Models the `SELECT keyExpr, SUM(valExpr) FROM rel GROUP BY +keyExpr` flow. -/ +def aggregateBy + (keyExpr valExpr : Expr) (f : Datum → Datum → Datum) + (rel : Relation) : List (Datum × Datum) := + (groupBy keyExpr rel).map fun grp => + (grp.1, aggStrict f (grp.2.map (fun row => eval row valExpr))) + +/-! 
## Trivial cases -/ + +theorem groupBy_nil (keyExpr : Expr) : + groupBy keyExpr [] = [] := rfl + +theorem aggregateBy_nil (keyExpr valExpr : Expr) (f : Datum → Datum → Datum) : + aggregateBy keyExpr valExpr f [] = [] := rfl + +/-- A single-row relation produces exactly one group containing that +row, keyed by the row's evaluated key. -/ +theorem groupBy_singleton (keyExpr : Expr) (row : Row) : + groupBy keyExpr [row] = [(eval row keyExpr, [row])] := rfl + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 2037d7031de4b..930a99c3fd336 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -28,6 +28,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Consolidate.lean`: per-key diff summation over `List (DiffWithError α)`. The headline `sumAll_eq_error_of_mem` proves that an `error` diff anywhere in the list absorbs the consolidated sum to `error`, which is the property a differential dataflow `compact` operator cites when propagating global errors through consolidation. Companion `sumAll_val_of_all_val` says an all-`val` list sums to `val` of some base value. * `Mz/Triple.lean`: `TimedRecord = (row, time, diff)` triple stream tying `DiffWithError` to differential dataflow's record format. Defines `consolidateAll` and `consolidateAt t`; lifts `Consolidate`'s absorption to per-time and stream-wide statements. Per-`(row, time)` bucketing is a follow-up — needs `DecidableEq` on `Row`. * `Mz/Join.lean`: relational joins on `UnifiedStream`. `cross` is the cartesian product; `join pred l r` filters the product through a join predicate. Errors propagate through the carrier — an `err` record on either side contributes one `err` to the output for every record on the other side, matching the diff-semiring's `error * diff = error`. Theorems cover the empty cases. 
+* `Mz/GroupBy.lean`: `groupBy keyExpr rel` partitions a relation by an evaluated key, using `DecidableEq Datum`. `aggregateBy keyExpr valExpr f rel` runs `aggStrict` per group, modeling `SELECT keyExpr, AGG(valExpr) ... GROUP BY keyExpr`. Spec divergence on `err` keys (the spec says every err is its own group) is documented inline. ## What is not here @@ -77,7 +78,8 @@ The roadmap in priority order: * `BagStream.filter` commutativity. Data field commutes by `filterRel_comm`; the error field requires a notion of multiset equality on `List EvalError` since list-order differs across permutations. * Tie `DiffWithError` to a concrete dataflow operator: model a `(Row, Time, DiffWithError ℤ)` triple stream and prove that an `error` diff at time `t` propagates to every downstream consolidation. * Joins on `BagStream` with explicit error propagation. -* `GROUP BY` semantics: partition rows by key, run `aggStrict` per group. `Datum.err` keys form their own group (per the spec's grouping rule). Requires `DecidableEq Row`. +* Spec-faithful err-distinct keying for `groupBy`: every `Datum.err` key must form its own bucket, even when two errs have the same payload. Requires either a custom `Datum.groupKeyEq` or wrapping err keys in a fresh-tag type. +* Cardinality theorem for `groupBy`: sum of group sizes equals `rel.length`. Requires `List.foldr_length` + Nat arithmetic. * Join cardinality theorem: `(cross l r).length = l.length * r.length`. Skipped in the current skeleton; the proof needs `List.length_flatMap` + Nat arithmetic. * Sketch a proof of `cross_assoc` modulo row concatenation associativity. * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. 
From e2254c9c44606fa01bb2227aae61c0d665f2d907 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 10:00:08 +0200 Subject: [PATCH 023/127] doc/semantics: spec-faithful err-distinct GROUP BY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `Datum.groupKeyEq` — group-key equivalence that returns `false` whenever either side is `.err`, so two error keys never coalesce even if their payloads happen to match. Build `groupByErrDistinct` and `aggregateByErrDistinct` on top, leaving the existing `groupBy` / `aggregateBy` (which use derived `DecidableEq Datum`) in place as the more permissive variant. Theorems: * `Datum.groupKeyEq_err_left`: simp lemma reducing the err-left case to `false`. * `insertIntoDistinct_err`: inserting an err-keyed row into any group list appends a fresh singleton group at the end. * `groupByErrDistinct_length_of_all_err`: when every row's key evaluates to err, the output has one group per row — closes the spec-divergence note from the previous commit. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/GroupBy.lean | 110 +++++++++++++++++++++--- doc/developer/semantics/README.md | 10 ++- 2 files changed, 104 insertions(+), 16 deletions(-) diff --git a/doc/developer/semantics/Mz/GroupBy.lean b/doc/developer/semantics/Mz/GroupBy.lean index fdd37532347e2..917ef22d305a9 100644 --- a/doc/developer/semantics/Mz/GroupBy.lean +++ b/doc/developer/semantics/Mz/GroupBy.lean @@ -6,25 +6,32 @@ import Mz.Aggregate # `GROUP BY` Partition a relation by the value of a key expression, evaluated on -each row. The skeleton uses Lean's derived `DecidableEq Datum` to -compare keys. +each row. Two grouping primitives: -## Spec divergence on `err` keys +* `groupBy` uses Lean's derived `DecidableEq Datum`, which treats + two `Datum.err e` values with the same payload as equal. Two + rows whose key both evaluate to the same `err e` therefore end + up in the same group. 
+* `groupByErrDistinct` uses the spec-faithful `Datum.groupKeyEq` + predicate that returns `false` whenever either side is `.err`. + Every error row lands in its own singleton group, matching the + design doc's rule that distinct failures must not collapse into + one aggregate. -The error-handling spec states that `Datum.err` keys form their own -group — every error is its own bucket, so distinct failures never -collapse into the same aggregate. The skeleton's `groupBy` uses -standard `DecidableEq`, which treats two `Datum.err e` values with -the same `e` as equal. A spec-faithful variant would special-case -`err` keys to always create a new group; the spec divergence is -called out here and is the natural next refinement. - -The companion `aggregateBy` runs `aggStrict` per group, producing -`(key, value)` pairs. +The companion `aggregateBy` (and `aggregateByErrDistinct`) runs +`aggStrict` per group, producing `(key, value)` pairs. -/ namespace Mz +/-- Spec-faithful group-key equivalence. Two err keys are never +equal — every error is its own group, regardless of payload. On +non-err keys, falls through to structural `DecidableEq`. -/ +@[inline] def Datum.groupKeyEq : Datum → Datum → Bool + | .err _, _ => false + | _, .err _ => false + | a, b => a = b + /-- Insert `row` into the group keyed by `k`. If no group with key `k` exists yet, create one; otherwise prepend `row` to the existing group's row list. -/ @@ -35,12 +42,26 @@ private def insertInto (k : Datum) (row : Row) : if k = k' then (k', row :: rows) :: rest else (k', rows) :: insertInto k row rest +/-- Err-distinct insert. Uses `Datum.groupKeyEq` instead of `=`, so +`err` keys never coalesce with any existing group. 
-/ +private def insertIntoDistinct (k : Datum) (row : Row) : + List (Datum × Relation) → List (Datum × Relation) + | [] => [(k, [row])] + | (k', rows) :: rest => + if Datum.groupKeyEq k k' then (k', row :: rows) :: rest + else (k', rows) :: insertIntoDistinct k row rest + /-- `GROUP BY keyExpr`: partition `rel` by the value of `keyExpr` on each row. Output is a list of `(key, rows)` pairs, one per distinct key, in encounter order. -/ def groupBy (keyExpr : Expr) (rel : Relation) : List (Datum × Relation) := rel.foldr (fun row acc => insertInto (eval row keyExpr) row acc) [] +/-- Spec-faithful `GROUP BY` that never merges `err` keys. -/ +def groupByErrDistinct (keyExpr : Expr) (rel : Relation) : + List (Datum × Relation) := + rel.foldr (fun row acc => insertIntoDistinct (eval row keyExpr) row acc) [] + /-- Aggregate per group: run `aggStrict` over each group's evaluated column. Models the `SELECT keyExpr, SUM(valExpr) FROM rel GROUP BY keyExpr` flow. -/ @@ -50,17 +71,80 @@ def aggregateBy (groupBy keyExpr rel).map fun grp => (grp.1, aggStrict f (grp.2.map (fun row => eval row valExpr))) +/-- Err-distinct aggregate: per-group `aggStrict`, but err keys are +never merged. Since `aggStrict` of a singleton err-keyed group is +just `aggStrict` on that group's values, the practical effect is +that each err key produces its own row in the output. -/ +def aggregateByErrDistinct + (keyExpr valExpr : Expr) (f : Datum → Datum → Datum) + (rel : Relation) : List (Datum × Datum) := + (groupByErrDistinct keyExpr rel).map fun grp => + (grp.1, aggStrict f (grp.2.map (fun row => eval row valExpr))) + /-! 
## Trivial cases -/ theorem groupBy_nil (keyExpr : Expr) : groupBy keyExpr [] = [] := rfl +theorem groupByErrDistinct_nil (keyExpr : Expr) : + groupByErrDistinct keyExpr [] = [] := rfl + theorem aggregateBy_nil (keyExpr valExpr : Expr) (f : Datum → Datum → Datum) : aggregateBy keyExpr valExpr f [] = [] := rfl +theorem aggregateByErrDistinct_nil + (keyExpr valExpr : Expr) (f : Datum → Datum → Datum) : + aggregateByErrDistinct keyExpr valExpr f [] = [] := rfl + /-- A single-row relation produces exactly one group containing that row, keyed by the row's evaluated key. -/ theorem groupBy_singleton (keyExpr : Expr) (row : Row) : groupBy keyExpr [row] = [(eval row keyExpr, [row])] := rfl +theorem groupByErrDistinct_singleton (keyExpr : Expr) (row : Row) : + groupByErrDistinct keyExpr [row] = [(eval row keyExpr, [row])] := rfl + +/-! ## Err-distinct laws -/ + +/-- `Datum.groupKeyEq` is `false` whenever the left side is an err, +regardless of the right side. -/ +@[simp] theorem Datum.groupKeyEq_err_left (e : EvalError) (d : Datum) : + Datum.groupKeyEq (.err e) d = false := by + cases d <;> rfl + +/-- Inserting an err-keyed row into any group list appends the row +as a fresh singleton group at the end, since `groupKeyEq` never +matches an err key. -/ +theorem insertIntoDistinct_err + (e : EvalError) (row : Row) (groups : List (Datum × Relation)) : + insertIntoDistinct (.err e) row groups = + groups ++ [(.err e, [row])] := by + induction groups with + | nil => rfl + | cons head tl ih => + obtain ⟨k', rows⟩ := head + simp [insertIntoDistinct, Datum.groupKeyEq_err_left, ih] + +/-- When every row's key evaluates to an err, the err-distinct +grouping produces exactly one group per row — no merging happens +across rows. 
-/ +theorem groupByErrDistinct_length_of_all_err + (keyExpr : Expr) (rel : Relation) + (h : ∀ row ∈ rel, ∃ e, eval row keyExpr = .err e) : + (groupByErrDistinct keyExpr rel).length = rel.length := by + induction rel with + | nil => rfl + | cons head tl ih => + have hHead : ∃ e, eval head keyExpr = .err e := + h head (List.mem_cons_self) + have hTl : ∀ row ∈ tl, ∃ e, eval row keyExpr = .err e := + fun row hMem => h row (List.mem_cons_of_mem _ hMem) + obtain ⟨e, heq⟩ := hHead + have ihApp := ih hTl + show (insertIntoDistinct (eval head keyExpr) head + (groupByErrDistinct keyExpr tl)).length + = (head :: tl).length + rw [heq, insertIntoDistinct_err, List.length_append, ihApp] + simp [List.length_cons] + end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 930a99c3fd336..3633d9fe29a84 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -28,7 +28,11 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Consolidate.lean`: per-key diff summation over `List (DiffWithError α)`. The headline `sumAll_eq_error_of_mem` proves that an `error` diff anywhere in the list absorbs the consolidated sum to `error`, which is the property a differential dataflow `compact` operator cites when propagating global errors through consolidation. Companion `sumAll_val_of_all_val` says an all-`val` list sums to `val` of some base value. * `Mz/Triple.lean`: `TimedRecord = (row, time, diff)` triple stream tying `DiffWithError` to differential dataflow's record format. Defines `consolidateAll` and `consolidateAt t`; lifts `Consolidate`'s absorption to per-time and stream-wide statements. Per-`(row, time)` bucketing is a follow-up — needs `DecidableEq` on `Row`. * `Mz/Join.lean`: relational joins on `UnifiedStream`. `cross` is the cartesian product; `join pred l r` filters the product through a join predicate. 
Errors propagate through the carrier — an `err` record on either side contributes one `err` to the output for every record on the other side, matching the diff-semiring's `error * diff = error`. Theorems cover the empty cases. -* `Mz/GroupBy.lean`: `groupBy keyExpr rel` partitions a relation by an evaluated key, using `DecidableEq Datum`. `aggregateBy keyExpr valExpr f rel` runs `aggStrict` per group, modeling `SELECT keyExpr, AGG(valExpr) ... GROUP BY keyExpr`. Spec divergence on `err` keys (the spec says every err is its own group) is documented inline. +* `Mz/GroupBy.lean`: two grouping primitives. + `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. + `groupByErrDistinct keyExpr rel` uses the spec-faithful `Datum.groupKeyEq`, which returns `false` whenever either side is `.err`, so every err key produces its own singleton group. + Theorem `insertIntoDistinct_err` proves the err-key insertion always appends a fresh group; `groupByErrDistinct_length_of_all_err` derives the cardinality consequence — every err-keyed row contributes one output group. + Companion `aggregateBy` / `aggregateByErrDistinct` run `aggStrict` per group, modeling `SELECT keyExpr, AGG(valExpr) ... GROUP BY keyExpr`. ## What is not here @@ -78,8 +82,8 @@ The roadmap in priority order: * `BagStream.filter` commutativity. Data field commutes by `filterRel_comm`; the error field requires a notion of multiset equality on `List EvalError` since list-order differs across permutations. * Tie `DiffWithError` to a concrete dataflow operator: model a `(Row, Time, DiffWithError ℤ)` triple stream and prove that an `error` diff at time `t` propagates to every downstream consolidation. * Joins on `BagStream` with explicit error propagation. -* Spec-faithful err-distinct keying for `groupBy`: every `Datum.err` key must form its own bucket, even when two errs have the same payload. 
Requires either a custom `Datum.groupKeyEq` or wrapping err keys in a fresh-tag type. -* Cardinality theorem for `groupBy`: sum of group sizes equals `rel.length`. Requires `List.foldr_length` + Nat arithmetic. +* Cardinality theorem for `groupBy`: sum of group sizes equals `rel.length`. Requires `List.foldr_length` + Nat arithmetic. (`groupByErrDistinct` already proves the all-err special case via `groupByErrDistinct_length_of_all_err`.) +* Agreement theorem `groupByErrDistinct = groupBy` whenever no key evaluates to `.err`. Requires an invariant on the accumulator (no existing key is an err) propagated through the foldr. * Join cardinality theorem: `(cross l r).length = l.length * r.length`. Skipped in the current skeleton; the proof needs `List.length_flatMap` + Nat arithmetic. * Sketch a proof of `cross_assoc` modulo row concatenation associativity. * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. From d3204717f24eb4321bb0f63379d1e3fad61020ad Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 10:05:20 +0200 Subject: [PATCH 024/127] doc/semantics: add BagStream.project with explicit err routing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `BagStream.project es s` projects each row in `s.data` through the list of scalar expressions `es`. A row stays in the data collection only when every scalar succeeds on it; otherwise the row's err payloads (one per erroring scalar) are appended to the error collection. Helpers: * `rowAllSafe`: every projected scalar succeeds on a row. * `rowErrs`: collect every err payload from one row's projections. * `projectErrs`: aggregate `rowErrs` across the relation. Laws: * `BagStream.project_data` / `_errors` — per-field reductions. * `BagStream.project_nil_es` — empty projection list keeps every row and produces width-zero output rows. 
* `BagStream.project_empty_data` — empty data collection projects to empty data, preserving input errors. * `rowErrs_nil_of_all_safe` and `projectErrs_eq_nil_of_all_safe` — when every scalar succeeds on every row, the error collection is unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/ErrStream.lean | 129 +++++++++++++++++++++- doc/developer/semantics/README.md | 7 +- 2 files changed, 131 insertions(+), 5 deletions(-) diff --git a/doc/developer/semantics/Mz/ErrStream.lean b/doc/developer/semantics/Mz/ErrStream.lean index 10551d62269ff..2ec3257a9d07d 100644 --- a/doc/developer/semantics/Mz/ErrStream.lean +++ b/doc/developer/semantics/Mz/ErrStream.lean @@ -17,9 +17,11 @@ error-aware filter. input propagate to the output; new errors from the operator are appended. -The skeleton models only `filter`. Adding `project` follows the same -pattern: each expression in the projection list contributes its own -error rows. +The skeleton models `filter` and `project`. Each follows the same +pattern: erroring rows leave the data collection and contribute +their payload(s) to the error collection. `project` differs from +`filter` in that a single row can produce multiple errors — one +for every projected scalar that evaluates to `.err`. -/ namespace Mz @@ -114,4 +116,125 @@ theorem BagStream.filter_idem (pred : Expr) (s : BagStream) : BagStream.filter pred s := by simp [BagStream.filter, filterRel_idem, errorRows_filterRel] +/-! ## Project -/ + +/-- Boolean check: every projected scalar succeeds on this row +(none returns `.err`). Used to decide whether the row stays in the +data collection or is replaced by its error rows. -/ +@[inline] def rowAllSafe (es : List Expr) (row : Row) : Bool := + es.all (fun e => + match eval row e with + | .err _ => false + | _ => true) + +/-- Collect every `err` payload produced by evaluating each +expression in `es` against `row`. A single row can produce +zero, one, or many entries — one per erroring scalar. 
-/ +def rowErrs (es : List Expr) (row : Row) : List EvalError := + es.filterMap fun e => + match eval row e with + | .err err => some err + | _ => none + +/-- Aggregate `rowErrs` across the relation. Outer order matches +row order; inner order matches the expression order within each +row. -/ +def projectErrs (es : List Expr) (rel : Relation) : List EvalError := + rel.flatMap (rowErrs es) + +/-- Error-aware projection. A row stays in the data collection only +when every projected scalar succeeds on it; otherwise the row's err +payloads are appended to the error collection and the row is +dropped from the data side. -/ +def BagStream.project (es : List Expr) (s : BagStream) : BagStream := + { data := (s.data.filter (rowAllSafe es)).map (fun row => es.map (eval row)) + , errors := s.errors ++ projectErrs es s.data } + +/-! ### Per-field reduction lemmas -/ + +theorem BagStream.project_data (es : List Expr) (s : BagStream) : + (BagStream.project es s).data = + (s.data.filter (rowAllSafe es)).map (fun row => es.map (eval row)) := rfl + +theorem BagStream.project_errors (es : List Expr) (s : BagStream) : + (BagStream.project es s).errors = + s.errors ++ projectErrs es s.data := rfl + +/-! ### Trivial cases -/ + +theorem rowErrs_nil_es (row : Row) : + rowErrs [] row = [] := rfl + +theorem projectErrs_nil_rel (es : List Expr) : + projectErrs es [] = [] := rfl + +/-- An empty projection list keeps every row (no scalar can err) +and produces width-zero rows. 
-/ +theorem BagStream.project_nil_es (s : BagStream) : + BagStream.project [] s = { data := s.data.map (fun _ => []), errors := s.errors } := by + apply BagStream.ext + · show (s.data.filter (rowAllSafe [])).map (fun row => ([] : List Expr).map (eval row)) + = s.data.map (fun _ => []) + have hAll : ∀ row, rowAllSafe [] row = true := fun _ => rfl + rw [List.filter_eq_self.mpr (by intro row _; exact hAll row)] + rfl + · show s.errors ++ projectErrs [] s.data = s.errors + have : projectErrs [] s.data = [] := by + unfold projectErrs + induction s.data with + | nil => rfl + | cons _ tl ih => simp [List.flatMap_cons, rowErrs_nil_es, ih] + rw [this, List.append_nil] + +/-- Projecting an empty stream is empty in data and preserves the +input errors. -/ +theorem BagStream.project_empty_data (es : List Expr) (errs : List EvalError) : + BagStream.project es { data := [], errors := errs } + = { data := [], errors := errs } := by + apply BagStream.ext + · rfl + · show errs ++ projectErrs es [] = errs + rw [projectErrs_nil_rel, List.append_nil] + +/-! ### Safe-row laws + +When every projected scalar succeeds on every row, projection +behaves like the plain `Bag.project`: no errors are emitted and +no rows are dropped from the data collection. 
-/ + +theorem rowErrs_nil_of_all_safe (es : List Expr) (row : Row) + (h : rowAllSafe es row = true) : + rowErrs es row = [] := by + induction es with + | nil => rfl + | cons hd tl ih => + have hUnfold : rowAllSafe (hd :: tl) row = true := h + unfold rowAllSafe at hUnfold + rw [List.all_cons, Bool.and_eq_true] at hUnfold + obtain ⟨hHead, hTl⟩ := hUnfold + have hSafeTl : rowAllSafe tl row = true := hTl + have ihResult : rowErrs tl row = [] := ih hSafeTl + show ((hd :: tl).filterMap fun e => + match eval row e with | .err err => some err | _ => none) = [] + rw [List.filterMap_cons] + cases h_eval : eval row hd with + | bool _ => exact ihResult + | null => exact ihResult + | err e => + rw [h_eval] at hHead + cases hHead + +theorem projectErrs_eq_nil_of_all_safe + (es : List Expr) (rel : Relation) + (h : ∀ row ∈ rel, rowAllSafe es row = true) : + projectErrs es rel = [] := by + unfold projectErrs + induction rel with + | nil => rfl + | cons hd tl ih => + have hHead : rowAllSafe es hd = true := h hd List.mem_cons_self + have hTl : ∀ row ∈ tl, rowAllSafe es row = true := + fun row hMem => h row (List.mem_cons_of_mem _ hMem) + simp [List.flatMap_cons, rowErrs_nil_of_all_safe es hd hHead, ih hTl] + end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 3633d9fe29a84..0e6cd9a1c1bdd 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -20,7 +20,10 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Variadic.lean`: laws for `evalAndN` and `evalOrN` over `List Datum` — cons recurrence, nil, singleton, binary equivalence with the binary evaluators, and `FALSE`/`TRUE` absorption. 
* `Mz/ExprVariadic.lean`: `Expr`-level reduction lemmas connecting `eval env (.andN args)` / `.orN` / `.coalesce` to their primitive counterparts, identity / singleton / binary-equivalence corollaries lifted through `eval`, and variadic-absorption theorems — a single operand evaluating to `FALSE` (resp. `TRUE`) makes the whole `andN` (resp. `orN`) evaluate to `FALSE` (resp. `TRUE`). * `Mz/Bag.lean`: bag semantics on `List Row`. Defines `filterRel` and `project`, with filter idempotence, filter commutativity, projection length-preservation, and the empty-projection equation. Plain `filterRel` silently drops `err` rows; `Mz/ErrStream.lean` adds the explicit data/error stream pair. -* `Mz/ErrStream.lean`: the dataflow-style `BagStream = (data, errors)` pair. `BagStream.filter` routes erroring rows into the error collection instead of dropping them, with idempotence proved at both the data and the error level. +* `Mz/ErrStream.lean`: the dataflow-style `BagStream = (data, errors)` pair. + `BagStream.filter` routes erroring rows into the error collection instead of dropping them, with idempotence proved at both the data and the error level. + `BagStream.project` projects each row through a list of scalars; a row stays in the data collection only when every scalar succeeds, otherwise its err payloads (one per erroring scalar) are appended to the error collection. + `rowErrs_nil_of_all_safe` and `projectErrs_eq_nil_of_all_safe` show that when no projection errs, `BagStream.project` does not extend the error collection. * `Mz/Pushdown.lean`: substitution (`Expr.subst`) plus the headline `eval_subst` theorem (substituting then evaluating against the original row equals evaluating against the projected row), and the relational predicate-pushdown rewrite `filterRel p (project es rel) = project es (filterRel (p.subst es) rel)`. * `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. 
Provides `+`, `*`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity laws that downstream operators must respect. * `Mz/UnifiedStream.lean`: unified single-collection alternative to `BagStream`. `UnifiedRow` is `row ⊕ err`, so errors flow through the same carrier as data rows. `ofBag` / `split` conversions, with the round-trip theorem `split (ofBag s) = s`. The unified form matches the spec's diff-semiring target; the split `BagStream` is a runtime concession the conversion reconciles. @@ -78,8 +81,8 @@ Reviewers should expect both sides of the change in the same PR. The roadmap in priority order: -* `BagStream.project` analogous to `BagStream.filter`: each scalar in the projection list can produce its own error rows; aggregate them into the error collection. * `BagStream.filter` commutativity. Data field commutes by `filterRel_comm`; the error field requires a notion of multiset equality on `List EvalError` since list-order differs across permutations. +* `BagStream.project` / `BagStream.filter` commutativity (when the predicate references only un-projected columns). Same multiset-equality caveat on the error collection. * Tie `DiffWithError` to a concrete dataflow operator: model a `(Row, Time, DiffWithError ℤ)` triple stream and prove that an `error` diff at time `t` propagates to every downstream consolidation. * Joins on `BagStream` with explicit error propagation. * Cardinality theorem for `groupBy`: sum of group sizes equals `rel.length`. Requires `List.foldr_length` + Nat arithmetic. (`groupByErrDistinct` already proves the all-err special case via `groupByErrDistinct_length_of_all_err`.) 
From 1ce936f09bc991dd065182f05df6863727b5d80a Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 10:18:33 +0200 Subject: [PATCH 025/127] doc/semantics: join cardinality theorems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `UnifiedStream.cross_length` proves the headline relational identity `(cross l r).length = l.length * r.length`. Cross product on the unified carrier produces exactly one output record per `(l, r)` pair regardless of which side carries an error — every err contributes one err record per element of the other side, matching the diff-semiring's `error * d = error`. Companion `UnifiedStream.filter_length_le` proves the predicate filter on the unified stream is non-expanding (each input produces zero or one output). Composing the two gives the corollary `UnifiedStream.join_length_le`: a join's output length is bounded above by `l.length * r.length`. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/Join.lean | 75 ++++++++++++++++++++++++++++ doc/developer/semantics/README.md | 5 +- 2 files changed, 77 insertions(+), 3 deletions(-) diff --git a/doc/developer/semantics/Mz/Join.lean b/doc/developer/semantics/Mz/Join.lean index f81f53ac0be61..7ccf218768ec8 100644 --- a/doc/developer/semantics/Mz/Join.lean +++ b/doc/developer/semantics/Mz/Join.lean @@ -56,4 +56,79 @@ theorem UnifiedStream.cross_nil_right (l : UnifiedStream) : | nil => rfl | cons _ tl _ih => simp [UnifiedStream.cross, List.map_nil, List.flatMap_cons] +/-! ## Cardinality -/ + +/-- Cross product cardinality. `cross l r` produces exactly one +output per `(l, r)` pair, regardless of which side carries an +error — every err in `l` or `r` contributes one err record per +element of the other side, matching the diff-semiring's +`error * d = error`. 
-/ +theorem UnifiedStream.cross_length (l r : UnifiedStream) : + (UnifiedStream.cross l r).length = l.length * r.length := by + induction l with + | nil => simp [UnifiedStream.cross] + | cons hd tl ih => + show (UnifiedStream.cross (hd :: tl) r).length = (tl.length + 1) * r.length + rw [Nat.succ_mul] + show (((hd :: tl) : UnifiedStream).flatMap fun lu => r.map fun ru => + match lu, ru with + | .row la, .row rb => UnifiedRow.row (la ++ rb) + | .err e, _ => UnifiedRow.err e + | _, .err e => UnifiedRow.err e).length + = tl.length * r.length + r.length + rw [List.flatMap_cons, List.length_append, List.length_map] + show r.length + (UnifiedStream.cross tl r).length = tl.length * r.length + r.length + rw [ih] + exact Nat.add_comm _ _ + +/-- Filter on `UnifiedStream` is non-expanding: every input record +produces zero or one output record, so the output length is at +most the input length. -/ +theorem UnifiedStream.filter_length_le (pred : Expr) (us : UnifiedStream) : + (UnifiedStream.filter pred us).length ≤ us.length := by + unfold UnifiedStream.filter + induction us with + | nil => exact Nat.le.refl + | cons hd tl ih => + rw [List.flatMap_cons, List.length_append, List.length_cons] + have hHd : (match hd with + | UnifiedRow.row r => + match eval r pred with + | .bool true => [UnifiedRow.row r] + | .err e => [UnifiedRow.err e] + | _ => [] + | UnifiedRow.err e => [UnifiedRow.err e]).length ≤ 1 := by + cases hd with + | row r => + show (match eval r pred with + | .bool true => [UnifiedRow.row r] + | .err e => [UnifiedRow.err e] + | _ => []).length ≤ 1 + cases h_eval : eval r pred with + | bool b => cases b <;> simp [List.length_cons, List.length_nil] + | null => simp [List.length_nil] + | err _ => simp [List.length_cons] + | err _ => + show ([UnifiedRow.err _] : UnifiedStream).length ≤ 1 + simp [List.length_cons] + calc (match hd with + | UnifiedRow.row r => + match eval r pred with + | .bool true => [UnifiedRow.row r] + | .err e => [UnifiedRow.err e] + | _ => [] + | 
UnifiedRow.err e => [UnifiedRow.err e]).length + + (tl.flatMap _).length + ≤ 1 + tl.length := Nat.add_le_add hHd ih + _ = tl.length + 1 := Nat.add_comm _ _ + +/-- Join length is bounded by cross length: the predicate filter +can only remove rows. -/ +theorem UnifiedStream.join_length_le (pred : Expr) (l r : UnifiedStream) : + (UnifiedStream.join pred l r).length ≤ l.length * r.length := by + show (UnifiedStream.filter pred (UnifiedStream.cross l r)).length + ≤ l.length * r.length + rw [← UnifiedStream.cross_length l r] + exact UnifiedStream.filter_length_le pred _ + end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 0e6cd9a1c1bdd..df48894d082ce 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -30,7 +30,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. `aggTry` for the proposed `try_sum`/`try_min`/`try_max` variants that swallow `err` into `NULL` instead of propagating, defined as a post-pass on `aggStrict`. Theorems: `aggStrict_err` (any `err` input → `err` output), `aggStrict_no_err` (no-err inputs + no-err reducer → no-err output), `aggTry_no_err` (the non-strict variant never errors), and `aggTry_eq_aggStrict_of_no_err` (strict and non-strict agree on error-free inputs). * `Mz/Consolidate.lean`: per-key diff summation over `List (DiffWithError α)`. The headline `sumAll_eq_error_of_mem` proves that an `error` diff anywhere in the list absorbs the consolidated sum to `error`, which is the property a differential dataflow `compact` operator cites when propagating global errors through consolidation. Companion `sumAll_val_of_all_val` says an all-`val` list sums to `val` of some base value. 
* `Mz/Triple.lean`: `TimedRecord = (row, time, diff)` triple stream tying `DiffWithError` to differential dataflow's record format. Defines `consolidateAll` and `consolidateAt t`; lifts `Consolidate`'s absorption to per-time and stream-wide statements. Per-`(row, time)` bucketing is a follow-up — needs `DecidableEq` on `Row`. -* `Mz/Join.lean`: relational joins on `UnifiedStream`. `cross` is the cartesian product; `join pred l r` filters the product through a join predicate. Errors propagate through the carrier — an `err` record on either side contributes one `err` to the output for every record on the other side, matching the diff-semiring's `error * diff = error`. Theorems cover the empty cases. +* `Mz/Join.lean`: relational joins on `UnifiedStream`. `cross` is the cartesian product; `join pred l r` filters the product through a join predicate. Errors propagate through the carrier — an `err` record on either side contributes one `err` to the output for every record on the other side, matching the diff-semiring's `error * diff = error`. Cardinality theorems: `cross_length` (output length is `l.length * r.length`), `filter_length_le` (filter is non-expanding on the carrier), and the corollary `join_length_le` bounding join output by the cross product. * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. `groupByErrDistinct keyExpr rel` uses the spec-faithful `Datum.groupKeyEq`, which returns `false` whenever either side is `.err`, so every err key produces its own singleton group. @@ -87,8 +87,7 @@ The roadmap in priority order: * Joins on `BagStream` with explicit error propagation. * Cardinality theorem for `groupBy`: sum of group sizes equals `rel.length`. Requires `List.foldr_length` + Nat arithmetic. 
(`groupByErrDistinct` already proves the all-err special case via `groupByErrDistinct_length_of_all_err`.) * Agreement theorem `groupByErrDistinct = groupBy` whenever no key evaluates to `.err`. Requires an invariant on the accumulator (no existing key is an err) propagated through the foldr. -* Join cardinality theorem: `(cross l r).length = l.length * r.length`. Skipped in the current skeleton; the proof needs `List.length_flatMap` + Nat arithmetic. -* Sketch a proof of `cross_assoc` modulo row concatenation associativity. +* Sketch a proof of `cross_assoc` modulo row concatenation associativity. Left-wins error rule is consistent in both nestings; the residual obligation is `(la ++ lb) ++ lc = la ++ (lb ++ lc)` lifted into the row carrier. * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. From 0c267437d5b05cf942fb7d1e67f8449e2fb6ce7c Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 10:22:23 +0200 Subject: [PATCH 026/127] doc/semantics: GROUP BY cardinality theorems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `totalRows g = sum of g.rows.length` is the metric. Theorems: * `totalRows_insertInto`: a single insert adds exactly one row, whether the key matches an existing group or creates a new one. * `totalRows_insertIntoDistinct`: same bookkeeping for the err-distinct variant. * `totalRows_groupBy`: sum of group sizes equals the input relation's length — partitioning loses and duplicates nothing. * `totalRows_groupByErrDistinct`: the err-distinct variant preserves the same invariant. Use an explicit recursive `totalRows` definition instead of `(g.map (·.2.length)).sum`. 
The latter trips `omega` because the elaborator and the simp set produce two syntactically distinct forms (`(·.2.length)` vs `List.length ·.snd`) that omega treats as independent atoms. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/GroupBy.lean | 95 +++++++++++++++++++++++++ doc/developer/semantics/README.md | 4 +- 2 files changed, 97 insertions(+), 2 deletions(-) diff --git a/doc/developer/semantics/Mz/GroupBy.lean b/doc/developer/semantics/Mz/GroupBy.lean index 917ef22d305a9..4b59a6b03ed44 100644 --- a/doc/developer/semantics/Mz/GroupBy.lean +++ b/doc/developer/semantics/Mz/GroupBy.lean @@ -147,4 +147,99 @@ theorem groupByErrDistinct_length_of_all_err rw [heq, insertIntoDistinct_err, List.length_append, ihApp] simp [List.length_cons] +/-! ## Cardinality + +The sum of group sizes equals the input relation's length: no row +is lost, no row is duplicated. Holds for both `groupBy` (which may +merge err keys) and `groupByErrDistinct` (which never does). + +The proofs use an explicit `totalRows` function as the metric, +which avoids syntactic mismatches between `List.length ∘ Prod.snd` +spellings that throw off `omega`. -/ + +/-- Sum of `rows.length` over every group. -/ +def totalRows : List (Datum × Relation) → Nat + | [] => 0 + | (_, rs) :: rest => rs.length + totalRows rest + +/-- `insertInto` adds exactly one row to the total across all +groups. The new row either joins an existing group or creates a +fresh singleton group — either way, the group-size sum grows by +one. 
-/ +private theorem totalRows_insertInto + (k : Datum) (row : Row) (groups : List (Datum × Relation)) : + totalRows (insertInto k row groups) = totalRows groups + 1 := by + induction groups with + | nil => rfl + | cons head tl ih => + obtain ⟨k', rows⟩ := head + by_cases hEq : k = k' + · subst hEq + show totalRows (if k = k then (k, row :: rows) :: tl + else (k, rows) :: insertInto k row tl) + = totalRows ((k, rows) :: tl) + 1 + rw [if_pos rfl] + show (row :: rows).length + totalRows tl + = rows.length + totalRows tl + 1 + simp [List.length_cons] + omega + · show totalRows (if k = k' then (k', row :: rows) :: tl + else (k', rows) :: insertInto k row tl) + = totalRows ((k', rows) :: tl) + 1 + rw [if_neg hEq] + show rows.length + totalRows (insertInto k row tl) + = rows.length + totalRows tl + 1 + rw [ih] + omega + +/-- Analogue for the err-distinct insert: same row-count +bookkeeping, different match predicate. -/ +private theorem totalRows_insertIntoDistinct + (k : Datum) (row : Row) (groups : List (Datum × Relation)) : + totalRows (insertIntoDistinct k row groups) = totalRows groups + 1 := by + induction groups with + | nil => rfl + | cons head tl ih => + obtain ⟨k', rows⟩ := head + by_cases hEq : Datum.groupKeyEq k k' = true + · show totalRows (if Datum.groupKeyEq k k' + then (k', row :: rows) :: tl + else (k', rows) :: insertIntoDistinct k row tl) + = totalRows ((k', rows) :: tl) + 1 + rw [if_pos hEq] + show (row :: rows).length + totalRows tl + = rows.length + totalRows tl + 1 + simp [List.length_cons] + omega + · show totalRows (if Datum.groupKeyEq k k' + then (k', row :: rows) :: tl + else (k', rows) :: insertIntoDistinct k row tl) + = totalRows ((k', rows) :: tl) + 1 + rw [if_neg hEq] + show rows.length + totalRows (insertIntoDistinct k row tl) + = rows.length + totalRows tl + 1 + rw [ih] + omega + +/-- Cardinality theorem: every row in the input appears in exactly +one group's row list. 
-/ +theorem totalRows_groupBy (keyExpr : Expr) (rel : Relation) : + totalRows (groupBy keyExpr rel) = rel.length := by + induction rel with + | nil => rfl + | cons head tl ih => + show totalRows (insertInto (eval head keyExpr) head + (groupBy keyExpr tl)) = (head :: tl).length + rw [totalRows_insertInto, ih, List.length_cons] + +/-- Cardinality theorem for the err-distinct variant. -/ +theorem totalRows_groupByErrDistinct (keyExpr : Expr) (rel : Relation) : + totalRows (groupByErrDistinct keyExpr rel) = rel.length := by + induction rel with + | nil => rfl + | cons head tl ih => + show totalRows (insertIntoDistinct (eval head keyExpr) head + (groupByErrDistinct keyExpr tl)) = (head :: tl).length + rw [totalRows_insertIntoDistinct, ih, List.length_cons] + end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index df48894d082ce..4d2261a41aebd 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -34,7 +34,8 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. `groupByErrDistinct keyExpr rel` uses the spec-faithful `Datum.groupKeyEq`, which returns `false` whenever either side is `.err`, so every err key produces its own singleton group. - Theorem `insertIntoDistinct_err` proves the err-key insertion always appends a fresh group; `groupByErrDistinct_length_of_all_err` derives the cardinality consequence — every err-keyed row contributes one output group. + Theorem `insertIntoDistinct_err` proves the err-key insertion always appends a fresh group; `groupByErrDistinct_length_of_all_err` derives the consequence — every err-keyed row contributes one output group. 
+ Headline cardinality `totalRows_groupBy` (and its err-distinct variant `totalRows_groupByErrDistinct`) state that the sum of group sizes equals the input relation's length — no row is lost or duplicated by partitioning. Companion `aggregateBy` / `aggregateByErrDistinct` run `aggStrict` per group, modeling `SELECT keyExpr, AGG(valExpr) ... GROUP BY keyExpr`. ## What is not here @@ -85,7 +86,6 @@ The roadmap in priority order: * `BagStream.project` / `BagStream.filter` commutativity (when the predicate references only un-projected columns). Same multiset-equality caveat on the error collection. * Tie `DiffWithError` to a concrete dataflow operator: model a `(Row, Time, DiffWithError ℤ)` triple stream and prove that an `error` diff at time `t` propagates to every downstream consolidation. * Joins on `BagStream` with explicit error propagation. -* Cardinality theorem for `groupBy`: sum of group sizes equals `rel.length`. Requires `List.foldr_length` + Nat arithmetic. (`groupByErrDistinct` already proves the all-err special case via `groupByErrDistinct_length_of_all_err`.) * Agreement theorem `groupByErrDistinct = groupBy` whenever no key evaluates to `.err`. Requires an invariant on the accumulator (no existing key is an err) propagated through the foldr. * Sketch a proof of `cross_assoc` modulo row concatenation associativity. Left-wins error rule is consistent in both nestings; the residual obligation is `(la ++ lb) ++ lc = la ++ (lb ++ lc)` lifted into the row carrier. * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. 
From 6d287ceffbff9c9921030bc9fea8133ee1b9623b Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 10:27:55 +0200 Subject: [PATCH 027/127] doc/semantics: wire DiffWithError into UnifiedStream MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Promote `UnifiedStream` from `List UnifiedRow` to `List (UnifiedRow × DiffWithError Int)`. Every record now carries a differential-dataflow diff augmented with the absorbing `.error` marker. Row-scoped errors still propagate through the `UnifiedRow` carrier; collection-scoped errors propagate through diff multiplication / addition. Changes: * `UnifiedStream.ofBag` tags every bag record with diff `.val 1`. * `UnifiedStream.split` discards the diff component (the round trip `split (ofBag s) = s` still goes through because `ofBag` only emits `.val 1` diffs; the opposite composition, `ofBag (split us)`, loses information for diffs ≠ `.val 1`). * `UnifiedStream.filter` preserves the diff of survivors; rerouted rows (predicate errs) keep their multiplicity. * `UnifiedStream.cross` combines carriers via the new `combineCarrier` helper and multiplies diffs through `DiffWithError`'s `Mul` instance. A `.error` diff on either side absorbs the product diff via `DiffWithError.error_mul_{left,right}`. New theorem `cross_diff_error_left` witnesses the absorption: crossing a left-side `.error` diff with any record on the right produces a `.error` diff in the output. Existing theorems (`cross_length`, `filter_length_le`, `join_length_le`, `split_ofBag`) carry through to the new representation. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/Join.lean | 107 ++++++++------ doc/developer/semantics/Mz/UnifiedStream.lean | 139 ++++++++++-------- doc/developer/semantics/README.md | 9 +- 3 files changed, 148 insertions(+), 107 deletions(-) diff --git a/doc/developer/semantics/Mz/Join.lean b/doc/developer/semantics/Mz/Join.lean index 7ccf218768ec8..07747e58c6d09 100644 --- a/doc/developer/semantics/Mz/Join.lean +++ b/doc/developer/semantics/Mz/Join.lean @@ -1,21 +1,24 @@ import Mz.Eval import Mz.Bag import Mz.ErrStream +import Mz.DiffSemiring import Mz.UnifiedStream /-! # Joins on `UnifiedStream` -Two-input relational join modeled on the unified single-collection -stream. The cartesian product `cross l r` is the building block; -`join pred l r` filters the product through a join predicate. +Two-input relational join on the unified diff-aware stream. The +cartesian product `cross l r` is the building block; `join pred l r` +filters the product through a join predicate. -Error propagation follows naturally from the carrier: every -`(lu, ru)` pair contributes one output, and that output is an -`err` whenever either side of the pair is an `err`. The -multiplicity matches the semiring intuition that -`error * diff = error`: an `err` record in `l` produces one `err` -in the output for every record in `r`, and vice versa. +Error propagation is now twofold: +* row-scoped: every `(lu, ru)` pair contributes one output, and + that output's carrier is an `err` whenever either side's carrier + is an `err` (left wins on conflict, matching `evalAnd`'s + first-error rule); +* collection-scoped: diffs multiply, so any `.error` diff on + either side forces the product diff to `.error` via + `DiffWithError.error_mul_{left,right}`. `cross` makes no commitment to row schema beyond list concatenation. Schema-aware joins (equi-joins on named columns) @@ -24,24 +27,25 @@ would lift to this with a column-substitution layer. 
namespace Mz -/-- Cartesian product of two unified streams. - -For each pair `(lu, ru)`, produce one output: -* both real rows ⇒ concatenated row; -* either side is `err` ⇒ that side's `err` payload (left wins on - conflict, matching `evalAnd`'s first-error rule). -/ +/-- Combine two unified carriers, with left winning on err conflict. -/ +@[inline] private def combineCarrier : UnifiedRow → UnifiedRow → UnifiedRow + | .row la, .row rb => .row (la ++ rb) + | .err e, _ => .err e + | _, .err e => .err e + +/-- Cartesian product of two unified streams. For each pair +`((lu, ld), (ru, rd))`: +* combine carriers via `combineCarrier`; +* multiply diffs via `DiffWithError`'s `Mul` instance, so any + `.error` diff absorbs the product. -/ def UnifiedStream.cross (l r : UnifiedStream) : UnifiedStream := - l.flatMap fun lu => - r.map fun ru => - match lu, ru with - | .row la, .row rb => .row (la ++ rb) - | .err e, _ => .err e - | _, .err e => .err e + l.flatMap fun ld => + r.map fun rd => (combineCarrier ld.1 rd.1, ld.2 * rd.2) /-- Equi-join or theta-join: cross product filtered by a predicate. The predicate evaluates against the concatenated row; existing -`UnifiedStream.filter` semantics apply (predicate `.err` routes the -row's error into the carrier). -/ +`UnifiedStream.filter` semantics apply (predicate `.err` routes +the row's error into the carrier, diff is preserved). -/ def UnifiedStream.join (pred : Expr) (l r : UnifiedStream) : UnifiedStream := (UnifiedStream.cross l r).filter pred @@ -59,10 +63,8 @@ theorem UnifiedStream.cross_nil_right (l : UnifiedStream) : /-! ## Cardinality -/ /-- Cross product cardinality. `cross l r` produces exactly one -output per `(l, r)` pair, regardless of which side carries an -error — every err in `l` or `r` contributes one err record per -element of the other side, matching the diff-semiring's -`error * d = error`. -/ +output record per `(l, r)` pair, regardless of which side carries +an error in its carrier or its diff. 
-/ theorem UnifiedStream.cross_length (l r : UnifiedStream) : (UnifiedStream.cross l r).length = l.length * r.length := by induction l with @@ -70,11 +72,8 @@ theorem UnifiedStream.cross_length (l r : UnifiedStream) : | cons hd tl ih => show (UnifiedStream.cross (hd :: tl) r).length = (tl.length + 1) * r.length rw [Nat.succ_mul] - show (((hd :: tl) : UnifiedStream).flatMap fun lu => r.map fun ru => - match lu, ru with - | .row la, .row rb => UnifiedRow.row (la ++ rb) - | .err e, _ => UnifiedRow.err e - | _, .err e => UnifiedRow.err e).length + show (((hd :: tl) : UnifiedStream).flatMap fun ld => + r.map fun rd => (combineCarrier ld.1 rd.1, ld.2 * rd.2)).length = tl.length * r.length + r.length rw [List.flatMap_cons, List.length_append, List.length_map] show r.length + (UnifiedStream.cross tl r).length = tl.length * r.length + r.length @@ -92,32 +91,33 @@ theorem UnifiedStream.filter_length_le (pred : Expr) (us : UnifiedStream) : | cons hd tl ih => rw [List.flatMap_cons, List.length_append, List.length_cons] have hHd : (match hd with - | UnifiedRow.row r => + | (UnifiedRow.row r, d) => match eval r pred with - | .bool true => [UnifiedRow.row r] - | .err e => [UnifiedRow.err e] + | .bool true => [(UnifiedRow.row r, d)] + | .err e => [(UnifiedRow.err e, d)] | _ => [] - | UnifiedRow.err e => [UnifiedRow.err e]).length ≤ 1 := by - cases hd with + | (UnifiedRow.err e, d) => [(UnifiedRow.err e, d)]).length ≤ 1 := by + obtain ⟨u, d⟩ := hd + cases u with | row r => show (match eval r pred with - | .bool true => [UnifiedRow.row r] - | .err e => [UnifiedRow.err e] + | .bool true => [(UnifiedRow.row r, d)] + | .err e => [(UnifiedRow.err e, d)] | _ => []).length ≤ 1 cases h_eval : eval r pred with | bool b => cases b <;> simp [List.length_cons, List.length_nil] | null => simp [List.length_nil] | err _ => simp [List.length_cons] | err _ => - show ([UnifiedRow.err _] : UnifiedStream).length ≤ 1 + show ([(UnifiedRow.err _, d)] : UnifiedStream).length ≤ 1 simp [List.length_cons] 
calc (match hd with - | UnifiedRow.row r => + | (UnifiedRow.row r, d) => match eval r pred with - | .bool true => [UnifiedRow.row r] - | .err e => [UnifiedRow.err e] + | .bool true => [(UnifiedRow.row r, d)] + | .err e => [(UnifiedRow.err e, d)] | _ => [] - | UnifiedRow.err e => [UnifiedRow.err e]).length + | (UnifiedRow.err e, d) => [(UnifiedRow.err e, d)]).length + (tl.flatMap _).length ≤ 1 + tl.length := Nat.add_le_add hHd ih _ = tl.length + 1 := Nat.add_comm _ _ @@ -131,4 +131,23 @@ theorem UnifiedStream.join_length_le (pred : Expr) (l r : UnifiedStream) : rw [← UnifiedStream.cross_length l r] exact UnifiedStream.filter_length_le pred _ +/-! ## Diff propagation -/ + +/-- A `.error` diff on a left-side record forces the diff of every +output record in `cross` to `.error`. The carrier follows the +ordinary `combineCarrier` rule. -/ +theorem UnifiedStream.cross_diff_error_left + (lc : UnifiedRow) (r : UnifiedStream) (rc : UnifiedRow) (rd : DiffWithError Int) + (h_mem : (rc, rd) ∈ r) : + ∃ uc, (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.cross [(lc, DiffWithError.error)] r := by + refine ⟨combineCarrier lc rc, ?_⟩ + show (combineCarrier lc rc, DiffWithError.error) + ∈ ([(lc, DiffWithError.error)].flatMap fun ld => + r.map fun rd' => (combineCarrier ld.1 rd'.1, ld.2 * rd'.2)) + simp only [List.flatMap_cons, List.flatMap_nil, List.append_nil] + refine List.mem_map.mpr ⟨(rc, rd), h_mem, ?_⟩ + show (combineCarrier lc rc, DiffWithError.error * rd) = (combineCarrier lc rc, DiffWithError.error) + rw [DiffWithError.error_mul_left] + end Mz diff --git a/doc/developer/semantics/Mz/UnifiedStream.lean b/doc/developer/semantics/Mz/UnifiedStream.lean index 82280a64d4b98..9c4e20310ac15 100644 --- a/doc/developer/semantics/Mz/UnifiedStream.lean +++ b/doc/developer/semantics/Mz/UnifiedStream.lean @@ -1,50 +1,55 @@ import Mz.Eval import Mz.Bag import Mz.ErrStream +import Mz.DiffSemiring /-! 
-# Unified data / error stream +# Unified data / error / diff stream `BagStream` (`Mz/ErrStream.lean`) carries a `(data, errors)` pair — two collections, threaded through every operator. That split mirrors Materialize's current runtime but is a *pragmatic* choice rather than a semantic one. The spec target is a single unified -stream where data rows and errors flow through the same carrier. +stream where data rows, row-scoped errors, and collection-scoped +("global") errors all flow through one carrier. -This file gives the unified model and the conversion to / from the -split form. +This file gives that unified model. The carrier pairs a +`UnifiedRow` (data row or row-scoped error) with a +`DiffWithError Int` (the differential-dataflow multiplicity, +augmented with an absorbing `error` marker that encodes +collection-scoped errors). ## Encoding -`UnifiedRow` is a sum type: either an honest `row` or a bare `err` -without a row. The absence of a row in the `err` variant preserves -the current property that errors carry no row context (the runtime -`DataflowError` is the same way). A future refinement could attach -optional row provenance. +`UnifiedRow` is a sum type — either an honest `row` or a bare `err` +without a row. The diff component is from `DiffWithError Int`: +* `.val n` — ordinary differential dataflow multiplicity (positive + for inserts, negative for retractions, zero for cancellation); +* `.error` — collection-scoped error marker that absorbs through + addition and multiplication. -`UnifiedStream := List UnifiedRow`. Operators consume and produce -`UnifiedStream`s; errors propagate through the carrier -automatically. - -## Diff-aware view - -The encoding here uses a plain list. The next refinement attaches a -`DiffWithError ℤ` to each record (see `Mz/DiffSemiring.lean`) so the -absorbing `error` diff captures collection-scoped global errors -alongside row-scoped errors. The conversion lemmas below transport -to that refinement when it lands. 
+`UnifiedStream := List (UnifiedRow × DiffWithError Int)`. Operators +consume and produce `UnifiedStream`s. Row-scoped errors propagate +through the carrier; collection-scoped errors propagate through +diff multiplication / addition. ## Semantic differences with the split form `UnifiedStream.ofBag` concatenates data rows first and errors -second, fixing an order. Operators that process records left-to- -right will see data before errors. The split form makes no such -commitment between data and errors. Equivalence between unified -and split is therefore exact on the round trip (`split (ofBag s) = -s`) but only up to multiset equality on the cross-direction -`(filter ∘ ofBag) ≈ (ofBag ∘ filter)`. The skeleton states the -round trip; the cross-equivalence is left for a future iteration -that introduces multiset machinery on `List EvalError`. +second, fixing an order, and assigns every record a diff of +`.val 1`. The split form makes no commitment between data and +errors. Equivalence between unified and split is therefore exact +on the round trip (`split (ofBag s) = s`) but only up to multiset +equality on the cross-direction `(filter ∘ ofBag) ≈ (ofBag ∘ filter)`. +The skeleton states the round trip; the cross-equivalence is left +for a future iteration that introduces multiset machinery on +`List EvalError`. + +`split` discards the diff component, mapping every carrier record +to one bag row regardless of multiplicity. This is lossy for diffs +other than `.val 1` (duplicate rows, retractions, or +collection-scoped errors). The round trip still goes through +because `ofBag` only ever produces `.val 1` diffs. -/ namespace Mz @@ -54,63 +59,77 @@ inductive UnifiedRow where | err (e : EvalError) deriving Inhabited -abbrev UnifiedStream := List UnifiedRow - -/-- Pick the row payload of a `UnifiedRow`, or `none` for errors. 
-/ -@[inline] private def pickRow : UnifiedRow → Option Row - | .row r => some r - | .err _ => none - -/-- Pick the error payload of a `UnifiedRow`, or `none` for rows. -/ -@[inline] private def pickErr : UnifiedRow → Option EvalError - | .row _ => none - | .err e => some e - -/-- Pack a `BagStream` into a single unified stream: data rows -first, error payloads second. -/ +/-- A unified-stream record pairs a row-or-error carrier with a +differential-dataflow diff augmented by the absorbing `error` +element. -/ +abbrev UnifiedStream := List (UnifiedRow × DiffWithError Int) + +/-- Pick the row payload of a unified record, or `none` for +errors. Diff component is discarded. -/ +@[inline] private def pickRow : UnifiedRow × DiffWithError Int → Option Row + | (.row r, _) => some r + | (.err _, _) => none + +/-- Pick the row-scoped error payload of a unified record, or +`none` for data rows. Diff component is discarded. -/ +@[inline] private def pickErr : UnifiedRow × DiffWithError Int → Option EvalError + | (.row _, _) => none + | (.err e, _) => some e + +/-- Pack a `BagStream` into a unified stream: data rows first, +error payloads second, each with diff `.val 1`. -/ def UnifiedStream.ofBag (s : BagStream) : UnifiedStream := - s.data.map UnifiedRow.row ++ s.errors.map UnifiedRow.err + s.data.map (fun r => (UnifiedRow.row r, (1 : DiffWithError Int))) + ++ s.errors.map (fun e => (UnifiedRow.err e, (1 : DiffWithError Int))) -/-- Split a unified stream back into the `(data, errors)` pair. -/ +/-- Split a unified stream back into the `(data, errors)` pair. +Diff multiplicities and `.error` diffs are dropped. -/ def UnifiedStream.split (us : UnifiedStream) : BagStream := { data := us.filterMap pickRow , errors := us.filterMap pickErr } /-- Filter on the unified stream. Predicate is evaluated on every -real `row`; survivors stay, erroring rows become `err` records, -non-true / non-error results are dropped. Existing `err` records -pass through unchanged. 
-/ +real `row`; survivors stay with their original diff; rows whose +predicate errs become `err` records (diff unchanged — multiplicity +is preserved through the error route); non-true / non-error +results are dropped. Existing `err` records pass through unchanged. +Collection-scoped errors encoded in the diff are preserved on +survivors. -/ def UnifiedStream.filter (pred : Expr) (us : UnifiedStream) : UnifiedStream := - us.flatMap fun u => match u with - | .row r => + us.flatMap fun ud => match ud with + | (.row r, d) => match eval r pred with - | .bool true => [.row r] - | .err e => [.err e] + | .bool true => [(.row r, d)] + | .err e => [(.err e, d)] | _ => [] - | .err e => [.err e] + | (.err e, d) => [(.err e, d)] /-! ## Helper lemmas for filterMap over the packed concatenation -/ private theorem filterMap_pickRow_rowMap (rs : List Row) : - (rs.map UnifiedRow.row).filterMap pickRow = rs := by + (rs.map (fun r => (UnifiedRow.row r, (1 : DiffWithError Int)))).filterMap pickRow + = rs := by induction rs with | nil => rfl | cons hd tl ih => simp [List.map, pickRow, ih] private theorem filterMap_pickRow_errMap (es : List EvalError) : - (es.map UnifiedRow.err).filterMap pickRow = ([] : Relation) := by + (es.map (fun e => (UnifiedRow.err e, (1 : DiffWithError Int)))).filterMap pickRow + = ([] : Relation) := by induction es with | nil => rfl | cons _ tl ih => simp [List.map, pickRow, ih] private theorem filterMap_pickErr_rowMap (rs : List Row) : - (rs.map UnifiedRow.row).filterMap pickErr = ([] : List EvalError) := by + (rs.map (fun r => (UnifiedRow.row r, (1 : DiffWithError Int)))).filterMap pickErr + = ([] : List EvalError) := by induction rs with | nil => rfl | cons _ tl ih => simp [List.map, pickErr, ih] private theorem filterMap_pickErr_errMap (es : List EvalError) : - (es.map UnifiedRow.err).filterMap pickErr = es := by + (es.map (fun e => (UnifiedRow.err e, (1 : DiffWithError Int)))).filterMap pickErr + = es := by induction es with | nil => rfl | cons hd tl ih => 
simp [List.map, pickErr, ih] @@ -119,7 +138,9 @@ private theorem filterMap_pickErr_errMap (es : List EvalError) : theorem UnifiedStream.split_data_ofBag (s : BagStream) : (UnifiedStream.split (UnifiedStream.ofBag s)).data = s.data := by - show (s.data.map UnifiedRow.row ++ s.errors.map UnifiedRow.err).filterMap pickRow = s.data + show ((s.data.map (fun r => (UnifiedRow.row r, (1 : DiffWithError Int)))) + ++ (s.errors.map (fun e => (UnifiedRow.err e, (1 : DiffWithError Int)))) + ).filterMap pickRow = s.data induction s.data with | nil => simp only [List.map_nil, List.nil_append] @@ -129,7 +150,9 @@ theorem UnifiedStream.split_data_ofBag (s : BagStream) : theorem UnifiedStream.split_errors_ofBag (s : BagStream) : (UnifiedStream.split (UnifiedStream.ofBag s)).errors = s.errors := by - show (s.data.map UnifiedRow.row ++ s.errors.map UnifiedRow.err).filterMap pickErr = s.errors + show ((s.data.map (fun r => (UnifiedRow.row r, (1 : DiffWithError Int)))) + ++ (s.errors.map (fun e => (UnifiedRow.err e, (1 : DiffWithError Int)))) + ).filterMap pickErr = s.errors induction s.data with | nil => simp only [List.map_nil, List.nil_append] diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 4d2261a41aebd..95511813b4324 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -25,12 +25,12 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `BagStream.project` projects each row through a list of scalars; a row stays in the data collection only when every scalar succeeds, otherwise its err payloads (one per erroring scalar) are appended to the error collection. `rowErrs_nil_of_all_safe` and `projectErrs_eq_nil_of_all_safe` show that when no projection errs, `BagStream.project` does not extend the error collection. 
* `Mz/Pushdown.lean`: substitution (`Expr.subst`) plus the headline `eval_subst` theorem (substituting then evaluating against the original row equals evaluating against the projected row), and the relational predicate-pushdown rewrite `filterRel p (project es rel) = project es (filterRel (p.subst es) rel)`. -* `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity laws that downstream operators must respect. -* `Mz/UnifiedStream.lean`: unified single-collection alternative to `BagStream`. `UnifiedRow` is `row ⊕ err`, so errors flow through the same carrier as data rows. `ofBag` / `split` conversions, with the round-trip theorem `split (ofBag s) = s`. The unified form matches the spec's diff-semiring target; the split `BagStream` is a runtime concession the conversion reconciles. +* `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity laws that downstream operators must respect. Now wired through `UnifiedStream` / `Join` — every operator on the unified stream speaks `DiffWithError Int` directly. +* `Mz/UnifiedStream.lean`: unified diff-aware alternative to `BagStream`. `UnifiedStream := List (UnifiedRow × DiffWithError Int)` pairs a carrier (data row or row-scoped err) with a differential-dataflow diff augmented by the absorbing `error` element. Row-scoped errors flow through the carrier; collection-scoped errors flow through diff multiplication / addition. `ofBag` / `split` conversions assign every bag record a diff of `.val 1`; the round-trip theorem `split (ofBag s) = s` holds. 
The cross-direction is exact only up to multiset equality on `List EvalError` and is lossy for diffs ≠ `.val 1` (split drops diff information). * `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. `aggTry` for the proposed `try_sum`/`try_min`/`try_max` variants that swallow `err` into `NULL` instead of propagating, defined as a post-pass on `aggStrict`. Theorems: `aggStrict_err` (any `err` input → `err` output), `aggStrict_no_err` (no-err inputs + no-err reducer → no-err output), `aggTry_no_err` (the non-strict variant never errors), and `aggTry_eq_aggStrict_of_no_err` (strict and non-strict agree on error-free inputs). * `Mz/Consolidate.lean`: per-key diff summation over `List (DiffWithError α)`. The headline `sumAll_eq_error_of_mem` proves that an `error` diff anywhere in the list absorbs the consolidated sum to `error`, which is the property a differential dataflow `compact` operator cites when propagating global errors through consolidation. Companion `sumAll_val_of_all_val` says an all-`val` list sums to `val` of some base value. * `Mz/Triple.lean`: `TimedRecord = (row, time, diff)` triple stream tying `DiffWithError` to differential dataflow's record format. Defines `consolidateAll` and `consolidateAt t`; lifts `Consolidate`'s absorption to per-time and stream-wide statements. Per-`(row, time)` bucketing is a follow-up — needs `DecidableEq` on `Row`. -* `Mz/Join.lean`: relational joins on `UnifiedStream`. `cross` is the cartesian product; `join pred l r` filters the product through a join predicate. Errors propagate through the carrier — an `err` record on either side contributes one `err` to the output for every record on the other side, matching the diff-semiring's `error * diff = error`. 
Cardinality theorems: `cross_length` (output length is `l.length * r.length`), `filter_length_le` (filter is non-expanding on the carrier), and the corollary `join_length_le` bounding join output by the cross product. +* `Mz/Join.lean`: relational joins on the diff-aware `UnifiedStream`. `cross` is the cartesian product — carriers combine via `combineCarrier` (rows concatenate; err on either side wins, left first), diffs multiply through `DiffWithError`'s `Mul` instance. A `.error` diff on either input therefore absorbs to `.error` on the output via `DiffWithError.error_mul_{left,right}`. `join pred l r` filters the product through a join predicate. Theorems: `cross_length` (`l.length * r.length`), `filter_length_le` (filter is non-expanding), `join_length_le` (corollary), and `cross_diff_error_left` (a `.error` diff on the left propagates to a `.error` diff in every output record). * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. `groupByErrDistinct keyExpr rel` uses the spec-faithful `Datum.groupKeyEq`, which returns `false` whenever either side is `.err`, so every err key produces its own singleton group. @@ -41,7 +41,6 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four ## What is not here * No bag semantics, joins, aggregates, or relational operators. -* No diff-semiring extension for global errors (see the design doc). * No bridge to the Rust evaluator. The model and the runtime are independent; divergences are caught by review, not by tooling. * No Mathlib dependency. @@ -91,4 +90,4 @@ The roadmap in priority order: * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. 
-The diff-semiring extension for global errors is a separate v2 effort. +The diff-semiring extension is now in scope: `UnifiedStream` records carry a `DiffWithError Int` and operators preserve / multiply / absorb it as appropriate. From 8b2371abdf947c91b2d189ae9e1f820fb294106d Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 10:32:09 +0200 Subject: [PATCH 028/127] doc/semantics: error diff absorbs through filter and cross MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tighten diff propagation in `UnifiedStream.filter` and add the symmetric `cross` absorption theorem. `filter` now special-cases records carrying a `.error` diff, passing them through unconditionally regardless of predicate outcome. The previous version dropped such records when the predicate evaluated to `.bool false` or `.null`, which violates the semiring law that `.error` must absorb every downstream operator. `filter_length_le` still holds — each input still produces at most one output. Two new theorems witness the absorption: * `cross_diff_error_right`: symmetric counterpart to the existing `cross_diff_error_left` — a `.error` diff on the right side forces the output diff to `.error` for every record on the left. * `filter_preserves_error_diff`: a record `(uc, .error)` in the input survives the filter as `(uc, .error)` in the output, no matter what the predicate is. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/Join.lean | 84 +++++++++++++++---- doc/developer/semantics/Mz/UnifiedStream.lean | 21 +++-- doc/developer/semantics/README.md | 2 +- 3 files changed, 79 insertions(+), 28 deletions(-) diff --git a/doc/developer/semantics/Mz/Join.lean b/doc/developer/semantics/Mz/Join.lean index 07747e58c6d09..812c1deb8a4eb 100644 --- a/doc/developer/semantics/Mz/Join.lean +++ b/doc/developer/semantics/Mz/Join.lean @@ -91,33 +91,40 @@ theorem UnifiedStream.filter_length_le (pred : Expr) (us : UnifiedStream) : | cons hd tl ih => rw [List.flatMap_cons, List.length_append, List.length_cons] have hHd : (match hd with - | (UnifiedRow.row r, d) => + | (_, DiffWithError.error) => [hd] + | (UnifiedRow.err e, d) => [(UnifiedRow.err e, d)] + | (UnifiedRow.row r, d) => match eval r pred with | .bool true => [(UnifiedRow.row r, d)] | .err e => [(UnifiedRow.err e, d)] - | _ => [] - | (UnifiedRow.err e, d) => [(UnifiedRow.err e, d)]).length ≤ 1 := by + | _ => []).length ≤ 1 := by obtain ⟨u, d⟩ := hd - cases u with - | row r => - show (match eval r pred with - | .bool true => [(UnifiedRow.row r, d)] - | .err e => [(UnifiedRow.err e, d)] - | _ => []).length ≤ 1 - cases h_eval : eval r pred with - | bool b => cases b <;> simp [List.length_cons, List.length_nil] - | null => simp [List.length_nil] - | err _ => simp [List.length_cons] - | err _ => - show ([(UnifiedRow.err _, d)] : UnifiedStream).length ≤ 1 + cases d with + | error => + show ([(u, DiffWithError.error)] : UnifiedStream).length ≤ 1 simp [List.length_cons] + | val n => + cases u with + | row r => + show (match eval r pred with + | .bool true => [(UnifiedRow.row r, DiffWithError.val n)] + | .err e => [(UnifiedRow.err e, DiffWithError.val n)] + | _ => []).length ≤ 1 + cases h_eval : eval r pred with + | bool b => cases b <;> simp [List.length_cons, List.length_nil] + | null => simp [List.length_nil] + | err _ => simp [List.length_cons] + | err _ => + show 
([(UnifiedRow.err _, DiffWithError.val n)] : UnifiedStream).length ≤ 1 + simp [List.length_cons] calc (match hd with - | (UnifiedRow.row r, d) => + | (_, DiffWithError.error) => [hd] + | (UnifiedRow.err e, d) => [(UnifiedRow.err e, d)] + | (UnifiedRow.row r, d) => match eval r pred with | .bool true => [(UnifiedRow.row r, d)] | .err e => [(UnifiedRow.err e, d)] - | _ => [] - | (UnifiedRow.err e, d) => [(UnifiedRow.err e, d)]).length + | _ => []).length + (tl.flatMap _).length ≤ 1 + tl.length := Nat.add_le_add hHd ih _ = tl.length + 1 := Nat.add_comm _ _ @@ -150,4 +157,45 @@ theorem UnifiedStream.cross_diff_error_left show (combineCarrier lc rc, DiffWithError.error * rd) = (combineCarrier lc rc, DiffWithError.error) rw [DiffWithError.error_mul_left] +/-- Symmetric statement for a `.error` diff on the right side. -/ +theorem UnifiedStream.cross_diff_error_right + (l : UnifiedStream) (lc : UnifiedRow) (ld : DiffWithError Int) (rc : UnifiedRow) + (h_mem : (lc, ld) ∈ l) : + ∃ uc, (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.cross l [(rc, DiffWithError.error)] := by + refine ⟨combineCarrier lc rc, ?_⟩ + show (combineCarrier lc rc, DiffWithError.error) + ∈ (l.flatMap fun ld' => + [(rc, DiffWithError.error)].map fun rd' => + (combineCarrier ld'.1 rd'.1, ld'.2 * rd'.2)) + refine List.mem_flatMap.mpr ⟨(lc, ld), h_mem, ?_⟩ + show (combineCarrier lc rc, DiffWithError.error) + ∈ [(combineCarrier lc rc, ld * DiffWithError.error)] + rw [DiffWithError.error_mul_right] + exact List.mem_singleton.mpr rfl + +/-- Absorption under `filter`: a record carrying a `.error` diff +is preserved by `UnifiedStream.filter`, regardless of the +predicate. The absorbing diff marker cannot be filtered away. 
-/ +theorem UnifiedStream.filter_preserves_error_diff + (pred : Expr) (us : UnifiedStream) (uc : UnifiedRow) + (h_mem : (uc, (DiffWithError.error : DiffWithError Int)) ∈ us) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.filter pred us := by + induction us with + | nil => exact absurd h_mem (List.not_mem_nil) + | cons hd tl ih => + rcases List.mem_cons.mp h_mem with hEq | hTail + · subst hEq + show (uc, DiffWithError.error) + ∈ UnifiedStream.filter pred ((uc, DiffWithError.error) :: tl) + unfold UnifiedStream.filter + rw [List.flatMap_cons] + show (uc, DiffWithError.error) + ∈ [(uc, DiffWithError.error)] ++ _ + exact List.mem_append.mpr (.inl (List.mem_singleton.mpr rfl)) + · unfold UnifiedStream.filter + rw [List.flatMap_cons] + exact List.mem_append.mpr (.inr (ih hTail)) + end Mz diff --git a/doc/developer/semantics/Mz/UnifiedStream.lean b/doc/developer/semantics/Mz/UnifiedStream.lean index 9c4e20310ac15..d1a4a8e45db7f 100644 --- a/doc/developer/semantics/Mz/UnifiedStream.lean +++ b/doc/developer/semantics/Mz/UnifiedStream.lean @@ -88,21 +88,24 @@ def UnifiedStream.split (us : UnifiedStream) : BagStream := { data := us.filterMap pickRow , errors := us.filterMap pickErr } -/-- Filter on the unified stream. Predicate is evaluated on every -real `row`; survivors stay with their original diff; rows whose -predicate errs become `err` records (diff unchanged — multiplicity -is preserved through the error route); non-true / non-error -results are dropped. Existing `err` records pass through unchanged. -Collection-scoped errors encoded in the diff are preserved on -survivors. -/ +/-- Filter on the unified stream. Records carrying a collection- +scoped `.error` diff pass through unconditionally — the absorbing +diff marker cannot be filtered away without violating the +semiring laws. 
For other diffs, the predicate is evaluated on +every real `row`: survivors stay with their original diff, rows +whose predicate errs become `err` records (diff unchanged — +multiplicity is preserved through the error route), non-true / +non-error results are dropped. Existing row-scoped `err` records +pass through unchanged. -/ def UnifiedStream.filter (pred : Expr) (us : UnifiedStream) : UnifiedStream := us.flatMap fun ud => match ud with - | (.row r, d) => + | (_, .error) => [ud] + | (.err e, d) => [(.err e, d)] + | (.row r, d) => match eval r pred with | .bool true => [(.row r, d)] | .err e => [(.err e, d)] | _ => [] - | (.err e, d) => [(.err e, d)] /-! ## Helper lemmas for filterMap over the packed concatenation -/ diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 95511813b4324..4b36279427fb7 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -30,7 +30,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. `aggTry` for the proposed `try_sum`/`try_min`/`try_max` variants that swallow `err` into `NULL` instead of propagating, defined as a post-pass on `aggStrict`. Theorems: `aggStrict_err` (any `err` input → `err` output), `aggStrict_no_err` (no-err inputs + no-err reducer → no-err output), `aggTry_no_err` (the non-strict variant never errors), and `aggTry_eq_aggStrict_of_no_err` (strict and non-strict agree on error-free inputs). * `Mz/Consolidate.lean`: per-key diff summation over `List (DiffWithError α)`. 
The headline `sumAll_eq_error_of_mem` proves that an `error` diff anywhere in the list absorbs the consolidated sum to `error`, which is the property a differential dataflow `compact` operator cites when propagating global errors through consolidation. Companion `sumAll_val_of_all_val` says an all-`val` list sums to `val` of some base value. * `Mz/Triple.lean`: `TimedRecord = (row, time, diff)` triple stream tying `DiffWithError` to differential dataflow's record format. Defines `consolidateAll` and `consolidateAt t`; lifts `Consolidate`'s absorption to per-time and stream-wide statements. Per-`(row, time)` bucketing is a follow-up — needs `DecidableEq` on `Row`. -* `Mz/Join.lean`: relational joins on the diff-aware `UnifiedStream`. `cross` is the cartesian product — carriers combine via `combineCarrier` (rows concatenate; err on either side wins, left first), diffs multiply through `DiffWithError`'s `Mul` instance. A `.error` diff on either input therefore absorbs to `.error` on the output via `DiffWithError.error_mul_{left,right}`. `join pred l r` filters the product through a join predicate. Theorems: `cross_length` (`l.length * r.length`), `filter_length_le` (filter is non-expanding), `join_length_le` (corollary), and `cross_diff_error_left` (a `.error` diff on the left propagates to a `.error` diff in every output record). +* `Mz/Join.lean`: relational joins on the diff-aware `UnifiedStream`. `cross` is the cartesian product — carriers combine via `combineCarrier` (rows concatenate; err on either side wins, left first), diffs multiply through `DiffWithError`'s `Mul` instance. A `.error` diff on either input therefore absorbs to `.error` on the output via `DiffWithError.error_mul_{left,right}`. `join pred l r` filters the product through a join predicate. Theorems: `cross_length` (`l.length * r.length`), `filter_length_le` (filter is non-expanding), `join_length_le` (corollary). 
Diff-propagation theorems: `cross_diff_error_{left,right}` (a `.error` diff on either side propagates through every output record), `filter_preserves_error_diff` (a record carrying `.error` diff is never dropped by `filter` — the absorbing marker cannot be filtered away). * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. `groupByErrDistinct keyExpr rel` uses the spec-faithful `Datum.groupKeyEq`, which returns `false` whenever either side is `.err`, so every err key produces its own singleton group. From ceaf6528c65eb700af352b7c8c5ca18ab797a017 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 10:36:56 +0200 Subject: [PATCH 029/127] doc/semantics: row-keyed consolidation on UnifiedStream MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `UnifiedStream.consolidate` buckets unified-stream records by carrier (data row or row-scoped err) and sums per-bucket diffs. Headline theorem `consolidate_preserves_error` lifts the diff-semiring absorption rule from `Mz/Consolidate.lean` to the unified stream: an `.error` diff anywhere in the input forces a `.error` diff in the consolidated output for that carrier. Required: * `deriving DecidableEq` on `UnifiedRow` (Row already has it via `Datum`, EvalError too). * Two helper lemmas — `consolidateInto_error_diff` (inserting an `.error` diff yields `.error` for the bucket) and `consolidateInto_preserves_error_mem` (a pre-existing `.error` record survives every subsequent insert). Adding times reduces to running `consolidate` inside each time slice; that lifting is mechanical and left for the per-`(row, time)` follow-up alongside `Mz/Triple.lean`. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + .../semantics/Mz/UnifiedConsolidate.lean | 154 ++++++++++++++++++ doc/developer/semantics/Mz/UnifiedStream.lean | 2 +- doc/developer/semantics/README.md | 3 +- 4 files changed, 158 insertions(+), 2 deletions(-) create mode 100644 doc/developer/semantics/Mz/UnifiedConsolidate.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 0be2f074b3aa7..0abf347ad79e0 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -13,6 +13,7 @@ import Mz.ErrStream import Mz.Pushdown import Mz.DiffSemiring import Mz.UnifiedStream +import Mz.UnifiedConsolidate import Mz.Aggregate import Mz.Consolidate import Mz.Triple diff --git a/doc/developer/semantics/Mz/UnifiedConsolidate.lean b/doc/developer/semantics/Mz/UnifiedConsolidate.lean new file mode 100644 index 0000000000000..c2773afc1f42e --- /dev/null +++ b/doc/developer/semantics/Mz/UnifiedConsolidate.lean @@ -0,0 +1,154 @@ +import Mz.UnifiedStream +import Mz.DiffSemiring + +/-! +# Row-keyed consolidation on `UnifiedStream` + +`compact` in differential dataflow buckets records that share the +same `(row, time)` key and sums their diffs. The diff-only slice +of that operation already lives in `Mz/Consolidate.lean`; this +file lifts the bucketing to `UnifiedStream` so a consolidated +output is itself a `UnifiedStream` (with one record per distinct +carrier and a summed diff). + +The headline property is preservation of the `.error` diff +marker. If any input record carries `.error`, the consolidated +output for that carrier carries `.error`. The semiring's +absorption law (`.error + x = .error`) does the work — the +bucket sum collapses to `.error` the moment one `.error` diff +joins it. + +The skeleton skips the time dimension; per-`(row, time)` +bucketing reduces to per-row consolidation inside each time +slice, so adding times later is mechanical. 
+-/ + +namespace Mz + +/-- Insert `(uc, d)` into a consolidated stream. If a record with +the same carrier already exists, add `d` to its diff. Otherwise +append a new record at the end of the list. -/ +private def consolidateInto (uc : UnifiedRow) (d : DiffWithError Int) : + UnifiedStream → UnifiedStream + | [] => [(uc, d)] + | (uc', d') :: rest => + if uc = uc' then (uc', d + d') :: rest + else (uc', d') :: consolidateInto uc d rest + +/-- Sum diffs per carrier across the stream. Buckets appear in the +order their carriers are first met scanning the input from the +right. The order in which a bucket's diffs are added would matter +only if addition on the base were non-commutative, which it is not +for `Int`. -/ +def UnifiedStream.consolidate : UnifiedStream → UnifiedStream + | [] => [] + | (uc, d) :: rest => + consolidateInto uc d (UnifiedStream.consolidate rest) + +/-! ## Trivial cases -/ + +theorem UnifiedStream.consolidate_nil : + UnifiedStream.consolidate [] = [] := rfl + +theorem UnifiedStream.consolidate_singleton (uc : UnifiedRow) (d : DiffWithError Int) : + UnifiedStream.consolidate [(uc, d)] = [(uc, d)] := rfl + +/-! ## `.error` absorption -/ + +/-- Inserting an `.error` diff into any consolidated stream yields +an output containing the carrier with diff `.error`. Either the +carrier was already in the stream (bucket diff becomes +`.error + d_old = .error`) or it was not (fresh bucket appended).
-/ +private theorem consolidateInto_error_diff + (uc : UnifiedRow) (us : UnifiedStream) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ consolidateInto uc DiffWithError.error us := by + induction us with + | nil => exact List.mem_singleton.mpr rfl + | cons hd tl ih => + obtain ⟨uc', d'⟩ := hd + by_cases hEq : uc = uc' + · subst hEq + show (uc, DiffWithError.error) + ∈ (if uc = uc then (uc, DiffWithError.error + d') :: tl + else (uc, d') :: consolidateInto uc DiffWithError.error tl) + rw [if_pos rfl] + rw [DiffWithError.error_add_left] + exact List.mem_cons_self + · show (uc, DiffWithError.error) + ∈ (if uc = uc' then (uc', DiffWithError.error + d') :: tl + else (uc', d') :: consolidateInto uc DiffWithError.error tl) + rw [if_neg hEq] + exact List.mem_cons_of_mem _ ih + +/-- Inserting any record into a consolidated stream that already +contains `(uc, .error)` leaves the `.error` record in place. If +the inserted key matches `uc`, the bucket diff becomes +`d_new + .error = .error`; otherwise the `.error` record is left +untouched at the head or found in the (possibly rewritten) tail.
-/ +private theorem consolidateInto_preserves_error_mem + (uc' : UnifiedRow) (d' : DiffWithError Int) (us : UnifiedStream) + (uc : UnifiedRow) + (h_mem : (uc, (DiffWithError.error : DiffWithError Int)) ∈ us) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ consolidateInto uc' d' us := by + induction us with + | nil => exact absurd h_mem (List.not_mem_nil) + | cons hd tl ih => + obtain ⟨uc₀, d₀⟩ := hd + rcases List.mem_cons.mp h_mem with hEq | hTail + · -- (uc, .error) = (uc₀, d₀) + have hUc : uc = uc₀ := (Prod.mk.injEq _ _ _ _).mp hEq |>.1 + have hD : (DiffWithError.error : DiffWithError Int) = d₀ := + (Prod.mk.injEq _ _ _ _).mp hEq |>.2 + subst hUc + subst hD + by_cases hEq' : uc' = uc + · show (uc, DiffWithError.error) + ∈ (if uc' = uc then (uc, d' + DiffWithError.error) :: tl + else (uc, DiffWithError.error) :: consolidateInto uc' d' tl) + rw [if_pos hEq'] + rw [DiffWithError.error_add_right] + exact List.mem_cons_self + · show (uc, DiffWithError.error) + ∈ (if uc' = uc then (uc, d' + DiffWithError.error) :: tl + else (uc, DiffWithError.error) :: consolidateInto uc' d' tl) + rw [if_neg hEq'] + exact List.mem_cons_self + · by_cases hEq' : uc' = uc₀ + · show (uc, DiffWithError.error) + ∈ (if uc' = uc₀ then (uc₀, d' + d₀) :: tl + else (uc₀, d₀) :: consolidateInto uc' d' tl) + rw [if_pos hEq'] + exact List.mem_cons_of_mem _ hTail + · show (uc, DiffWithError.error) + ∈ (if uc' = uc₀ then (uc₀, d' + d₀) :: tl + else (uc₀, d₀) :: consolidateInto uc' d' tl) + rw [if_neg hEq'] + exact List.mem_cons_of_mem _ (ih hTail) + +/-- Headline absorption: an `.error` diff anywhere in the input +survives the row-keyed consolidation. 
-/ +theorem UnifiedStream.consolidate_preserves_error + (us : UnifiedStream) (uc : UnifiedRow) + (h_mem : (uc, (DiffWithError.error : DiffWithError Int)) ∈ us) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.consolidate us := by + induction us with + | nil => exact absurd h_mem (List.not_mem_nil) + | cons hd tl ih => + obtain ⟨uc₀, d₀⟩ := hd + rcases List.mem_cons.mp h_mem with hEq | hTail + · have hUc : uc = uc₀ := (Prod.mk.injEq _ _ _ _).mp hEq |>.1 + have hD : (DiffWithError.error : DiffWithError Int) = d₀ := + (Prod.mk.injEq _ _ _ _).mp hEq |>.2 + subst hUc + subst hD + show (uc, DiffWithError.error) + ∈ consolidateInto uc DiffWithError.error (UnifiedStream.consolidate tl) + exact consolidateInto_error_diff uc _ + · show (uc, DiffWithError.error) + ∈ consolidateInto uc₀ d₀ (UnifiedStream.consolidate tl) + exact consolidateInto_preserves_error_mem uc₀ d₀ _ uc (ih hTail) + +end Mz diff --git a/doc/developer/semantics/Mz/UnifiedStream.lean b/doc/developer/semantics/Mz/UnifiedStream.lean index d1a4a8e45db7f..59b1f941703b3 100644 --- a/doc/developer/semantics/Mz/UnifiedStream.lean +++ b/doc/developer/semantics/Mz/UnifiedStream.lean @@ -57,7 +57,7 @@ namespace Mz inductive UnifiedRow where | row (r : Row) | err (e : EvalError) - deriving Inhabited + deriving DecidableEq, Inhabited /-- A unified-stream record pairs a row-or-error carrier with a differential-dataflow diff augmented by the absorbing `error` diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 4b36279427fb7..23334be4d1833 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -29,7 +29,8 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/UnifiedStream.lean`: unified diff-aware alternative to `BagStream`. 
`UnifiedStream := List (UnifiedRow × DiffWithError Int)` pairs a carrier (data row or row-scoped err) with a differential-dataflow diff augmented by the absorbing `error` element. Row-scoped errors flow through the carrier; collection-scoped errors flow through diff multiplication / addition. `ofBag` / `split` conversions assign every bag record a diff of `.val 1`; the round-trip theorem `split (ofBag s) = s` holds. The cross-direction is exact only up to multiset equality on `List EvalError` and is lossy for diffs ≠ `.val 1` (split drops diff information). * `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. `aggTry` for the proposed `try_sum`/`try_min`/`try_max` variants that swallow `err` into `NULL` instead of propagating, defined as a post-pass on `aggStrict`. Theorems: `aggStrict_err` (any `err` input → `err` output), `aggStrict_no_err` (no-err inputs + no-err reducer → no-err output), `aggTry_no_err` (the non-strict variant never errors), and `aggTry_eq_aggStrict_of_no_err` (strict and non-strict agree on error-free inputs). * `Mz/Consolidate.lean`: per-key diff summation over `List (DiffWithError α)`. The headline `sumAll_eq_error_of_mem` proves that an `error` diff anywhere in the list absorbs the consolidated sum to `error`, which is the property a differential dataflow `compact` operator cites when propagating global errors through consolidation. Companion `sumAll_val_of_all_val` says an all-`val` list sums to `val` of some base value. -* `Mz/Triple.lean`: `TimedRecord = (row, time, diff)` triple stream tying `DiffWithError` to differential dataflow's record format. Defines `consolidateAll` and `consolidateAt t`; lifts `Consolidate`'s absorption to per-time and stream-wide statements. Per-`(row, time)` bucketing is a follow-up — needs `DecidableEq` on `Row`. 
+* `Mz/UnifiedConsolidate.lean`: row-keyed diff summation on `UnifiedStream`. `UnifiedStream.consolidate` buckets records by carrier (via `DecidableEq UnifiedRow`) and sums per-bucket diffs. Headline theorem `consolidate_preserves_error` proves the lifted absorption: an `.error` diff anywhere in the input gives an `.error` diff in the consolidated output for that carrier. Helpers `consolidateInto_error_diff` and `consolidateInto_preserves_error_mem` carry the inductive step. +* `Mz/Triple.lean`: `TimedRecord = (row, time, diff)` triple stream tying `DiffWithError` to differential dataflow's record format. Defines `consolidateAll` and `consolidateAt t`; lifts `Consolidate`'s absorption to per-time and stream-wide statements. Per-`(row, time)` bucketing reduces to applying `UnifiedConsolidate` inside each time slice. * `Mz/Join.lean`: relational joins on the diff-aware `UnifiedStream`. `cross` is the cartesian product — carriers combine via `combineCarrier` (rows concatenate; err on either side wins, left first), diffs multiply through `DiffWithError`'s `Mul` instance. A `.error` diff on either input therefore absorbs to `.error` on the output via `DiffWithError.error_mul_{left,right}`. `join pred l r` filters the product through a join predicate. Theorems: `cross_length` (`l.length * r.length`), `filter_length_le` (filter is non-expanding), `join_length_le` (corollary). Diff-propagation theorems: `cross_diff_error_{left,right}` (a `.error` diff on either side propagates through every output record), `filter_preserves_error_diff` (a record carrying `.error` diff is never dropped by `filter` — the absorbing marker cannot be filtered away). * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. 
From 56fba90e77bcd2b381a1d843dd9dd77fa86057d8 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 10:47:38 +0200 Subject: [PATCH 030/127] doc/semantics: cardinality and no-error for consolidate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two companion theorems for `UnifiedStream.consolidate`. `consolidate_length_le`: bucketing only merges, never expands. The output has at most as many records as the input. Helper `consolidateInto_length_le_succ` bounds a single insert at `us.length + 1`. `consolidate_no_error`: if every input diff is `.val n`, every output diff is `.val m`. The diff-semiring's `.val + .val = .val (· + ·)` keeps the consolidated buckets in the ordinary `Int` slice, so `.error` is the only source of absorption. Together with the existing `consolidate_preserves_error`, the three theorems pin down the consolidation operator: absorption on errors, non-expansion on cardinality, and stability on error-free input. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../semantics/Mz/UnifiedConsolidate.lean | 126 ++++++++++++++++++ doc/developer/semantics/README.md | 5 +- 2 files changed, 130 insertions(+), 1 deletion(-) diff --git a/doc/developer/semantics/Mz/UnifiedConsolidate.lean b/doc/developer/semantics/Mz/UnifiedConsolidate.lean index c2773afc1f42e..2f5d053f29890 100644 --- a/doc/developer/semantics/Mz/UnifiedConsolidate.lean +++ b/doc/developer/semantics/Mz/UnifiedConsolidate.lean @@ -151,4 +151,130 @@ theorem UnifiedStream.consolidate_preserves_error ∈ consolidateInto uc₀ d₀ (UnifiedStream.consolidate tl) exact consolidateInto_preserves_error_mem uc₀ d₀ _ uc (ih hTail) +/-! ## Cardinality + +Consolidation is non-expanding. Every input record either lands +in an existing bucket (no length change) or starts a new one +(length grows by one), so the output length is at most the input +length. 
The strict inequality holds when at least one carrier +appears more than once, which the skeleton does not state +separately. -/ + +/-- `consolidateInto` adds at most one record to the bucket list: +either it appends a fresh bucket (length + 1) or it lands inside +an existing bucket (length unchanged). -/ +private theorem consolidateInto_length_le_succ + (uc : UnifiedRow) (d : DiffWithError Int) (us : UnifiedStream) : + (consolidateInto uc d us).length ≤ us.length + 1 := by + induction us with + | nil => exact Nat.le.refl + | cons hd tl ih => + obtain ⟨uc', d'⟩ := hd + by_cases hEq : uc = uc' + · show (if uc = uc' then (uc', d + d') :: tl + else (uc', d') :: consolidateInto uc d tl).length + ≤ (((uc', d') :: tl).length) + 1 + rw [if_pos hEq] + simp [List.length_cons] + · show (if uc = uc' then (uc', d + d') :: tl + else (uc', d') :: consolidateInto uc d tl).length + ≤ (((uc', d') :: tl).length) + 1 + rw [if_neg hEq] + show (consolidateInto uc d tl).length + 1 ≤ tl.length + 1 + 1 + omega + +/-- Output of `consolidate` has length at most the input length. -/ +theorem UnifiedStream.consolidate_length_le (us : UnifiedStream) : + (UnifiedStream.consolidate us).length ≤ us.length := by + induction us with + | nil => exact Nat.le.refl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + show (consolidateInto uc d (UnifiedStream.consolidate tl)).length + ≤ ((uc, d) :: tl).length + have hStep := consolidateInto_length_le_succ uc d (UnifiedStream.consolidate tl) + have hIh : (UnifiedStream.consolidate tl).length + 1 ≤ tl.length + 1 := + Nat.add_le_add_right ih 1 + show (consolidateInto uc d (UnifiedStream.consolidate tl)).length ≤ tl.length + 1 + exact Nat.le_trans hStep hIh + +/-! ## No-error preservation + +If every input diff is a `.val`, every output diff is a `.val`. +The semiring's `.val + .val = .val (· + ·)` keeps absorption from +firing. -/ + +/-- Inserting a `.val` diff into a list whose every record has +`.val` diff yields a list whose every record has `.val` diff. 
-/ +private theorem consolidateInto_no_error + (uc : UnifiedRow) (n : Int) (us : UnifiedStream) + (h : ∀ r ∈ us, ∃ m : Int, r.2 = DiffWithError.val m) : + ∀ r ∈ consolidateInto uc (DiffWithError.val n) us, + ∃ m : Int, r.2 = DiffWithError.val m := by + induction us with + | nil => + intro r hMem + have : r = (uc, DiffWithError.val n) := List.mem_singleton.mp hMem + exact ⟨n, by rw [this]⟩ + | cons hd tl ih => + obtain ⟨uc', d'⟩ := hd + have hHd : ∃ m : Int, d' = DiffWithError.val m := by + have := h (uc', d') (List.mem_cons_self) + exact this + have hTl : ∀ r ∈ tl, ∃ m : Int, r.2 = DiffWithError.val m := + fun r hMem => h r (List.mem_cons_of_mem _ hMem) + obtain ⟨m, hM⟩ := hHd + intro r hMem + by_cases hEq : uc = uc' + · have hOut : consolidateInto uc (DiffWithError.val n) ((uc', d') :: tl) + = (uc', DiffWithError.val n + d') :: tl := by + show (if uc = uc' then (uc', DiffWithError.val n + d') :: tl + else (uc', d') :: consolidateInto uc (DiffWithError.val n) tl) + = (uc', DiffWithError.val n + d') :: tl + rw [if_pos hEq] + rw [hOut] at hMem + rcases List.mem_cons.mp hMem with hHead | hTail' + · subst hHead + rw [hM] + show ∃ m' : Int, DiffWithError.val n + DiffWithError.val m + = DiffWithError.val m' + exact ⟨n + m, rfl⟩ + · exact hTl r hTail' + · have hOut : consolidateInto uc (DiffWithError.val n) ((uc', d') :: tl) + = (uc', d') :: consolidateInto uc (DiffWithError.val n) tl := by + show (if uc = uc' then (uc', DiffWithError.val n + d') :: tl + else (uc', d') :: consolidateInto uc (DiffWithError.val n) tl) + = (uc', d') :: consolidateInto uc (DiffWithError.val n) tl + rw [if_neg hEq] + rw [hOut] at hMem + rcases List.mem_cons.mp hMem with hHead | hTail' + · subst hHead + exact ⟨m, hM⟩ + · exact ih hTl r hTail' + +/-- Headline no-error: if every input diff is `.val`, every +output diff is `.val`. The consolidated total stays in the +ordinary `Int` slice of the diff-semiring. 
-/ +theorem UnifiedStream.consolidate_no_error + (us : UnifiedStream) + (h : ∀ r ∈ us, ∃ n : Int, r.2 = DiffWithError.val n) : + ∀ r ∈ UnifiedStream.consolidate us, + ∃ n : Int, r.2 = DiffWithError.val n := by + induction us with + | nil => intro r hMem; exact absurd hMem (List.not_mem_nil) + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hHd : ∃ n : Int, d = DiffWithError.val n := + h (uc, d) List.mem_cons_self + have hTl : ∀ r ∈ tl, ∃ n : Int, r.2 = DiffWithError.val n := + fun r hMem => h r (List.mem_cons_of_mem _ hMem) + obtain ⟨n, hN⟩ := hHd + have hConsTl : ∀ r ∈ UnifiedStream.consolidate tl, + ∃ n : Int, r.2 = DiffWithError.val n := + ih hTl + intro r hMem + have : r ∈ consolidateInto uc d (UnifiedStream.consolidate tl) := hMem + rw [hN] at this + exact consolidateInto_no_error uc n _ hConsTl r this + end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 23334be4d1833..89607883a016b 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -29,7 +29,10 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/UnifiedStream.lean`: unified diff-aware alternative to `BagStream`. `UnifiedStream := List (UnifiedRow × DiffWithError Int)` pairs a carrier (data row or row-scoped err) with a differential-dataflow diff augmented by the absorbing `error` element. Row-scoped errors flow through the carrier; collection-scoped errors flow through diff multiplication / addition. `ofBag` / `split` conversions assign every bag record a diff of `.val 1`; the round-trip theorem `split (ofBag s) = s` holds. The cross-direction is exact only up to multiset equality on `List EvalError` and is lossy for diffs ≠ `.val 1` (split drops diff information). * `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. 
`aggTry` for the proposed `try_sum`/`try_min`/`try_max` variants that swallow `err` into `NULL` instead of propagating, defined as a post-pass on `aggStrict`. Theorems: `aggStrict_err` (any `err` input → `err` output), `aggStrict_no_err` (no-err inputs + no-err reducer → no-err output), `aggTry_no_err` (the non-strict variant never errors), and `aggTry_eq_aggStrict_of_no_err` (strict and non-strict agree on error-free inputs). * `Mz/Consolidate.lean`: per-key diff summation over `List (DiffWithError α)`. The headline `sumAll_eq_error_of_mem` proves that an `error` diff anywhere in the list absorbs the consolidated sum to `error`, which is the property a differential dataflow `compact` operator cites when propagating global errors through consolidation. Companion `sumAll_val_of_all_val` says an all-`val` list sums to `val` of some base value. -* `Mz/UnifiedConsolidate.lean`: row-keyed diff summation on `UnifiedStream`. `UnifiedStream.consolidate` buckets records by carrier (via `DecidableEq UnifiedRow`) and sums per-bucket diffs. Headline theorem `consolidate_preserves_error` proves the lifted absorption: an `.error` diff anywhere in the input gives an `.error` diff in the consolidated output for that carrier. Helpers `consolidateInto_error_diff` and `consolidateInto_preserves_error_mem` carry the inductive step. +* `Mz/UnifiedConsolidate.lean`: row-keyed diff summation on `UnifiedStream`. `UnifiedStream.consolidate` buckets records by carrier (via `DecidableEq UnifiedRow`) and sums per-bucket diffs. 
Theorems cover three properties: + *absorption* — `consolidate_preserves_error` proves an `.error` diff anywhere in the input gives an `.error` diff in the consolidated output for that carrier; + *cardinality* — `consolidate_length_le` bounds the output by the input length (consolidation only merges, never expands); + *no-error preservation* — `consolidate_no_error` proves that if every input diff is a `.val`, every output diff is a `.val`, so `.error` is the only source of absorption. * `Mz/Triple.lean`: `TimedRecord = (row, time, diff)` triple stream tying `DiffWithError` to differential dataflow's record format. Defines `consolidateAll` and `consolidateAt t`; lifts `Consolidate`'s absorption to per-time and stream-wide statements. Per-`(row, time)` bucketing reduces to applying `UnifiedConsolidate` inside each time slice. * `Mz/Join.lean`: relational joins on the diff-aware `UnifiedStream`. `cross` is the cartesian product — carriers combine via `combineCarrier` (rows concatenate; err on either side wins, left first), diffs multiply through `DiffWithError`'s `Mul` instance. A `.error` diff on either input therefore absorbs to `.error` on the output via `DiffWithError.error_mul_{left,right}`. `join pred l r` filters the product through a join predicate. Theorems: `cross_length` (`l.length * r.length`), `filter_length_le` (filter is non-expanding), `join_length_le` (corollary). Diff-propagation theorems: `cross_diff_error_{left,right}` (a `.error` diff on either side propagates through every output record), `filter_preserves_error_diff` (a record carrying `.error` diff is never dropped by `filter` — the absorbing marker cannot be filtered away). * `Mz/GroupBy.lean`: two grouping primitives. 
From 8dd1e56d05409a2a621cee41748f6fc2388e1c16 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 10:54:44 +0200 Subject: [PATCH 031/127] doc/semantics: groupByErrDistinct = groupBy on no-err inputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When every row's key evaluates to a non-err `Datum`, the two grouping primitives produce identical output. `Datum.groupKeyEq` reduces to `decide (· = ·)` on non-err inputs, so the if-branches of `insertIntoDistinct` and `insertInto` agree. Three supporting lemmas thread the invariant through the foldr: * `Datum.groupKeyEq_eq_decide_of_no_err`: the boolean predicate reduces to `decide` of propositional equality whenever both sides are non-err. * `insertInto_preserves_non_err_keys`: inserting a non-err key into a list of non-err keys produces a list of non-err keys. * `groupBy_keys_non_err`: the `groupBy` accumulator's keys are all non-err whenever the input row keys are. Headline `groupByErrDistinct_eq_groupBy_of_no_err` chains the three together. Closes the agreement follow-up from the err- distinct keying commit. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/GroupBy.lean | 140 ++++++++++++++++++++++++ doc/developer/semantics/README.md | 2 +- 2 files changed, 141 insertions(+), 1 deletion(-) diff --git a/doc/developer/semantics/Mz/GroupBy.lean b/doc/developer/semantics/Mz/GroupBy.lean index 4b59a6b03ed44..ea258b6b963ab 100644 --- a/doc/developer/semantics/Mz/GroupBy.lean +++ b/doc/developer/semantics/Mz/GroupBy.lean @@ -147,6 +147,146 @@ theorem groupByErrDistinct_length_of_all_err rw [heq, insertIntoDistinct_err, List.length_append, ihApp] simp [List.length_cons] +/-! 
## Agreement between `groupBy` and `groupByErrDistinct` + +When no row's key evaluates to `.err`, the two grouping primitives +produce the same result: `Datum.groupKeyEq` reduces to ordinary +equality on non-err keys, and the accumulators thread through the +foldr identically. -/ + +/-- On non-err inputs, `Datum.groupKeyEq` agrees with `decide` of +ordinary equality. -/ +private theorem Datum.groupKeyEq_eq_decide_of_no_err + {a b : Datum} (hA : ¬ a.IsErr) (hB : ¬ b.IsErr) : + Datum.groupKeyEq a b = decide (a = b) := by + cases a with + | err _ => exact absurd (show Datum.IsErr (Datum.err _) from trivial) hA + | bool _ => + cases b with + | err _ => exact absurd (show Datum.IsErr (Datum.err _) from trivial) hB + | bool _ => rfl + | null => rfl + | null => + cases b with + | err _ => exact absurd (show Datum.IsErr (Datum.err _) from trivial) hB + | bool _ => rfl + | null => rfl + +/-- Inserting the same non-err key into a group list whose keys +are all non-err yields the same result whether we use the +err-distinct or the merge-on-equal variant. 
-/ +private theorem insertIntoDistinct_eq_insertInto + {k : Datum} {row : Row} {groups : List (Datum × Relation)} + (hK : ¬ k.IsErr) + (hGroups : ∀ g ∈ groups, ¬ g.1.IsErr) : + insertIntoDistinct k row groups = insertInto k row groups := by + induction groups with + | nil => rfl + | cons head tl ih => + obtain ⟨k', rows⟩ := head + have hK' : ¬ k'.IsErr := hGroups (k', rows) List.mem_cons_self + have hTl : ∀ g ∈ tl, ¬ g.1.IsErr := + fun g hMem => hGroups g (List.mem_cons_of_mem _ hMem) + have hKey := Datum.groupKeyEq_eq_decide_of_no_err hK hK' + by_cases hEq : k = k' + · show (if Datum.groupKeyEq k k' then (k', row :: rows) :: tl + else (k', rows) :: insertIntoDistinct k row tl) + = (if k = k' then (k', row :: rows) :: tl + else (k', rows) :: insertInto k row tl) + have hKeyTrue : Datum.groupKeyEq k k' = true := by + rw [hKey]; exact decide_eq_true hEq + rw [if_pos hKeyTrue, if_pos hEq] + · show (if Datum.groupKeyEq k k' then (k', row :: rows) :: tl + else (k', rows) :: insertIntoDistinct k row tl) + = (if k = k' then (k', row :: rows) :: tl + else (k', rows) :: insertInto k row tl) + have hKeyFalse : Datum.groupKeyEq k k' = false := by + rw [hKey]; exact decide_eq_false hEq + rw [if_neg (by simp [hKeyFalse]), if_neg hEq, ih hTl] + +/-- `insertInto` propagates the "no err keys" invariant from its +input bucket list to its output: if the inserted key is non-err +and every existing bucket has a non-err key, every bucket in the +output has a non-err key. 
-/ +private theorem insertInto_preserves_non_err_keys + {k : Datum} {row : Row} {groups : List (Datum × Relation)} + (hK : ¬ k.IsErr) + (hGroups : ∀ g ∈ groups, ¬ g.1.IsErr) : + ∀ g ∈ insertInto k row groups, ¬ g.1.IsErr := by + induction groups with + | nil => + intro g hMem + have : g = (k, [row]) := List.mem_singleton.mp hMem + rw [this] + exact hK + | cons head tl ih => + obtain ⟨k', rows⟩ := head + have hK' : ¬ k'.IsErr := hGroups (k', rows) List.mem_cons_self + have hTl : ∀ g ∈ tl, ¬ g.1.IsErr := + fun g hMem => hGroups g (List.mem_cons_of_mem _ hMem) + intro g hMem + show ¬ g.1.IsErr + by_cases hEq : k = k' + · have hOut : insertInto k row ((k', rows) :: tl) + = (k', row :: rows) :: tl := by + show (if k = k' then (k', row :: rows) :: tl + else (k', rows) :: insertInto k row tl) + = (k', row :: rows) :: tl + rw [if_pos hEq] + rw [hOut] at hMem + rcases List.mem_cons.mp hMem with hHead | hTail + · subst hHead; exact hK' + · exact hTl g hTail + · have hOut : insertInto k row ((k', rows) :: tl) + = (k', rows) :: insertInto k row tl := by + show (if k = k' then (k', row :: rows) :: tl + else (k', rows) :: insertInto k row tl) + = (k', rows) :: insertInto k row tl + rw [if_neg hEq] + rw [hOut] at hMem + rcases List.mem_cons.mp hMem with hHead | hTail + · subst hHead; exact hK' + · exact ih hTl g hTail + +/-- Invariant: when every row's key evaluates to a non-err +`Datum`, every bucket in the `groupBy` output also has a non-err +key. 
-/ +private theorem groupBy_keys_non_err + {keyExpr : Expr} {rel : Relation} + (h : ∀ row ∈ rel, ¬ (eval row keyExpr).IsErr) : + ∀ g ∈ groupBy keyExpr rel, ¬ g.1.IsErr := by + induction rel with + | nil => intro g hMem; exact absurd hMem (List.not_mem_nil) + | cons head tl ih => + have hHd : ¬ (eval head keyExpr).IsErr := h head List.mem_cons_self + have hTl : ∀ row ∈ tl, ¬ (eval row keyExpr).IsErr := + fun row hMem => h row (List.mem_cons_of_mem _ hMem) + have ihGroups := ih hTl + show ∀ g ∈ insertInto (eval head keyExpr) head (groupBy keyExpr tl), + ¬ g.1.IsErr + exact insertInto_preserves_non_err_keys hHd ihGroups + +/-- Agreement theorem: when every row's key evaluates to a +non-err `Datum`, the err-distinct grouping is exactly the +merge-on-equal grouping. -/ +theorem groupByErrDistinct_eq_groupBy_of_no_err + (keyExpr : Expr) (rel : Relation) + (h : ∀ row ∈ rel, ¬ (eval row keyExpr).IsErr) : + groupByErrDistinct keyExpr rel = groupBy keyExpr rel := by + induction rel with + | nil => rfl + | cons head tl ih => + have hHd : ¬ (eval head keyExpr).IsErr := h head List.mem_cons_self + have hTl : ∀ row ∈ tl, ¬ (eval row keyExpr).IsErr := + fun row hMem => h row (List.mem_cons_of_mem _ hMem) + have ihEq : groupByErrDistinct keyExpr tl = groupBy keyExpr tl := ih hTl + have hAllNonErr : ∀ g ∈ groupBy keyExpr tl, ¬ g.1.IsErr := + groupBy_keys_non_err hTl + show insertIntoDistinct (eval head keyExpr) head (groupByErrDistinct keyExpr tl) + = insertInto (eval head keyExpr) head (groupBy keyExpr tl) + rw [ihEq] + exact insertIntoDistinct_eq_insertInto hHd hAllNonErr + /-! 
## Cardinality The sum of group sizes equals the input relation's length: no row diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 89607883a016b..7019bee263e3f 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -40,6 +40,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `groupByErrDistinct keyExpr rel` uses the spec-faithful `Datum.groupKeyEq`, which returns `false` whenever either side is `.err`, so every err key produces its own singleton group. Theorem `insertIntoDistinct_err` proves the err-key insertion always appends a fresh group; `groupByErrDistinct_length_of_all_err` derives the consequence — every err-keyed row contributes one output group. Headline cardinality `totalRows_groupBy` (and its err-distinct variant `totalRows_groupByErrDistinct`) state that the sum of group sizes equals the input relation's length — no row is lost or duplicated by partitioning. + Agreement theorem `groupByErrDistinct_eq_groupBy_of_no_err` proves the two variants coincide when no row's key evaluates to `.err`. Supporting invariants `insertInto_preserves_non_err_keys` and `groupBy_keys_non_err` thread the "no err keys in the accumulator" property through the foldr. Companion `aggregateBy` / `aggregateByErrDistinct` run `aggStrict` per group, modeling `SELECT keyExpr, AGG(valExpr) ... GROUP BY keyExpr`. ## What is not here @@ -89,7 +90,6 @@ The roadmap in priority order: * `BagStream.project` / `BagStream.filter` commutativity (when the predicate references only un-projected columns). Same multiset-equality caveat on the error collection. * Tie `DiffWithError` to a concrete dataflow operator: model a `(Row, Time, DiffWithError ℤ)` triple stream and prove that an `error` diff at time `t` propagates to every downstream consolidation. * Joins on `BagStream` with explicit error propagation. 
-* Agreement theorem `groupByErrDistinct = groupBy` whenever no key evaluates to `.err`. Requires an invariant on the accumulator (no existing key is an err) propagated through the foldr. * Sketch a proof of `cross_assoc` modulo row concatenation associativity. Left-wins error rule is consistent in both nestings; the residual obligation is `(la ++ lb) ++ lc = la ++ (lb ++ lc)` lifted into the row carrier. * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. From 406f717ae2b220b3feeff51d3eae2dfe6ac78752 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 10:57:29 +0200 Subject: [PATCH 032/127] doc/semantics: per-(row, time) consolidation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `Mz/TimedConsolidate.lean` lifts `UnifiedStream.consolidate` to a joint `(row, time)` key by chaining a time-slice filter with the existing per-row consolidator. * `TimedUnifiedRecord := UnifiedRow × Nat × DiffWithError Int`. * `atTime t` projects to one time slice and drops the time field. * `consolidateAtTime t = consolidate ∘ atTime t`. Theorems: * `mem_atTime_of_mem`: a record at time `t` survives the slice. * `consolidateAtTime_preserves_error`: an `.error` diff at time `t` survives both the slice and the per-row consolidation, closing the absorbing-diff propagation story for the joint key. * `atTime_length_le` and `consolidateAtTime_length_le`: both steps are non-expanding. No new generic machinery — the joint key decomposes into "filter by time, then bucket by row", so existing absorption / non-expansion results transport directly. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + .../semantics/Mz/TimedConsolidate.lean | 123 ++++++++++++++++++ doc/developer/semantics/README.md | 3 +- 3 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 doc/developer/semantics/Mz/TimedConsolidate.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 0abf347ad79e0..98f2803ce1cb7 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -14,6 +14,7 @@ import Mz.Pushdown import Mz.DiffSemiring import Mz.UnifiedStream import Mz.UnifiedConsolidate +import Mz.TimedConsolidate import Mz.Aggregate import Mz.Consolidate import Mz.Triple diff --git a/doc/developer/semantics/Mz/TimedConsolidate.lean b/doc/developer/semantics/Mz/TimedConsolidate.lean new file mode 100644 index 0000000000000..13225e8f81fd3 --- /dev/null +++ b/doc/developer/semantics/Mz/TimedConsolidate.lean @@ -0,0 +1,123 @@ +import Mz.UnifiedStream +import Mz.UnifiedConsolidate +import Mz.DiffSemiring + +/-! +# Per-`(row, time)` consolidation + +`Mz/UnifiedConsolidate.lean` buckets records by carrier and sums +diffs per bucket. Differential dataflow buckets by the joint key +`(row, time)` instead. This file lifts the row-only consolidator +into the timed setting by isolating each time slice with +`atTime t` and running `UnifiedStream.consolidate` on the slice. + +A `TimedUnifiedRecord` is a `(UnifiedRow, Nat, DiffWithError Int)` +triple — carrier, time, diff. `TimedUnifiedStream.atTime t` keeps +the records at time `t` and forgets the time component, producing +an ordinary `UnifiedStream`. Composing with `consolidate` gives +`consolidateAtTime`. + +The headline theorem `consolidateAtTime_preserves_error` proves +that an `.error` diff at time `t` survives both the time-slice +filter and the per-row consolidation — the absorbing diff marker +propagates through the joint key. 
Cardinality follows from +`consolidate_length_le` plus the obvious bound on `atTime`. +-/ + +namespace Mz + +/-- A timed record on the unified stream: carrier, time, diff. -/ +abbrev TimedUnifiedRecord := UnifiedRow × Nat × DiffWithError Int + +/-- Differential-dataflow-style stream of timed unified records. -/ +abbrev TimedUnifiedStream := List TimedUnifiedRecord + +/-- Project a timed stream to the time slice at `t`. Records at +other times are dropped; the time component is forgotten. -/ +def TimedUnifiedStream.atTime (t : Nat) (s : TimedUnifiedStream) : UnifiedStream := + s.filterMap fun r => + if r.2.1 = t then some (r.1, r.2.2) else none + +/-- Bucket records at time `t` by carrier and sum their diffs. -/ +def TimedUnifiedStream.consolidateAtTime (t : Nat) (s : TimedUnifiedStream) : + UnifiedStream := + UnifiedStream.consolidate (TimedUnifiedStream.atTime t s) + +/-! ## Trivial cases -/ + +theorem TimedUnifiedStream.atTime_nil (t : Nat) : + TimedUnifiedStream.atTime t [] = [] := rfl + +theorem TimedUnifiedStream.consolidateAtTime_nil (t : Nat) : + TimedUnifiedStream.consolidateAtTime t [] = [] := rfl + +/-! ## Time-slice extraction -/ + +/-- A record present at time `t` shows up in the time slice +`atTime t` with its carrier and diff. 
-/ +theorem TimedUnifiedStream.mem_atTime_of_mem + {t : Nat} {s : TimedUnifiedStream} + {uc : UnifiedRow} {d : DiffWithError Int} + (h_mem : (uc, t, d) ∈ s) : + (uc, d) ∈ TimedUnifiedStream.atTime t s := by + induction s with + | nil => exact absurd h_mem List.not_mem_nil + | cons hd tl ih => + rcases List.mem_cons.mp h_mem with hHead | hTail + · subst hHead + show (uc, d) ∈ ((uc, t, d) :: tl).filterMap fun r => + if r.2.1 = t then some (r.1, r.2.2) else none + simp + · have ihMem := ih hTail + show (uc, d) ∈ (hd :: tl).filterMap fun r => + if r.2.1 = t then some (r.1, r.2.2) else none + rw [List.filterMap_cons] + cases hCond : (if hd.2.1 = t then some (hd.1, hd.2.2) else (none : Option _)) + case none => exact ihMem + case some hdSlice => exact List.mem_cons_of_mem _ ihMem + +/-! ## `.error` absorption -/ + +/-- An `.error` diff at time `t` survives the per-`(row, time)` +consolidation: the consolidated output at time `t` carries the +carrier with `.error` diff. -/ +theorem TimedUnifiedStream.consolidateAtTime_preserves_error + (t : Nat) (s : TimedUnifiedStream) (uc : UnifiedRow) + (h_mem : (uc, t, (DiffWithError.error : DiffWithError Int)) ∈ s) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ TimedUnifiedStream.consolidateAtTime t s := by + have hSlice : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ TimedUnifiedStream.atTime t s := + TimedUnifiedStream.mem_atTime_of_mem h_mem + exact UnifiedStream.consolidate_preserves_error _ uc hSlice + +/-! ## Cardinality -/ + +/-- `atTime` is non-expanding: each input record contributes at +most one output record (it is either kept with its time stripped +or dropped). 
-/ +theorem TimedUnifiedStream.atTime_length_le (t : Nat) (s : TimedUnifiedStream) : + (TimedUnifiedStream.atTime t s).length ≤ s.length := by + unfold TimedUnifiedStream.atTime + induction s with + | nil => exact Nat.le.refl + | cons hd tl ih => + rw [List.filterMap_cons, List.length_cons] + by_cases hT : hd.2.1 = t + · rw [if_pos hT, List.length_cons] + exact Nat.add_le_add_right ih 1 + · rw [if_neg hT] + exact Nat.le_trans ih (Nat.le_succ _) + +/-- Cardinality of the per-time consolidation, chained from +`atTime_length_le` and `consolidate_length_le`. -/ +theorem TimedUnifiedStream.consolidateAtTime_length_le + (t : Nat) (s : TimedUnifiedStream) : + (TimedUnifiedStream.consolidateAtTime t s).length ≤ s.length := by + unfold TimedUnifiedStream.consolidateAtTime + exact Nat.le_trans + (UnifiedStream.consolidate_length_le _) + (TimedUnifiedStream.atTime_length_le t s) + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 7019bee263e3f..e3311aae860d1 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -29,6 +29,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/UnifiedStream.lean`: unified diff-aware alternative to `BagStream`. `UnifiedStream := List (UnifiedRow × DiffWithError Int)` pairs a carrier (data row or row-scoped err) with a differential-dataflow diff augmented by the absorbing `error` element. Row-scoped errors flow through the carrier; collection-scoped errors flow through diff multiplication / addition. `ofBag` / `split` conversions assign every bag record a diff of `.val 1`; the round-trip theorem `split (ofBag s) = s` holds. The cross-direction is exact only up to multiset equality on `List EvalError` and is lossy for diffs ≠ `.val 1` (split drops diff information). * `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. 
`aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. `aggTry` for the proposed `try_sum`/`try_min`/`try_max` variants that swallow `err` into `NULL` instead of propagating, defined as a post-pass on `aggStrict`. Theorems: `aggStrict_err` (any `err` input → `err` output), `aggStrict_no_err` (no-err inputs + no-err reducer → no-err output), `aggTry_no_err` (the non-strict variant never errors), and `aggTry_eq_aggStrict_of_no_err` (strict and non-strict agree on error-free inputs). * `Mz/Consolidate.lean`: per-key diff summation over `List (DiffWithError α)`. The headline `sumAll_eq_error_of_mem` proves that an `error` diff anywhere in the list absorbs the consolidated sum to `error`, which is the property a differential dataflow `compact` operator cites when propagating global errors through consolidation. Companion `sumAll_val_of_all_val` says an all-`val` list sums to `val` of some base value. +* `Mz/TimedConsolidate.lean`: per-`(row, time)` consolidation. `TimedUnifiedStream := List (UnifiedRow × Nat × DiffWithError Int)` carries records with time. `atTime t` projects to one time slice (dropping the time component); `consolidateAtTime t` chains it with `UnifiedStream.consolidate`. Theorems: `consolidateAtTime_preserves_error` (an `.error` diff at time `t` survives both filter and consolidation), `atTime_length_le` and `consolidateAtTime_length_le` (both non-expanding). Decomposes the joint key into "filter by time, then consolidate by row". * `Mz/UnifiedConsolidate.lean`: row-keyed diff summation on `UnifiedStream`. `UnifiedStream.consolidate` buckets records by carrier (via `DecidableEq UnifiedRow`) and sums per-bucket diffs. 
Theorems cover three properties: *absorption* — `consolidate_preserves_error` proves an `.error` diff anywhere in the input gives an `.error` diff in the consolidated output for that carrier; *cardinality* — `consolidate_length_le` bounds the output by the input length (consolidation only merges, never expands); @@ -90,7 +91,7 @@ The roadmap in priority order: * `BagStream.project` / `BagStream.filter` commutativity (when the predicate references only un-projected columns). Same multiset-equality caveat on the error collection. * Tie `DiffWithError` to a concrete dataflow operator: model a `(Row, Time, DiffWithError ℤ)` triple stream and prove that an `error` diff at time `t` propagates to every downstream consolidation. * Joins on `BagStream` with explicit error propagation. -* Sketch a proof of `cross_assoc` modulo row concatenation associativity. Left-wins error rule is consistent in both nestings; the residual obligation is `(la ++ lb) ++ lc = la ++ (lb ++ lc)` lifted into the row carrier. +* Sketch a proof of `cross_assoc` modulo row concatenation associativity. Left-wins error rule is consistent in both nestings; diff multiplication is associative on the base, so the diff side follows from the semiring laws; the residual obligation is `(la ++ lb) ++ lc = la ++ (lb ++ lc)` lifted into the row carrier. * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. From 422bfb5969330fb6de5ef1a5c39181979eb4a2ff Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 11:06:10 +0200 Subject: [PATCH 033/127] doc/semantics: cross is associative MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three pieces: * `DiffWithError.mul_assoc`: lifts associativity of `*` from the base type into the augmented semiring. 
Any `.error` makes both sides equal to `.error`; on `.val` triples it reduces to the base associativity hypothesis. * `combineCarrier_assoc`: carrier combine is associative on the unified-row level. On the `row, row, row` case it reduces to `List.append_assoc`; the seven err-mixed cases all close by `rfl` because the left-wins rule absorbs the same way in both nestings. * `UnifiedStream.cross_assoc`: the headline relational law `(a × b) × c = a × (b × c)` on the diff-aware stream. The proof rearranges the nested `flatMap` / `map` via local list-monad lemmas (`flatMap_flatMap`, `flatMap_map`, `map_flatMap`, plus `flatMap_congr` / `map_congr`) into a common triple-fold and closes per record via `cross_step_assoc` (one position of the carrier laws + one position of the diff law). The list-monad lemmas are stated inline as private `_local` variants since Lean core lacks them under these names. Once Mathlib lands they reduce to one-liners. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/DiffSemiring.lean | 29 ++++ doc/developer/semantics/Mz/Join.lean | 134 +++++++++++++++++++ doc/developer/semantics/README.md | 3 +- 3 files changed, 164 insertions(+), 2 deletions(-) diff --git a/doc/developer/semantics/Mz/DiffSemiring.lean b/doc/developer/semantics/Mz/DiffSemiring.lean index 203b39641c46c..8bb7c73faaab0 100644 --- a/doc/developer/semantics/Mz/DiffSemiring.lean +++ b/doc/developer/semantics/Mz/DiffSemiring.lean @@ -170,6 +170,35 @@ theorem mul_add [Mul α] [Add α] | val _ => rfl | error => rfl +/-! 
## Associativity of `*` (when the base has it) -/ + +theorem mul_assoc [Mul α] + (h_assoc : ∀ x y z : α, (x * y) * z = x * (y * z)) + (a b c : DiffWithError α) : (a * b) * c = a * (b * c) := by + cases a with + | val x => + cases b with + | val y => + cases c with + | val z => + show (val ((x * y) * z) : DiffWithError α) = val (x * (y * z)) + rw [h_assoc] + | error => rfl + | error => + cases c with + | val _ => rfl + | error => rfl + | error => + cases b with + | val _ => + cases c with + | val _ => rfl + | error => rfl + | error => + cases c with + | val _ => rfl + | error => rfl + end DiffWithError end Mz diff --git a/doc/developer/semantics/Mz/Join.lean b/doc/developer/semantics/Mz/Join.lean index 812c1deb8a4eb..f690a003b5b24 100644 --- a/doc/developer/semantics/Mz/Join.lean +++ b/doc/developer/semantics/Mz/Join.lean @@ -198,4 +198,138 @@ theorem UnifiedStream.filter_preserves_error_diff rw [List.flatMap_cons] exact List.mem_append.mpr (.inr (ih hTail)) +/-! ## Associativity of `cross` + +`cross` is associative modulo associativity of row concatenation +on the carrier and associativity of diff multiplication on the +diff component. The combinatorial structure (one output per +triple) is identical in both nestings; the only obligation is +that the two ways of folding three carriers / diffs together +agree. -/ + +/-- `combineCarrier` is associative modulo `List.append_assoc` on +the row case, and trivially so on the err cases (left-wins). 
-/ +theorem combineCarrier_assoc (a b c : UnifiedRow) : + combineCarrier (combineCarrier a b) c = combineCarrier a (combineCarrier b c) := by + cases a with + | row la => + cases b with + | row lb => + cases c with + | row lc => show UnifiedRow.row ((la ++ lb) ++ lc) + = UnifiedRow.row (la ++ (lb ++ lc)) + rw [List.append_assoc] + | err _ => rfl + | err _ => + cases c with + | row _ => rfl + | err _ => rfl + | err _ => + cases b with + | row _ => + cases c with + | row _ => rfl + | err _ => rfl + | err _ => + cases c with + | row _ => rfl + | err _ => rfl + +/-- Per-record associativity of the cross-product building block. +The diff side uses `DiffWithError.mul_assoc` instantiated at +`Int`. -/ +private theorem cross_step_assoc + (ad bd cd : UnifiedRow × DiffWithError Int) : + (combineCarrier (combineCarrier ad.1 bd.1) cd.1, (ad.2 * bd.2) * cd.2) + = (combineCarrier ad.1 (combineCarrier bd.1 cd.1), ad.2 * (bd.2 * cd.2)) := by + congr 1 + · exact combineCarrier_assoc ad.1 bd.1 cd.1 + · exact DiffWithError.mul_assoc Int.mul_assoc ad.2 bd.2 cd.2 + +/-- Local lemma: associativity of `flatMap`. Lean core has the +building blocks (`flatMap_cons`, `flatMap_append`) but not the +joint statement at this name. -/ +private theorem List.flatMap_flatMap_local {α β γ : Type} + (l : List α) (f : α → List β) (g : β → List γ) : + (l.flatMap f).flatMap g = l.flatMap (fun a => (f a).flatMap g) := by + induction l with + | nil => rfl + | cons hd tl ih => + show ((hd :: tl).flatMap f).flatMap g + = (hd :: tl).flatMap (fun a => (f a).flatMap g) + rw [List.flatMap_cons, List.flatMap_append, ih] + rfl + +/-- Local lemma: pushing a `map` inside a `flatMap`. 
-/ +private theorem List.map_flatMap_local {α β γ : Type} + (l : List α) (f : α → List β) (g : β → γ) : + (l.flatMap f).map g = l.flatMap (fun a => (f a).map g) := by + induction l with + | nil => rfl + | cons hd tl ih => + show ((hd :: tl).flatMap f).map g + = (hd :: tl).flatMap (fun a => (f a).map g) + rw [List.flatMap_cons, List.map_append, ih] + rfl + +/-- Local lemma: `flatMap` of a `map`. -/ +private theorem List.flatMap_map_local {α β γ : Type} + (l : List α) (f : α → β) (g : β → List γ) : + (l.map f).flatMap g = l.flatMap (fun a => g (f a)) := by + induction l with + | nil => rfl + | cons hd tl ih => + show ((hd :: tl).map f).flatMap g + = (hd :: tl).flatMap (fun a => g (f a)) + rw [List.map_cons, List.flatMap_cons, List.flatMap_cons, ih] + +/-- Local lemma: pointwise-equal bodies give equal `flatMap`s. -/ +private theorem List.flatMap_congr_local {α β : Type} + {l : List α} {f g : α → List β} + (h : ∀ x ∈ l, f x = g x) : + l.flatMap f = l.flatMap g := by + induction l with + | nil => rfl + | cons hd tl ih => + rw [List.flatMap_cons, List.flatMap_cons, + h hd List.mem_cons_self, + ih (fun x hMem => h x (List.mem_cons_of_mem _ hMem))] + +/-- Local lemma: pointwise-equal bodies give equal `map`s. -/ +private theorem List.map_congr_local {α β : Type} + {l : List α} {f g : α → β} + (h : ∀ x ∈ l, f x = g x) : + l.map f = l.map g := by + induction l with + | nil => rfl + | cons hd tl ih => + rw [List.map_cons, List.map_cons, + h hd List.mem_cons_self, + ih (fun x hMem => h x (List.mem_cons_of_mem _ hMem))] + +/-- Cross is associative on the unified stream. The proof rewrites +both sides into a common triple-fold via the list-monad equations +and closes the leaves with `cross_step_assoc`. 
-/ +theorem UnifiedStream.cross_assoc (a b c : UnifiedStream) : + UnifiedStream.cross (UnifiedStream.cross a b) c + = UnifiedStream.cross a (UnifiedStream.cross b c) := by + show (a.flatMap fun ad => b.map fun bd => + (combineCarrier ad.1 bd.1, ad.2 * bd.2)).flatMap + (fun abd => c.map fun cd => + (combineCarrier abd.1 cd.1, abd.2 * cd.2)) + = a.flatMap fun ad => + (b.flatMap fun bd => c.map fun cd => + (combineCarrier bd.1 cd.1, bd.2 * cd.2)).map + (fun bcd => (combineCarrier ad.1 bcd.1, ad.2 * bcd.2)) + rw [List.flatMap_flatMap_local] + apply List.flatMap_congr_local + intro ad _ + rw [List.flatMap_map_local, List.map_flatMap_local] + apply List.flatMap_congr_local + intro bd _ + rw [List.map_map] + apply List.map_congr_local + intro cd _ + exact cross_step_assoc ad bd cd + end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index e3311aae860d1..cc1072ab7bae6 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -35,7 +35,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four *cardinality* — `consolidate_length_le` bounds the output by the input length (consolidation only merges, never expands); *no-error preservation* — `consolidate_no_error` proves that if every input diff is a `.val`, every output diff is a `.val`, so `.error` is the only source of absorption. * `Mz/Triple.lean`: `TimedRecord = (row, time, diff)` triple stream tying `DiffWithError` to differential dataflow's record format. Defines `consolidateAll` and `consolidateAt t`; lifts `Consolidate`'s absorption to per-time and stream-wide statements. Per-`(row, time)` bucketing reduces to applying `UnifiedConsolidate` inside each time slice. -* `Mz/Join.lean`: relational joins on the diff-aware `UnifiedStream`. `cross` is the cartesian product — carriers combine via `combineCarrier` (rows concatenate; err on either side wins, left first), diffs multiply through `DiffWithError`'s `Mul` instance. 
A `.error` diff on either input therefore absorbs to `.error` on the output via `DiffWithError.error_mul_{left,right}`. `join pred l r` filters the product through a join predicate. Theorems: `cross_length` (`l.length * r.length`), `filter_length_le` (filter is non-expanding), `join_length_le` (corollary). Diff-propagation theorems: `cross_diff_error_{left,right}` (a `.error` diff on either side propagates through every output record), `filter_preserves_error_diff` (a record carrying `.error` diff is never dropped by `filter` — the absorbing marker cannot be filtered away). +* `Mz/Join.lean`: relational joins on the diff-aware `UnifiedStream`. `cross` is the cartesian product — carriers combine via `combineCarrier` (rows concatenate; err on either side wins, left first), diffs multiply through `DiffWithError`'s `Mul` instance. A `.error` diff on either input therefore absorbs to `.error` on the output via `DiffWithError.error_mul_{left,right}`. `join pred l r` filters the product through a join predicate. Theorems: `cross_length` (`l.length * r.length`), `filter_length_le` (filter is non-expanding), `join_length_le` (corollary). Diff-propagation theorems: `cross_diff_error_{left,right}` (a `.error` diff on either side propagates through every output record), `filter_preserves_error_diff` (a record carrying `.error` diff is never dropped by `filter` — the absorbing marker cannot be filtered away). Algebraic laws: `combineCarrier_assoc` (carrier combine is associative modulo `List.append_assoc`) and the headline `UnifiedStream.cross_assoc` (`(a × b) × c = a × (b × c)`). The proof rearranges nested `flatMap` / `map` via local list-monad lemmas and closes via `DiffWithError.mul_assoc` plus `combineCarrier_assoc`. * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. 
`groupByErrDistinct keyExpr rel` uses the spec-faithful `Datum.groupKeyEq`, which returns `false` whenever either side is `.err`, so every err key produces its own singleton group. @@ -91,7 +91,6 @@ The roadmap in priority order: * `BagStream.project` / `BagStream.filter` commutativity (when the predicate references only un-projected columns). Same multiset-equality caveat on the error collection. * Tie `DiffWithError` to a concrete dataflow operator: model a `(Row, Time, DiffWithError ℤ)` triple stream and prove that an `error` diff at time `t` propagates to every downstream consolidation. * Joins on `BagStream` with explicit error propagation. -* Sketch a proof of `cross_assoc` modulo row concatenation associativity. Left-wins error rule is consistent in both nestings; diff multiplication is associative on the base, so the diff side follows from the semiring laws; the residual obligation is `(la ++ lb) ++ lc = la ++ (lb ++ lc)` lifted into the row carrier. * Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. * Lift to bag semantics for predicate / projection rewrites. From 69df13af443e3742afb16ff83925d4abe37ca8d2 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 11:17:07 +0200 Subject: [PATCH 034/127] doc/semantics: tighten might_error with short-circuit detection Binary `AND` / `OR` now detect literal-absorber short-circuits: * `.and (.lit (.bool false)) _` and `.and _ (.lit (.bool false))` return `false`. The four-valued AND table absorbs to `.bool false` from either position, so a literal-false operand statically rules out an error. * `.or (.lit (.bool true)) _` and `.or _ (.lit (.bool true))` return `false` for the dual reason. Implementation uses two non-recursive head-matchers `Expr.isLitBoolFalse` and `Expr.isLitBoolTrue` and an `if` guard inside the existing mutual block. 
Falls back to the old `a.might_error || b.might_error` rule when neither operand is the absorbing literal. Soundness `might_error_sound` extended with the short-circuit branches. Four new helper lemmas cover the value-level reductions: `evalAnd_{left,right}_false`, `evalOr_{left,right}_true`. The remaining `.coalesce`, `.andN`, `.orN`, and arithmetic ground-truth tightenings stay on the roadmap as additive refinements. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/MightError.lean | 195 +++++++++++++++++---- doc/developer/semantics/README.md | 4 +- 2 files changed, 162 insertions(+), 37 deletions(-) diff --git a/doc/developer/semantics/Mz/MightError.lean b/doc/developer/semantics/Mz/MightError.lean index a1052c700a23e..c42032830dd96 100644 --- a/doc/developer/semantics/Mz/MightError.lean +++ b/doc/developer/semantics/Mz/MightError.lean @@ -73,6 +73,30 @@ theorem evalIfThen_not_err simp only [evalIfThen]; decide | err _ => exact (hc trivial).elim +/-! ### Short-circuit absorbers + +`AND` absorbs to `.bool false` whenever either operand is +`.bool false`, regardless of what the other operand is. Likewise +`OR` absorbs to `.bool true` on either side. These are the +algebraic facts behind the value-level tightening of +`Expr.might_error`. -/ + +theorem evalAnd_left_false (d : Datum) : evalAnd (.bool false) d = .bool false := rfl + +theorem evalAnd_right_false (d : Datum) : evalAnd d (.bool false) = .bool false := by + cases d with + | bool b => cases b <;> rfl + | null => rfl + | err _ => rfl + +theorem evalOr_left_true (d : Datum) : evalOr (.bool true) d = .bool true := rfl + +theorem evalOr_right_true (d : Datum) : evalOr d (.bool true) = .bool true := by + cases d with + | bool b => cases b <;> rfl + | null => rfl + | err _ => rfl + /-- List-level analogue of `evalAnd_not_err`: if every operand is error-free, the variadic AND is error-free. -/ theorem evalAndN_not_err : @@ -97,33 +121,55 @@ theorem evalOrN_not_err : /-! 
## Static analyzer Returns `true` when `e` might evaluate to an `err`. The current -implementation is purely structural and conservative: any literal -`err` taints every ancestor. Columns are assumed not to contain errors -(see `Env.ErrFree`). - -For `andN` and `orN`, the analyzer recurses into the operand list via -`Expr.argsMightError` and returns `true` if any operand might error. -The mutual recursion across `Expr.might_error` and -`Expr.argsMightError` keeps Lean's structural-recursion checker -satisfied without an explicit termination measure. - -`coalesce` is still tainted unconditionally. A precise analyzer -would reason about the rescue rule (`coalesce(err, x) = x` when `x` -is concrete), which requires tracking which operands are statically -*safe* rather than merely *not erroring*. Tightening it is a separate -follow-up. -/ +implementation is structural with two pieces of value-level +tightening on binary `AND` and `OR`: + +* `.and (.lit (.bool false)) _` and `.and _ (.lit (.bool false))` + return `false`. The four-valued AND table has `false` as the + dominant absorber (`false AND error = false` from either side), + so a literal-false operand statically rules out an error. +* `.or (.lit (.bool true)) _` and `.or _ (.lit (.bool true))` + return `false` for the dual reason. + +Any literal `err` taints every ancestor. Columns are assumed not +to contain errors (see `Env.ErrFree`). + +For `andN` and `orN`, the analyzer recurses into the operand list +via `Expr.argsMightError`. For `coalesce`, the analyzer fires only +when *every* operand might error. + +The mutual recursion across `Expr.might_error`, +`Expr.argsMightError`, and `Expr.argsAllMightError` keeps Lean's +structural-recursion checker satisfied without an explicit +termination measure. -/ +/-- Top-of-expression literal-false detector. Non-recursive on +`Expr`; matches only the head constructor. Used by `might_error` +to identify the `false`-absorber position of binary `AND`. 
-/ +@[simp] def Expr.isLitBoolFalse : Expr → Bool + | .lit (.bool false) => true + | _ => false + +/-- Dual: top-of-expression literal-true detector. -/ +@[simp] def Expr.isLitBoolTrue : Expr → Bool + | .lit (.bool true) => true + | _ => false + mutual def Expr.might_error : Expr → Bool - | .lit (.err _) => true - | .lit _ => false - | .col _ => false - | .and a b => a.might_error || b.might_error - | .or a b => a.might_error || b.might_error - | .not a => a.might_error - | .ifThen c t e => c.might_error || t.might_error || e.might_error - | .andN args => Expr.argsMightError args - | .orN args => Expr.argsMightError args - | .coalesce [] => false + | .lit (.err _) => true + | .lit _ => false + | .col _ => false + | .and a b => + if a.isLitBoolFalse || b.isLitBoolFalse then false + else a.might_error || b.might_error + | .or a b => + if a.isLitBoolTrue || b.isLitBoolTrue then false + else a.might_error || b.might_error + | .not a => a.might_error + | .ifThen c t e => c.might_error || t.might_error || e.might_error + | .andN args => Expr.argsMightError args + | .orN args => Expr.argsMightError args + | .coalesce [] => false | .coalesce (a :: rest) => a.might_error && Expr.argsAllMightError rest /-- Bool fold of `might_error` over a list of operands ("does any @@ -317,19 +363,98 @@ theorem might_error_sound : | .and a b, env, hMe, hEnv => by intro hRes simp only [eval] at hRes - have ha : ¬(a.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) - have hb : ¬(b.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) - exact evalAnd_not_err - (might_error_sound a env ha hEnv) - (might_error_sound b env hb hEnv) hRes + cases hA : a.isLitBoolFalse with + | true => + -- a = .lit (.bool false): evalAnd .bool false _ = .bool false + have hEq : a = .lit (.bool false) := by + cases a with + | lit d => + cases d with + | bool b' => cases b' with + | false => rfl + | true => simp [Expr.isLitBoolFalse] at hA + | _ => simp [Expr.isLitBoolFalse] at hA + | 
_ => simp [Expr.isLitBoolFalse] at hA + rw [hEq] at hRes + simp only [eval, evalAnd_left_false] at hRes + cases hRes + | false => + cases hB : b.isLitBoolFalse with + | true => + -- b = .lit (.bool false): evalAnd _ .bool false = .bool false + have hEq : b = .lit (.bool false) := by + cases b with + | lit d => + cases d with + | bool b' => cases b' with + | false => rfl + | true => simp [Expr.isLitBoolFalse] at hB + | _ => simp [Expr.isLitBoolFalse] at hB + | _ => simp [Expr.isLitBoolFalse] at hB + rw [hEq] at hRes + simp only [eval, evalAnd_right_false] at hRes + cases hRes + | false => + -- Non-short-circuit. Fall through to recursive check. + have hMeReduce : + Expr.might_error (.and a b) = (a.might_error || b.might_error) := by + show (if a.isLitBoolFalse || b.isLitBoolFalse + then false + else a.might_error || b.might_error) + = (a.might_error || b.might_error) + rw [hA, hB]; rfl + rw [hMeReduce] at hMe + have ha : ¬(a.might_error = true) := fun h => hMe (by simp [h]) + have hb : ¬(b.might_error = true) := fun h => hMe (by simp [h]) + exact evalAnd_not_err + (might_error_sound a env ha hEnv) + (might_error_sound b env hb hEnv) hRes | .or a b, env, hMe, hEnv => by intro hRes simp only [eval] at hRes - have ha : ¬(a.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) - have hb : ¬(b.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) - exact evalOr_not_err - (might_error_sound a env ha hEnv) - (might_error_sound b env hb hEnv) hRes + cases hA : a.isLitBoolTrue with + | true => + have hEq : a = .lit (.bool true) := by + cases a with + | lit d => + cases d with + | bool b' => cases b' with + | true => rfl + | false => simp [Expr.isLitBoolTrue] at hA + | _ => simp [Expr.isLitBoolTrue] at hA + | _ => simp [Expr.isLitBoolTrue] at hA + rw [hEq] at hRes + simp only [eval, evalOr_left_true] at hRes + cases hRes + | false => + cases hB : b.isLitBoolTrue with + | true => + have hEq : b = .lit (.bool true) := by + cases b with + | lit d => + 
cases d with + | bool b' => cases b' with + | true => rfl + | false => simp [Expr.isLitBoolTrue] at hB + | _ => simp [Expr.isLitBoolTrue] at hB + | _ => simp [Expr.isLitBoolTrue] at hB + rw [hEq] at hRes + simp only [eval, evalOr_right_true] at hRes + cases hRes + | false => + have hMeReduce : + Expr.might_error (.or a b) = (a.might_error || b.might_error) := by + show (if a.isLitBoolTrue || b.isLitBoolTrue + then false + else a.might_error || b.might_error) + = (a.might_error || b.might_error) + rw [hA, hB]; rfl + rw [hMeReduce] at hMe + have ha : ¬(a.might_error = true) := fun h => hMe (by simp [h]) + have hb : ¬(b.might_error = true) := fun h => hMe (by simp [h]) + exact evalOr_not_err + (might_error_sound a env ha hEnv) + (might_error_sound b env hb hEnv) hRes | .not a, env, hMe, hEnv => by intro hRes simp only [eval] at hRes diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index cc1072ab7bae6..9508180d9a820 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -13,7 +13,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/PrimEval.lean`: primitive evaluators on `Datum` and `List Datum` — `evalAnd`, `evalOr`, `evalNot`, `evalIfThen`, `Env`, `Env.get`, `evalAndN`, `evalOrN`, `evalCoalesce`. Split out so the algebraic-law files and the expression-level evaluator can both import them without circular dependencies. * `Mz/Eval.lean`: the big-step `eval : Env → Expr → Datum`. List-carrying constructors evaluate each operand and hand the result list to the matching primitive. * `Mz/Boolean.lean`: per-cell truth-table proofs for `AND`, `OR`, and `NOT`, plus involutivity of `NOT`. -* `Mz/MightError.lean`: the `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem. 
`andN` and `orN` recurse via `Expr.argsMightError` ("any operand might error"); `coalesce` recurses via `Expr.argsAllMightError` ("every operand might error"), special-casing the empty list as safe. Soundness for `coalesce` extracts a statically-safe operand through `Expr.exists_safe_of_not_argsAllMightError` and applies `evalCoalesce_not_err_of_some_safe`, which in turn rests on `Coalesce.go_not_err` — the state-machine lemma that "once one safe operand is in the remaining list, the walk cannot return an error". +* `Mz/MightError.lean`: the `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem. Binary `AND` / `OR` short-circuit on literal-`.bool false` / literal-`.bool true` operands via `Expr.isLitBoolFalse` / `Expr.isLitBoolTrue`: either position being the absorbing literal makes the analyzer return `false` regardless of the other operand. `andN` and `orN` recurse via `Expr.argsMightError` ("any operand might error"); `coalesce` recurses via `Expr.argsAllMightError` ("every operand might error"), special-casing the empty list as safe. Soundness for `coalesce` extracts a statically-safe operand through `Expr.exists_safe_of_not_argsAllMightError` and applies `evalCoalesce_not_err_of_some_safe`, which in turn rests on `Coalesce.go_not_err` — the state-machine lemma that "once one safe operand is in the remaining list, the walk cannot return an error". Companion value-level helpers `evalAnd_{left,right}_false` / `evalOr_{left,right}_true` discharge the short-circuit branches of soundness. * `Mz/Strict.lean`: strictness predicates (`ErrStrictUnary`, `ErrStrictBinary`, `NullStrictUnary`), positive instances for `evalNot` and the condition slot of `evalIfThen`, closure under composition, and negative results witnessing that `AND` and `OR` are *not* err-strict in either position. * `Mz/Coalesce.lean`: laws for `evalCoalesce` — error-rescue, null-beats-err tiebreak, first-error stickiness. 
* `Mz/Laws.lean`: algebraic laws — two-sided identity (`TRUE` for `AND`, `FALSE` for `OR`), idempotence (unconditional), commutativity (conditional on error-freedom of operands), and `Expr`-level reorder safety as a corollary of soundness. @@ -91,7 +91,7 @@ The roadmap in priority order: * `BagStream.project` / `BagStream.filter` commutativity (when the predicate references only un-projected columns). Same multiset-equality caveat on the error collection. * Tie `DiffWithError` to a concrete dataflow operator: model a `(Row, Time, DiffWithError ℤ)` triple stream and prove that an `error` diff at time `t` propagates to every downstream consolidation. * Joins on `BagStream` with explicit error propagation. -* Tightening `Expr.might_error`. The skeleton version is purely structural and ignores type / nullability information; bringing it closer to `MirScalarExpr::might_error` is additive. +* Further tightening of `Expr.might_error`. Short-circuit detection is in place for binary `AND` / `OR` against literal absorbers; ground-truth lookups (literal arithmetic, known-null operands, type guards) are the natural next refinements and remain additive against the current soundness proof. * Lift to bag semantics for predicate / projection rewrites. The diff-semiring extension is now in scope: `UnifiedStream` records carry a `DiffWithError Int` and operators preserve / multiply / absorb it as appropriate. From 60a254d1dd65b6f4dc62aa5dffc3cbef72a85176 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 11:21:00 +0200 Subject: [PATCH 035/127] doc/semantics: tighten might_error for IfThen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the condition of `.ifThen c t e` is a literal `.bool`, only the chosen branch's `might_error` is consulted. 
The discarded branch may statically err without tainting the result, since `evalIfThen` selects one of the two pre-evaluated branches and discards the other: * `c = .lit (.bool true)` ⇒ result = `eval env t`, so the analyzer returns `t.might_error`. * `c = .lit (.bool false)` ⇒ result = `eval env e`, so the analyzer returns `e.might_error`. * Otherwise: fall back to the three-way disjunction. The existing helpers `Expr.isLitBoolTrue` / `isLitBoolFalse` from the AND / OR commit do double duty here. Soundness `might_error_sound` extended with the two short-circuit branches via the trivial reductions `evalIfThen .bool true dt _ = dt` and `evalIfThen .bool false _ de = de`. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/MightError.lean | 81 +++++++++++++++++++--- doc/developer/semantics/README.md | 2 +- 2 files changed, 74 insertions(+), 9 deletions(-) diff --git a/doc/developer/semantics/Mz/MightError.lean b/doc/developer/semantics/Mz/MightError.lean index c42032830dd96..32fe8da2d0fa6 100644 --- a/doc/developer/semantics/Mz/MightError.lean +++ b/doc/developer/semantics/Mz/MightError.lean @@ -166,7 +166,10 @@ def Expr.might_error : Expr → Bool if a.isLitBoolTrue || b.isLitBoolTrue then false else a.might_error || b.might_error | .not a => a.might_error - | .ifThen c t e => c.might_error || t.might_error || e.might_error + | .ifThen c t e => + if c.isLitBoolTrue then t.might_error + else if c.isLitBoolFalse then e.might_error + else c.might_error || t.might_error || e.might_error | .andN args => Expr.argsMightError args | .orN args => Expr.argsMightError args | .coalesce [] => false @@ -463,13 +466,75 @@ theorem might_error_sound : | .ifThen c t e, env, hMe, hEnv => by intro hRes simp only [eval] at hRes - have hc : ¬(c.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) - have ht : ¬(t.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) - have he : ¬(e.might_error = true) := fun h => hMe (by simp 
[Expr.might_error, h]) - exact evalIfThen_not_err - (might_error_sound c env hc hEnv) - (might_error_sound t env ht hEnv) - (might_error_sound e env he hEnv) hRes + cases hCT : c.isLitBoolTrue with + | true => + -- c = .lit (.bool true): evalIfThen .bool true dt de = dt + have hEq : c = .lit (.bool true) := by + cases c with + | lit d => + cases d with + | bool b' => cases b' with + | true => rfl + | false => simp [Expr.isLitBoolTrue] at hCT + | _ => simp [Expr.isLitBoolTrue] at hCT + | _ => simp [Expr.isLitBoolTrue] at hCT + rw [hEq] at hRes + simp only [eval] at hRes + -- hRes : (evalIfThen .bool true (eval env t) (eval env e)).IsErr + -- evalIfThen .bool true dt _ = dt by definition + have h_reduce : evalIfThen (.bool true) (eval env t) (eval env e) = eval env t := rfl + rw [h_reduce] at hRes + have ht : ¬(t.might_error = true) := fun h => hMe (by + rw [hEq] + show (if (Expr.lit (.bool true)).isLitBoolTrue then t.might_error + else if (Expr.lit (.bool true)).isLitBoolFalse then e.might_error + else (Expr.lit (.bool true)).might_error || t.might_error || e.might_error) + = true + simp [Expr.isLitBoolTrue, h]) + exact might_error_sound t env ht hEnv hRes + | false => + cases hCF : c.isLitBoolFalse with + | true => + have hEq : c = .lit (.bool false) := by + cases c with + | lit d => + cases d with + | bool b' => cases b' with + | false => rfl + | true => simp [Expr.isLitBoolFalse] at hCF + | _ => simp [Expr.isLitBoolFalse] at hCF + | _ => simp [Expr.isLitBoolFalse] at hCF + rw [hEq] at hRes + simp only [eval] at hRes + have h_reduce : evalIfThen (.bool false) (eval env t) (eval env e) = eval env e := rfl + rw [h_reduce] at hRes + have he : ¬(e.might_error = true) := fun h => hMe (by + rw [hEq] + show (if (Expr.lit (.bool false)).isLitBoolTrue then t.might_error + else if (Expr.lit (.bool false)).isLitBoolFalse then e.might_error + else (Expr.lit (.bool false)).might_error || t.might_error || e.might_error) + = true + simp [Expr.isLitBoolTrue, Expr.isLitBoolFalse, 
h]) + exact might_error_sound e env he hEnv hRes + | false => + -- Non-short-circuit branch + have hMeReduce : + Expr.might_error (.ifThen c t e) + = (c.might_error || t.might_error || e.might_error) := by + show (if c.isLitBoolTrue then t.might_error + else if c.isLitBoolFalse then e.might_error + else c.might_error || t.might_error || e.might_error) + = (c.might_error || t.might_error || e.might_error) + rw [hCT, hCF] + rfl + rw [hMeReduce] at hMe + have hc : ¬(c.might_error = true) := fun h => hMe (by simp [h]) + have ht : ¬(t.might_error = true) := fun h => hMe (by simp [h]) + have he : ¬(e.might_error = true) := fun h => hMe (by simp [h]) + exact evalIfThen_not_err + (might_error_sound c env hc hEnv) + (might_error_sound t env ht hEnv) + (might_error_sound e env he hEnv) hRes | .andN args, env, hMe, hEnv => by intro hRes simp only [eval] at hRes diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 9508180d9a820..c0d04591d32ce 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -13,7 +13,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/PrimEval.lean`: primitive evaluators on `Datum` and `List Datum` — `evalAnd`, `evalOr`, `evalNot`, `evalIfThen`, `Env`, `Env.get`, `evalAndN`, `evalOrN`, `evalCoalesce`. Split out so the algebraic-law files and the expression-level evaluator can both import them without circular dependencies. * `Mz/Eval.lean`: the big-step `eval : Env → Expr → Datum`. List-carrying constructors evaluate each operand and hand the result list to the matching primitive. * `Mz/Boolean.lean`: per-cell truth-table proofs for `AND`, `OR`, and `NOT`, plus involutivity of `NOT`. -* `Mz/MightError.lean`: the `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem. 
Binary `AND` / `OR` short-circuit on literal-`.bool false` / literal-`.bool true` operands via `Expr.isLitBoolFalse` / `Expr.isLitBoolTrue`: either position being the absorbing literal makes the analyzer return `false` regardless of the other operand. `andN` and `orN` recurse via `Expr.argsMightError` ("any operand might error"); `coalesce` recurses via `Expr.argsAllMightError` ("every operand might error"), special-casing the empty list as safe. Soundness for `coalesce` extracts a statically-safe operand through `Expr.exists_safe_of_not_argsAllMightError` and applies `evalCoalesce_not_err_of_some_safe`, which in turn rests on `Coalesce.go_not_err` — the state-machine lemma that "once one safe operand is in the remaining list, the walk cannot return an error". Companion value-level helpers `evalAnd_{left,right}_false` / `evalOr_{left,right}_true` discharge the short-circuit branches of soundness. +* `Mz/MightError.lean`: the `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem. Binary `AND` / `OR` short-circuit on literal-`.bool false` / literal-`.bool true` operands via `Expr.isLitBoolFalse` / `Expr.isLitBoolTrue`: either position being the absorbing literal makes the analyzer return `false` regardless of the other operand. `IfThen` likewise short-circuits when the condition is a literal `.bool` — only the picked branch's analyzer result is consulted, so a known-erring branch on the discarded side cannot taint the result. `andN` and `orN` recurse via `Expr.argsMightError` ("any operand might error"); `coalesce` recurses via `Expr.argsAllMightError` ("every operand might error"), special-casing the empty list as safe. 
Soundness for `coalesce` extracts a statically-safe operand through `Expr.exists_safe_of_not_argsAllMightError` and applies `evalCoalesce_not_err_of_some_safe`, which in turn rests on `Coalesce.go_not_err` — the state-machine lemma that "once one safe operand is in the remaining list, the walk cannot return an error". Companion value-level helpers `evalAnd_{left,right}_false` / `evalOr_{left,right}_true` discharge the short-circuit branches of soundness. * `Mz/Strict.lean`: strictness predicates (`ErrStrictUnary`, `ErrStrictBinary`, `NullStrictUnary`), positive instances for `evalNot` and the condition slot of `evalIfThen`, closure under composition, and negative results witnessing that `AND` and `OR` are *not* err-strict in either position. * `Mz/Coalesce.lean`: laws for `evalCoalesce` — error-rescue, null-beats-err tiebreak, first-error stickiness. * `Mz/Laws.lean`: algebraic laws — two-sided identity (`TRUE` for `AND`, `FALSE` for `OR`), idempotence (unconditional), commutativity (conditional on error-freedom of operands), and `Expr`-level reorder safety as a corollary of soundness. From 5d122f3c5b5b122c8772404b38e53444a8cbdd76 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 11:30:01 +0200 Subject: [PATCH 036/127] doc/semantics: extend short-circuit detection to andN / orN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `Expr.might_error (.andN args)` now returns `false` whenever any operand is `.lit (.bool false)`. Dually, `.orN args` returns `false` whenever any operand is `.lit (.bool true)`. The absorbing literal forces the variadic evaluator to short-circuit regardless of the other operands' eval results. Shared helpers extracted alongside the AND / OR commit: * `Expr.eq_of_isLitBoolFalse`: characterizes `isLitBoolFalse e = true` as `e = .lit (.bool false)`. * `Expr.eq_of_isLitBoolTrue`: dual. 
Soundness `might_error_sound` extended with two new branches that locate the absorbing literal via `List.any_eq_true`, lift to `Datum.bool _ ∈ args.map (eval env)`, and discharge via the inline absorption lemmas `evalAndN_false_mem_eq` / `evalOrN_true_mem_eq`. The absorption facts are duplicated here to avoid an import cycle through `Mz/Variadic.lean → Mz/Laws.lean → Mz/MightError.lean`; the equivalent `evalAndN_false_absorbs` / `evalOrN_true_absorbs` continue to live in `Mz/Variadic.lean`. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/MightError.lean | 146 ++++++++++++++++++--- doc/developer/semantics/README.md | 2 +- 2 files changed, 128 insertions(+), 20 deletions(-) diff --git a/doc/developer/semantics/Mz/MightError.lean b/doc/developer/semantics/Mz/MightError.lean index 32fe8da2d0fa6..7bf03680b4cab 100644 --- a/doc/developer/semantics/Mz/MightError.lean +++ b/doc/developer/semantics/Mz/MightError.lean @@ -97,6 +97,43 @@ theorem evalOr_right_true (d : Datum) : evalOr d (.bool true) = .bool true := by | null => rfl | err _ => rfl +/-- Variadic `AND` absorbs to `.bool false` as soon as any operand +is `.bool false`. Stated inline to keep `MightError.lean` +self-contained (the same fact reappears in `Mz/Variadic.lean` +under the name `evalAndN_false_absorbs`, but importing that +module creates a cycle with `Mz/Laws.lean`). -/ +private theorem evalAndN_false_mem_eq + {ds : List Datum} (h : .bool false ∈ ds) : + evalAndN ds = .bool false := by + induction ds with + | nil => exact absurd h List.not_mem_nil + | cons hd tl ih => + rcases List.mem_cons.mp h with hHead | hTail + · subst hHead + show evalAnd (.bool false) (evalAndN tl) = .bool false + rfl + · have hTl := ih hTail + show evalAnd hd (evalAndN tl) = .bool false + rw [hTl] + exact evalAnd_right_false hd + +/-- Dual: variadic `OR` absorbs to `.bool true` as soon as any +operand is `.bool true`. 
-/ +private theorem evalOrN_true_mem_eq + {ds : List Datum} (h : .bool true ∈ ds) : + evalOrN ds = .bool true := by + induction ds with + | nil => exact absurd h List.not_mem_nil + | cons hd tl ih => + rcases List.mem_cons.mp h with hHead | hTail + · subst hHead + show evalOr (.bool true) (evalOrN tl) = .bool true + rfl + · have hTl := ih hTail + show evalOr hd (evalOrN tl) = .bool true + rw [hTl] + exact evalOr_right_true hd + /-- List-level analogue of `evalAnd_not_err`: if every operand is error-free, the variadic AND is error-free. -/ theorem evalAndN_not_err : @@ -144,7 +181,8 @@ structural-recursion checker satisfied without an explicit termination measure. -/ /-- Top-of-expression literal-false detector. Non-recursive on `Expr`; matches only the head constructor. Used by `might_error` -to identify the `false`-absorber position of binary `AND`. -/ +to identify the `false`-absorber position of binary / variadic +`AND` and the `false` branch of `IfThen`. -/ @[simp] def Expr.isLitBoolFalse : Expr → Bool | .lit (.bool false) => true | _ => false @@ -154,6 +192,31 @@ to identify the `false`-absorber position of binary `AND`. -/ | .lit (.bool true) => true | _ => false +/-- `isLitBoolFalse e = true` exactly characterizes +`e = .lit (.bool false)`. -/ +theorem Expr.eq_of_isLitBoolFalse {e : Expr} + (h : e.isLitBoolFalse = true) : e = .lit (.bool false) := by + cases e with + | lit d => + cases d with + | bool b => cases b with + | false => rfl + | true => simp [Expr.isLitBoolFalse] at h + | _ => simp [Expr.isLitBoolFalse] at h + | _ => simp [Expr.isLitBoolFalse] at h + +/-- Dual characterization. 
-/ +theorem Expr.eq_of_isLitBoolTrue {e : Expr} + (h : e.isLitBoolTrue = true) : e = .lit (.bool true) := by + cases e with + | lit d => + cases d with + | bool b => cases b with + | true => rfl + | false => simp [Expr.isLitBoolTrue] at h + | _ => simp [Expr.isLitBoolTrue] at h + | _ => simp [Expr.isLitBoolTrue] at h + mutual def Expr.might_error : Expr → Bool | .lit (.err _) => true @@ -170,8 +233,12 @@ def Expr.might_error : Expr → Bool if c.isLitBoolTrue then t.might_error else if c.isLitBoolFalse then e.might_error else c.might_error || t.might_error || e.might_error - | .andN args => Expr.argsMightError args - | .orN args => Expr.argsMightError args + | .andN args => + if args.any (·.isLitBoolFalse) then false + else Expr.argsMightError args + | .orN args => + if args.any (·.isLitBoolTrue) then false + else Expr.argsMightError args | .coalesce [] => false | .coalesce (a :: rest) => a.might_error && Expr.argsAllMightError rest @@ -537,24 +604,65 @@ theorem might_error_sound : (might_error_sound e env he hEnv) hRes | .andN args, env, hMe, hEnv => by intro hRes - simp only [eval] at hRes - apply evalAndN_not_err (ds := args.map (eval env)) ?_ hRes - intro d hd_mem - obtain ⟨e, e_mem, h_eq⟩ := List.mem_map.mp hd_mem - subst h_eq - have he : ¬(e.might_error = true) := fun h => hMe - (Expr.argsMightError_of_mem e_mem h) - exact might_error_sound e env he hEnv + cases hAny : args.any (·.isLitBoolFalse) with + | true => + -- Some operand is `.lit (.bool false)`. evalAndN absorbs to `.bool false`. 
+ obtain ⟨e, he_mem, he_lit⟩ := List.any_eq_true.mp hAny + have hEq : e = .lit (.bool false) := Expr.eq_of_isLitBoolFalse he_lit + rw [hEq] at he_mem + have hEvalLit : eval env (.lit (.bool false)) = .bool false := by + simp only [eval] + have hMapMem : (Datum.bool false) ∈ args.map (eval env) := + List.mem_map.mpr ⟨.lit (.bool false), he_mem, hEvalLit⟩ + simp only [eval] at hRes + rw [evalAndN_false_mem_eq hMapMem] at hRes + cases hRes + | false => + have hMeReduce : + Expr.might_error (.andN args) = Expr.argsMightError args := by + show (if args.any (·.isLitBoolFalse) then false + else Expr.argsMightError args) + = Expr.argsMightError args + rw [hAny]; rfl + rw [hMeReduce] at hMe + simp only [eval] at hRes + apply evalAndN_not_err (ds := args.map (eval env)) ?_ hRes + intro d hd_mem + obtain ⟨e, e_mem, h_eq⟩ := List.mem_map.mp hd_mem + subst h_eq + have he : ¬(e.might_error = true) := fun h => hMe + (Expr.argsMightError_of_mem e_mem h) + exact might_error_sound e env he hEnv | .orN args, env, hMe, hEnv => by intro hRes - simp only [eval] at hRes - apply evalOrN_not_err (ds := args.map (eval env)) ?_ hRes - intro d hd_mem - obtain ⟨e, e_mem, h_eq⟩ := List.mem_map.mp hd_mem - subst h_eq - have he : ¬(e.might_error = true) := fun h => hMe - (Expr.argsMightError_of_mem e_mem h) - exact might_error_sound e env he hEnv + cases hAny : args.any (·.isLitBoolTrue) with + | true => + obtain ⟨e, he_mem, he_lit⟩ := List.any_eq_true.mp hAny + have hEq : e = .lit (.bool true) := Expr.eq_of_isLitBoolTrue he_lit + rw [hEq] at he_mem + have hEvalLit : eval env (.lit (.bool true)) = .bool true := by + simp only [eval] + have hMapMem : (Datum.bool true) ∈ args.map (eval env) := + List.mem_map.mpr ⟨.lit (.bool true), he_mem, hEvalLit⟩ + simp only [eval] at hRes + rw [evalOrN_true_mem_eq hMapMem] at hRes + cases hRes + | false => + have hMeReduce : + Expr.might_error (.orN args) = Expr.argsMightError args := by + show (if args.any (·.isLitBoolTrue) then false + else 
Expr.argsMightError args) + = Expr.argsMightError args + rw [hAny]; rfl + rw [hMeReduce] at hMe + simp only [eval] at hRes + apply evalOrN_not_err (ds := args.map (eval env)) ?_ hRes + intro d hd_mem + obtain ⟨e, e_mem, h_eq⟩ := List.mem_map.mp hd_mem + subst h_eq + have he : ¬(e.might_error = true) := fun h => hMe + (Expr.argsMightError_of_mem e_mem h) + exact might_error_sound e env he hEnv | .coalesce args, env, hMe, hEnv => by intro hRes simp only [eval] at hRes diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index c0d04591d32ce..cb873b400a414 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -13,7 +13,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/PrimEval.lean`: primitive evaluators on `Datum` and `List Datum` — `evalAnd`, `evalOr`, `evalNot`, `evalIfThen`, `Env`, `Env.get`, `evalAndN`, `evalOrN`, `evalCoalesce`. Split out so the algebraic-law files and the expression-level evaluator can both import them without circular dependencies. * `Mz/Eval.lean`: the big-step `eval : Env → Expr → Datum`. List-carrying constructors evaluate each operand and hand the result list to the matching primitive. * `Mz/Boolean.lean`: per-cell truth-table proofs for `AND`, `OR`, and `NOT`, plus involutivity of `NOT`. -* `Mz/MightError.lean`: the `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem. Binary `AND` / `OR` short-circuit on literal-`.bool false` / literal-`.bool true` operands via `Expr.isLitBoolFalse` / `Expr.isLitBoolTrue`: either position being the absorbing literal makes the analyzer return `false` regardless of the other operand. `IfThen` likewise short-circuits when the condition is a literal `.bool` — only the picked branch's analyzer result is consulted, so a known-erring branch on the discarded side cannot taint the result. 
`andN` and `orN` recurse via `Expr.argsMightError` ("any operand might error"); `coalesce` recurses via `Expr.argsAllMightError` ("every operand might error"), special-casing the empty list as safe. Soundness for `coalesce` extracts a statically-safe operand through `Expr.exists_safe_of_not_argsAllMightError` and applies `evalCoalesce_not_err_of_some_safe`, which in turn rests on `Coalesce.go_not_err` — the state-machine lemma that "once one safe operand is in the remaining list, the walk cannot return an error". Companion value-level helpers `evalAnd_{left,right}_false` / `evalOr_{left,right}_true` discharge the short-circuit branches of soundness. +* `Mz/MightError.lean`: the `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem. Binary `AND` / `OR` short-circuit on literal-`.bool false` / literal-`.bool true` operands via `Expr.isLitBoolFalse` / `Expr.isLitBoolTrue`: either position being the absorbing literal makes the analyzer return `false` regardless of the other operand. The same short-circuit fires on variadic `andN` / `orN` when any operand is the absorbing literal. `IfThen` likewise short-circuits when the condition is a literal `.bool` — only the picked branch's analyzer result is consulted, so a known-erring branch on the discarded side cannot taint the result. `andN` and `orN` recurse via `Expr.argsMightError` ("any operand might error"); `coalesce` recurses via `Expr.argsAllMightError` ("every operand might error"), special-casing the empty list as safe. Soundness for `coalesce` extracts a statically-safe operand through `Expr.exists_safe_of_not_argsAllMightError` and applies `evalCoalesce_not_err_of_some_safe`, which in turn rests on `Coalesce.go_not_err` — the state-machine lemma that "once one safe operand is in the remaining list, the walk cannot return an error". Companion value-level helpers `evalAnd_{left,right}_false` / `evalOr_{left,right}_true` discharge the short-circuit branches of soundness. 
* `Mz/Strict.lean`: strictness predicates (`ErrStrictUnary`, `ErrStrictBinary`, `NullStrictUnary`), positive instances for `evalNot` and the condition slot of `evalIfThen`, closure under composition, and negative results witnessing that `AND` and `OR` are *not* err-strict in either position. * `Mz/Coalesce.lean`: laws for `evalCoalesce` — error-rescue, null-beats-err tiebreak, first-error stickiness. * `Mz/Laws.lean`: algebraic laws — two-sided identity (`TRUE` for `AND`, `FALSE` for `OR`), idempotence (unconditional), commutativity (conditional on error-freedom of operands), and `Expr`-level reorder safety as a corollary of soundness. From 3cf40df7b178c606063e61ff8fdd19eae9b3619e Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 11:31:59 +0200 Subject: [PATCH 037/127] doc/semantics: BagStream.filter commutativity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two new theorems: * `BagStream.filter_comm_data` (unconditional): data-side commutativity, reducing immediately through `filterRel_comm` on the data field. The two filter orderings produce the same set of surviving rows. * `BagStream.filter_comm_no_err` (preconditioned): full stream commutativity when neither predicate errs on any input row. Under no-error preconditions both `errorRows` collections are empty, so the appended errors collapse to the input `errors` field in both orderings. Helper `errorRows_eq_nil_of_no_err` discharges the "empty error collection" step. Unconditional error-side commutativity is left out: even multiset (permutation) equality fails when both predicates err on the same row, since the two orderings record different err payloads for `(p err, q err)` rows — only the first-firing filter sees that row alive. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/ErrStream.lean | 74 +++++++++++++++++++++++ doc/developer/semantics/README.md | 3 +- 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/doc/developer/semantics/Mz/ErrStream.lean b/doc/developer/semantics/Mz/ErrStream.lean index 2ec3257a9d07d..f3e216df38e68 100644 --- a/doc/developer/semantics/Mz/ErrStream.lean +++ b/doc/developer/semantics/Mz/ErrStream.lean @@ -116,6 +116,80 @@ theorem BagStream.filter_idem (pred : Expr) (s : BagStream) : BagStream.filter pred s := by simp [BagStream.filter, filterRel_idem, errorRows_filterRel] +/-! ## Filter commutativity + +The data side of `BagStream.filter` commutes unconditionally via +`filterRel_comm`. The error side does not commute as a list +equality — the two orderings collect errors in different +positions and, when both predicates error on the same row, each +ordering records only the error of the filter that runs first. + +Full structural commutativity therefore requires a no-error +precondition: when neither predicate errs on the input data, +both errorRows collections are empty, the appended errors reduce +to the input `errors` field, and the streams agree. -/ + +theorem BagStream.filter_comm_data (p q : Expr) (s : BagStream) : + (BagStream.filter p (BagStream.filter q s)).data + = (BagStream.filter q (BagStream.filter p s)).data := by + simp [BagStream.filter, filterRel_comm] + +/-- `errorRows` of a relation on which the predicate never errors +is empty. 
-/ +theorem errorRows_eq_nil_of_no_err + (pred : Expr) (rel : Relation) + (h : ∀ row ∈ rel, ¬(eval row pred).IsErr) : + errorRows pred rel = [] := by + induction rel with + | nil => rfl + | cons hd tl ih => + have hHd : ¬(eval hd pred).IsErr := h hd List.mem_cons_self + have hTl : ∀ row ∈ tl, ¬(eval row pred).IsErr := + fun row hMem => h row (List.mem_cons_of_mem _ hMem) + show (match eval hd pred with + | .err e => e :: errorRows pred tl + | _ => errorRows pred tl) = [] + cases h_eval : eval hd pred with + | bool _ => exact ih hTl + | null => exact ih hTl + | err _ => + rw [h_eval] at hHd + exact absurd (show True by trivial) hHd + +/-- Full commutativity of `BagStream.filter` under a no-error +precondition: when neither predicate errs on any row of the input +data, the two filter orderings produce the same stream. -/ +theorem BagStream.filter_comm_no_err + (p q : Expr) (s : BagStream) + (hP : ∀ row ∈ s.data, ¬(eval row p).IsErr) + (hQ : ∀ row ∈ s.data, ¬(eval row q).IsErr) : + BagStream.filter p (BagStream.filter q s) + = BagStream.filter q (BagStream.filter p s) := by + apply BagStream.ext + · exact BagStream.filter_comm_data p q s + · show s.errors ++ errorRows q s.data ++ errorRows p (filterRel q s.data) + = s.errors ++ errorRows p s.data ++ errorRows q (filterRel p s.data) + have hQEmpty : errorRows q s.data = [] := + errorRows_eq_nil_of_no_err q s.data hQ + have hPEmpty : errorRows p s.data = [] := + errorRows_eq_nil_of_no_err p s.data hP + -- A row surviving `filterRel q s.data` is a row in `s.data` with q true, + -- so the no-err precondition still applies; `errorRows p (filterRel q ...)` + -- is therefore also empty. 
+ have hPOnQ : ∀ row ∈ filterRel q s.data, ¬(eval row p).IsErr := by + intro row hMem + unfold filterRel at hMem + exact hP row (List.mem_filter.mp hMem).1 + have hQOnP : ∀ row ∈ filterRel p s.data, ¬(eval row q).IsErr := by + intro row hMem + unfold filterRel at hMem + exact hQ row (List.mem_filter.mp hMem).1 + have hPFiltered : errorRows p (filterRel q s.data) = [] := + errorRows_eq_nil_of_no_err p (filterRel q s.data) hPOnQ + have hQFiltered : errorRows q (filterRel p s.data) = [] := + errorRows_eq_nil_of_no_err q (filterRel p s.data) hQOnP + rw [hQEmpty, hPEmpty, hPFiltered, hQFiltered] + /-! ## Project -/ /-- Boolean check: every projected scalar succeeds on this row diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index cb873b400a414..a299b34d05a3f 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -22,6 +22,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Bag.lean`: bag semantics on `List Row`. Defines `filterRel` and `project`, with filter idempotence, filter commutativity, projection length-preservation, and the empty-projection equation. Plain `filterRel` silently drops `err` rows; `Mz/ErrStream.lean` adds the explicit data/error stream pair. * `Mz/ErrStream.lean`: the dataflow-style `BagStream = (data, errors)` pair. `BagStream.filter` routes erroring rows into the error collection instead of dropping them, with idempotence proved at both the data and the error level. + `BagStream.filter_comm_data` proves the data-side commutativity unconditionally; `BagStream.filter_comm_no_err` strengthens to full stream equality when neither predicate errors on the input data. `BagStream.project` projects each row through a list of scalars; a row stays in the data collection only when every scalar succeeds, otherwise its err payloads (one per erroring scalar) are appended to the error collection. 
`rowErrs_nil_of_all_safe` and `projectErrs_eq_nil_of_all_safe` show that when no projection errs, `BagStream.project` does not extend the error collection. * `Mz/Pushdown.lean`: substitution (`Expr.subst`) plus the headline `eval_subst` theorem (substituting then evaluating against the original row equals evaluating against the projected row), and the relational predicate-pushdown rewrite `filterRel p (project es rel) = project es (filterRel (p.subst es) rel)`. @@ -87,7 +88,7 @@ Reviewers should expect both sides of the change in the same PR. The roadmap in priority order: -* `BagStream.filter` commutativity. Data field commutes by `filterRel_comm`; the error field requires a notion of multiset equality on `List EvalError` since list-order differs across permutations. +* Full `BagStream.filter` commutativity (no preconditions). The error field requires a notion of multiset equality on `List EvalError` since list-order differs across permutations, and even multiset equality fails when both predicates err on the same row — `(p err, q err)` rows record only one error in either ordering, but different ones depending on which filter runs first. * `BagStream.project` / `BagStream.filter` commutativity (when the predicate references only un-projected columns). Same multiset-equality caveat on the error collection. * Tie `DiffWithError` to a concrete dataflow operator: model a `(Row, Time, DiffWithError ℤ)` triple stream and prove that an `error` diff at time `t` propagates to every downstream consolidation. * Joins on `BagStream` with explicit error propagation. 
From 63b36b4a90becd070154f5672ede8f086dc76544 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 11:37:20 +0200 Subject: [PATCH 038/127] doc/semantics: BagStream project / filter data-side pushdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `BagStream.project_filter_pushdown_data` lifts the relational predicate pushdown from `Mz/Pushdown.lean` to the data side of `BagStream`: (BagStream.filter p (BagStream.project es s)).data = (BagStream.project es (BagStream.filter (p.subst es) s)).data Proof chains three steps: * Apply `filterRel_pushdown_project` on the projected relation to move `filterRel p` inside the projection as `filterRel (p.subst es)`. * Use `List.filter_filter` to merge the `rowAllSafe` and predicate filters into a single conjunction filter. * Close via `Bool.and_comm` to flip the conjunction order. Errors-side pushdown is omitted by design — the two orderings collect predicate errors from different row sets (projected vs unprojected), and even multiset equality on `List EvalError` fails when `rowAllSafe` filtering removes rows that the predicate would have visited. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/ErrStream.lean | 42 +++++++++++++++++++++++ doc/developer/semantics/README.md | 3 +- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/doc/developer/semantics/Mz/ErrStream.lean b/doc/developer/semantics/Mz/ErrStream.lean index f3e216df38e68..a05792756b7a0 100644 --- a/doc/developer/semantics/Mz/ErrStream.lean +++ b/doc/developer/semantics/Mz/ErrStream.lean @@ -1,5 +1,6 @@ import Mz.Eval import Mz.Bag +import Mz.Pushdown /-! # Data / error stream pair @@ -311,4 +312,45 @@ theorem projectErrs_eq_nil_of_all_safe fun row hMem => h row (List.mem_cons_of_mem _ hMem) simp [List.flatMap_cons, rowErrs_nil_of_all_safe es hd hHead, ih hTl] +/-! 
## Predicate pushdown on `BagStream` + +Data-side pushdown lifts `filterRel_pushdown_project` (in +`Mz/Pushdown.lean`) to `BagStream`: the rows produced by +filtering after projecting equal the rows produced by +substituting through the projection and filtering before +projecting. Errors-side pushdown does *not* hold in general — +the two orderings produce different error collections — so this +theorem speaks only about the data slice. + +The error-collection asymmetry is structural: filtering after +projection collects predicate errors from the projected rows, +while filtering before projection collects them from the +original rows. Even under substitution `p.subst es`, the two err +sets are not equal as multisets when projection's `rowAllSafe` +filter changes which rows the predicate sees. -/ + +theorem BagStream.project_filter_pushdown_data + (p : Expr) (es : List Expr) (s : BagStream) : + (BagStream.filter p (BagStream.project es s)).data + = (BagStream.project es (BagStream.filter (p.subst es) s)).data := by + show filterRel p ((s.data.filter (rowAllSafe es)).map + (fun row => es.map (eval row))) + = ((filterRel (p.subst es) s.data).filter (rowAllSafe es)).map + (fun row => es.map (eval row)) + -- Step 1: push `filterRel p` through `map`, using pushdown on plain `project`. + rw [show ((s.data.filter (rowAllSafe es)).map (fun row => es.map (eval row))) + = Mz.project es (s.data.filter (rowAllSafe es)) from rfl] + rw [filterRel_pushdown_project p es (s.data.filter (rowAllSafe es))] + -- Now LHS = project es (filterRel (p.subst es) (s.data.filter (rowAllSafe es))). + -- Push the rowAllSafe filter through filterRel via filter/filter commutativity. 
+ show Mz.project es (filterRel (p.subst es) (s.data.filter (rowAllSafe es))) + = ((filterRel (p.subst es) s.data).filter (rowAllSafe es)).map + (fun row => es.map (eval row)) + unfold filterRel Mz.project + congr 1 + rw [List.filter_filter, List.filter_filter] + congr 1 + funext row + exact Bool.and_comm _ _ + end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index a299b34d05a3f..2e547f26b3c9a 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -23,6 +23,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/ErrStream.lean`: the dataflow-style `BagStream = (data, errors)` pair. `BagStream.filter` routes erroring rows into the error collection instead of dropping them, with idempotence proved at both the data and the error level. `BagStream.filter_comm_data` proves the data-side commutativity unconditionally; `BagStream.filter_comm_no_err` strengthens to full stream equality when neither predicate errors on the input data. + `BagStream.project_filter_pushdown_data` lifts `filterRel_pushdown_project` to the data side of `BagStream`: filtering after projecting agrees with substituting through the projection and filtering before projecting. Holds unconditionally on the data field; the errors collection diverges between the two orderings (filter sees projected vs unprojected rows) and is out of scope for this theorem. `BagStream.project` projects each row through a list of scalars; a row stays in the data collection only when every scalar succeeds, otherwise its err payloads (one per erroring scalar) are appended to the error collection. `rowErrs_nil_of_all_safe` and `projectErrs_eq_nil_of_all_safe` show that when no projection errs, `BagStream.project` does not extend the error collection. 
* `Mz/Pushdown.lean`: substitution (`Expr.subst`) plus the headline `eval_subst` theorem (substituting then evaluating against the original row equals evaluating against the projected row), and the relational predicate-pushdown rewrite `filterRel p (project es rel) = project es (filterRel (p.subst es) rel)`. @@ -89,7 +90,7 @@ Reviewers should expect both sides of the change in the same PR. The roadmap in priority order: * Full `BagStream.filter` commutativity (no preconditions). The error field requires a notion of multiset equality on `List EvalError` since list-order differs across permutations, and even multiset equality fails when both predicates err on the same row — `(p err, q err)` rows record only one error in either ordering, but different ones depending on which filter runs first. -* `BagStream.project` / `BagStream.filter` commutativity (when the predicate references only un-projected columns). Same multiset-equality caveat on the error collection. +* Errors-side `BagStream.project` / `BagStream.filter` pushdown (the two orderings collect predicate errors from different row sets; even multiset equality on `List EvalError` fails when projection's `rowAllSafe` filter changes the predicate's input). * Tie `DiffWithError` to a concrete dataflow operator: model a `(Row, Time, DiffWithError ℤ)` triple stream and prove that an `error` diff at time `t` propagates to every downstream consolidation. * Joins on `BagStream` with explicit error propagation. * Further tightening of `Expr.might_error`. Short-circuit detection is in place for binary `AND` / `OR` against literal absorbers; ground-truth lookups (literal arithmetic, known-null operands, type guards) are the natural next refinements and remain additive against the current soundness proof. 
From fe9d6473df589ab0a664ac91c5d5512ef356557d Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 11:46:13 +0200 Subject: [PATCH 039/127] doc/semantics: no-error preservation for cross and filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two companion theorems pinning down when `UnifiedStream` operators stay in the `.val` slice of the diff-semiring. `UnifiedStream.cross_no_error`: every output diff is `.val (n * m)` when every input on both sides has `.val` diff. Diff multiplication of `.val * .val = .val (·*·)` never introduces `.error`. `UnifiedStream.filter_no_error`: every output diff is the same `.val n` as the input record's diff. Filter never modifies the diff — only the carrier changes when the predicate produces `.err` (the `.row` becomes `.err`). Together with `cross_diff_error_{left,right}` (absorption on the error side) and `cross_assoc` / `cross_length` (algebraic / cardinality laws), the operators on the unified diff-aware stream now have a complete characterization on both diff slices. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/Join.lean | 75 ++++++++++++++++++++++++++++ doc/developer/semantics/README.md | 2 +- 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/doc/developer/semantics/Mz/Join.lean b/doc/developer/semantics/Mz/Join.lean index f690a003b5b24..dfdbb13470cd4 100644 --- a/doc/developer/semantics/Mz/Join.lean +++ b/doc/developer/semantics/Mz/Join.lean @@ -307,6 +307,81 @@ private theorem List.map_congr_local {α β : Type} h hd List.mem_cons_self, ih (fun x hMem => h x (List.mem_cons_of_mem _ hMem))] +/-- `cross` preserves `.val` diffs: when every input record on +both sides has a `.val` diff, every output record has a `.val` +diff. Diff multiplication of `.val * .val = .val (· * ·)` +stays in the ordinary `Int` slice of the diff-semiring. 
-/ +theorem UnifiedStream.cross_no_error + (l r : UnifiedStream) + (hL : ∀ ld ∈ l, ∃ n : Int, ld.2 = DiffWithError.val n) + (hR : ∀ rd ∈ r, ∃ m : Int, rd.2 = DiffWithError.val m) : + ∀ od ∈ UnifiedStream.cross l r, ∃ k : Int, od.2 = DiffWithError.val k := by + intro od hMem + show ∃ k : Int, od.2 = DiffWithError.val k + obtain ⟨ld, hLMem, hMid⟩ := List.mem_flatMap.mp hMem + obtain ⟨rd, hRMem, hEq⟩ := List.mem_map.mp hMid + obtain ⟨n, hN⟩ := hL ld hLMem + obtain ⟨m, hM⟩ := hR rd hRMem + refine ⟨n * m, ?_⟩ + rw [← hEq] + show ld.2 * rd.2 = DiffWithError.val (n * m) + rw [hN, hM] + rfl + +/-- `filter` preserves `.val` diffs: survivors and rerouted err +rows inherit the input diff unchanged. The diff is never modified +by the filter — only the carrier changes when the predicate +returns `.err`. -/ +theorem UnifiedStream.filter_no_error + (pred : Expr) (us : UnifiedStream) + (h : ∀ ud ∈ us, ∃ n : Int, ud.2 = DiffWithError.val n) : + ∀ od ∈ UnifiedStream.filter pred us, ∃ n : Int, od.2 = DiffWithError.val n := by + intro od hMem + unfold UnifiedStream.filter at hMem + obtain ⟨ud, hUMem, hMid⟩ := List.mem_flatMap.mp hMem + obtain ⟨n, hN⟩ := h ud hUMem + -- ud = (uc, .val n); the produced records all carry diff `.val n` (or the + -- input record itself when the `.error` short-circuit fires — but that + -- case is vacuous here since `ud.2 = .val n` by hypothesis). 
+ refine ⟨n, ?_⟩ + obtain ⟨uc, d⟩ := ud + simp only at hN + subst hN + -- now ud = (uc, .val n); filter produces records whose snd is .val n + cases uc with + | row r => + show od.2 = DiffWithError.val n + cases h_eval : eval r pred with + | bool b => + cases b with + | true => + have hOd : od = (UnifiedRow.row r, DiffWithError.val n) := by + have := hMid + simp [h_eval] at this + exact this + rw [hOd] + | false => + exfalso + have := hMid + simp [h_eval] at this + | null => + exfalso + have := hMid + simp [h_eval] at this + | err e => + have hOd : od = (UnifiedRow.err e, DiffWithError.val n) := by + have := hMid + simp [h_eval] at this + exact this + rw [hOd] + | err e => + show od.2 = DiffWithError.val n + have hOd : od = (UnifiedRow.err e, DiffWithError.val n) := by + have := hMid + simp at this + exact this + rw [hOd] + /-- Cross is associative on the unified stream. The proof rewrites both sides into a common triple-fold via the list-monad equations and closes the leaves with `cross_step_assoc`. -/ diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 2e547f26b3c9a..6e145f729ea19 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -37,7 +37,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four *cardinality* — `consolidate_length_le` bounds the output by the input length (consolidation only merges, never expands); *no-error preservation* — `consolidate_no_error` proves that if every input diff is a `.val`, every output diff is a `.val`, so `.error` is the only source of absorption. * `Mz/Triple.lean`: `TimedRecord = (row, time, diff)` triple stream tying `DiffWithError` to differential dataflow's record format. Defines `consolidateAll` and `consolidateAt t`; lifts `Consolidate`'s absorption to per-time and stream-wide statements. Per-`(row, time)` bucketing reduces to applying `UnifiedConsolidate` inside each time slice. 
-* `Mz/Join.lean`: relational joins on the diff-aware `UnifiedStream`. `cross` is the cartesian product — carriers combine via `combineCarrier` (rows concatenate; err on either side wins, left first), diffs multiply through `DiffWithError`'s `Mul` instance. A `.error` diff on either input therefore absorbs to `.error` on the output via `DiffWithError.error_mul_{left,right}`. `join pred l r` filters the product through a join predicate. Theorems: `cross_length` (`l.length * r.length`), `filter_length_le` (filter is non-expanding), `join_length_le` (corollary). Diff-propagation theorems: `cross_diff_error_{left,right}` (a `.error` diff on either side propagates through every output record), `filter_preserves_error_diff` (a record carrying `.error` diff is never dropped by `filter` — the absorbing marker cannot be filtered away). Algebraic laws: `combineCarrier_assoc` (carrier combine is associative modulo `List.append_assoc`) and the headline `UnifiedStream.cross_assoc` (`(a × b) × c = a × (b × c)`). The proof rearranges nested `flatMap` / `map` via local list-monad lemmas and closes via `DiffWithError.mul_assoc` plus `combineCarrier_assoc`. +* `Mz/Join.lean`: relational joins on the diff-aware `UnifiedStream`. `cross` is the cartesian product — carriers combine via `combineCarrier` (rows concatenate; err on either side wins, left first), diffs multiply through `DiffWithError`'s `Mul` instance. A `.error` diff on either input therefore absorbs to `.error` on the output via `DiffWithError.error_mul_{left,right}`. `join pred l r` filters the product through a join predicate. Theorems: `cross_length` (`l.length * r.length`), `filter_length_le` (filter is non-expanding), `join_length_le` (corollary). Diff-propagation theorems: `cross_diff_error_{left,right}` (a `.error` diff on either side propagates through every output record), `filter_preserves_error_diff` (a record carrying `.error` diff is never dropped by `filter` — the absorbing marker cannot be filtered away). 
No-error preservation: `cross_no_error` and `filter_no_error` prove that all-`.val` input diffs yield all-`.val` output diffs, so `.error` is the only source of absorbing diffs in the joint output. Algebraic laws: `combineCarrier_assoc` (carrier combine is associative modulo `List.append_assoc`) and the headline `UnifiedStream.cross_assoc` (`(a × b) × c = a × (b × c)`). The proof rearranges nested `flatMap` / `map` via local list-monad lemmas and closes via `DiffWithError.mul_assoc` plus `combineCarrier_assoc`. * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. `groupByErrDistinct keyExpr rel` uses the spec-faithful `Datum.groupKeyEq`, which returns `false` whenever either side is `.err`, so every err key produces its own singleton group. From f473bff5cb90f70a326eb5d8afb3c876c4ee942c Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 11:49:29 +0200 Subject: [PATCH 040/127] doc/semantics: DiffWithError mul_comm + Int specializations `DiffWithError.mul_comm`: commutativity of `*` lifted from the base type. The `.error` absorber commutes trivially; `.val * .val` reduces to base `*` commutativity. Five `_int` specializations discharge the base-law hypotheses once at `Int` so the downstream code can cite named laws without repeating `Int.mul_assoc` / `Int.add_comm` arguments: * `add_comm_int`, `add_assoc_int` * `mul_comm_int`, `mul_assoc_int` * `mul_add_int` `Join.lean`'s `cross_step_assoc` switches to `mul_assoc_int` so the call site no longer threads the base hypothesis. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/DiffSemiring.lean | 41 ++++++++++++++++++++ doc/developer/semantics/Mz/Join.lean | 2 +- doc/developer/semantics/README.md | 2 +- 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/doc/developer/semantics/Mz/DiffSemiring.lean b/doc/developer/semantics/Mz/DiffSemiring.lean index 8bb7c73faaab0..3d628de40dcc0 100644 --- a/doc/developer/semantics/Mz/DiffSemiring.lean +++ b/doc/developer/semantics/Mz/DiffSemiring.lean @@ -199,6 +199,47 @@ theorem mul_assoc [Mul α] | val _ => rfl | error => rfl +/-- Commutativity of `*` (when the base has it). -/ +theorem mul_comm [Mul α] (h_comm : ∀ x y : α, x * y = y * x) + (a b : DiffWithError α) : a * b = b * a := by + cases a with + | val x => + cases b with + | val y => + show (val (x * y) : DiffWithError α) = val (y * x) + rw [h_comm] + | error => rfl + | error => + cases b with + | val _ => rfl + | error => rfl + +/-! ## Int specializations + +The diff-aware operators in `Mz/UnifiedStream.lean` and +`Mz/Join.lean` instantiate the base type at `Int`. The following +specializations discharge the `h_*` hypotheses once at the type +level so downstream code can cite the named laws without +re-supplying base proofs every time. 
-/ + +theorem add_comm_int (a b : DiffWithError Int) : a + b = b + a := + add_comm Int.add_comm a b + +theorem add_assoc_int (a b c : DiffWithError Int) : + (a + b) + c = a + (b + c) := + add_assoc Int.add_assoc a b c + +theorem mul_assoc_int (a b c : DiffWithError Int) : + (a * b) * c = a * (b * c) := + mul_assoc Int.mul_assoc a b c + +theorem mul_comm_int (a b : DiffWithError Int) : a * b = b * a := + mul_comm Int.mul_comm a b + +theorem mul_add_int (a b c : DiffWithError Int) : + a * (b + c) = a * b + a * c := + mul_add Int.mul_add a b c + end DiffWithError end Mz diff --git a/doc/developer/semantics/Mz/Join.lean b/doc/developer/semantics/Mz/Join.lean index dfdbb13470cd4..3238f1dd8b0fa 100644 --- a/doc/developer/semantics/Mz/Join.lean +++ b/doc/developer/semantics/Mz/Join.lean @@ -244,7 +244,7 @@ private theorem cross_step_assoc = (combineCarrier ad.1 (combineCarrier bd.1 cd.1), ad.2 * (bd.2 * cd.2)) := by congr 1 · exact combineCarrier_assoc ad.1 bd.1 cd.1 - · exact DiffWithError.mul_assoc Int.mul_assoc ad.2 bd.2 cd.2 + · exact DiffWithError.mul_assoc_int ad.2 bd.2 cd.2 /-- Local lemma: associativity of `flatMap`. Lean core has the building blocks (`flatMap_cons`, `flatMap_append`) but not the diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 6e145f729ea19..544f4b45ed63b 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -27,7 +27,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `BagStream.project` projects each row through a list of scalars; a row stays in the data collection only when every scalar succeeds, otherwise its err payloads (one per erroring scalar) are appended to the error collection. `rowErrs_nil_of_all_safe` and `projectErrs_eq_nil_of_all_safe` show that when no projection errs, `BagStream.project` does not extend the error collection. 
* `Mz/Pushdown.lean`: substitution (`Expr.subst`) plus the headline `eval_subst` theorem (substituting then evaluating against the original row equals evaluating against the projected row), and the relational predicate-pushdown rewrite `filterRel p (project es rel) = project es (filterRel (p.subst es) rel)`. -* `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity laws that downstream operators must respect. Now wired through `UnifiedStream` / `Join` — every operator on the unified stream speaks `DiffWithError Int` directly. +* `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity laws that downstream operators must respect. The `_int` specializations (`add_comm_int`, `add_assoc_int`, `mul_assoc_int`, `mul_comm_int`, `mul_add_int`) discharge the base hypotheses at `Int` so downstream code in `Mz/Join.lean` and `Mz/UnifiedConsolidate.lean` can cite the named laws directly. * `Mz/UnifiedStream.lean`: unified diff-aware alternative to `BagStream`. `UnifiedStream := List (UnifiedRow × DiffWithError Int)` pairs a carrier (data row or row-scoped err) with a differential-dataflow diff augmented by the absorbing `error` element. Row-scoped errors flow through the carrier; collection-scoped errors flow through diff multiplication / addition. `ofBag` / `split` conversions assign every bag record a diff of `.val 1`; the round-trip theorem `split (ofBag s) = s` holds. The cross-direction is exact only up to multiset equality on `List EvalError` and is lossy for diffs ≠ `.val 1` (split drops diff information). 
* `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. `aggTry` for the proposed `try_sum`/`try_min`/`try_max` variants that swallow `err` into `NULL` instead of propagating, defined as a post-pass on `aggStrict`. Theorems: `aggStrict_err` (any `err` input → `err` output), `aggStrict_no_err` (no-err inputs + no-err reducer → no-err output), `aggTry_no_err` (the non-strict variant never errors), and `aggTry_eq_aggStrict_of_no_err` (strict and non-strict agree on error-free inputs). * `Mz/Consolidate.lean`: per-key diff summation over `List (DiffWithError α)`. The headline `sumAll_eq_error_of_mem` proves that an `error` diff anywhere in the list absorbs the consolidated sum to `error`, which is the property a differential dataflow `compact` operator cites when propagating global errors through consolidation. Companion `sumAll_val_of_all_val` says an all-`val` list sums to `val` of some base value. From 91d0a07e2758ad46dc8275edf5c1a104963a50b4 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 11:52:33 +0200 Subject: [PATCH 041/127] doc/semantics: refresh roadmap Reorganize Next steps into three buckets: * Blocked: items the proof obligation cannot discharge as stated (full BagStream.filter commutativity, errors-side pushdown). Each gets a one-line obstruction sentence so future contributors do not redo the analysis. * Additive refinements: further might_error tightening, UnifiedStream.project, strict consolidate cardinality, Triple retirement. * Material expansions: numeric arithmetic, set operations, distinct, spec-doc cross-references. Drop completed items (DiffSemiring wiring, joins on UnifiedStream, diff-aware operator family, Triple-style consolidate at the joint key, predicate / projection rewrites at the bag level). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/README.md | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 544f4b45ed63b..9a7f96910cce1 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -87,13 +87,23 @@ Reviewers should expect both sides of the change in the same PR. ## Next steps -The roadmap in priority order: +The diff-semiring extension is in scope: `UnifiedStream` records carry `(UnifiedRow × DiffWithError Int)` and `filter`, `cross`, `join`, `consolidate`, `consolidateAtTime` preserve / multiply / absorb the diff per the semiring laws. -* Full `BagStream.filter` commutativity (no preconditions). The error field requires a notion of multiset equality on `List EvalError` since list-order differs across permutations, and even multiset equality fails when both predicates err on the same row — `(p err, q err)` rows record only one error in either ordering, but different ones depending on which filter runs first. -* Errors-side `BagStream.project` / `BagStream.filter` pushdown (the two orderings collect predicate errors from different row sets; even multiset equality on `List EvalError` fails when projection's `rowAllSafe` filter changes the predicate's input). -* Tie `DiffWithError` to a concrete dataflow operator: model a `(Row, Time, DiffWithError ℤ)` triple stream and prove that an `error` diff at time `t` propagates to every downstream consolidation. -* Joins on `BagStream` with explicit error propagation. -* Further tightening of `Expr.might_error`. Short-circuit detection is in place for binary `AND` / `OR` against literal absorbers; ground-truth lookups (literal arithmetic, known-null operands, type guards) are the natural next refinements and remain additive against the current soundness proof. -* Lift to bag semantics for predicate / projection rewrites. 
+### Blocked -The diff-semiring extension is now in scope: `UnifiedStream` records carry a `DiffWithError Int` and operators preserve / multiply / absorb it as appropriate. +* Full `BagStream.filter` commutativity (no preconditions). The error field requires a notion of multiset equality on `List EvalError`, and even multiset equality fails when both predicates err on the same row — `(p err, q err)` rows record different err payloads depending on filter order. The data side is closed by `filter_comm_data`; the precondition variant by `filter_comm_no_err`. +* Errors-side `BagStream.project` / `BagStream.filter` pushdown. The two orderings collect predicate errors from different row sets; even multiset equality fails when `rowAllSafe` filtering removes rows the predicate would have visited. The data side is closed by `project_filter_pushdown_data`. + +### Additive refinements + +* Tightening `Expr.might_error` further. Short-circuit detection covers binary / variadic `AND` / `OR` and `IfThen` against literal absorbers. Remaining: ground-truth lookups (literal arithmetic, known-null operands, type-driven guards), all additive against the current soundness proof. +* `UnifiedStream.project` analogous to `BagStream.project` but diff-aware. Each scalar can contribute its own erroring records via row-scoped err carriers; collection-scoped `.error` diffs propagate. +* Strict cardinality bound for `UnifiedConsolidate`: when a carrier appears `k > 1` times in the input, the output is at least `k - 1` shorter than the input. +* Drop / replace `Mz/Triple.lean` once `Mz/TimedConsolidate.lean` covers every consolidation the spec doc needs. + +### Material expansions + +* Numeric arithmetic. Extend `Datum` with `.int (n : Int)`, add `Expr.plus` / `.minus` / `.times` / `.divide`, and a `divide-by-zero` `EvalError` variant. Tightens the err-handling story since divide-by-zero is the canonical `EvalError`. +* Set operations (`UNION ALL`, `INTERSECT ALL`, `EXCEPT ALL`) on `UnifiedStream`. 
Composing with `consolidate` derives the set-semantics (`UNION` / `INTERSECT` / `EXCEPT`) variants. +* `distinct` operator on `UnifiedStream`: collapse multiplicity via `consolidate` + sign normalization. +* Cross-link the spec doc (`../design/20260517_error_handling_semantics.md`) to specific theorem names via `[Mz/...:thm]` cross-references. From 644e0fca07a6908e1b71efb9f91a1c9ccb7a9085 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 11:53:39 +0200 Subject: [PATCH 042/127] doc/semantics: rebase Triple onto TimedUnifiedRecord MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch `Mz/Triple.lean` from the bespoke `TimedRecord` struct over plain `Row` and a generic diff base type to `TimedUnifiedRecord = UnifiedRow × Nat × DiffWithError Int` — the carrier shared with `Mz/TimedConsolidate.lean`. Row-scoped errors now flow through the `UnifiedRow` carrier; collection- scoped errors flow through the `.error` diff marker. Drop the generic-`α` parametrization. Every downstream operator on the unified stream speaks `DiffWithError Int`; the arbitrary-base flexibility was never used. Two diff-flat consolidation views (no row bucketing): * `TimedUnifiedStream.consolidateAll`: collection-wide sum. * `TimedUnifiedStream.consolidateAtTimeFlat t`: per-time sum. Both reduce to `DiffWithError.sumAll`. The renamed `consolidateAtTimeFlat` disambiguates from `TimedConsolidate.consolidateAtTime`, which buckets per `(row, time)` and returns a `UnifiedStream` instead of a scalar diff. Absorption theorems `consolidateAll_eq_error_of_mem` and `consolidateAtTimeFlat_eq_error_of_mem` are preserved. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/Triple.lean | 107 ++++++++++++------------- doc/developer/semantics/README.md | 3 +- 2 files changed, 52 insertions(+), 58 deletions(-) diff --git a/doc/developer/semantics/Mz/Triple.lean b/doc/developer/semantics/Mz/Triple.lean index 5e9971e216c4f..96dae513d0439 100644 --- a/doc/developer/semantics/Mz/Triple.lean +++ b/doc/developer/semantics/Mz/Triple.lean @@ -1,78 +1,73 @@ -import Mz.Bag +import Mz.UnifiedStream +import Mz.TimedConsolidate import Mz.DiffSemiring import Mz.Consolidate /-! -# Timed records — `(Row, Time, Diff)` triple stream +# Collection-wide diff sum on the timed unified stream -A first sketch of differential dataflow's record format: a stream -of `(row, time, diff)` triples, where the diff lives in -`DiffWithError α`. The skeleton uses `Nat` for time and parametrizes -over the base diff type. +Differential dataflow records arrive as `(row, time, diff)` triples. +The carrier here is `TimedUnifiedRecord` from +`Mz/TimedConsolidate.lean`, which pairs a `UnifiedRow` (data row +or row-scoped err) with a `Nat` time and a `DiffWithError Int` +diff. Row-scoped errors flow through the carrier; collection- +scoped errors flow through the diff via the absorbing `.error` +marker. -The operations modeled are consolidation by time (sum diffs across -all rows at a given time) and consolidation across the whole stream -(sum every diff). Both reduce to `DiffWithError.sumAll`, so the -absorption laws from `Mz/Consolidate.lean` transport directly: if -any record in the consolidated range carries an `error` diff, the -consolidated total is `error`. +This file gives two consolidation views that *do not* bucket by +row: -Per-`(row, time)` bucketing is the next refinement and requires -`DecidableEq` on `Row`; the present file does the simpler "sum -everything in the time slice" version, which is the per-time -collection-global diff. 
--/ +* `consolidateAll`: sum every diff in the stream, ignoring row + and time. The collection-wide diff. +* `consolidateAtTimeFlat t`: sum every diff at time `t`, ignoring row. + The per-time collection diff. -namespace Mz +Both reduce to `DiffWithError.sumAll`, so the absorption laws +from `Mz/Consolidate.lean` transport directly: an `.error` diff +anywhere in the consolidated range forces the consolidated total +to `.error`. -/-- A timed record: row, time, and a diff value possibly carrying -the absorbing `error` marker. -/ -structure TimedRecord (α : Type) where - row : Row - time : Nat - diff : DiffWithError α - deriving Inhabited +For per-`(row, time)` bucketing — where the output is itself a +`UnifiedStream`, one record per surviving carrier with the +bucket's summed diff — use `TimedUnifiedStream.consolidateAtTime` +in `Mz/TimedConsolidate.lean`. The two views are complementary: +this file collapses time slices to a single diff value; the +TimedConsolidate view collapses each `(row, time)` bucket +separately. +-/ -/-- A stream of timed records. Order does not matter; the operations -below are insensitive to order whenever the base `Add` on `α` is -commutative. -/ -abbrev TimedStream (α : Type) := List (TimedRecord α) +namespace Mz -/-- Sum every diff in the stream, ignoring row and time. The -collection-wide diff. -/ -def TimedStream.consolidateAll [Zero α] [Add α] (s : TimedStream α) : - DiffWithError α := - DiffWithError.sumAll (s.map (·.diff)) +/-- Sum every diff in the stream, ignoring row and time. -/ +def TimedUnifiedStream.consolidateAll (s : TimedUnifiedStream) : DiffWithError Int := + DiffWithError.sumAll (s.map (·.2.2)) -/-- Sum every diff at a given time, ignoring row. The per-time -collection diff. -/ -def TimedStream.consolidateAt [Zero α] [Add α] (t : Nat) (s : TimedStream α) : - DiffWithError α := - DiffWithError.sumAll ((s.filter (·.time = t)).map (·.diff)) +/-- Sum every diff at a given time, ignoring row.
-/ +def TimedUnifiedStream.consolidateAtTimeFlat + (t : Nat) (s : TimedUnifiedStream) : DiffWithError Int := + DiffWithError.sumAll ((s.filter (·.2.1 = t)).map (·.2.2)) /-! ## Absorption -/ -/-- If any record carries an `error` diff, the all-stream consolidation -is `error`. -/ -theorem TimedStream.consolidateAll_eq_error_of_mem [Zero α] [Add α] - {s : TimedStream α} (r : TimedRecord α) - (h_mem : r ∈ s) (h_err : r.diff = DiffWithError.error) : - TimedStream.consolidateAll s = DiffWithError.error := by - unfold TimedStream.consolidateAll +/-- An `.error` diff anywhere in the stream forces the +collection-wide consolidation to `.error`. -/ +theorem TimedUnifiedStream.consolidateAll_eq_error_of_mem + {s : TimedUnifiedStream} (r : TimedUnifiedRecord) + (h_mem : r ∈ s) (h_err : r.2.2 = (DiffWithError.error : DiffWithError Int)) : + TimedUnifiedStream.consolidateAll s = DiffWithError.error := by + unfold TimedUnifiedStream.consolidateAll apply DiffWithError.sumAll_eq_error_of_mem - -- Need: error ∈ s.map (·.diff). Since r ∈ s and r.diff = error, by List.mem_map. refine List.mem_map.mpr ⟨r, h_mem, ?_⟩ exact h_err -/-- Same statement restricted to a single time slice: an `error` -record at time `t` forces the per-time consolidation at `t` to -`error`. -/ -theorem TimedStream.consolidateAt_eq_error_of_mem [Zero α] [Add α] - {s : TimedStream α} (t : Nat) (r : TimedRecord α) - (h_mem : r ∈ s) (h_time : r.time = t) - (h_err : r.diff = DiffWithError.error) : - TimedStream.consolidateAt t s = DiffWithError.error := by - unfold TimedStream.consolidateAt +/-- Restricted to a time slice: an `.error` record at time `t` +forces the per-time flat consolidation at `t` to `.error`. 
-/ +theorem TimedUnifiedStream.consolidateAtTimeFlat_eq_error_of_mem + {s : TimedUnifiedStream} (t : Nat) (r : TimedUnifiedRecord) + (h_mem : r ∈ s) (h_time : r.2.1 = t) + (h_err : r.2.2 = (DiffWithError.error : DiffWithError Int)) : + TimedUnifiedStream.consolidateAtTimeFlat t s = DiffWithError.error := by + unfold TimedUnifiedStream.consolidateAtTimeFlat apply DiffWithError.sumAll_eq_error_of_mem refine List.mem_map.mpr ⟨r, ?_, h_err⟩ exact List.mem_filter.mpr ⟨h_mem, by simp [h_time]⟩ diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 9a7f96910cce1..1cda1c097ebf0 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -36,7 +36,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four *absorption* — `consolidate_preserves_error` proves an `.error` diff anywhere in the input gives an `.error` diff in the consolidated output for that carrier; *cardinality* — `consolidate_length_le` bounds the output by the input length (consolidation only merges, never expands); *no-error preservation* — `consolidate_no_error` proves that if every input diff is a `.val`, every output diff is a `.val`, so `.error` is the only source of absorption. -* `Mz/Triple.lean`: `TimedRecord = (row, time, diff)` triple stream tying `DiffWithError` to differential dataflow's record format. Defines `consolidateAll` and `consolidateAt t`; lifts `Consolidate`'s absorption to per-time and stream-wide statements. Per-`(row, time)` bucketing reduces to applying `UnifiedConsolidate` inside each time slice. +* `Mz/Triple.lean`: collection-wide and per-time *flat* consolidation views on `TimedUnifiedStream`. `consolidateAll` sums every diff in the stream; `consolidateAtTimeFlat t` sums every diff at time `t`. Both ignore the carrier — they collapse a time slice (or the whole stream) to one `DiffWithError Int`. Absorption: `consolidateAll_eq_error_of_mem` and `consolidateAtTimeFlat_eq_error_of_mem`. 
Complementary to `Mz/TimedConsolidate.lean`'s `consolidateAtTime t`, which buckets per `(row, time)` and returns a `UnifiedStream`. * `Mz/Join.lean`: relational joins on the diff-aware `UnifiedStream`. `cross` is the cartesian product — carriers combine via `combineCarrier` (rows concatenate; err on either side wins, left first), diffs multiply through `DiffWithError`'s `Mul` instance. A `.error` diff on either input therefore absorbs to `.error` on the output via `DiffWithError.error_mul_{left,right}`. `join pred l r` filters the product through a join predicate. Theorems: `cross_length` (`l.length * r.length`), `filter_length_le` (filter is non-expanding), `join_length_le` (corollary). Diff-propagation theorems: `cross_diff_error_{left,right}` (a `.error` diff on either side propagates through every output record), `filter_preserves_error_diff` (a record carrying `.error` diff is never dropped by `filter` — the absorbing marker cannot be filtered away). No-error preservation: `cross_no_error` and `filter_no_error` prove that all-`.val` input diffs yield all-`.val` output diffs, so `.error` is the only source of absorbing diffs in the joint output. Algebraic laws: `combineCarrier_assoc` (carrier combine is associative modulo `List.append_assoc`) and the headline `UnifiedStream.cross_assoc` (`(a × b) × c = a × (b × c)`). The proof rearranges nested `flatMap` / `map` via local list-monad lemmas and closes via `DiffWithError.mul_assoc` plus `combineCarrier_assoc`. * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. @@ -99,7 +99,6 @@ The diff-semiring extension is in scope: `UnifiedStream` records carry `(Unified * Tightening `Expr.might_error` further. Short-circuit detection covers binary / variadic `AND` / `OR` and `IfThen` against literal absorbers. 
Remaining: ground-truth lookups (literal arithmetic, known-null operands, type-driven), all additive against the current soundness proof. * `UnifiedStream.project` analogous to `BagStream.project` but diff-aware. Each scalar can contribute its own erroring records via row-scoped err carriers; collection-scoped `.error` diffs propagate. * Strict cardinality bound for `UnifiedConsolidate`: when a carrier appears `k > 1` times in the input, the output is `k - 1` shorter than the input. -* Drop / replace `Mz/Triple.lean` once `Mz/TimedConsolidate.lean` covers every consolidation the spec doc needs. ### Material expansions From c05eb1f8dab861e5d2ba8d03c9817f00b01b2d1f Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 13:21:38 +0200 Subject: [PATCH 043/127] doc/semantics: add Datum.int and EvalError.divisionByZero Extend the `Datum` type with `.int (n : Int)` and the `EvalError` type with a `divisionByZero` variant. The new variants set up the numeric arithmetic story: `divide-by-zero` is the canonical cell-scoped error and the natural anchor for the err-handling design doc. Exhaustiveness across the existing proofs is restored without weakening any law: * `evalAnd` and `evalOr` extend with `.int`-preserving identity patterns. The two-sided identity laws `evalAnd_true_*` and `evalOr_false_*` continue to hold universally; commutativity on `.int n, .int m` uses a value-level `if n = m` to keep both sides equal when the payloads disagree; idempotence closes via `if_pos rfl`. * `evalNot` is identity on `.int` (preserves involutivity). * `evalIfThen` falls through to `.null` on a non-`.bool` condition (type-error stand-in). * `Coalesce.go` extends with an explicit `.int n :: _` arm that short-circuits to `.int n`, mirroring the `.bool b` case. * `aggCountNonNull`, `aggStrict`, `aggTry_*` extend with the parallel `.int` arms. 
Per-file exhaustiveness fixes in Boolean.lean, Laws.lean, Variadic.lean, ExprVariadic.lean, Aggregate.lean, ErrStream.lean, GroupBy.lean, Join.lean, MightError.lean. This commit is foundation-only; the new `Expr.plus` / `.minus` / `.times` / `.divide` constructors and the operator-level semantics arrive in the next commit. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/Aggregate.lean | 67 ++++++++++++++++++++ doc/developer/semantics/Mz/Boolean.lean | 3 + doc/developer/semantics/Mz/Datum.lean | 21 +++--- doc/developer/semantics/Mz/ErrStream.lean | 3 + doc/developer/semantics/Mz/ExprVariadic.lean | 1 + doc/developer/semantics/Mz/GroupBy.lean | 8 +++ doc/developer/semantics/Mz/Join.lean | 5 ++ doc/developer/semantics/Mz/Laws.lean | 36 +++++++++++ doc/developer/semantics/Mz/MightError.lean | 54 ++++++++++++---- doc/developer/semantics/Mz/PrimEval.lean | 47 +++++++++----- doc/developer/semantics/Mz/Variadic.lean | 2 + 11 files changed, 210 insertions(+), 37 deletions(-) diff --git a/doc/developer/semantics/Mz/Aggregate.lean b/doc/developer/semantics/Mz/Aggregate.lean index 000ff9fe701e2..2e675dd469b66 100644 --- a/doc/developer/semantics/Mz/Aggregate.lean +++ b/doc/developer/semantics/Mz/Aggregate.lean @@ -27,6 +27,7 @@ namespace Mz def aggCountNonNull : List Datum → Nat | [] => 0 | .bool _ :: rest => 1 + aggCountNonNull rest + | .int _ :: rest => 1 + aggCountNonNull rest | .null :: rest => aggCountNonNull rest | .err _ :: rest => aggCountNonNull rest @@ -43,6 +44,11 @@ def aggStrict (f : Datum → Datum → Datum) : List Datum → Datum | .err e => .err e | .null => x | r => f x r + | x@(.int _) :: rest => + match aggStrict f rest with + | .err e => .err e + | .null => x + | r => f x r /-! 
## Strict propagation laws -/ @@ -81,6 +87,24 @@ theorem aggStrict_err : rw [h_eval]; trivial | null => rw [h_eval] at h_rest; exact h_rest.elim | bool _ => rw [h_eval] at h_rest; exact h_rest.elim + | int _ => rw [h_eval] at h_rest; exact h_rest.elim + | Datum.int n :: rest, f, h => by + obtain ⟨d, hmem, hsafe⟩ := h + cases hmem with + | head _ => exact hsafe.elim + | tail _ h' => + have h_rest : (aggStrict f rest).IsErr := + aggStrict_err f ⟨d, h', hsafe⟩ + cases h_eval : aggStrict f rest with + | err e' => + show (match aggStrict f rest with + | Datum.err e => Datum.err e + | Datum.null => Datum.int n + | r => f (Datum.int n) r).IsErr + rw [h_eval]; trivial + | null => rw [h_eval] at h_rest; exact h_rest.elim + | bool _ => rw [h_eval] at h_rest; exact h_rest.elim + | int _ => rw [h_eval] at h_rest; exact h_rest.elim /-- Dual: if no input is an `err`, the aggregate result is not an `err`. The reducer `f` is assumed to preserve "no-err": applied to @@ -122,6 +146,48 @@ theorem aggStrict_no_err apply f_safe · exact hb · intro h_eq; cases h_eq + | int _ => + show ¬(match aggStrict f rest with + | Datum.err e => Datum.err e + | Datum.null => Datum.bool b + | r => f (Datum.bool b) r).IsErr + rw [h_eval] + apply f_safe + · exact hb + · intro h_eq; cases h_eq + | Datum.int n :: rest, h => by + have h_rest : ¬(aggStrict f rest).IsErr := by + apply aggStrict_no_err f f_safe + intro d hmem; exact h d (List.Mem.tail _ hmem) + have hb : ¬(Datum.int n).IsErr := h _ (List.Mem.head _) + cases h_eval : aggStrict f rest with + | err e => + rw [h_eval] at h_rest + exact absurd trivial h_rest + | null => + show ¬(match aggStrict f rest with + | Datum.err e => Datum.err e + | Datum.null => Datum.int n + | r => f (Datum.int n) r).IsErr + rw [h_eval]; exact hb + | bool _ => + show ¬(match aggStrict f rest with + | Datum.err e => Datum.err e + | Datum.null => Datum.int n + | r => f (Datum.int n) r).IsErr + rw [h_eval] + apply f_safe + · exact hb + · intro h_eq; cases h_eq + | int _ => 
+ show ¬(match aggStrict f rest with + | Datum.err e => Datum.err e + | Datum.null => Datum.int n + | r => f (Datum.int n) r).IsErr + rw [h_eval] + apply f_safe + · exact hb + · intro h_eq; cases h_eq /-! ## Non-strict (`try_*`) variants @@ -164,5 +230,6 @@ theorem aggTry_eq_aggStrict_of_no_err | err _ => exact absurd (h_eval ▸ h_safe) (fun h' => h' trivial) | null => rfl | bool _ => rfl + | int _ => rfl end Mz diff --git a/doc/developer/semantics/Mz/Boolean.lean b/doc/developer/semantics/Mz/Boolean.lean index d8c9a32d68812..94d22f629d6b4 100644 --- a/doc/developer/semantics/Mz/Boolean.lean +++ b/doc/developer/semantics/Mz/Boolean.lean @@ -27,6 +27,7 @@ theorem and_false_left (d : Datum) : evalAnd (.bool false) d = .bool false := by theorem and_false_right (d : Datum) : evalAnd d (.bool false) = .bool false := by cases d with | bool b => cases b <;> rfl + | int _ => rfl | null => rfl | err _ => rfl @@ -54,6 +55,7 @@ theorem or_true_left (d : Datum) : evalOr (.bool true) d = .bool true := by theorem or_true_right (d : Datum) : evalOr d (.bool true) = .bool true := by cases d with | bool b => cases b <;> rfl + | int _ => rfl | null => rfl | err _ => rfl @@ -85,6 +87,7 @@ and `err`. The latter mirrors the strict propagation rule. -/ theorem not_not (d : Datum) : evalNot (evalNot d) = d := by cases d with | bool b => cases b <;> rfl + | int _ => rfl | null => rfl | err _ => rfl diff --git a/doc/developer/semantics/Mz/Datum.lean b/doc/developer/semantics/Mz/Datum.lean index 49fe205e0a7b7..c40f2a2ce9c3e 100644 --- a/doc/developer/semantics/Mz/Datum.lean +++ b/doc/developer/semantics/Mz/Datum.lean @@ -15,26 +15,25 @@ The Rust counterpart lives in `src/repr/src/row.rs` (`Datum`) and namespace Mz -/-- Opaque payload for cell-scoped errors. - -The skeleton does not enumerate the variants of the Rust -`EvalError`. A single placeholder constructor keeps the type -inhabited so that proofs that need a concrete value can supply one, -without committing to a wire format. 
Later refinements will replace -this with the real variant set. -/ +/-- Cell-scoped errors raised by `Datum`-level operations. The +skeleton's variants are intentionally small; production +`EvalError` (in `src/expr/src/scalar.rs`) has many more. -/ inductive EvalError | placeholder + | divisionByZero deriving DecidableEq, Inhabited /-- A modeled scalar value. -`bool b` is a boolean literal; `null` is the SQL `NULL` value; `err e` -is the proposed cell-scoped error variant whose payload is the -`EvalError` raised at the cell. -/ +`bool b` is a boolean literal; `int n` is an integer literal +(skeleton models `Int`, not the full SQL numeric tower); `null` +is the SQL `NULL` value; `err e` is the cell-scoped error +variant whose payload is the `EvalError` raised at the cell. -/ inductive Datum | bool (b : Bool) + | int (n : Int) | null - | err (e : EvalError) + | err (e : EvalError) deriving DecidableEq, Inhabited /-- Propositional predicate "this datum is an error". diff --git a/doc/developer/semantics/Mz/ErrStream.lean b/doc/developer/semantics/Mz/ErrStream.lean index a05792756b7a0..846f0e5d57b6b 100644 --- a/doc/developer/semantics/Mz/ErrStream.lean +++ b/doc/developer/semantics/Mz/ErrStream.lean @@ -78,6 +78,7 @@ theorem rows_in_filterRel_eval_to_true (pred : Expr) (rel : Relation) : | bool b => cases b · rw [h_eval] at h_pred; cases h_pred · rfl + | int _ => rw [h_eval] at h_pred; cases h_pred | null => rw [h_eval] at h_pred; cases h_pred | err _ => rw [h_eval] at h_pred; cases h_pred @@ -152,6 +153,7 @@ theorem errorRows_eq_nil_of_no_err | _ => errorRows pred tl) = [] cases h_eval : eval hd pred with | bool _ => exact ih hTl + | int _ => exact ih hTl | null => exact ih hTl | err _ => rw [h_eval] at hHd @@ -294,6 +296,7 @@ theorem rowErrs_nil_of_all_safe (es : List Expr) (row : Row) rw [List.filterMap_cons] cases h_eval : eval row hd with | bool _ => exact ihResult + | int _ => exact ihResult | null => exact ihResult | err e => rw [h_eval] at hHead diff --git 
a/doc/developer/semantics/Mz/ExprVariadic.lean b/doc/developer/semantics/Mz/ExprVariadic.lean index 10577c66d2369..a812bfec466fe 100644 --- a/doc/developer/semantics/Mz/ExprVariadic.lean +++ b/doc/developer/semantics/Mz/ExprVariadic.lean @@ -92,6 +92,7 @@ theorem eval_coalesce_singleton (env : Env) (a : Expr) : -- Case analysis on the underlying datum. cases h : eval env a with | bool b => rw [show evalCoalesce [Datum.bool b] = Datum.bool b from rfl] + | int n => rw [show evalCoalesce [Datum.int n] = Datum.int n from rfl] | null => rw [show evalCoalesce [Datum.null] = Datum.null from rfl] | err e => rw [show evalCoalesce [Datum.err e] = Datum.err e from rfl] diff --git a/doc/developer/semantics/Mz/GroupBy.lean b/doc/developer/semantics/Mz/GroupBy.lean index ea258b6b963ab..b8243df04e947 100644 --- a/doc/developer/semantics/Mz/GroupBy.lean +++ b/doc/developer/semantics/Mz/GroupBy.lean @@ -165,11 +165,19 @@ private theorem Datum.groupKeyEq_eq_decide_of_no_err cases b with | err _ => exact absurd (show Datum.IsErr (Datum.err _) from trivial) hB | bool _ => rfl + | int _ => rfl + | null => rfl + | int _ => + cases b with + | err _ => exact absurd (show Datum.IsErr (Datum.err _) from trivial) hB + | bool _ => rfl + | int _ => rfl | null => rfl | null => cases b with | err _ => exact absurd (show Datum.IsErr (Datum.err _) from trivial) hB | bool _ => rfl + | int _ => rfl | null => rfl /-- Inserting the same non-err key into a group list whose keys diff --git a/doc/developer/semantics/Mz/Join.lean b/doc/developer/semantics/Mz/Join.lean index 3238f1dd8b0fa..846ceedebbfe7 100644 --- a/doc/developer/semantics/Mz/Join.lean +++ b/doc/developer/semantics/Mz/Join.lean @@ -112,6 +112,7 @@ theorem UnifiedStream.filter_length_le (pred : Expr) (us : UnifiedStream) : | _ => []).length ≤ 1 cases h_eval : eval r pred with | bool b => cases b <;> simp [List.length_cons, List.length_nil] + | int _ => simp [List.length_nil] | null => simp [List.length_nil] | err _ => simp 
[List.length_cons] | err _ => @@ -364,6 +365,10 @@ theorem UnifiedStream.filter_no_error exfalso have := hMid simp [h_eval] at this + | int _ => + exfalso + have := hMid + simp [h_eval] at this | null => exfalso have := hMid diff --git a/doc/developer/semantics/Mz/Laws.lean b/doc/developer/semantics/Mz/Laws.lean index c97e31e159bd5..1fbd926cf446f 100644 --- a/doc/developer/semantics/Mz/Laws.lean +++ b/doc/developer/semantics/Mz/Laws.lean @@ -40,24 +40,28 @@ variadic fold in `Mz/Variadic.lean`. -/ theorem evalAnd_true_left (d : Datum) : evalAnd (.bool true) d = d := by cases d with | bool b => cases b <;> rfl + | int _ => rfl | null => rfl | err _ => rfl theorem evalAnd_true_right (d : Datum) : evalAnd d (.bool true) = d := by cases d with | bool b => cases b <;> rfl + | int _ => rfl | null => rfl | err _ => rfl theorem evalOr_false_left (d : Datum) : evalOr (.bool false) d = d := by cases d with | bool b => cases b <;> rfl + | int _ => rfl | null => rfl | err _ => rfl theorem evalOr_false_right (d : Datum) : evalOr d (.bool false) = d := by cases d with | bool b => cases b <;> rfl + | int _ => rfl | null => rfl | err _ => rfl @@ -66,12 +70,16 @@ theorem evalOr_false_right (d : Datum) : evalOr d (.bool false) = d := by theorem evalAnd_idem (d : Datum) : evalAnd d d = d := by cases d with | bool b => cases b <;> rfl + | int n => show (if n = n then Datum.int n else Datum.null) = .int n + rw [if_pos rfl] | null => rfl | err _ => rfl theorem evalOr_idem (d : Datum) : evalOr d d = d := by cases d with | bool b => cases b <;> rfl + | int n => show (if n = n then Datum.int n else Datum.null) = .int n + rw [if_pos rfl] | null => rfl | err _ => rfl @@ -91,11 +99,25 @@ theorem evalAnd_comm_of_no_err | bool b₁ => cases d₂ with | bool b₂ => cases b₁ <;> cases b₂ <;> rfl + | int _ => cases b₁ <;> rfl | null => cases b₁ <;> rfl | err _ => exact (h₂ trivial).elim + | int n => + cases d₂ with + | bool b₂ => cases b₂ <;> rfl + | int m => + by_cases h : n = m + · subst h; rfl + · 
have h' : ¬m = n := fun h_eq => h h_eq.symm + show (if n = m then Datum.int n else Datum.null) + = (if m = n then Datum.int m else Datum.null) + rw [if_neg h, if_neg h'] + | null => rfl + | err _ => exact (h₂ trivial).elim | null => cases d₂ with | bool b₂ => cases b₂ <;> rfl + | int _ => rfl | null => rfl | err _ => exact (h₂ trivial).elim | err _ => exact (h₁ trivial).elim @@ -107,11 +129,25 @@ theorem evalOr_comm_of_no_err | bool b₁ => cases d₂ with | bool b₂ => cases b₁ <;> cases b₂ <;> rfl + | int _ => cases b₁ <;> rfl | null => cases b₁ <;> rfl | err _ => exact (h₂ trivial).elim + | int n => + cases d₂ with + | bool b₂ => cases b₂ <;> rfl + | int m => + by_cases h : n = m + · subst h; rfl + · have h' : ¬m = n := fun h_eq => h h_eq.symm + show (if n = m then Datum.int n else Datum.null) + = (if m = n then Datum.int m else Datum.null) + rw [if_neg h, if_neg h'] + | null => rfl + | err _ => exact (h₂ trivial).elim | null => cases d₂ with | bool b₂ => cases b₂ <;> rfl + | int _ => rfl | null => rfl | err _ => exact (h₂ trivial).elim | err _ => exact (h₁ trivial).elim diff --git a/doc/developer/semantics/Mz/MightError.lean b/doc/developer/semantics/Mz/MightError.lean index 7bf03680b4cab..c3fa258fdad88 100644 --- a/doc/developer/semantics/Mz/MightError.lean +++ b/doc/developer/semantics/Mz/MightError.lean @@ -25,13 +25,23 @@ theorem evalAnd_not_err cases d₁ with | bool b₁ => cases d₂ with - | bool b₂ => cases b₁ <;> cases b₂ <;> decide - | null => cases b₁ <;> decide + | bool b₂ => cases b₁ <;> cases b₂ <;> (intro h; cases h) + | int _ => cases b₁ <;> (intro h; cases h) + | null => cases b₁ <;> (intro h; cases h) + | err _ => exact (h₂ trivial).elim + | int n => + cases d₂ with + | bool b₂ => cases b₂ <;> (intro h; cases h) + | int m => + show ¬(if n = m then Datum.int n else Datum.null).IsErr + split <;> (intro h; cases h) + | null => intro h; cases h | err _ => exact (h₂ trivial).elim | null => cases d₂ with - | bool b₂ => cases b₂ <;> decide - | null => decide + 
| bool b₂ => cases b₂ <;> (intro h; cases h) + | int _ => intro h; cases h + | null => intro h; cases h | err _ => exact (h₂ trivial).elim | err _ => exact (h₁ trivial).elim @@ -41,21 +51,32 @@ theorem evalOr_not_err cases d₁ with | bool b₁ => cases d₂ with - | bool b₂ => cases b₁ <;> cases b₂ <;> decide - | null => cases b₁ <;> decide + | bool b₂ => cases b₁ <;> cases b₂ <;> (intro h; cases h) + | int _ => cases b₁ <;> (intro h; cases h) + | null => cases b₁ <;> (intro h; cases h) + | err _ => exact (h₂ trivial).elim + | int n => + cases d₂ with + | bool b₂ => cases b₂ <;> (intro h; cases h) + | int m => + show ¬(if n = m then Datum.int n else Datum.null).IsErr + split <;> (intro h; cases h) + | null => intro h; cases h | err _ => exact (h₂ trivial).elim | null => cases d₂ with - | bool b₂ => cases b₂ <;> decide - | null => decide + | bool b₂ => cases b₂ <;> (intro h; cases h) + | int _ => intro h; cases h + | null => intro h; cases h | err _ => exact (h₂ trivial).elim | err _ => exact (h₁ trivial).elim theorem evalNot_not_err {d : Datum} (h : ¬d.IsErr) : ¬(evalNot d).IsErr := by cases d with - | bool b => cases b <;> decide - | null => decide + | bool b => cases b <;> (intro h; cases h) + | int _ => intro h; cases h + | null => intro h; cases h | err _ => exact (h trivial).elim theorem evalIfThen_not_err @@ -69,8 +90,10 @@ theorem evalIfThen_not_err simp only [evalIfThen]; exact he · -- true branch: evalIfThen reduces to `dt` simp only [evalIfThen]; exact ht + | int _ => + simp only [evalIfThen]; intro h; cases h | null => - simp only [evalIfThen]; decide + simp only [evalIfThen]; intro h; cases h | err _ => exact (hc trivial).elim /-! 
## Short-circuit absorbers @@ -86,6 +109,7 @@ theorem evalAnd_left_false (d : Datum) : evalAnd (.bool false) d = .bool false : theorem evalAnd_right_false (d : Datum) : evalAnd d (.bool false) = .bool false := by cases d with | bool b => cases b <;> rfl + | int _ => rfl | null => rfl | err _ => rfl @@ -94,6 +118,7 @@ theorem evalOr_left_true (d : Datum) : evalOr (.bool true) d = .bool true := rfl theorem evalOr_right_true (d : Datum) : evalOr d (.bool true) = .bool true := by cases d with | bool b => cases b <;> rfl + | int _ => rfl | null => rfl | err _ => rfl @@ -337,6 +362,12 @@ theorem Coalesce.go_not_err : show False simp only [Coalesce.go] at hRes cases hRes + | _, _, .int n :: _, _ => by + intro hRes + -- Coalesce.go _ _ (.int n :: _) = .int n + show False + simp only [Coalesce.go] at hRes + cases hRes | _, firstErr, .null :: rest, _ => by -- Recurse with seenNull=true. show ¬(Coalesce.go true firstErr rest).IsErr @@ -422,6 +453,7 @@ theorem might_error_sound : simp only [eval] at hRes cases d with | bool _ => cases hRes + | int _ => cases hRes | null => cases hRes | err _ => apply hMe diff --git a/doc/developer/semantics/Mz/PrimEval.lean b/doc/developer/semantics/Mz/PrimEval.lean index b52a05cfd420f..adbe796460cff 100644 --- a/doc/developer/semantics/Mz/PrimEval.lean +++ b/doc/developer/semantics/Mz/PrimEval.lean @@ -25,32 +25,47 @@ namespace Mz /-! ## Binary and ternary boolean evaluators -/ /-- AND evaluation table. Pattern order encodes the absorption -hierarchy `FALSE > ERROR > NULL > TRUE`. -/ +hierarchy `FALSE > ERROR > NULL > TRUE`. Non-boolean operands +(`.int _`) are preserved when paired with the identity element +`.bool true` (or an equal `.int`), so the algebraic laws +`evalAnd_true_left/right` and `evalAnd_idem` hold universally. +SQL would reject `.int` in `AND` at type-check time; the +skeleton's semantics is a coherent total extension.
-/ def evalAnd : Datum → Datum → Datum - | .bool false, _ => .bool false - | _, .bool false => .bool false - | .err e, _ => .err e - | _, .err e => .err e - | .null, _ => .null - | _, .null => .null + | .bool false, _ => .bool false + | _, .bool false => .bool false + | .err e, _ => .err e + | _, .err e => .err e + | .null, _ => .null + | _, .null => .null | .bool true, .bool true => .bool true + | .bool true, .int n => .int n + | .int n, .bool true => .int n + | .int n, .int m => if n = m then .int n else .null /-- OR evaluation table. Mirror of `evalAnd` with `TRUE` as the -dominant absorber: `TRUE > ERROR > NULL > FALSE`. -/ +dominant absorber: `TRUE > ERROR > NULL > FALSE`. Identity on +`.int` operands paired with the identity element `.bool false`. -/ def evalOr : Datum → Datum → Datum - | .bool true, _ => .bool true - | _, .bool true => .bool true - | .err e, _ => .err e - | _, .err e => .err e - | .null, _ => .null - | _, .null => .null + | .bool true, _ => .bool true + | _, .bool true => .bool true + | .err e, _ => .err e + | _, .err e => .err e + | .null, _ => .null + | _, .null => .null | .bool false, .bool false => .bool false + | .bool false, .int n => .int n + | .int n, .bool false => .int n + | .int n, .int m => if n = m then .int n else .null -/-- NOT evaluation table. Strict on `null` and `err`. -/ +/-- NOT evaluation table. Strict on `null` and `err`. Numeric +operands pass through unchanged so that `evalNot` stays +involutive on `.int` even though SQL would type-reject it. -/ def evalNot : Datum → Datum | .bool b => .bool (!b) | .null => .null | .err e => .err e + | .int n => .int n /-- `IfThen` evaluation table. Modeled strictly; see `Mz/Eval.lean` for the discussion of lazy vs strict in a total skeleton. -/ @@ -59,6 +74,7 @@ def evalIfThen : Datum → Datum → Datum → Datum | .bool false, _, de => de | .null, _, _ => .null | .err e, _, _ => .err e + | _, _, _ => .null /-! 
## Environment -/ @@ -106,6 +122,7 @@ def Coalesce.go (seenNull : Bool) (firstErr : Option EvalError) : | some e => .err e | none => .null | .bool b :: _ => .bool b + | .int n :: _ => .int n | .null :: rest => Coalesce.go true firstErr rest | .err e :: rest => match firstErr with diff --git a/doc/developer/semantics/Mz/Variadic.lean b/doc/developer/semantics/Mz/Variadic.lean index 3d5c7946c6d69..16efd03fed341 100644 --- a/doc/developer/semantics/Mz/Variadic.lean +++ b/doc/developer/semantics/Mz/Variadic.lean @@ -77,6 +77,7 @@ private theorem evalAnd_false_right_any (d : Datum) : evalAnd d (.bool false) = .bool false := by cases d with | bool b => cases b <;> rfl + | int _ => rfl | null => rfl | err _ => rfl @@ -84,6 +85,7 @@ private theorem evalOr_true_right_any (d : Datum) : evalOr d (.bool true) = .bool true := by cases d with | bool b => cases b <;> rfl + | int _ => rfl | null => rfl | err _ => rfl From 356d7cbfd105b7cf03ea89a91e37deea95f542cc Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 13:28:05 +0200 Subject: [PATCH 044/127] doc/semantics: numeric arithmetic and divide-by-zero MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Build the operator layer on top of the `.int` / `divisionByZero` foundation. Adds four binary arithmetic operators and wires them through every layer of the skeleton. Primitives in `Mz/PrimEval.lean`: * `evalPlus`, `evalMinus`, `evalTimes`: strict on `.err` and `.null`; integer addition / subtraction / multiplication on `.int n / .int m`; type-mismatched operands route to `.null`. * `evalDivide`: same propagation, plus the canonical erring rule — `.int n / .int 0` produces `.err .divisionByZero`, otherwise `.int (n / m)`. `Expr` constructors `.plus`, `.minus`, `.times`, `.divide` are wired through: * `eval` in `Mz/Eval.lean`. * `Expr.subst` in `Mz/Pushdown.lean` (recursive substitution), closing `eval_subst` over the new constructors. 
* `Expr.might_error` in `Mz/MightError.lean`: `plus` / `minus` / `times` recurse into operands (err iff any operand might err); `divide` returns `true` unconditionally — the canonical erring operator. Tightening to detect literal-nonzero divisors is additive follow-up. Soundness `might_error_sound` extended with four new branches. The `.divide` branch closes via absurd-premise; the additive branches close via three new helper lemmas `evalPlus_not_err` / `evalMinus_not_err` / `evalTimes_not_err`. Companion theorems: * `evalDivide_not_err_of_nonzero`: a literal-nonzero divisor makes `evalDivide` non-erring on integer operands. * `evalDivide_zero`: the divide-by-zero reduction, stated as a named theorem so optimization rewrites can cite it. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/Eval.lean | 4 + doc/developer/semantics/Mz/Expr.lean | 4 + doc/developer/semantics/Mz/MightError.lean | 126 +++++++++++++++++++++ doc/developer/semantics/Mz/PrimEval.lean | 45 ++++++++ doc/developer/semantics/Mz/Pushdown.lean | 16 +++ doc/developer/semantics/README.md | 7 +- 6 files changed, 198 insertions(+), 4 deletions(-) diff --git a/doc/developer/semantics/Mz/Eval.lean b/doc/developer/semantics/Mz/Eval.lean index f9acc4376fa69..e08fc43387a8b 100644 --- a/doc/developer/semantics/Mz/Eval.lean +++ b/doc/developer/semantics/Mz/Eval.lean @@ -37,5 +37,9 @@ def eval (env : Env) : Expr → Datum | .andN args => evalAndN (args.map (eval env)) | .orN args => evalOrN (args.map (eval env)) | .coalesce args => evalCoalesce (args.map (eval env)) + | .plus a b => evalPlus (eval env a) (eval env b) + | .minus a b => evalMinus (eval env a) (eval env b) + | .times a b => evalTimes (eval env a) (eval env b) + | .divide a b => evalDivide (eval env a) (eval env b) end Mz diff --git a/doc/developer/semantics/Mz/Expr.lean b/doc/developer/semantics/Mz/Expr.lean index b2af7186cdbbc..c7e32701f10a4 100644 --- a/doc/developer/semantics/Mz/Expr.lean +++ 
b/doc/developer/semantics/Mz/Expr.lean @@ -38,6 +38,10 @@ inductive Expr | andN (args : List Expr) | orN (args : List Expr) | coalesce (args : List Expr) + | plus (a b : Expr) + | minus (a b : Expr) + | times (a b : Expr) + | divide (a b : Expr) deriving Inhabited end Mz diff --git a/doc/developer/semantics/Mz/MightError.lean b/doc/developer/semantics/Mz/MightError.lean index c3fa258fdad88..80fa3f80e1456 100644 --- a/doc/developer/semantics/Mz/MightError.lean +++ b/doc/developer/semantics/Mz/MightError.lean @@ -79,6 +79,98 @@ theorem evalNot_not_err | null => intro h; cases h | err _ => exact (h trivial).elim +theorem evalPlus_not_err + {d₁ d₂ : Datum} (h₁ : ¬d₁.IsErr) (h₂ : ¬d₂.IsErr) : + ¬(evalPlus d₁ d₂).IsErr := by + cases d₁ with + | bool _ => + cases d₂ with + | bool _ => intro h; cases h + | int _ => intro h; cases h + | null => intro h; cases h + | err _ => exact (h₂ trivial).elim + | int _ => + cases d₂ with + | bool _ => intro h; cases h + | int _ => intro h; cases h + | null => intro h; cases h + | err _ => exact (h₂ trivial).elim + | null => + cases d₂ with + | bool _ => intro h; cases h + | int _ => intro h; cases h + | null => intro h; cases h + | err _ => exact (h₂ trivial).elim + | err _ => exact (h₁ trivial).elim + +theorem evalMinus_not_err + {d₁ d₂ : Datum} (h₁ : ¬d₁.IsErr) (h₂ : ¬d₂.IsErr) : + ¬(evalMinus d₁ d₂).IsErr := by + cases d₁ with + | bool _ => + cases d₂ with + | bool _ => intro h; cases h + | int _ => intro h; cases h + | null => intro h; cases h + | err _ => exact (h₂ trivial).elim + | int _ => + cases d₂ with + | bool _ => intro h; cases h + | int _ => intro h; cases h + | null => intro h; cases h + | err _ => exact (h₂ trivial).elim + | null => + cases d₂ with + | bool _ => intro h; cases h + | int _ => intro h; cases h + | null => intro h; cases h + | err _ => exact (h₂ trivial).elim + | err _ => exact (h₁ trivial).elim + +theorem evalTimes_not_err + {d₁ d₂ : Datum} (h₁ : ¬d₁.IsErr) (h₂ : ¬d₂.IsErr) : + ¬(evalTimes d₁ d₂).IsErr := 
by + cases d₁ with + | bool _ => + cases d₂ with + | bool _ => intro h; cases h + | int _ => intro h; cases h + | null => intro h; cases h + | err _ => exact (h₂ trivial).elim + | int _ => + cases d₂ with + | bool _ => intro h; cases h + | int _ => intro h; cases h + | null => intro h; cases h + | err _ => exact (h₂ trivial).elim + | null => + cases d₂ with + | bool _ => intro h; cases h + | int _ => intro h; cases h + | null => intro h; cases h + | err _ => exact (h₂ trivial).elim + | err _ => exact (h₁ trivial).elim + +/-- Division is the canonical erring operation: a right operand +of `.int 0` produces `.err .divisionByZero` even when both +operands are otherwise error-free. So the analyzer's universal +"divide might err" verdict is exactly right; soundness on +`.divide` proceeds via the absurd-premise path. -/ +theorem evalDivide_not_err_of_nonzero + {n m : Int} (hm : m ≠ 0) : + ¬(evalDivide (.int n) (.int m)).IsErr := by + show ¬(if m = 0 then Datum.err EvalError.divisionByZero + else Datum.int (n / m)).IsErr + rw [if_neg hm] + intro h; cases h + +theorem evalDivide_zero (n : Int) : + evalDivide (.int n) (.int 0) = .err .divisionByZero := by + show (if (0 : Int) = 0 then Datum.err EvalError.divisionByZero + else Datum.int (n / 0)) + = Datum.err .divisionByZero + rw [if_pos rfl] + theorem evalIfThen_not_err {dc dt de : Datum} (hc : ¬dc.IsErr) (ht : ¬dt.IsErr) (he : ¬de.IsErr) : @@ -266,6 +358,10 @@ def Expr.might_error : Expr → Bool else Expr.argsMightError args | .coalesce [] => false | .coalesce (a :: rest) => a.might_error && Expr.argsAllMightError rest + | .plus a b => a.might_error || b.might_error + | .minus a b => a.might_error || b.might_error + | .times a b => a.might_error || b.might_error + | .divide _ _ => true /-- Bool fold of `might_error` over a list of operands ("does any operand might-error"), declared mutually with `might_error` so @@ -721,5 +817,35 @@ theorem might_error_sound : (ds := (a :: rest).map (eval env)) ?_ hRes' exact ⟨eval env e, 
List.mem_map.mpr ⟨e, e_mem, rfl⟩, he⟩ + | .plus a b, env, hMe, hEnv => by + intro hRes + simp only [eval] at hRes + have ha : ¬(a.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) + have hb : ¬(b.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) + exact evalPlus_not_err + (might_error_sound a env ha hEnv) + (might_error_sound b env hb hEnv) hRes + | .minus a b, env, hMe, hEnv => by + intro hRes + simp only [eval] at hRes + have ha : ¬(a.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) + have hb : ¬(b.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) + exact evalMinus_not_err + (might_error_sound a env ha hEnv) + (might_error_sound b env hb hEnv) hRes + | .times a b, env, hMe, hEnv => by + intro hRes + simp only [eval] at hRes + have ha : ¬(a.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) + have hb : ¬(b.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) + exact evalTimes_not_err + (might_error_sound a env ha hEnv) + (might_error_sound b env hb hEnv) hRes + | .divide a b, env, hMe, _ => by + -- `.divide _ _` is always tagged `might_error = true`, so the premise + -- `¬might_error = true` is absurd. A future tightening will detect + -- literal-nonzero divisors and lift this case to a real proof. + intro _ + exact hMe rfl end Mz diff --git a/doc/developer/semantics/Mz/PrimEval.lean b/doc/developer/semantics/Mz/PrimEval.lean index adbe796460cff..0ec90cff61f74 100644 --- a/doc/developer/semantics/Mz/PrimEval.lean +++ b/doc/developer/semantics/Mz/PrimEval.lean @@ -76,6 +76,51 @@ def evalIfThen : Datum → Datum → Datum → Datum | .err e, _, _ => .err e | _, _, _ => .null +/-! ## Numeric arithmetic + +Binary integer arithmetic. Strict on `.err` and `.null`. Non- +numeric operands route to `.null` for totality. Division by zero +returns `.err .divisionByZero` — the canonical cell-scoped error +the design doc cites. -/ + +/-- Integer addition. 
Strict on `.err` (propagates) and `.null` +(propagates). Type-mismatched operands route to `.null`. -/ +def evalPlus : Datum → Datum → Datum + | .err e, _ => .err e + | _, .err e => .err e + | .null, _ => .null + | _, .null => .null + | .int n, .int m => .int (n + m) + | _, _ => .null + +/-- Integer subtraction. Same propagation rules as `evalPlus`. -/ +def evalMinus : Datum → Datum → Datum + | .err e, _ => .err e + | _, .err e => .err e + | .null, _ => .null + | _, .null => .null + | .int n, .int m => .int (n - m) + | _, _ => .null + +/-- Integer multiplication. Same propagation rules. -/ +def evalTimes : Datum → Datum → Datum + | .err e, _ => .err e + | _, .err e => .err e + | .null, _ => .null + | _, .null => .null + | .int n, .int m => .int (n * m) + | _, _ => .null + +/-- Integer division. Strict on `.err` and `.null`. A right +operand of `.int 0` produces `.err .divisionByZero`. -/ +def evalDivide : Datum → Datum → Datum + | .err e, _ => .err e + | _, .err e => .err e + | .null, _ => .null + | _, .null => .null + | .int n, .int m => if m = 0 then .err .divisionByZero else .int (n / m) + | _, _ => .null + /-! ## Environment -/ /-- Environment: a positional list of bindings for `Expr.col`. -/ diff --git a/doc/developer/semantics/Mz/Pushdown.lean b/doc/developer/semantics/Mz/Pushdown.lean index 3a220a3d3e293..95cb15e29165d 100644 --- a/doc/developer/semantics/Mz/Pushdown.lean +++ b/doc/developer/semantics/Mz/Pushdown.lean @@ -45,6 +45,10 @@ def Expr.subst (es : List Expr) : Expr → Expr | .andN args => .andN (Expr.substArgs es args) | .orN args => .orN (Expr.substArgs es args) | .coalesce args => .coalesce (Expr.substArgs es args) + | .plus a b => .plus (a.subst es) (b.subst es) + | .minus a b => .minus (a.subst es) (b.subst es) + | .times a b => .times (a.subst es) (b.subst es) + | .divide a b => .divide (a.subst es) (b.subst es) /-- Pointwise application of `subst` to a list of operands. 
-/ def Expr.substArgs (es : List Expr) : List Expr → List Expr @@ -139,6 +143,18 @@ theorem eval_subst : apply List.map_congr_left intro e _ exact eval_subst env es e + | env, es, .plus a b => by + simp only [Expr.subst, eval] + rw [eval_subst env es a, eval_subst env es b] + | env, es, .minus a b => by + simp only [Expr.subst, eval] + rw [eval_subst env es a, eval_subst env es b] + | env, es, .times a b => by + simp only [Expr.subst, eval] + rw [eval_subst env es a, eval_subst env es b] + | env, es, .divide a b => by + simp only [Expr.subst, eval] + rw [eval_subst env es a, eval_subst env es b] /-! ## Predicate pushdown -/ diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 1cda1c097ebf0..8fc8c91f2ffd9 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -8,9 +8,9 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four ## What is here -* `Mz/Datum.lean`: `Datum`, `EvalError`, and the `Datum.IsErr` predicate. -* `Mz/Expr.lean`: `Expr` inductive — literals, columns, binary `and`/`or`, `not`, `ifThen`, plus the list-carrying constructors `andN`, `orN`, and `coalesce`. -* `Mz/PrimEval.lean`: primitive evaluators on `Datum` and `List Datum` — `evalAnd`, `evalOr`, `evalNot`, `evalIfThen`, `Env`, `Env.get`, `evalAndN`, `evalOrN`, `evalCoalesce`. Split out so the algebraic-law files and the expression-level evaluator can both import them without circular dependencies. +* `Mz/Datum.lean`: `Datum` (`.bool`, `.int`, `.null`, `.err`), `EvalError` (`.placeholder`, `.divisionByZero`), and the `Datum.IsErr` predicate. +* `Mz/Expr.lean`: `Expr` inductive — literals, columns, binary `and`/`or`, `not`, `ifThen`, the list-carrying constructors `andN`, `orN`, `coalesce`, and the binary integer arithmetic constructors `plus`, `minus`, `times`, `divide`. 
+* `Mz/PrimEval.lean`: primitive evaluators on `Datum` and `List Datum` — `evalAnd`, `evalOr`, `evalNot`, `evalIfThen`, `Env`, `Env.get`, `evalAndN`, `evalOrN`, `evalCoalesce`, plus the integer arithmetic primitives `evalPlus`, `evalMinus`, `evalTimes`, `evalDivide`. Split out so the algebraic-law files and the expression-level evaluator can both import them without circular dependencies. Division strict on `.err` and `.null`; a `.int n / .int 0` divisor produces `.err .divisionByZero` — the canonical cell-scoped error. * `Mz/Eval.lean`: the big-step `eval : Env → Expr → Datum`. List-carrying constructors evaluate each operand and hand the result list to the matching primitive. * `Mz/Boolean.lean`: per-cell truth-table proofs for `AND`, `OR`, and `NOT`, plus involutivity of `NOT`. * `Mz/MightError.lean`: the `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem. Binary `AND` / `OR` short-circuit on literal-`.bool false` / literal-`.bool true` operands via `Expr.isLitBoolFalse` / `Expr.isLitBoolTrue`: either position being the absorbing literal makes the analyzer return `false` regardless of the other operand. The same short-circuit fires on variadic `andN` / `orN` when any operand is the absorbing literal. `IfThen` likewise short-circuits when the condition is a literal `.bool` — only the picked branch's analyzer result is consulted, so a known-erring branch on the discarded side cannot taint the result. `andN` and `orN` recurse via `Expr.argsMightError` ("any operand might error"); `coalesce` recurses via `Expr.argsAllMightError` ("every operand might error"), special-casing the empty list as safe. Soundness for `coalesce` extracts a statically-safe operand through `Expr.exists_safe_of_not_argsAllMightError` and applies `evalCoalesce_not_err_of_some_safe`, which in turn rests on `Coalesce.go_not_err` — the state-machine lemma that "once one safe operand is in the remaining list, the walk cannot return an error". 
Companion value-level helpers `evalAnd_{left,right}_false` / `evalOr_{left,right}_true` discharge the short-circuit branches of soundness. @@ -102,7 +102,6 @@ The diff-semiring extension is in scope: `UnifiedStream` records carry `(Unified ### Material expansions -* Numeric arithmetic. Extend `Datum` with `.int (n : Int)`, add `Expr.plus` / `.minus` / `.times` / `.divide`, and a `divide-by-zero` `EvalError` variant. Tightens the err-handling story since divide-by-zero is the canonical `EvalError`. * Set operations (`UNION ALL`, `INTERSECT ALL`, `EXCEPT ALL`) on `UnifiedStream`. Composing with `consolidate` derives the set-semantics (`UNION` / `INTERSECT` / `EXCEPT`) variants. * `distinct` operator on `UnifiedStream`: collapse multiplicity via `consolidate` + sign normalization. * Cross-link the spec doc (`../design/20260517_error_handling_semantics.md`) to specific theorem names via `[Mz/...:thm]` cross-references. From ff3d103b55ade703043078ada5cc662ef6def280 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 13:40:26 +0200 Subject: [PATCH 045/127] doc/semantics: tighten might_error for divide with literal divisor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `Expr.might_error (.divide a b)` now reduces to `a.might_error` whenever `b` is statically a literal nonzero int. Falls back to `true` (conservative) when the divisor is unknown or literally zero. * `Expr.divisorIsSafe`: head matcher returning `true` exactly on `.lit (.int n)` with `n ≠ 0`. Non-recursive on `Expr`. * `Expr.lit_nonzero_int_of_divisorIsSafe`: characterization, extracts the `n` and the `n ≠ 0` witness. * `evalDivide_lit_nonzero`: any non-erring dividend divided by a literal nonzero int does not err. Generalizes `evalDivide_not_err_of_nonzero` from `.int n` dividends to any non-erring `Datum`. 
Soundness `might_error_sound` extended on `.divide` with two branches: the safe-divisor branch recurses into the dividend via the new helper; the conservative branch closes via absurd-premise. The first non-vacuous proof for `.divide` in soundness, replacing the previous unconditional `hMe rfl` contradiction. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/MightError.lean | 72 +++++++++++++++++++--- doc/developer/semantics/README.md | 3 +- 2 files changed, 67 insertions(+), 8 deletions(-) diff --git a/doc/developer/semantics/Mz/MightError.lean b/doc/developer/semantics/Mz/MightError.lean index 80fa3f80e1456..adfb1f76b97c4 100644 --- a/doc/developer/semantics/Mz/MightError.lean +++ b/doc/developer/semantics/Mz/MightError.lean @@ -171,6 +171,22 @@ theorem evalDivide_zero (n : Int) : = Datum.err .divisionByZero rw [if_pos rfl] +/-- Generalization: any non-erring dividend divided by a literal +nonzero int does not err. The dividend can be any `Datum`; only +the divisor is constrained. -/ +theorem evalDivide_lit_nonzero + {d : Datum} {n : Int} (h : ¬d.IsErr) (hn : n ≠ 0) : + ¬(evalDivide d (.int n)).IsErr := by + cases d with + | bool _ => intro hRes; cases hRes + | int m => + show ¬(if n = 0 then Datum.err EvalError.divisionByZero + else Datum.int (m / n)).IsErr + rw [if_neg hn] + intro hRes; cases hRes + | null => intro hRes; cases hRes + | err _ => exact (h trivial).elim + theorem evalIfThen_not_err {dc dt de : Datum} (hc : ¬dc.IsErr) (ht : ¬dt.IsErr) (he : ¬de.IsErr) : @@ -334,6 +350,29 @@ theorem Expr.eq_of_isLitBoolTrue {e : Expr} | _ => simp [Expr.isLitBoolTrue] at h | _ => simp [Expr.isLitBoolTrue] at h +/-- Head matcher for "expression is a statically-safe divisor": +literal `.int n` with `n ≠ 0`. Used by `might_error` to detect +divide expressions whose divisor cannot trigger divide-by-zero. 
-/ +@[simp] def Expr.divisorIsSafe : Expr → Bool + | .lit (.int 0) => false + | .lit (.int _) => true + | _ => false + +/-- Characterization: `divisorIsSafe e = true` iff `e` is a +literal `.int n` with `n ≠ 0`. -/ +theorem Expr.lit_nonzero_int_of_divisorIsSafe {e : Expr} + (h : e.divisorIsSafe = true) : + ∃ n : Int, e = .lit (.int n) ∧ n ≠ 0 := by + cases e with + | lit d => + cases d with + | int n => + by_cases hZ : n = 0 + · subst hZ; simp [Expr.divisorIsSafe] at h + · exact ⟨n, rfl, hZ⟩ + | _ => simp [Expr.divisorIsSafe] at h + | _ => simp [Expr.divisorIsSafe] at h + mutual def Expr.might_error : Expr → Bool | .lit (.err _) => true @@ -361,7 +400,9 @@ def Expr.might_error : Expr → Bool | .plus a b => a.might_error || b.might_error | .minus a b => a.might_error || b.might_error | .times a b => a.might_error || b.might_error - | .divide _ _ => true + | .divide a b => + if b.divisorIsSafe then a.might_error + else true /-- Bool fold of `might_error` over a list of operands ("does any operand might-error"), declared mutually with `might_error` so @@ -841,11 +882,28 @@ theorem might_error_sound : exact evalTimes_not_err (might_error_sound a env ha hEnv) (might_error_sound b env hb hEnv) hRes - | .divide a b, env, hMe, _ => by - -- `.divide _ _` is always tagged `might_error = true`, so the premise - -- `¬might_error = true` is absurd. A future tightening will detect - -- literal-nonzero divisors and lift this case to a real proof. 
- intro _ - exact hMe rfl + | .divide a b, env, hMe, hEnv => by + intro hRes + simp only [eval] at hRes + cases hSafe : b.divisorIsSafe with + | true => + -- b is a literal nonzero int: evalDivide _ (.int n) errs only on div0 + obtain ⟨n, hEqB, hNZ⟩ := Expr.lit_nonzero_int_of_divisorIsSafe hSafe + have hMeReduce : + Expr.might_error (.divide a b) = a.might_error := by + show (if b.divisorIsSafe = true then a.might_error else true) + = a.might_error + rw [hSafe]; rfl + rw [hMeReduce] at hMe + have hAe := might_error_sound a env hMe hEnv + rw [hEqB] at hRes + simp only [eval] at hRes + exact evalDivide_lit_nonzero hAe hNZ hRes + | false => + -- conservative branch: might_error returns `true`, premise absurd + have hMeTrue : Expr.might_error (.divide a b) = true := by + show (if b.divisorIsSafe = true then a.might_error else true) = true + rw [hSafe]; rfl + exact hMe hMeTrue end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 8fc8c91f2ffd9..6f78fdf4f8eea 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -13,7 +13,8 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/PrimEval.lean`: primitive evaluators on `Datum` and `List Datum` — `evalAnd`, `evalOr`, `evalNot`, `evalIfThen`, `Env`, `Env.get`, `evalAndN`, `evalOrN`, `evalCoalesce`, plus the integer arithmetic primitives `evalPlus`, `evalMinus`, `evalTimes`, `evalDivide`. Split out so the algebraic-law files and the expression-level evaluator can both import them without circular dependencies. Division strict on `.err` and `.null`; a `.int n / .int 0` divisor produces `.err .divisionByZero` — the canonical cell-scoped error. * `Mz/Eval.lean`: the big-step `eval : Env → Expr → Datum`. List-carrying constructors evaluate each operand and hand the result list to the matching primitive. * `Mz/Boolean.lean`: per-cell truth-table proofs for `AND`, `OR`, and `NOT`, plus involutivity of `NOT`. 
-* `Mz/MightError.lean`: the `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem. Binary `AND` / `OR` short-circuit on literal-`.bool false` / literal-`.bool true` operands via `Expr.isLitBoolFalse` / `Expr.isLitBoolTrue`: either position being the absorbing literal makes the analyzer return `false` regardless of the other operand. The same short-circuit fires on variadic `andN` / `orN` when any operand is the absorbing literal. `IfThen` likewise short-circuits when the condition is a literal `.bool` — only the picked branch's analyzer result is consulted, so a known-erring branch on the discarded side cannot taint the result. `andN` and `orN` recurse via `Expr.argsMightError` ("any operand might error"); `coalesce` recurses via `Expr.argsAllMightError` ("every operand might error"), special-casing the empty list as safe. Soundness for `coalesce` extracts a statically-safe operand through `Expr.exists_safe_of_not_argsAllMightError` and applies `evalCoalesce_not_err_of_some_safe`, which in turn rests on `Coalesce.go_not_err` — the state-machine lemma that "once one safe operand is in the remaining list, the walk cannot return an error". Companion value-level helpers `evalAnd_{left,right}_false` / `evalOr_{left,right}_true` discharge the short-circuit branches of soundness. +* `Mz/MightError.lean`: the `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem. Binary `AND` / `OR` short-circuit on literal-`.bool false` / literal-`.bool true` operands via `Expr.isLitBoolFalse` / `Expr.isLitBoolTrue`: either position being the absorbing literal makes the analyzer return `false` regardless of the other operand. The same short-circuit fires on variadic `andN` / `orN` when any operand is the absorbing literal. 
`IfThen` likewise short-circuits when the condition is a literal `.bool` — only the picked branch's analyzer result is consulted, so a known-erring branch on the discarded side cannot taint the result. + `.divide a b` reduces to `a.might_error` when `b` is a literal nonzero int (`Expr.divisorIsSafe`) — the divisor cannot trigger divide-by-zero, so the operator inherits errors only from the dividend. Falls back to `true` when the divisor is unknown or a literal zero. `andN` and `orN` recurse via `Expr.argsMightError` ("any operand might error"); `coalesce` recurses via `Expr.argsAllMightError` ("every operand might error"), special-casing the empty list as safe. Soundness for `coalesce` extracts a statically-safe operand through `Expr.exists_safe_of_not_argsAllMightError` and applies `evalCoalesce_not_err_of_some_safe`, which in turn rests on `Coalesce.go_not_err` — the state-machine lemma that "once one safe operand is in the remaining list, the walk cannot return an error". Companion value-level helpers `evalAnd_{left,right}_false` / `evalOr_{left,right}_true` discharge the short-circuit branches of soundness. * `Mz/Strict.lean`: strictness predicates (`ErrStrictUnary`, `ErrStrictBinary`, `NullStrictUnary`), positive instances for `evalNot` and the condition slot of `evalIfThen`, closure under composition, and negative results witnessing that `AND` and `OR` are *not* err-strict in either position. * `Mz/Coalesce.lean`: laws for `evalCoalesce` — error-rescue, null-beats-err tiebreak, first-error stickiness. * `Mz/Laws.lean`: algebraic laws — two-sided identity (`TRUE` for `AND`, `FALSE` for `OR`), idempotence (unconditional), commutativity (conditional on error-freedom of operands), and `Expr`-level reorder safety as a corollary of soundness. 
From 386acf829e30b96d3121092b0411a5de81d03e66 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 13:55:02 +0200 Subject: [PATCH 046/127] doc/semantics: ErrPropagating / NullPropagating instances for arithmetic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds weaker IsErr-only propagation predicates alongside the existing payload-preserving `ErrStrictBinary`. Arithmetic operators do not satisfy the payload-preserving form because `evalPlus (.err e₁) (.err e₂)` picks the left payload, but they do satisfy the weaker propagation form. Symmetric `NullPropagatingBinary` requires the other arg to be non-err, since `err` absorbs `null` in the four-valued lattice. Provides instances for `evalPlus`, `evalMinus`, `evalTimes`, `evalDivide` (both err- and null-propagating in both positions). The divide instance covers `err` propagation even when the divisor is the canonical `.int 0`, since input err still wins over division-by-zero. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/Strict.lean | 152 ++++++++++++++++++++++++- doc/developer/semantics/README.md | 3 +- 2 files changed, 153 insertions(+), 2 deletions(-) diff --git a/doc/developer/semantics/Mz/Strict.lean b/doc/developer/semantics/Mz/Strict.lean index 4f66d5f863b83..deeddc57dd527 100644 --- a/doc/developer/semantics/Mz/Strict.lean +++ b/doc/developer/semantics/Mz/Strict.lean @@ -35,15 +35,41 @@ def ErrStrictUnary (f : Datum → Datum) : Prop := /-- `f` is err-strict in each argument position. The two positions have independent witnesses; a function strict only in its left -argument is captured by `.left` alone. -/ +argument is captured by `.left` alone. + +This is the *payload-preserving* form: an `err` argument propagates +the *same* payload to the output. The boolean fragment's `evalNot` +satisfies it. Arithmetic operators do not satisfy `right` literally +because `evalPlus (.err e₁) (.err e₂) = .err e₁`, not `.err e₂`. 
See +`ErrPropagatingBinary` for the strictly weaker form that arithmetic +satisfies. -/ structure ErrStrictBinary (f : Datum → Datum → Datum) : Prop where left : ∀ d e, f (.err e) d = .err e right : ∀ d e, f d (.err e) = .err e +/-- `f` is err-propagating: an `err` in either argument forces an +`err` output, but the output payload may depend on which input came +first. The four-valued lattice's `err`-absorbs-`null`-absorbs-`int` +ordering means arithmetic operators satisfy this weaker form even +though they break the payload-preserving `ErrStrictBinary`. -/ +structure ErrPropagatingBinary (f : Datum → Datum → Datum) : Prop where + left : ∀ d₁ d₂, d₁.IsErr → (f d₁ d₂).IsErr + right : ∀ d₁ d₂, d₂.IsErr → (f d₁ d₂).IsErr + /-- `f` is null-strict: a `null` argument forces a `null` output. -/ def NullStrictUnary (f : Datum → Datum) : Prop := f .null = .null +/-- `f` is null-propagating in each position, provided the *other* +argument is not `.err`. The guard is necessary in the four-valued +lattice: `evalPlus .null (.err e) = .err e`, so the unguarded form +"`.null` in either position forces `.null`" would fail. The guard +captures the standard SQL/Materialize rule: in the absence of an +absorbing `err`, a `null` operand makes the result `null`. -/ +structure NullPropagatingBinary (f : Datum → Datum → Datum) : Prop where + left : ∀ d, ¬d.IsErr → f .null d = .null + right : ∀ d, ¬d.IsErr → f d .null = .null + /-! ## Concrete instances on the boolean fragment -/ theorem evalNot_errStrict : ErrStrictUnary evalNot := fun _ => rfl @@ -71,6 +97,130 @@ theorem ErrStrictUnary.comp {f g : Datum → Datum} show f (g (.err e)) = .err e rw [hg e, hf e] +/-! ## Arithmetic instances + +`evalPlus`, `evalMinus`, `evalTimes`, and `evalDivide` propagate `err` +in both positions and `null` in both positions (when the other +operand is not itself `err`). The four-valued lattice is in force: +`err > null > int`. 
These instances are the cell-scoped analogue of +SQL's "STRICT in NULL" qualifier, lifted to a setting where `err` +takes the dominant role. -/ + +theorem evalPlus_errPropagating : ErrPropagatingBinary evalPlus where + left := by + intro d₁ d₂ h + match d₁, h with + | .err e, _ => + show (evalPlus (.err e) d₂).IsErr + simp [evalPlus, Datum.IsErr] + right := by + intro d₁ d₂ h + match d₂, h with + | .err e, _ => cases d₁ <;> simp [evalPlus, Datum.IsErr] + +theorem evalPlus_nullPropagating : NullPropagatingBinary evalPlus where + left := by + intro d h + cases d with + | bool b => rfl + | int n => rfl + | null => rfl + | err e => exact absurd (by simp [Datum.IsErr] : (Datum.err e).IsErr) h + right := by + intro d h + cases d with + | bool b => rfl + | int n => rfl + | null => rfl + | err e => exact absurd (by simp [Datum.IsErr] : (Datum.err e).IsErr) h + +theorem evalMinus_errPropagating : ErrPropagatingBinary evalMinus where + left := by + intro d₁ d₂ h + match d₁, h with + | .err e, _ => + show (evalMinus (.err e) d₂).IsErr + simp [evalMinus, Datum.IsErr] + right := by + intro d₁ d₂ h + match d₂, h with + | .err e, _ => cases d₁ <;> simp [evalMinus, Datum.IsErr] + +theorem evalMinus_nullPropagating : NullPropagatingBinary evalMinus where + left := by + intro d h + cases d with + | bool b => rfl + | int n => rfl + | null => rfl + | err e => exact absurd (by simp [Datum.IsErr] : (Datum.err e).IsErr) h + right := by + intro d h + cases d with + | bool b => rfl + | int n => rfl + | null => rfl + | err e => exact absurd (by simp [Datum.IsErr] : (Datum.err e).IsErr) h + +theorem evalTimes_errPropagating : ErrPropagatingBinary evalTimes where + left := by + intro d₁ d₂ h + match d₁, h with + | .err e, _ => + show (evalTimes (.err e) d₂).IsErr + simp [evalTimes, Datum.IsErr] + right := by + intro d₁ d₂ h + match d₂, h with + | .err e, _ => cases d₁ <;> simp [evalTimes, Datum.IsErr] + +theorem evalTimes_nullPropagating : NullPropagatingBinary evalTimes where + left := by + 
intro d h + cases d with + | bool b => rfl + | int n => rfl + | null => rfl + | err e => exact absurd (by simp [Datum.IsErr] : (Datum.err e).IsErr) h + right := by + intro d h + cases d with + | bool b => rfl + | int n => rfl + | null => rfl + | err e => exact absurd (by simp [Datum.IsErr] : (Datum.err e).IsErr) h + +/-- `evalDivide` propagates `err` from either side. Input errs are matched +before the zero-divisor check, so `evalDivide (.err e) (.int 0) = .err e`; +`.err .divisionByZero` arises only when neither operand is already an `err`. -/ +theorem evalDivide_errPropagating : ErrPropagatingBinary evalDivide where + left := by + intro d₁ d₂ h + match d₁, h with + | .err e, _ => + show (evalDivide (.err e) d₂).IsErr + simp [evalDivide, Datum.IsErr] + right := by + intro d₁ d₂ h + match d₂, h with + | .err e, _ => cases d₁ <;> simp [evalDivide, Datum.IsErr] + +theorem evalDivide_nullPropagating : NullPropagatingBinary evalDivide where + left := by + intro d h + cases d with + | bool b => rfl + | int n => rfl + | null => rfl + | err e => exact absurd (by simp [Datum.IsErr] : (Datum.err e).IsErr) h + right := by + intro d h + cases d with + | bool b => rfl + | int n => rfl + | null => rfl + | err e => exact absurd (by simp [Datum.IsErr] : (Datum.err e).IsErr) h + /-! ## Negative results `AND` and `OR` are not err-strict in either position. The short diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 6f78fdf4f8eea..6d76c091bdab5 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -15,7 +15,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Boolean.lean`: per-cell truth-table proofs for `AND`, `OR`, and `NOT`, plus involutivity of `NOT`. * `Mz/MightError.lean`: the `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem.
Binary `AND` / `OR` short-circuit on literal-`.bool false` / literal-`.bool true` operands via `Expr.isLitBoolFalse` / `Expr.isLitBoolTrue`: either position being the absorbing literal makes the analyzer return `false` regardless of the other operand. The same short-circuit fires on variadic `andN` / `orN` when any operand is the absorbing literal. `IfThen` likewise short-circuits when the condition is a literal `.bool` — only the picked branch's analyzer result is consulted, so a known-erring branch on the discarded side cannot taint the result. `.divide a b` reduces to `a.might_error` when `b` is a literal nonzero int (`Expr.divisorIsSafe`) — the divisor cannot trigger divide-by-zero, so the operator inherits errors only from the dividend. Falls back to `true` when the divisor is unknown or a literal zero. `andN` and `orN` recurse via `Expr.argsMightError` ("any operand might error"); `coalesce` recurses via `Expr.argsAllMightError` ("every operand might error"), special-casing the empty list as safe. Soundness for `coalesce` extracts a statically-safe operand through `Expr.exists_safe_of_not_argsAllMightError` and applies `evalCoalesce_not_err_of_some_safe`, which in turn rests on `Coalesce.go_not_err` — the state-machine lemma that "once one safe operand is in the remaining list, the walk cannot return an error". Companion value-level helpers `evalAnd_{left,right}_false` / `evalOr_{left,right}_true` discharge the short-circuit branches of soundness. -* `Mz/Strict.lean`: strictness predicates (`ErrStrictUnary`, `ErrStrictBinary`, `NullStrictUnary`), positive instances for `evalNot` and the condition slot of `evalIfThen`, closure under composition, and negative results witnessing that `AND` and `OR` are *not* err-strict in either position. 
+* `Mz/Strict.lean`: strictness predicates — payload-preserving (`ErrStrictUnary`, `ErrStrictBinary`, `NullStrictUnary`) and weaker propagation forms (`ErrPropagatingBinary`, `NullPropagatingBinary`) that match the four-valued lattice's `err > null > int` absorption order. Positive instances for `evalNot` and the condition slot of `evalIfThen`; closure under composition; arithmetic instances (`evalPlus`, `evalMinus`, `evalTimes`, `evalDivide` all err-propagating and null-propagating in both positions); negative results witnessing that `AND` and `OR` are *not* err-strict in either position. * `Mz/Coalesce.lean`: laws for `evalCoalesce` — error-rescue, null-beats-err tiebreak, first-error stickiness. * `Mz/Laws.lean`: algebraic laws — two-sided identity (`TRUE` for `AND`, `FALSE` for `OR`), idempotence (unconditional), commutativity (conditional on error-freedom of operands), and `Expr`-level reorder safety as a corollary of soundness. * `Mz/Variadic.lean`: laws for `evalAndN` and `evalOrN` over `List Datum` — cons recurrence, nil, singleton, binary equivalence with the binary evaluators, and `FALSE`/`TRUE` absorption. @@ -100,6 +100,7 @@ The diff-semiring extension is in scope: `UnifiedStream` records carry `(Unified * Tightening `Expr.might_error` further. Short-circuit detection covers binary / variadic `AND` / `OR` and `IfThen` against literal absorbers. Remaining: ground-truth lookups (literal arithmetic, known-null operands, type-driven), all additive against the current soundness proof. * `UnifiedStream.project` analogous to `BagStream.project` but diff-aware. Each scalar can contribute its own erroring records via row-scoped err carriers; collection-scoped `.error` diffs propagate. * Strict cardinality bound for `UnifiedConsolidate`: when a carrier appears `k > 1` times in the input, the output is `k - 1` shorter than the input. +* Lift `ErrPropagatingBinary` / `NullPropagatingBinary` to `Expr` form. 
Today's predicates work at the `Datum` level; an `Expr.err_propagating` analogue would let the optimizer reason about whole sub-expressions, not just primitives. ### Material expansions From e961666c06bceceb5996dc29b34ae50f20721511 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 14:10:26 +0200 Subject: [PATCH 047/127] doc/semantics: UnifiedStream.project MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Diff-aware analogue of `BagStream.project`. Records with `.error` diff or `.err` carrier pass through unchanged; a `.row r` record with `.val n` diff is evaluated against `es`: * If every scalar succeeds, emit `(.row projected, .val n)`. * If any scalar errs, emit one `(.err e, .val n)` per erroring scalar — multiplicity is preserved per err, matching `BagStream.project`'s `projectErrs` lifted into the carrier. Theorems: * `project_preserves_error_diff`: an `.error` diff anywhere in the input reaches the output unchanged. * `project_no_error`: all-`.val` inputs yield all-`.val` outputs, so `.error` remains the only source of absorbing diffs. * `project_nil_es`: empty projection list keeps every record with rows collapsed to width zero. * `project_nil_stream`: empty input yields empty output. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/UnifiedStream.lean | 185 ++++++++++++++++++ doc/developer/semantics/README.md | 3 +- 2 files changed, 187 insertions(+), 1 deletion(-) diff --git a/doc/developer/semantics/Mz/UnifiedStream.lean b/doc/developer/semantics/Mz/UnifiedStream.lean index 59b1f941703b3..18ff46569a322 100644 --- a/doc/developer/semantics/Mz/UnifiedStream.lean +++ b/doc/developer/semantics/Mz/UnifiedStream.lean @@ -107,6 +107,191 @@ def UnifiedStream.filter (pred : Expr) (us : UnifiedStream) : UnifiedStream := | .err e => [(.err e, d)] | _ => [] +/-! ## Project + +Diff-aware projection. 
Each record splits on its diff and carrier: + +* `.error` diff: pass through unchanged (the absorbing marker + cannot be transformed away). +* `.err e` carrier with `.val` diff: pass through unchanged (the + row-scoped err already represents a failed row; projection has + nothing to evaluate). +* `.row r` carrier with `.val n` diff: evaluate `es` on `r`. If + every scalar succeeds, emit `(.row (es.map ...), .val n)`. If any + scalar errs, emit one `(.err e, .val n)` per erroring scalar — + multiplicity is preserved per err, mirroring `BagStream.project`'s + `projectErrs` but lifted into the carrier. + +The split between `.row` and `.err` is the unified analogue of +`BagStream.project`'s `(data, errors)` split: the same record kind +holds both, distinguished by the carrier tag. -/ + +/-- Project a single `.row` record through `es`, returning the +list of unified records the row contributes. Diff is preserved on +every produced record (rows-share-diff, errs-share-diff). -/ +private def rowProjectRecords (es : List Expr) (d : DiffWithError Int) (r : Row) : + UnifiedStream := + if rowAllSafe es r then + [(UnifiedRow.row (es.map (eval r)), d)] + else + (rowErrs es r).map (fun e => (UnifiedRow.err e, d)) + +/-- Diff-aware projection. -/ +def UnifiedStream.project (es : List Expr) (us : UnifiedStream) : UnifiedStream := + us.flatMap fun ud => match ud with + | (_, .error) => [ud] + | (.err e, d) => [(.err e, d)] + | (.row r, d) => rowProjectRecords es d r + +/-! ### Trivial cases -/ + +theorem UnifiedStream.project_nil_stream (es : List Expr) : + UnifiedStream.project es [] = [] := rfl + +/-- The empty projection list cannot error on any row, so every +record passes through with the row collapsed to width zero. 
-/ +theorem UnifiedStream.project_nil_es (us : UnifiedStream) : + UnifiedStream.project [] us = + us.map (fun ud => match ud with + | (_, .error) => ud + | (.err e, d) => (.err e, d) + | (.row _, d) => (.row [], d)) := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + cases d with + | error => + show ([(uc, DiffWithError.error)] : UnifiedStream) + ++ UnifiedStream.project [] tl + = (uc, DiffWithError.error) + :: tl.map (fun ud => match ud with + | (_, .error) => ud + | (.err e, d) => (.err e, d) + | (.row _, d) => (.row [], d)) + simp [ih] + | val n => + cases uc with + | row r => + show rowProjectRecords [] (DiffWithError.val n) r + ++ UnifiedStream.project [] tl + = (UnifiedRow.row [], DiffWithError.val n) + :: tl.map (fun ud => match ud with + | (_, .error) => ud + | (.err e, d) => (.err e, d) + | (.row _, d) => (.row [], d)) + have hSafe : rowAllSafe [] r = true := rfl + show (if rowAllSafe [] r then + [(UnifiedRow.row (([] : List Expr).map (eval r)), + DiffWithError.val n)] + else + (rowErrs [] r).map (fun e => + (UnifiedRow.err e, DiffWithError.val n))) + ++ UnifiedStream.project [] tl + = (UnifiedRow.row [], DiffWithError.val n) + :: tl.map (fun ud => match ud with + | (_, .error) => ud + | (.err e, d) => (.err e, d) + | (.row _, d) => (.row [], d)) + rw [if_pos hSafe] + simp [ih] + | err e => + show ([(UnifiedRow.err e, DiffWithError.val n)] : UnifiedStream) + ++ UnifiedStream.project [] tl + = (UnifiedRow.err e, DiffWithError.val n) + :: tl.map (fun ud => match ud with + | (_, .error) => ud + | (.err e, d) => (.err e, d) + | (.row _, d) => (.row [], d)) + simp [ih] + +/-! ### `.error` absorption + +A record carrying the absorbing `.error` diff passes through +projection unchanged. Combined with no-error preservation below, +this means `.error` remains the only source of absorbing diffs. 
-/ + +theorem UnifiedStream.project_preserves_error_diff + (es : List Expr) (us : UnifiedStream) (uc : UnifiedRow) + (h : (uc, (DiffWithError.error : DiffWithError Int)) ∈ us) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.project es us := by + induction us with + | nil => exact absurd h List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc₀, d₀⟩ := hd + rcases List.mem_cons.mp h with hEq | hTail + · have hUc : uc = uc₀ := (Prod.mk.injEq _ _ _ _).mp hEq |>.1 + have hD : (DiffWithError.error : DiffWithError Int) = d₀ := + (Prod.mk.injEq _ _ _ _).mp hEq |>.2 + subst hUc; subst hD + show (uc, DiffWithError.error) + ∈ (([(uc, DiffWithError.error)] : UnifiedStream) + ++ UnifiedStream.project es tl) + exact List.mem_append.mpr (Or.inl List.mem_cons_self) + · have ihResult := ih hTail + -- The head splits into a list (possibly empty for row + non-safe); + -- in every shape, `mem_append.mpr (Or.inr ihResult)` discharges the goal. + show (uc, DiffWithError.error) + ∈ ((match (uc₀, d₀) with + | (_, .error) => [(uc₀, d₀)] + | (.err e, d) => [(.err e, d)] + | (.row r, d) => rowProjectRecords es d r) + ++ UnifiedStream.project es tl) + exact List.mem_append.mpr (Or.inr ihResult) + +/-! ### No-error preservation + +If every input diff is `.val`, every output diff is `.val`. The +row-projection helper only emits records whose diff is the input +record's diff, so no `.error` is introduced. -/ + +private theorem rowProjectRecords_no_error + (es : List Expr) (n : Int) (r : Row) : + ∀ rec ∈ rowProjectRecords es (DiffWithError.val n) r, + ∃ m : Int, rec.2 = DiffWithError.val m := by + intro rec hMem + unfold rowProjectRecords at hMem + split at hMem + · -- branch: all safe; the singleton has diff `.val n`. + have : rec = (UnifiedRow.row (es.map (eval r)), DiffWithError.val n) := + List.mem_singleton.mp hMem + exact ⟨n, by rw [this]⟩ + · -- branch: some err; every produced record has diff `.val n`. 
+ obtain ⟨e, _, hRec⟩ := List.mem_map.mp hMem + exact ⟨n, by rw [← hRec]⟩ + +theorem UnifiedStream.project_no_error + (es : List Expr) (us : UnifiedStream) + (h : ∀ r ∈ us, ∃ n : Int, r.2 = DiffWithError.val n) : + ∀ r ∈ UnifiedStream.project es us, + ∃ n : Int, r.2 = DiffWithError.val n := by + induction us with + | nil => intro r hMem; exact absurd hMem List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hHd : ∃ n : Int, d = DiffWithError.val n := h (uc, d) List.mem_cons_self + have hTl : ∀ r ∈ tl, ∃ n : Int, r.2 = DiffWithError.val n := + fun r hMem => h r (List.mem_cons_of_mem _ hMem) + obtain ⟨n, hN⟩ := hHd + subst hN + intro rec hMem + cases uc with + | row r => + have hMem' : rec ∈ rowProjectRecords es (DiffWithError.val n) r + ++ UnifiedStream.project es tl := hMem + rcases List.mem_append.mp hMem' with hHead | hTail + · exact rowProjectRecords_no_error es n r rec hHead + · exact ih hTl rec hTail + | err e => + have hMem' : rec ∈ ([(UnifiedRow.err e, DiffWithError.val n)] : UnifiedStream) + ++ UnifiedStream.project es tl := hMem + rcases List.mem_append.mp hMem' with hHead | hTail + · have : rec = (UnifiedRow.err e, DiffWithError.val n) := + List.mem_singleton.mp hHead + exact ⟨n, by rw [this]⟩ + · exact ih hTl rec hTail + /-! 
## Helper lemmas for filterMap over the packed concatenation -/ private theorem filterMap_pickRow_rowMap (rs : List Row) : diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 6d76c091bdab5..d85fb29c3d098 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -30,6 +30,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Pushdown.lean`: substitution (`Expr.subst`) plus the headline `eval_subst` theorem (substituting then evaluating against the original row equals evaluating against the projected row), and the relational predicate-pushdown rewrite `filterRel p (project es rel) = project es (filterRel (p.subst es) rel)`. * `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity laws that downstream operators must respect. The `_int` specializations (`add_comm_int`, `add_assoc_int`, `mul_assoc_int`, `mul_comm_int`, `mul_add_int`) discharge the base hypotheses at `Int` so downstream code in `Mz/Join.lean` and `Mz/UnifiedConsolidate.lean` can cite the named laws directly. * `Mz/UnifiedStream.lean`: unified diff-aware alternative to `BagStream`. `UnifiedStream := List (UnifiedRow × DiffWithError Int)` pairs a carrier (data row or row-scoped err) with a differential-dataflow diff augmented by the absorbing `error` element. Row-scoped errors flow through the carrier; collection-scoped errors flow through diff multiplication / addition. `ofBag` / `split` conversions assign every bag record a diff of `.val 1`; the round-trip theorem `split (ofBag s) = s` holds. The cross-direction is exact only up to multiset equality on `List EvalError` and is lossy for diffs ≠ `.val 1` (split drops diff information). 
+ `UnifiedStream.project` lifts `BagStream.project` to the diff-aware carrier. Records with `.error` diff or `.err` carrier pass through unchanged; a `.row r` record with `.val n` diff is evaluated against `es` — if every scalar succeeds, the row is emitted with diff `.val n`; if any scalar errs, one `(.err e, .val n)` is emitted per erroring scalar (each preserving the original multiplicity). Theorems: `project_preserves_error_diff` (an `.error` diff in the input always reaches the output), `project_no_error` (all-`.val` inputs yield all-`.val` outputs), `project_nil_es` (empty projection list collapses every row to width-zero), `project_nil_stream` (empty stream is empty). * `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. `aggTry` for the proposed `try_sum`/`try_min`/`try_max` variants that swallow `err` into `NULL` instead of propagating, defined as a post-pass on `aggStrict`. Theorems: `aggStrict_err` (any `err` input → `err` output), `aggStrict_no_err` (no-err inputs + no-err reducer → no-err output), `aggTry_no_err` (the non-strict variant never errors), and `aggTry_eq_aggStrict_of_no_err` (strict and non-strict agree on error-free inputs). * `Mz/Consolidate.lean`: per-key diff summation over `List (DiffWithError α)`. The headline `sumAll_eq_error_of_mem` proves that an `error` diff anywhere in the list absorbs the consolidated sum to `error`, which is the property a differential dataflow `compact` operator cites when propagating global errors through consolidation. Companion `sumAll_val_of_all_val` says an all-`val` list sums to `val` of some base value. * `Mz/TimedConsolidate.lean`: per-`(row, time)` consolidation. `TimedUnifiedStream := List (UnifiedRow × Nat × DiffWithError Int)` carries records with time. 
`atTime t` projects to one time slice (dropping the time component); `consolidateAtTime t` chains it with `UnifiedStream.consolidate`. Theorems: `consolidateAtTime_preserves_error` (an `.error` diff at time `t` survives both filter and consolidation), `atTime_length_le` and `consolidateAtTime_length_le` (both non-expanding). Decomposes the joint key into "filter by time, then consolidate by row". @@ -98,8 +99,8 @@ The diff-semiring extension is in scope: `UnifiedStream` records carry `(Unified ### Additive refinements * Tightening `Expr.might_error` further. Short-circuit detection covers binary / variadic `AND` / `OR` and `IfThen` against literal absorbers. Remaining: ground-truth lookups (literal arithmetic, known-null operands, type-driven), all additive against the current soundness proof. -* `UnifiedStream.project` analogous to `BagStream.project` but diff-aware. Each scalar can contribute its own erroring records via row-scoped err carriers; collection-scoped `.error` diffs propagate. * Strict cardinality bound for `UnifiedConsolidate`: when a carrier appears `k > 1` times in the input, the output is `k - 1` shorter than the input. +* `UnifiedStream.project` pushdown analogue of `BagStream.project_filter_pushdown_data`. Predicate pushdown across the diff-aware projection is straightforward on the carrier side but the err-split adds asymmetries: filter-after-project sees projected rows; filter-before-project (with substitution) sees originals. * Lift `ErrPropagatingBinary` / `NullPropagatingBinary` to `Expr` form. Today's predicates work at the `Datum` level; an `Expr.err_propagating` analogue would let the optimizer reason about whole sub-expressions, not just primitives. 
### Material expansions From e7491aedaf533c6db4524d78c9bd14361a36da7c Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 14:13:52 +0200 Subject: [PATCH 048/127] doc/semantics: UNION ALL and UNION on UnifiedStream MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `Mz/SetOps.lean` with the bag-semantics flavor (`unionAll`) and the set-semantics flavor derived through consolidation (`union = consolidate ∘ unionAll`). `unionAll = l ++ r`: * Theorems: nil identities, length is sum, associativity. * Diff propagation: `.error` diff in either input survives; all-`.val` inputs yield all-`.val` outputs. `union`: * Theorems lift the consolidation guarantees to the union setting. * Length is at most the sum of input lengths (consolidate only merges, never expands). * `.error` diffs from either side carry into the corresponding consolidated bucket via the semiring absorption laws. * All-`.val` inputs yield all-`.val` outputs (chained no-error preservation). The bag-difference flavors (`INTERSECT ALL`, `EXCEPT ALL`) require subtraction-aware multiplicity arithmetic on `DiffWithError` and are deferred. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/SetOps.lean | 147 +++++++++++++++++++++++++ doc/developer/semantics/README.md | 3 +- 3 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 doc/developer/semantics/Mz/SetOps.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 98f2803ce1cb7..d752b8b154bc1 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -20,3 +20,4 @@ import Mz.Consolidate import Mz.Triple import Mz.Join import Mz.GroupBy +import Mz.SetOps diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean new file mode 100644 index 0000000000000..14c1a3c374be5 --- /dev/null +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -0,0 +1,147 @@ +import Mz.UnifiedStream +import Mz.UnifiedConsolidate +import Mz.DiffSemiring + +/-! +# Set operations on `UnifiedStream` + +`UNION ALL`, `UNION` derived via consolidation. The bag-multiplicity +flavors (`INTERSECT ALL`, `EXCEPT ALL`) require subtraction-aware +multiplicity arithmetic and are deferred to a later iteration. + +## `UNION ALL` + +Bag union is concatenation. Each input record passes through with +its diff unchanged. Collection-scoped errors propagate via the +union of the inputs' error-bearing records; row-scoped errors flow +through carriers; ordinary `.val` diffs preserve. + +## `UNION` + +Set union is `UNION ALL` followed by `consolidate` — duplicate +carriers fold into one record whose diff is the sum of the input +diffs. The semiring's `.error` absorber ensures collection-scoped +errors in either input become collection-scoped errors per carrier +in the output. This file gives the operator and lifts the existing +`UnifiedConsolidate` theorems to the union setting. + +## Out of scope + +Bag-difference flavors (`INTERSECT ALL`, `EXCEPT ALL`) require +counting multiplicities and producing diffs that subtract. 
+Differential dataflow encodes these via `consolidate` with negative +diffs and per-carrier clamping, which the diff-semiring's `Int` +slice supports but the skeleton's `DiffWithError` semiring does not +yet expose. Landing them is additive against the current proofs. +-/ + +namespace Mz + +/-! ## `UNION ALL` -/ + +/-- Bag union: concatenate two unified streams. Order is left +input first, then right input. Every record passes through with +its diff unchanged. -/ +def UnifiedStream.unionAll (l r : UnifiedStream) : UnifiedStream := + l ++ r + +/-! ### Reduction lemmas -/ + +theorem UnifiedStream.unionAll_nil_left (r : UnifiedStream) : + UnifiedStream.unionAll [] r = r := List.nil_append r + +theorem UnifiedStream.unionAll_nil_right (l : UnifiedStream) : + UnifiedStream.unionAll l [] = l := List.append_nil l + +theorem UnifiedStream.unionAll_length (l r : UnifiedStream) : + (UnifiedStream.unionAll l r).length = l.length + r.length := + List.length_append + +theorem UnifiedStream.unionAll_assoc (a b c : UnifiedStream) : + UnifiedStream.unionAll (UnifiedStream.unionAll a b) c + = UnifiedStream.unionAll a (UnifiedStream.unionAll b c) := + List.append_assoc a b c + +/-! ### Diff propagation + +`.error` diffs on either side survive the union. The companion +no-error preservation states that an all-`.val` pair of inputs +yields an all-`.val` output. 
-/ + +theorem UnifiedStream.unionAll_preserves_error_diff_left + (l r : UnifiedStream) (uc : UnifiedRow) + (h : (uc, (DiffWithError.error : DiffWithError Int)) ∈ l) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.unionAll l r := + List.mem_append.mpr (Or.inl h) + +theorem UnifiedStream.unionAll_preserves_error_diff_right + (l r : UnifiedStream) (uc : UnifiedRow) + (h : (uc, (DiffWithError.error : DiffWithError Int)) ∈ r) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.unionAll l r := + List.mem_append.mpr (Or.inr h) + +theorem UnifiedStream.unionAll_no_error + (l r : UnifiedStream) + (hL : ∀ x ∈ l, ∃ n : Int, x.2 = DiffWithError.val n) + (hR : ∀ x ∈ r, ∃ n : Int, x.2 = DiffWithError.val n) : + ∀ x ∈ UnifiedStream.unionAll l r, + ∃ n : Int, x.2 = DiffWithError.val n := by + intro x hMem + rcases List.mem_append.mp hMem with hL' | hR' + · exact hL x hL' + · exact hR x hR' + +/-! ## `UNION` (set semantics) + +Set union: union-all then consolidate. The duplicate-carrier +buckets fold into single records via diff addition (multiplicities add up; they are not clamped to 1). The semiring +laws in `Mz/DiffSemiring.lean` carry the absorption: any `.error` +diff in either input survives into the corresponding bucket of +the output. -/ + +def UnifiedStream.union (l r : UnifiedStream) : UnifiedStream := + UnifiedStream.consolidate (UnifiedStream.unionAll l r) + +theorem UnifiedStream.union_length_le (l r : UnifiedStream) : + (UnifiedStream.union l r).length ≤ l.length + r.length := by + show (UnifiedStream.consolidate (UnifiedStream.unionAll l r)).length + ≤ l.length + r.length + have hCons := UnifiedStream.consolidate_length_le + (UnifiedStream.unionAll l r) + have hConcat : (UnifiedStream.unionAll l r).length = l.length + r.length := + UnifiedStream.unionAll_length l r + exact hConcat ▸ hCons + +/-- `.error` diff on the left input survives the set union for its +carrier. The proof chains the `unionAll` lift with the existing +`consolidate_preserves_error`. 
-/ +theorem UnifiedStream.union_preserves_error_diff_left + (l r : UnifiedStream) (uc : UnifiedRow) + (h : (uc, (DiffWithError.error : DiffWithError Int)) ∈ l) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.union l r := + UnifiedStream.consolidate_preserves_error _ uc + (UnifiedStream.unionAll_preserves_error_diff_left l r uc h) + +theorem UnifiedStream.union_preserves_error_diff_right + (l r : UnifiedStream) (uc : UnifiedRow) + (h : (uc, (DiffWithError.error : DiffWithError Int)) ∈ r) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.union l r := + UnifiedStream.consolidate_preserves_error _ uc + (UnifiedStream.unionAll_preserves_error_diff_right l r uc h) + +/-- All-`.val` diffs on both inputs yield all-`.val` diffs on the +output. -/ +theorem UnifiedStream.union_no_error + (l r : UnifiedStream) + (hL : ∀ x ∈ l, ∃ n : Int, x.2 = DiffWithError.val n) + (hR : ∀ x ∈ r, ∃ n : Int, x.2 = DiffWithError.val n) : + ∀ x ∈ UnifiedStream.union l r, + ∃ n : Int, x.2 = DiffWithError.val n := + UnifiedStream.consolidate_no_error _ + (UnifiedStream.unionAll_no_error l r hL hR) + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index d85fb29c3d098..935c6c1652176 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -40,6 +40,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four *no-error preservation* — `consolidate_no_error` proves that if every input diff is a `.val`, every output diff is a `.val`, so `.error` is the only source of absorption. * `Mz/Triple.lean`: collection-wide and per-time *flat* consolidation views on `TimedUnifiedStream`. `consolidateAll` sums every diff in the stream; `consolidateAtTimeFlat t` sums every diff at time `t`. Both ignore the carrier — they collapse a time slice (or the whole stream) to one `DiffWithError Int`. 
Absorption: `consolidateAll_eq_error_of_mem` and `consolidateAtTimeFlat_eq_error_of_mem`. Complementary to `Mz/TimedConsolidate.lean`'s `consolidateAtTime t`, which buckets per `(row, time)` and returns a `UnifiedStream`. * `Mz/Join.lean`: relational joins on the diff-aware `UnifiedStream`. `cross` is the cartesian product — carriers combine via `combineCarrier` (rows concatenate; err on either side wins, left first), diffs multiply through `DiffWithError`'s `Mul` instance. A `.error` diff on either input therefore absorbs to `.error` on the output via `DiffWithError.error_mul_{left,right}`. `join pred l r` filters the product through a join predicate. Theorems: `cross_length` (`l.length * r.length`), `filter_length_le` (filter is non-expanding), `join_length_le` (corollary). Diff-propagation theorems: `cross_diff_error_{left,right}` (a `.error` diff on either side propagates through every output record), `filter_preserves_error_diff` (a record carrying `.error` diff is never dropped by `filter` — the absorbing marker cannot be filtered away). No-error preservation: `cross_no_error` and `filter_no_error` prove that all-`.val` input diffs yield all-`.val` output diffs, so `.error` is the only source of absorbing diffs in the joint output. Algebraic laws: `combineCarrier_assoc` (carrier combine is associative modulo `List.append_assoc`) and the headline `UnifiedStream.cross_assoc` (`(a × b) × c = a × (b × c)`). The proof rearranges nested `flatMap` / `map` via local list-monad lemmas and closes via `DiffWithError.mul_assoc` plus `combineCarrier_assoc`. +* `Mz/SetOps.lean`: set operations on `UnifiedStream`. `unionAll = (++)` concatenates two streams record-wise; theorems cover length (sum), associativity, nil identities, and error / no-error preservation from each input (`unionAll_preserves_error_diff_left`, `unionAll_preserves_error_diff_right`, `unionAll_no_error`). 
`union = consolidate ∘ unionAll` derives the set-semantics flavor; theorems lift the consolidation guarantees to `union` (`union_length_le`, `union_preserves_error_diff_left`, `union_preserves_error_diff_right`, `union_no_error`). Bag-difference flavors (`INTERSECT ALL`, `EXCEPT ALL`) require subtraction-aware multiplicity arithmetic and are deferred. * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. `groupByErrDistinct keyExpr rel` uses the spec-faithful `Datum.groupKeyEq`, which returns `false` whenever either side is `.err`, so every err key produces its own singleton group. @@ -105,6 +106,6 @@ The diff-semiring extension is in scope: `UnifiedStream` records carry `(Unified ### Material expansions -* Set operations (`UNION ALL`, `INTERSECT ALL`, `EXCEPT ALL`) on `UnifiedStream`. Composing with `consolidate` derives the set-semantics (`UNION` / `INTERSECT` / `EXCEPT`) variants. +* `INTERSECT ALL` and `EXCEPT ALL` on `UnifiedStream`. `UNION ALL` and `UNION` shipped in `Mz/SetOps.lean`; the bag-difference flavors require subtraction-aware multiplicity arithmetic on the diff semiring. Once landed, `INTERSECT` / `EXCEPT` derive via `consolidate`. * `distinct` operator on `UnifiedStream`: collapse multiplicity via `consolidate` + sign normalization. * Cross-link the spec doc (`../design/20260517_error_handling_semantics.md`) to specific theorem names via `[Mz/...:thm]` cross-references. 
From d026fd60277cc74fbdf4f87f70b9a0635f5d52b8 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 14:25:52 +0200 Subject: [PATCH 049/127] doc/semantics: EXCEPT ALL (signed-diff flavor) on UnifiedStream MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `Neg` instance on `DiffWithError`, `UnifiedStream.negate`, and `UnifiedStream.exceptAll` realizing the differential-dataflow signed-diff representation of bag difference. `DiffWithError`: * `Neg` instance with `.error` absorbing negation (a collection- scoped error cannot be retracted away). * Laws: `neg_error`, `neg_val`, `neg_neg_val`, `val_add_neg_val`. * `Int` specializations: `neg_neg_int`, `val_add_neg_val_int`. `UnifiedStream`: * `negate us = us.map (fun (uc, d) => (uc, -d))`. Theorems: `negate_length`, `negate_negate` (involution), `negate_preserves_error_diff`, `negate_no_error`. * `exceptAll l r = consolidate (unionAll l (negate r))`. Output diffs are signed: positive for net-present carriers, negative for net-absent. Bag-semantics `EXCEPT ALL` clamps negatives to zero via a separate normalize step. Theorems on `exceptAll`: * `exceptAll_length_le` (≤ sum of input lengths). * `exceptAll_preserves_error_diff_left` / `_right` (errors from either side survive — negation absorbs at `.error`). * `exceptAll_no_error` (all-`.val` inputs yield all-`.val` outputs). `INTERSECT ALL` requires a per-carrier `min` combinator not expressible via `+` / `*` / `-`; deferred. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/DiffSemiring.lean | 44 ++++++ doc/developer/semantics/Mz/SetOps.lean | 139 ++++++++++++++++++- doc/developer/semantics/README.md | 9 +- 3 files changed, 183 insertions(+), 9 deletions(-) diff --git a/doc/developer/semantics/Mz/DiffSemiring.lean b/doc/developer/semantics/Mz/DiffSemiring.lean index 3d628de40dcc0..773021f6ade8c 100644 --- a/doc/developer/semantics/Mz/DiffSemiring.lean +++ b/doc/developer/semantics/Mz/DiffSemiring.lean @@ -51,8 +51,17 @@ def mul [Mul α] : DiffWithError α → DiffWithError α → DiffWithError α | _, .error => .error | .val x, .val y => .val (x * y) +/-- Lifted negation. `error` absorbs (negation cannot remove +the "collection invalid" marker); `val x` negates pointwise via +the underlying `α`'s `Neg`. Required for bag-difference flavors +of set operations (`EXCEPT ALL`): `L - R = L + (-R)` on diffs. -/ +def neg [Neg α] : DiffWithError α → DiffWithError α + | .error => .error + | .val x => .val (-x) + instance [Add α] : Add (DiffWithError α) := ⟨add⟩ instance [Mul α] : Mul (DiffWithError α) := ⟨mul⟩ +instance [Neg α] : Neg (DiffWithError α) := ⟨neg⟩ /-- Lifted zero (identity for `+`). -/ instance [Zero α] : Zero (DiffWithError α) := ⟨.val 0⟩ @@ -214,6 +223,34 @@ theorem mul_comm [Mul α] (h_comm : ∀ x y : α, x * y = y * x) | val _ => rfl | error => rfl +/-! ## Negation laws + +`error` absorbs negation, and double-negation is the identity on +`val` (when the base has the same property). -/ + +theorem neg_error [Neg α] : + -(error : DiffWithError α) = error := rfl + +theorem neg_val [Neg α] (x : α) : + -(val x : DiffWithError α) = val (-x) := rfl + +theorem neg_neg_val [Neg α] (h : ∀ x : α, - -x = x) (a : DiffWithError α) : + - -a = a := by + cases a with + | val x => show (val (- -x) : DiffWithError α) = val x; rw [h] + | error => rfl + +/-- Right-inverse on `val`: when the base has `x + -x = 0`, +`val x + -val x = 0` in the lifted semiring. 
`error` does not have +an inverse — the absorber is unrecoverable, which is exactly the +spec property: a collection-scoped error cannot be "subtracted +away". -/ +theorem val_add_neg_val [Add α] [Neg α] [Zero α] + (h : ∀ x : α, x + -x = 0) (x : α) : + (val x : DiffWithError α) + -val x = 0 := by + show (val (x + -x) : DiffWithError α) = val 0 + rw [h] + /-! ## Int specializations The diff-aware operators in `Mz/UnifiedStream.lean` and @@ -240,6 +277,13 @@ theorem mul_add_int (a b c : DiffWithError Int) : a * (b + c) = a * b + a * c := mul_add Int.mul_add a b c +theorem neg_neg_int (a : DiffWithError Int) : - -a = a := + neg_neg_val (fun x => Int.neg_neg x) a + +theorem val_add_neg_val_int (x : Int) : + (val x : DiffWithError Int) + -val x = 0 := + val_add_neg_val (fun x => by omega) x + end DiffWithError end Mz diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 14c1a3c374be5..2362c4380c5eb 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -25,14 +25,30 @@ errors in either input become collection-scoped errors per carrier in the output. This file gives the operator and lifts the existing `UnifiedConsolidate` theorems to the union setting. +## `EXCEPT ALL` + +Differential dataflow's signed-diff representation makes +bag-difference natural: negate every diff of the right-hand input +and union the result with the left. Consolidation merges per +carrier, summing per-bucket diffs: + +* Carrier in L only: output diff = +L. +* Carrier in R only: output diff = -R. +* Carrier in both: output diff = L - R. + +Negative output diffs encode "this carrier has `n` fewer copies in +the result than in the input" — the standard retraction signal in +differential dataflow. Bag-semantics `EXCEPT ALL` clamps negatives +to zero; that clamp is a separate normalize step (sign +normalization), deferred to a later iteration. The skeleton states +the signed flavor. 
+ ## Out of scope -Bag-difference flavors (`INTERSECT ALL`, `EXCEPT ALL`) require -counting multiplicities and producing diffs that subtract. -Differential dataflow encodes these via `consolidate` with negative -diffs and per-carrier clamping, which the diff-semiring's `Int` -slice supports but the skeleton's `DiffWithError` semiring does not -yet expose. Landing them is additive against the current proofs. +`INTERSECT ALL` requires per-carrier `min(L, R)` on diffs, which +cannot be expressed via the additive / multiplicative / negation +primitives alone. Landing it is additive against the current +proofs but needs a new diff combinator. -/ namespace Mz @@ -144,4 +160,115 @@ theorem UnifiedStream.union_no_error UnifiedStream.consolidate_no_error _ (UnifiedStream.unionAll_no_error l r hL hR) +/-! ## Negation + +Negate every diff in the stream. `.error` diffs absorb the negation +(the marker survives — a collection-scoped error cannot be +retracted away); `.val n` diffs become `.val (-n)`. -/ + +def UnifiedStream.negate (us : UnifiedStream) : UnifiedStream := + us.map fun ud => (ud.1, -ud.2) + +theorem UnifiedStream.negate_length (us : UnifiedStream) : + (UnifiedStream.negate us).length = us.length := + List.length_map _ + +/-- Double negation is the identity (lifted from `Int.neg_neg` +through `DiffWithError.neg_neg_int`). -/ +theorem UnifiedStream.negate_negate (us : UnifiedStream) : + UnifiedStream.negate (UnifiedStream.negate us) = us := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + show ((uc, - -d) :: UnifiedStream.negate (UnifiedStream.negate tl)) + = (uc, d) :: tl + rw [ih, DiffWithError.neg_neg_int] + +/-- `.error` diffs survive negation; the carrier is preserved. 
-/ +theorem UnifiedStream.negate_preserves_error_diff + (us : UnifiedStream) (uc : UnifiedRow) + (h : (uc, (DiffWithError.error : DiffWithError Int)) ∈ us) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.negate us := by + unfold UnifiedStream.negate + rw [List.mem_map] + refine ⟨(uc, DiffWithError.error), h, ?_⟩ + show (uc, -DiffWithError.error) = (uc, DiffWithError.error) + rw [DiffWithError.neg_error] + +/-- Negation preserves the `.val` slice: an all-`.val` input yields +an all-`.val` output (with values negated). -/ +theorem UnifiedStream.negate_no_error + (us : UnifiedStream) + (h : ∀ x ∈ us, ∃ n : Int, x.2 = DiffWithError.val n) : + ∀ x ∈ UnifiedStream.negate us, ∃ n : Int, x.2 = DiffWithError.val n := by + intro x hMem + unfold UnifiedStream.negate at hMem + obtain ⟨y, hY, hEq⟩ := List.mem_map.mp hMem + obtain ⟨n, hN⟩ := h y hY + refine ⟨-n, ?_⟩ + rw [← hEq] + show -y.2 = DiffWithError.val (-n) + rw [hN] + rfl + +/-! ## `EXCEPT ALL` (signed-diff flavor) + +`exceptAll L R = consolidate (unionAll L (negate R))`. Output diffs +are signed: positive for net-present carriers, negative for +net-absent. Bag-semantics `EXCEPT ALL` clamps negative diffs to +zero in a follow-up normalize step. 
-/ + +def UnifiedStream.exceptAll (l r : UnifiedStream) : UnifiedStream := + UnifiedStream.consolidate + (UnifiedStream.unionAll l (UnifiedStream.negate r)) + +theorem UnifiedStream.exceptAll_length_le (l r : UnifiedStream) : + (UnifiedStream.exceptAll l r).length ≤ l.length + r.length := by + show (UnifiedStream.consolidate + (UnifiedStream.unionAll l (UnifiedStream.negate r))).length + ≤ l.length + r.length + have hCons := UnifiedStream.consolidate_length_le + (UnifiedStream.unionAll l (UnifiedStream.negate r)) + have hConcat : (UnifiedStream.unionAll l (UnifiedStream.negate r)).length + = l.length + (UnifiedStream.negate r).length := + UnifiedStream.unionAll_length _ _ + have hNeg : (UnifiedStream.negate r).length = r.length := + UnifiedStream.negate_length r + rw [hNeg] at hConcat + exact hConcat ▸ hCons + +/-- `.error` diff on the left input survives `exceptAll`. -/ +theorem UnifiedStream.exceptAll_preserves_error_diff_left + (l r : UnifiedStream) (uc : UnifiedRow) + (h : (uc, (DiffWithError.error : DiffWithError Int)) ∈ l) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.exceptAll l r := + UnifiedStream.consolidate_preserves_error _ uc + (UnifiedStream.unionAll_preserves_error_diff_left l _ uc h) + +/-- `.error` diff on the right input also survives `exceptAll`: +negation is absorbed by `.error`, so the negated right-hand input +still carries the marker. -/ +theorem UnifiedStream.exceptAll_preserves_error_diff_right + (l r : UnifiedStream) (uc : UnifiedRow) + (h : (uc, (DiffWithError.error : DiffWithError Int)) ∈ r) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.exceptAll l r := + UnifiedStream.consolidate_preserves_error _ uc + (UnifiedStream.unionAll_preserves_error_diff_right l _ uc + (UnifiedStream.negate_preserves_error_diff r uc h)) + +/-- All-`.val` inputs yield all-`.val` outputs. 
-/ +theorem UnifiedStream.exceptAll_no_error + (l r : UnifiedStream) + (hL : ∀ x ∈ l, ∃ n : Int, x.2 = DiffWithError.val n) + (hR : ∀ x ∈ r, ∃ n : Int, x.2 = DiffWithError.val n) : + ∀ x ∈ UnifiedStream.exceptAll l r, + ∃ n : Int, x.2 = DiffWithError.val n := + UnifiedStream.consolidate_no_error _ + (UnifiedStream.unionAll_no_error l _ hL + (UnifiedStream.negate_no_error r hR)) + end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 935c6c1652176..2fff6b8d765ec 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -28,7 +28,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `BagStream.project` projects each row through a list of scalars; a row stays in the data collection only when every scalar succeeds, otherwise its err payloads (one per erroring scalar) are appended to the error collection. `rowErrs_nil_of_all_safe` and `projectErrs_eq_nil_of_all_safe` show that when no projection errs, `BagStream.project` does not extend the error collection. * `Mz/Pushdown.lean`: substitution (`Expr.subst`) plus the headline `eval_subst` theorem (substituting then evaluating against the original row equals evaluating against the projected row), and the relational predicate-pushdown rewrite `filterRel p (project es rel) = project es (filterRel (p.subst es) rel)`. -* `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity laws that downstream operators must respect. The `_int` specializations (`add_comm_int`, `add_assoc_int`, `mul_assoc_int`, `mul_comm_int`, `mul_add_int`) discharge the base hypotheses at `Int` so downstream code in `Mz/Join.lean` and `Mz/UnifiedConsolidate.lean` can cite the named laws directly. 
+* `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `-`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity / negation laws that downstream operators must respect. Negation laws (`neg_error`, `neg_val`, `neg_neg_val`, `val_add_neg_val`) carry the principle that `.error` is unrecoverable — a collection-scoped error cannot be subtracted away. The `_int` specializations (`add_comm_int`, `add_assoc_int`, `mul_assoc_int`, `mul_comm_int`, `mul_add_int`, `neg_neg_int`, `val_add_neg_val_int`) discharge the base hypotheses at `Int` so downstream code in `Mz/Join.lean`, `Mz/UnifiedConsolidate.lean`, and `Mz/SetOps.lean` can cite the named laws directly. * `Mz/UnifiedStream.lean`: unified diff-aware alternative to `BagStream`. `UnifiedStream := List (UnifiedRow × DiffWithError Int)` pairs a carrier (data row or row-scoped err) with a differential-dataflow diff augmented by the absorbing `error` element. Row-scoped errors flow through the carrier; collection-scoped errors flow through diff multiplication / addition. `ofBag` / `split` conversions assign every bag record a diff of `.val 1`; the round-trip theorem `split (ofBag s) = s` holds. The cross-direction is exact only up to multiset equality on `List EvalError` and is lossy for diffs ≠ `.val 1` (split drops diff information). `UnifiedStream.project` lifts `BagStream.project` to the diff-aware carrier. Records with `.error` diff or `.err` carrier pass through unchanged; a `.row r` record with `.val n` diff is evaluated against `es` — if every scalar succeeds, the row is emitted with diff `.val n`; if any scalar errs, one `(.err e, .val n)` is emitted per erroring scalar (each preserving the original multiplicity). 
Theorems: `project_preserves_error_diff` (an `.error` diff in the input always reaches the output), `project_no_error` (all-`.val` inputs yield all-`.val` outputs), `project_nil_es` (empty projection list collapses every row to width-zero), `project_nil_stream` (empty stream is empty). * `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. `aggTry` for the proposed `try_sum`/`try_min`/`try_max` variants that swallow `err` into `NULL` instead of propagating, defined as a post-pass on `aggStrict`. Theorems: `aggStrict_err` (any `err` input → `err` output), `aggStrict_no_err` (no-err inputs + no-err reducer → no-err output), `aggTry_no_err` (the non-strict variant never errors), and `aggTry_eq_aggStrict_of_no_err` (strict and non-strict agree on error-free inputs). @@ -40,7 +40,9 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four *no-error preservation* — `consolidate_no_error` proves that if every input diff is a `.val`, every output diff is a `.val`, so `.error` is the only source of absorption. * `Mz/Triple.lean`: collection-wide and per-time *flat* consolidation views on `TimedUnifiedStream`. `consolidateAll` sums every diff in the stream; `consolidateAtTimeFlat t` sums every diff at time `t`. Both ignore the carrier — they collapse a time slice (or the whole stream) to one `DiffWithError Int`. Absorption: `consolidateAll_eq_error_of_mem` and `consolidateAtTimeFlat_eq_error_of_mem`. Complementary to `Mz/TimedConsolidate.lean`'s `consolidateAtTime t`, which buckets per `(row, time)` and returns a `UnifiedStream`. * `Mz/Join.lean`: relational joins on the diff-aware `UnifiedStream`. `cross` is the cartesian product — carriers combine via `combineCarrier` (rows concatenate; err on either side wins, left first), diffs multiply through `DiffWithError`'s `Mul` instance. 
A `.error` diff on either input therefore absorbs to `.error` on the output via `DiffWithError.error_mul_{left,right}`. `join pred l r` filters the product through a join predicate. Theorems: `cross_length` (`l.length * r.length`), `filter_length_le` (filter is non-expanding), `join_length_le` (corollary). Diff-propagation theorems: `cross_diff_error_{left,right}` (a `.error` diff on either side propagates through every output record), `filter_preserves_error_diff` (a record carrying `.error` diff is never dropped by `filter` — the absorbing marker cannot be filtered away). No-error preservation: `cross_no_error` and `filter_no_error` prove that all-`.val` input diffs yield all-`.val` output diffs, so `.error` is the only source of absorbing diffs in the joint output. Algebraic laws: `combineCarrier_assoc` (carrier combine is associative modulo `List.append_assoc`) and the headline `UnifiedStream.cross_assoc` (`(a × b) × c = a × (b × c)`). The proof rearranges nested `flatMap` / `map` via local list-monad lemmas and closes via `DiffWithError.mul_assoc` plus `combineCarrier_assoc`. -* `Mz/SetOps.lean`: set operations on `UnifiedStream`. `unionAll = (++)` concatenates two streams record-wise; theorems cover length (sum), associativity, nil identities, and error / no-error preservation from each input (`unionAll_preserves_error_diff_left`, `unionAll_preserves_error_diff_right`, `unionAll_no_error`). `union = consolidate ∘ unionAll` derives the set-semantics flavor; theorems lift the consolidation guarantees to `union` (`union_length_le`, `union_preserves_error_diff_left`, `union_preserves_error_diff_right`, `union_no_error`). Bag-difference flavors (`INTERSECT ALL`, `EXCEPT ALL`) require subtraction-aware multiplicity arithmetic and are deferred. +* `Mz/SetOps.lean`: set operations on `UnifiedStream`. 
`unionAll = (++)` concatenates two streams record-wise; theorems cover length (sum), associativity, nil identities, and error / no-error preservation from each input (`unionAll_preserves_error_diff_left`, `unionAll_preserves_error_diff_right`, `unionAll_no_error`). `union = consolidate ∘ unionAll` derives the set-semantics flavor; theorems lift the consolidation guarantees to `union` (`union_length_le`, `union_preserves_error_diff_left`, `union_preserves_error_diff_right`, `union_no_error`). + `negate` negates every diff (`.error` absorbs negation, `.val n` becomes `.val (-n)`). Theorems: `negate_length` (length preserved), `negate_negate` (involution), `negate_preserves_error_diff`, `negate_no_error`. + `exceptAll l r = consolidate (unionAll l (negate r))` realizes the signed-diff `EXCEPT ALL` (output diffs may be negative, encoding "this carrier has `n` fewer copies in the result than in the input"). Theorems: `exceptAll_length_le` (≤ sum of input lengths), `exceptAll_preserves_error_diff_left`/`exceptAll_preserves_error_diff_right` (errors from either side survive — negation absorbs at `.error`), `exceptAll_no_error`. Bag-semantics `EXCEPT ALL` requires a separate sign-normalization clamp; `INTERSECT ALL` requires a per-carrier `min` combinator not yet exposed by `DiffWithError`. Both deferred. * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. `groupByErrDistinct keyExpr rel` uses the spec-faithful `Datum.groupKeyEq`, which returns `false` whenever either side is `.err`, so every err key produces its own singleton group. @@ -106,6 +108,7 @@ The diff-semiring extension is in scope: `UnifiedStream` records carry `(Unified ### Material expansions -* `INTERSECT ALL` and `EXCEPT ALL` on `UnifiedStream`. 
`UNION ALL` and `UNION` shipped in `Mz/SetOps.lean`; the bag-difference flavors require subtraction-aware multiplicity arithmetic on the diff semiring. Once landed, `INTERSECT` / `EXCEPT` derive via `consolidate`. +* Sign-normalization on `UnifiedStream`: drop records whose diff has consolidated to `.val 0` or `.val n` with `n < 0` (for bag semantics). With sign-normalization in scope, the bag-semantics `EXCEPT ALL` derives from the signed-diff `exceptAll` already in `Mz/SetOps.lean`. +* `INTERSECT ALL` on `UnifiedStream`: requires a per-carrier `min` combinator over `DiffWithError Int`. The combinator is not derivable from `+`, `*`, `-` alone — landing it requires either a new diff primitive or a bucketing operator that materializes per-carrier multiplicities from both inputs. * `distinct` operator on `UnifiedStream`: collapse multiplicity via `consolidate` + sign normalization. * Cross-link the spec doc (`../design/20260517_error_handling_semantics.md`) to specific theorem names via `[Mz/...:thm]` cross-references. From a35bc8ebd7d2e0b78b086ae429ede383fc09c730 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 14:36:00 +0200 Subject: [PATCH 050/127] doc/semantics: sign normalization and bag-semantics EXCEPT ALL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `clampPositive` to drop records whose diff has consolidated to non-positive (zero or negative multiplicity), while preserving `.error` records. Lifts the signed-diff `exceptAll` to the bag-semantics `bagExceptAll = clampPositive ∘ exceptAll`, producing `max(L - R, 0)` per carrier. `clampPositive` filters records by `isPositiveDiff`: * `.error` ↦ keep (the absorbing marker cannot be filtered away). * `.val n` ↦ keep iff `n > 0`. Theorems: * `clampPositive_length_le` (non-expanding). * `clampPositive_preserves_error_diff`. * `clampPositive_only_positive` — every output `.val` is strictly positive (the spec property of the normalized form). 
`bagExceptAll` lifts the same guarantees to the full pipeline: length bounded by `l.length + r.length`, errors survive from either side, every output `.val` is strictly positive. Note: dot-notation `us.filter` resolves to `UnifiedStream.filter` (the predicate operator), which is unrelated. `clampPositive` uses fully-qualified `List.filter` to disambiguate. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 101 +++++++++++++++++++++++++ doc/developer/semantics/README.md | 8 +- 2 files changed, 106 insertions(+), 3 deletions(-) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 2362c4380c5eb..bb705635abfb7 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -271,4 +271,105 @@ theorem UnifiedStream.exceptAll_no_error (UnifiedStream.unionAll_no_error l _ hL (UnifiedStream.negate_no_error r hR)) +/-! ## Sign normalization + +`clampPositive` drops records whose diff has consolidated to a +non-positive `.val` (zero or negative multiplicity); `.error` +records survive unconditionally — the absorbing marker cannot be +filtered away without violating the diff semiring. This is the +post-pass that turns the signed-diff `exceptAll` into the +bag-semantics `EXCEPT ALL`. -/ + +@[inline] private def isPositiveDiff : DiffWithError Int → Bool + | .error => true + | .val n => decide (0 < n) + +/-- Drop records whose diff is `.val 0` or `.val n` with `n < 0`. +`.error` records pass through. The body calls `List.filter` fully +qualified: `us.filter` would resolve to `UnifiedStream.filter`, the +predicate-driven operator in `Mz/UnifiedStream.lean`, a different +operation.
-/ +def UnifiedStream.clampPositive (us : UnifiedStream) : UnifiedStream := + List.filter (fun ud => isPositiveDiff ud.2) us + +theorem UnifiedStream.clampPositive_length_le (us : UnifiedStream) : + (UnifiedStream.clampPositive us).length ≤ us.length := by + unfold UnifiedStream.clampPositive + exact List.length_filter_le _ _ + +theorem UnifiedStream.clampPositive_preserves_error_diff + (us : UnifiedStream) (uc : UnifiedRow) + (h : (uc, (DiffWithError.error : DiffWithError Int)) ∈ us) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.clampPositive us := by + unfold UnifiedStream.clampPositive + rw [List.mem_filter] + refine ⟨h, ?_⟩ + show isPositiveDiff DiffWithError.error = true + rfl + +/-- The output of `clampPositive` never contains a `.val n` with +`n ≤ 0`. Equivalently, every surviving `.val` diff is strictly +positive. -/ +theorem UnifiedStream.clampPositive_only_positive + (us : UnifiedStream) : + ∀ x ∈ UnifiedStream.clampPositive us, + (∃ n : Int, x.2 = DiffWithError.val n ∧ 0 < n) + ∨ x.2 = DiffWithError.error := by + intro x hMem + unfold UnifiedStream.clampPositive at hMem + have hAnd := List.mem_filter.mp hMem + have hKeep : isPositiveDiff x.2 = true := hAnd.2 + match hD : x.2 with + | .error => exact Or.inr rfl + | .val n => + refine Or.inl ⟨n, rfl, ?_⟩ + rw [hD] at hKeep + show 0 < n + have hDec : decide (0 < n) = true := hKeep + exact of_decide_eq_true hDec + +/-! ## Bag-semantics `EXCEPT ALL` + +`bagExceptAll = clampPositive ∘ exceptAll`. The signed-diff result +of `exceptAll` is post-processed to drop non-positive +multiplicities, producing the bag-semantics output: a carrier with +output multiplicity `max(L - R, 0)`. `.error` diffs survive the +clamp (collection-scoped errors cannot be sign-normalized away). 
-/ + +def UnifiedStream.bagExceptAll (l r : UnifiedStream) : UnifiedStream := + UnifiedStream.clampPositive (UnifiedStream.exceptAll l r) + +theorem UnifiedStream.bagExceptAll_length_le (l r : UnifiedStream) : + (UnifiedStream.bagExceptAll l r).length ≤ l.length + r.length := + Nat.le_trans + (UnifiedStream.clampPositive_length_le _) + (UnifiedStream.exceptAll_length_le l r) + +theorem UnifiedStream.bagExceptAll_preserves_error_diff_left + (l r : UnifiedStream) (uc : UnifiedRow) + (h : (uc, (DiffWithError.error : DiffWithError Int)) ∈ l) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.bagExceptAll l r := + UnifiedStream.clampPositive_preserves_error_diff _ uc + (UnifiedStream.exceptAll_preserves_error_diff_left l r uc h) + +theorem UnifiedStream.bagExceptAll_preserves_error_diff_right + (l r : UnifiedStream) (uc : UnifiedRow) + (h : (uc, (DiffWithError.error : DiffWithError Int)) ∈ r) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.bagExceptAll l r := + UnifiedStream.clampPositive_preserves_error_diff _ uc + (UnifiedStream.exceptAll_preserves_error_diff_right l r uc h) + +/-- Every `.val` record in the bag-semantics output has strictly +positive multiplicity (zero / negative records are sign-normalized +away). -/ +theorem UnifiedStream.bagExceptAll_only_positive + (l r : UnifiedStream) : + ∀ x ∈ UnifiedStream.bagExceptAll l r, + (∃ n : Int, x.2 = DiffWithError.val n ∧ 0 < n) + ∨ x.2 = DiffWithError.error := + UnifiedStream.clampPositive_only_positive _ + end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 2fff6b8d765ec..0d4e6e78b4837 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -42,7 +42,10 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Join.lean`: relational joins on the diff-aware `UnifiedStream`. 
`cross` is the cartesian product — carriers combine via `combineCarrier` (rows concatenate; err on either side wins, left first), diffs multiply through `DiffWithError`'s `Mul` instance. A `.error` diff on either input therefore absorbs to `.error` on the output via `DiffWithError.error_mul_{left,right}`. `join pred l r` filters the product through a join predicate. Theorems: `cross_length` (`l.length * r.length`), `filter_length_le` (filter is non-expanding), `join_length_le` (corollary). Diff-propagation theorems: `cross_diff_error_{left,right}` (a `.error` diff on either side propagates through every output record), `filter_preserves_error_diff` (a record carrying `.error` diff is never dropped by `filter` — the absorbing marker cannot be filtered away). No-error preservation: `cross_no_error` and `filter_no_error` prove that all-`.val` input diffs yield all-`.val` output diffs, so `.error` is the only source of absorbing diffs in the joint output. Algebraic laws: `combineCarrier_assoc` (carrier combine is associative modulo `List.append_assoc`) and the headline `UnifiedStream.cross_assoc` (`(a × b) × c = a × (b × c)`). The proof rearranges nested `flatMap` / `map` via local list-monad lemmas and closes via `DiffWithError.mul_assoc` plus `combineCarrier_assoc`. * `Mz/SetOps.lean`: set operations on `UnifiedStream`. `unionAll = (++)` concatenates two streams record-wise; theorems cover length (sum), associativity, nil identities, and error / no-error preservation from each input (`unionAll_preserves_error_diff_left`, `unionAll_preserves_error_diff_right`, `unionAll_no_error`). `union = consolidate ∘ unionAll` derives the set-semantics flavor; theorems lift the consolidation guarantees to `union` (`union_length_le`, `union_preserves_error_diff_left`, `union_preserves_error_diff_right`, `union_no_error`). `negate` negates every diff (`.error` absorbs negation, `.val n` becomes `.val (-n)`). 
Theorems: `negate_length` (length preserved), `negate_negate` (involution), `negate_preserves_error_diff`, `negate_no_error`. - `exceptAll l r = consolidate (unionAll l (negate r))` realizes the signed-diff `EXCEPT ALL` (output diffs may be negative, encoding "this carrier has `n` fewer copies in the result than in the input"). Theorems: `exceptAll_length_le` (≤ sum of input lengths), `exceptAll_preserves_error_diff_left`/`exceptAll_preserves_error_diff_right` (errors from either side survive — negation absorbs at `.error`), `exceptAll_no_error`. Bag-semantics `EXCEPT ALL` requires a separate sign-normalization clamp; `INTERSECT ALL` requires a per-carrier `min` combinator not yet exposed by `DiffWithError`. Both deferred. + `exceptAll l r = consolidate (unionAll l (negate r))` realizes the signed-diff `EXCEPT ALL` (output diffs may be negative, encoding "this carrier has `n` fewer copies in the result than in the input"). Theorems: `exceptAll_length_le` (≤ sum of input lengths), `exceptAll_preserves_error_diff_left`/`exceptAll_preserves_error_diff_right` (errors from either side survive — negation absorbs at `.error`), `exceptAll_no_error`. + `clampPositive` drops records with `.val n` where `n ≤ 0`, keeping `.error` records and records with `.val n > 0`. Theorems: `clampPositive_length_le`, `clampPositive_preserves_error_diff`, `clampPositive_only_positive` (every output `.val` is strictly positive). + `bagExceptAll = clampPositive ∘ exceptAll` realizes the bag-semantics `EXCEPT ALL` — the signed-diff result is post-processed to drop non-positive multiplicities, producing `max(L - R, 0)` per carrier. Theorems lift the signed flavor: `bagExceptAll_length_le`, `bagExceptAll_preserves_error_diff_left`/`_right`, `bagExceptAll_only_positive`. + `INTERSECT ALL` requires a per-carrier `min` combinator not yet exposed by `DiffWithError`. Deferred. * `Mz/GroupBy.lean`: two grouping primitives. 
`groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. `groupByErrDistinct keyExpr rel` uses the spec-faithful `Datum.groupKeyEq`, which returns `false` whenever either side is `.err`, so every err key produces its own singleton group. @@ -108,7 +111,6 @@ The diff-semiring extension is in scope: `UnifiedStream` records carry `(Unified ### Material expansions -* Sign-normalization on `UnifiedStream`: drop records whose diff has consolidated to `.val 0` or `.val n` with `n < 0` (for bag semantics). With sign-normalization in scope, the bag-semantics `EXCEPT ALL` derives from the signed-diff `exceptAll` already in `Mz/SetOps.lean`. * `INTERSECT ALL` on `UnifiedStream`: requires a per-carrier `min` combinator over `DiffWithError Int`. The combinator is not derivable from `+`, `*`, `-` alone — landing it requires either a new diff primitive or a bucketing operator that materializes per-carrier multiplicities from both inputs. -* `distinct` operator on `UnifiedStream`: collapse multiplicity via `consolidate` + sign normalization. +* `distinct` operator on `UnifiedStream`: collapse multiplicity via `consolidate` + map-positive-to-one. With `clampPositive` already in scope, `distinct` is the natural follow-up: replace each surviving `.val n > 0` with `.val 1`. * Cross-link the spec doc (`../design/20260517_error_handling_semantics.md`) to specific theorem names via `[Mz/...:thm]` cross-references. 
From 2ec8ee13804a4e1d1937cda84799d19a726daffb Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 14:49:54 +0200 Subject: [PATCH 051/127] doc/semantics: DISTINCT on UnifiedStream MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `clampToOne` (collapse positive multiplicities to one, drop non-positives, preserve `.error`) and `distinct = clampToOne ∘ consolidate` realizing SQL `DISTINCT` on the diff-aware stream. `clampToOne` is defined by structural recursion on the list to make per-output diff shape visible to the inductive proofs: * `(uc, .error) :: rest` → `(uc, .error) :: clampToOne rest` * `(uc, .val n) :: rest` → if `n > 0`, prepend `(uc, .val 1)`, otherwise drop the record. Theorems: * `clampToOne_length_le` (non-expanding). * `clampToOne_preserves_error_diff` (`.error` records survive membership-wise). * `clampToOne_only_one_or_error` — every output `.val` has multiplicity exactly 1; `.error` may also appear. `distinct` chains `consolidate` with `clampToOne`. Output: each distinct carrier appears at most once with `.val 1` (positive net multiplicity) or `.error` (collection-scoped error existed). Theorems lift the building blocks: `distinct_length_le`, `distinct_preserves_error_diff`, `distinct_only_one_or_error`. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 127 +++++++++++++++++++++++++ doc/developer/semantics/README.md | 4 +- 2 files changed, 130 insertions(+), 1 deletion(-) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index bb705635abfb7..5faed5756a55a 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -372,4 +372,131 @@ theorem UnifiedStream.bagExceptAll_only_positive ∨ x.2 = DiffWithError.error := UnifiedStream.clampPositive_only_positive _ +/-! 
## `DISTINCT` + +Set semantics: collapse multiplicities so each carrier appears at +most once with `.val 1` (or `.error` if a collection-scoped error +existed for that carrier). `clampToOne` is the post-consolidation +pass — `.val n` with `n > 0` becomes `.val 1`, non-positive `.val` +is dropped, `.error` survives. `distinct = clampToOne ∘ consolidate`. -/ + +/-- Map positive multiplicities to one and drop non-positive +ones. `.error` survives. The recursive form is preferred over a +filter+map composition because it makes the per-output diff shape +visible to the structural-induction proofs below. -/ +def UnifiedStream.clampToOne : UnifiedStream → UnifiedStream + | [] => [] + | (uc, .error) :: rest => (uc, .error) :: UnifiedStream.clampToOne rest + | (uc, .val n) :: rest => + if 0 < n then (uc, .val 1) :: UnifiedStream.clampToOne rest + else UnifiedStream.clampToOne rest + +theorem UnifiedStream.clampToOne_nil : + UnifiedStream.clampToOne [] = [] := rfl + +theorem UnifiedStream.clampToOne_length_le (us : UnifiedStream) : + (UnifiedStream.clampToOne us).length ≤ us.length := by + induction us with + | nil => exact Nat.le.refl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + cases d with + | error => + simp only [UnifiedStream.clampToOne, List.length_cons] + omega + | val n => + simp only [UnifiedStream.clampToOne, List.length_cons] + split + · simp only [List.length_cons]; omega + · omega + +theorem UnifiedStream.clampToOne_preserves_error_diff + (us : UnifiedStream) (uc : UnifiedRow) + (h : (uc, (DiffWithError.error : DiffWithError Int)) ∈ us) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.clampToOne us := by + induction us with + | nil => exact absurd h List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc₀, d₀⟩ := hd + rcases List.mem_cons.mp h with hEq | hTail + · have hUc : uc = uc₀ := (Prod.mk.injEq _ _ _ _).mp hEq |>.1 + have hD : (DiffWithError.error : DiffWithError Int) = d₀ := + (Prod.mk.injEq _ _ _ _).mp hEq |>.2 + subst hUc; subst hD + 
show (uc, DiffWithError.error) + ∈ ((uc, DiffWithError.error) :: UnifiedStream.clampToOne tl) + exact List.mem_cons_self + · cases d₀ with + | error => + show (uc, DiffWithError.error) + ∈ ((uc₀, DiffWithError.error) :: UnifiedStream.clampToOne tl) + exact List.mem_cons_of_mem _ (ih hTail) + | val n => + show (uc, DiffWithError.error) + ∈ (if 0 < n + then (uc₀, DiffWithError.val 1) :: UnifiedStream.clampToOne tl + else UnifiedStream.clampToOne tl) + split + · exact List.mem_cons_of_mem _ (ih hTail) + · exact ih hTail + +/-- Every `.val` record in the output of `clampToOne` has +multiplicity exactly one. `.error` records pass through unchanged. -/ +theorem UnifiedStream.clampToOne_only_one_or_error + (us : UnifiedStream) : + ∀ x ∈ UnifiedStream.clampToOne us, + x.2 = DiffWithError.val 1 ∨ x.2 = DiffWithError.error := by + induction us with + | nil => intro x hMem; exact absurd hMem List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + cases d with + | error => + intro x hMem + have hMem' : x ∈ (uc, (DiffWithError.error : DiffWithError Int)) + :: UnifiedStream.clampToOne tl := hMem + rcases List.mem_cons.mp hMem' with hHead | hTail + · exact Or.inr (by rw [hHead]) + · exact ih x hTail + | val n => + intro x hMem + have hMem' : x ∈ (if 0 < n + then ((uc, DiffWithError.val 1) + :: UnifiedStream.clampToOne tl) + else UnifiedStream.clampToOne tl) := hMem + split at hMem' + · rcases List.mem_cons.mp hMem' with hHead | hTail + · exact Or.inl (by rw [hHead]) + · exact ih x hTail + · exact ih x hMem' + +/-! ### `distinct` + +Pipeline: consolidate, then `clampToOne`. Each carrier appears at +most once in the output, with `.val 1` if it had positive net +multiplicity, or `.error` if a collection-scoped error existed. 
-/ + +def UnifiedStream.distinct (us : UnifiedStream) : UnifiedStream := + UnifiedStream.clampToOne (UnifiedStream.consolidate us) + +theorem UnifiedStream.distinct_length_le (us : UnifiedStream) : + (UnifiedStream.distinct us).length ≤ us.length := + Nat.le_trans + (UnifiedStream.clampToOne_length_le _) + (UnifiedStream.consolidate_length_le us) + +theorem UnifiedStream.distinct_preserves_error_diff + (us : UnifiedStream) (uc : UnifiedRow) + (h : (uc, (DiffWithError.error : DiffWithError Int)) ∈ us) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.distinct us := + UnifiedStream.clampToOne_preserves_error_diff _ uc + (UnifiedStream.consolidate_preserves_error _ uc h) + +theorem UnifiedStream.distinct_only_one_or_error (us : UnifiedStream) : + ∀ x ∈ UnifiedStream.distinct us, + x.2 = DiffWithError.val 1 ∨ x.2 = DiffWithError.error := + UnifiedStream.clampToOne_only_one_or_error _ + end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 0d4e6e78b4837..6b79409bacf97 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -45,6 +45,8 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `exceptAll l r = consolidate (unionAll l (negate r))` realizes the signed-diff `EXCEPT ALL` (output diffs may be negative, encoding "this carrier has `n` fewer copies in the result than in the input"). Theorems: `exceptAll_length_le` (≤ sum of input lengths), `exceptAll_preserves_error_diff_left`/`exceptAll_preserves_error_diff_right` (errors from either side survive — negation absorbs at `.error`), `exceptAll_no_error`. `clampPositive` drops records with `.val n` where `n ≤ 0`, keeping `.error` records and records with `.val n > 0`. Theorems: `clampPositive_length_le`, `clampPositive_preserves_error_diff`, `clampPositive_only_positive` (every output `.val` is strictly positive). 
`bagExceptAll = clampPositive ∘ exceptAll` realizes the bag-semantics `EXCEPT ALL` — the signed-diff result is post-processed to drop non-positive multiplicities, producing `max(L - R, 0)` per carrier. Theorems lift the signed flavor: `bagExceptAll_length_le`, `bagExceptAll_preserves_error_diff_left`/`_right`, `bagExceptAll_only_positive`. + `clampToOne` collapses surviving multiplicities to one: `.val n > 0` becomes `.val 1`, non-positive `.val` is dropped, `.error` survives. Defined by structural recursion on the list. Theorems: `clampToOne_length_le`, `clampToOne_preserves_error_diff`, `clampToOne_only_one_or_error` (every output diff is `.val 1` or `.error`). + `distinct = clampToOne ∘ consolidate` realizes SQL `DISTINCT`: each distinct carrier appears at most once with multiplicity one (or `.error` if a collection-scoped error existed). Theorems: `distinct_length_le`, `distinct_preserves_error_diff`, `distinct_only_one_or_error`. `INTERSECT ALL` requires a per-carrier `min` combinator not yet exposed by `DiffWithError`. Deferred. * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. @@ -112,5 +114,5 @@ The diff-semiring extension is in scope: `UnifiedStream` records carry `(Unified ### Material expansions * `INTERSECT ALL` on `UnifiedStream`: requires a per-carrier `min` combinator over `DiffWithError Int`. The combinator is not derivable from `+`, `*`, `-` alone — landing it requires either a new diff primitive or a bucketing operator that materializes per-carrier multiplicities from both inputs. -* `distinct` operator on `UnifiedStream`: collapse multiplicity via `consolidate` + map-positive-to-one. With `clampPositive` already in scope, `distinct` is the natural follow-up: replace each surviving `.val n > 0` with `.val 1`. 
+* `distinct` is in scope; remaining: stronger correctness theorems (idempotence `distinct ∘ distinct = distinct`, agreement with the carrier-set view, no-error preservation on `.val` inputs). * Cross-link the spec doc (`../design/20260517_error_handling_semantics.md`) to specific theorem names via `[Mz/...:thm]` cross-references. From 31c7cef4e9e2bd0bdb2696e16b35bcee91b2d3ee Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 14:58:28 +0200 Subject: [PATCH 052/127] doc/semantics: comparison operators eq and lt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds binary `=` and `<` to the expression language. Both are err-strict and null-strict in both positions, with type-mismatched operands routing to `.null` (the skeleton does not model SQL implicit casts). Datum-level primitives (`evalEq`, `evalLt`): * `.bool` vs `.bool` compares via decidable Bool equality / order. * `.int` vs `.int` compares via Int. * Mixed types → `.null`. * `.err` in either position propagates (left payload wins). * `.null` in either position propagates when the other side is not `.err`. Expression-level integration: * `Expr.eq`, `Expr.lt` constructors. * `eval` cases route to the matching primitive. * `Expr.subst` and `eval_subst` extended. * `Expr.might_error` recurses into both operands (no static refinement yet — these are conservative branches). * `might_error_sound` extended with `.eq` and `.lt` branches via the new `evalEq_not_err` / `evalLt_not_err` helpers. Strictness instances (`Mz/Strict.lean`): * `ErrPropagatingBinary` and `NullPropagatingBinary` for both primitives in both positions. Comparison output is always `.bool`, `.null`, or `.err`, so the operators chain cleanly into the boolean fragment as `WHERE` predicates. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/Eval.lean | 2 + doc/developer/semantics/Mz/Expr.lean | 2 + doc/developer/semantics/Mz/MightError.lean | 66 ++++++++++++++++++++++ doc/developer/semantics/Mz/PrimEval.lean | 35 ++++++++++++ doc/developer/semantics/Mz/Pushdown.lean | 8 +++ doc/developer/semantics/Mz/Strict.lean | 64 +++++++++++++++++++++ doc/developer/semantics/README.md | 6 +- 7 files changed, 180 insertions(+), 3 deletions(-) diff --git a/doc/developer/semantics/Mz/Eval.lean b/doc/developer/semantics/Mz/Eval.lean index e08fc43387a8b..da304c2d185a4 100644 --- a/doc/developer/semantics/Mz/Eval.lean +++ b/doc/developer/semantics/Mz/Eval.lean @@ -41,5 +41,7 @@ def eval (env : Env) : Expr → Datum | .minus a b => evalMinus (eval env a) (eval env b) | .times a b => evalTimes (eval env a) (eval env b) | .divide a b => evalDivide (eval env a) (eval env b) + | .eq a b => evalEq (eval env a) (eval env b) + | .lt a b => evalLt (eval env a) (eval env b) end Mz diff --git a/doc/developer/semantics/Mz/Expr.lean b/doc/developer/semantics/Mz/Expr.lean index c7e32701f10a4..0e414b6b2e3e4 100644 --- a/doc/developer/semantics/Mz/Expr.lean +++ b/doc/developer/semantics/Mz/Expr.lean @@ -42,6 +42,8 @@ inductive Expr | minus (a b : Expr) | times (a b : Expr) | divide (a b : Expr) + | eq (a b : Expr) + | lt (a b : Expr) deriving Inhabited end Mz diff --git a/doc/developer/semantics/Mz/MightError.lean b/doc/developer/semantics/Mz/MightError.lean index adfb1f76b97c4..3e9816e0a4d3d 100644 --- a/doc/developer/semantics/Mz/MightError.lean +++ b/doc/developer/semantics/Mz/MightError.lean @@ -151,6 +151,54 @@ theorem evalTimes_not_err | err _ => exact (h₂ trivial).elim | err _ => exact (h₁ trivial).elim +theorem evalEq_not_err + {d₁ d₂ : Datum} (h₁ : ¬d₁.IsErr) (h₂ : ¬d₂.IsErr) : + ¬(evalEq d₁ d₂).IsErr := by + cases d₁ with + | bool _ => + cases d₂ with + | bool _ => intro h; cases h + | int _ => intro h; cases h + | null => intro h; cases h + | err _ => 
exact (h₂ trivial).elim + | int _ => + cases d₂ with + | bool _ => intro h; cases h + | int _ => intro h; cases h + | null => intro h; cases h + | err _ => exact (h₂ trivial).elim + | null => + cases d₂ with + | bool _ => intro h; cases h + | int _ => intro h; cases h + | null => intro h; cases h + | err _ => exact (h₂ trivial).elim + | err _ => exact (h₁ trivial).elim + +theorem evalLt_not_err + {d₁ d₂ : Datum} (h₁ : ¬d₁.IsErr) (h₂ : ¬d₂.IsErr) : + ¬(evalLt d₁ d₂).IsErr := by + cases d₁ with + | bool _ => + cases d₂ with + | bool _ => intro h; cases h + | int _ => intro h; cases h + | null => intro h; cases h + | err _ => exact (h₂ trivial).elim + | int _ => + cases d₂ with + | bool _ => intro h; cases h + | int _ => intro h; cases h + | null => intro h; cases h + | err _ => exact (h₂ trivial).elim + | null => + cases d₂ with + | bool _ => intro h; cases h + | int _ => intro h; cases h + | null => intro h; cases h + | err _ => exact (h₂ trivial).elim + | err _ => exact (h₁ trivial).elim + /-- Division is the canonical erring operation: a right operand of `.int 0` produces `.err .divisionByZero` even when both operands are otherwise error-free. 
So the analyzer's universal @@ -403,6 +451,8 @@ def Expr.might_error : Expr → Bool | .divide a b => if b.divisorIsSafe then a.might_error else true + | .eq a b => a.might_error || b.might_error + | .lt a b => a.might_error || b.might_error /-- Bool fold of `might_error` over a list of operands ("does any operand might-error"), declared mutually with `might_error` so @@ -905,5 +955,21 @@ theorem might_error_sound : show (if b.divisorIsSafe = true then a.might_error else true) = true rw [hSafe]; rfl exact hMe hMeTrue + | .eq a b, env, hMe, hEnv => by + intro hRes + simp only [eval] at hRes + have ha : ¬(a.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) + have hb : ¬(b.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) + exact evalEq_not_err + (might_error_sound a env ha hEnv) + (might_error_sound b env hb hEnv) hRes + | .lt a b, env, hMe, hEnv => by + intro hRes + simp only [eval] at hRes + have ha : ¬(a.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) + have hb : ¬(b.might_error = true) := fun h => hMe (by simp [Expr.might_error, h]) + exact evalLt_not_err + (might_error_sound a env ha hEnv) + (might_error_sound b env hb hEnv) hRes end Mz diff --git a/doc/developer/semantics/Mz/PrimEval.lean b/doc/developer/semantics/Mz/PrimEval.lean index 0ec90cff61f74..ee9c74818dcee 100644 --- a/doc/developer/semantics/Mz/PrimEval.lean +++ b/doc/developer/semantics/Mz/PrimEval.lean @@ -121,6 +121,41 @@ def evalDivide : Datum → Datum → Datum | .int n, .int m => if m = 0 then .err .divisionByZero else .int (n / m) | _, _ => .null +/-! ## Comparison + +Binary comparison primitives. Strict on `.err` (propagates the +left-most err) and `.null` (propagates `.null`). Mixed-type +operands route to `.null` — the skeleton does not model SQL +implicit casts. Booleans compare by SQL's `false < true` ordering; +integers compare by `Int`'s built-in `<` / `=`. 
+ +The output is always a `.bool`, `.null`, or `.err` — never an +`.int`. This keeps comparisons compatible with the +boolean fragment as a `WHERE` predicate. -/ + +/-- Equality test. `.bool x = .bool y` and `.int n = .int m` use +the decidable equality of the base types; mixed types yield +`.null`. -/ +def evalEq : Datum → Datum → Datum + | .err e, _ => .err e + | _, .err e => .err e + | .null, _ => .null + | _, .null => .null + | .bool x, .bool y => .bool (decide (x = y)) + | .int n, .int m => .bool (decide (n = m)) + | _, _ => .null + +/-- Strict less-than. Booleans compare with `false < true`; +integers compare with `Int`'s `<`. Mixed types yield `.null`. -/ +def evalLt : Datum → Datum → Datum + | .err e, _ => .err e + | _, .err e => .err e + | .null, _ => .null + | _, .null => .null + | .bool x, .bool y => .bool (decide (x < y)) + | .int n, .int m => .bool (decide (n < m)) + | _, _ => .null + /-! ## Environment -/ /-- Environment: a positional list of bindings for `Expr.col`. -/ diff --git a/doc/developer/semantics/Mz/Pushdown.lean b/doc/developer/semantics/Mz/Pushdown.lean index 95cb15e29165d..1e22a5d395c49 100644 --- a/doc/developer/semantics/Mz/Pushdown.lean +++ b/doc/developer/semantics/Mz/Pushdown.lean @@ -49,6 +49,8 @@ def Expr.subst (es : List Expr) : Expr → Expr | .minus a b => .minus (a.subst es) (b.subst es) | .times a b => .times (a.subst es) (b.subst es) | .divide a b => .divide (a.subst es) (b.subst es) + | .eq a b => .eq (a.subst es) (b.subst es) + | .lt a b => .lt (a.subst es) (b.subst es) /-- Pointwise application of `subst` to a list of operands. 
-/ def Expr.substArgs (es : List Expr) : List Expr → List Expr @@ -155,6 +157,12 @@ theorem eval_subst : | env, es, .divide a b => by simp only [Expr.subst, eval] rw [eval_subst env es a, eval_subst env es b] + | env, es, .eq a b => by + simp only [Expr.subst, eval] + rw [eval_subst env es a, eval_subst env es b] + | env, es, .lt a b => by + simp only [Expr.subst, eval] + rw [eval_subst env es a, eval_subst env es b] /-! ## Predicate pushdown -/ diff --git a/doc/developer/semantics/Mz/Strict.lean b/doc/developer/semantics/Mz/Strict.lean index deeddc57dd527..1014b25f4752b 100644 --- a/doc/developer/semantics/Mz/Strict.lean +++ b/doc/developer/semantics/Mz/Strict.lean @@ -221,6 +221,70 @@ theorem evalDivide_nullPropagating : NullPropagatingBinary evalDivide where | null => rfl | err e => exact absurd (by simp [Datum.IsErr] : (Datum.err e).IsErr) h +/-! ## Comparison instances + +`evalEq` and `evalLt` mirror the arithmetic operators in their +propagation behavior: err-propagating in both positions, null-propagating in +both positions (when the other side is not err). The output is +always `.bool`, `.null`, or `.err`, so the operators chain cleanly +into the boolean-logic fragment. 
-/ + +theorem evalEq_errPropagating : ErrPropagatingBinary evalEq where + left := by + intro d₁ d₂ h + match d₁, h with + | .err e, _ => + show (evalEq (.err e) d₂).IsErr + simp [evalEq, Datum.IsErr] + right := by + intro d₁ d₂ h + match d₂, h with + | .err e, _ => cases d₁ <;> simp [evalEq, Datum.IsErr] + +theorem evalEq_nullPropagating : NullPropagatingBinary evalEq where + left := by + intro d h + cases d with + | bool b => rfl + | int n => rfl + | null => rfl + | err e => exact absurd (by simp [Datum.IsErr] : (Datum.err e).IsErr) h + right := by + intro d h + cases d with + | bool b => rfl + | int n => rfl + | null => rfl + | err e => exact absurd (by simp [Datum.IsErr] : (Datum.err e).IsErr) h + +theorem evalLt_errPropagating : ErrPropagatingBinary evalLt where + left := by + intro d₁ d₂ h + match d₁, h with + | .err e, _ => + show (evalLt (.err e) d₂).IsErr + simp [evalLt, Datum.IsErr] + right := by + intro d₁ d₂ h + match d₂, h with + | .err e, _ => cases d₁ <;> simp [evalLt, Datum.IsErr] + +theorem evalLt_nullPropagating : NullPropagatingBinary evalLt where + left := by + intro d h + cases d with + | bool b => rfl + | int n => rfl + | null => rfl + | err e => exact absurd (by simp [Datum.IsErr] : (Datum.err e).IsErr) h + right := by + intro d h + cases d with + | bool b => rfl + | int n => rfl + | null => rfl + | err e => exact absurd (by simp [Datum.IsErr] : (Datum.err e).IsErr) h + /-! ## Negative results `AND` and `OR` are not err-strict in either position. The short diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 6b79409bacf97..ecaa773c86801 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -9,13 +9,13 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four ## What is here * `Mz/Datum.lean`: `Datum` (`.bool`, `.int`, `.null`, `.err`), `EvalError` (`.placeholder`, `.divisionByZero`), and the `Datum.IsErr` predicate. 
-* `Mz/Expr.lean`: `Expr` inductive — literals, columns, binary `and`/`or`, `not`, `ifThen`, the list-carrying constructors `andN`, `orN`, `coalesce`, and the binary integer arithmetic constructors `plus`, `minus`, `times`, `divide`. -* `Mz/PrimEval.lean`: primitive evaluators on `Datum` and `List Datum` — `evalAnd`, `evalOr`, `evalNot`, `evalIfThen`, `Env`, `Env.get`, `evalAndN`, `evalOrN`, `evalCoalesce`, plus the integer arithmetic primitives `evalPlus`, `evalMinus`, `evalTimes`, `evalDivide`. Split out so the algebraic-law files and the expression-level evaluator can both import them without circular dependencies. Division strict on `.err` and `.null`; a `.int n / .int 0` divisor produces `.err .divisionByZero` — the canonical cell-scoped error. +* `Mz/Expr.lean`: `Expr` inductive — literals, columns, binary `and`/`or`, `not`, `ifThen`, the list-carrying constructors `andN`, `orN`, `coalesce`, the binary integer arithmetic constructors `plus`, `minus`, `times`, `divide`, and the binary comparison constructors `eq`, `lt`. +* `Mz/PrimEval.lean`: primitive evaluators on `Datum` and `List Datum` — `evalAnd`, `evalOr`, `evalNot`, `evalIfThen`, `Env`, `Env.get`, `evalAndN`, `evalOrN`, `evalCoalesce`, the integer arithmetic primitives `evalPlus`, `evalMinus`, `evalTimes`, `evalDivide`, and the comparison primitives `evalEq`, `evalLt`. Split out so the algebraic-law files and the expression-level evaluator can both import them without circular dependencies. Division strict on `.err` and `.null`; a `.int n / .int 0` divisor produces `.err .divisionByZero` — the canonical cell-scoped error. Comparison is err-strict and null-strict; mixed-type operands route to `.null` (the skeleton does not model SQL implicit casts). * `Mz/Eval.lean`: the big-step `eval : Env → Expr → Datum`. List-carrying constructors evaluate each operand and hand the result list to the matching primitive. 
* `Mz/Boolean.lean`: per-cell truth-table proofs for `AND`, `OR`, and `NOT`, plus involutivity of `NOT`. * `Mz/MightError.lean`: the `Expr.might_error` static analyzer, the `Env.ErrFree` predicate, and the `might_error_sound` theorem. Binary `AND` / `OR` short-circuit on literal-`.bool false` / literal-`.bool true` operands via `Expr.isLitBoolFalse` / `Expr.isLitBoolTrue`: either position being the absorbing literal makes the analyzer return `false` regardless of the other operand. The same short-circuit fires on variadic `andN` / `orN` when any operand is the absorbing literal. `IfThen` likewise short-circuits when the condition is a literal `.bool` — only the picked branch's analyzer result is consulted, so a known-erring branch on the discarded side cannot taint the result. `.divide a b` reduces to `a.might_error` when `b` is a literal nonzero int (`Expr.divisorIsSafe`) — the divisor cannot trigger divide-by-zero, so the operator inherits errors only from the dividend. Falls back to `true` when the divisor is unknown or a literal zero. `andN` and `orN` recurse via `Expr.argsMightError` ("any operand might error"); `coalesce` recurses via `Expr.argsAllMightError` ("every operand might error"), special-casing the empty list as safe. Soundness for `coalesce` extracts a statically-safe operand through `Expr.exists_safe_of_not_argsAllMightError` and applies `evalCoalesce_not_err_of_some_safe`, which in turn rests on `Coalesce.go_not_err` — the state-machine lemma that "once one safe operand is in the remaining list, the walk cannot return an error". Companion value-level helpers `evalAnd_{left,right}_false` / `evalOr_{left,right}_true` discharge the short-circuit branches of soundness. -* `Mz/Strict.lean`: strictness predicates — payload-preserving (`ErrStrictUnary`, `ErrStrictBinary`, `NullStrictUnary`) and weaker propagation forms (`ErrPropagatingBinary`, `NullPropagatingBinary`) that match the four-valued lattice's `err > null > int` absorption order. 
Positive instances for `evalNot` and the condition slot of `evalIfThen`; closure under composition; arithmetic instances (`evalPlus`, `evalMinus`, `evalTimes`, `evalDivide` all err-propagating and null-propagating in both positions); negative results witnessing that `AND` and `OR` are *not* err-strict in either position. +* `Mz/Strict.lean`: strictness predicates — payload-preserving (`ErrStrictUnary`, `ErrStrictBinary`, `NullStrictUnary`) and weaker propagation forms (`ErrPropagatingBinary`, `NullPropagatingBinary`) that match the four-valued lattice's `err > null > int` absorption order. Positive instances for `evalNot` and the condition slot of `evalIfThen`; closure under composition; arithmetic instances (`evalPlus`, `evalMinus`, `evalTimes`, `evalDivide` all err-propagating and null-propagating in both positions); comparison instances (`evalEq`, `evalLt` same); negative results witnessing that `AND` and `OR` are *not* err-strict in either position. * `Mz/Coalesce.lean`: laws for `evalCoalesce` — error-rescue, null-beats-err tiebreak, first-error stickiness. * `Mz/Laws.lean`: algebraic laws — two-sided identity (`TRUE` for `AND`, `FALSE` for `OR`), idempotence (unconditional), commutativity (conditional on error-freedom of operands), and `Expr`-level reorder safety as a corollary of soundness. * `Mz/Variadic.lean`: laws for `evalAndN` and `evalOrN` over `List Datum` — cons recurrence, nil, singleton, binary equivalence with the binary evaluators, and `FALSE`/`TRUE` absorption. 
From 8a6aa086e564ef6e0caa5d8e8637fec169986d7d Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 15:08:30 +0200 Subject: [PATCH 053/127] doc/semantics: column-reference analyzer (join pushdown foundation) Adds `Mz/ColRefs.lean` with the `Expr.colReferencesBoundedBy n` static analyzer (returns `true` iff every `col i` in the expression has `i < n`) and the headline soundness theorem `eval_append_left_of_bounded` (when the bound holds for `l.length`, evaluating against `l ++ r` agrees with evaluating against `l`). This is the foundation for pushing a single-side filter through a join's `cross`: the joined env is `l ++ r`, and a predicate that only references the left side's columns evaluates the same way whether or not the right side is appended. Implementation notes: * Mutual recursion between `Expr.colReferencesBoundedBy` and `Expr.argsColRefBoundedBy` accommodates the nested-list constructors (`andN`, `orN`, `coalesce`). * `eval_append_left_of_bounded` and the operand-list companion `eval_append_left_of_bounded_argsMap` share a `mutual` block so structural recursion sees both. * Supporting `Env.get_append_left` proves the per-column reduction. Relational join-pushdown theorems are additive follow-ups that cite this agreement. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/ColRefs.lean | 166 ++++++++++++++++++++++++ doc/developer/semantics/README.md | 1 + 3 files changed, 168 insertions(+) create mode 100644 doc/developer/semantics/Mz/ColRefs.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index d752b8b154bc1..34881f81f647d 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -11,6 +11,7 @@ import Mz.ExprVariadic import Mz.Bag import Mz.ErrStream import Mz.Pushdown +import Mz.ColRefs import Mz.DiffSemiring import Mz.UnifiedStream import Mz.UnifiedConsolidate diff --git a/doc/developer/semantics/Mz/ColRefs.lean b/doc/developer/semantics/Mz/ColRefs.lean new file mode 100644 index 0000000000000..a5bb0ffcc8b3c --- /dev/null +++ b/doc/developer/semantics/Mz/ColRefs.lean @@ -0,0 +1,166 @@ +import Mz.Eval + +/-! +# Column-reference analysis + +The static analyzer `Expr.colReferencesBoundedBy n` returns `true` +when every `col i` reference in the expression has `i < n`. Used by +the optimizer to decide whether a predicate over a wide schema +mentions only the prefix (or only the suffix) of the row — the +precondition for pushing the predicate to one side of a join. + +The headline theorem `eval_append_left_of_bounded` proves the +soundness side: when a predicate's column references are bounded by +`l.length`, evaluating it against the concatenated row `l ++ r` +agrees with evaluating against `l` alone. The right-side analogue +shifts column indices by `l.length`. + +Both directions feed the join-pushdown theorems: a filter over a +cross product whose predicate touches only one side commutes with +the cross. The skeleton states the agreement here; the +relational-pushdown variant follows in future iterations of +`Mz/Join.lean`. +-/ + +namespace Mz + +/-! 
## Column-bound analyzer -/ + +mutual +/-- `Expr.colReferencesBoundedBy n e` returns `true` iff every +`col i` in `e` has `i < n`. -/ +def Expr.colReferencesBoundedBy (n : Nat) : Expr → Bool + | .lit _ => true + | .col i => decide (i < n) + | .and a b => a.colReferencesBoundedBy n && b.colReferencesBoundedBy n + | .or a b => a.colReferencesBoundedBy n && b.colReferencesBoundedBy n + | .not a => a.colReferencesBoundedBy n + | .ifThen c t e => + c.colReferencesBoundedBy n && + t.colReferencesBoundedBy n && + e.colReferencesBoundedBy n + | .andN args => Expr.argsColRefBoundedBy n args + | .orN args => Expr.argsColRefBoundedBy n args + | .coalesce args => Expr.argsColRefBoundedBy n args + | .plus a b => a.colReferencesBoundedBy n && b.colReferencesBoundedBy n + | .minus a b => a.colReferencesBoundedBy n && b.colReferencesBoundedBy n + | .times a b => a.colReferencesBoundedBy n && b.colReferencesBoundedBy n + | .divide a b => a.colReferencesBoundedBy n && b.colReferencesBoundedBy n + | .eq a b => a.colReferencesBoundedBy n && b.colReferencesBoundedBy n + | .lt a b => a.colReferencesBoundedBy n && b.colReferencesBoundedBy n + +/-- Companion fold over operand lists. `argsColRefBoundedBy n args` +returns `true` iff every operand passes the bound. -/ +def Expr.argsColRefBoundedBy (n : Nat) : List Expr → Bool + | [] => true + | e :: rest => e.colReferencesBoundedBy n && Expr.argsColRefBoundedBy n rest +end + +/-! ## Environment-append lemmas -/ + +/-- Reading a column index below `l.length` from `l ++ r` yields +the same value as reading from `l`. Trivial induction. -/ +theorem Env.get_append_left : + ∀ (l r : Env) (i : Nat), i < l.length → + Env.get (l ++ r) i = Env.get l i + | [], _, _, h => absurd h (Nat.not_lt_zero _) + | hd :: _, _, 0, _ => rfl + | _ :: tl, r, n + 1, h => by + show Env.get (tl ++ r) n = Env.get tl n + exact Env.get_append_left tl r n (Nat.lt_of_succ_lt_succ h) + +/-! 
## Eval agreement under bound + +If a predicate's column references are all bounded by `l.length`, +evaluating against `l ++ r` agrees with evaluating against `l`. +Joint structural recursion on `Expr` and the operand list. -/ + +mutual +theorem eval_append_left_of_bounded : + ∀ (l r : Env) (e : Expr), + e.colReferencesBoundedBy l.length = true → + eval (l ++ r) e = eval l e + | _, _, .lit _, _ => by simp [eval] + | l, r, .col i, h => by + have h_lt : i < l.length := of_decide_eq_true h + simp only [eval] + exact Env.get_append_left l r i h_lt + | l, r, .and a b, h => by + simp only [Expr.colReferencesBoundedBy, Bool.and_eq_true] at h + simp only [eval] + rw [eval_append_left_of_bounded l r a h.1, + eval_append_left_of_bounded l r b h.2] + | l, r, .or a b, h => by + simp only [Expr.colReferencesBoundedBy, Bool.and_eq_true] at h + simp only [eval] + rw [eval_append_left_of_bounded l r a h.1, + eval_append_left_of_bounded l r b h.2] + | l, r, .not a, h => by + simp only [Expr.colReferencesBoundedBy] at h + simp only [eval] + rw [eval_append_left_of_bounded l r a h] + | l, r, .ifThen c t e, h => by + simp only [Expr.colReferencesBoundedBy, Bool.and_eq_true] at h + simp only [eval] + rw [eval_append_left_of_bounded l r c h.1.1, + eval_append_left_of_bounded l r t h.1.2, + eval_append_left_of_bounded l r e h.2] + | l, r, .andN args, h => by + simp only [Expr.colReferencesBoundedBy] at h + simp only [eval] + rw [eval_append_left_of_bounded_argsMap l r args h] + | l, r, .orN args, h => by + simp only [Expr.colReferencesBoundedBy] at h + simp only [eval] + rw [eval_append_left_of_bounded_argsMap l r args h] + | l, r, .coalesce args, h => by + simp only [Expr.colReferencesBoundedBy] at h + simp only [eval] + rw [eval_append_left_of_bounded_argsMap l r args h] + | l, r, .plus a b, h => by + simp only [Expr.colReferencesBoundedBy, Bool.and_eq_true] at h + simp only [eval] + rw [eval_append_left_of_bounded l r a h.1, + eval_append_left_of_bounded l r b h.2] + | l, r, .minus a 
b, h => by + simp only [Expr.colReferencesBoundedBy, Bool.and_eq_true] at h + simp only [eval] + rw [eval_append_left_of_bounded l r a h.1, + eval_append_left_of_bounded l r b h.2] + | l, r, .times a b, h => by + simp only [Expr.colReferencesBoundedBy, Bool.and_eq_true] at h + simp only [eval] + rw [eval_append_left_of_bounded l r a h.1, + eval_append_left_of_bounded l r b h.2] + | l, r, .divide a b, h => by + simp only [Expr.colReferencesBoundedBy, Bool.and_eq_true] at h + simp only [eval] + rw [eval_append_left_of_bounded l r a h.1, + eval_append_left_of_bounded l r b h.2] + | l, r, .eq a b, h => by + simp only [Expr.colReferencesBoundedBy, Bool.and_eq_true] at h + simp only [eval] + rw [eval_append_left_of_bounded l r a h.1, + eval_append_left_of_bounded l r b h.2] + | l, r, .lt a b, h => by + simp only [Expr.colReferencesBoundedBy, Bool.and_eq_true] at h + simp only [eval] + rw [eval_append_left_of_bounded l r a h.1, + eval_append_left_of_bounded l r b h.2] + +/-- Operand-list agreement under bound. Mutually defined with the +`Expr` form so structural recursion accepts both. 
-/ +theorem eval_append_left_of_bounded_argsMap : + ∀ (l r : Env) (args : List Expr), + Expr.argsColRefBoundedBy l.length args = true → + args.map (eval (l ++ r)) = args.map (eval l) + | _, _, [], _ => rfl + | l, r, e :: rest, h => by + simp only [Expr.argsColRefBoundedBy, Bool.and_eq_true] at h + show eval (l ++ r) e :: rest.map (eval (l ++ r)) + = eval l e :: rest.map (eval l) + rw [eval_append_left_of_bounded l r e h.1, + eval_append_left_of_bounded_argsMap l r rest h.2] +end + +end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index ecaa773c86801..fedbd724eff08 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -28,6 +28,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `BagStream.project` projects each row through a list of scalars; a row stays in the data collection only when every scalar succeeds, otherwise its err payloads (one per erroring scalar) are appended to the error collection. `rowErrs_nil_of_all_safe` and `projectErrs_eq_nil_of_all_safe` show that when no projection errs, `BagStream.project` does not extend the error collection. * `Mz/Pushdown.lean`: substitution (`Expr.subst`) plus the headline `eval_subst` theorem (substituting then evaluating against the original row equals evaluating against the projected row), and the relational predicate-pushdown rewrite `filterRel p (project es rel) = project es (filterRel (p.subst es) rel)`. +* `Mz/ColRefs.lean`: column-reference analyzer. `Expr.colReferencesBoundedBy n e` returns `true` when every `col i` in `e` has `i < n`. Mutually defined with `Expr.argsColRefBoundedBy` so structural recursion handles the nested-list constructors. 
Headline theorem `eval_append_left_of_bounded`: when a predicate's column references are bounded by `l.length`, evaluating against `l ++ r` agrees with evaluating against `l` alone — the foundation for pushing a single-side filter through a join's `cross` (where the joined env is `l ++ r`). Supporting `Env.get_append_left` proves the row-level reduction. * `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `-`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity / negation laws that downstream operators must respect. Negation laws (`neg_error`, `neg_val`, `neg_neg_val`, `val_add_neg_val`) carry the principle that `.error` is unrecoverable — a collection-scoped error cannot be subtracted away. The `_int` specializations (`add_comm_int`, `add_assoc_int`, `mul_assoc_int`, `mul_comm_int`, `mul_add_int`, `neg_neg_int`, `val_add_neg_val_int`) discharge the base hypotheses at `Int` so downstream code in `Mz/Join.lean`, `Mz/UnifiedConsolidate.lean`, and `Mz/SetOps.lean` can cite the named laws directly. * `Mz/UnifiedStream.lean`: unified diff-aware alternative to `BagStream`. `UnifiedStream := List (UnifiedRow × DiffWithError Int)` pairs a carrier (data row or row-scoped err) with a differential-dataflow diff augmented by the absorbing `error` element. Row-scoped errors flow through the carrier; collection-scoped errors flow through diff multiplication / addition. `ofBag` / `split` conversions assign every bag record a diff of `.val 1`; the round-trip theorem `split (ofBag s) = s` holds. The cross-direction is exact only up to multiset equality on `List EvalError` and is lossy for diffs ≠ `.val 1` (split drops diff information). `UnifiedStream.project` lifts `BagStream.project` to the diff-aware carrier. 
Records with `.error` diff or `.err` carrier pass through unchanged; a `.row r` record with `.val n` diff is evaluated against `es` — if every scalar succeeds, the row is emitted with diff `.val n`; if any scalar errs, one `(.err e, .val n)` is emitted per erroring scalar (each preserving the original multiplicity). Theorems: `project_preserves_error_diff` (an `.error` diff in the input always reaches the output), `project_no_error` (all-`.val` inputs yield all-`.val` outputs), `project_nil_es` (empty projection list collapses every row to width-zero), `project_nil_stream` (empty stream is empty). From 5dc4982172f9bae99d09250a04228eb5d33cc099 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 15:14:12 +0200 Subject: [PATCH 054/127] doc/semantics: right-side column shift for join pushdown Adds `Expr.colShift k e` (adds `k` to every `col i`, leaving other constructors intact) and the headline soundness theorem `eval_append_right_shift`: `eval (l ++ r) (e.colShift l.length) = eval r e`. Right-side analogue of `eval_append_left_of_bounded`. Together with the existing left-side bound, this gives the optimizer the two primitives needed to push a single-side filter on either side through a join's `cross`: * Left predicate (no shift): `Expr.colReferencesBoundedBy l.length` ensures it sees only the left half. * Right predicate (shifted): `e.colShift l.length` realigns the predicate written against the right schema with the combined env. Supporting `Env.get_append_right` lemma proves the column-read reduction `Env.get (l ++ r) (l.length + i) = Env.get r i`. `Expr.colShift` and its `argsColShift` companion share a `mutual` block so structural recursion accepts the nested-list constructors (`andN`, `orN`, `coalesce`). The proof of `eval_append_right_shift` mirrors `eval_append_left_of_bounded`'s mutual structure. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/ColRefs.lean | 122 ++++++++++++++++++++++++ doc/developer/semantics/README.md | 5 +- 2 files changed, 126 insertions(+), 1 deletion(-) diff --git a/doc/developer/semantics/Mz/ColRefs.lean b/doc/developer/semantics/Mz/ColRefs.lean index a5bb0ffcc8b3c..a36970d75f86f 100644 --- a/doc/developer/semantics/Mz/ColRefs.lean +++ b/doc/developer/semantics/Mz/ColRefs.lean @@ -69,6 +69,23 @@ theorem Env.get_append_left : show Env.get (tl ++ r) n = Env.get tl n exact Env.get_append_left tl r n (Nat.lt_of_succ_lt_succ h) +/-- Reading a column index `l.length + i` from `l ++ r` yields +the i-th value of `r`. The right-side analogue of +`Env.get_append_left`. -/ +theorem Env.get_append_right : + ∀ (l r : Env) (i : Nat), + Env.get (l ++ r) (l.length + i) = Env.get r i + | [], _, _ => by + show Env.get (([] : Env) ++ _) (0 + _) = _ + rw [List.nil_append, Nat.zero_add] + | hd :: tl, r, i => by + show Env.get ((hd :: tl) ++ r) (tl.length + 1 + i) = Env.get r i + show Env.get (hd :: (tl ++ r)) (tl.length + 1 + i) = Env.get r i + have h_rewrite : tl.length + 1 + i = (tl.length + i) + 1 := by omega + rw [h_rewrite] + show Env.get (tl ++ r) (tl.length + i) = Env.get r i + exact Env.get_append_right tl r i + /-! ## Eval agreement under bound If a predicate's column references are all bounded by `l.length`, @@ -163,4 +180,109 @@ theorem eval_append_left_of_bounded_argsMap : eval_append_left_of_bounded_argsMap l r rest h.2] end +/-! ## Column shifting + +`Expr.colShift k e` adds `k` to every `col i` reference in `e`, +leaving other constructors structurally intact. Used to align a +predicate originally written against a right-side schema with the +joined env `l ++ r`: in the combined env, the right side starts at +index `l.length`, so shifting the predicate by `l.length` makes +its references land in the right half. 
+ +The headline `eval_append_right_shift` states agreement: evaluating +the shifted expression against `l ++ r` equals evaluating the +original against `r`. -/ + +mutual +def Expr.colShift (k : Nat) : Expr → Expr + | .lit d => .lit d + | .col i => .col (k + i) + | .and a b => .and (a.colShift k) (b.colShift k) + | .or a b => .or (a.colShift k) (b.colShift k) + | .not a => .not (a.colShift k) + | .ifThen c t e => .ifThen (c.colShift k) (t.colShift k) (e.colShift k) + | .andN args => .andN (Expr.argsColShift k args) + | .orN args => .orN (Expr.argsColShift k args) + | .coalesce args => .coalesce (Expr.argsColShift k args) + | .plus a b => .plus (a.colShift k) (b.colShift k) + | .minus a b => .minus (a.colShift k) (b.colShift k) + | .times a b => .times (a.colShift k) (b.colShift k) + | .divide a b => .divide (a.colShift k) (b.colShift k) + | .eq a b => .eq (a.colShift k) (b.colShift k) + | .lt a b => .lt (a.colShift k) (b.colShift k) + +def Expr.argsColShift (k : Nat) : List Expr → List Expr + | [] => [] + | e :: rest => e.colShift k :: Expr.argsColShift k rest +end + +/-! ## Eval agreement under right-side shift + +Evaluating the shifted expression against `l ++ r` agrees with +evaluating the original against `r`. The shift compensates for +the `l.length` offset in the combined env. 
-/ + +mutual +theorem eval_append_right_shift : + ∀ (l r : Env) (e : Expr), + eval (l ++ r) (e.colShift l.length) = eval r e + | _, _, .lit _ => by simp [eval, Expr.colShift] + | l, r, .col i => by + show eval (l ++ r) (.col (l.length + i)) = eval r (.col i) + simp only [eval] + exact Env.get_append_right l r i + | l, r, .and a b => by + simp only [Expr.colShift, eval] + rw [eval_append_right_shift l r a, eval_append_right_shift l r b] + | l, r, .or a b => by + simp only [Expr.colShift, eval] + rw [eval_append_right_shift l r a, eval_append_right_shift l r b] + | l, r, .not a => by + simp only [Expr.colShift, eval] + rw [eval_append_right_shift l r a] + | l, r, .ifThen c t e => by + simp only [Expr.colShift, eval] + rw [eval_append_right_shift l r c, eval_append_right_shift l r t, + eval_append_right_shift l r e] + | l, r, .andN args => by + simp only [Expr.colShift, eval] + rw [eval_append_right_shift_argsMap l r args] + | l, r, .orN args => by + simp only [Expr.colShift, eval] + rw [eval_append_right_shift_argsMap l r args] + | l, r, .coalesce args => by + simp only [Expr.colShift, eval] + rw [eval_append_right_shift_argsMap l r args] + | l, r, .plus a b => by + simp only [Expr.colShift, eval] + rw [eval_append_right_shift l r a, eval_append_right_shift l r b] + | l, r, .minus a b => by + simp only [Expr.colShift, eval] + rw [eval_append_right_shift l r a, eval_append_right_shift l r b] + | l, r, .times a b => by + simp only [Expr.colShift, eval] + rw [eval_append_right_shift l r a, eval_append_right_shift l r b] + | l, r, .divide a b => by + simp only [Expr.colShift, eval] + rw [eval_append_right_shift l r a, eval_append_right_shift l r b] + | l, r, .eq a b => by + simp only [Expr.colShift, eval] + rw [eval_append_right_shift l r a, eval_append_right_shift l r b] + | l, r, .lt a b => by + simp only [Expr.colShift, eval] + rw [eval_append_right_shift l r a, eval_append_right_shift l r b] + +theorem eval_append_right_shift_argsMap : + ∀ (l r : Env) (args : List 
Expr), + (Expr.argsColShift l.length args).map (eval (l ++ r)) + = args.map (eval r) + | _, _, [] => rfl + | l, r, e :: rest => by + show eval (l ++ r) (e.colShift l.length) + :: (Expr.argsColShift l.length rest).map (eval (l ++ r)) + = eval r e :: rest.map (eval r) + rw [eval_append_right_shift l r e, + eval_append_right_shift_argsMap l r rest] +end + end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index fedbd724eff08..44131b8d0e36c 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -28,7 +28,10 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `BagStream.project` projects each row through a list of scalars; a row stays in the data collection only when every scalar succeeds, otherwise its err payloads (one per erroring scalar) are appended to the error collection. `rowErrs_nil_of_all_safe` and `projectErrs_eq_nil_of_all_safe` show that when no projection errs, `BagStream.project` does not extend the error collection. * `Mz/Pushdown.lean`: substitution (`Expr.subst`) plus the headline `eval_subst` theorem (substituting then evaluating against the original row equals evaluating against the projected row), and the relational predicate-pushdown rewrite `filterRel p (project es rel) = project es (filterRel (p.subst es) rel)`. -* `Mz/ColRefs.lean`: column-reference analyzer. `Expr.colReferencesBoundedBy n e` returns `true` when every `col i` in `e` has `i < n`. Mutually defined with `Expr.argsColRefBoundedBy` so structural recursion handles the nested-list constructors. Headline theorem `eval_append_left_of_bounded`: when a predicate's column references are bounded by `l.length`, evaluating against `l ++ r` agrees with evaluating against `l` alone — the foundation for pushing a single-side filter through a join's `cross` (where the joined env is `l ++ r`). Supporting `Env.get_append_left` proves the row-level reduction. 
+* `Mz/ColRefs.lean`: column-reference analyzer and rewriter. + `Expr.colReferencesBoundedBy n e` returns `true` when every `col i` in `e` has `i < n`. Mutually defined with `Expr.argsColRefBoundedBy` so structural recursion handles the nested-list constructors. Headline theorem `eval_append_left_of_bounded`: when a predicate's column references are bounded by `l.length`, evaluating against `l ++ r` agrees with evaluating against `l` alone — the foundation for pushing a single-side filter through a join's `cross` (where the joined env is `l ++ r`). + `Expr.colShift k e` adds `k` to every column reference, leaving other constructors structurally intact. Right-side analogue of the bounded analyzer: it realigns a predicate written against the right schema with the joined env `l ++ r` (where the right side starts at index `l.length`). Headline `eval_append_right_shift`: `eval (l ++ r) (e.colShift l.length) = eval r e`. + Supporting environment lemmas: `Env.get_append_left` (read from prefix) and `Env.get_append_right` (read from suffix with index shift). * `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `-`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity / negation laws that downstream operators must respect. Negation laws (`neg_error`, `neg_val`, `neg_neg_val`, `val_add_neg_val`) carry the principle that `.error` is unrecoverable — a collection-scoped error cannot be subtracted away. The `_int` specializations (`add_comm_int`, `add_assoc_int`, `mul_assoc_int`, `mul_comm_int`, `mul_add_int`, `neg_neg_int`, `val_add_neg_val_int`) discharge the base hypotheses at `Int` so downstream code in `Mz/Join.lean`, `Mz/UnifiedConsolidate.lean`, and `Mz/SetOps.lean` can cite the named laws directly. * `Mz/UnifiedStream.lean`: unified diff-aware alternative to `BagStream`. 
`UnifiedStream := List (UnifiedRow × DiffWithError Int)` pairs a carrier (data row or row-scoped err) with a differential-dataflow diff augmented by the absorbing `error` element. Row-scoped errors flow through the carrier; collection-scoped errors flow through diff multiplication / addition. `ofBag` / `split` conversions assign every bag record a diff of `.val 1`; the round-trip theorem `split (ofBag s) = s` holds. The cross-direction is exact only up to multiset equality on `List EvalError` and is lossy for diffs ≠ `.val 1` (split drops diff information). `UnifiedStream.project` lifts `BagStream.project` to the diff-aware carrier. Records with `.error` diff or `.err` carrier pass through unchanged; a `.row r` record with `.val n` diff is evaluated against `es` — if every scalar succeeds, the row is emitted with diff `.val n`; if any scalar errs, one `(.err e, .val n)` is emitted per erroring scalar (each preserving the original multiplicity). Theorems: `project_preserves_error_diff` (an `.error` diff in the input always reaches the output), `project_no_error` (all-`.val` inputs yield all-`.val` outputs), `project_nil_es` (empty projection list collapses every row to width-zero), `project_nil_stream` (empty stream is empty). From b64c451f920201c4fbe923843a2f36b1434a0bdb Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 15:19:46 +0200 Subject: [PATCH 055/127] doc/semantics: bound monotonicity for column references MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `Expr.colReferencesBoundedBy_mono` (and operand-list mutual companion `Expr.argsColRefBoundedBy_mono`): a predicate bounded by `n` is also bounded by any `m ≥ n`. Useful when a predicate's natural bound is a single relation's width but the proof site needs the joined-env width. `eval_append_left_of_bounded_at` packages the common shape: when the predicate's bound `n` is at most `l.length`, the agreement `eval (l ++ r) e = eval l e` holds. 
Removes the requirement that the bound match `l.length` exactly. Both lemmas are pure additions; downstream join-pushdown theorems will cite them to bridge tight per-relation bounds and the wider combined-env bound. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/ColRefs.lean | 86 +++++++++++++++++++++++++ doc/developer/semantics/README.md | 1 + 2 files changed, 87 insertions(+) diff --git a/doc/developer/semantics/Mz/ColRefs.lean b/doc/developer/semantics/Mz/ColRefs.lean index a36970d75f86f..7040c4b7b86dd 100644 --- a/doc/developer/semantics/Mz/ColRefs.lean +++ b/doc/developer/semantics/Mz/ColRefs.lean @@ -180,6 +180,92 @@ theorem eval_append_left_of_bounded_argsMap : eval_append_left_of_bounded_argsMap l r rest h.2] end +/-! ## Bound monotonicity + +A predicate whose column references are bounded by `n` is also +bounded by any `m ≥ n`. Used to lift a tight per-relation bound +(e.g. `pred is bounded by table-A-width`) to a coarser join-env +bound (`bounded by combined-env-width`). 
-/ + +mutual +theorem Expr.colReferencesBoundedBy_mono : + ∀ {n m : Nat} (e : Expr), + e.colReferencesBoundedBy n = true → n ≤ m → + e.colReferencesBoundedBy m = true + | _, _, .lit _, _, _ => rfl + | n, m, .col i, h, hLe => by + have h_lt : i < n := of_decide_eq_true h + show decide (i < m) = true + exact decide_eq_true (Nat.lt_of_lt_of_le h_lt hLe) + | _, _, .and a b, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Bool.and_eq_true] at h ⊢ + exact ⟨Expr.colReferencesBoundedBy_mono a h.1 hLe, + Expr.colReferencesBoundedBy_mono b h.2 hLe⟩ + | _, _, .or a b, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Bool.and_eq_true] at h ⊢ + exact ⟨Expr.colReferencesBoundedBy_mono a h.1 hLe, + Expr.colReferencesBoundedBy_mono b h.2 hLe⟩ + | _, _, .not a, h, hLe => Expr.colReferencesBoundedBy_mono a h hLe + | _, _, .ifThen c t e, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Bool.and_eq_true] at h ⊢ + exact ⟨⟨Expr.colReferencesBoundedBy_mono c h.1.1 hLe, + Expr.colReferencesBoundedBy_mono t h.1.2 hLe⟩, + Expr.colReferencesBoundedBy_mono e h.2 hLe⟩ + | _, _, .andN args, h, hLe => by + simp only [Expr.colReferencesBoundedBy] at h ⊢ + exact Expr.argsColRefBoundedBy_mono args h hLe + | _, _, .orN args, h, hLe => by + simp only [Expr.colReferencesBoundedBy] at h ⊢ + exact Expr.argsColRefBoundedBy_mono args h hLe + | _, _, .coalesce args, h, hLe => by + simp only [Expr.colReferencesBoundedBy] at h ⊢ + exact Expr.argsColRefBoundedBy_mono args h hLe + | _, _, .plus a b, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Bool.and_eq_true] at h ⊢ + exact ⟨Expr.colReferencesBoundedBy_mono a h.1 hLe, + Expr.colReferencesBoundedBy_mono b h.2 hLe⟩ + | _, _, .minus a b, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Bool.and_eq_true] at h ⊢ + exact ⟨Expr.colReferencesBoundedBy_mono a h.1 hLe, + Expr.colReferencesBoundedBy_mono b h.2 hLe⟩ + | _, _, .times a b, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Bool.and_eq_true] at h ⊢ + exact 
⟨Expr.colReferencesBoundedBy_mono a h.1 hLe, + Expr.colReferencesBoundedBy_mono b h.2 hLe⟩ + | _, _, .divide a b, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Bool.and_eq_true] at h ⊢ + exact ⟨Expr.colReferencesBoundedBy_mono a h.1 hLe, + Expr.colReferencesBoundedBy_mono b h.2 hLe⟩ + | _, _, .eq a b, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Bool.and_eq_true] at h ⊢ + exact ⟨Expr.colReferencesBoundedBy_mono a h.1 hLe, + Expr.colReferencesBoundedBy_mono b h.2 hLe⟩ + | _, _, .lt a b, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Bool.and_eq_true] at h ⊢ + exact ⟨Expr.colReferencesBoundedBy_mono a h.1 hLe, + Expr.colReferencesBoundedBy_mono b h.2 hLe⟩ + +theorem Expr.argsColRefBoundedBy_mono : + ∀ {n m : Nat} (args : List Expr), + Expr.argsColRefBoundedBy n args = true → n ≤ m → + Expr.argsColRefBoundedBy m args = true + | _, _, [], _, _ => rfl + | _, _, e :: rest, h, hLe => by + simp only [Expr.argsColRefBoundedBy, Bool.and_eq_true] at h ⊢ + exact ⟨Expr.colReferencesBoundedBy_mono e h.1 hLe, + Expr.argsColRefBoundedBy_mono rest h.2 hLe⟩ +end + +/-- Convenience: when the predicate's bound `n` is at most the +prefix length, `eval (l ++ r) e = eval l e`. Removes the need for +the predicate to know `l.length` exactly. -/ +theorem eval_append_left_of_bounded_at + (l r : Env) (n : Nat) (e : Expr) + (hP : e.colReferencesBoundedBy n = true) (hLe : n ≤ l.length) : + eval (l ++ r) e = eval l e := + eval_append_left_of_bounded l r e + (Expr.colReferencesBoundedBy_mono e hP hLe) + /-! ## Column shifting `Expr.colShift k e` adds `k` to every `col i` reference in `e`, diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 44131b8d0e36c..0146a4dbe4e31 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -31,6 +31,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/ColRefs.lean`: column-reference analyzer and rewriter. 
`Expr.colReferencesBoundedBy n e` returns `true` when every `col i` in `e` has `i < n`. Mutually defined with `Expr.argsColRefBoundedBy` so structural recursion handles the nested-list constructors. Headline theorem `eval_append_left_of_bounded`: when a predicate's column references are bounded by `l.length`, evaluating against `l ++ r` agrees with evaluating against `l` alone — the foundation for pushing a single-side filter through a join's `cross` (where the joined env is `l ++ r`). `Expr.colShift k e` adds `k` to every column reference, leaving other constructors structurally intact. Right-side analogue of the bounded analyzer: it realigns a predicate written against the right schema with the joined env `l ++ r` (where the right side starts at index `l.length`). Headline `eval_append_right_shift`: `eval (l ++ r) (e.colShift l.length) = eval r e`. + `Expr.colReferencesBoundedBy_mono` (mutual with `Expr.argsColRefBoundedBy_mono`) lifts a tight bound to a coarser one — useful when a predicate's natural bound is a single relation's width but the proof site needs the joined-env width. Convenience `eval_append_left_of_bounded_at` removes the requirement that the predicate's bound match `l.length` exactly: any `n ≤ l.length` suffices. Supporting environment lemmas: `Env.get_append_left` (read from prefix) and `Env.get_append_right` (read from suffix with index shift). * `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `-`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity / negation laws that downstream operators must respect. Negation laws (`neg_error`, `neg_val`, `neg_neg_val`, `val_add_neg_val`) carry the principle that `.error` is unrecoverable — a collection-scoped error cannot be subtracted away. 
The `_int` specializations (`add_comm_int`, `add_assoc_int`, `mul_assoc_int`, `mul_comm_int`, `mul_add_int`, `neg_neg_int`, `val_add_neg_val_int`) discharge the base hypotheses at `Int` so downstream code in `Mz/Join.lean`, `Mz/UnifiedConsolidate.lean`, and `Mz/SetOps.lean` can cite the named laws directly. * `Mz/UnifiedStream.lean`: unified diff-aware alternative to `BagStream`. `UnifiedStream := List (UnifiedRow × DiffWithError Int)` pairs a carrier (data row or row-scoped err) with a differential-dataflow diff augmented by the absorbing `error` element. Row-scoped errors flow through the carrier; collection-scoped errors flow through diff multiplication / addition. `ofBag` / `split` conversions assign every bag record a diff of `.val 1`; the round-trip theorem `split (ofBag s) = s` holds. The cross-direction is exact only up to multiset equality on `List EvalError` and is lossy for diffs ≠ `.val 1` (split drops diff information). From 920df20a1a8ae5259a8493184752216e85286c11 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 15:22:50 +0200 Subject: [PATCH 056/127] doc/semantics: colShift monoid laws Adds `Expr.colShift_zero` (identity at `k = 0`) and `Expr.colShift_add` (`(e.colShift k).colShift m = e.colShift (k + m)`), the monoid laws on the column-shift operator. Mutual companions `Expr.argsColShift_zero` / `argsColShift_add` discharge the nested-list constructors. Useful for nested joins: each layer can add its own offset to a predicate's column references and combine them by addition. Pure additions; downstream join-pushdown theorems will cite them to flatten multi-layer shifts. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/ColRefs.lean | 132 ++++++++++++++++++++++++ doc/developer/semantics/README.md | 1 + 2 files changed, 133 insertions(+) diff --git a/doc/developer/semantics/Mz/ColRefs.lean b/doc/developer/semantics/Mz/ColRefs.lean index 7040c4b7b86dd..0bc5f7cd0ed16 100644 --- a/doc/developer/semantics/Mz/ColRefs.lean +++ b/doc/developer/semantics/Mz/ColRefs.lean @@ -371,4 +371,136 @@ theorem eval_append_right_shift_argsMap : eval_append_right_shift_argsMap l r rest] end +/-! ## Shift composition laws + +`colShift` is the identity at `k = 0` and composes additively: +shifting by `k` then by `m` equals shifting by `k + m`. Useful for +nested joins where each join adds its own offset to the predicate's +column references. -/ + +mutual +theorem Expr.colShift_zero : ∀ (e : Expr), e.colShift 0 = e + | .lit _ => rfl + | .col i => by show Expr.col (0 + i) = .col i; rw [Nat.zero_add] + | .and a b => by + show Expr.and (a.colShift 0) (b.colShift 0) = .and a b + rw [Expr.colShift_zero a, Expr.colShift_zero b] + | .or a b => by + show Expr.or (a.colShift 0) (b.colShift 0) = .or a b + rw [Expr.colShift_zero a, Expr.colShift_zero b] + | .not a => by + show Expr.not (a.colShift 0) = .not a + rw [Expr.colShift_zero a] + | .ifThen c t e => by + show Expr.ifThen (c.colShift 0) (t.colShift 0) (e.colShift 0) + = .ifThen c t e + rw [Expr.colShift_zero c, Expr.colShift_zero t, Expr.colShift_zero e] + | .andN args => by + show Expr.andN (Expr.argsColShift 0 args) = .andN args + rw [Expr.argsColShift_zero args] + | .orN args => by + show Expr.orN (Expr.argsColShift 0 args) = .orN args + rw [Expr.argsColShift_zero args] + | .coalesce args => by + show Expr.coalesce (Expr.argsColShift 0 args) = .coalesce args + rw [Expr.argsColShift_zero args] + | .plus a b => by + show Expr.plus (a.colShift 0) (b.colShift 0) = .plus a b + rw [Expr.colShift_zero a, Expr.colShift_zero b] + | .minus a b => by + show Expr.minus (a.colShift 0) 
(b.colShift 0) = .minus a b + rw [Expr.colShift_zero a, Expr.colShift_zero b] + | .times a b => by + show Expr.times (a.colShift 0) (b.colShift 0) = .times a b + rw [Expr.colShift_zero a, Expr.colShift_zero b] + | .divide a b => by + show Expr.divide (a.colShift 0) (b.colShift 0) = .divide a b + rw [Expr.colShift_zero a, Expr.colShift_zero b] + | .eq a b => by + show Expr.eq (a.colShift 0) (b.colShift 0) = .eq a b + rw [Expr.colShift_zero a, Expr.colShift_zero b] + | .lt a b => by + show Expr.lt (a.colShift 0) (b.colShift 0) = .lt a b + rw [Expr.colShift_zero a, Expr.colShift_zero b] + +theorem Expr.argsColShift_zero : ∀ (args : List Expr), + Expr.argsColShift 0 args = args + | [] => rfl + | e :: rest => by + show e.colShift 0 :: Expr.argsColShift 0 rest = e :: rest + rw [Expr.colShift_zero e, Expr.argsColShift_zero rest] +end + +mutual +theorem Expr.colShift_add : + ∀ (k m : Nat) (e : Expr), (e.colShift k).colShift m = e.colShift (k + m) + | _, _, .lit _ => rfl + | k, m, .col i => by + show Expr.col (m + (k + i)) = Expr.col (k + m + i) + congr 1; omega + | k, m, .and a b => by + show Expr.and ((a.colShift k).colShift m) ((b.colShift k).colShift m) + = .and (a.colShift (k + m)) (b.colShift (k + m)) + rw [Expr.colShift_add k m a, Expr.colShift_add k m b] + | k, m, .or a b => by + show Expr.or ((a.colShift k).colShift m) ((b.colShift k).colShift m) + = .or (a.colShift (k + m)) (b.colShift (k + m)) + rw [Expr.colShift_add k m a, Expr.colShift_add k m b] + | k, m, .not a => by + show Expr.not ((a.colShift k).colShift m) = .not (a.colShift (k + m)) + rw [Expr.colShift_add k m a] + | k, m, .ifThen c t e => by + show Expr.ifThen ((c.colShift k).colShift m) ((t.colShift k).colShift m) + ((e.colShift k).colShift m) + = .ifThen (c.colShift (k + m)) (t.colShift (k + m)) (e.colShift (k + m)) + rw [Expr.colShift_add k m c, Expr.colShift_add k m t, + Expr.colShift_add k m e] + | k, m, .andN args => by + show Expr.andN (Expr.argsColShift m (Expr.argsColShift k args)) + = .andN 
(Expr.argsColShift (k + m) args) + rw [Expr.argsColShift_add k m args] + | k, m, .orN args => by + show Expr.orN (Expr.argsColShift m (Expr.argsColShift k args)) + = .orN (Expr.argsColShift (k + m) args) + rw [Expr.argsColShift_add k m args] + | k, m, .coalesce args => by + show Expr.coalesce (Expr.argsColShift m (Expr.argsColShift k args)) + = .coalesce (Expr.argsColShift (k + m) args) + rw [Expr.argsColShift_add k m args] + | k, m, .plus a b => by + show Expr.plus ((a.colShift k).colShift m) ((b.colShift k).colShift m) + = .plus (a.colShift (k + m)) (b.colShift (k + m)) + rw [Expr.colShift_add k m a, Expr.colShift_add k m b] + | k, m, .minus a b => by + show Expr.minus ((a.colShift k).colShift m) ((b.colShift k).colShift m) + = .minus (a.colShift (k + m)) (b.colShift (k + m)) + rw [Expr.colShift_add k m a, Expr.colShift_add k m b] + | k, m, .times a b => by + show Expr.times ((a.colShift k).colShift m) ((b.colShift k).colShift m) + = .times (a.colShift (k + m)) (b.colShift (k + m)) + rw [Expr.colShift_add k m a, Expr.colShift_add k m b] + | k, m, .divide a b => by + show Expr.divide ((a.colShift k).colShift m) ((b.colShift k).colShift m) + = .divide (a.colShift (k + m)) (b.colShift (k + m)) + rw [Expr.colShift_add k m a, Expr.colShift_add k m b] + | k, m, .eq a b => by + show Expr.eq ((a.colShift k).colShift m) ((b.colShift k).colShift m) + = .eq (a.colShift (k + m)) (b.colShift (k + m)) + rw [Expr.colShift_add k m a, Expr.colShift_add k m b] + | k, m, .lt a b => by + show Expr.lt ((a.colShift k).colShift m) ((b.colShift k).colShift m) + = .lt (a.colShift (k + m)) (b.colShift (k + m)) + rw [Expr.colShift_add k m a, Expr.colShift_add k m b] + +theorem Expr.argsColShift_add : + ∀ (k m : Nat) (args : List Expr), + Expr.argsColShift m (Expr.argsColShift k args) + = Expr.argsColShift (k + m) args + | _, _, [] => rfl + | k, m, e :: rest => by + show (e.colShift k).colShift m :: Expr.argsColShift m (Expr.argsColShift k rest) + = e.colShift (k + m) :: Expr.argsColShift 
(k + m) rest + rw [Expr.colShift_add k m e, Expr.argsColShift_add k m rest] +end + end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 0146a4dbe4e31..589afa7e8e22b 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -32,6 +32,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `Expr.colReferencesBoundedBy n e` returns `true` when every `col i` in `e` has `i < n`. Mutually defined with `Expr.argsColRefBoundedBy` so structural recursion handles the nested-list constructors. Headline theorem `eval_append_left_of_bounded`: when a predicate's column references are bounded by `l.length`, evaluating against `l ++ r` agrees with evaluating against `l` alone — the foundation for pushing a single-side filter through a join's `cross` (where the joined env is `l ++ r`). `Expr.colShift k e` adds `k` to every column reference, leaving other constructors structurally intact. Right-side analogue of the bounded analyzer: it realigns a predicate written against the right schema with the joined env `l ++ r` (where the right side starts at index `l.length`). Headline `eval_append_right_shift`: `eval (l ++ r) (e.colShift l.length) = eval r e`. `Expr.colReferencesBoundedBy_mono` (mutual with `Expr.argsColRefBoundedBy_mono`) lifts a tight bound to a coarser one — useful when a predicate's natural bound is a single relation's width but the proof site needs the joined-env width. Convenience `eval_append_left_of_bounded_at` removes the requirement that the predicate's bound match `l.length` exactly: any `n ≤ l.length` suffices. + `Expr.colShift_zero` (identity at `k = 0`) and `Expr.colShift_add` (`(e.colShift k).colShift m = e.colShift (k + m)`) give the shift its monoid laws. Useful for nested joins where each layer adds its own offset to the predicate's column references. 
Supporting environment lemmas: `Env.get_append_left` (read from prefix) and `Env.get_append_right` (read from suffix with index shift). * `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `-`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity / negation laws that downstream operators must respect. Negation laws (`neg_error`, `neg_val`, `neg_neg_val`, `val_add_neg_val`) carry the principle that `.error` is unrecoverable — a collection-scoped error cannot be subtracted away. The `_int` specializations (`add_comm_int`, `add_assoc_int`, `mul_assoc_int`, `mul_comm_int`, `mul_add_int`, `neg_neg_int`, `val_add_neg_val_int`) discharge the base hypotheses at `Int` so downstream code in `Mz/Join.lean`, `Mz/UnifiedConsolidate.lean`, and `Mz/SetOps.lean` can cite the named laws directly. * `Mz/UnifiedStream.lean`: unified diff-aware alternative to `BagStream`. `UnifiedStream := List (UnifiedRow × DiffWithError Int)` pairs a carrier (data row or row-scoped err) with a differential-dataflow diff augmented by the absorbing `error` element. Row-scoped errors flow through the carrier; collection-scoped errors flow through diff multiplication / addition. `ofBag` / `split` conversions assign every bag record a diff of `.val 1`; the round-trip theorem `split (ofBag s) = s` holds. The cross-direction is exact only up to multiset equality on `List EvalError` and is lossy for diffs ≠ `.val 1` (split drops diff information). 
From 775194b02661af009e36ff89157ff35ff4afd843 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 15:30:09 +0200 Subject: [PATCH 057/127] doc/semantics: operator distributivity over unionAll MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds four distributivity theorems showing that `flatMap`- and `map`-based operators distribute over `unionAll` (which is `++`): * `filter_unionAll` — predicate filter. * `cross_unionAll_left` — cross-product with a fixed right input. * `project_unionAll` — projection. * `negate_unionAll` — diff negation. Each follows from `List.flatMap_append` / `List.map_append`. They let the optimizer pull a `UNION ALL` tail out of a pipeline and plan per-branch independently. A `cross_unionAll_right` theorem (cross with `unionAll` on the right) is intentionally absent: that direction is *not* a list equality — the `flatMap` over `l` interleaves outputs from `r1` and `r2` for each record of `l`, while concatenating the two crosses produces them in a different order (all `r1` outputs first, then all `r2` outputs). It holds only up to multiset equality. Side change: `combineCarrier` in `Mz/Join.lean` is no longer `private` so future join-pushdown work can refer to it directly. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/Join.lean | 6 ++-- doc/developer/semantics/Mz/SetOps.lean | 42 ++++++++++++++++++++++++++ doc/developer/semantics/README.md | 1 + 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/doc/developer/semantics/Mz/Join.lean b/doc/developer/semantics/Mz/Join.lean index 846ceedebbfe7..b4b2d882f240d 100644 --- a/doc/developer/semantics/Mz/Join.lean +++ b/doc/developer/semantics/Mz/Join.lean @@ -27,8 +27,10 @@ would lift to this with a column-substitution layer. namespace Mz -/-- Combine two unified carriers, with left winning on err conflict. -/ -@[inline] private def combineCarrier : UnifiedRow → UnifiedRow → UnifiedRow +/-- Combine two unified carriers, with left winning on err conflict. 
+Exposed (not `private`) so the join-pushdown theorems in +`Mz/JoinPushdown.lean` can refer to it directly. -/ +@[inline] def combineCarrier : UnifiedRow → UnifiedRow → UnifiedRow | .row la, .row rb => .row (la ++ rb) | .err e, _ => .err e | _, .err e => .err e diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 5faed5756a55a..5ac48d3e56006 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -1,6 +1,7 @@ import Mz.UnifiedStream import Mz.UnifiedConsolidate import Mz.DiffSemiring +import Mz.Join /-! # Set operations on `UnifiedStream` @@ -499,4 +500,45 @@ theorem UnifiedStream.distinct_only_one_or_error (us : UnifiedStream) : x.2 = DiffWithError.val 1 ∨ x.2 = DiffWithError.error := UnifiedStream.clampToOne_only_one_or_error _ +/-! ## Distributivity over `unionAll` + +`unionAll` is concatenation, so any operator built as a `flatMap` +(`filter`, `cross`-on-left, `project`) distributes over it via the +generic `List.flatMap_append` law. The distributivity laws below +let the optimizer rearrange a pipeline whose tail is a `UNION ALL` +into per-branch pipelines that can be planned independently. 
-/ + +theorem UnifiedStream.filter_unionAll (p : Expr) (a b : UnifiedStream) : + UnifiedStream.filter p (UnifiedStream.unionAll a b) + = UnifiedStream.unionAll + (UnifiedStream.filter p a) + (UnifiedStream.filter p b) := by + show (a ++ b).flatMap _ = a.flatMap _ ++ b.flatMap _ + exact List.flatMap_append + +theorem UnifiedStream.cross_unionAll_left (a b r : UnifiedStream) : + UnifiedStream.cross (UnifiedStream.unionAll a b) r + = UnifiedStream.unionAll + (UnifiedStream.cross a r) + (UnifiedStream.cross b r) := by + show (a ++ b).flatMap _ = a.flatMap _ ++ b.flatMap _ + exact List.flatMap_append + +theorem UnifiedStream.project_unionAll (es : List Expr) (a b : UnifiedStream) : + UnifiedStream.project es (UnifiedStream.unionAll a b) + = UnifiedStream.unionAll + (UnifiedStream.project es a) + (UnifiedStream.project es b) := by + show (a ++ b).flatMap _ = a.flatMap _ ++ b.flatMap _ + exact List.flatMap_append + +/-- `negate` distributes over `unionAll` via `List.map_append`. -/ +theorem UnifiedStream.negate_unionAll (a b : UnifiedStream) : + UnifiedStream.negate (UnifiedStream.unionAll a b) + = UnifiedStream.unionAll + (UnifiedStream.negate a) + (UnifiedStream.negate b) := by + show (a ++ b).map _ = a.map _ ++ b.map _ + exact List.map_append + end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 589afa7e8e22b..83c8d353b009d 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -53,6 +53,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `bagExceptAll = clampPositive ∘ exceptAll` realizes the bag-semantics `EXCEPT ALL` — the signed-diff result is post-processed to drop non-positive multiplicities, producing `max(L - R, 0)` per carrier. Theorems lift the signed flavor: `bagExceptAll_length_le`, `bagExceptAll_preserves_error_diff_left`/`_right`, `bagExceptAll_only_positive`. 
`clampToOne` collapses surviving multiplicities to one: `.val n > 0` becomes `.val 1`, non-positive `.val` is dropped, `.error` survives. Defined by structural recursion on the list. Theorems: `clampToOne_length_le`, `clampToOne_preserves_error_diff`, `clampToOne_only_one_or_error` (every output diff is `.val 1` or `.error`). `distinct = clampToOne ∘ consolidate` realizes SQL `DISTINCT`: each distinct carrier appears at most once with multiplicity one (or `.error` if a collection-scoped error existed). Theorems: `distinct_length_le`, `distinct_preserves_error_diff`, `distinct_only_one_or_error`. + Distributivity over `unionAll`: `filter_unionAll`, `cross_unionAll_left`, `project_unionAll`, `negate_unionAll`. Each follows from `List.flatMap_append` / `List.map_append` — `flatMap`- and `map`-based operators distribute over concatenation. Lets the optimizer pull a `UNION ALL` tail out of a pipeline and plan per-branch. `INTERSECT ALL` requires a per-carrier `min` combinator not yet exposed by `DiffWithError`. Deferred. * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. From 6986044c1194fc3eda534167287c38d34a1dbf63 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 15:33:06 +0200 Subject: [PATCH 058/127] doc/semantics: negation distributes through multiplication MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `neg_mul` and `mul_neg` on `DiffWithError`: negation distributes through multiplication on both sides. Mirrors the ring law `(-a) * b = -(a * b) = a * (-b)`, lifted through the `.error` absorber. `_int` specializations discharge the base hypothesis via `Int.neg_mul` / `Int.mul_neg`. Application: `UnifiedStream.cross_negate_left` — negating the left input of a cross product equals negating the cross output. 
`combineCarrier` is unchanged by negation (only diffs flip), so the proof reduces to the per-r-record diff arithmetic `(-d) * rd.2 = -(d * rd.2)`, discharged by `neg_mul_int`. Useful for reasoning about the signed-diff `exceptAll`: the operator's `negate r` argument can be moved through subsequent cross products without changing semantics. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/DiffSemiring.lean | 40 ++++++++++++++++ doc/developer/semantics/Mz/SetOps.lean | 48 ++++++++++++++++++++ doc/developer/semantics/README.md | 3 +- 3 files changed, 90 insertions(+), 1 deletion(-) diff --git a/doc/developer/semantics/Mz/DiffSemiring.lean b/doc/developer/semantics/Mz/DiffSemiring.lean index 773021f6ade8c..79c29b244c148 100644 --- a/doc/developer/semantics/Mz/DiffSemiring.lean +++ b/doc/developer/semantics/Mz/DiffSemiring.lean @@ -251,6 +251,40 @@ theorem val_add_neg_val [Add α] [Neg α] [Zero α] show (val (x + -x) : DiffWithError α) = val 0 rw [h] +/-- Negation distributes over multiplication on the left, lifted +from the base. Used by `Mz/SetOps.lean` to reason about negating +one side of a cross product. -/ +theorem neg_mul [Mul α] [Neg α] + (h : ∀ x y : α, (-x) * y = -(x * y)) + (a b : DiffWithError α) : (-a) * b = -(a * b) := by + cases a with + | val x => + cases b with + | val y => + show (val ((-x) * y) : DiffWithError α) = val (-(x * y)) + rw [h] + | error => rfl + | error => + cases b with + | val _ => rfl + | error => rfl + +/-- Negation distributes over multiplication on the right. -/ +theorem mul_neg [Mul α] [Neg α] + (h : ∀ x y : α, x * (-y) = -(x * y)) + (a b : DiffWithError α) : a * (-b) = -(a * b) := by + cases a with + | val x => + cases b with + | val y => + show (val (x * (-y)) : DiffWithError α) = val (-(x * y)) + rw [h] + | error => rfl + | error => + cases b with + | val _ => rfl + | error => rfl + /-! 
## Int specializations The diff-aware operators in `Mz/UnifiedStream.lean` and @@ -284,6 +318,12 @@ theorem val_add_neg_val_int (x : Int) : (val x : DiffWithError Int) + -val x = 0 := val_add_neg_val (fun x => by omega) x +theorem neg_mul_int (a b : DiffWithError Int) : (-a) * b = -(a * b) := + neg_mul (fun x y => Int.neg_mul x y) a b + +theorem mul_neg_int (a b : DiffWithError Int) : a * (-b) = -(a * b) := + mul_neg (fun x y => Int.mul_neg x y) a b + end DiffWithError end Mz diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 5ac48d3e56006..8a630d4bd6482 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -541,4 +541,52 @@ theorem UnifiedStream.negate_unionAll (a b : UnifiedStream) : show (a ++ b).map _ = a.map _ ++ b.map _ exact List.map_append +/-- Negating one side of a cross product is the same as negating +the cross product. The diff-semiring law `(-a) * b = -(a * b)` +carries the proof through `combineCarrier` (carrier is unchanged +by negation) and the diff arithmetic. 
-/ +theorem UnifiedStream.cross_negate_left (l r : UnifiedStream) : + UnifiedStream.cross (UnifiedStream.negate l) r + = UnifiedStream.negate (UnifiedStream.cross l r) := by + induction l with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + -- LHS: cross ((uc, -d) :: negate tl) r + -- RHS: negate (r.map (fun rd => (combineCarrier uc rd.1, d * rd.2)) ++ cross tl r) + show UnifiedStream.cross ((uc, -d) :: UnifiedStream.negate tl) r + = UnifiedStream.negate + ((r.map (fun rd => (combineCarrier uc rd.1, d * rd.2))) + ++ UnifiedStream.cross tl r) + -- LHS reduces to (r.map (fun rd => (combineCarrier uc rd.1, (-d) * rd.2))) + -- ++ cross (negate tl) r + show (r.map (fun rd => (combineCarrier uc rd.1, (-d) * rd.2))) + ++ UnifiedStream.cross (UnifiedStream.negate tl) r + = UnifiedStream.negate + ((r.map (fun rd => (combineCarrier uc rd.1, d * rd.2))) + ++ UnifiedStream.cross tl r) + -- RHS: negate distributes over ++. + rw [show UnifiedStream.negate + ((r.map (fun rd => (combineCarrier uc rd.1, d * rd.2))) + ++ UnifiedStream.cross tl r) + = UnifiedStream.negate + (r.map (fun rd => (combineCarrier uc rd.1, d * rd.2))) + ++ UnifiedStream.negate (UnifiedStream.cross tl r) + from UnifiedStream.negate_unionAll _ _] + rw [← ih] + -- Reduce to per-r-record equality. The two r.map terms must agree. 
+ congr 1 + -- negate (r.map (fun rd => (combineCarrier uc rd.1, d * rd.2))) + -- = r.map (fun rd => (combineCarrier uc rd.1, -(d * rd.2))) + -- = r.map (fun rd => (combineCarrier uc rd.1, (-d) * rd.2)) + show r.map (fun rd => (combineCarrier uc rd.1, (-d) * rd.2)) + = (r.map (fun rd => (combineCarrier uc rd.1, d * rd.2))).map + (fun ud => (ud.1, -ud.2)) + rw [List.map_map] + apply List.map_congr_left + intro rd _ + show (combineCarrier uc rd.1, (-d) * rd.2) + = (combineCarrier uc rd.1, -(d * rd.2)) + rw [DiffWithError.neg_mul_int] + end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 83c8d353b009d..91f2b35d0ee81 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -34,7 +34,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `Expr.colReferencesBoundedBy_mono` (mutual with `Expr.argsColRefBoundedBy_mono`) lifts a tight bound to a coarser one — useful when a predicate's natural bound is a single relation's width but the proof site needs the joined-env width. Convenience `eval_append_left_of_bounded_at` removes the requirement that the predicate's bound match `l.length` exactly: any `n ≤ l.length` suffices. `Expr.colShift_zero` (identity at `k = 0`) and `Expr.colShift_add` (`(e.colShift k).colShift m = e.colShift (k + m)`) give the shift its monoid laws. Useful for nested joins where each layer adds its own offset to the predicate's column references. Supporting environment lemmas: `Env.get_append_left` (read from prefix) and `Env.get_append_right` (read from suffix with index shift). -* `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `-`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity / negation laws that downstream operators must respect. 
Negation laws (`neg_error`, `neg_val`, `neg_neg_val`, `val_add_neg_val`) carry the principle that `.error` is unrecoverable — a collection-scoped error cannot be subtracted away. The `_int` specializations (`add_comm_int`, `add_assoc_int`, `mul_assoc_int`, `mul_comm_int`, `mul_add_int`, `neg_neg_int`, `val_add_neg_val_int`) discharge the base hypotheses at `Int` so downstream code in `Mz/Join.lean`, `Mz/UnifiedConsolidate.lean`, and `Mz/SetOps.lean` can cite the named laws directly. +* `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `-`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity / negation laws that downstream operators must respect. Negation laws (`neg_error`, `neg_val`, `neg_neg_val`, `val_add_neg_val`, `neg_mul`, `mul_neg`) carry the principle that `.error` is unrecoverable — a collection-scoped error cannot be subtracted away, and that negation distributes through multiplication on both sides. The `_int` specializations (`add_comm_int`, `add_assoc_int`, `mul_assoc_int`, `mul_comm_int`, `mul_add_int`, `neg_neg_int`, `val_add_neg_val_int`, `neg_mul_int`, `mul_neg_int`) discharge the base hypotheses at `Int` so downstream code in `Mz/Join.lean`, `Mz/UnifiedConsolidate.lean`, and `Mz/SetOps.lean` can cite the named laws directly. * `Mz/UnifiedStream.lean`: unified diff-aware alternative to `BagStream`. `UnifiedStream := List (UnifiedRow × DiffWithError Int)` pairs a carrier (data row or row-scoped err) with a differential-dataflow diff augmented by the absorbing `error` element. Row-scoped errors flow through the carrier; collection-scoped errors flow through diff multiplication / addition. `ofBag` / `split` conversions assign every bag record a diff of `.val 1`; the round-trip theorem `split (ofBag s) = s` holds. 
The cross-direction is exact only up to multiset equality on `List EvalError` and is lossy for diffs ≠ `.val 1` (split drops diff information). `UnifiedStream.project` lifts `BagStream.project` to the diff-aware carrier. Records with `.error` diff or `.err` carrier pass through unchanged; a `.row r` record with `.val n` diff is evaluated against `es` — if every scalar succeeds, the row is emitted with diff `.val n`; if any scalar errs, one `(.err e, .val n)` is emitted per erroring scalar (each preserving the original multiplicity). Theorems: `project_preserves_error_diff` (an `.error` diff in the input always reaches the output), `project_no_error` (all-`.val` inputs yield all-`.val` outputs), `project_nil_es` (empty projection list collapses every row to width-zero), `project_nil_stream` (empty stream is empty). * `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. `aggTry` for the proposed `try_sum`/`try_min`/`try_max` variants that swallow `err` into `NULL` instead of propagating, defined as a post-pass on `aggStrict`. Theorems: `aggStrict_err` (any `err` input → `err` output), `aggStrict_no_err` (no-err inputs + no-err reducer → no-err output), `aggTry_no_err` (the non-strict variant never errors), and `aggTry_eq_aggStrict_of_no_err` (strict and non-strict agree on error-free inputs). @@ -54,6 +54,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `clampToOne` collapses surviving multiplicities to one: `.val n > 0` becomes `.val 1`, non-positive `.val` is dropped, `.error` survives. Defined by structural recursion on the list. Theorems: `clampToOne_length_le`, `clampToOne_preserves_error_diff`, `clampToOne_only_one_or_error` (every output diff is `.val 1` or `.error`). 
`distinct = clampToOne ∘ consolidate` realizes SQL `DISTINCT`: each distinct carrier appears at most once with multiplicity one (or `.error` if a collection-scoped error existed). Theorems: `distinct_length_le`, `distinct_preserves_error_diff`, `distinct_only_one_or_error`. Distributivity over `unionAll`: `filter_unionAll`, `cross_unionAll_left`, `project_unionAll`, `negate_unionAll`. Each follows from `List.flatMap_append` / `List.map_append` — `flatMap`- and `map`-based operators distribute over concatenation. Lets the optimizer pull a `UNION ALL` tail out of a pipeline and plan per-branch. + `cross_negate_left` proves that negating the left input of a cross product equals negating the cross output: the diff-semiring law `(-a) * b = -(a * b)` (`DiffWithError.neg_mul_int`) carries the proof since `combineCarrier` is unchanged by negation and only the diff arithmetic flips. `INTERSECT ALL` requires a per-carrier `min` combinator not yet exposed by `DiffWithError`. Deferred. * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. From 212d76f23f0aa6437b3c266a856d18ccf0612708 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 15:38:16 +0200 Subject: [PATCH 059/127] doc/semantics: negate commutes with consolidate Adds the additive negation law `neg_add` on `DiffWithError` and its `Int` specialization. With it, negation slides through the per-bucket sums of `UnifiedStream.consolidate`: * `negate_consolidateInto` (private): negating after a single-key insert equals inserting the negated diff into the negated list. * `negate_consolidate`: `negate (consolidate us) = consolidate (negate us)`. Side change: `consolidateInto` in `Mz/UnifiedConsolidate.lean` is no longer `private` so `Mz/SetOps.lean` can state the per-step helper directly. 
Useful for the signed-diff `exceptAll`: negation can be moved past consolidation without changing semantics, simplifying algebraic reasoning about set operations. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/DiffSemiring.lean | 19 +++++ doc/developer/semantics/Mz/SetOps.lean | 70 +++++++++++++++++++ .../semantics/Mz/UnifiedConsolidate.lean | 5 +- doc/developer/semantics/README.md | 1 + 4 files changed, 93 insertions(+), 2 deletions(-) diff --git a/doc/developer/semantics/Mz/DiffSemiring.lean b/doc/developer/semantics/Mz/DiffSemiring.lean index 79c29b244c148..56ce847cbc3f1 100644 --- a/doc/developer/semantics/Mz/DiffSemiring.lean +++ b/doc/developer/semantics/Mz/DiffSemiring.lean @@ -251,6 +251,22 @@ theorem val_add_neg_val [Add α] [Neg α] [Zero α] show (val (x + -x) : DiffWithError α) = val 0 rw [h] +/-- Negation distributes over addition, lifted from the base. -/ +theorem neg_add [Add α] [Neg α] + (h : ∀ x y : α, -(x + y) = -x + -y) + (a b : DiffWithError α) : -(a + b) = -a + -b := by + cases a with + | val x => + cases b with + | val y => + show (val (-(x + y)) : DiffWithError α) = val (-x + -y) + rw [h] + | error => rfl + | error => + cases b with + | val _ => rfl + | error => rfl + /-- Negation distributes over multiplication on the left, lifted from the base. Used by `Mz/SetOps.lean` to reason about negating one side of a cross product. 
-/ @@ -318,6 +334,9 @@ theorem val_add_neg_val_int (x : Int) : (val x : DiffWithError Int) + -val x = 0 := val_add_neg_val (fun x => by omega) x +theorem neg_add_int (a b : DiffWithError Int) : -(a + b) = -a + -b := + neg_add (fun _ _ => by omega) a b + theorem neg_mul_int (a b : DiffWithError Int) : (-a) * b = -(a * b) := neg_mul (fun x y => Int.neg_mul x y) a b diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 8a630d4bd6482..ad870282de4fa 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -541,6 +541,76 @@ theorem UnifiedStream.negate_unionAll (a b : UnifiedStream) : show (a ++ b).map _ = a.map _ ++ b.map _ exact List.map_append +/-- `negate` commutes with `consolidateInto`: inserting a negated +diff into a negated bucket list gives the same result as inserting +the positive diff and negating the whole list. -/ +private theorem negate_consolidateInto + (uc : UnifiedRow) (d : DiffWithError Int) (xs : UnifiedStream) : + UnifiedStream.negate (consolidateInto uc d xs) + = consolidateInto uc (-d) (UnifiedStream.negate xs) := by + induction xs with + | nil => + show UnifiedStream.negate [(uc, d)] + = [(uc, -d)] + rfl + | cons hd tl ih => + obtain ⟨uc', d'⟩ := hd + by_cases hEq : uc = uc' + · subst hEq + -- consolidateInto uc d ((uc, d') :: tl) = (uc, d + d') :: tl + have hLhs : consolidateInto uc d ((uc, d') :: tl) = (uc, d + d') :: tl := by + show (if uc = uc then (uc, d + d') :: tl + else (uc, d') :: consolidateInto uc d tl) + = (uc, d + d') :: tl + rw [if_pos rfl] + have hRhs : consolidateInto uc (-d) ((uc, -d') :: UnifiedStream.negate tl) + = (uc, -d + -d') :: UnifiedStream.negate tl := by + show (if uc = uc then (uc, -d + -d') :: UnifiedStream.negate tl + else (uc, -d') :: consolidateInto uc (-d) (UnifiedStream.negate tl)) + = (uc, -d + -d') :: UnifiedStream.negate tl + rw [if_pos rfl] + rw [hLhs] + show UnifiedStream.negate ((uc, d + d') :: tl) + = consolidateInto uc 
(-d) (UnifiedStream.negate ((uc, d') :: tl)) + show ((uc, -(d + d')) :: UnifiedStream.negate tl) + = consolidateInto uc (-d) ((uc, -d') :: UnifiedStream.negate tl) + rw [hRhs, DiffWithError.neg_add_int] + · have hLhs : consolidateInto uc d ((uc', d') :: tl) + = (uc', d') :: consolidateInto uc d tl := by + show (if uc = uc' then (uc', d + d') :: tl + else (uc', d') :: consolidateInto uc d tl) + = (uc', d') :: consolidateInto uc d tl + rw [if_neg hEq] + have hRhs : consolidateInto uc (-d) ((uc', -d') :: UnifiedStream.negate tl) + = (uc', -d') :: consolidateInto uc (-d) (UnifiedStream.negate tl) := by + show (if uc = uc' then (uc', -d + -d') :: UnifiedStream.negate tl + else (uc', -d') :: consolidateInto uc (-d) (UnifiedStream.negate tl)) + = (uc', -d') :: consolidateInto uc (-d) (UnifiedStream.negate tl) + rw [if_neg hEq] + rw [hLhs] + show UnifiedStream.negate ((uc', d') :: consolidateInto uc d tl) + = consolidateInto uc (-d) (UnifiedStream.negate ((uc', d') :: tl)) + show ((uc', -d') :: UnifiedStream.negate (consolidateInto uc d tl)) + = consolidateInto uc (-d) ((uc', -d') :: UnifiedStream.negate tl) + rw [hRhs, ih] + +/-- `negate` commutes with `consolidate`: consolidating then +negating equals negating then consolidating. Negation is additive, +so it slides through the per-bucket sums. 
-/ +theorem UnifiedStream.negate_consolidate (us : UnifiedStream) : + UnifiedStream.negate (UnifiedStream.consolidate us) + = UnifiedStream.consolidate (UnifiedStream.negate us) := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + show UnifiedStream.negate (consolidateInto uc d (UnifiedStream.consolidate tl)) + = UnifiedStream.consolidate ((uc, -d) :: UnifiedStream.negate tl) + rw [negate_consolidateInto] + show consolidateInto uc (-d) (UnifiedStream.negate (UnifiedStream.consolidate tl)) + = consolidateInto uc (-d) (UnifiedStream.consolidate (UnifiedStream.negate tl)) + rw [ih] + /-- Negating one side of a cross product is the same as negating the cross product. The diff-semiring law `(-a) * b = -(a * b)` carries the proof through `combineCarrier` (carrier is unchanged diff --git a/doc/developer/semantics/Mz/UnifiedConsolidate.lean b/doc/developer/semantics/Mz/UnifiedConsolidate.lean index 2f5d053f29890..ed84af9bc15aa 100644 --- a/doc/developer/semantics/Mz/UnifiedConsolidate.lean +++ b/doc/developer/semantics/Mz/UnifiedConsolidate.lean @@ -27,8 +27,9 @@ namespace Mz /-- Insert `(uc, d)` into a consolidated stream. If a record with the same carrier already exists, add `d` to its diff. Otherwise -append a new record at the end of the list. -/ -private def consolidateInto (uc : UnifiedRow) (d : DiffWithError Int) : +append a new record at the end of the list. Exposed (not +`private`) so downstream files can state laws about it. 
-/ +def consolidateInto (uc : UnifiedRow) (d : DiffWithError Int) : UnifiedStream → UnifiedStream | [] => [(uc, d)] | (uc', d') :: rest => diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 91f2b35d0ee81..3cf538b1b8299 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -55,6 +55,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `distinct = clampToOne ∘ consolidate` realizes SQL `DISTINCT`: each distinct carrier appears at most once with multiplicity one (or `.error` if a collection-scoped error existed). Theorems: `distinct_length_le`, `distinct_preserves_error_diff`, `distinct_only_one_or_error`. Distributivity over `unionAll`: `filter_unionAll`, `cross_unionAll_left`, `project_unionAll`, `negate_unionAll`. Each follows from `List.flatMap_append` / `List.map_append` — `flatMap`- and `map`-based operators distribute over concatenation. Lets the optimizer pull a `UNION ALL` tail out of a pipeline and plan per-branch. `cross_negate_left` proves that negating the left input of a cross product equals negating the cross output: the diff-semiring law `(-a) * b = -(a * b)` (`DiffWithError.neg_mul_int`) carries the proof since `combineCarrier` is unchanged by negation and only the diff arithmetic flips. + `negate_consolidate` proves that negation commutes with consolidation: `negate (consolidate us) = consolidate (negate us)`. The proof recurses via `negate_consolidateInto` (private), which lifts the same property to the single-step insertion. Negation is additive (`neg_add`), so it slides through per-bucket sums. `INTERSECT ALL` requires a per-carrier `min` combinator not yet exposed by `DiffWithError`. Deferred. * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. 
From 105626911dad0ce51125ec1b882e0bbf08db9174 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 15:40:08 +0200 Subject: [PATCH 060/127] doc/semantics: clampPositive and clampToOne idempotence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `clampPositive_idem` and `clampToOne_idem`. `clampPositive_idem` falls out of `List.filter_filter` plus `Bool.and_self`: filtering twice by `isPositiveDiff` is filtering once by `isPositiveDiff ud && isPositiveDiff ud`, which `Bool.and_self` collapses back to `isPositiveDiff ud`. `clampToOne_idem` is structural: the first pass leaves every surviving `.val` as `.val 1`, which satisfies `0 < 1`, so the second pass preserves it. `.error` records are unchanged across both passes. Inductive proof on the list, split on the head's diff shape. Useful for downstream pipelines: a normalize step can be applied defensively without changing output. Future `distinct` idempotence (`distinct ∘ distinct = distinct`) chains these with consolidation idempotence, which is non-trivial because consolidation may reorder distinct carriers. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 39 ++++++++++++++++++++++++++ doc/developer/semantics/README.md | 4 +-- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index ad870282de4fa..7d79e51632ba4 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -309,6 +309,16 @@ theorem UnifiedStream.clampPositive_preserves_error_diff show isPositiveDiff DiffWithError.error = true rfl +/-- `clampPositive` is idempotent. Filter twice = filter once.
-/ +theorem UnifiedStream.clampPositive_idem (us : UnifiedStream) : + UnifiedStream.clampPositive (UnifiedStream.clampPositive us) + = UnifiedStream.clampPositive us := by + unfold UnifiedStream.clampPositive + rw [List.filter_filter] + congr 1 + funext ud + exact Bool.and_self _ + /-- The output of `clampPositive` never contains a `.val n` with `n ≤ 0`. Equivalently, every surviving `.val` diff is strictly positive. -/ @@ -442,6 +452,35 @@ theorem UnifiedStream.clampToOne_preserves_error_diff · exact List.mem_cons_of_mem _ (ih hTail) · exact ih hTail +/-- `clampToOne` is idempotent. After one pass every `.val` is +`.val 1` and every other diff is `.error`; the second pass +preserves both. -/ +theorem UnifiedStream.clampToOne_idem (us : UnifiedStream) : + UnifiedStream.clampToOne (UnifiedStream.clampToOne us) + = UnifiedStream.clampToOne us := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + cases d with + | error => + show UnifiedStream.clampToOne + ((uc, DiffWithError.error) :: UnifiedStream.clampToOne tl) + = (uc, DiffWithError.error) :: UnifiedStream.clampToOne tl + simp only [UnifiedStream.clampToOne] + rw [ih] + | val n => + simp only [UnifiedStream.clampToOne] + split + · rename_i hPos + show UnifiedStream.clampToOne + ((uc, DiffWithError.val 1) :: UnifiedStream.clampToOne tl) + = (uc, DiffWithError.val 1) :: UnifiedStream.clampToOne tl + simp only [UnifiedStream.clampToOne] + rw [if_pos (by decide : (0 : Int) < 1)] + rw [ih] + · exact ih + /-- Every `.val` record in the output of `clampToOne` has multiplicity exactly one. `.error` records pass through unchanged. 
-/ theorem UnifiedStream.clampToOne_only_one_or_error diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 3cf538b1b8299..788a6d27acc83 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -49,9 +49,9 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/SetOps.lean`: set operations on `UnifiedStream`. `unionAll = (++)` concatenates two streams record-wise; theorems cover length (sum), associativity, nil identities, and error / no-error preservation from each input (`unionAll_preserves_error_diff_left`, `unionAll_preserves_error_diff_right`, `unionAll_no_error`). `union = consolidate ∘ unionAll` derives the set-semantics flavor; theorems lift the consolidation guarantees to `union` (`union_length_le`, `union_preserves_error_diff_left`, `union_preserves_error_diff_right`, `union_no_error`). `negate` negates every diff (`.error` absorbs negation, `.val n` becomes `.val (-n)`). Theorems: `negate_length` (length preserved), `negate_negate` (involution), `negate_preserves_error_diff`, `negate_no_error`. `exceptAll l r = consolidate (unionAll l (negate r))` realizes the signed-diff `EXCEPT ALL` (output diffs may be negative, encoding "this carrier has `n` fewer copies in the result than in the input"). Theorems: `exceptAll_length_le` (≤ sum of input lengths), `exceptAll_preserves_error_diff_left`/`exceptAll_preserves_error_diff_right` (errors from either side survive — negation absorbs at `.error`), `exceptAll_no_error`. - `clampPositive` drops records with `.val n` where `n ≤ 0`, keeping `.error` records and records with `.val n > 0`. Theorems: `clampPositive_length_le`, `clampPositive_preserves_error_diff`, `clampPositive_only_positive` (every output `.val` is strictly positive). + `clampPositive` drops records with `.val n` where `n ≤ 0`, keeping `.error` records and records with `.val n > 0`. 
Theorems: `clampPositive_length_le`, `clampPositive_preserves_error_diff`, `clampPositive_only_positive` (every output `.val` is strictly positive), `clampPositive_idem` (idempotent — filter twice = filter once). `bagExceptAll = clampPositive ∘ exceptAll` realizes the bag-semantics `EXCEPT ALL` — the signed-diff result is post-processed to drop non-positive multiplicities, producing `max(L - R, 0)` per carrier. Theorems lift the signed flavor: `bagExceptAll_length_le`, `bagExceptAll_preserves_error_diff_left`/`_right`, `bagExceptAll_only_positive`. - `clampToOne` collapses surviving multiplicities to one: `.val n > 0` becomes `.val 1`, non-positive `.val` is dropped, `.error` survives. Defined by structural recursion on the list. Theorems: `clampToOne_length_le`, `clampToOne_preserves_error_diff`, `clampToOne_only_one_or_error` (every output diff is `.val 1` or `.error`). + `clampToOne` collapses surviving multiplicities to one: `.val n > 0` becomes `.val 1`, non-positive `.val` is dropped, `.error` survives. Defined by structural recursion on the list. Theorems: `clampToOne_length_le`, `clampToOne_preserves_error_diff`, `clampToOne_only_one_or_error` (every output diff is `.val 1` or `.error`), `clampToOne_idem` (idempotent — after one pass every `.val` is `.val 1` so a second pass is a no-op). `distinct = clampToOne ∘ consolidate` realizes SQL `DISTINCT`: each distinct carrier appears at most once with multiplicity one (or `.error` if a collection-scoped error existed). Theorems: `distinct_length_le`, `distinct_preserves_error_diff`, `distinct_only_one_or_error`. Distributivity over `unionAll`: `filter_unionAll`, `cross_unionAll_left`, `project_unionAll`, `negate_unionAll`. Each follows from `List.flatMap_append` / `List.map_append` — `flatMap`- and `map`-based operators distribute over concatenation. Lets the optimizer pull a `UNION ALL` tail out of a pipeline and plan per-branch. 
`cross_negate_left` proves that negating the left input of a cross product equals negating the cross output: the diff-semiring law `(-a) * b = -(a * b)` (`DiffWithError.neg_mul_int`) carries the proof since `combineCarrier` is unchanged by negation and only the diff arithmetic flips. From 221a2798dca1ee38300e5eed2f6d3ecc7ab02727 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 15:41:58 +0200 Subject: [PATCH 061/127] doc/semantics: no_error preservation for set-op pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `no_error` preservation theorems closing the gap in the set-operation pipeline: * `clampPositive_no_error` — filter-based, surviving records keep their original `.val` diffs. * `clampToOne_no_error` — `.val n > 0` becomes `.val 1`, non-positive `.val` dropped, `.error` cannot be introduced. * `bagExceptAll_no_error` — chains `clampPositive_no_error` with `exceptAll_no_error`. * `distinct_no_error` — chains `clampToOne_no_error` with `consolidate_no_error`. Completes the symmetry with `_preserves_error_diff` / `_only_one_or_error` / `_only_positive` theorems already in place. Together these prove that `.error` diffs are the only source of absorbing diffs anywhere in the set-operation pipeline — a key spec invariant. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 63 ++++++++++++++++++++++++++ doc/developer/semantics/README.md | 8 ++-- 2 files changed, 67 insertions(+), 4 deletions(-) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 7d79e51632ba4..1d096d7c33555 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -319,6 +319,17 @@ theorem UnifiedStream.clampPositive_idem (us : UnifiedStream) : funext ud exact Bool.and_self _ +/-- `clampPositive` is filter-based, so no-error preservation is +trivial: surviving records keep their original diffs. 
-/ +theorem UnifiedStream.clampPositive_no_error + (us : UnifiedStream) + (h : ∀ x ∈ us, ∃ n : Int, x.2 = DiffWithError.val n) : + ∀ x ∈ UnifiedStream.clampPositive us, + ∃ n : Int, x.2 = DiffWithError.val n := by + intro x hMem + unfold UnifiedStream.clampPositive at hMem + exact h x (List.mem_filter.mp hMem).1 + /-- The output of `clampPositive` never contains a `.val n` with `n ≤ 0`. Equivalently, every surviving `.val` diff is strictly positive. -/ @@ -383,6 +394,17 @@ theorem UnifiedStream.bagExceptAll_only_positive ∨ x.2 = DiffWithError.error := UnifiedStream.clampPositive_only_positive _ +/-- All-`.val` inputs yield all-`.val` outputs through +`bagExceptAll`. -/ +theorem UnifiedStream.bagExceptAll_no_error + (l r : UnifiedStream) + (hL : ∀ x ∈ l, ∃ n : Int, x.2 = DiffWithError.val n) + (hR : ∀ x ∈ r, ∃ n : Int, x.2 = DiffWithError.val n) : + ∀ x ∈ UnifiedStream.bagExceptAll l r, + ∃ n : Int, x.2 = DiffWithError.val n := + UnifiedStream.clampPositive_no_error _ + (UnifiedStream.exceptAll_no_error l r hL hR) + /-! ## `DISTINCT` Set semantics: collapse multiplicities so each carrier appears at @@ -481,6 +503,36 @@ theorem UnifiedStream.clampToOne_idem (us : UnifiedStream) : rw [ih] · exact ih +/-- All-`.val` inputs yield all-`.val` outputs through `clampToOne`: +no `.error` is introduced (records with non-positive `.val` are +dropped, positive `.val` become `.val 1`). 
-/ +theorem UnifiedStream.clampToOne_no_error + (us : UnifiedStream) + (h : ∀ x ∈ us, ∃ n : Int, x.2 = DiffWithError.val n) : + ∀ x ∈ UnifiedStream.clampToOne us, + ∃ n : Int, x.2 = DiffWithError.val n := by + induction us with + | nil => intro x hMem; exact absurd hMem List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hHd : ∃ n : Int, d = DiffWithError.val n := + h (uc, d) List.mem_cons_self + have hTl : ∀ x ∈ tl, ∃ n : Int, x.2 = DiffWithError.val n := + fun x hMem => h x (List.mem_cons_of_mem _ hMem) + obtain ⟨n, hN⟩ := hHd + cases d with + | error => exact absurd hN (by intro hEq; cases hEq) + | val m => + intro x hMem + have hMem' : x ∈ (if 0 < m + then (uc, DiffWithError.val 1) :: UnifiedStream.clampToOne tl + else UnifiedStream.clampToOne tl) := hMem + split at hMem' + · rcases List.mem_cons.mp hMem' with hHead | hTail + · exact ⟨1, by rw [hHead]⟩ + · exact ih hTl x hTail + · exact ih hTl x hMem' + /-- Every `.val` record in the output of `clampToOne` has multiplicity exactly one. `.error` records pass through unchanged. -/ theorem UnifiedStream.clampToOne_only_one_or_error @@ -539,6 +591,17 @@ theorem UnifiedStream.distinct_only_one_or_error (us : UnifiedStream) : x.2 = DiffWithError.val 1 ∨ x.2 = DiffWithError.error := UnifiedStream.clampToOne_only_one_or_error _ +/-- All-`.val` inputs yield all-`.val` outputs through `distinct`. +Combined with `distinct_only_one_or_error`, the surviving diffs are +all `.val 1`. -/ +theorem UnifiedStream.distinct_no_error + (us : UnifiedStream) + (h : ∀ x ∈ us, ∃ n : Int, x.2 = DiffWithError.val n) : + ∀ x ∈ UnifiedStream.distinct us, + ∃ n : Int, x.2 = DiffWithError.val n := + UnifiedStream.clampToOne_no_error _ + (UnifiedStream.consolidate_no_error _ h) + /-! 
## Distributivity over `unionAll` `unionAll` is concatenation, so any operator built as a `flatMap` diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 788a6d27acc83..bc1bcc032627a 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -49,10 +49,10 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/SetOps.lean`: set operations on `UnifiedStream`. `unionAll = (++)` concatenates two streams record-wise; theorems cover length (sum), associativity, nil identities, and error / no-error preservation from each input (`unionAll_preserves_error_diff_left`, `unionAll_preserves_error_diff_right`, `unionAll_no_error`). `union = consolidate ∘ unionAll` derives the set-semantics flavor; theorems lift the consolidation guarantees to `union` (`union_length_le`, `union_preserves_error_diff_left`, `union_preserves_error_diff_right`, `union_no_error`). `negate` negates every diff (`.error` absorbs negation, `.val n` becomes `.val (-n)`). Theorems: `negate_length` (length preserved), `negate_negate` (involution), `negate_preserves_error_diff`, `negate_no_error`. `exceptAll l r = consolidate (unionAll l (negate r))` realizes the signed-diff `EXCEPT ALL` (output diffs may be negative, encoding "this carrier has `n` fewer copies in the result than in the input"). Theorems: `exceptAll_length_le` (≤ sum of input lengths), `exceptAll_preserves_error_diff_left`/`exceptAll_preserves_error_diff_right` (errors from either side survive — negation absorbs at `.error`), `exceptAll_no_error`. - `clampPositive` drops records with `.val n` where `n ≤ 0`, keeping `.error` records and records with `.val n > 0`. Theorems: `clampPositive_length_le`, `clampPositive_preserves_error_diff`, `clampPositive_only_positive` (every output `.val` is strictly positive), `clampPositive_idem` (idempotent — filter twice = filter once). 
- `bagExceptAll = clampPositive ∘ exceptAll` realizes the bag-semantics `EXCEPT ALL` — the signed-diff result is post-processed to drop non-positive multiplicities, producing `max(L - R, 0)` per carrier. Theorems lift the signed flavor: `bagExceptAll_length_le`, `bagExceptAll_preserves_error_diff_left`/`_right`, `bagExceptAll_only_positive`. - `clampToOne` collapses surviving multiplicities to one: `.val n > 0` becomes `.val 1`, non-positive `.val` is dropped, `.error` survives. Defined by structural recursion on the list. Theorems: `clampToOne_length_le`, `clampToOne_preserves_error_diff`, `clampToOne_only_one_or_error` (every output diff is `.val 1` or `.error`), `clampToOne_idem` (idempotent — after one pass every `.val` is `.val 1` so a second pass is a no-op). - `distinct = clampToOne ∘ consolidate` realizes SQL `DISTINCT`: each distinct carrier appears at most once with multiplicity one (or `.error` if a collection-scoped error existed). Theorems: `distinct_length_le`, `distinct_preserves_error_diff`, `distinct_only_one_or_error`. + `clampPositive` drops records with `.val n` where `n ≤ 0`, keeping `.error` records and records with `.val n > 0`. Theorems: `clampPositive_length_le`, `clampPositive_preserves_error_diff`, `clampPositive_only_positive` (every output `.val` is strictly positive), `clampPositive_no_error`, `clampPositive_idem` (idempotent — filter twice = filter once). + `bagExceptAll = clampPositive ∘ exceptAll` realizes the bag-semantics `EXCEPT ALL` — the signed-diff result is post-processed to drop non-positive multiplicities, producing `max(L - R, 0)` per carrier. Theorems lift the signed flavor: `bagExceptAll_length_le`, `bagExceptAll_preserves_error_diff_left`/`_right`, `bagExceptAll_only_positive`, `bagExceptAll_no_error`. + `clampToOne` collapses surviving multiplicities to one: `.val n > 0` becomes `.val 1`, non-positive `.val` is dropped, `.error` survives. Defined by structural recursion on the list. 
Theorems: `clampToOne_length_le`, `clampToOne_preserves_error_diff`, `clampToOne_no_error`, `clampToOne_only_one_or_error` (every output diff is `.val 1` or `.error`), `clampToOne_idem` (idempotent — after one pass every `.val` is `.val 1` so a second pass is a no-op). + `distinct = clampToOne ∘ consolidate` realizes SQL `DISTINCT`: each distinct carrier appears at most once with multiplicity one (or `.error` if a collection-scoped error existed). Theorems: `distinct_length_le`, `distinct_preserves_error_diff`, `distinct_only_one_or_error`, `distinct_no_error`. Distributivity over `unionAll`: `filter_unionAll`, `cross_unionAll_left`, `project_unionAll`, `negate_unionAll`. Each follows from `List.flatMap_append` / `List.map_append` — `flatMap`- and `map`-based operators distribute over concatenation. Lets the optimizer pull a `UNION ALL` tail out of a pipeline and plan per-branch. `cross_negate_left` proves that negating the left input of a cross product equals negating the cross output: the diff-semiring law `(-a) * b = -(a * b)` (`DiffWithError.neg_mul_int`) carries the proof since `combineCarrier` is unchanged by negation and only the diff arithmetic flips. `negate_consolidate` proves that negation commutes with consolidation: `negate (consolidate us) = consolidate (negate us)`. The proof recurses via `negate_consolidateInto` (private), which lifts the same property to the single-step insertion. Negation is additive (`neg_add`), so it slides through per-bucket sums. From 7385b02f6741b52fa5ee82d7568485d2f7d7c347 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 15:44:57 +0200 Subject: [PATCH 062/127] doc/semantics: identity-element theorems for set operations Adds five `nil_left` / `nil_right` reductions for the set-op pipeline: * `union_nil_left`/`union_nil_right`: `union [] r = consolidate r`, `union l [] = consolidate l`. * `exceptAll_nil_left`: `exceptAll [] r = negate (consolidate r)`. Bridges to `negate_consolidate`. 
* `exceptAll_nil_right`: `exceptAll l [] = consolidate l`. * `bagExceptAll_nil_right`: `bagExceptAll l [] = clampPositive (consolidate l)`. Each follows from the underlying `unionAll_nil_left` / `unionAll_nil_right` plus the relevant semiring law. Useful for optimizer rewrites: a set operation with one empty input collapses to a single-input pipeline. `exceptAll_nil_left` is stated after the negation-commutes lemma (`negate_consolidate`), since its proof depends on it. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 46 ++++++++++++++++++++++++++ doc/developer/semantics/README.md | 4 +-- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 1d096d7c33555..1e08129158171 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -121,6 +121,18 @@ the output. -/ def UnifiedStream.union (l r : UnifiedStream) : UnifiedStream := UnifiedStream.consolidate (UnifiedStream.unionAll l r) +theorem UnifiedStream.union_nil_left (r : UnifiedStream) : + UnifiedStream.union [] r = UnifiedStream.consolidate r := by + show UnifiedStream.consolidate (UnifiedStream.unionAll [] r) + = UnifiedStream.consolidate r + rw [UnifiedStream.unionAll_nil_left] + +theorem UnifiedStream.union_nil_right (l : UnifiedStream) : + UnifiedStream.union l [] = UnifiedStream.consolidate l := by + show UnifiedStream.consolidate (UnifiedStream.unionAll l []) + = UnifiedStream.consolidate l + rw [UnifiedStream.unionAll_nil_right] + theorem UnifiedStream.union_length_le (l r : UnifiedStream) : (UnifiedStream.union l r).length ≤ l.length + r.length := by show (UnifiedStream.consolidate (UnifiedStream.unionAll l r)).length @@ -225,6 +237,19 @@ def UnifiedStream.exceptAll (l r : UnifiedStream) : UnifiedStream := UnifiedStream.consolidate (UnifiedStream.unionAll l (UnifiedStream.negate r)) +-- `exceptAll_nil_left` is proven later, after 
`negate_consolidate`. + +/-- Empty right input: `exceptAll l [] = consolidate l`. The +negation of the empty stream is empty, and `unionAll l [] = l`. -/ +theorem UnifiedStream.exceptAll_nil_right (l : UnifiedStream) : + UnifiedStream.exceptAll l [] = UnifiedStream.consolidate l := by + show UnifiedStream.consolidate + (UnifiedStream.unionAll l (UnifiedStream.negate [])) + = UnifiedStream.consolidate l + show UnifiedStream.consolidate (UnifiedStream.unionAll l []) + = UnifiedStream.consolidate l + rw [UnifiedStream.unionAll_nil_right] + theorem UnifiedStream.exceptAll_length_le (l r : UnifiedStream) : (UnifiedStream.exceptAll l r).length ≤ l.length + r.length := by show (UnifiedStream.consolidate @@ -394,6 +419,16 @@ theorem UnifiedStream.bagExceptAll_only_positive ∨ x.2 = DiffWithError.error := UnifiedStream.clampPositive_only_positive _ +/-- `bagExceptAll l [] = clampPositive (consolidate l)`. Trivial +composition of `exceptAll_nil_right` and `bagExceptAll`'s +definition. -/ +theorem UnifiedStream.bagExceptAll_nil_right (l : UnifiedStream) : + UnifiedStream.bagExceptAll l [] + = UnifiedStream.clampPositive (UnifiedStream.consolidate l) := by + show UnifiedStream.clampPositive (UnifiedStream.exceptAll l []) + = UnifiedStream.clampPositive (UnifiedStream.consolidate l) + rw [UnifiedStream.exceptAll_nil_right] + /-- All-`.val` inputs yield all-`.val` outputs through `bagExceptAll`. -/ theorem UnifiedStream.bagExceptAll_no_error @@ -761,4 +796,15 @@ theorem UnifiedStream.cross_negate_left (l r : UnifiedStream) : = (combineCarrier uc rd.1, -(d * rd.2)) rw [DiffWithError.neg_mul_int] +/-- Empty left input: `exceptAll [] r = negate (consolidate r)`. +Reduces via `unionAll_nil_left` and `negate_consolidate`. 
-/ +theorem UnifiedStream.exceptAll_nil_left (r : UnifiedStream) : + UnifiedStream.exceptAll [] r + = UnifiedStream.negate (UnifiedStream.consolidate r) := by + show UnifiedStream.consolidate + (UnifiedStream.unionAll [] (UnifiedStream.negate r)) + = UnifiedStream.negate (UnifiedStream.consolidate r) + rw [UnifiedStream.unionAll_nil_left] + exact (UnifiedStream.negate_consolidate r).symm + end Mz diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index bc1bcc032627a..3c2d91a6635c9 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -46,9 +46,9 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four *no-error preservation* — `consolidate_no_error` proves that if every input diff is a `.val`, every output diff is a `.val`, so `.error` is the only source of absorption. * `Mz/Triple.lean`: collection-wide and per-time *flat* consolidation views on `TimedUnifiedStream`. `consolidateAll` sums every diff in the stream; `consolidateAtTimeFlat t` sums every diff at time `t`. Both ignore the carrier — they collapse a time slice (or the whole stream) to one `DiffWithError Int`. Absorption: `consolidateAll_eq_error_of_mem` and `consolidateAtTimeFlat_eq_error_of_mem`. Complementary to `Mz/TimedConsolidate.lean`'s `consolidateAtTime t`, which buckets per `(row, time)` and returns a `UnifiedStream`. * `Mz/Join.lean`: relational joins on the diff-aware `UnifiedStream`. `cross` is the cartesian product — carriers combine via `combineCarrier` (rows concatenate; err on either side wins, left first), diffs multiply through `DiffWithError`'s `Mul` instance. A `.error` diff on either input therefore absorbs to `.error` on the output via `DiffWithError.error_mul_{left,right}`. `join pred l r` filters the product through a join predicate. Theorems: `cross_length` (`l.length * r.length`), `filter_length_le` (filter is non-expanding), `join_length_le` (corollary). 
Diff-propagation theorems: `cross_diff_error_{left,right}` (a `.error` diff on either side propagates through every output record), `filter_preserves_error_diff` (a record carrying `.error` diff is never dropped by `filter` — the absorbing marker cannot be filtered away). No-error preservation: `cross_no_error` and `filter_no_error` prove that all-`.val` input diffs yield all-`.val` output diffs, so `.error` is the only source of absorbing diffs in the joint output. Algebraic laws: `combineCarrier_assoc` (carrier combine is associative modulo `List.append_assoc`) and the headline `UnifiedStream.cross_assoc` (`(a × b) × c = a × (b × c)`). The proof rearranges nested `flatMap` / `map` via local list-monad lemmas and closes via `DiffWithError.mul_assoc` plus `combineCarrier_assoc`. -* `Mz/SetOps.lean`: set operations on `UnifiedStream`. `unionAll = (++)` concatenates two streams record-wise; theorems cover length (sum), associativity, nil identities, and error / no-error preservation from each input (`unionAll_preserves_error_diff_left`, `unionAll_preserves_error_diff_right`, `unionAll_no_error`). `union = consolidate ∘ unionAll` derives the set-semantics flavor; theorems lift the consolidation guarantees to `union` (`union_length_le`, `union_preserves_error_diff_left`, `union_preserves_error_diff_right`, `union_no_error`). +* `Mz/SetOps.lean`: set operations on `UnifiedStream`. `unionAll = (++)` concatenates two streams record-wise; theorems cover length (sum), associativity, nil identities, and error / no-error preservation from each input (`unionAll_preserves_error_diff_left`, `unionAll_preserves_error_diff_right`, `unionAll_no_error`). `union = consolidate ∘ unionAll` derives the set-semantics flavor; theorems lift the consolidation guarantees to `union` (`union_length_le`, `union_preserves_error_diff_left`, `union_preserves_error_diff_right`, `union_no_error`, `union_nil_left`, `union_nil_right`). 
`negate` negates every diff (`.error` absorbs negation, `.val n` becomes `.val (-n)`). Theorems: `negate_length` (length preserved), `negate_negate` (involution), `negate_preserves_error_diff`, `negate_no_error`. - `exceptAll l r = consolidate (unionAll l (negate r))` realizes the signed-diff `EXCEPT ALL` (output diffs may be negative, encoding "this carrier has `n` fewer copies in the result than in the input"). Theorems: `exceptAll_length_le` (≤ sum of input lengths), `exceptAll_preserves_error_diff_left`/`exceptAll_preserves_error_diff_right` (errors from either side survive — negation absorbs at `.error`), `exceptAll_no_error`. + `exceptAll l r = consolidate (unionAll l (negate r))` realizes the signed-diff `EXCEPT ALL` (output diffs may be negative, encoding "this carrier has `n` fewer copies in the result than in the input"). Theorems: `exceptAll_length_le` (≤ sum of input lengths), `exceptAll_preserves_error_diff_left`/`exceptAll_preserves_error_diff_right` (errors from either side survive — negation absorbs at `.error`), `exceptAll_no_error`, `exceptAll_nil_left` (`exceptAll [] r = negate (consolidate r)` — bridges to `negate_consolidate`), `exceptAll_nil_right` (`exceptAll l [] = consolidate l`). `clampPositive` drops records with `.val n` where `n ≤ 0`, keeping `.error` records and records with `.val n > 0`. Theorems: `clampPositive_length_le`, `clampPositive_preserves_error_diff`, `clampPositive_only_positive` (every output `.val` is strictly positive), `clampPositive_no_error`, `clampPositive_idem` (idempotent — filter twice = filter once). `bagExceptAll = clampPositive ∘ exceptAll` realizes the bag-semantics `EXCEPT ALL` — the signed-diff result is post-processed to drop non-positive multiplicities, producing `max(L - R, 0)` per carrier. Theorems lift the signed flavor: `bagExceptAll_length_le`, `bagExceptAll_preserves_error_diff_left`/`_right`, `bagExceptAll_only_positive`, `bagExceptAll_no_error`. 
`clampToOne` collapses surviving multiplicities to one: `.val n > 0` becomes `.val 1`, non-positive `.val` is dropped, `.error` survives. Defined by structural recursion on the list. Theorems: `clampToOne_length_le`, `clampToOne_preserves_error_diff`, `clampToOne_no_error`, `clampToOne_only_one_or_error` (every output diff is `.val 1` or `.error`), `clampToOne_idem` (idempotent — after one pass every `.val` is `.val 1` so a second pass is a no-op). From 551782597440f1fb6a22dca669e2ecbc88abac7e Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 15:47:02 +0200 Subject: [PATCH 063/127] doc/semantics: strict cardinality bound for consolidate duplicates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `consolidate_strict_length_dup`: two adjacent records sharing a carrier compress to one bucket in the consolidated output: (consolidate ((uc, d) :: (uc, d') :: rest)).length ≤ rest.length + 1 Strictly less than the input's `rest.length + 2` — concrete witness that consolidation can shrink the carrier list. Supporting private lemmas: * `mem_after_consolidateInto`: after a single-key insert, the carrier is in the result. * `consolidateInto_length_eq_of_mem`: inserting a key already present does not change the length (the bucket update is in-place). Generalization to arbitrary `k`-duplicate compression needs a counting-multiplicity argument; deferred. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../semantics/Mz/UnifiedConsolidate.lean | 87 +++++++++++++++++++ doc/developer/semantics/README.md | 3 +- 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/doc/developer/semantics/Mz/UnifiedConsolidate.lean b/doc/developer/semantics/Mz/UnifiedConsolidate.lean index ed84af9bc15aa..eb8ac045abc61 100644 --- a/doc/developer/semantics/Mz/UnifiedConsolidate.lean +++ b/doc/developer/semantics/Mz/UnifiedConsolidate.lean @@ -199,6 +199,93 @@ theorem UnifiedStream.consolidate_length_le (us : UnifiedStream) : show (consolidateInto uc d (UnifiedStream.consolidate tl)).length ≤ tl.length + 1 exact Nat.le_trans hStep hIh +/-! ## Strict cardinality + +When a carrier already appears in the consolidated list, +inserting it again does not grow the list — the existing bucket +absorbs the new diff. The headline `consolidate_strict_length_dup` +shows two adjacent records sharing a carrier compress to one in +the output. -/ + +/-- After `consolidateInto`, the carrier `uc` is in the result. +Either the input already had it (the bucket update preserves +membership) or the input did not (a fresh bucket is appended). -/ +private theorem mem_after_consolidateInto + (uc : UnifiedRow) (d : DiffWithError Int) (us : UnifiedStream) : + ∃ d', (uc, d') ∈ consolidateInto uc d us := by + induction us with + | nil => exact ⟨d, List.mem_singleton.mpr rfl⟩ + | cons hd tl ih => + obtain ⟨uc', d'⟩ := hd + by_cases hEq : uc = uc' + · subst hEq + refine ⟨d + d', ?_⟩ + show (uc, d + d') ∈ + (if uc = uc then (uc, d + d') :: tl + else (uc, d') :: consolidateInto uc d tl) + rw [if_pos rfl] + exact List.mem_cons_self + · obtain ⟨d'', hMem⟩ := ih + refine ⟨d'', ?_⟩ + show (uc, d'') ∈ + (if uc = uc' then (uc', d + d') :: tl + else (uc', d') :: consolidateInto uc d tl) + rw [if_neg hEq] + exact List.mem_cons_of_mem _ hMem + +/-- When `uc` already appears in `us`, `consolidateInto uc d us` +does not change the length — the bucket update is in place. 
-/ +private theorem consolidateInto_length_eq_of_mem + (uc : UnifiedRow) (d : DiffWithError Int) (us : UnifiedStream) + (h : ∃ d', (uc, d') ∈ us) : + (consolidateInto uc d us).length = us.length := by + induction us with + | nil => obtain ⟨_, hMem⟩ := h; exact absurd hMem List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc', d'⟩ := hd + by_cases hEq : uc = uc' + · subst hEq + show (if uc = uc then (uc, d + d') :: tl + else (uc, d') :: consolidateInto uc d tl).length + = ((uc, d') :: tl).length + rw [if_pos rfl] + rfl + · have hMemTl : ∃ d', (uc, d') ∈ tl := by + obtain ⟨d'', hMem⟩ := h + rcases List.mem_cons.mp hMem with hHead | hTail + · exact absurd ((Prod.mk.injEq _ _ _ _).mp hHead).1 hEq + · exact ⟨d'', hTail⟩ + show (if uc = uc' then (uc', d + d') :: tl + else (uc', d') :: consolidateInto uc d tl).length + = ((uc', d') :: tl).length + rw [if_neg hEq] + show (consolidateInto uc d tl).length + 1 = tl.length + 1 + rw [ih hMemTl] + +/-- Two adjacent records sharing a carrier collapse to one in the +consolidated output. The output length is at most `rest.length + 1`, +strictly less than the input's `rest.length + 2`. -/ +theorem UnifiedStream.consolidate_strict_length_dup + (uc : UnifiedRow) (d d' : DiffWithError Int) (rest : UnifiedStream) : + (UnifiedStream.consolidate ((uc, d) :: (uc, d') :: rest)).length + ≤ rest.length + 1 := by + -- consolidate ((uc, d) :: (uc, d') :: rest) + -- = consolidateInto uc d (consolidate ((uc, d') :: rest)) + -- = consolidateInto uc d (consolidateInto uc d' (consolidate rest)) + -- The inner consolidateInto produces a list containing uc; + -- the outer therefore preserves length. 
+ show (consolidateInto uc d (UnifiedStream.consolidate ((uc, d') :: rest))).length + ≤ rest.length + 1 + have hMem : ∃ d'', (uc, d'') ∈ UnifiedStream.consolidate ((uc, d') :: rest) := by + show ∃ d'', (uc, d'') ∈ consolidateInto uc d' (UnifiedStream.consolidate rest) + exact mem_after_consolidateInto uc d' _ + rw [consolidateInto_length_eq_of_mem uc d _ hMem] + -- Now bound (consolidate ((uc, d') :: rest)).length ≤ ((uc, d') :: rest).length + -- = rest.length + 1. + have := UnifiedStream.consolidate_length_le ((uc, d') :: rest) + show (UnifiedStream.consolidate ((uc, d') :: rest)).length ≤ rest.length + 1 + exact this + /-! ## No-error preservation If every input diff is a `.val`, every output diff is a `.val`. diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 3c2d91a6635c9..6f28b6951dc78 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -44,6 +44,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four *absorption* — `consolidate_preserves_error` proves an `.error` diff anywhere in the input gives an `.error` diff in the consolidated output for that carrier; *cardinality* — `consolidate_length_le` bounds the output by the input length (consolidation only merges, never expands); *no-error preservation* — `consolidate_no_error` proves that if every input diff is a `.val`, every output diff is a `.val`, so `.error` is the only source of absorption. + *strict shrinkage* — `consolidate_strict_length_dup` proves that two adjacent records sharing a carrier compress to one in the output: `(consolidate ((uc, d) :: (uc, d') :: rest)).length ≤ rest.length + 1`, strictly less than the input's `rest.length + 2`. * `Mz/Triple.lean`: collection-wide and per-time *flat* consolidation views on `TimedUnifiedStream`. `consolidateAll` sums every diff in the stream; `consolidateAtTimeFlat t` sums every diff at time `t`. 
Both ignore the carrier — they collapse a time slice (or the whole stream) to one `DiffWithError Int`. Absorption: `consolidateAll_eq_error_of_mem` and `consolidateAtTimeFlat_eq_error_of_mem`. Complementary to `Mz/TimedConsolidate.lean`'s `consolidateAtTime t`, which buckets per `(row, time)` and returns a `UnifiedStream`. * `Mz/Join.lean`: relational joins on the diff-aware `UnifiedStream`. `cross` is the cartesian product — carriers combine via `combineCarrier` (rows concatenate; err on either side wins, left first), diffs multiply through `DiffWithError`'s `Mul` instance. A `.error` diff on either input therefore absorbs to `.error` on the output via `DiffWithError.error_mul_{left,right}`. `join pred l r` filters the product through a join predicate. Theorems: `cross_length` (`l.length * r.length`), `filter_length_le` (filter is non-expanding), `join_length_le` (corollary). Diff-propagation theorems: `cross_diff_error_{left,right}` (a `.error` diff on either side propagates through every output record), `filter_preserves_error_diff` (a record carrying `.error` diff is never dropped by `filter` — the absorbing marker cannot be filtered away). No-error preservation: `cross_no_error` and `filter_no_error` prove that all-`.val` input diffs yield all-`.val` output diffs, so `.error` is the only source of absorbing diffs in the joint output. Algebraic laws: `combineCarrier_assoc` (carrier combine is associative modulo `List.append_assoc`) and the headline `UnifiedStream.cross_assoc` (`(a × b) × c = a × (b × c)`). The proof rearranges nested `flatMap` / `map` via local list-monad lemmas and closes via `DiffWithError.mul_assoc` plus `combineCarrier_assoc`. * `Mz/SetOps.lean`: set operations on `UnifiedStream`. 
`unionAll = (++)` concatenates two streams record-wise; theorems cover length (sum), associativity, nil identities, and error / no-error preservation from each input (`unionAll_preserves_error_diff_left`, `unionAll_preserves_error_diff_right`, `unionAll_no_error`). `union = consolidate ∘ unionAll` derives the set-semantics flavor; theorems lift the consolidation guarantees to `union` (`union_length_le`, `union_preserves_error_diff_left`, `union_preserves_error_diff_right`, `union_no_error`, `union_nil_left`, `union_nil_right`). @@ -116,7 +117,7 @@ The diff-semiring extension is in scope: `UnifiedStream` records carry `(Unified ### Additive refinements * Tightening `Expr.might_error` further. Short-circuit detection covers binary / variadic `AND` / `OR` and `IfThen` against literal absorbers. Remaining: ground-truth lookups (literal arithmetic, known-null operands, type-driven), all additive against the current soundness proof. -* Strict cardinality bound for `UnifiedConsolidate`: when a carrier appears `k > 1` times in the input, the output is `k - 1` shorter than the input. +* General `k > 1` strict cardinality bound for `UnifiedConsolidate`. Today's `consolidate_strict_length_dup` covers the case of two adjacent duplicates compressing to one bucket; lifting to "any `k` occurrences of the same carrier shrink the output by `k - 1`" needs a counting-multiplicity argument. * `UnifiedStream.project` pushdown analogue of `BagStream.project_filter_pushdown_data`. Predicate pushdown across the diff-aware projection is straightforward on the carrier side but the err-split adds asymmetries: filter-after-project sees projected rows; filter-before-project (with substitution) sees originals. * Lift `ErrPropagatingBinary` / `NullPropagatingBinary` to `Expr` form. Today's predicates work at the `Datum` level; an `Expr.err_propagating` analogue would let the optimizer reason about whole sub-expressions, not just primitives. 
From 317e8aa6115207186be57ed6222f71085dd37c4e Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 15:49:58 +0200 Subject: [PATCH 064/127] doc/semantics: column-unused analyzer (pruning foundation) Adds `Expr.colReferencesUnused n e` static analyzer (returns `true` iff `e` never reads column `n`), mutual companion `Expr.argsColRefUnused`, environment-replacement primitive `Env.replaceAt`, and the headline soundness theorem `eval_replaceAt_of_unused`. `eval_replaceAt_of_unused`: when column `n` is unused in `e`, replacing its value in the environment does not change eval. Foundation for column-pruning rewrites: if every downstream `Expr` is `colReferencesUnused n`, the column can be dropped from the row schema without affecting results. Supporting `Env.get_replaceAt_eq` (read-after-write at the replaced position) and `Env.get_replaceAt_ne` (read-after-write at any other position) discharge the per-column reductions. `eval_replaceAt_of_unused` and `eval_replaceAt_of_unused_argsMap` share a `mutual` block so structural recursion accepts both. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/ColRefs.lean | 160 ++++++++++++++++++++++++ doc/developer/semantics/README.md | 1 + 2 files changed, 161 insertions(+) diff --git a/doc/developer/semantics/Mz/ColRefs.lean b/doc/developer/semantics/Mz/ColRefs.lean index 0bc5f7cd0ed16..907393471446f 100644 --- a/doc/developer/semantics/Mz/ColRefs.lean +++ b/doc/developer/semantics/Mz/ColRefs.lean @@ -371,6 +371,166 @@ theorem eval_append_right_shift_argsMap : eval_append_right_shift_argsMap l r rest] end +/-! ## Column-unused analyzer + +`Expr.colReferencesUnused n e` returns `true` when `e` never reads +column `n`. The eval-invariance theorem uses this to justify +column-pruning rewrites: a projection that drops an unused column +does not change downstream eval results. 
-/ + +mutual +def Expr.colReferencesUnused (n : Nat) : Expr → Bool + | .lit _ => true + | .col i => decide (i ≠ n) + | .and a b => a.colReferencesUnused n && b.colReferencesUnused n + | .or a b => a.colReferencesUnused n && b.colReferencesUnused n + | .not a => a.colReferencesUnused n + | .ifThen c t e => + c.colReferencesUnused n && + t.colReferencesUnused n && + e.colReferencesUnused n + | .andN args => Expr.argsColRefUnused n args + | .orN args => Expr.argsColRefUnused n args + | .coalesce args => Expr.argsColRefUnused n args + | .plus a b => a.colReferencesUnused n && b.colReferencesUnused n + | .minus a b => a.colReferencesUnused n && b.colReferencesUnused n + | .times a b => a.colReferencesUnused n && b.colReferencesUnused n + | .divide a b => a.colReferencesUnused n && b.colReferencesUnused n + | .eq a b => a.colReferencesUnused n && b.colReferencesUnused n + | .lt a b => a.colReferencesUnused n && b.colReferencesUnused n + +def Expr.argsColRefUnused (n : Nat) : List Expr → Bool + | [] => true + | e :: rest => e.colReferencesUnused n && Expr.argsColRefUnused n rest +end + +/-! ## Env replacement at position + +`Env.replaceAt env n v` swaps the value at index `n` for `v`, +leaving other positions intact. Out-of-bounds indices leave the +environment unchanged (consistent with `Env.get`'s null fallback). -/ + +def Env.replaceAt : Env → Nat → Datum → Env + | [], _, _ => [] + | _ :: tl, 0, v => v :: tl + | hd :: tl, n + 1, v => hd :: Env.replaceAt tl n v + +/-- Reading position `n` from `replaceAt env n v` yields `v` when +in bounds. -/ +theorem Env.get_replaceAt_eq : + ∀ (env : Env) (n : Nat) (v : Datum), n < env.length → + Env.get (Env.replaceAt env n v) n = v + | [], _, _, h => absurd h (Nat.not_lt_zero _) + | _ :: _, 0, _, _ => rfl + | _ :: tl, n + 1, v, h => by + show Env.get (Env.replaceAt tl n v) n = v + exact Env.get_replaceAt_eq tl n v (Nat.lt_of_succ_lt_succ h) + +/-- Reading any other position is unchanged. 
-/ +theorem Env.get_replaceAt_ne : + ∀ (env : Env) (n i : Nat) (v : Datum), i ≠ n → + Env.get (Env.replaceAt env n v) i = Env.get env i + | [], _, _, _, _ => rfl + | _ :: _, 0, 0, _, h => absurd rfl h + | hd :: _, 0, i + 1, _, _ => by + show Env.get (hd :: _) (i + 1) = Env.get (hd :: _) (i + 1) + rfl + | _ :: _, n + 1, 0, _, _ => rfl + | _ :: tl, n + 1, i + 1, v, h => by + show Env.get (Env.replaceAt tl n v) i = Env.get tl i + exact Env.get_replaceAt_ne tl n i v (fun hEq => h (by rw [hEq])) + +/-! ## Eval invariance under replacement + +If column `n` is unused in `e`, replacing the value at position `n` +does not change `eval`. Mutual structural recursion on `Expr`. -/ + +mutual +theorem eval_replaceAt_of_unused : + ∀ (env : Env) (n : Nat) (v : Datum) (e : Expr), + e.colReferencesUnused n = true → + eval (Env.replaceAt env n v) e = eval env e + | _, _, _, .lit _, _ => by simp [eval] + | env, n, v, .col i, h => by + have h_ne : i ≠ n := of_decide_eq_true h + simp only [eval] + exact Env.get_replaceAt_ne env n i v h_ne + | env, n, v, .and a b, h => by + simp only [Expr.colReferencesUnused, Bool.and_eq_true] at h + simp only [eval] + rw [eval_replaceAt_of_unused env n v a h.1, + eval_replaceAt_of_unused env n v b h.2] + | env, n, v, .or a b, h => by + simp only [Expr.colReferencesUnused, Bool.and_eq_true] at h + simp only [eval] + rw [eval_replaceAt_of_unused env n v a h.1, + eval_replaceAt_of_unused env n v b h.2] + | env, n, v, .not a, h => by + simp only [Expr.colReferencesUnused] at h + simp only [eval] + rw [eval_replaceAt_of_unused env n v a h] + | env, n, v, .ifThen c t e, h => by + simp only [Expr.colReferencesUnused, Bool.and_eq_true] at h + simp only [eval] + rw [eval_replaceAt_of_unused env n v c h.1.1, + eval_replaceAt_of_unused env n v t h.1.2, + eval_replaceAt_of_unused env n v e h.2] + | env, n, v, .andN args, h => by + simp only [Expr.colReferencesUnused] at h + simp only [eval] + rw [eval_replaceAt_of_unused_argsMap env n v args h] + | env, n, v, .orN 
args, h => by + simp only [Expr.colReferencesUnused] at h + simp only [eval] + rw [eval_replaceAt_of_unused_argsMap env n v args h] + | env, n, v, .coalesce args, h => by + simp only [Expr.colReferencesUnused] at h + simp only [eval] + rw [eval_replaceAt_of_unused_argsMap env n v args h] + | env, n, v, .plus a b, h => by + simp only [Expr.colReferencesUnused, Bool.and_eq_true] at h + simp only [eval] + rw [eval_replaceAt_of_unused env n v a h.1, + eval_replaceAt_of_unused env n v b h.2] + | env, n, v, .minus a b, h => by + simp only [Expr.colReferencesUnused, Bool.and_eq_true] at h + simp only [eval] + rw [eval_replaceAt_of_unused env n v a h.1, + eval_replaceAt_of_unused env n v b h.2] + | env, n, v, .times a b, h => by + simp only [Expr.colReferencesUnused, Bool.and_eq_true] at h + simp only [eval] + rw [eval_replaceAt_of_unused env n v a h.1, + eval_replaceAt_of_unused env n v b h.2] + | env, n, v, .divide a b, h => by + simp only [Expr.colReferencesUnused, Bool.and_eq_true] at h + simp only [eval] + rw [eval_replaceAt_of_unused env n v a h.1, + eval_replaceAt_of_unused env n v b h.2] + | env, n, v, .eq a b, h => by + simp only [Expr.colReferencesUnused, Bool.and_eq_true] at h + simp only [eval] + rw [eval_replaceAt_of_unused env n v a h.1, + eval_replaceAt_of_unused env n v b h.2] + | env, n, v, .lt a b, h => by + simp only [Expr.colReferencesUnused, Bool.and_eq_true] at h + simp only [eval] + rw [eval_replaceAt_of_unused env n v a h.1, + eval_replaceAt_of_unused env n v b h.2] + +theorem eval_replaceAt_of_unused_argsMap : + ∀ (env : Env) (n : Nat) (v : Datum) (args : List Expr), + Expr.argsColRefUnused n args = true → + args.map (eval (Env.replaceAt env n v)) = args.map (eval env) + | _, _, _, [], _ => rfl + | env, n, v, e :: rest, h => by + simp only [Expr.argsColRefUnused, Bool.and_eq_true] at h + show eval (Env.replaceAt env n v) e :: rest.map (eval (Env.replaceAt env n v)) + = eval env e :: rest.map (eval env) + rw [eval_replaceAt_of_unused env n v e h.1, 
+ eval_replaceAt_of_unused_argsMap env n v rest h.2] +end + /-! ## Shift composition laws `colShift` is the identity at `k = 0` and composes additively: diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 6f28b6951dc78..490d48d7e89da 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -33,6 +33,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `Expr.colShift k e` adds `k` to every column reference, leaving other constructors structurally intact. Right-side analogue of the bounded analyzer: it realigns a predicate written against the right schema with the joined env `l ++ r` (where the right side starts at index `l.length`). Headline `eval_append_right_shift`: `eval (l ++ r) (e.colShift l.length) = eval r e`. `Expr.colReferencesBoundedBy_mono` (mutual with `Expr.argsColRefBoundedBy_mono`) lifts a tight bound to a coarser one — useful when a predicate's natural bound is a single relation's width but the proof site needs the joined-env width. Convenience `eval_append_left_of_bounded_at` removes the requirement that the predicate's bound match `l.length` exactly: any `n ≤ l.length` suffices. `Expr.colShift_zero` (identity at `k = 0`) and `Expr.colShift_add` (`(e.colShift k).colShift m = e.colShift (k + m)`) give the shift its monoid laws. Useful for nested joins where each layer adds its own offset to the predicate's column references. + `Expr.colReferencesUnused n e` (mutual with `Expr.argsColRefUnused`) returns `true` when `e` never reads column `n`. Companion `Env.replaceAt env n v` swaps a single position. Headline `eval_replaceAt_of_unused`: when column `n` is unused, replacing its value does not change eval. Supports column-pruning rewrites: a projection that drops unused columns is sound. Supporting `Env.get_replaceAt_eq` and `Env.get_replaceAt_ne` discharge the per-column reductions. 
Supporting environment lemmas: `Env.get_append_left` (read from prefix) and `Env.get_append_right` (read from suffix with index shift). * `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `-`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity / negation laws that downstream operators must respect. Negation laws (`neg_error`, `neg_val`, `neg_neg_val`, `val_add_neg_val`, `neg_mul`, `mul_neg`) carry the principle that `.error` is unrecoverable — a collection-scoped error cannot be subtracted away, and that negation distributes through multiplication on both sides. The `_int` specializations (`add_comm_int`, `add_assoc_int`, `mul_assoc_int`, `mul_comm_int`, `mul_add_int`, `neg_neg_int`, `val_add_neg_val_int`, `neg_mul_int`, `mul_neg_int`) discharge the base hypotheses at `Int` so downstream code in `Mz/Join.lean`, `Mz/UnifiedConsolidate.lean`, and `Mz/SetOps.lean` can cite the named laws directly. * `Mz/UnifiedStream.lean`: unified diff-aware alternative to `BagStream`. `UnifiedStream := List (UnifiedRow × DiffWithError Int)` pairs a carrier (data row or row-scoped err) with a differential-dataflow diff augmented by the absorbing `error` element. Row-scoped errors flow through the carrier; collection-scoped errors flow through diff multiplication / addition. `ofBag` / `split` conversions assign every bag record a diff of `.val 1`; the round-trip theorem `split (ofBag s) = s` holds. The cross-direction is exact only up to multiset equality on `List EvalError` and is lossy for diffs ≠ `.val 1` (split drops diff information). 
From dfe032b5c9c6e9730ab735f2a89b410f885dd2f0 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 15:51:15 +0200 Subject: [PATCH 065/127] doc/semantics: bridge bounded and unused analyzers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `Expr.colReferencesUnused_of_bounded` (mutual with `Expr.argsColRefUnused_of_bounded`): a predicate whose column references are bounded by `n` has every column index `i ≥ n` unused. Lets the optimizer derive column-pruning consequences from a tight bound. Composed with `eval_replaceAt_of_unused`, this proves that columns at index `≥ n` can be replaced (or dropped) without changing eval. Mutual structure mirrors the existing `_mono` analyzers. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/ColRefs.lean | 88 +++++++++++++++++++++++++ doc/developer/semantics/README.md | 1 + 2 files changed, 89 insertions(+) diff --git a/doc/developer/semantics/Mz/ColRefs.lean b/doc/developer/semantics/Mz/ColRefs.lean index 907393471446f..ab87912670e68 100644 --- a/doc/developer/semantics/Mz/ColRefs.lean +++ b/doc/developer/semantics/Mz/ColRefs.lean @@ -531,6 +531,94 @@ theorem eval_replaceAt_of_unused_argsMap : eval_replaceAt_of_unused_argsMap env n v rest h.2] end +/-! ## Bridge between bounded and unused analyzers + +If a predicate's column references are bounded by `n`, then any +column index `i ≥ n` is unused. Used by the optimizer to derive +column-pruning consequences from a tight bound: a predicate that +only reads the first `n` columns leaves the rest unused. 
-/ + +mutual +theorem Expr.colReferencesUnused_of_bounded : + ∀ {n i : Nat} (e : Expr), + e.colReferencesBoundedBy n = true → n ≤ i → + e.colReferencesUnused i = true + | _, _, .lit _, _, _ => rfl + | n, i, .col j, h, hLe => by + have h_lt : j < n := of_decide_eq_true h + show decide (j ≠ i) = true + have : j ≠ i := fun hEq => Nat.not_lt_of_le hLe (hEq ▸ h_lt) + exact decide_eq_true this + | _, _, .and a b, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Expr.colReferencesUnused, + Bool.and_eq_true] at h ⊢ + exact ⟨Expr.colReferencesUnused_of_bounded a h.1 hLe, + Expr.colReferencesUnused_of_bounded b h.2 hLe⟩ + | _, _, .or a b, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Expr.colReferencesUnused, + Bool.and_eq_true] at h ⊢ + exact ⟨Expr.colReferencesUnused_of_bounded a h.1 hLe, + Expr.colReferencesUnused_of_bounded b h.2 hLe⟩ + | _, _, .not a, h, hLe => + Expr.colReferencesUnused_of_bounded a h hLe + | _, _, .ifThen c t e, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Expr.colReferencesUnused, + Bool.and_eq_true] at h ⊢ + exact ⟨⟨Expr.colReferencesUnused_of_bounded c h.1.1 hLe, + Expr.colReferencesUnused_of_bounded t h.1.2 hLe⟩, + Expr.colReferencesUnused_of_bounded e h.2 hLe⟩ + | _, _, .andN args, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Expr.colReferencesUnused] at h ⊢ + exact Expr.argsColRefUnused_of_bounded args h hLe + | _, _, .orN args, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Expr.colReferencesUnused] at h ⊢ + exact Expr.argsColRefUnused_of_bounded args h hLe + | _, _, .coalesce args, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Expr.colReferencesUnused] at h ⊢ + exact Expr.argsColRefUnused_of_bounded args h hLe + | _, _, .plus a b, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Expr.colReferencesUnused, + Bool.and_eq_true] at h ⊢ + exact ⟨Expr.colReferencesUnused_of_bounded a h.1 hLe, + Expr.colReferencesUnused_of_bounded b h.2 hLe⟩ + | _, _, .minus a b, h, hLe => by + simp only 
[Expr.colReferencesBoundedBy, Expr.colReferencesUnused, + Bool.and_eq_true] at h ⊢ + exact ⟨Expr.colReferencesUnused_of_bounded a h.1 hLe, + Expr.colReferencesUnused_of_bounded b h.2 hLe⟩ + | _, _, .times a b, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Expr.colReferencesUnused, + Bool.and_eq_true] at h ⊢ + exact ⟨Expr.colReferencesUnused_of_bounded a h.1 hLe, + Expr.colReferencesUnused_of_bounded b h.2 hLe⟩ + | _, _, .divide a b, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Expr.colReferencesUnused, + Bool.and_eq_true] at h ⊢ + exact ⟨Expr.colReferencesUnused_of_bounded a h.1 hLe, + Expr.colReferencesUnused_of_bounded b h.2 hLe⟩ + | _, _, .eq a b, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Expr.colReferencesUnused, + Bool.and_eq_true] at h ⊢ + exact ⟨Expr.colReferencesUnused_of_bounded a h.1 hLe, + Expr.colReferencesUnused_of_bounded b h.2 hLe⟩ + | _, _, .lt a b, h, hLe => by + simp only [Expr.colReferencesBoundedBy, Expr.colReferencesUnused, + Bool.and_eq_true] at h ⊢ + exact ⟨Expr.colReferencesUnused_of_bounded a h.1 hLe, + Expr.colReferencesUnused_of_bounded b h.2 hLe⟩ + +theorem Expr.argsColRefUnused_of_bounded : + ∀ {n i : Nat} (args : List Expr), + Expr.argsColRefBoundedBy n args = true → n ≤ i → + Expr.argsColRefUnused i args = true + | _, _, [], _, _ => rfl + | _, _, e :: rest, h, hLe => by + simp only [Expr.argsColRefBoundedBy, Expr.argsColRefUnused, + Bool.and_eq_true] at h ⊢ + exact ⟨Expr.colReferencesUnused_of_bounded e h.1 hLe, + Expr.argsColRefUnused_of_bounded rest h.2 hLe⟩ +end + /-! 
## Shift composition laws `colShift` is the identity at `k = 0` and composes additively: diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 490d48d7e89da..f995e496ab510 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -34,6 +34,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `Expr.colReferencesBoundedBy_mono` (mutual with `Expr.argsColRefBoundedBy_mono`) lifts a tight bound to a coarser one — useful when a predicate's natural bound is a single relation's width but the proof site needs the joined-env width. Convenience `eval_append_left_of_bounded_at` removes the requirement that the predicate's bound match `l.length` exactly: any `n ≤ l.length` suffices. `Expr.colShift_zero` (identity at `k = 0`) and `Expr.colShift_add` (`(e.colShift k).colShift m = e.colShift (k + m)`) give the shift its monoid laws. Useful for nested joins where each layer adds its own offset to the predicate's column references. `Expr.colReferencesUnused n e` (mutual with `Expr.argsColRefUnused`) returns `true` when `e` never reads column `n`. Companion `Env.replaceAt env n v` swaps a single position. Headline `eval_replaceAt_of_unused`: when column `n` is unused, replacing its value does not change eval. Supports column-pruning rewrites: a projection that drops unused columns is sound. Supporting `Env.get_replaceAt_eq` and `Env.get_replaceAt_ne` discharge the per-column reductions. + `Expr.colReferencesUnused_of_bounded` (mutual with the operand-list version) bridges the two analyzers: a predicate bounded by `n` has every column `i ≥ n` unused. Lets the optimizer derive column-pruning consequences from a tight bound. Supporting environment lemmas: `Env.get_append_left` (read from prefix) and `Env.get_append_right` (read from suffix with index shift). 
* `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `-`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity / negation laws that downstream operators must respect. Negation laws (`neg_error`, `neg_val`, `neg_neg_val`, `val_add_neg_val`, `neg_mul`, `mul_neg`) carry the principle that `.error` is unrecoverable — a collection-scoped error cannot be subtracted away, and that negation distributes through multiplication on both sides. The `_int` specializations (`add_comm_int`, `add_assoc_int`, `mul_assoc_int`, `mul_comm_int`, `mul_add_int`, `neg_neg_int`, `val_add_neg_val_int`, `neg_mul_int`, `mul_neg_int`) discharge the base hypotheses at `Int` so downstream code in `Mz/Join.lean`, `Mz/UnifiedConsolidate.lean`, and `Mz/SetOps.lean` can cite the named laws directly. * `Mz/UnifiedStream.lean`: unified diff-aware alternative to `BagStream`. `UnifiedStream := List (UnifiedRow × DiffWithError Int)` pairs a carrier (data row or row-scoped err) with a differential-dataflow diff augmented by the absorbing `error` element. Row-scoped errors flow through the carrier; collection-scoped errors flow through diff multiplication / addition. `ofBag` / `split` conversions assign every bag record a diff of `.val 1`; the round-trip theorem `split (ofBag s) = s` holds. The cross-direction is exact only up to multiset equality on `List EvalError` and is lossy for diffs ≠ `.val 1` (split drops diff information). From 0f92ec2d1b5f743eae8b12e1c309fb21e045b778 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 15:57:25 +0200 Subject: [PATCH 066/127] doc/semantics: bagExceptAll nil-left reduction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `bagExceptAll_nil_left`: `bagExceptAll [] r = clampPositive (negate (consolidate r))`. 
Direct composition of `exceptAll_nil_left` with the `clampPositive` outer wrapping. Spec consequence: when subtracting r from an empty bag, every `.val n > 0` diff in `consolidate r` becomes `.val (-n) < 0` after `negate`, which `clampPositive` then drops — yielding the correct empty bag result for the `.val` slice. `.error` diffs survive both stages, as required by the absorbing-marker semantics. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 1e08129158171..0b00dfb081ef9 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -807,4 +807,17 @@ theorem UnifiedStream.exceptAll_nil_left (r : UnifiedStream) : rw [UnifiedStream.unionAll_nil_left] exact (UnifiedStream.negate_consolidate r).symm +/-- `bagExceptAll [] r = clampPositive (negate (consolidate r))`. +When every diff in `consolidate r` is a non-negative `.val`, `negate` +makes each diff non-positive, which `clampPositive` then drops, yielding +the spec-correct empty result. `.error` records survive both stages. 
-/ +theorem UnifiedStream.bagExceptAll_nil_left (r : UnifiedStream) : + UnifiedStream.bagExceptAll [] r + = UnifiedStream.clampPositive + (UnifiedStream.negate (UnifiedStream.consolidate r)) := by + show UnifiedStream.clampPositive (UnifiedStream.exceptAll [] r) + = UnifiedStream.clampPositive + (UnifiedStream.negate (UnifiedStream.consolidate r)) + rw [UnifiedStream.exceptAll_nil_left] + end Mz From ca02c34d740bade2d5ad1cf2165296c63ccbd13c Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 16:05:13 +0200 Subject: [PATCH 067/127] doc/semantics: named reduction lemmas for filter and cross MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lifts the inline `simp [List.flatMap_cons]` / `exact List.flatMap_append` patterns scattered across the skeleton into named theorems that downstream proofs cite directly: * `UnifiedStream.filter_nil` — empty input. * `UnifiedStream.filter_append` — distributes over `++`. * `UnifiedStream.cross_append_left` — left-input append. * `UnifiedStream.cross_cons_left` — exposes the head-record contribution explicitly. Applies the new lemmas to `filter_unionAll` and `cross_unionAll_left`, which now reduce to direct citations. Motivation: two recent proof attempts (`JoinPushdown.lean`, `negate_filter`) got tangled in inline `flatMap`-lambda reduction and nested `match` in `show` statements. Named lemmas sidestep that — proofs become "induct on list, cite the reduction, then per-record analysis". Future complex proofs in this area should follow that template. The deeper refactor (named `filterStep` / `crossStep` per-record helpers) is deferred — touching `filter`'s definition broke a dozen downstream proofs in `Mz/Join.lean`. Keep the inline match, add the reductions on top. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/Join.lean | 19 +++++++++++++++++++ doc/developer/semantics/Mz/SetOps.lean | 10 ++++------ doc/developer/semantics/Mz/UnifiedStream.lean | 14 ++++++++++++++ 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/doc/developer/semantics/Mz/Join.lean b/doc/developer/semantics/Mz/Join.lean index b4b2d882f240d..8fc3c856af6a0 100644 --- a/doc/developer/semantics/Mz/Join.lean +++ b/doc/developer/semantics/Mz/Join.lean @@ -62,6 +62,25 @@ theorem UnifiedStream.cross_nil_right (l : UnifiedStream) : | nil => rfl | cons _ tl _ih => simp [UnifiedStream.cross, List.map_nil, List.flatMap_cons] +/-! ### Reduction lemmas on the left input + +Named per-list-shape reductions so downstream proofs cite these +instead of unfolding `flatMap` inline. -/ + +theorem UnifiedStream.cross_append_left (a b r : UnifiedStream) : + UnifiedStream.cross (a ++ b) r + = UnifiedStream.cross a r ++ UnifiedStream.cross b r := by + show (a ++ b).flatMap _ = a.flatMap _ ++ b.flatMap _ + exact List.flatMap_append + +theorem UnifiedStream.cross_cons_left + (hd : UnifiedRow × DiffWithError Int) (tl r : UnifiedStream) : + UnifiedStream.cross (hd :: tl) r + = (r.map (fun rd => (combineCarrier hd.1 rd.1, hd.2 * rd.2))) + ++ UnifiedStream.cross tl r := by + show (hd :: tl).flatMap _ = _ ++ tl.flatMap _ + simp [List.flatMap_cons] + /-! ## Cardinality -/ /-- Cross product cardinality. 
`cross l r` produces exactly one diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 0b00dfb081ef9..ef86cf1fd9bb8 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -649,17 +649,15 @@ theorem UnifiedStream.filter_unionAll (p : Expr) (a b : UnifiedStream) : UnifiedStream.filter p (UnifiedStream.unionAll a b) = UnifiedStream.unionAll (UnifiedStream.filter p a) - (UnifiedStream.filter p b) := by - show (a ++ b).flatMap _ = a.flatMap _ ++ b.flatMap _ - exact List.flatMap_append + (UnifiedStream.filter p b) := + UnifiedStream.filter_append p a b theorem UnifiedStream.cross_unionAll_left (a b r : UnifiedStream) : UnifiedStream.cross (UnifiedStream.unionAll a b) r = UnifiedStream.unionAll (UnifiedStream.cross a r) - (UnifiedStream.cross b r) := by - show (a ++ b).flatMap _ = a.flatMap _ ++ b.flatMap _ - exact List.flatMap_append + (UnifiedStream.cross b r) := + UnifiedStream.cross_append_left a b r theorem UnifiedStream.project_unionAll (es : List Expr) (a b : UnifiedStream) : UnifiedStream.project es (UnifiedStream.unionAll a b) diff --git a/doc/developer/semantics/Mz/UnifiedStream.lean b/doc/developer/semantics/Mz/UnifiedStream.lean index 18ff46569a322..81b524788204d 100644 --- a/doc/developer/semantics/Mz/UnifiedStream.lean +++ b/doc/developer/semantics/Mz/UnifiedStream.lean @@ -107,6 +107,20 @@ def UnifiedStream.filter (pred : Expr) (us : UnifiedStream) : UnifiedStream := | .err e => [(.err e, d)] | _ => [] +/-! ### Reduction lemmas + +Named per-list-shape reductions for `filter`. Downstream proofs +cite these instead of unfolding `flatMap` inline. 
-/ + +theorem UnifiedStream.filter_nil (pred : Expr) : + UnifiedStream.filter pred [] = [] := rfl + +theorem UnifiedStream.filter_append (pred : Expr) (a b : UnifiedStream) : + UnifiedStream.filter pred (a ++ b) + = UnifiedStream.filter pred a ++ UnifiedStream.filter pred b := by + show (a ++ b).flatMap _ = a.flatMap _ ++ b.flatMap _ + exact List.flatMap_append + /-! ## Project Diff-aware projection. Each non-error record splits on its carrier: From df6bd790307b5e68404ae8e48f69d503f555bf38 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 16:21:49 +0200 Subject: [PATCH 068/127] doc/semantics: more reduction lemmas for project and negate Adds `UnifiedStream.project_append` and `UnifiedStream.negate_append` (the `flatMap_append` / `map_append` reductions in named-lemma form). Refactors `project_unionAll` and `negate_unionAll` to cite them directly. Companion to the earlier `filter_append` / `cross_append_left` cleanup: every operator on `UnifiedStream` defined as `flatMap` or `map` over a list now has a named append-distributivity lemma. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 21 ++++++++++++------- doc/developer/semantics/Mz/UnifiedStream.lean | 8 +++++++ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index ef86cf1fd9bb8..57a945370d088 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -186,6 +186,14 @@ theorem UnifiedStream.negate_length (us : UnifiedStream) : (UnifiedStream.negate us).length = us.length := List.length_map _ +/-- `negate` distributes over `++`. A dedicated map-append +reduction, so downstream proofs cite a single lemma. 
-/ +theorem UnifiedStream.negate_append (a b : UnifiedStream) : + UnifiedStream.negate (a ++ b) + = UnifiedStream.negate a ++ UnifiedStream.negate b := by + show (a ++ b).map _ = a.map _ ++ b.map _ + exact List.map_append + /-- Double negation is the identity (lifted from `Int.neg_neg` through `DiffWithError.neg_neg_int`). -/ theorem UnifiedStream.negate_negate (us : UnifiedStream) : @@ -663,18 +671,17 @@ theorem UnifiedStream.project_unionAll (es : List Expr) (a b : UnifiedStream) : UnifiedStream.project es (UnifiedStream.unionAll a b) = UnifiedStream.unionAll (UnifiedStream.project es a) - (UnifiedStream.project es b) := by - show (a ++ b).flatMap _ = a.flatMap _ ++ b.flatMap _ - exact List.flatMap_append + (UnifiedStream.project es b) := + UnifiedStream.project_append es a b -/-- `negate` distributes over `unionAll` via `List.map_append`. -/ +/-- `negate` distributes over `unionAll`. Direct citation of +`negate_append`. -/ theorem UnifiedStream.negate_unionAll (a b : UnifiedStream) : UnifiedStream.negate (UnifiedStream.unionAll a b) = UnifiedStream.unionAll (UnifiedStream.negate a) - (UnifiedStream.negate b) := by - show (a ++ b).map _ = a.map _ ++ b.map _ - exact List.map_append + (UnifiedStream.negate b) := + UnifiedStream.negate_append a b /-- `negate` commutes with `consolidateInto`: inserting a negated diff into a negated bucket list gives the same result as inserting diff --git a/doc/developer/semantics/Mz/UnifiedStream.lean b/doc/developer/semantics/Mz/UnifiedStream.lean index 81b524788204d..784b5767b56b7 100644 --- a/doc/developer/semantics/Mz/UnifiedStream.lean +++ b/doc/developer/semantics/Mz/UnifiedStream.lean @@ -121,6 +121,8 @@ theorem UnifiedStream.filter_append (pred : Expr) (a b : UnifiedStream) : show (a ++ b).flatMap _ = a.flatMap _ ++ b.flatMap _ exact List.flatMap_append +-- `project_append` is stated after `project`'s definition below. + /-! ## Project Diff-aware projection. 
Each non-error record splits on its carrier: @@ -162,6 +164,12 @@ def UnifiedStream.project (es : List Expr) (us : UnifiedStream) : UnifiedStream theorem UnifiedStream.project_nil_stream (es : List Expr) : UnifiedStream.project es [] = [] := rfl +theorem UnifiedStream.project_append (es : List Expr) (a b : UnifiedStream) : + UnifiedStream.project es (a ++ b) + = UnifiedStream.project es a ++ UnifiedStream.project es b := by + show (a ++ b).flatMap _ = a.flatMap _ ++ b.flatMap _ + exact List.flatMap_append + /-- The empty projection list cannot error on any row, so every record passes through with the row collapsed to width zero. -/ theorem UnifiedStream.project_nil_es (us : UnifiedStream) : From 25703a49272046d2fbe0a4994468eedc30ca80cd Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 16:30:59 +0200 Subject: [PATCH 069/127] doc/semantics: named reduction lemmas for consolidateInto MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lifts the inline `rw [if_pos rfl]` / `rw [if_neg hEq]` patterns across the consolidation proofs into three named lemmas: * `consolidateInto_nil`: `consolidateInto uc d [] = [(uc, d)]`. * `consolidateInto_match`: head key matches — bucket update. * `consolidateInto_skip`: head key differs — recurse on tail. Refactors `consolidateInto_error_diff`, `consolidateInto_preserves_error_mem`, `consolidateInto_length_le_succ`, `mem_after_consolidateInto`, `consolidateInto_length_eq_of_mem`, `consolidateInto_no_error` (all in `Mz/UnifiedConsolidate.lean`) plus `negate_consolidateInto` (in `Mz/SetOps.lean`) to cite the named lemmas instead of opening the if-then-else by hand. Reduces ~40 lines of inline `show (if uc = uc' then ...) = ...; rw [if_pos/neg]` boilerplate. Future consolidation proofs follow the same template. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 32 +---- .../semantics/Mz/UnifiedConsolidate.lean | 110 ++++++++---------- 2 files changed, 52 insertions(+), 90 deletions(-) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 57a945370d088..5061ce091116c 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -699,42 +699,18 @@ private theorem negate_consolidateInto obtain ⟨uc', d'⟩ := hd by_cases hEq : uc = uc' · subst hEq - -- consolidateInto uc d ((uc, d') :: tl) = (uc, d + d') :: tl - have hLhs : consolidateInto uc d ((uc, d') :: tl) = (uc, d + d') :: tl := by - show (if uc = uc then (uc, d + d') :: tl - else (uc, d') :: consolidateInto uc d tl) - = (uc, d + d') :: tl - rw [if_pos rfl] - have hRhs : consolidateInto uc (-d) ((uc, -d') :: UnifiedStream.negate tl) - = (uc, -d + -d') :: UnifiedStream.negate tl := by - show (if uc = uc then (uc, -d + -d') :: UnifiedStream.negate tl - else (uc, -d') :: consolidateInto uc (-d) (UnifiedStream.negate tl)) - = (uc, -d + -d') :: UnifiedStream.negate tl - rw [if_pos rfl] - rw [hLhs] + rw [consolidateInto_match] show UnifiedStream.negate ((uc, d + d') :: tl) = consolidateInto uc (-d) (UnifiedStream.negate ((uc, d') :: tl)) show ((uc, -(d + d')) :: UnifiedStream.negate tl) = consolidateInto uc (-d) ((uc, -d') :: UnifiedStream.negate tl) - rw [hRhs, DiffWithError.neg_add_int] - · have hLhs : consolidateInto uc d ((uc', d') :: tl) - = (uc', d') :: consolidateInto uc d tl := by - show (if uc = uc' then (uc', d + d') :: tl - else (uc', d') :: consolidateInto uc d tl) - = (uc', d') :: consolidateInto uc d tl - rw [if_neg hEq] - have hRhs : consolidateInto uc (-d) ((uc', -d') :: UnifiedStream.negate tl) - = (uc', -d') :: consolidateInto uc (-d) (UnifiedStream.negate tl) := by - show (if uc = uc' then (uc', -d + -d') :: UnifiedStream.negate tl - else (uc', -d') :: consolidateInto uc (-d) 
(UnifiedStream.negate tl)) - = (uc', -d') :: consolidateInto uc (-d) (UnifiedStream.negate tl) - rw [if_neg hEq] - rw [hLhs] + rw [consolidateInto_match, DiffWithError.neg_add_int] + · rw [consolidateInto_skip _ _ _ _ _ hEq] show UnifiedStream.negate ((uc', d') :: consolidateInto uc d tl) = consolidateInto uc (-d) (UnifiedStream.negate ((uc', d') :: tl)) show ((uc', -d') :: UnifiedStream.negate (consolidateInto uc d tl)) = consolidateInto uc (-d) ((uc', -d') :: UnifiedStream.negate tl) - rw [hRhs, ih] + rw [consolidateInto_skip _ _ _ _ _ hEq, ih] /-- `negate` commutes with `consolidate`: consolidating then negating equals negating then consolidating. Negation is additive, diff --git a/doc/developer/semantics/Mz/UnifiedConsolidate.lean b/doc/developer/semantics/Mz/UnifiedConsolidate.lean index eb8ac045abc61..212fa0ccbab6d 100644 --- a/doc/developer/semantics/Mz/UnifiedConsolidate.lean +++ b/doc/developer/semantics/Mz/UnifiedConsolidate.lean @@ -46,6 +46,36 @@ def UnifiedStream.consolidate : UnifiedStream → UnifiedStream | (uc, d) :: rest => consolidateInto uc d (UnifiedStream.consolidate rest) +/-! ### `consolidateInto` reduction lemmas + +Named per-shape reductions so proofs cite a single lemma instead +of unfolding the `if`-then-else by hand. -/ + +theorem consolidateInto_nil (uc : UnifiedRow) (d : DiffWithError Int) : + consolidateInto uc d [] = [(uc, d)] := rfl + +/-- Inserting `(uc, d)` at the head of a list whose head matches +`uc` folds into the head bucket. -/ +theorem consolidateInto_match + (uc : UnifiedRow) (d d' : DiffWithError Int) (tl : UnifiedStream) : + consolidateInto uc d ((uc, d') :: tl) = (uc, d + d') :: tl := by + show (if uc = uc then (uc, d + d') :: tl + else (uc, d') :: consolidateInto uc d tl) + = (uc, d + d') :: tl + rw [if_pos rfl] + +/-- Inserting `(uc, d)` at the head of a list whose head does not +match `uc` skips the head and recurses on the tail. 
-/ +theorem consolidateInto_skip + (uc uc' : UnifiedRow) (d d' : DiffWithError Int) (tl : UnifiedStream) + (h : uc ≠ uc') : + consolidateInto uc d ((uc', d') :: tl) + = (uc', d') :: consolidateInto uc d tl := by + show (if uc = uc' then (uc', d + d') :: tl + else (uc', d') :: consolidateInto uc d tl) + = (uc', d') :: consolidateInto uc d tl + rw [if_neg h] + /-! ## Trivial cases -/ theorem UnifiedStream.consolidate_nil : @@ -70,16 +100,10 @@ private theorem consolidateInto_error_diff obtain ⟨uc', d'⟩ := hd by_cases hEq : uc = uc' · subst hEq - show (uc, DiffWithError.error) - ∈ (if uc = uc then (uc, DiffWithError.error + d') :: tl - else (uc, d') :: consolidateInto uc DiffWithError.error tl) - rw [if_pos rfl] + rw [consolidateInto_match] rw [DiffWithError.error_add_left] exact List.mem_cons_self - · show (uc, DiffWithError.error) - ∈ (if uc = uc' then (uc', DiffWithError.error + d') :: tl - else (uc', d') :: consolidateInto uc DiffWithError.error tl) - rw [if_neg hEq] + · rw [consolidateInto_skip _ _ _ _ _ hEq] exact List.mem_cons_of_mem _ ih /-- Inserting any record into a consolidated stream that already @@ -105,27 +129,17 @@ private theorem consolidateInto_preserves_error_mem subst hUc subst hD by_cases hEq' : uc' = uc - · show (uc, DiffWithError.error) - ∈ (if uc' = uc then (uc, d' + DiffWithError.error) :: tl - else (uc, DiffWithError.error) :: consolidateInto uc' d' tl) - rw [if_pos hEq'] + · subst hEq' + rw [consolidateInto_match] rw [DiffWithError.error_add_right] exact List.mem_cons_self - · show (uc, DiffWithError.error) - ∈ (if uc' = uc then (uc, d' + DiffWithError.error) :: tl - else (uc, DiffWithError.error) :: consolidateInto uc' d' tl) - rw [if_neg hEq'] + · rw [consolidateInto_skip _ _ _ _ _ hEq'] exact List.mem_cons_self · by_cases hEq' : uc' = uc₀ - · show (uc, DiffWithError.error) - ∈ (if uc' = uc₀ then (uc₀, d' + d₀) :: tl - else (uc₀, d₀) :: consolidateInto uc' d' tl) - rw [if_pos hEq'] + · subst hEq' + rw [consolidateInto_match] exact 
List.mem_cons_of_mem _ hTail - · show (uc, DiffWithError.error) - ∈ (if uc' = uc₀ then (uc₀, d' + d₀) :: tl - else (uc₀, d₀) :: consolidateInto uc' d' tl) - rw [if_neg hEq'] + · rw [consolidateInto_skip _ _ _ _ _ hEq'] exact List.mem_cons_of_mem _ (ih hTail) /-- Headline absorption: an `.error` diff anywhere in the input @@ -172,15 +186,10 @@ private theorem consolidateInto_length_le_succ | cons hd tl ih => obtain ⟨uc', d'⟩ := hd by_cases hEq : uc = uc' - · show (if uc = uc' then (uc', d + d') :: tl - else (uc', d') :: consolidateInto uc d tl).length - ≤ (((uc', d') :: tl).length) + 1 - rw [if_pos hEq] + · subst hEq + rw [consolidateInto_match] simp [List.length_cons] - · show (if uc = uc' then (uc', d + d') :: tl - else (uc', d') :: consolidateInto uc d tl).length - ≤ (((uc', d') :: tl).length) + 1 - rw [if_neg hEq] + · rw [consolidateInto_skip _ _ _ _ _ hEq] show (consolidateInto uc d tl).length + 1 ≤ tl.length + 1 + 1 omega @@ -220,17 +229,11 @@ private theorem mem_after_consolidateInto by_cases hEq : uc = uc' · subst hEq refine ⟨d + d', ?_⟩ - show (uc, d + d') ∈ - (if uc = uc then (uc, d + d') :: tl - else (uc, d') :: consolidateInto uc d tl) - rw [if_pos rfl] + rw [consolidateInto_match] exact List.mem_cons_self · obtain ⟨d'', hMem⟩ := ih refine ⟨d'', ?_⟩ - show (uc, d'') ∈ - (if uc = uc' then (uc', d + d') :: tl - else (uc', d') :: consolidateInto uc d tl) - rw [if_neg hEq] + rw [consolidateInto_skip _ _ _ _ _ hEq] exact List.mem_cons_of_mem _ hMem /-- When `uc` already appears in `us`, `consolidateInto uc d us` @@ -245,20 +248,14 @@ private theorem consolidateInto_length_eq_of_mem obtain ⟨uc', d'⟩ := hd by_cases hEq : uc = uc' · subst hEq - show (if uc = uc then (uc, d + d') :: tl - else (uc, d') :: consolidateInto uc d tl).length - = ((uc, d') :: tl).length - rw [if_pos rfl] + rw [consolidateInto_match] rfl · have hMemTl : ∃ d', (uc, d') ∈ tl := by obtain ⟨d'', hMem⟩ := h rcases List.mem_cons.mp hMem with hHead | hTail · exact absurd ((Prod.mk.injEq _ _ _ 
_).mp hHead).1 hEq · exact ⟨d'', hTail⟩ - show (if uc = uc' then (uc', d + d') :: tl - else (uc', d') :: consolidateInto uc d tl).length - = ((uc', d') :: tl).length - rw [if_neg hEq] + rw [consolidateInto_skip _ _ _ _ _ hEq] show (consolidateInto uc d tl).length + 1 = tl.length + 1 rw [ih hMemTl] @@ -314,13 +311,8 @@ private theorem consolidateInto_no_error obtain ⟨m, hM⟩ := hHd intro r hMem by_cases hEq : uc = uc' - · have hOut : consolidateInto uc (DiffWithError.val n) ((uc', d') :: tl) - = (uc', DiffWithError.val n + d') :: tl := by - show (if uc = uc' then (uc', DiffWithError.val n + d') :: tl - else (uc', d') :: consolidateInto uc (DiffWithError.val n) tl) - = (uc', DiffWithError.val n + d') :: tl - rw [if_pos hEq] - rw [hOut] at hMem + · subst hEq + rw [consolidateInto_match] at hMem rcases List.mem_cons.mp hMem with hHead | hTail' · subst hHead rw [hM] @@ -328,13 +320,7 @@ private theorem consolidateInto_no_error = DiffWithError.val m' exact ⟨n + m, rfl⟩ · exact hTl r hTail' - · have hOut : consolidateInto uc (DiffWithError.val n) ((uc', d') :: tl) - = (uc', d') :: consolidateInto uc (DiffWithError.val n) tl := by - show (if uc = uc' then (uc', DiffWithError.val n + d') :: tl - else (uc', d') :: consolidateInto uc (DiffWithError.val n) tl) - = (uc', d') :: consolidateInto uc (DiffWithError.val n) tl - rw [if_neg hEq] - rw [hOut] at hMem + · rw [consolidateInto_skip _ _ _ _ _ hEq] at hMem rcases List.mem_cons.mp hMem with hHead | hTail' · subst hHead exact ⟨m, hM⟩ From 7c45ebd1ef2872ff3a45e1b38434f8159ad2b628 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 16:36:06 +0200 Subject: [PATCH 070/127] doc/semantics: named reduction lemmas for insertInto / insertIntoDistinct Adds four named lemmas for the groupBy insert helpers: * `insertInto_match` / `insertInto_skip` for the merge-on-equal variant. * `insertIntoDistinct_match` / `insertIntoDistinct_skip` for the err-distinct variant (matches on `Datum.groupKeyEq` instead of `=`). 
Refactors `insertIntoDistinct_eq_insertInto`, `insertInto_preserves_non_err_keys`, `totalRows_insertInto`, and `totalRows_insertIntoDistinct` to cite the named lemmas instead of opening the if-then-else by hand. Reduces another ~30 lines of inline `show (if k = k' then ... else ...) = ...; rw [if_pos/neg]` boilerplate. Follows the template established by the `consolidateInto` cleanup. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/GroupBy.lean | 107 ++++++++++++++---------- 1 file changed, 62 insertions(+), 45 deletions(-) diff --git a/doc/developer/semantics/Mz/GroupBy.lean b/doc/developer/semantics/Mz/GroupBy.lean index b8243df04e947..14b8d91e65d18 100644 --- a/doc/developer/semantics/Mz/GroupBy.lean +++ b/doc/developer/semantics/Mz/GroupBy.lean @@ -51,6 +51,50 @@ private def insertIntoDistinct (k : Datum) (row : Row) : if Datum.groupKeyEq k k' then (k', row :: rows) :: rest else (k', rows) :: insertIntoDistinct k row rest +/-! ### `insertInto` reduction lemmas + +Named per-shape reductions so proofs cite a single lemma instead of +opening the `if`-then-else by hand. 
-/ + +private theorem insertInto_match + (k : Datum) (row : Row) (rows : Relation) + (tl : List (Datum × Relation)) : + insertInto k row ((k, rows) :: tl) = (k, row :: rows) :: tl := by + show (if k = k then (k, row :: rows) :: tl + else (k, rows) :: insertInto k row tl) + = (k, row :: rows) :: tl + rw [if_pos rfl] + +private theorem insertInto_skip + (k k' : Datum) (row : Row) (rows : Relation) + (tl : List (Datum × Relation)) (h : k ≠ k') : + insertInto k row ((k', rows) :: tl) + = (k', rows) :: insertInto k row tl := by + show (if k = k' then (k', row :: rows) :: tl + else (k', rows) :: insertInto k row tl) + = (k', rows) :: insertInto k row tl + rw [if_neg h] + +private theorem insertIntoDistinct_match + (k k' : Datum) (row : Row) (rows : Relation) + (tl : List (Datum × Relation)) (h : Datum.groupKeyEq k k' = true) : + insertIntoDistinct k row ((k', rows) :: tl) + = (k', row :: rows) :: tl := by + show (if Datum.groupKeyEq k k' then (k', row :: rows) :: tl + else (k', rows) :: insertIntoDistinct k row tl) + = (k', row :: rows) :: tl + rw [if_pos h] + +private theorem insertIntoDistinct_skip + (k k' : Datum) (row : Row) (rows : Relation) + (tl : List (Datum × Relation)) (h : Datum.groupKeyEq k k' = false) : + insertIntoDistinct k row ((k', rows) :: tl) + = (k', rows) :: insertIntoDistinct k row tl := by + show (if Datum.groupKeyEq k k' then (k', row :: rows) :: tl + else (k', rows) :: insertIntoDistinct k row tl) + = (k', rows) :: insertIntoDistinct k row tl + rw [if_neg (by simp [h])] + /-- `GROUP BY keyExpr`: partition `rel` by the value of `keyExpr` on each row. Output is a list of `(key, rows)` pairs, one per distinct key, in encounter order. 
-/ @@ -197,20 +241,14 @@ private theorem insertIntoDistinct_eq_insertInto fun g hMem => hGroups g (List.mem_cons_of_mem _ hMem) have hKey := Datum.groupKeyEq_eq_decide_of_no_err hK hK' by_cases hEq : k = k' - · show (if Datum.groupKeyEq k k' then (k', row :: rows) :: tl - else (k', rows) :: insertIntoDistinct k row tl) - = (if k = k' then (k', row :: rows) :: tl - else (k', rows) :: insertInto k row tl) - have hKeyTrue : Datum.groupKeyEq k k' = true := by - rw [hKey]; exact decide_eq_true hEq - rw [if_pos hKeyTrue, if_pos hEq] - · show (if Datum.groupKeyEq k k' then (k', row :: rows) :: tl - else (k', rows) :: insertIntoDistinct k row tl) - = (if k = k' then (k', row :: rows) :: tl - else (k', rows) :: insertInto k row tl) - have hKeyFalse : Datum.groupKeyEq k k' = false := by + · subst hEq + have hKeyTrue : Datum.groupKeyEq k k = true := by + rw [hKey]; exact decide_eq_true rfl + rw [insertIntoDistinct_match _ _ _ _ _ hKeyTrue, insertInto_match] + · have hKeyFalse : Datum.groupKeyEq k k' = false := by rw [hKey]; exact decide_eq_false hEq - rw [if_neg (by simp [hKeyFalse]), if_neg hEq, ih hTl] + rw [insertIntoDistinct_skip _ _ _ _ _ hKeyFalse, + insertInto_skip _ _ _ _ _ hEq, ih hTl] /-- `insertInto` propagates the "no err keys" invariant from its input bucket list to its output: if the inserted key is non-err @@ -235,23 +273,12 @@ private theorem insertInto_preserves_non_err_keys intro g hMem show ¬ g.1.IsErr by_cases hEq : k = k' - · have hOut : insertInto k row ((k', rows) :: tl) - = (k', row :: rows) :: tl := by - show (if k = k' then (k', row :: rows) :: tl - else (k', rows) :: insertInto k row tl) - = (k', row :: rows) :: tl - rw [if_pos hEq] - rw [hOut] at hMem + · subst hEq + rw [insertInto_match] at hMem rcases List.mem_cons.mp hMem with hHead | hTail · subst hHead; exact hK' · exact hTl g hTail - · have hOut : insertInto k row ((k', rows) :: tl) - = (k', rows) :: insertInto k row tl := by - show (if k = k' then (k', row :: rows) :: tl - else (k', rows) :: 
insertInto k row tl) - = (k', rows) :: insertInto k row tl - rw [if_neg hEq] - rw [hOut] at hMem + · rw [insertInto_skip _ _ _ _ _ hEq] at hMem rcases List.mem_cons.mp hMem with hHead | hTail · subst hHead; exact hK' · exact ih hTl g hTail @@ -323,18 +350,12 @@ private theorem totalRows_insertInto obtain ⟨k', rows⟩ := head by_cases hEq : k = k' · subst hEq - show totalRows (if k = k then (k, row :: rows) :: tl - else (k, rows) :: insertInto k row tl) - = totalRows ((k, rows) :: tl) + 1 - rw [if_pos rfl] + rw [insertInto_match] show (row :: rows).length + totalRows tl = rows.length + totalRows tl + 1 simp [List.length_cons] omega - · show totalRows (if k = k' then (k', row :: rows) :: tl - else (k', rows) :: insertInto k row tl) - = totalRows ((k', rows) :: tl) + 1 - rw [if_neg hEq] + · rw [insertInto_skip _ _ _ _ _ hEq] show rows.length + totalRows (insertInto k row tl) = rows.length + totalRows tl + 1 rw [ih] @@ -350,20 +371,16 @@ private theorem totalRows_insertIntoDistinct | cons head tl ih => obtain ⟨k', rows⟩ := head by_cases hEq : Datum.groupKeyEq k k' = true - · show totalRows (if Datum.groupKeyEq k k' - then (k', row :: rows) :: tl - else (k', rows) :: insertIntoDistinct k row tl) - = totalRows ((k', rows) :: tl) + 1 - rw [if_pos hEq] + · rw [insertIntoDistinct_match _ _ _ _ _ hEq] show (row :: rows).length + totalRows tl = rows.length + totalRows tl + 1 simp [List.length_cons] omega - · show totalRows (if Datum.groupKeyEq k k' - then (k', row :: rows) :: tl - else (k', rows) :: insertIntoDistinct k row tl) - = totalRows ((k', rows) :: tl) + 1 - rw [if_neg hEq] + · have hFalse : Datum.groupKeyEq k k' = false := by + cases h : Datum.groupKeyEq k k' with + | true => exact absurd h hEq + | false => rfl + rw [insertIntoDistinct_skip _ _ _ _ _ hFalse] show rows.length + totalRows (insertIntoDistinct k row tl) = rows.length + totalRows tl + 1 rw [ih] From 3c29a5045ef1f9785235bcf11472480a89a01ed5 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 
2026 16:39:12 +0200 Subject: [PATCH 071/127] doc/semantics: errorRows reduction lemmas MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds three named reductions for the foldr-match pattern in `errorRows`: * `errorRows_nil`. * `errorRows_cons_err`: head row errs with payload `e` → prepend `e`. * `errorRows_cons_non_err`: head row is non-err → recurse on tail. Refactors `errorRows_eq_nil_of_all_true` and `errorRows_eq_nil_of_no_err` to cite the non-err lemma instead of opening the foldr-match by hand. The remaining inline `show (match eval hd pred with ...) = ...` patterns now live only in the reduction-lemma definitions themselves. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/ErrStream.lean | 44 +++++++++++++++-------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/doc/developer/semantics/Mz/ErrStream.lean b/doc/developer/semantics/Mz/ErrStream.lean index 846f0e5d57b6b..3e4dae99fc9b1 100644 --- a/doc/developer/semantics/Mz/ErrStream.lean +++ b/doc/developer/semantics/Mz/ErrStream.lean @@ -47,6 +47,33 @@ def errorRows (pred : Expr) (rel : Relation) : List EvalError := | .err e => e :: acc | _ => acc) [] +/-! ### `errorRows` reduction lemmas + +Named per-shape reductions so downstream proofs cite a single +lemma instead of unfolding the `foldr`-match inline. 
-/ + +theorem errorRows_nil (pred : Expr) : errorRows pred [] = [] := rfl + +theorem errorRows_cons_err (pred : Expr) (row : Row) (e : EvalError) + (tl : Relation) (h : eval row pred = .err e) : + errorRows pred (row :: tl) = e :: errorRows pred tl := by + show (match eval row pred with + | .err e => e :: errorRows pred tl + | _ => errorRows pred tl) = e :: errorRows pred tl + rw [h] + +theorem errorRows_cons_non_err (pred : Expr) (row : Row) (tl : Relation) + (h : ¬(eval row pred).IsErr) : + errorRows pred (row :: tl) = errorRows pred tl := by + show (match eval row pred with + | .err e => e :: errorRows pred tl + | _ => errorRows pred tl) = errorRows pred tl + cases hEval : eval row pred with + | bool _ => rfl + | int _ => rfl + | null => rfl + | err _ => rw [hEval] at h; exact absurd trivial h + /-- Error-aware filter. Rows whose predicate evaluates to `.bool true` stay in the data collection; rows whose predicate evaluates to `.err` contribute their payload to the error collection; @@ -93,10 +120,7 @@ theorem errorRows_eq_nil_of_all_true (pred : Expr) (rel : Relation) have hd_eval : eval hd pred = .bool true := h hd (List.Mem.head tl) have htl : ∀ row ∈ tl, eval row pred = .bool true := fun row h_mem => h row (List.Mem.tail hd h_mem) - show (match eval hd pred with - | .err e => e :: errorRows pred tl - | _ => errorRows pred tl) = [] - rw [hd_eval] + rw [errorRows_cons_non_err _ _ _ (by rw [hd_eval]; intro hRes; cases hRes)] exact ih htl /-- `errorRows` of a filtered relation is empty: the survivors all @@ -148,16 +172,8 @@ theorem errorRows_eq_nil_of_no_err have hHd : ¬(eval hd pred).IsErr := h hd List.mem_cons_self have hTl : ∀ row ∈ tl, ¬(eval row pred).IsErr := fun row hMem => h row (List.mem_cons_of_mem _ hMem) - show (match eval hd pred with - | .err e => e :: errorRows pred tl - | _ => errorRows pred tl) = [] - cases h_eval : eval hd pred with - | bool _ => exact ih hTl - | int _ => exact ih hTl - | null => exact ih hTl - | err _ => - rw [h_eval] at hHd 
- exact absurd (show True by trivial) hHd + rw [errorRows_cons_non_err _ _ _ hHd] + exact ih hTl /-- Full commutativity of `BagStream.filter` under a no-error precondition: when neither predicate errs on any row of the input From 7d50459effb804a7cf96765eb8d31c5e006dc42d Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 16:43:31 +0200 Subject: [PATCH 072/127] doc/semantics: INTERSECT ALL via min combinator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `DiffWithError.min` combinator (the bag-min on diffs with `.error` absorbing), supporting laws (`error_min_left`, `error_min_right`, `min_val_val`), `UnifiedStream.lookup` (per-carrier diff lookup in a consolidated stream), and the headline `UnifiedStream.intersectAll`. `intersectAll l r`: * Consolidates both inputs. * Per left-carrier: emit `(uc, min(L_diff, R_diff))` if the carrier exists in the right's consolidate; otherwise drop. Output: signed-diff bag intersection. Theorem `intersectAll_length_le` (≤ left.length) — at most one output record per consolidated left carrier. Closes the deferred roadmap item that previously cited "needs a per-carrier min combinator not derivable from +, *, -". The combinator now exists; the operator builds on it. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/DiffSemiring.lean | 21 ++++++++++ doc/developer/semantics/Mz/SetOps.lean | 43 ++++++++++++++++++++ doc/developer/semantics/README.md | 6 +-- 3 files changed, 67 insertions(+), 3 deletions(-) diff --git a/doc/developer/semantics/Mz/DiffSemiring.lean b/doc/developer/semantics/Mz/DiffSemiring.lean index 56ce847cbc3f1..aa96e2e227713 100644 --- a/doc/developer/semantics/Mz/DiffSemiring.lean +++ b/doc/developer/semantics/Mz/DiffSemiring.lean @@ -59,6 +59,15 @@ def neg [Neg α] : DiffWithError α → DiffWithError α | .error => .error | .val x => .val (-x) +/-- Per-pair minimum. 
`error` absorbs (a collection-scoped error +in either input forces an error output); on `val`s, the underlying +type's `min` is taken. Required for bag-intersection: the bag-min +combinator at the diff level. -/ +def min [Min α] : DiffWithError α → DiffWithError α → DiffWithError α + | .error, _ => .error + | _, .error => .error + | .val x, .val y => .val (Min.min x y) + instance [Add α] : Add (DiffWithError α) := ⟨add⟩ instance [Mul α] : Mul (DiffWithError α) := ⟨mul⟩ instance [Neg α] : Neg (DiffWithError α) := ⟨neg⟩ @@ -92,6 +101,18 @@ theorem error_mul_right [Mul α] (x : DiffWithError α) : | val _ => rfl | error => rfl +theorem error_min_left [Min α] (y : DiffWithError α) : + DiffWithError.min (error : DiffWithError α) y = error := rfl + +theorem error_min_right [Min α] (x : DiffWithError α) : + DiffWithError.min x (error : DiffWithError α) = error := by + cases x with + | val _ => rfl + | error => rfl + +theorem min_val_val [Min α] (x y : α) : + DiffWithError.min (val x : DiffWithError α) (val y) = val (Min.min x y) := rfl + /-! ## Commutativity / associativity of `+` (when the base has them) -/ theorem add_comm [Add α] (h_comm : ∀ x y : α, x + y = y + x) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 5061ce091116c..3b7c419035f6e 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -788,6 +788,49 @@ theorem UnifiedStream.exceptAll_nil_left (r : UnifiedStream) : rw [UnifiedStream.unionAll_nil_left] exact (UnifiedStream.negate_consolidate r).symm +/-! ## `INTERSECT ALL` + +Bag intersection on the diff-aware stream. Implementation: +consolidate both inputs, then for each carrier in the left, +look it up in the right. Output: `(carrier, min(L, R))` when +the carrier is present in both, otherwise omit. 
The min +combinator `DiffWithError.min` carries `.error` absorption: a +collection-scoped error in either input gives an `.error` output +for that carrier (provided the carrier exists in both). -/ + +/-- Look up the diff for `uc` in a consolidated stream. Returns +`none` when the carrier is absent. The skeleton's consolidate +keeps each carrier at most once, so the first match is the only +match. -/ +def UnifiedStream.lookup (uc : UnifiedRow) : + UnifiedStream → Option (DiffWithError Int) + | [] => none + | (uc', d) :: rest => if uc = uc' then some d + else UnifiedStream.lookup uc rest + +theorem UnifiedStream.lookup_nil (uc : UnifiedRow) : + UnifiedStream.lookup uc [] = none := rfl + +/-- `INTERSECT ALL` on `UnifiedStream`. Consolidates both sides, +then per left-carrier emits `(uc, min(L_diff, R_diff))` if the +carrier exists in the right's consolidate; otherwise drops it. -/ +def UnifiedStream.intersectAll (l r : UnifiedStream) : UnifiedStream := + let rCons := UnifiedStream.consolidate r + (UnifiedStream.consolidate l).filterMap fun ud => + match UnifiedStream.lookup ud.1 rCons with + | none => none + | some d' => some (ud.1, DiffWithError.min ud.2 d') + +/-- Length bound: at most `l.length`, via the consolidated length of `l`. -/ +theorem UnifiedStream.intersectAll_length_le (l r : UnifiedStream) : + (UnifiedStream.intersectAll l r).length ≤ l.length := by + have h1 : (UnifiedStream.intersectAll l r).length + ≤ (UnifiedStream.consolidate l).length := by + show ((UnifiedStream.consolidate l).filterMap _).length + ≤ (UnifiedStream.consolidate l).length + exact List.length_filterMap_le _ _ + exact Nat.le_trans h1 (UnifiedStream.consolidate_length_le l) + /-- `bagExceptAll [] r = clampPositive (negate (consolidate r))`.
With an all-`.val` `r`, the negation makes every diff non-positive, which `clampPositive` then drops, yielding the spec-correct empty diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index f995e496ab510..3ee0408d78c64 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -36,7 +36,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `Expr.colReferencesUnused n e` (mutual with `Expr.argsColRefUnused`) returns `true` when `e` never reads column `n`. Companion `Env.replaceAt env n v` swaps a single position. Headline `eval_replaceAt_of_unused`: when column `n` is unused, replacing its value does not change eval. Supports column-pruning rewrites: a projection that drops unused columns is sound. Supporting `Env.get_replaceAt_eq` and `Env.get_replaceAt_ne` discharge the per-column reductions. `Expr.colReferencesUnused_of_bounded` (mutual with the operand-list version) bridges the two analyzers: a predicate bounded by `n` has every column `i ≥ n` unused. Lets the optimizer derive column-pruning consequences from a tight bound. Supporting environment lemmas: `Env.get_append_left` (read from prefix) and `Env.get_append_right` (read from suffix with index shift). -* `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `-`, `0`, `1` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity / negation laws that downstream operators must respect. Negation laws (`neg_error`, `neg_val`, `neg_neg_val`, `val_add_neg_val`, `neg_mul`, `mul_neg`) carry the principle that `.error` is unrecoverable — a collection-scoped error cannot be subtracted away, and that negation distributes through multiplication on both sides. 
The `_int` specializations (`add_comm_int`, `add_assoc_int`, `mul_assoc_int`, `mul_comm_int`, `mul_add_int`, `neg_neg_int`, `val_add_neg_val_int`, `neg_mul_int`, `mul_neg_int`) discharge the base hypotheses at `Int` so downstream code in `Mz/Join.lean`, `Mz/UnifiedConsolidate.lean`, and `Mz/SetOps.lean` can cite the named laws directly. +* `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `-`, `0`, `1` instances and a `min` combinator over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity / negation / min laws that downstream operators must respect. Negation laws (`neg_error`, `neg_val`, `neg_neg_val`, `val_add_neg_val`, `neg_mul`, `mul_neg`, `neg_add`) carry the principle that `.error` is unrecoverable — a collection-scoped error cannot be subtracted away, and that negation distributes through addition and multiplication. Min laws (`error_min_left`, `error_min_right`, `min_val_val`) define the bag-min combinator with `.error` absorbing. The `_int` specializations discharge the base hypotheses at `Int` so downstream code in `Mz/Join.lean`, `Mz/UnifiedConsolidate.lean`, and `Mz/SetOps.lean` can cite the named laws directly. * `Mz/UnifiedStream.lean`: unified diff-aware alternative to `BagStream`. `UnifiedStream := List (UnifiedRow × DiffWithError Int)` pairs a carrier (data row or row-scoped err) with a differential-dataflow diff augmented by the absorbing `error` element. Row-scoped errors flow through the carrier; collection-scoped errors flow through diff multiplication / addition. `ofBag` / `split` conversions assign every bag record a diff of `.val 1`; the round-trip theorem `split (ofBag s) = s` holds. The cross-direction is exact only up to multiset equality on `List EvalError` and is lossy for diffs ≠ `.val 1` (split drops diff information).
`UnifiedStream.project` lifts `BagStream.project` to the diff-aware carrier. Records with `.error` diff or `.err` carrier pass through unchanged; a `.row r` record with `.val n` diff is evaluated against `es` — if every scalar succeeds, the row is emitted with diff `.val n`; if any scalar errs, one `(.err e, .val n)` is emitted per erroring scalar (each preserving the original multiplicity). Theorems: `project_preserves_error_diff` (an `.error` diff in the input always reaches the output), `project_no_error` (all-`.val` inputs yield all-`.val` outputs), `project_nil_es` (empty projection list collapses every row to width-zero), `project_nil_stream` (empty stream is empty). * `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. `aggTry` for the proposed `try_sum`/`try_min`/`try_max` variants that swallow `err` into `NULL` instead of propagating, defined as a post-pass on `aggStrict`. Theorems: `aggStrict_err` (any `err` input → `err` output), `aggStrict_no_err` (no-err inputs + no-err reducer → no-err output), `aggTry_no_err` (the non-strict variant never errors), and `aggTry_eq_aggStrict_of_no_err` (strict and non-strict agree on error-free inputs). @@ -59,7 +59,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four Distributivity over `unionAll`: `filter_unionAll`, `cross_unionAll_left`, `project_unionAll`, `negate_unionAll`. Each follows from `List.flatMap_append` / `List.map_append` — `flatMap`- and `map`-based operators distribute over concatenation. Lets the optimizer pull a `UNION ALL` tail out of a pipeline and plan per-branch. 
`cross_negate_left` proves that negating the left input of a cross product equals negating the cross output: the diff-semiring law `(-a) * b = -(a * b)` (`DiffWithError.neg_mul_int`) carries the proof since `combineCarrier` is unchanged by negation and only the diff arithmetic flips. `negate_consolidate` proves that negation commutes with consolidation: `negate (consolidate us) = consolidate (negate us)`. The proof recurses via `negate_consolidateInto` (private), which lifts the same property to the single-step insertion. Negation is additive (`neg_add`), so it slides through per-bucket sums. - `INTERSECT ALL` requires a per-carrier `min` combinator not yet exposed by `DiffWithError`. Deferred. + `intersectAll l r` realizes bag-intersection via lookup: consolidate both inputs, then per left-carrier emit `(uc, min(L_diff, R_diff))` if the carrier exists in the right's consolidate, else drop. Supported by `UnifiedStream.lookup` (return the diff for a carrier in a consolidated stream, or `none`) and `DiffWithError.min` (the bag-min combinator with `.error` absorbing). Theorem: `intersectAll_length_le` (≤ left.length). * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. `groupByErrDistinct keyExpr rel` uses the spec-faithful `Datum.groupKeyEq`, which returns `false` whenever either side is `.err`, so every err key produces its own singleton group. @@ -125,6 +125,6 @@ The diff-semiring extension is in scope: `UnifiedStream` records carry `(Unified ### Material expansions -* `INTERSECT ALL` on `UnifiedStream`: requires a per-carrier `min` combinator over `DiffWithError Int`. The combinator is not derivable from `+`, `*`, `-` alone — landing it requires either a new diff primitive or a bucketing operator that materializes per-carrier multiplicities from both inputs. 
+* Strengthening `intersectAll` correctness theorems. Length bound shipped; remaining: error preservation per side (`.error` diff for a carrier present in both inputs survives), no-error preservation, bag-semantics positive-clamp (parallel to `bagExceptAll`). * `distinct` is in scope; remaining: stronger correctness theorems (idempotence `distinct ∘ distinct = distinct`, agreement with the carrier-set view, no-error preservation on `.val` inputs). * Cross-link the spec doc (`../design/20260517_error_handling_semantics.md`) to specific theorem names via `[Mz/...:thm]` cross-references. From 10bef2aac8475485b239867f6491fd83ca7d94e7 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 16:45:53 +0200 Subject: [PATCH 073/127] doc/semantics: intersectAll left-side error preservation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two theorems strengthening `intersectAll`: * `lookup_isSome_of_mem`: if `(uc, d) ∈ us` then `lookup uc us = some d'` for some `d'` (the returned diff need not be the input one — `lookup` does not depend on consolidate's uniqueness invariant). * `intersectAll_preserves_error_diff_left`: `.error` diff in the left input survives, provided the carrier also appears (with any diff) in the right input. The min combinator's left-absorbing property `error_min_left` carries the proof. A right-side counterpart is deferred: it requires `lookup` to return the `.error` *specifically* when `(uc, .error) ∈ consolidate r`, which needs a consolidate-key-uniqueness lemma not yet exposed. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 55 ++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 3b7c419035f6e..f0fec932d4b0e 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -811,6 +811,32 @@ def UnifiedStream.lookup (uc : UnifiedRow) : theorem UnifiedStream.lookup_nil (uc : UnifiedRow) : UnifiedStream.lookup uc [] = none := rfl +/-- If a carrier is in the stream (with some diff), the lookup +returns `some` value. The returned diff need not be the input one +(consolidated streams keep each carrier once, but `lookup` does +not depend on that invariant). -/ +theorem UnifiedStream.lookup_isSome_of_mem + (uc : UnifiedRow) (us : UnifiedStream) + (h : ∃ d, (uc, d) ∈ us) : + ∃ d, UnifiedStream.lookup uc us = some d := by + induction us with + | nil => obtain ⟨_, hMem⟩ := h; exact absurd hMem List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc', d'⟩ := hd + by_cases hEq : uc = uc' + · refine ⟨d', ?_⟩ + show (if uc = uc' then some d' else UnifiedStream.lookup uc tl) = some d' + rw [if_pos hEq] + · have hTl : ∃ d, (uc, d) ∈ tl := by + obtain ⟨d, hMem⟩ := h + rcases List.mem_cons.mp hMem with hHead | hTail + · exact absurd ((Prod.mk.injEq _ _ _ _).mp hHead).1 hEq + · exact ⟨d, hTail⟩ + obtain ⟨d, hLookup⟩ := ih hTl + refine ⟨d, ?_⟩ + show (if uc = uc' then some d' else UnifiedStream.lookup uc tl) = some d + rw [if_neg hEq]; exact hLookup + /-- `INTERSECT ALL` on `UnifiedStream`. Consolidates both sides, then per left-carrier emits `(uc, min(L_diff, R_diff))` if the carrier exists in the right's consolidate; otherwise drops it. 
-/ @@ -831,6 +857,35 @@ theorem UnifiedStream.intersectAll_length_le (l r : UnifiedStream) : exact List.length_filterMap_le _ _ exact Nat.le_trans h1 (UnifiedStream.consolidate_length_le l) +/-- `.error` diff in the left input survives `intersectAll`, +provided the same carrier also appears (with any diff) in the +right input. The min combinator's left-absorbing property +(`error_min_left`) ensures the output diff is `.error`. -/ +theorem UnifiedStream.intersectAll_preserves_error_diff_left + (l r : UnifiedStream) (uc : UnifiedRow) + (hL : (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.consolidate l) + (hR : ∃ d, (uc, d) ∈ UnifiedStream.consolidate r) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.intersectAll l r := by + obtain ⟨d', hLookup⟩ := + UnifiedStream.lookup_isSome_of_mem uc _ hR + show (uc, DiffWithError.error) + ∈ (UnifiedStream.consolidate l).filterMap _ + rw [List.mem_filterMap] + refine ⟨(uc, DiffWithError.error), hL, ?_⟩ + show (match UnifiedStream.lookup uc (UnifiedStream.consolidate r) with + | none => none + | some d' => some (uc, DiffWithError.min DiffWithError.error d')) + = some (uc, DiffWithError.error) + rw [hLookup] + rfl + +-- A right-side counterpart `intersectAll_preserves_error_diff_right` +-- needs `lookup` to find the `.error` *specifically* when the +-- carrier has `.error` in `consolidate r`. That requires +-- consolidate-key-uniqueness as a lemma. Deferred. + /-- `bagExceptAll [] r = clampPositive (negate (consolidate r))`. 
With an all-`.val` `r`, the negation makes every diff non-positive, which `clampPositive` then drops, yielding the spec-correct empty From fe355528d9d847aed70dfbd2a2463be6435976ee Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 16:51:43 +0200 Subject: [PATCH 074/127] doc/semantics: consolidate carrier uniqueness + right-side intersect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the foundational invariant `consolidate_noDup`: every output of `UnifiedStream.consolidate` is a `NoDupCarriers` list (`Pairwise` on first-component inequality — each carrier appears at most once). Supporting structure: * `NoDupCarriers us` defined as `us.Pairwise (·.1 ≠ ·.1)`. * `mem_consolidateInto_carrier`: characterizes carriers in the output of a single insert (either the new `uc` or an old tail-carrier). * `consolidateInto_preserves_noDup` (with fresh-key precondition) and `_general` (without): the bucket update preserves the uniqueness invariant. Downstream consequences: * `lookup_eq_of_mem_noDup`: when carriers are unique, `lookup` returns the exact diff (not some other entry's diff). * `intersectAll_preserves_error_diff_right`: `.error` diff in the right input survives, because `lookup` finds the *exact* `.error` diff via `consolidate_noDup`, and right-min absorption (`error_min_right`) yields the `.error` output. Closes the deferred `_right` counterpart to `intersectAll_preserves_error_diff_left`. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 55 ++++++- .../semantics/Mz/UnifiedConsolidate.lean | 141 ++++++++++++++++++ doc/developer/semantics/README.md | 3 +- 3 files changed, 194 insertions(+), 5 deletions(-) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index f0fec932d4b0e..ea7fb56959ae0 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -881,10 +881,57 @@ theorem UnifiedStream.intersectAll_preserves_error_diff_left rw [hLookup] rfl --- A right-side counterpart `intersectAll_preserves_error_diff_right` --- needs `lookup` to find the `.error` *specifically* when the --- carrier has `.error` in `consolidate r`. That requires --- consolidate-key-uniqueness as a lemma. Deferred. +/-- When the list has no duplicate carriers, `lookup` returns the +exact diff associated with the membership witness. -/ +theorem UnifiedStream.lookup_eq_of_mem_noDup + {uc : UnifiedRow} {d : DiffWithError Int} {us : UnifiedStream} + (h_mem : (uc, d) ∈ us) (h_noDup : UnifiedStream.NoDupCarriers us) : + UnifiedStream.lookup uc us = some d := by + induction us with + | nil => exact absurd h_mem List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc', d'⟩ := hd + obtain ⟨hHead, hTl⟩ := List.pairwise_cons.mp h_noDup + rcases List.mem_cons.mp h_mem with hEq | hTail + · have hUc : uc = uc' := (Prod.mk.injEq _ _ _ _).mp hEq |>.1 + have hD : d = d' := (Prod.mk.injEq _ _ _ _).mp hEq |>.2 + subst hUc; subst hD + show (if uc = uc then some d else UnifiedStream.lookup uc tl) = some d + rw [if_pos rfl] + · have hNe : uc ≠ uc' := (hHead (uc, d) hTail).symm + show (if uc = uc' then some d' else UnifiedStream.lookup uc tl) = some d + rw [if_neg hNe] + exact ih hTail hTl + +/-- Symmetric: `.error` diff in the right input survives, provided +the same carrier also appears (with any diff) in the left. 
The +right-min-absorbing property carries the proof, using the no-dup +property of `consolidate r` to extract the exact `.error` diff. -/ +theorem UnifiedStream.intersectAll_preserves_error_diff_right + (l r : UnifiedStream) (uc : UnifiedRow) + (hL : ∃ d, (uc, d) ∈ UnifiedStream.consolidate l) + (hR : (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.consolidate r) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.intersectAll l r := by + obtain ⟨dL, hLMem⟩ := hL + have h_lookup_err : + UnifiedStream.lookup uc (UnifiedStream.consolidate r) + = some DiffWithError.error := + UnifiedStream.lookup_eq_of_mem_noDup hR + (UnifiedStream.consolidate_noDup r) + show (uc, DiffWithError.error) + ∈ (UnifiedStream.consolidate l).filterMap _ + rw [List.mem_filterMap] + refine ⟨(uc, dL), hLMem, ?_⟩ + show (match UnifiedStream.lookup uc (UnifiedStream.consolidate r) with + | none => none + | some d' => some (uc, DiffWithError.min dL d')) + = some (uc, DiffWithError.error) + rw [h_lookup_err] + show some (uc, DiffWithError.min dL DiffWithError.error) + = some (uc, DiffWithError.error) + rw [DiffWithError.error_min_right] /-- `bagExceptAll [] r = clampPositive (negate (consolidate r))`. With an all-`.val` `r`, the negation makes every diff non-positive, diff --git a/doc/developer/semantics/Mz/UnifiedConsolidate.lean b/doc/developer/semantics/Mz/UnifiedConsolidate.lean index 212fa0ccbab6d..a3f2b3e2734d7 100644 --- a/doc/developer/semantics/Mz/UnifiedConsolidate.lean +++ b/doc/developer/semantics/Mz/UnifiedConsolidate.lean @@ -283,6 +283,147 @@ theorem UnifiedStream.consolidate_strict_length_dup show (UnifiedStream.consolidate ((uc, d') :: rest)).length ≤ rest.length + 1 exact this +/-! ## Carrier uniqueness + +`consolidate` collapses duplicate carriers into single buckets, so +its output has each carrier at most once. Formalized as +`NoDupCarriers`: a `Pairwise` predicate stating that any two +records have distinct first components. 
+ +`lookup_eq_of_mem_noDup` then connects uniqueness to `lookup`: +when carriers are unique, `lookup uc us` returns the exact diff +of `(uc, d) ∈ us` (not some other entry's diff). -/ + +/-- Each carrier appears at most once in the list. -/ +def UnifiedStream.NoDupCarriers (us : UnifiedStream) : Prop := + us.Pairwise (fun a b => a.1 ≠ b.1) + +theorem UnifiedStream.NoDupCarriers.nil : + UnifiedStream.NoDupCarriers [] := List.Pairwise.nil + +/-- The carriers appearing in `consolidateInto uc d us` are +contained in `{uc} ∪ carriers(us)`. Used by +`consolidateInto_preserves_noDup` to argue the new head's +distinctness from the recursive tail. -/ +private theorem mem_consolidateInto_carrier + (uc : UnifiedRow) (d : DiffWithError Int) (us : UnifiedStream) + (x : UnifiedRow × DiffWithError Int) + (h : x ∈ consolidateInto uc d us) : + x.1 = uc ∨ ∃ d'', (x.1, d'') ∈ us := by + obtain ⟨xu, xd⟩ := x + induction us with + | nil => + have hEq : (xu, xd) = (uc, d) := List.mem_singleton.mp h + have hUc : xu = uc := (Prod.mk.injEq _ _ _ _).mp hEq |>.1 + exact Or.inl hUc + | cons hd tl ih => + obtain ⟨uc', d'⟩ := hd + by_cases hEq : uc = uc' + · subst hEq + rw [consolidateInto_match] at h + rcases List.mem_cons.mp h with hHead | hTail + · refine Or.inr ⟨d', ?_⟩ + have : xu = uc := (Prod.mk.injEq _ _ _ _).mp hHead |>.1 + rw [this] + exact List.mem_cons_self + · exact Or.inr ⟨xd, List.mem_cons_of_mem _ hTail⟩ + · rw [consolidateInto_skip _ _ _ _ _ hEq] at h + rcases List.mem_cons.mp h with hHead | hTail + · refine Or.inr ⟨d', ?_⟩ + have : xu = uc' := (Prod.mk.injEq _ _ _ _).mp hHead |>.1 + rw [this] + exact List.mem_cons_self + · rcases ih hTail with hIs_uc | ⟨d'', hMem⟩ + · exact Or.inl hIs_uc + · exact Or.inr ⟨d'', List.mem_cons_of_mem _ hMem⟩ + +/-- Inserting a record into a list whose carriers are all unique +preserves uniqueness. 
Under the fresh-key hypothesis `h_fresh` no existing +carrier matches, so every step skips and the record lands at the end, a +carrier distinct from every existing one. -/ +private theorem consolidateInto_preserves_noDup + (uc : UnifiedRow) (d : DiffWithError Int) (us : UnifiedStream) + (h_noDup : UnifiedStream.NoDupCarriers us) + (h_fresh : ∀ d'', (uc, d'') ∉ us) : + UnifiedStream.NoDupCarriers (consolidateInto uc d us) := by + -- The fresh-key precondition rules out the match branch: + -- `uc` differs from every carrier, so each step takes the skip + -- branch. The matching-carrier case is covered separately by + -- `consolidateInto_preserves_noDup_general`. + induction us with + | nil => + show List.Pairwise _ [(uc, d)] + exact List.Pairwise.cons (fun _ hMem => absurd hMem List.not_mem_nil) List.Pairwise.nil + | cons hd tl ih => + obtain ⟨uc', d'⟩ := hd + obtain ⟨hHead, hTl⟩ := List.pairwise_cons.mp h_noDup + -- The fresh precondition `∀ d'', (uc, d'') ∉ (uc', d') :: tl` + -- gives `uc ≠ uc'`. + have hUcNe : uc ≠ uc' := by + intro hEq + exact h_fresh d' (by rw [hEq]; exact List.mem_cons_self) + have hFreshTl : ∀ d'', (uc, d'') ∉ tl := + fun d'' hMem => h_fresh d'' (List.mem_cons_of_mem _ hMem) + rw [consolidateInto_skip _ _ _ _ _ hUcNe] + -- New head's distinctness: uc' ≠ x.1 for all x in + -- consolidateInto uc d tl. + have hHead' : ∀ x ∈ consolidateInto uc d tl, uc' ≠ x.1 := by + intro x hMem + rcases mem_consolidateInto_carrier uc d tl x hMem with hUc | ⟨d'', hMem'⟩ + · exact hUc ▸ fun h => hUcNe h.symm + · exact hHead (x.1, d'') hMem' + exact List.Pairwise.cons hHead' (ih hTl hFreshTl) + +/-- Same as above but without the fresh-key precondition: the +match branch handles the not-fresh case (the existing bucket +gets updated, list shape preserved).
-/ +private theorem consolidateInto_preserves_noDup_general + (uc : UnifiedRow) (d : DiffWithError Int) (us : UnifiedStream) + (h_noDup : UnifiedStream.NoDupCarriers us) : + UnifiedStream.NoDupCarriers (consolidateInto uc d us) := by + by_cases h_exists : ∃ d'', (uc, d'') ∈ us + · -- `uc` already occurs in `us`, so some bucket along the list matches. + -- Induct on `us` to reach that bucket. + induction us with + | nil => obtain ⟨_, hMem⟩ := h_exists; exact absurd hMem List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc', d'⟩ := hd + obtain ⟨hHead, hTl⟩ := List.pairwise_cons.mp h_noDup + by_cases hEq : uc = uc' + · subst hEq + rw [consolidateInto_match] + exact List.Pairwise.cons hHead hTl + · rw [consolidateInto_skip _ _ _ _ _ hEq] + have hExistsTl : ∃ d'', (uc, d'') ∈ tl := by + obtain ⟨d'', hMem⟩ := h_exists + refine ⟨d'', ?_⟩ + rcases List.mem_cons.mp hMem with hH | hT + · exact absurd ((Prod.mk.injEq _ _ _ _).mp hH).1 hEq + · exact hT + have hHead' : ∀ x ∈ consolidateInto uc d tl, uc' ≠ x.1 := by + intro x hMem_x + rcases mem_consolidateInto_carrier uc d tl x hMem_x with hUc | ⟨d''', hMem_x'⟩ + · exact hUc ▸ fun h => hEq h.symm + · exact hHead (x.1, d''') hMem_x' + exact List.Pairwise.cons hHead' (ih hTl hExistsTl) + · -- uc is fresh; use the fresh-precondition variant. + have h_fresh : ∀ d'', (uc, d'') ∉ us := by + intro d'' hMem + exact h_exists ⟨d'', hMem⟩ + exact consolidateInto_preserves_noDup uc d us h_noDup h_fresh + +/-- The headline: `consolidate` always produces a list with unique +carriers. -/ +theorem UnifiedStream.consolidate_noDup (us : UnifiedStream) : + UnifiedStream.NoDupCarriers (UnifiedStream.consolidate us) := by + induction us with + | nil => exact UnifiedStream.NoDupCarriers.nil + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + show UnifiedStream.NoDupCarriers + (consolidateInto uc d (UnifiedStream.consolidate tl)) + exact consolidateInto_preserves_noDup_general uc d _ ih + /-!
## No-error preservation If every input diff is a `.val`, every output diff is a `.val`. diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 3ee0408d78c64..6cece6b05e2b4 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -47,6 +47,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four *cardinality* — `consolidate_length_le` bounds the output by the input length (consolidation only merges, never expands); *no-error preservation* — `consolidate_no_error` proves that if every input diff is a `.val`, every output diff is a `.val`, so `.error` is the only source of absorption. *strict shrinkage* — `consolidate_strict_length_dup` proves that two adjacent records sharing a carrier compress to one in the output: `(consolidate ((uc, d) :: (uc, d') :: rest)).length ≤ rest.length + 1`, strictly less than the input's `rest.length + 2`. + *carrier uniqueness* — `consolidate_noDup` proves that `consolidate` always produces a `NoDupCarriers` list (`Pairwise` on first-component inequality). Each carrier appears at most once. Supporting `consolidateInto_preserves_noDup` (and a `_general` variant that handles both fresh-key and matching-key cases) carries the invariant through single-step inserts. * `Mz/Triple.lean`: collection-wide and per-time *flat* consolidation views on `TimedUnifiedStream`. `consolidateAll` sums every diff in the stream; `consolidateAtTimeFlat t` sums every diff at time `t`. Both ignore the carrier — they collapse a time slice (or the whole stream) to one `DiffWithError Int`. Absorption: `consolidateAll_eq_error_of_mem` and `consolidateAtTimeFlat_eq_error_of_mem`. Complementary to `Mz/TimedConsolidate.lean`'s `consolidateAtTime t`, which buckets per `(row, time)` and returns a `UnifiedStream`. * `Mz/Join.lean`: relational joins on the diff-aware `UnifiedStream`. 
`cross` is the cartesian product — carriers combine via `combineCarrier` (rows concatenate; err on either side wins, left first), diffs multiply through `DiffWithError`'s `Mul` instance. A `.error` diff on either input therefore absorbs to `.error` on the output via `DiffWithError.error_mul_{left,right}`. `join pred l r` filters the product through a join predicate. Theorems: `cross_length` (`l.length * r.length`), `filter_length_le` (filter is non-expanding), `join_length_le` (corollary). Diff-propagation theorems: `cross_diff_error_{left,right}` (a `.error` diff on either side propagates through every output record), `filter_preserves_error_diff` (a record carrying `.error` diff is never dropped by `filter` — the absorbing marker cannot be filtered away). No-error preservation: `cross_no_error` and `filter_no_error` prove that all-`.val` input diffs yield all-`.val` output diffs, so `.error` is the only source of absorbing diffs in the joint output. Algebraic laws: `combineCarrier_assoc` (carrier combine is associative modulo `List.append_assoc`) and the headline `UnifiedStream.cross_assoc` (`(a × b) × c = a × (b × c)`). The proof rearranges nested `flatMap` / `map` via local list-monad lemmas and closes via `DiffWithError.mul_assoc` plus `combineCarrier_assoc`. * `Mz/SetOps.lean`: set operations on `UnifiedStream`. `unionAll = (++)` concatenates two streams record-wise; theorems cover length (sum), associativity, nil identities, and error / no-error preservation from each input (`unionAll_preserves_error_diff_left`, `unionAll_preserves_error_diff_right`, `unionAll_no_error`). `union = consolidate ∘ unionAll` derives the set-semantics flavor; theorems lift the consolidation guarantees to `union` (`union_length_le`, `union_preserves_error_diff_left`, `union_preserves_error_diff_right`, `union_no_error`, `union_nil_left`, `union_nil_right`). 
@@ -59,7 +60,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four Distributivity over `unionAll`: `filter_unionAll`, `cross_unionAll_left`, `project_unionAll`, `negate_unionAll`. Each follows from `List.flatMap_append` / `List.map_append` — `flatMap`- and `map`-based operators distribute over concatenation. Lets the optimizer pull a `UNION ALL` tail out of a pipeline and plan per-branch. `cross_negate_left` proves that negating the left input of a cross product equals negating the cross output: the diff-semiring law `(-a) * b = -(a * b)` (`DiffWithError.neg_mul_int`) carries the proof since `combineCarrier` is unchanged by negation and only the diff arithmetic flips. `negate_consolidate` proves that negation commutes with consolidation: `negate (consolidate us) = consolidate (negate us)`. The proof recurses via `negate_consolidateInto` (private), which lifts the same property to the single-step insertion. Negation is additive (`neg_add`), so it slides through per-bucket sums. - `intersectAll l r` realizes bag-intersection via lookup: consolidate both inputs, then per left-carrier emit `(uc, min(L_diff, R_diff))` if the carrier exists in the right's consolidate, else drop. Supported by `UnifiedStream.lookup` (return the diff for a carrier in a consolidated stream, or `none`) and `DiffWithError.min` (the bag-min combinator with `.error` absorbing). Theorem: `intersectAll_length_le` (≤ left.length). + `intersectAll l r` realizes bag-intersection via lookup: consolidate both inputs, then per left-carrier emit `(uc, min(L_diff, R_diff))` if the carrier exists in the right's consolidate, else drop. 
Supported by `UnifiedStream.lookup` (return the diff for a carrier in a consolidated stream, or `none`), `UnifiedStream.lookup_isSome_of_mem` (lookup returns `some` when carrier present), `UnifiedStream.lookup_eq_of_mem_noDup` (returns the exact diff when the list has unique carriers), and `DiffWithError.min` (the bag-min combinator with `.error` absorbing). Theorems: `intersectAll_length_le` (≤ left.length), `intersectAll_preserves_error_diff_left`/`_right` (`.error` diff for a carrier present in both inputs survives — left case uses left-min absorption, right case uses the no-dup property of `consolidate r`). * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. `groupByErrDistinct keyExpr rel` uses the spec-faithful `Datum.groupKeyEq`, which returns `false` whenever either side is `.err`, so every err key produces its own singleton group. From 4d89ec8860680b48adea96f4b406cb94732557e4 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 16:53:23 +0200 Subject: [PATCH 075/127] doc/semantics: clampPositive is no-op on clampToOne output Adds `clampPositive_clampToOne`: applying `clampPositive` to a stream that already passed through `clampToOne` is the identity. Every output of `clampToOne` has diff `.val 1` (which satisfies `0 < 1`) or `.error` (which `clampPositive` always keeps). Proof uses `List.filter_eq_self`: a filter is the identity iff the predicate is `true` on every element. Both cases discharge by `rfl` on `isPositiveDiff`. Useful for composing pipelines: a `distinct` followed by a bag clamp (e.g., `bagExceptAll`-style) does not reduce further. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index ea7fb56959ae0..5469ed307b63d 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -634,6 +634,21 @@ theorem UnifiedStream.distinct_only_one_or_error (us : UnifiedStream) : x.2 = DiffWithError.val 1 ∨ x.2 = DiffWithError.error := UnifiedStream.clampToOne_only_one_or_error _ +/-- `clampPositive` is a no-op on the output of `clampToOne`: +every surviving record has `.val 1` (which is positive) or +`.error`, both of which `clampPositive` keeps. -/ +theorem UnifiedStream.clampPositive_clampToOne (us : UnifiedStream) : + UnifiedStream.clampPositive (UnifiedStream.clampToOne us) + = UnifiedStream.clampToOne us := by + unfold UnifiedStream.clampPositive + apply List.filter_eq_self.mpr + intro x hMem + rcases UnifiedStream.clampToOne_only_one_or_error us x hMem with h | h + · show isPositiveDiff x.2 = true + rw [h]; rfl + · show isPositiveDiff x.2 = true + rw [h]; rfl + /-- All-`.val` inputs yield all-`.val` outputs through `distinct`. Combined with `distinct_only_one_or_error`, the surviving diffs are all `.val 1`. -/ From 4b75da83ab330744a8153cb7a9a5e3ade813472b Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 17:01:05 +0200 Subject: [PATCH 076/127] doc/semantics: intersectAll no-error preservation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `mem_of_lookup_eq_some`: the converse of `lookup_isSome_of_mem` — a successful lookup witnesses membership in the searched list. Used to pull a no-error witness from the right input into an `intersectAll` proof. Adds `intersectAll_no_error`: all-`.val` inputs yield all-`.val` outputs through `intersectAll`. 
Each output diff is `.val (Min.min nL nR)` for diffs `nL` (left) and `nR` (right) of matching carriers; `.error` cannot be introduced when neither input carries it. Proof composes `consolidate_no_error` (both sides), `mem_of_lookup_eq_some` (to obtain `d' ∈ consolidate r`), `DiffWithError.min_val_val` (val-min computes `.val (Min.min ...)`). Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 68 ++++++++++++++++++++++++++ doc/developer/semantics/README.md | 2 +- 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 5469ed307b63d..d290dd1801c12 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -896,6 +896,35 @@ theorem UnifiedStream.intersectAll_preserves_error_diff_left rw [hLookup] rfl +/-- Converse direction: a successful `lookup` witnesses +membership. The returned diff is paired with the queried carrier +in the input list. 
-/ +theorem UnifiedStream.mem_of_lookup_eq_some + {uc : UnifiedRow} {d : DiffWithError Int} {us : UnifiedStream} + (h : UnifiedStream.lookup uc us = some d) : + (uc, d) ∈ us := by + induction us with + | nil => exact absurd h (by simp [UnifiedStream.lookup]) + | cons hd tl ih => + obtain ⟨uc', d'⟩ := hd + show (uc, d) ∈ (uc', d') :: tl + by_cases hEq : uc = uc' + · subst hEq + have hRed : UnifiedStream.lookup uc ((uc, d') :: tl) = some d' := by + show (if uc = uc then some d' else UnifiedStream.lookup uc tl) = some d' + rw [if_pos rfl] + rw [hRed] at h + have hEq' : d' = d := by injection h + rw [← hEq'] + exact List.mem_cons_self + · have hRed : UnifiedStream.lookup uc ((uc', d') :: tl) + = UnifiedStream.lookup uc tl := by + show (if uc = uc' then some d' else UnifiedStream.lookup uc tl) + = UnifiedStream.lookup uc tl + rw [if_neg hEq] + rw [hRed] at h + exact List.mem_cons_of_mem _ (ih h) + /-- When the list has no duplicate carriers, `lookup` returns the exact diff associated with the membership witness. -/ theorem UnifiedStream.lookup_eq_of_mem_noDup @@ -948,6 +977,45 @@ theorem UnifiedStream.intersectAll_preserves_error_diff_right = some (uc, DiffWithError.error) rw [DiffWithError.error_min_right] +/-- All-`.val` inputs yield all-`.val` outputs through +`intersectAll`. Each output diff is `.val (Min.min n m)` for some +`n` from the left and `m` from the right. -/ +theorem UnifiedStream.intersectAll_no_error + (l r : UnifiedStream) + (hL : ∀ x ∈ l, ∃ n : Int, x.2 = DiffWithError.val n) + (hR : ∀ x ∈ r, ∃ n : Int, x.2 = DiffWithError.val n) : + ∀ x ∈ UnifiedStream.intersectAll l r, + ∃ n : Int, x.2 = DiffWithError.val n := by + intro x hMem + obtain ⟨ud, hUdMem, hMatch⟩ := + List.mem_filterMap.mp (show x ∈ (UnifiedStream.consolidate l).filterMap _ from hMem) + -- ud ∈ consolidate l → ud.2 = .val nL. + obtain ⟨nL, hNL⟩ := + UnifiedStream.consolidate_no_error l hL ud hUdMem + -- The match in hMatch must resolve to `some` for hMatch to be `some x`. 
+ cases h_lookup : UnifiedStream.lookup ud.1 (UnifiedStream.consolidate r) with + | none => + rw [h_lookup] at hMatch + cases hMatch + | some d' => + rw [h_lookup] at hMatch + -- hMatch : some (ud.1, min ud.2 d') = some x + have hxEq : x = (ud.1, DiffWithError.min ud.2 d') := by + injection hMatch with hPair + exact hPair.symm + -- d' ∈ consolidate r → d' = .val nR. + have hMemR : (ud.1, d') ∈ UnifiedStream.consolidate r := + UnifiedStream.mem_of_lookup_eq_some h_lookup + obtain ⟨nR, hNR⟩ := + UnifiedStream.consolidate_no_error r hR (ud.1, d') hMemR + have hNR' : d' = DiffWithError.val nR := hNR + -- min (.val nL) (.val nR) = .val (Min.min nL nR). + refine ⟨Min.min nL nR, ?_⟩ + rw [hxEq] + show DiffWithError.min ud.2 d' = DiffWithError.val (Min.min nL nR) + rw [hNL, hNR'] + rfl + /-- `bagExceptAll [] r = clampPositive (negate (consolidate r))`. With an all-`.val` `r`, the negation makes every diff non-positive, which `clampPositive` then drops, yielding the spec-correct empty diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 6cece6b05e2b4..a6538ded32519 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -60,7 +60,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four Distributivity over `unionAll`: `filter_unionAll`, `cross_unionAll_left`, `project_unionAll`, `negate_unionAll`. Each follows from `List.flatMap_append` / `List.map_append` — `flatMap`- and `map`-based operators distribute over concatenation. Lets the optimizer pull a `UNION ALL` tail out of a pipeline and plan per-branch. `cross_negate_left` proves that negating the left input of a cross product equals negating the cross output: the diff-semiring law `(-a) * b = -(a * b)` (`DiffWithError.neg_mul_int`) carries the proof since `combineCarrier` is unchanged by negation and only the diff arithmetic flips. 
`negate_consolidate` proves that negation commutes with consolidation: `negate (consolidate us) = consolidate (negate us)`. The proof recurses via `negate_consolidateInto` (private), which lifts the same property to the single-step insertion. Negation is additive (`neg_add`), so it slides through per-bucket sums. - `intersectAll l r` realizes bag-intersection via lookup: consolidate both inputs, then per left-carrier emit `(uc, min(L_diff, R_diff))` if the carrier exists in the right's consolidate, else drop. Supported by `UnifiedStream.lookup` (return the diff for a carrier in a consolidated stream, or `none`), `UnifiedStream.lookup_isSome_of_mem` (lookup returns `some` when carrier present), `UnifiedStream.lookup_eq_of_mem_noDup` (returns the exact diff when the list has unique carriers), and `DiffWithError.min` (the bag-min combinator with `.error` absorbing). Theorems: `intersectAll_length_le` (≤ left.length), `intersectAll_preserves_error_diff_left`/`_right` (`.error` diff for a carrier present in both inputs survives — left case uses left-min absorption, right case uses the no-dup property of `consolidate r`). + `intersectAll l r` realizes bag-intersection via lookup: consolidate both inputs, then per left-carrier emit `(uc, min(L_diff, R_diff))` if the carrier exists in the right's consolidate, else drop. Supported by `UnifiedStream.lookup` (return the diff for a carrier in a consolidated stream, or `none`), `UnifiedStream.lookup_isSome_of_mem` (lookup returns `some` when carrier present), `UnifiedStream.mem_of_lookup_eq_some` (converse: lookup success witnesses membership), `UnifiedStream.lookup_eq_of_mem_noDup` (returns the exact diff when the list has unique carriers), and `DiffWithError.min` (the bag-min combinator with `.error` absorbing). 
Theorems: `intersectAll_length_le` (≤ left.length), `intersectAll_preserves_error_diff_left`/`_right` (`.error` diff for a carrier present in both inputs survives — left case uses left-min absorption, right case uses the no-dup property of `consolidate r`), `intersectAll_no_error` (all-`.val` inputs yield all-`.val` outputs). * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. `groupByErrDistinct keyExpr rel` uses the spec-faithful `Datum.groupKeyEq`, which returns `false` whenever either side is `.err`, so every err key produces its own singleton group. From 187e341797416b5981136c0b98e620ec3037192e Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 17:03:26 +0200 Subject: [PATCH 077/127] doc/semantics: bag-semantics INTERSECT ALL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `bagIntersectAll = clampPositive ∘ intersectAll`. The signed flavor's output may have non-positive `.val` diffs when either side's consolidated count is non-positive; the clamp drops those, producing the bag-semantics result. Theorems lift the signed flavor through `clampPositive`: * `bagIntersectAll_length_le` (≤ left.length). * `bagIntersectAll_preserves_error_diff_left` / `_right` (chains intersect's error preservation with `clampPositive`'s). * `bagIntersectAll_no_error` (chained no-error). * `bagIntersectAll_only_positive` (every output `.val` is strictly positive; `.error` may also appear). Closes the bag-semantics SQL set-op trio: `union`, `bagExceptAll`, `bagIntersectAll`. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 52 ++++++++++++++++++++++++++ doc/developer/semantics/README.md | 4 +- 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index d290dd1801c12..a72dd540d5562 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -1016,6 +1016,58 @@ theorem UnifiedStream.intersectAll_no_error rw [hNL, hNR'] rfl +/-! ## Bag-semantics `INTERSECT ALL` + +`bagIntersectAll = clampPositive ∘ intersectAll`. The signed-diff +output of `intersectAll` may have non-positive `.val` diffs when +either side's consolidated count is non-positive; the clamp drops +those, producing the bag-semantics result. -/ + +def UnifiedStream.bagIntersectAll (l r : UnifiedStream) : UnifiedStream := + UnifiedStream.clampPositive (UnifiedStream.intersectAll l r) + +theorem UnifiedStream.bagIntersectAll_length_le (l r : UnifiedStream) : + (UnifiedStream.bagIntersectAll l r).length ≤ l.length := + Nat.le_trans + (UnifiedStream.clampPositive_length_le _) + (UnifiedStream.intersectAll_length_le l r) + +theorem UnifiedStream.bagIntersectAll_preserves_error_diff_left + (l r : UnifiedStream) (uc : UnifiedRow) + (hL : (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.consolidate l) + (hR : ∃ d, (uc, d) ∈ UnifiedStream.consolidate r) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.bagIntersectAll l r := + UnifiedStream.clampPositive_preserves_error_diff _ uc + (UnifiedStream.intersectAll_preserves_error_diff_left l r uc hL hR) + +theorem UnifiedStream.bagIntersectAll_preserves_error_diff_right + (l r : UnifiedStream) (uc : UnifiedRow) + (hL : ∃ d, (uc, d) ∈ UnifiedStream.consolidate l) + (hR : (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.consolidate r) : + (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.bagIntersectAll l r 
:= + UnifiedStream.clampPositive_preserves_error_diff _ uc + (UnifiedStream.intersectAll_preserves_error_diff_right l r uc hL hR) + +theorem UnifiedStream.bagIntersectAll_no_error + (l r : UnifiedStream) + (hL : ∀ x ∈ l, ∃ n : Int, x.2 = DiffWithError.val n) + (hR : ∀ x ∈ r, ∃ n : Int, x.2 = DiffWithError.val n) : + ∀ x ∈ UnifiedStream.bagIntersectAll l r, + ∃ n : Int, x.2 = DiffWithError.val n := + UnifiedStream.clampPositive_no_error _ + (UnifiedStream.intersectAll_no_error l r hL hR) + +theorem UnifiedStream.bagIntersectAll_only_positive + (l r : UnifiedStream) : + ∀ x ∈ UnifiedStream.bagIntersectAll l r, + (∃ n : Int, x.2 = DiffWithError.val n ∧ 0 < n) + ∨ x.2 = DiffWithError.error := + UnifiedStream.clampPositive_only_positive _ + /-- `bagExceptAll [] r = clampPositive (negate (consolidate r))`. With an all-`.val` `r`, the negation makes every diff non-positive, which `clampPositive` then drops, yielding the spec-correct empty diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index a6538ded32519..215e2b9546b30 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -61,6 +61,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `cross_negate_left` proves that negating the left input of a cross product equals negating the cross output: the diff-semiring law `(-a) * b = -(a * b)` (`DiffWithError.neg_mul_int`) carries the proof since `combineCarrier` is unchanged by negation and only the diff arithmetic flips. `negate_consolidate` proves that negation commutes with consolidation: `negate (consolidate us) = consolidate (negate us)`. The proof recurses via `negate_consolidateInto` (private), which lifts the same property to the single-step insertion. Negation is additive (`neg_add`), so it slides through per-bucket sums. 
`intersectAll l r` realizes bag-intersection via lookup: consolidate both inputs, then per left-carrier emit `(uc, min(L_diff, R_diff))` if the carrier exists in the right's consolidate, else drop. Supported by `UnifiedStream.lookup` (return the diff for a carrier in a consolidated stream, or `none`), `UnifiedStream.lookup_isSome_of_mem` (lookup returns `some` when carrier present), `UnifiedStream.mem_of_lookup_eq_some` (converse: lookup success witnesses membership), `UnifiedStream.lookup_eq_of_mem_noDup` (returns the exact diff when the list has unique carriers), and `DiffWithError.min` (the bag-min combinator with `.error` absorbing). Theorems: `intersectAll_length_le` (≤ left.length), `intersectAll_preserves_error_diff_left`/`_right` (`.error` diff for a carrier present in both inputs survives — left case uses left-min absorption, right case uses the no-dup property of `consolidate r`), `intersectAll_no_error` (all-`.val` inputs yield all-`.val` outputs). + `bagIntersectAll = clampPositive ∘ intersectAll` realizes the bag-semantics `INTERSECT ALL` — drops records with non-positive multiplicity, leaving `.error` records untouched. Theorems lift the signed flavor: `bagIntersectAll_length_le`, `bagIntersectAll_preserves_error_diff_left`/`_right`, `bagIntersectAll_no_error`, `bagIntersectAll_only_positive`. * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. `groupByErrDistinct keyExpr rel` uses the spec-faithful `Datum.groupKeyEq`, which returns `false` whenever either side is `.err`, so every err key produces its own singleton group. @@ -126,6 +127,5 @@ The diff-semiring extension is in scope: `UnifiedStream` records carry `(Unified ### Material expansions -* Strengthening `intersectAll` correctness theorems. 
Length bound shipped; remaining: error preservation per side (`.error` diff for a carrier present in both inputs survives), no-error preservation, bag-semantics positive-clamp (parallel to `bagExceptAll`). -* `distinct` is in scope; remaining: stronger correctness theorems (idempotence `distinct ∘ distinct = distinct`, agreement with the carrier-set view, no-error preservation on `.val` inputs). +* `distinct` idempotence (`distinct ∘ distinct = distinct`) as multiset equality. Consolidate may reorder distinct carriers, so list-equality idempotence is false; needs multiset machinery on `List (UnifiedRow × DiffWithError Int)`. * Cross-link the spec doc (`../design/20260517_error_handling_semantics.md`) to specific theorem names via `[Mz/...:thm]` cross-references. From 25952da15facf15715dbc685e4c72716f3454f03 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 17:05:18 +0200 Subject: [PATCH 078/127] doc/semantics: NoDupCarriers closure for negate and clampPositive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two NoDup-preservation theorems: * `negate_noDup`: negation preserves uniqueness — carriers are untouched, only diffs flip. Direct induction on the list. * `clampPositive_noDup`: filter preserves uniqueness — derived via `List.Pairwise.sublist` and `List.filter_sublist`. `filter_noDup` (the predicate-driven `UnifiedStream.filter`) and `clampToOne_noDup` need per-shape membership characterizations for the head step (filter may convert `.row` to `.err`, clampToOne may drop or transform). Deferred. These closure properties let the optimizer compose set-op pipelines without re-deriving uniqueness at each step. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 33 ++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index a72dd540d5562..73e4bc1406e0a 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -977,6 +977,39 @@ theorem UnifiedStream.intersectAll_preserves_error_diff_right = some (uc, DiffWithError.error) rw [DiffWithError.error_min_right] +/-! ## NoDupCarriers closure laws + +Operators that preserve carriers (or filter them) preserve +`NoDupCarriers`. The closure lets the optimizer compose set-op +pipelines without re-deriving uniqueness at each step. -/ + +theorem UnifiedStream.negate_noDup (us : UnifiedStream) + (h : UnifiedStream.NoDupCarriers us) : + UnifiedStream.NoDupCarriers (UnifiedStream.negate us) := by + induction us with + | nil => exact UnifiedStream.NoDupCarriers.nil + | cons hd tl ih => + obtain ⟨hHead, hTl⟩ := List.pairwise_cons.mp h + show List.Pairwise _ ((hd.1, -hd.2) :: UnifiedStream.negate tl) + apply List.Pairwise.cons + · intro y hY + obtain ⟨orig, hOrigMem, hY_eq⟩ := List.mem_map.mp hY + rw [← hY_eq] + exact hHead orig hOrigMem + · exact ih hTl + +theorem UnifiedStream.clampPositive_noDup (us : UnifiedStream) + (h : UnifiedStream.NoDupCarriers us) : + UnifiedStream.NoDupCarriers (UnifiedStream.clampPositive us) := by + unfold UnifiedStream.clampPositive + exact List.Pairwise.sublist List.filter_sublist h + +-- `filter_noDup` and `clampToOne_noDup` need a membership +-- characterization for the head step (filter may convert `.row` to +-- `.err` for the same record, and clampToOne may drop or transform). +-- The carrier of the converted record is bounded, so NoDup holds, but +-- the rigorous proof requires per-shape analysis. Deferred. + /-- All-`.val` inputs yield all-`.val` outputs through `intersectAll`. 
Each output diff is `.val (Min.min n m)` for some `n` from the left and `m` from the right. -/ From 1657cc215bddd12c7cd89ee19bb681a855d524fc Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 17:06:30 +0200 Subject: [PATCH 079/127] doc/semantics: clampToOne NoDup preservation + filter clarification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `clampToOne_noDup`: `clampToOne` preserves NoDupCarriers since every output record carries the same first component as the input record it came from (an input record that is dropped contributes no output record). Supporting private `mem_clampToOne_carrier` characterizes the output carrier set as a subset of input carriers. Clarifies the earlier deferred `filter_noDup`: it is *false*, not just unproven. Two distinct `.row r1` / `.row r2` records can both pred-err with the same payload `e`, producing two output records carrying the same `.err e` — collapsing distinct carriers into one, violating NoDup. The skeleton's filter is therefore not a NoDup-preserving operator. This corrects an earlier incorrect comment that framed `filter_noDup` as a deferred proof obligation. The actual fact is that `filter` does not preserve NoDup; downstream code needing NoDup after filter must apply `consolidate` (which always produces NoDup) or rely on additional preconditions.
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 77 ++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 5 deletions(-) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 73e4bc1406e0a..2a3bb5a989e00 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -1004,11 +1004,78 @@ theorem UnifiedStream.clampPositive_noDup (us : UnifiedStream) unfold UnifiedStream.clampPositive exact List.Pairwise.sublist List.filter_sublist h --- `filter_noDup` and `clampToOne_noDup` need a membership --- characterization for the head step (filter may convert `.row` to --- `.err` for the same record, and clampToOne may drop or transform). --- The carrier of the converted record is bounded, so NoDup holds, but --- the rigorous proof requires per-shape analysis. Deferred. +-- `filter_noDup` is *false* in general: two distinct `.row r1` / +-- `.row r2` records with `r1 ≠ r2` may both pred-err with the same +-- payload `e`, producing two output records carrying the same +-- `.err e`. The collapse violates NoDup. Filter is therefore not a +-- NoDup-preserving operator; downstream proofs that need NoDup on +-- filter output must rely on additional structure (e.g., that the +-- predicate never errs, or that consolidate is applied after). + +/-- Carriers of `clampToOne` output are a subset of input carriers +(via the first component). Used by `clampToOne_noDup`. 
-/ +private theorem mem_clampToOne_carrier (us : UnifiedStream) + (x : UnifiedRow × DiffWithError Int) (h : x ∈ UnifiedStream.clampToOne us) : + ∃ orig ∈ us, x.1 = orig.1 := by + induction us with + | nil => exact absurd h List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + cases d with + | error => + -- clampToOne ((uc, .error) :: tl) = (uc, .error) :: clampToOne tl + have h' : x ∈ (uc, (DiffWithError.error : DiffWithError Int)) + :: UnifiedStream.clampToOne tl := h + rcases List.mem_cons.mp h' with hHead | hTail + · refine ⟨(uc, DiffWithError.error), List.mem_cons_self, ?_⟩ + rw [hHead] + · obtain ⟨orig, hOrigMem, hCarr⟩ := ih hTail + exact ⟨orig, List.mem_cons_of_mem _ hOrigMem, hCarr⟩ + | val n => + have h' : x ∈ (if 0 < n + then (uc, DiffWithError.val 1) :: UnifiedStream.clampToOne tl + else UnifiedStream.clampToOne tl) := h + split at h' + · rcases List.mem_cons.mp h' with hHead | hTail + · refine ⟨(uc, DiffWithError.val n), List.mem_cons_self, ?_⟩ + rw [hHead] + · obtain ⟨orig, hOrigMem, hCarr⟩ := ih hTail + exact ⟨orig, List.mem_cons_of_mem _ hOrigMem, hCarr⟩ + · obtain ⟨orig, hOrigMem, hCarr⟩ := ih h' + exact ⟨orig, List.mem_cons_of_mem _ hOrigMem, hCarr⟩ + +/-- `clampToOne` preserves NoDup: each output record corresponds to +exactly one input record (or none), with carrier unchanged. 
-/ +theorem UnifiedStream.clampToOne_noDup (us : UnifiedStream) + (h : UnifiedStream.NoDupCarriers us) : + UnifiedStream.NoDupCarriers (UnifiedStream.clampToOne us) := by + induction us with + | nil => exact UnifiedStream.NoDupCarriers.nil + | cons hd tl ih => + obtain ⟨hHead, hTl⟩ := List.pairwise_cons.mp h + obtain ⟨uc, d⟩ := hd + cases d with + | error => + show List.Pairwise _ ((uc, DiffWithError.error) + :: UnifiedStream.clampToOne tl) + apply List.Pairwise.cons + · intro y hY + obtain ⟨orig, hOrigMem, hCarr⟩ := mem_clampToOne_carrier tl y hY + rw [hCarr] at * + exact hHead orig hOrigMem + · exact ih hTl + | val n => + show List.Pairwise _ (if 0 < n + then (uc, DiffWithError.val 1) :: UnifiedStream.clampToOne tl + else UnifiedStream.clampToOne tl) + split + · apply List.Pairwise.cons + · intro y hY + obtain ⟨orig, hOrigMem, hCarr⟩ := mem_clampToOne_carrier tl y hY + rw [hCarr] at * + exact hHead orig hOrigMem + · exact ih hTl + · exact ih hTl /-- All-`.val` inputs yield all-`.val` outputs through `intersectAll`. Each output diff is `.val (Min.min n m)` for some From a04507315b380222dfd05eeb84d00c6dfc7a3deb Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 17:09:23 +0200 Subject: [PATCH 080/127] doc/semantics: NoDup closure for all derived set operators Adds NoDup-preservation theorems for every derived set operator: * `union_noDup`, `exceptAll_noDup`: direct from `consolidate_noDup`. * `bagExceptAll_noDup`: chains `clampPositive_noDup`. * `distinct_noDup`: chains `clampToOne_noDup`. * `intersectAll_noDup`: via the new helper `filterMap_carrier_noDup` (filterMap on a NoDup list with a carrier-preserving mapping is itself NoDup). * `bagIntersectAll_noDup`: chains `clampPositive_noDup`. The closure means every set-op output can be fed back into another set op (or a lookup) without re-deriving uniqueness invariants. 
`filterMap_carrier_noDup` is a private helper that abstracts the "filterMap preserves NoDup when the mapping preserves first components" pattern, useful for any future filterMap-based operator on `UnifiedStream`. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 65 ++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 2a3bb5a989e00..9aa268fc1504e 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -1181,4 +1181,69 @@ theorem UnifiedStream.bagExceptAll_nil_left (r : UnifiedStream) : (UnifiedStream.negate (UnifiedStream.consolidate r)) rw [UnifiedStream.exceptAll_nil_left] +/-! ## NoDup closure on derived set operators + +`union`, `exceptAll`, `intersectAll`, `distinct`, and their +bag-clamped variants all produce NoDupCarriers output. Each +factors through `consolidate` (or `clampToOne` on a consolidated +list), which gives the closure for free. -/ + +theorem UnifiedStream.union_noDup (l r : UnifiedStream) : + UnifiedStream.NoDupCarriers (UnifiedStream.union l r) := + UnifiedStream.consolidate_noDup _ + +theorem UnifiedStream.exceptAll_noDup (l r : UnifiedStream) : + UnifiedStream.NoDupCarriers (UnifiedStream.exceptAll l r) := + UnifiedStream.consolidate_noDup _ + +theorem UnifiedStream.bagExceptAll_noDup (l r : UnifiedStream) : + UnifiedStream.NoDupCarriers (UnifiedStream.bagExceptAll l r) := + UnifiedStream.clampPositive_noDup _ (UnifiedStream.exceptAll_noDup l r) + +theorem UnifiedStream.distinct_noDup (us : UnifiedStream) : + UnifiedStream.NoDupCarriers (UnifiedStream.distinct us) := + UnifiedStream.clampToOne_noDup _ (UnifiedStream.consolidate_noDup us) + +/-- `intersectAll`'s output is a filterMap of a NoDupCarriers list +whose mapping preserves the first component. The filterMap +therefore also has NoDupCarriers. 
-/ +private theorem filterMap_carrier_noDup + {f : UnifiedRow × DiffWithError Int → Option (UnifiedRow × DiffWithError Int)} + (h_f : ∀ ud out, f ud = some out → out.1 = ud.1) + {us : UnifiedStream} (h : UnifiedStream.NoDupCarriers us) : + UnifiedStream.NoDupCarriers (us.filterMap f) := by + induction us with + | nil => exact UnifiedStream.NoDupCarriers.nil + | cons hd tl ih => + obtain ⟨hHead, hTl⟩ := List.pairwise_cons.mp h + rw [List.filterMap_cons] + cases h_eq : f hd with + | none => exact ih hTl + | some out => + have hCarr : out.1 = hd.1 := h_f hd out h_eq + apply List.Pairwise.cons + · intro y hY + obtain ⟨origTl, hOrigMem, hY_eq⟩ := List.mem_filterMap.mp hY + have hYCarr : y.1 = origTl.1 := h_f origTl y hY_eq + rw [hYCarr, hCarr] + exact hHead origTl hOrigMem + · exact ih hTl + +theorem UnifiedStream.intersectAll_noDup (l r : UnifiedStream) : + UnifiedStream.NoDupCarriers (UnifiedStream.intersectAll l r) := by + apply filterMap_carrier_noDup ?_ (UnifiedStream.consolidate_noDup l) + intro ud out hSome + cases h_lookup : UnifiedStream.lookup ud.1 (UnifiedStream.consolidate r) with + | none => + rw [h_lookup] at hSome + cases hSome + | some d' => + rw [h_lookup] at hSome + injection hSome with hPair + rw [← hPair] + +theorem UnifiedStream.bagIntersectAll_noDup (l r : UnifiedStream) : + UnifiedStream.NoDupCarriers (UnifiedStream.bagIntersectAll l r) := + UnifiedStream.clampPositive_noDup _ (UnifiedStream.intersectAll_noDup l r) + end Mz From 56e50f9b493c70b564ccae7cf1b332f7fe49b751 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 17:15:59 +0200 Subject: [PATCH 081/127] doc/semantics: TimedUnifiedStream.advanceFrontier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the differential-dataflow `advance` operator on a scalar frontier. Records originally at time `< f` are advanced to `f`; records at `≥ f` are left untouched. 
`TimedUnifiedStream.advanceFrontier f s = s.map (fun r => (r.1, Nat.max r.2.1 f, r.2.2))`. Theorems: * `advanceFrontier_nil`, `advanceFrontier_length`: trivial / length preserved. * `advanceFrontier_zero`: advancing by `0` is the identity. * `advanceFrontier_idem`: advancing twice by the same frontier equals advancing once (the first pass leaves every record at `≥ f`, so the second `Nat.max _ f` is a no-op). * `advanceFrontier_advanceFrontier`: composing by `f` then `g` equals advancing by `Nat.max f g` — frontier compositions reduce to the max. Real differential dataflow uses antichains of times for partial- order timestamps; the scalar form is sufficient to state the algebraic laws and capture the essence of "advance". Co-Authored-By: Claude Opus 4.7 (1M context) --- .../semantics/Mz/TimedConsolidate.lean | 85 +++++++++++++++++++ doc/developer/semantics/README.md | 1 + 2 files changed, 86 insertions(+) diff --git a/doc/developer/semantics/Mz/TimedConsolidate.lean b/doc/developer/semantics/Mz/TimedConsolidate.lean index 13225e8f81fd3..1e71255db1f00 100644 --- a/doc/developer/semantics/Mz/TimedConsolidate.lean +++ b/doc/developer/semantics/Mz/TimedConsolidate.lean @@ -43,6 +43,91 @@ def TimedUnifiedStream.consolidateAtTime (t : Nat) (s : TimedUnifiedStream) : UnifiedStream := UnifiedStream.consolidate (TimedUnifiedStream.atTime t s) +/-! ## Frontier advance + +Differential dataflow's `advance` operator: records with time +strictly before frontier `f` are "advanced" to `f` (their time +is updated to `f`), making the past immutable. Records at or past +`f` are left untouched. + +The skeleton models frontiers as a single `Nat`. The real +framework uses antichains of times for partial-order timestamps; +the scalar form is sufficient to state the algebraic laws. -/ + +/-- Advance every record's time to at least `f`. Records originally +at time `< f` move to `f`; records already at `≥ f` stay. 
-/ +def TimedUnifiedStream.advanceFrontier (f : Nat) (s : TimedUnifiedStream) : + TimedUnifiedStream := + s.map fun r => (r.1, Nat.max r.2.1 f, r.2.2) + +theorem TimedUnifiedStream.advanceFrontier_nil (f : Nat) : + TimedUnifiedStream.advanceFrontier f [] = [] := rfl + +theorem TimedUnifiedStream.advanceFrontier_length + (f : Nat) (s : TimedUnifiedStream) : + (TimedUnifiedStream.advanceFrontier f s).length = s.length := + List.length_map _ + +/-- Advancing by `0` is the identity (no times below the frontier). -/ +theorem TimedUnifiedStream.advanceFrontier_zero (s : TimedUnifiedStream) : + TimedUnifiedStream.advanceFrontier 0 s = s := by + induction s with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, t, d⟩ := hd + show (uc, Nat.max t 0, d) :: TimedUnifiedStream.advanceFrontier 0 tl + = (uc, t, d) :: tl + have hMax : Nat.max t 0 = t := Nat.max_eq_left (Nat.zero_le t) + rw [hMax, ih] + +/-- Idempotence: advancing twice by the same frontier equals +advancing once. After the first pass, every record has time `≥ f`, +so the second `Nat.max _ f` is a no-op. -/ +theorem TimedUnifiedStream.advanceFrontier_idem + (f : Nat) (s : TimedUnifiedStream) : + TimedUnifiedStream.advanceFrontier f + (TimedUnifiedStream.advanceFrontier f s) + = TimedUnifiedStream.advanceFrontier f s := by + induction s with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, t, d⟩ := hd + show (uc, Nat.max (Nat.max t f) f, d) + :: TimedUnifiedStream.advanceFrontier f + (TimedUnifiedStream.advanceFrontier f tl) + = (uc, Nat.max t f, d) :: TimedUnifiedStream.advanceFrontier f tl + have h_max : Nat.max (Nat.max t f) f = Nat.max t f := by + cases Nat.le_total t f with + | inl h_le => + have h1 : Nat.max t f = f := Nat.max_eq_right h_le + have h2 : Nat.max f f = f := Nat.max_eq_left (Nat.le_refl _) + rw [h1, h2] + | inr h_ge => + have h1 : Nat.max t f = t := Nat.max_eq_left h_ge + rw [h1]; exact h1 + rw [h_max, ih] + +/-- Monotone composition: advancing by `f` then `g` equals +advancing by `Nat.max f g`. 
The max is associative and +commutative on `Nat`, so the final frontier dominates. -/ +theorem TimedUnifiedStream.advanceFrontier_advanceFrontier + (f g : Nat) (s : TimedUnifiedStream) : + TimedUnifiedStream.advanceFrontier g + (TimedUnifiedStream.advanceFrontier f s) + = TimedUnifiedStream.advanceFrontier (Nat.max f g) s := by + induction s with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, t, d⟩ := hd + show (uc, Nat.max (Nat.max t f) g, d) + :: TimedUnifiedStream.advanceFrontier g + (TimedUnifiedStream.advanceFrontier f tl) + = (uc, Nat.max t (Nat.max f g), d) + :: TimedUnifiedStream.advanceFrontier (Nat.max f g) tl + have hAssoc : Nat.max (Nat.max t f) g = Nat.max t (Nat.max f g) := + Nat.max_assoc t f g + rw [hAssoc, ih] + /-! ## Trivial cases -/ theorem TimedUnifiedStream.atTime_nil (t : Nat) : diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 215e2b9546b30..986b151a50953 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -42,6 +42,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. `aggTry` for the proposed `try_sum`/`try_min`/`try_max` variants that swallow `err` into `NULL` instead of propagating, defined as a post-pass on `aggStrict`. Theorems: `aggStrict_err` (any `err` input → `err` output), `aggStrict_no_err` (no-err inputs + no-err reducer → no-err output), `aggTry_no_err` (the non-strict variant never errors), and `aggTry_eq_aggStrict_of_no_err` (strict and non-strict agree on error-free inputs). * `Mz/Consolidate.lean`: per-key diff summation over `List (DiffWithError α)`. 
The headline `sumAll_eq_error_of_mem` proves that an `error` diff anywhere in the list absorbs the consolidated sum to `error`, which is the property a differential dataflow `compact` operator cites when propagating global errors through consolidation. Companion `sumAll_val_of_all_val` says an all-`val` list sums to `val` of some base value. * `Mz/TimedConsolidate.lean`: per-`(row, time)` consolidation. `TimedUnifiedStream := List (UnifiedRow × Nat × DiffWithError Int)` carries records with time. `atTime t` projects to one time slice (dropping the time component); `consolidateAtTime t` chains it with `UnifiedStream.consolidate`. Theorems: `consolidateAtTime_preserves_error` (an `.error` diff at time `t` survives both filter and consolidation), `atTime_length_le` and `consolidateAtTime_length_le` (both non-expanding). Decomposes the joint key into "filter by time, then consolidate by row". + `advanceFrontier f s` advances every record's time to `Nat.max time f` — the differential-dataflow `advance` operator on a scalar frontier. Records originally at time `< f` move to `f`; records at `≥ f` stay. Theorems: `advanceFrontier_nil`, `advanceFrontier_length` (length preserved), `advanceFrontier_zero` (zero frontier is identity), `advanceFrontier_idem` (idempotent on equal frontier), `advanceFrontier_advanceFrontier` (composing by `f` then `g` equals advancing by `Nat.max f g`). Real differential dataflow uses antichains of times; the scalar form is sufficient to state the algebraic laws. * `Mz/UnifiedConsolidate.lean`: row-keyed diff summation on `UnifiedStream`. `UnifiedStream.consolidate` buckets records by carrier (via `DecidableEq UnifiedRow`) and sums per-bucket diffs. 
Theorems cover three properties: *absorption* — `consolidate_preserves_error` proves an `.error` diff anywhere in the input gives an `.error` diff in the consolidated output for that carrier; *cardinality* — `consolidate_length_le` bounds the output by the input length (consolidation only merges, never expands); From 408e67f6e2cfc1b7510bcf5d9c94003ee2d0fdb6 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 17:29:31 +0200 Subject: [PATCH 082/127] doc/semantics: error-scope extractors and invariance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Formalizes the three error scopes — cell-scoped (Datum.err), row-scoped (UnifiedRow.err), collection-scoped (.error diff) — as observable record-level sets on `UnifiedStream`: * `errCarriers us`: list of row-scoped err payloads (records whose carrier is `.err e`). * `errorDiffCarriers us`: list of carriers whose diff is `.error` (records carrying a collection-scoped error marker). Together these expose how many of each scope live in the stream — a vocabulary the spec needs to argue about scope promotion / demotion. Reduction lemmas: * `errCarriers_nil` / `errorDiffCarriers_nil`. * `errCarriers_append` / `errorDiffCarriers_append`: each extractor distributes over `++`. * `unionAll_errCarriers` / `unionAll_errorDiffCarriers`: the named-operator form (`unionAll` is `++`). Invariance: * `negate_errCarriers`: negation does not change the row-err set (carriers untouched by negation). * `negate_errorDiffCarriers`: negation does not change the collection-err set (`.error` absorbs negation per `neg_error`). These confirm that the two scopes are independent observable properties — `negate` flips diffs but preserves both error sets. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 66 +++++++++++++++++++ doc/developer/semantics/Mz/UnifiedStream.lean | 48 ++++++++++++++ doc/developer/semantics/README.md | 1 + 3 files changed, 115 insertions(+) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 9aa268fc1504e..7f48248c15d49 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -1246,4 +1246,70 @@ theorem UnifiedStream.bagIntersectAll_noDup (l r : UnifiedStream) : UnifiedStream.NoDupCarriers (UnifiedStream.bagIntersectAll l r) := UnifiedStream.clampPositive_noDup _ (UnifiedStream.intersectAll_noDup l r) +/-! ## Error-scope invariance + +Negation preserves both the row-scoped err set (`errCarriers`) +and the collection-scoped err set (`errorDiffCarriers`): +negation flips diffs but `.error` absorbs, and carriers are +unchanged by definition. Negation cannot escalate or rescue +either error scope. 
-/ + +theorem UnifiedStream.negate_errCarriers (us : UnifiedStream) : + UnifiedStream.errCarriers (UnifiedStream.negate us) + = UnifiedStream.errCarriers us := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + cases uc with + | row r => + show List.filterMap _ ((UnifiedRow.row r, -d) :: UnifiedStream.negate tl) + = List.filterMap _ ((UnifiedRow.row r, d) :: tl) + simp only [List.filterMap_cons] + exact ih + | err e => + show List.filterMap _ ((UnifiedRow.err e, -d) :: UnifiedStream.negate tl) + = List.filterMap _ ((UnifiedRow.err e, d) :: tl) + simp only [List.filterMap_cons] + show e :: UnifiedStream.errCarriers (UnifiedStream.negate tl) + = e :: UnifiedStream.errCarriers tl + rw [ih] + +theorem UnifiedStream.negate_errorDiffCarriers (us : UnifiedStream) : + UnifiedStream.errorDiffCarriers (UnifiedStream.negate us) + = UnifiedStream.errorDiffCarriers us := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + cases d with + | val n => + show List.filterMap _ ((uc, -DiffWithError.val n) + :: UnifiedStream.negate tl) + = List.filterMap _ ((uc, DiffWithError.val n) :: tl) + simp only [List.filterMap_cons] + exact ih + | error => + show List.filterMap _ ((uc, -(DiffWithError.error : DiffWithError Int)) + :: UnifiedStream.negate tl) + = List.filterMap _ ((uc, (DiffWithError.error : DiffWithError Int)) + :: tl) + simp only [List.filterMap_cons] + show uc :: UnifiedStream.errorDiffCarriers (UnifiedStream.negate tl) + = uc :: UnifiedStream.errorDiffCarriers tl + rw [ih] + +/-- `unionAll` concatenates both error-scope sets. The row-err set +of `unionAll a b` is `errCarriers a ++ errCarriers b`; same for +the collection-err set. Each scope is tracked independently. 
-/ +theorem UnifiedStream.unionAll_errCarriers (a b : UnifiedStream) : + UnifiedStream.errCarriers (UnifiedStream.unionAll a b) + = UnifiedStream.errCarriers a ++ UnifiedStream.errCarriers b := + UnifiedStream.errCarriers_append a b + +theorem UnifiedStream.unionAll_errorDiffCarriers (a b : UnifiedStream) : + UnifiedStream.errorDiffCarriers (UnifiedStream.unionAll a b) + = UnifiedStream.errorDiffCarriers a ++ UnifiedStream.errorDiffCarriers b := + UnifiedStream.errorDiffCarriers_append a b + end Mz diff --git a/doc/developer/semantics/Mz/UnifiedStream.lean b/doc/developer/semantics/Mz/UnifiedStream.lean index 784b5767b56b7..3865143ec48d7 100644 --- a/doc/developer/semantics/Mz/UnifiedStream.lean +++ b/doc/developer/semantics/Mz/UnifiedStream.lean @@ -314,6 +314,54 @@ theorem UnifiedStream.project_no_error exact ⟨n, by rw [this]⟩ · exact ih hTl rec hTail +/-! ## Error-scope extractors + +The skeleton distinguishes three error scopes: + +* **Cell-scoped** (`Datum.err e`): a single cell-value invalid. + Propagates through scalar evaluators. +* **Row-scoped** (`UnifiedRow.err e`): a whole record carries an + error instead of a row. Captured in the carrier. +* **Collection-scoped** (`.error` diff in `DiffWithError`): the + collection itself is invalid at this time/state. Captured in + the diff. + +`errCarriers` and `errorDiffCarriers` project a `UnifiedStream` +to the two record-level error sets, giving the spec a vocabulary +for "how many row-errs and collection-errs in this stream". -/ + +/-- The list of row-scoped error payloads carried by this stream. +Order matches the input. -/ +def UnifiedStream.errCarriers (us : UnifiedStream) : List EvalError := + us.filterMap fun ud => match ud.1 with + | .err e => some e + | _ => none + +/-- The list of carriers whose diff is collection-scoped `.error`. +Order matches the input. 
-/ +def UnifiedStream.errorDiffCarriers (us : UnifiedStream) : List UnifiedRow := + us.filterMap fun ud => match ud.2 with + | .error => some ud.1 + | _ => none + +theorem UnifiedStream.errCarriers_nil : + UnifiedStream.errCarriers [] = [] := rfl + +theorem UnifiedStream.errorDiffCarriers_nil : + UnifiedStream.errorDiffCarriers [] = [] := rfl + +theorem UnifiedStream.errCarriers_append (a b : UnifiedStream) : + UnifiedStream.errCarriers (a ++ b) + = UnifiedStream.errCarriers a ++ UnifiedStream.errCarriers b := by + show (a ++ b).filterMap _ = a.filterMap _ ++ b.filterMap _ + exact List.filterMap_append + +theorem UnifiedStream.errorDiffCarriers_append (a b : UnifiedStream) : + UnifiedStream.errorDiffCarriers (a ++ b) + = UnifiedStream.errorDiffCarriers a ++ UnifiedStream.errorDiffCarriers b := by + show (a ++ b).filterMap _ = a.filterMap _ ++ b.filterMap _ + exact List.filterMap_append + /-! ## Helper lemmas for filterMap over the packed concatenation -/ private theorem filterMap_pickRow_rowMap (rs : List Row) : diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 986b151a50953..cac8fcb7c60c9 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -38,6 +38,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four Supporting environment lemmas: `Env.get_append_left` (read from prefix) and `Env.get_append_right` (read from suffix with index shift). * `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `-`, `0`, `1`, `min` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity / negation / min laws that downstream operators must respect. 
Negation laws (`neg_error`, `neg_val`, `neg_neg_val`, `val_add_neg_val`, `neg_mul`, `mul_neg`, `neg_add`) carry the principle that `.error` is unrecoverable — a collection-scoped error cannot be subtracted away, and that negation distributes through addition and multiplication. Min laws (`error_min_left`, `error_min_right`, `min_val_val`) define the bag-min combinator with `.error` absorbing. The `_int` specializations discharge the base hypotheses at `Int` so downstream code in `Mz/Join.lean`, `Mz/UnifiedConsolidate.lean`, and `Mz/SetOps.lean` can cite the named laws directly. * `Mz/UnifiedStream.lean`: unified diff-aware alternative to `BagStream`. `UnifiedStream := List (UnifiedRow × DiffWithError Int)` pairs a carrier (data row or row-scoped err) with a differential-dataflow diff augmented by the absorbing `error` element. Row-scoped errors flow through the carrier; collection-scoped errors flow through diff multiplication / addition. `ofBag` / `split` conversions assign every bag record a diff of `.val 1`; the round-trip theorem `split (ofBag s) = s` holds. The cross-direction is exact only up to multiset equality on `List EvalError` and is lossy for diffs ≠ `.val 1` (split drops diff information). + Error-scope extractors: `errCarriers us` returns the list of row-scoped err payloads (carrier = `.err e`); `errorDiffCarriers us` returns the list of carriers whose diff is collection-scoped `.error`. Theorems: `errCarriers_nil`/`_append`, `errorDiffCarriers_nil`/`_append`. The two sets are independent observable properties — operators that touch one need not touch the other. `UnifiedStream.project` lifts `BagStream.project` to the diff-aware carrier. Records with `.error` diff or `.err` carrier pass through unchanged; a `.row r` record with `.val n` diff is evaluated against `es` — if every scalar succeeds, the row is emitted with diff `.val n`; if any scalar errs, one `(.err e, .val n)` is emitted per erroring scalar (each preserving the original multiplicity). 
Theorems: `project_preserves_error_diff` (an `.error` diff in the input always reaches the output), `project_no_error` (all-`.val` inputs yield all-`.val` outputs), `project_nil_es` (empty projection list collapses every row to width-zero), `project_nil_stream` (empty stream is empty). * `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. `aggTry` for the proposed `try_sum`/`try_min`/`try_max` variants that swallow `err` into `NULL` instead of propagating, defined as a post-pass on `aggStrict`. Theorems: `aggStrict_err` (any `err` input → `err` output), `aggStrict_no_err` (no-err inputs + no-err reducer → no-err output), `aggTry_no_err` (the non-strict variant never errors), and `aggTry_eq_aggStrict_of_no_err` (strict and non-strict agree on error-free inputs). * `Mz/Consolidate.lean`: per-key diff summation over `List (DiffWithError α)`. The headline `sumAll_eq_error_of_mem` proves that an `error` diff anywhere in the list absorbs the consolidated sum to `error`, which is the property a differential dataflow `compact` operator cites when propagating global errors through consolidation. Companion `sumAll_val_of_all_val` says an all-`val` list sums to `val` of some base value. From 2744f10b0f5254b39dffa8c84d2246f8c24c17ca Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 17:45:52 +0200 Subject: [PATCH 083/127] doc/semantics: filter error-scope theorems Two new invariance theorems for `UnifiedStream.filter`: * `filter_errorDiffCarriers`: equality. Filter passes `.error`-diff records through unchanged; every other branch produces `.val`-diff output. The collection-scoped error set is preserved exactly. * `filter_errCarriers_mono`: monotone. Every row-scoped error in the input is in the output. 
The reverse fails because cell-to-row promotion (`eval r pred = .err e`) introduces fresh row-errs. Together these formalize what filter does to the two record-level error scopes: it is exact on collection-scope and monotone on row-scope (it never shrinks the row-err set, and grows it whenever a predicate errs). Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 145 +++++++++++++++++++++++++ 1 file changed, 145 insertions(+) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 7f48248c15d49..bcc88405226a1 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -1312,4 +1312,149 @@ theorem UnifiedStream.unionAll_errorDiffCarriers (a b : UnifiedStream) : = UnifiedStream.errorDiffCarriers a ++ UnifiedStream.errorDiffCarriers b := UnifiedStream.errorDiffCarriers_append a b +/-! ## `filter` and error scopes + +`filter` is the canonical site of *cell-to-row promotion*: a +`.row r` record whose predicate evaluates to `.err e` becomes a +`.err e` carrier in the output. Filter therefore *grows* the +row-scoped error set in general; it does not shrink it +(row-scoped errors pass through, and collection-scoped errors +pass through unconditionally). + +`filter_errorDiffCarriers` is an equality: filter preserves the +collection-scoped error set exactly. The first arm matches +`.error` diffs and outputs them unchanged; every other arm +produces `.val`-diff outputs only. So the `.error` carriers are +neither added nor removed. + +`filter_errCarriers_mono` is the monotone direction for the +row-scoped error set: every row-err present in input is present +in output. The reverse inclusion fails because cell-to-row +promotion can add fresh errors. 
-/ + +private theorem filter_singleton_errorDiffCarriers + (pred : Expr) (uc : UnifiedRow) (d : DiffWithError Int) : + UnifiedStream.errorDiffCarriers (UnifiedStream.filter pred [(uc, d)]) + = UnifiedStream.errorDiffCarriers [(uc, d)] := by + cases d with + | error => + -- Filter passes `.error`-diff records unchanged. + cases uc <;> rfl + | val n => + cases uc with + | err _ => rfl + | row r => + -- Filter's `.row r, .val n` arm produces zero or one `.val`-diff records, + -- depending on `eval r pred`. Case-split on the eval, then reduce filter + -- with the resulting equation to compute the concrete singleton output. + cases hEval : eval r pred with + | bool b => + cases b with + | true => + have hF : UnifiedStream.filter pred + [(UnifiedRow.row r, DiffWithError.val n)] + = [(UnifiedRow.row r, DiffWithError.val n)] := by + show (match eval r pred with + | .bool true => [(UnifiedRow.row r, DiffWithError.val n)] + | .err e => [(UnifiedRow.err e, DiffWithError.val n)] + | _ => []) ++ [] = _ + rw [hEval]; rfl + rw [hF] + | false => + have hF : UnifiedStream.filter pred + [(UnifiedRow.row r, DiffWithError.val n)] = [] := by + show (match eval r pred with + | .bool true => [(UnifiedRow.row r, DiffWithError.val n)] + | .err e => [(UnifiedRow.err e, DiffWithError.val n)] + | _ => []) ++ [] = _ + rw [hEval]; rfl + rw [hF]; rfl + | int _ => + have hF : UnifiedStream.filter pred + [(UnifiedRow.row r, DiffWithError.val n)] = [] := by + show (match eval r pred with + | .bool true => [(UnifiedRow.row r, DiffWithError.val n)] + | .err e => [(UnifiedRow.err e, DiffWithError.val n)] + | _ => []) ++ [] = _ + rw [hEval]; rfl + rw [hF]; rfl + | null => + have hF : UnifiedStream.filter pred + [(UnifiedRow.row r, DiffWithError.val n)] = [] := by + show (match eval r pred with + | .bool true => [(UnifiedRow.row r, DiffWithError.val n)] + | .err e => [(UnifiedRow.err e, DiffWithError.val n)] + | _ => []) ++ [] = _ + rw [hEval]; rfl + rw [hF]; rfl + | err e_pred => + have hF : 
UnifiedStream.filter pred + [(UnifiedRow.row r, DiffWithError.val n)] + = [(UnifiedRow.err e_pred, DiffWithError.val n)] := by + show (match eval r pred with + | .bool true => [(UnifiedRow.row r, DiffWithError.val n)] + | .err e => [(UnifiedRow.err e, DiffWithError.val n)] + | _ => []) ++ [] = _ + rw [hEval]; rfl + rw [hF]; rfl + +theorem UnifiedStream.filter_errorDiffCarriers + (pred : Expr) (us : UnifiedStream) : + UnifiedStream.errorDiffCarriers (UnifiedStream.filter pred us) + = UnifiedStream.errorDiffCarriers us := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hCons : ((uc, d) :: tl : UnifiedStream) = [(uc, d)] ++ tl := rfl + rw [hCons, UnifiedStream.filter_append, + UnifiedStream.errorDiffCarriers_append, + UnifiedStream.errorDiffCarriers_append, ih, + filter_singleton_errorDiffCarriers pred uc d] + +private theorem filter_singleton_errCarriers_mono + (pred : Expr) (uc : UnifiedRow) (d : DiffWithError Int) (e : EvalError) + (h : e ∈ UnifiedStream.errCarriers [(uc, d)]) : + e ∈ UnifiedStream.errCarriers (UnifiedStream.filter pred [(uc, d)]) := by + cases uc with + | row r => + -- `errCarriers [(.row r, d)] = []`, so `h` is vacuous. + have hEmpty : UnifiedStream.errCarriers [(UnifiedRow.row r, d)] = [] := rfl + rw [hEmpty] at h + exact absurd h List.not_mem_nil + | err e0 => + -- `errCarriers [(.err e0, d)] = [e0]`, so `e = e0`. + have hSingle : UnifiedStream.errCarriers [(UnifiedRow.err e0, d)] = [e0] := rfl + rw [hSingle] at h + have hEq : e = e0 := List.mem_singleton.mp h + subst hEq + cases d with + | error => + -- filter passes `(.err e, .error)` through. 
+ have : UnifiedStream.filter pred [(UnifiedRow.err e, DiffWithError.error)] + = [(UnifiedRow.err e, DiffWithError.error)] := rfl + rw [this] + exact List.mem_singleton.mpr rfl + | val n => + have : UnifiedStream.filter pred [(UnifiedRow.err e, DiffWithError.val n)] + = [(UnifiedRow.err e, DiffWithError.val n)] := rfl + rw [this] + exact List.mem_singleton.mpr rfl + +theorem UnifiedStream.filter_errCarriers_mono + (pred : Expr) (us : UnifiedStream) (e : EvalError) + (h : e ∈ UnifiedStream.errCarriers us) : + e ∈ UnifiedStream.errCarriers (UnifiedStream.filter pred us) := by + induction us with + | nil => exact absurd h List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hCons : ((uc, d) :: tl : UnifiedStream) = [(uc, d)] ++ tl := rfl + rw [hCons, UnifiedStream.errCarriers_append] at h + rw [hCons, UnifiedStream.filter_append, UnifiedStream.errCarriers_append] + rcases List.mem_append.mp h with hHead | hTail + · exact List.mem_append.mpr + (Or.inl (filter_singleton_errCarriers_mono pred uc d e hHead)) + · exact List.mem_append.mpr (Or.inr (ih hTail)) + end Mz From 1774fbe50c1a273304839bb5e18b9677f8b56d30 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 17:52:15 +0200 Subject: [PATCH 084/127] doc/semantics: consolidate error-scope theorems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add bridge lemmas converting extractor membership to carrier membership: * `mem_errCarriers : e ∈ errCarriers us ↔ ∃ d, (.err e, d) ∈ us` * `mem_errorDiffCarriers : uc ∈ errorDiffCarriers us ↔ (uc, .error) ∈ us` Add carrier-preservation lemmas for `consolidate`: * `mem_consolidate_of_mem` (forward): every input carrier survives consolidation. * `mem_of_mem_consolidate` (reverse): every output carrier came from an input. Combined yield the consolidate error-scope theorems: * `consolidate_errCarriers_iff`: row-err set equal as set. 
* `consolidate_errorDiffCarriers_mono`: collection-err set monotone (via `consolidate_preserves_error`). Multiplicity may shrink — consolidate folds duplicates — but the *set* of row-errs is preserved exactly. Collection-err set preserved on the forward direction; reverse direction (no spurious .error appears) requires diff-semiring per-carrier reasoning, deferred. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 35 +++++++ .../semantics/Mz/UnifiedConsolidate.lean | 77 +++++++++++++++ doc/developer/semantics/Mz/UnifiedStream.lean | 94 +++++++++++++++++++ 3 files changed, 206 insertions(+) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index bcc88405226a1..4ec54e5db8709 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -1457,4 +1457,39 @@ theorem UnifiedStream.filter_errCarriers_mono (Or.inl (filter_singleton_errCarriers_mono pred uc d e hHead)) · exact List.mem_append.mpr (Or.inr (ih hTail)) +/-! ## `consolidate` and error scopes + +`consolidate` buckets records by carrier and sums diffs. As a +list, this can shrink (multiple `.err e` carriers fold to one) +and the row-err list may have its multiplicities collapsed. But the +*set* of row-err payloads is preserved exactly, and every +`.error`-diff carrier of the input survives into the output. + +Forward direction for row-err is from `mem_consolidate_of_mem`: +every input carrier survives. Backward direction is from +`mem_of_mem_consolidate`. For `.error`-diff carriers, only the forward +direction is proved here, via `consolidate_preserves_error`. 
-/ + +theorem UnifiedStream.consolidate_errCarriers_iff + (us : UnifiedStream) (e : EvalError) : + e ∈ UnifiedStream.errCarriers (UnifiedStream.consolidate us) + ↔ e ∈ UnifiedStream.errCarriers us := by + rw [UnifiedStream.mem_errCarriers, UnifiedStream.mem_errCarriers] + constructor + · intro h + exact UnifiedStream.mem_of_mem_consolidate us (UnifiedRow.err e) h + · intro ⟨d, hMem⟩ + exact UnifiedStream.mem_consolidate_of_mem us (UnifiedRow.err e) d hMem + +/-- Forward direction for collection-scoped errors: every input +`.error`-diff carrier shows up in the consolidated output. Direct +consequence of `consolidate_preserves_error`. -/ +theorem UnifiedStream.consolidate_errorDiffCarriers_mono + (us : UnifiedStream) (uc : UnifiedRow) + (h : uc ∈ UnifiedStream.errorDiffCarriers us) : + uc ∈ UnifiedStream.errorDiffCarriers (UnifiedStream.consolidate us) := by + rw [UnifiedStream.mem_errorDiffCarriers] at h + rw [UnifiedStream.mem_errorDiffCarriers] + exact UnifiedStream.consolidate_preserves_error us uc h + end Mz diff --git a/doc/developer/semantics/Mz/UnifiedConsolidate.lean b/doc/developer/semantics/Mz/UnifiedConsolidate.lean index a3f2b3e2734d7..14039a535d720 100644 --- a/doc/developer/semantics/Mz/UnifiedConsolidate.lean +++ b/doc/developer/semantics/Mz/UnifiedConsolidate.lean @@ -492,4 +492,81 @@ theorem UnifiedStream.consolidate_no_error rw [hN] at this exact consolidateInto_no_error uc n _ hConsTl r this +/-! ## Carrier preservation + +`consolidate` does not lose carriers: every record that appears +in the input has a counterpart in the output with the same +carrier (the diff may have been folded into a bucket with other +records sharing the carrier). 
-/ + +private theorem consolidateInto_preserves_mem + (uc uc' : UnifiedRow) (d d' : DiffWithError Int) (us : UnifiedStream) + (h : (uc, d) ∈ us) : + ∃ d'', (uc, d'') ∈ consolidateInto uc' d' us := by + induction us with + | nil => exact absurd h List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc'', d''⟩ := hd + by_cases hEq : uc' = uc'' + · subst hEq + rw [consolidateInto_match] + rcases List.mem_cons.mp h with hHead | hTail + · -- (uc, d) = (uc', d''); head of output is (uc', d' + d''). + have hCarrier : uc = uc' := (Prod.mk.injEq _ _ _ _).mp hHead |>.1 + subst hCarrier + exact ⟨d' + d'', List.mem_cons_self⟩ + · -- (uc, d) in tl; tail of output is tl. + exact ⟨d, List.mem_cons_of_mem _ hTail⟩ + · rw [consolidateInto_skip _ _ _ _ _ hEq] + rcases List.mem_cons.mp h with hHead | hTail + · -- (uc, d) is the head record (uc'', d''); preserved verbatim. + have hCarrier : uc = uc'' := (Prod.mk.injEq _ _ _ _).mp hHead |>.1 + have hD : d = d'' := (Prod.mk.injEq _ _ _ _).mp hHead |>.2 + subst hCarrier; subst hD + exact ⟨d, List.mem_cons_self⟩ + · obtain ⟨d''', hMem⟩ := ih hTail + exact ⟨d''', List.mem_cons_of_mem _ hMem⟩ + +/-- Every input carrier survives consolidation: an input record +with carrier `uc` has some output record with the same carrier +(diff possibly folded with siblings sharing the carrier). -/ +theorem UnifiedStream.mem_consolidate_of_mem + (us : UnifiedStream) (uc : UnifiedRow) (d : DiffWithError Int) + (h : (uc, d) ∈ us) : + ∃ d', (uc, d') ∈ UnifiedStream.consolidate us := by + induction us with + | nil => exact absurd h List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc', d'⟩ := hd + rcases List.mem_cons.mp h with hHead | hTail + · -- Head record's carrier matches; insert finds or appends bucket. 
+ have hCarrier : uc = uc' := (Prod.mk.injEq _ _ _ _).mp hHead |>.1 + subst hCarrier + show ∃ d'', (uc, d'') ∈ consolidateInto uc d' (UnifiedStream.consolidate tl) + exact mem_after_consolidateInto uc d' _ + · -- Tail contains the record; ih + carrier preservation through insert. + obtain ⟨d'', hMem⟩ := ih hTail + exact consolidateInto_preserves_mem uc uc' d'' d' _ hMem + +/-- Reverse direction: every output carrier came from an input +carrier. The output is "no bigger" than the input on carriers. -/ +theorem UnifiedStream.mem_of_mem_consolidate + (us : UnifiedStream) (uc : UnifiedRow) : + (∃ d, (uc, d) ∈ UnifiedStream.consolidate us) + → ∃ d', (uc, d') ∈ us := by + induction us with + | nil => intro ⟨_, h⟩; exact absurd h List.not_mem_nil + | cons hd tl ih => + intro ⟨d, h⟩ + obtain ⟨uc', d'⟩ := hd + have hInto : (uc, d) ∈ consolidateInto uc' d' (UnifiedStream.consolidate tl) := h + rcases mem_consolidateInto_carrier uc' d' _ (uc, d) hInto with hUc | ⟨d'', hMem⟩ + · -- carrier = uc'; head of input. + subst hUc + exact ⟨d', List.mem_cons_self⟩ + · -- carrier in consolidate tl; apply ih. + have hMem' : (uc, d'') ∈ UnifiedStream.consolidate tl := hMem + obtain ⟨d''', hMemTl⟩ := ih ⟨d'', hMem'⟩ + exact ⟨d''', List.mem_cons_of_mem _ hMemTl⟩ + end Mz diff --git a/doc/developer/semantics/Mz/UnifiedStream.lean b/doc/developer/semantics/Mz/UnifiedStream.lean index 3865143ec48d7..3416b027be375 100644 --- a/doc/developer/semantics/Mz/UnifiedStream.lean +++ b/doc/developer/semantics/Mz/UnifiedStream.lean @@ -362,6 +362,100 @@ theorem UnifiedStream.errorDiffCarriers_append (a b : UnifiedStream) : show (a ++ b).filterMap _ = a.filterMap _ ++ b.filterMap _ exact List.filterMap_append +/-! ### Membership characterizations + +Bridge the extractor membership predicates to the underlying +record-membership predicates. These let downstream proofs reason +about error scopes via the carrier and diff projections of +individual records without unfolding the `filterMap`. 
-/ + +theorem UnifiedStream.mem_errCarriers (us : UnifiedStream) (e : EvalError) : + e ∈ UnifiedStream.errCarriers us + ↔ ∃ d, (UnifiedRow.err e, d) ∈ us := by + induction us with + | nil => + constructor + · intro h; exact absurd h List.not_mem_nil + · intro ⟨_, h⟩; exact absurd h List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + cases uc with + | row r => + -- Head contributes nothing to errCarriers; reduce to tail. + have hRed : UnifiedStream.errCarriers ((UnifiedRow.row r, d) :: tl) + = UnifiedStream.errCarriers tl := rfl + rw [hRed] + constructor + · intro h; obtain ⟨d', hMem⟩ := ih.mp h + exact ⟨d', List.mem_cons_of_mem _ hMem⟩ + · intro ⟨d', hMem⟩ + rcases List.mem_cons.mp hMem with hHead | hTail + · -- hHead : (UnifiedRow.err e, d') = (UnifiedRow.row r, d). Impossible. + have hCarrier : UnifiedRow.err e = UnifiedRow.row r := + (Prod.mk.injEq _ _ _ _).mp hHead |>.1 + exact UnifiedRow.noConfusion hCarrier + · exact ih.mpr ⟨d', hTail⟩ + | err e0 => + -- Head contributes `e0` to errCarriers. 
+ have hRed : UnifiedStream.errCarriers ((UnifiedRow.err e0, d) :: tl) + = e0 :: UnifiedStream.errCarriers tl := rfl + rw [hRed] + constructor + · intro h + rcases List.mem_cons.mp h with hHead | hTail + · subst hHead; exact ⟨d, List.mem_cons_self⟩ + · obtain ⟨d', hMem⟩ := ih.mp hTail + exact ⟨d', List.mem_cons_of_mem _ hMem⟩ + · intro ⟨d', hMem⟩ + rcases List.mem_cons.mp hMem with hHead | hTail + · have hErr : UnifiedRow.err e = UnifiedRow.err e0 := + (Prod.mk.injEq _ _ _ _).mp hHead |>.1 + have : e = e0 := UnifiedRow.err.inj hErr + rw [this]; exact List.mem_cons_self + · exact List.mem_cons_of_mem _ (ih.mpr ⟨d', hTail⟩) + +theorem UnifiedStream.mem_errorDiffCarriers (us : UnifiedStream) (uc : UnifiedRow) : + uc ∈ UnifiedStream.errorDiffCarriers us + ↔ (uc, (DiffWithError.error : DiffWithError Int)) ∈ us := by + induction us with + | nil => + constructor + · intro h; exact absurd h List.not_mem_nil + · intro h; exact absurd h List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc0, d⟩ := hd + cases d with + | val n => + -- `.val n` head contributes nothing. + have hRed : UnifiedStream.errorDiffCarriers ((uc0, DiffWithError.val n) :: tl) + = UnifiedStream.errorDiffCarriers tl := rfl + rw [hRed] + constructor + · intro h; exact List.mem_cons_of_mem _ (ih.mp h) + · intro h + rcases List.mem_cons.mp h with hHead | hTail + · -- (uc, .error) = (uc0, .val n) impossible (diff mismatch). + have hDiff : (DiffWithError.error : DiffWithError Int) + = DiffWithError.val n := + (Prod.mk.injEq _ _ _ _).mp hHead |>.2 + cases hDiff + · exact ih.mpr hTail + | error => + -- `.error` head contributes `uc0`. 
+ have hRed : UnifiedStream.errorDiffCarriers ((uc0, DiffWithError.error) :: tl) + = uc0 :: UnifiedStream.errorDiffCarriers tl := rfl + rw [hRed] + constructor + · intro h + rcases List.mem_cons.mp h with hHead | hTail + · subst hHead; exact List.mem_cons_self + · exact List.mem_cons_of_mem _ (ih.mp hTail) + · intro h + rcases List.mem_cons.mp h with hHead | hTail + · have : uc = uc0 := (Prod.mk.injEq _ _ _ _).mp hHead |>.1 + rw [this]; exact List.mem_cons_self + · exact List.mem_cons_of_mem _ (ih.mpr hTail) + /-! ## Helper lemmas for filterMap over the packed concatenation -/ private theorem filterMap_pickRow_rowMap (rs : List Row) : From bce62ea015fb1f30581584ccd8f4bbb2c7234af0 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 18:09:30 +0200 Subject: [PATCH 085/127] doc/semantics: cross error-scope propagation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four propagation theorems for `UnifiedStream.cross`: * `mem_cross_of_mems`: every input pair contributes one output. * `cross_errCarriers_from_left`: left `.err e` paired with any right record yields `.err e` in output (left wins via combineCarrier). * `cross_errCarriers_from_right`: right `.err e` paired with a left `.row la` yields `.err e` in output. (Right does *not* propagate when left is also `.err` — combineCarrier's left-wins rule shadows it.) * `cross_errorDiffCarriers_from_left` / `..._from_right`: `.error` diff on either side absorbs through the diff product to give `.error` output diff. Together these formalize cross's multiplicative error growth: both row-err and collection-err sets grow with the cardinality of the opposite input. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/Join.lean | 102 +++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/doc/developer/semantics/Mz/Join.lean b/doc/developer/semantics/Mz/Join.lean index 8fc3c856af6a0..410ce1f21da2f 100644 --- a/doc/developer/semantics/Mz/Join.lean +++ b/doc/developer/semantics/Mz/Join.lean @@ -433,4 +433,106 @@ theorem UnifiedStream.cross_assoc (a b c : UnifiedStream) : intro cd _ exact cross_step_assoc ad bd cd +/-! ## Cross and error scopes + +A pair `(ld, rd)` of input records produces one output record +`(combineCarrier ld.1 rd.1, ld.2 * rd.2)` in `cross l r`. The +error-scope behavior of cross is then: + +* Row-err on left propagates to row-err on output (left wins on + carrier-combine conflict). +* Row-err on right propagates to row-err on output *when the + left record's carrier is `.row`* (otherwise the left's err wins). +* Collection-err on either side propagates to collection-err on + output, because `.error` absorbs in `DiffWithError` multiplication. + +These rules collectively show cross *grows* both error scopes +multiplicatively in the size of the opposite input. -/ + +/-- Every input pair `(ld, rd)` contributes one output record to +`cross l r`. -/ +theorem UnifiedStream.mem_cross_of_mems + (l r : UnifiedStream) + (ld : UnifiedRow × DiffWithError Int) (hL : ld ∈ l) + (rd : UnifiedRow × DiffWithError Int) (hR : rd ∈ r) : + (combineCarrier ld.1 rd.1, ld.2 * rd.2) ∈ UnifiedStream.cross l r := by + induction l with + | nil => exact absurd hL List.not_mem_nil + | cons hdL tlL ih => + rw [UnifiedStream.cross_cons_left] + rcases List.mem_cons.mp hL with hHd | hTl + · subst hHd + exact List.mem_append.mpr (Or.inl (List.mem_map.mpr ⟨rd, hR, rfl⟩)) + · exact List.mem_append.mpr (Or.inr (ih hTl)) + +/-- Left-side row-err propagates: a `.err e` carrier on the left +combined with any right record produces an `.err e` carrier in +the output (left wins on combineCarrier). 
-/ +theorem UnifiedStream.cross_errCarriers_from_left + (l r : UnifiedStream) (e : EvalError) (d : DiffWithError Int) + (hL : (UnifiedRow.err e, d) ∈ l) + (rd : UnifiedRow × DiffWithError Int) (hR : rd ∈ r) : + e ∈ UnifiedStream.errCarriers (UnifiedStream.cross l r) := by + have hMem : (combineCarrier (UnifiedRow.err e) rd.1, d * rd.2) + ∈ UnifiedStream.cross l r := + UnifiedStream.mem_cross_of_mems l r (UnifiedRow.err e, d) hL rd hR + have hCombine : combineCarrier (UnifiedRow.err e) rd.1 = UnifiedRow.err e := by + cases rd.1 <;> rfl + rw [hCombine] at hMem + exact (UnifiedStream.mem_errCarriers _ _).mpr ⟨d * rd.2, hMem⟩ + +/-- Right-side row-err propagates when paired with a left-row +record. A right `.err e` combined with a left `.row la` gives +output `.err e` (combineCarrier's third arm). When paired with a +left `.err e'`, the left wins; the right `.err e` does not appear +under that pairing. -/ +theorem UnifiedStream.cross_errCarriers_from_right + (l r : UnifiedStream) (e : EvalError) (d : DiffWithError Int) + (hR : (UnifiedRow.err e, d) ∈ r) + (la : Row) (dL : DiffWithError Int) + (hL : (UnifiedRow.row la, dL) ∈ l) : + e ∈ UnifiedStream.errCarriers (UnifiedStream.cross l r) := by + have hMem : (combineCarrier (UnifiedRow.row la) (UnifiedRow.err e), dL * d) + ∈ UnifiedStream.cross l r := + UnifiedStream.mem_cross_of_mems l r + (UnifiedRow.row la, dL) hL (UnifiedRow.err e, d) hR + have hCombine : combineCarrier (UnifiedRow.row la) (UnifiedRow.err e) + = UnifiedRow.err e := rfl + rw [hCombine] at hMem + exact (UnifiedStream.mem_errCarriers _ _).mpr ⟨dL * d, hMem⟩ + +/-- Left-side collection-err propagates: a `.error`-diff record on +the left combined with any right record produces an `.error`-diff +output (`.error * d = .error`). 
-/ +theorem UnifiedStream.cross_errorDiffCarriers_from_left + (l r : UnifiedStream) (uc : UnifiedRow) + (hL : (uc, (DiffWithError.error : DiffWithError Int)) ∈ l) + (rd : UnifiedRow × DiffWithError Int) (hR : rd ∈ r) : + combineCarrier uc rd.1 + ∈ UnifiedStream.errorDiffCarriers (UnifiedStream.cross l r) := by + have hMem : (combineCarrier uc rd.1, + (DiffWithError.error : DiffWithError Int) * rd.2) + ∈ UnifiedStream.cross l r := + UnifiedStream.mem_cross_of_mems l r + (uc, (DiffWithError.error : DiffWithError Int)) hL rd hR + rw [DiffWithError.error_mul_left] at hMem + exact (UnifiedStream.mem_errorDiffCarriers _ _).mpr hMem + +/-- Right-side collection-err propagates: a `.error`-diff record +on the right combined with any left record produces an +`.error`-diff output (`d * .error = .error`). -/ +theorem UnifiedStream.cross_errorDiffCarriers_from_right + (l r : UnifiedStream) (uc : UnifiedRow) + (hR : (uc, (DiffWithError.error : DiffWithError Int)) ∈ r) + (ld : UnifiedRow × DiffWithError Int) (hL : ld ∈ l) : + combineCarrier ld.1 uc + ∈ UnifiedStream.errorDiffCarriers (UnifiedStream.cross l r) := by + have hMem : (combineCarrier ld.1 uc, + ld.2 * (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.cross l r := + UnifiedStream.mem_cross_of_mems l r ld hL + (uc, (DiffWithError.error : DiffWithError Int)) hR + rw [DiffWithError.error_mul_right] at hMem + exact (UnifiedStream.mem_errorDiffCarriers _ _).mpr hMem + end Mz From 12a057a1b0fa060b124ffd373e0d76a55c7dd78e Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 19:03:57 +0200 Subject: [PATCH 086/127] doc/semantics: escalateRowErrs operator Add UnifiedStream.escalateRowErrs: promotes row-scoped errors to collection-scoped. Each (.err e, _) record has its diff overwritten to .error; .row r records untouched. Theorems: length-preserving, idempotent, preserves errCarriers, and escalateRowErrs_errCarriers_in_errorDiff makes the promotion observable on errorDiffCarriers. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/UnifiedStream.lean | 99 +++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/doc/developer/semantics/Mz/UnifiedStream.lean b/doc/developer/semantics/Mz/UnifiedStream.lean index 3416b027be375..ecd567ce2d74d 100644 --- a/doc/developer/semantics/Mz/UnifiedStream.lean +++ b/doc/developer/semantics/Mz/UnifiedStream.lean @@ -456,6 +456,105 @@ theorem UnifiedStream.mem_errorDiffCarriers (us : UnifiedStream) (uc : UnifiedRo rw [this]; exact List.mem_cons_self · exact List.mem_cons_of_mem _ (ih.mpr hTail) +/-! ## Error-scope escalation + +`escalateRowErrs` promotes every row-scoped error to a +collection-scoped error: each `(.err e, _)` record has its diff +overwritten to `.error`. The `.row r` records are untouched. + +This is the canonical operator for the "row err means the whole +collection is broken at this point" semantics. The companion +`escalateRowErrs_idem` says re-escalating is a no-op. -/ + +def UnifiedStream.escalateRowErrs (us : UnifiedStream) : UnifiedStream := + us.map fun ud => match ud.1 with + | .err e => (UnifiedRow.err e, DiffWithError.error) + | .row _ => ud + +theorem UnifiedStream.escalateRowErrs_nil : + UnifiedStream.escalateRowErrs [] = [] := rfl + +theorem UnifiedStream.escalateRowErrs_length (us : UnifiedStream) : + (UnifiedStream.escalateRowErrs us).length = us.length := + List.length_map _ + +theorem UnifiedStream.escalateRowErrs_idem (us : UnifiedStream) : + UnifiedStream.escalateRowErrs (UnifiedStream.escalateRowErrs us) + = UnifiedStream.escalateRowErrs us := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + cases uc with + | row r => + show ((UnifiedRow.row r, d) :: + (tl.map (fun ud => match ud.1 with + | .err e => (UnifiedRow.err e, DiffWithError.error) + | .row _ => ud))).map _ + = (UnifiedRow.row r, d) :: _ + simp only [List.map_cons] + show (UnifiedRow.row r, d) :: + (tl.map (fun ud => match ud.1 with + | .err e => 
+ (UnifiedRow.err e, DiffWithError.error) + | .row _ => ud)).map _ + = (UnifiedRow.row r, d) :: _ + exact congrArg (fun t => (UnifiedRow.row r, d) :: t) ih + | err e => + show ((UnifiedRow.err e, DiffWithError.error) :: + (tl.map (fun ud => match ud.1 with + | .err e => (UnifiedRow.err e, DiffWithError.error) + | .row _ => ud))).map _ + = (UnifiedRow.err e, DiffWithError.error) :: _ + simp only [List.map_cons] + exact congrArg (fun t => (UnifiedRow.err e, DiffWithError.error) :: t) ih + +/-- Escalation leaves the list of row-err payloads unchanged: +same elements, same order, same multiplicity. Escalation never +deletes or reorders a carrier; it only overwrites the diff of +each `.err` record with `.error`. -/ +theorem UnifiedStream.escalateRowErrs_errCarriers (us : UnifiedStream) : + UnifiedStream.errCarriers (UnifiedStream.escalateRowErrs us) + = UnifiedStream.errCarriers us := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + cases uc with + | row r => + show UnifiedStream.errCarriers ((UnifiedRow.row r, d) + :: UnifiedStream.escalateRowErrs tl) + = UnifiedStream.errCarriers ((UnifiedRow.row r, d) :: tl) + show (UnifiedStream.escalateRowErrs tl).filterMap _ = tl.filterMap _ + exact ih + | err e => + show UnifiedStream.errCarriers + ((UnifiedRow.err e, DiffWithError.error) + :: UnifiedStream.escalateRowErrs tl) + = UnifiedStream.errCarriers ((UnifiedRow.err e, d) :: tl) + show e :: (UnifiedStream.escalateRowErrs tl).filterMap _ + = e :: tl.filterMap _ + exact congrArg _ ih + +/-- After escalation, every row-err carrier from input appears in +the output's collection-err set: the escalation is observable on +`errorDiffCarriers`. 
-/ +theorem UnifiedStream.escalateRowErrs_errCarriers_in_errorDiff + (us : UnifiedStream) (e : EvalError) + (h : e ∈ UnifiedStream.errCarriers us) : + UnifiedRow.err e + ∈ UnifiedStream.errorDiffCarriers (UnifiedStream.escalateRowErrs us) := by + obtain ⟨d, hMem⟩ := (UnifiedStream.mem_errCarriers us e).mp h + -- Build the membership witness in escalateRowErrs us. + have hMemMap : (UnifiedRow.err e, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.escalateRowErrs us := by + show (UnifiedRow.err e, DiffWithError.error) + ∈ us.map (fun ud => match ud.1 with + | .err e => (UnifiedRow.err e, DiffWithError.error) + | .row _ => ud) + refine List.mem_map.mpr ⟨(UnifiedRow.err e, d), hMem, ?_⟩ + rfl + exact (UnifiedStream.mem_errorDiffCarriers _ _).mpr hMemMap + /-! ## Helper lemmas for filterMap over the packed concatenation -/ private theorem filterMap_pickRow_rowMap (rs : List Row) : From 2f8377a079ab109bd61429cddccf16230690d31b Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 19:07:03 +0200 Subject: [PATCH 087/127] doc/semantics: project error-scope theorems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror filter error-scope formalization for project: * project_errorDiffCarriers: equality. project preserves the collection-err set. rowProjectRecords never produces .error diffs (it inherits the input diff verbatim). * project_errCarriers_mono: monotone. row-err set may grow when rowProjectRecords emits new .err carriers from scalar errors. filter and project are the two cell-to-row promotion sites. Both preserve collection-scope and grow row-scope. The shared shape suggests a generic theorem about flatMap-based diff-aware ops covering both, but that abstraction is deferred until a third example shows the pattern is reusable. Helper: errorDiffCarriers_eq_nil_of_all_val — a list with all .val diffs has empty errorDiffCarriers. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/UnifiedStream.lean | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/doc/developer/semantics/Mz/UnifiedStream.lean b/doc/developer/semantics/Mz/UnifiedStream.lean index ecd567ce2d74d..189ee7fe013fb 100644 --- a/doc/developer/semantics/Mz/UnifiedStream.lean +++ b/doc/developer/semantics/Mz/UnifiedStream.lean @@ -456,6 +456,117 @@ theorem UnifiedStream.mem_errorDiffCarriers (us : UnifiedStream) (uc : UnifiedRo rw [this]; exact List.mem_cons_self · exact List.mem_cons_of_mem _ (ih.mpr hTail) +/-! ## `project` and error scopes + +`project` mirrors `filter` on error-scope behavior: + +* Collection-err set is preserved exactly. The `(_, .error)` arm + passes the absorbing diff through; every other arm produces + `.val`-diff output only (`rowProjectRecords` preserves the + input diff verbatim onto each emitted record). +* Row-err set is monotone (grows). The `(.row r, .val n)` arm + may emit zero or more `.err e` records when scalar evaluation + fails — the cell-to-row promotion site. -/ + +private theorem errorDiffCarriers_eq_nil_of_all_val + (us : UnifiedStream) + (h : ∀ rec ∈ us, ∃ n : Int, rec.2 = DiffWithError.val n) : + UnifiedStream.errorDiffCarriers us = [] := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨n, hN⟩ := h hd List.mem_cons_self + obtain ⟨uc, d⟩ := hd + have hD : d = DiffWithError.val n := hN + subst hD + have hRed : UnifiedStream.errorDiffCarriers + ((uc, DiffWithError.val n) :: tl) + = UnifiedStream.errorDiffCarriers tl := rfl + rw [hRed] + exact ih (fun rec hMem => h rec (List.mem_cons_of_mem _ hMem)) + +private theorem project_singleton_errorDiffCarriers + (es : List Expr) (uc : UnifiedRow) (d : DiffWithError Int) : + UnifiedStream.errorDiffCarriers (UnifiedStream.project es [(uc, d)]) + = UnifiedStream.errorDiffCarriers [(uc, d)] := by + cases d with + | error => + -- First arm: project passes (uc, .error) through. 
+ cases uc <;> rfl + | val n => + cases uc with + | err _ => rfl + | row r => + have hP : UnifiedStream.project es + [(UnifiedRow.row r, DiffWithError.val n)] + = rowProjectRecords es (DiffWithError.val n) r := by + show List.flatMap _ _ = _ + simp only [List.flatMap_cons, List.flatMap_nil, List.append_nil] + rw [hP] + have hRhs : UnifiedStream.errorDiffCarriers + [(UnifiedRow.row r, DiffWithError.val n)] = [] := rfl + rw [hRhs] + exact errorDiffCarriers_eq_nil_of_all_val _ + (rowProjectRecords_no_error es n r) + +theorem UnifiedStream.project_errorDiffCarriers + (es : List Expr) (us : UnifiedStream) : + UnifiedStream.errorDiffCarriers (UnifiedStream.project es us) + = UnifiedStream.errorDiffCarriers us := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hCons : ((uc, d) :: tl : UnifiedStream) = [(uc, d)] ++ tl := rfl + rw [hCons, UnifiedStream.project_append, + UnifiedStream.errorDiffCarriers_append, + UnifiedStream.errorDiffCarriers_append, ih, + project_singleton_errorDiffCarriers es uc d] + +private theorem project_singleton_errCarriers_mono + (es : List Expr) (uc : UnifiedRow) (d : DiffWithError Int) (e : EvalError) + (h : e ∈ UnifiedStream.errCarriers [(uc, d)]) : + e ∈ UnifiedStream.errCarriers (UnifiedStream.project es [(uc, d)]) := by + cases uc with + | row r => + have hEmpty : UnifiedStream.errCarriers [(UnifiedRow.row r, d)] = [] := rfl + rw [hEmpty] at h + exact absurd h List.not_mem_nil + | err e0 => + have hSingle : UnifiedStream.errCarriers [(UnifiedRow.err e0, d)] = [e0] := rfl + rw [hSingle] at h + have hEq : e = e0 := List.mem_singleton.mp h + subst hEq + cases d with + | error => + have hP : UnifiedStream.project es + [(UnifiedRow.err e, DiffWithError.error)] + = [(UnifiedRow.err e, DiffWithError.error)] := rfl + rw [hP] + exact List.mem_singleton.mpr rfl + | val n => + have hP : UnifiedStream.project es + [(UnifiedRow.err e, DiffWithError.val n)] + = [(UnifiedRow.err e, DiffWithError.val n)] := rfl + rw 
[hP] + exact List.mem_singleton.mpr rfl + +theorem UnifiedStream.project_errCarriers_mono + (es : List Expr) (us : UnifiedStream) (e : EvalError) + (h : e ∈ UnifiedStream.errCarriers us) : + e ∈ UnifiedStream.errCarriers (UnifiedStream.project es us) := by + induction us with + | nil => exact absurd h List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hCons : ((uc, d) :: tl : UnifiedStream) = [(uc, d)] ++ tl := rfl + rw [hCons, UnifiedStream.errCarriers_append] at h + rw [hCons, UnifiedStream.project_append, UnifiedStream.errCarriers_append] + rcases List.mem_append.mp h with hHead | hTail + · exact List.mem_append.mpr + (Or.inl (project_singleton_errCarriers_mono es uc d e hHead)) + · exact List.mem_append.mpr (Or.inr (ih hTail)) + /-! ## Error-scope escalation `escalateRowErrs` promotes every row-scoped error to a From d93635ef0736587e7e71995b6c2a7432c2e057f6 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 19:11:32 +0200 Subject: [PATCH 088/127] doc/semantics: consolidate errorDiff iff (reverse direction) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Close the loop on consolidate vs collection-err set: previously forward-only (consolidate_errorDiffCarriers_mono via consolidate_preserves_error). Now full equivalence. DiffSemiring: add_eq_error_left_or_right — converse of absorption. The .val + .val arm never returns .error, so a sum equal to .error rules in at least one .error summand. UnifiedConsolidate: * consolidateInto_error_inv: a .error diff at carrier uc in the output of consolidateInto came from either the inserted (uc', d') being .error or the existing bucket carrying .error. * consolidate_error_inv: (uc, .error) in consolidate us implies (uc, .error) in us. Per-carrier inversion via the DiffSemiring helper. SetOps: * consolidate_errorDiffCarriers_iff: combines preserves_error (forward) with error_inv (reverse). 
The collection-err set is preserved exactly under consolidation. Together with consolidate_errCarriers_iff, both error scopes are now characterized as set-level invariants of consolidation. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/DiffSemiring.lean | 17 +++++ doc/developer/semantics/Mz/SetOps.lean | 14 ++++ .../semantics/Mz/UnifiedConsolidate.lean | 70 +++++++++++++++++++ 3 files changed, 101 insertions(+) diff --git a/doc/developer/semantics/Mz/DiffSemiring.lean b/doc/developer/semantics/Mz/DiffSemiring.lean index aa96e2e227713..e7d18dd9882f9 100644 --- a/doc/developer/semantics/Mz/DiffSemiring.lean +++ b/doc/developer/semantics/Mz/DiffSemiring.lean @@ -92,6 +92,23 @@ theorem error_add_right [Add α] (x : DiffWithError α) : | val _ => rfl | error => rfl +/-- Converse of absorption: `.error` can only emerge from `+` if at +least one summand was `.error`. The `.val + .val` branch always +returns `.val`, so the result `.error` rules it out. -/ +theorem add_eq_error_left_or_right [Add α] (a b : DiffWithError α) + (h : a + b = error) : + a = error ∨ b = error := by + cases a with + | error => exact Or.inl rfl + | val x => + cases b with + | error => exact Or.inr rfl + | val y => + have hEq : (DiffWithError.val x : DiffWithError α) + DiffWithError.val y + = DiffWithError.val (x + y) := rfl + rw [hEq] at h + cases h + theorem error_mul_left [Mul α] (y : DiffWithError α) : (error : DiffWithError α) * y = error := rfl diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 4ec54e5db8709..ff477f9693cfd 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -1492,4 +1492,18 @@ theorem UnifiedStream.consolidate_errorDiffCarriers_mono rw [UnifiedStream.mem_errorDiffCarriers] exact UnifiedStream.consolidate_preserves_error us uc h +/-- Full equivalence: the collection-err set of the consolidated +stream equals the input's. 
Combines `consolidate_preserves_error` +(forward) with `consolidate_error_inv` (reverse — no spurious +`.error` emerges from `.val + .val`). -/ +theorem UnifiedStream.consolidate_errorDiffCarriers_iff + (us : UnifiedStream) (uc : UnifiedRow) : + uc ∈ UnifiedStream.errorDiffCarriers (UnifiedStream.consolidate us) + ↔ uc ∈ UnifiedStream.errorDiffCarriers us := by + rw [UnifiedStream.mem_errorDiffCarriers, + UnifiedStream.mem_errorDiffCarriers] + constructor + · intro h; exact UnifiedStream.consolidate_error_inv us uc h + · intro h; exact UnifiedStream.consolidate_preserves_error us uc h + end Mz diff --git a/doc/developer/semantics/Mz/UnifiedConsolidate.lean b/doc/developer/semantics/Mz/UnifiedConsolidate.lean index 14039a535d720..8ba5980d6e95a 100644 --- a/doc/developer/semantics/Mz/UnifiedConsolidate.lean +++ b/doc/developer/semantics/Mz/UnifiedConsolidate.lean @@ -569,4 +569,74 @@ theorem UnifiedStream.mem_of_mem_consolidate obtain ⟨d''', hMemTl⟩ := ih ⟨d'', hMem'⟩ exact ⟨d''', List.mem_cons_of_mem _ hMemTl⟩ +/-! ## Error-diff inversion + +Reverse direction of `consolidate_preserves_error`: a `.error` +diff in the consolidated output must have come from a `.error` +diff in the input at the same carrier. Builds on the DiffSemiring +inversion `add_eq_error_left_or_right`. -/ + +private theorem consolidateInto_error_inv + (uc uc' : UnifiedRow) (d' : DiffWithError Int) (us : UnifiedStream) + (h : (uc, (DiffWithError.error : DiffWithError Int)) + ∈ consolidateInto uc' d' us) : + (uc = uc' ∧ d' = DiffWithError.error) + ∨ (uc, (DiffWithError.error : DiffWithError Int)) ∈ us := by + induction us with + | nil => + -- consolidateInto uc' d' [] = [(uc', d')]; so (uc, .error) = (uc', d'). 
+ have hEq : (uc, (DiffWithError.error : DiffWithError Int)) = (uc', d') := + List.mem_singleton.mp h + have hUc : uc = uc' := (Prod.mk.injEq _ _ _ _).mp hEq |>.1 + have hD : (DiffWithError.error : DiffWithError Int) = d' := + (Prod.mk.injEq _ _ _ _).mp hEq |>.2 + exact Or.inl ⟨hUc, hD.symm⟩ + | cons hd tl ih => + obtain ⟨uc'', d''⟩ := hd + by_cases hEq : uc' = uc'' + · subst hEq + rw [consolidateInto_match] at h + rcases List.mem_cons.mp h with hHead | hTail + · -- (uc, .error) = (uc', d' + d''); two summand sources. + have hUc : uc = uc' := (Prod.mk.injEq _ _ _ _).mp hHead |>.1 + have hD : (DiffWithError.error : DiffWithError Int) = d' + d'' := + (Prod.mk.injEq _ _ _ _).mp hHead |>.2 + subst hUc + rcases DiffWithError.add_eq_error_left_or_right d' d'' hD.symm with hD' | hD'' + · exact Or.inl ⟨rfl, hD'⟩ + · subst hD'' + exact Or.inr List.mem_cons_self + · exact Or.inr (List.mem_cons_of_mem _ hTail) + · rw [consolidateInto_skip _ _ _ _ _ hEq] at h + rcases List.mem_cons.mp h with hHead | hTail + · -- (uc, .error) = (uc'', d''); preserved from input head. + have hUc : uc = uc'' := (Prod.mk.injEq _ _ _ _).mp hHead |>.1 + have hD : (DiffWithError.error : DiffWithError Int) = d'' := + (Prod.mk.injEq _ _ _ _).mp hHead |>.2 + subst hUc; subst hD + exact Or.inr List.mem_cons_self + · rcases ih hTail with ⟨hUc, hD⟩ | hMem + · exact Or.inl ⟨hUc, hD⟩ + · exact Or.inr (List.mem_cons_of_mem _ hMem) + +/-- Every collection-err carrier in the consolidated output +corresponds to a collection-err carrier in the input at the +same carrier. The strict converse of +`consolidate_preserves_error`. 
-/ +theorem UnifiedStream.consolidate_error_inv + (us : UnifiedStream) (uc : UnifiedRow) + (h : (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.consolidate us) : + (uc, (DiffWithError.error : DiffWithError Int)) ∈ us := by + induction us with + | nil => exact absurd h List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc', d'⟩ := hd + have hInto : (uc, (DiffWithError.error : DiffWithError Int)) + ∈ consolidateInto uc' d' (UnifiedStream.consolidate tl) := h + rcases consolidateInto_error_inv uc uc' d' _ hInto with ⟨hUc, hD⟩ | hMem + · subst hUc; subst hD + exact List.mem_cons_self + · exact List.mem_cons_of_mem _ (ih hMem) + end Mz From 028114ce0f09b8f83f6a54155cc0cf53621f4d81 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 19:16:26 +0200 Subject: [PATCH 089/127] doc/semantics: join error-scope theorems join pred l r := filter pred (cross l r). Error-scope behavior follows by composition: * join_errorDiffCarriers: equals errorDiffCarriers of cross. Filter preserves the collection-err set exactly. * join_errCarriers_mono: every row-err in cross is in join. Filter is monotone on row-errs; cross propagation determines the cross row-err set. Direct corollaries of filter_errorDiffCarriers / filter_errCarriers_mono applied to UnifiedStream.cross l r. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index ff477f9693cfd..1888b755e5973 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -1506,4 +1506,26 @@ theorem UnifiedStream.consolidate_errorDiffCarriers_iff · intro h; exact UnifiedStream.consolidate_error_inv us uc h · intro h; exact UnifiedStream.consolidate_preserves_error us uc h +/-! ## Join and error scopes + +`join pred l r = filter pred (cross l r)`. 
Error-scope behavior +composes from `filter` and `cross` theorems above: + +* Collection-err set equals that of `cross l r` (filter preserves + the `.error` diff set exactly). +* Row-err set grows monotonically over `cross l r` via `filter`'s + cell-to-row promotion. -/ + +theorem UnifiedStream.join_errorDiffCarriers + (pred : Expr) (l r : UnifiedStream) : + UnifiedStream.errorDiffCarriers (UnifiedStream.join pred l r) + = UnifiedStream.errorDiffCarriers (UnifiedStream.cross l r) := + UnifiedStream.filter_errorDiffCarriers pred (UnifiedStream.cross l r) + +theorem UnifiedStream.join_errCarriers_mono + (pred : Expr) (l r : UnifiedStream) (e : EvalError) + (h : e ∈ UnifiedStream.errCarriers (UnifiedStream.cross l r)) : + e ∈ UnifiedStream.errCarriers (UnifiedStream.join pred l r) := + UnifiedStream.filter_errCarriers_mono pred (UnifiedStream.cross l r) e h + end Mz From 93b4e01ecfb778eba7b1522769c7ead206944f2c Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 19:17:26 +0200 Subject: [PATCH 090/127] doc/semantics: union and exceptAll error-scope theorems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compose existing pieces: * union := consolidate ∘ unionAll. Errors flow through unionAll (concat) and consolidate (set-preserving on both scopes). Both error sets become disjoint union of inputs. * exceptAll := consolidate (unionAll L (negate R)). negate preserves both error scopes (proven earlier); the rest matches union. Both error sets are disjoint union — exceptAll does not remove errors from R despite the bag-subtraction semantics on carriers (errors flow through unchanged). Theorems: * union_errCarriers_iff / union_errorDiffCarriers_iff * exceptAll_errCarriers_iff / exceptAll_errorDiffCarriers_iff The composition pattern (consolidate ∘ unionAll ∘ negate) covers both bag SQL set ops via the underlying signed-diff arithmetic without changing the error-scope sets. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 61 ++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 1888b755e5973..99d49e9df12bb 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -1528,4 +1528,65 @@ theorem UnifiedStream.join_errCarriers_mono e ∈ UnifiedStream.errCarriers (UnifiedStream.join pred l r) := UnifiedStream.filter_errCarriers_mono pred (UnifiedStream.cross l r) e h +/-! ## `union` and error scopes + +`union := consolidate ∘ unionAll`. Compose +`unionAll_errCarriers` (concat) with `consolidate_errCarriers_iff` +(set-preserving). The result: the row-err set of `union L R` is +the union of `L`'s and `R`'s row-err sets (as a set; +multiplicity collapses via consolidate). + +Same shape for collection-err. -/ + +theorem UnifiedStream.union_errCarriers_iff + (l r : UnifiedStream) (e : EvalError) : + e ∈ UnifiedStream.errCarriers (UnifiedStream.union l r) + ↔ e ∈ UnifiedStream.errCarriers l ∨ e ∈ UnifiedStream.errCarriers r := by + show e ∈ UnifiedStream.errCarriers + (UnifiedStream.consolidate (UnifiedStream.unionAll l r)) ↔ _ + rw [UnifiedStream.consolidate_errCarriers_iff, + UnifiedStream.unionAll_errCarriers, List.mem_append] + +theorem UnifiedStream.union_errorDiffCarriers_iff + (l r : UnifiedStream) (uc : UnifiedRow) : + uc ∈ UnifiedStream.errorDiffCarriers (UnifiedStream.union l r) + ↔ uc ∈ UnifiedStream.errorDiffCarriers l + ∨ uc ∈ UnifiedStream.errorDiffCarriers r := by + show uc ∈ UnifiedStream.errorDiffCarriers + (UnifiedStream.consolidate (UnifiedStream.unionAll l r)) ↔ _ + rw [UnifiedStream.consolidate_errorDiffCarriers_iff, + UnifiedStream.unionAll_errorDiffCarriers, List.mem_append] + +/-! ## `exceptAll` and error scopes + +`exceptAll L R := consolidate (unionAll L (negate R))`.
The +right side flows through `negate`, which preserves both error +scopes (proven above). The composition with `consolidate` +preserves both scopes as sets. So `exceptAll`'s row-err set is +the union of `L`'s and `R`'s, and same for +collection-err. -/ + +theorem UnifiedStream.exceptAll_errCarriers_iff + (l r : UnifiedStream) (e : EvalError) : + e ∈ UnifiedStream.errCarriers (UnifiedStream.exceptAll l r) + ↔ e ∈ UnifiedStream.errCarriers l ∨ e ∈ UnifiedStream.errCarriers r := by + show e ∈ UnifiedStream.errCarriers + (UnifiedStream.consolidate + (UnifiedStream.unionAll l (UnifiedStream.negate r))) ↔ _ + rw [UnifiedStream.consolidate_errCarriers_iff, + UnifiedStream.unionAll_errCarriers, + UnifiedStream.negate_errCarriers, List.mem_append] + +theorem UnifiedStream.exceptAll_errorDiffCarriers_iff + (l r : UnifiedStream) (uc : UnifiedRow) : + uc ∈ UnifiedStream.errorDiffCarriers (UnifiedStream.exceptAll l r) + ↔ uc ∈ UnifiedStream.errorDiffCarriers l + ∨ uc ∈ UnifiedStream.errorDiffCarriers r := by + show uc ∈ UnifiedStream.errorDiffCarriers + (UnifiedStream.consolidate + (UnifiedStream.unionAll l (UnifiedStream.negate r))) ↔ _ + rw [UnifiedStream.consolidate_errorDiffCarriers_iff, + UnifiedStream.unionAll_errorDiffCarriers, + UnifiedStream.negate_errorDiffCarriers, List.mem_append] + end Mz From 7d850e21616d4e3d583d0af67605365e7846c9af Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 19:49:44 +0200 Subject: [PATCH 091/127] doc/semantics: README updates for error-scope theorems Add per-file documentation for the new error-scope work.
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index cac8fcb7c60c9..10191d637d5a5 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -36,9 +36,12 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `Expr.colReferencesUnused n e` (mutual with `Expr.argsColRefUnused`) returns `true` when `e` never reads column `n`. Companion `Env.replaceAt env n v` swaps a single position. Headline `eval_replaceAt_of_unused`: when column `n` is unused, replacing its value does not change eval. Supports column-pruning rewrites: a projection that drops unused columns is sound. Supporting `Env.get_replaceAt_eq` and `Env.get_replaceAt_ne` discharge the per-column reductions. `Expr.colReferencesUnused_of_bounded` (mutual with the operand-list version) bridges the two analyzers: a predicate bounded by `n` has every column `i ≥ n` unused. Lets the optimizer derive column-pruning consequences from a tight bound. Supporting environment lemmas: `Env.get_append_left` (read from prefix) and `Env.get_append_right` (read from suffix with index shift). -* `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `-`, `0`, `1`, `min` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity / negation / min laws that downstream operators must respect. Negation laws (`neg_error`, `neg_val`, `neg_neg_val`, `val_add_neg_val`, `neg_mul`, `mul_neg`, `neg_add`) carry the principle that `.error` is unrecoverable — a collection-scoped error cannot be subtracted away, and that negation distributes through addition and multiplication. 
Min laws (`error_min_left`, `error_min_right`, `min_val_val`) define the bag-min combinator with `.error` absorbing. The `_int` specializations discharge the base hypotheses at `Int` so downstream code in `Mz/Join.lean`, `Mz/UnifiedConsolidate.lean`, and `Mz/SetOps.lean` can cite the named laws directly. +* `Mz/DiffSemiring.lean`: `DiffWithError α` — the diff-field type extension that encodes global (collection-scoped) errors as an absorbing element. Provides `+`, `*`, `-`, `0`, `1`, `min` instances over an arbitrary base diff and proves the absorption / commutativity / associativity / distributivity / negation / min laws that downstream operators must respect. Negation laws (`neg_error`, `neg_val`, `neg_neg_val`, `val_add_neg_val`, `neg_mul`, `mul_neg`, `neg_add`) carry the principle that `.error` is unrecoverable — a collection-scoped error cannot be subtracted away, and that negation distributes through addition and multiplication. Min laws (`error_min_left`, `error_min_right`, `min_val_val`) define the bag-min combinator with `.error` absorbing. The `_int` specializations discharge the base hypotheses at `Int` so downstream code in `Mz/Join.lean`, `Mz/UnifiedConsolidate.lean`, and `Mz/SetOps.lean` can cite the named laws directly. The inversion law `add_eq_error_left_or_right` is the converse of absorption — a sum equal to `.error` forces at least one summand to be `.error` — required by `consolidate_error_inv`, the reverse direction of `consolidate_preserves_error`. * `Mz/UnifiedStream.lean`: unified diff-aware alternative to `BagStream`. `UnifiedStream := List (UnifiedRow × DiffWithError Int)` pairs a carrier (data row or row-scoped err) with a differential-dataflow diff augmented by the absorbing `error` element. Row-scoped errors flow through the carrier; collection-scoped errors flow through diff multiplication / addition. `ofBag` / `split` conversions assign every bag record a diff of `.val 1`; the round-trip theorem `split (ofBag s) = s` holds.
The cross-direction is exact only up to multiset equality on `List EvalError` and is lossy for diffs ≠ `.val 1` (split drops diff information). Error-scope extractors: `errCarriers us` returns the list of row-scoped err payloads (carrier = `.err e`); `errorDiffCarriers us` returns the list of carriers whose diff is collection-scoped `.error`. Theorems: `errCarriers_nil`/`_append`, `errorDiffCarriers_nil`/`_append`. The two sets are independent observable properties — operators that touch one need not touch the other. + Bridge lemmas `mem_errCarriers` and `mem_errorDiffCarriers` convert membership in the extractor lists to record-level carrier / diff membership in the stream, letting downstream proofs reason without unfolding the underlying `filterMap`. + Per-operator error-scope behavior: `filter_errorDiffCarriers` (filter preserves the collection-err set exactly) and `filter_errCarriers_mono` (filter is monotone on row-errs — cell-to-row promotion only adds carriers). Same pair for `project`: `project_errorDiffCarriers` (equality) and `project_errCarriers_mono` (monotone). filter and project are the two cell-to-row promotion sites. + `escalateRowErrs us` is the explicit row-to-collection promotion operator: every `(.err e, _)` record's diff is overwritten to `.error`; `.row r` records untouched. Theorems: `escalateRowErrs_length` (length-preserving), `escalateRowErrs_idem` (idempotent), `escalateRowErrs_errCarriers` (row-err set preserved), `escalateRowErrs_errCarriers_in_errorDiff` (every input row-err appears in the output collection-err set — the observable promotion property). `UnifiedStream.project` lifts `BagStream.project` to the diff-aware carrier. 
Records with `.error` diff or `.err` carrier pass through unchanged; a `.row r` record with `.val n` diff is evaluated against `es` — if every scalar succeeds, the row is emitted with diff `.val n`; if any scalar errs, one `(.err e, .val n)` is emitted per erroring scalar (each preserving the original multiplicity). Theorems: `project_preserves_error_diff` (an `.error` diff in the input always reaches the output), `project_no_error` (all-`.val` inputs yield all-`.val` outputs), `project_nil_es` (empty projection list collapses every row to width-zero), `project_nil_stream` (empty stream is empty). * `Mz/Aggregate.lean`: aggregate reductions over `List Datum`. `aggCountNonNull` for `COUNT(expr)`. `aggStrict` for `SUM`/`MIN`/`MAX`-style aggregates that propagate `err` (first one in scan order wins) and skip `NULL`s. `aggTry` for the proposed `try_sum`/`try_min`/`try_max` variants that swallow `err` into `NULL` instead of propagating, defined as a post-pass on `aggStrict`. Theorems: `aggStrict_err` (any `err` input → `err` output), `aggStrict_no_err` (no-err inputs + no-err reducer → no-err output), `aggTry_no_err` (the non-strict variant never errors), and `aggTry_eq_aggStrict_of_no_err` (strict and non-strict agree on error-free inputs). * `Mz/Consolidate.lean`: per-key diff summation over `List (DiffWithError α)`. The headline `sumAll_eq_error_of_mem` proves that an `error` diff anywhere in the list absorbs the consolidated sum to `error`, which is the property a differential dataflow `compact` operator cites when propagating global errors through consolidation. Companion `sumAll_val_of_all_val` says an all-`val` list sums to `val` of some base value. @@ -50,8 +53,11 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four *no-error preservation* — `consolidate_no_error` proves that if every input diff is a `.val`, every output diff is a `.val`, so `.error` is the only source of absorption. 
*strict shrinkage* — `consolidate_strict_length_dup` proves that two adjacent records sharing a carrier compress to one in the output: `(consolidate ((uc, d) :: (uc, d') :: rest)).length ≤ rest.length + 1`, strictly less than the input's `rest.length + 2`. *carrier uniqueness* — `consolidate_noDup` proves that `consolidate` always produces a `NoDupCarriers` list (`Pairwise` on first-component inequality). Each carrier appears at most once. Supporting `consolidateInto_preserves_noDup` (and a `_general` variant that handles both fresh-key and matching-key cases) carries the invariant through single-step inserts. + *carrier preservation* — `mem_consolidate_of_mem` (forward) and `mem_of_mem_consolidate` (reverse): every input carrier appears in the output, and every output carrier came from an input. The set of carriers is preserved exactly under consolidation (multiplicity may collapse). + *error inversion* — `consolidate_error_inv` (with helper `consolidateInto_error_inv` and DiffSemiring's `add_eq_error_left_or_right`): a `.error` diff in the output must come from a `.error` diff in the input at the same carrier. The reverse of `consolidate_preserves_error`; closes the loop for `consolidate_errorDiffCarriers_iff` in `Mz/SetOps.lean`. * `Mz/Triple.lean`: collection-wide and per-time *flat* consolidation views on `TimedUnifiedStream`. `consolidateAll` sums every diff in the stream; `consolidateAtTimeFlat t` sums every diff at time `t`. Both ignore the carrier — they collapse a time slice (or the whole stream) to one `DiffWithError Int`. Absorption: `consolidateAll_eq_error_of_mem` and `consolidateAtTimeFlat_eq_error_of_mem`. Complementary to `Mz/TimedConsolidate.lean`'s `consolidateAtTime t`, which buckets per `(row, time)` and returns a `UnifiedStream`. * `Mz/Join.lean`: relational joins on the diff-aware `UnifiedStream`. 
`cross` is the cartesian product — carriers combine via `combineCarrier` (rows concatenate; err on either side wins, left first), diffs multiply through `DiffWithError`'s `Mul` instance. A `.error` diff on either input therefore absorbs to `.error` on the output via `DiffWithError.error_mul_{left,right}`. `join pred l r` filters the product through a join predicate. Theorems: `cross_length` (`l.length * r.length`), `filter_length_le` (filter is non-expanding), `join_length_le` (corollary). Diff-propagation theorems: `cross_diff_error_{left,right}` (a `.error` diff on either side propagates through every output record), `filter_preserves_error_diff` (a record carrying `.error` diff is never dropped by `filter` — the absorbing marker cannot be filtered away). No-error preservation: `cross_no_error` and `filter_no_error` prove that all-`.val` input diffs yield all-`.val` output diffs, so `.error` is the only source of absorbing diffs in the joint output. Algebraic laws: `combineCarrier_assoc` (carrier combine is associative modulo `List.append_assoc`) and the headline `UnifiedStream.cross_assoc` (`(a × b) × c = a × (b × c)`). The proof rearranges nested `flatMap` / `map` via local list-monad lemmas and closes via `DiffWithError.mul_assoc` plus `combineCarrier_assoc`. + Error-scope propagation through `cross`: `mem_cross_of_mems` is the carrier-level witness — every input pair contributes one output record. `cross_errCarriers_from_left` (left `.err e` paired with any right record yields `.err e` in output via combineCarrier's left-wins rule); `cross_errCarriers_from_right` (right `.err e` paired with a left `.row la` yields `.err e` — right propagates only when left is `.row`); `cross_errorDiffCarriers_from_{left,right}` (`.error` diff on either side absorbs through the product). cross grows both error scopes multiplicatively in the cardinality of the opposite input. * `Mz/SetOps.lean`: set operations on `UnifiedStream`. 
`unionAll = (++)` concatenates two streams record-wise; theorems cover length (sum), associativity, nil identities, and error / no-error preservation from each input (`unionAll_preserves_error_diff_left`, `unionAll_preserves_error_diff_right`, `unionAll_no_error`). `union = consolidate ∘ unionAll` derives the set-semantics flavor; theorems lift the consolidation guarantees to `union` (`union_length_le`, `union_preserves_error_diff_left`, `union_preserves_error_diff_right`, `union_no_error`, `union_nil_left`, `union_nil_right`). `negate` negates every diff (`.error` absorbs negation, `.val n` becomes `.val (-n)`). Theorems: `negate_length` (length preserved), `negate_negate` (involution), `negate_preserves_error_diff`, `negate_no_error`. `exceptAll l r = consolidate (unionAll l (negate r))` realizes the signed-diff `EXCEPT ALL` (output diffs may be negative, encoding "this carrier has `n` fewer copies in the result than in the input"). Theorems: `exceptAll_length_le` (≤ sum of input lengths), `exceptAll_preserves_error_diff_left`/`exceptAll_preserves_error_diff_right` (errors from either side survive — negation absorbs at `.error`), `exceptAll_no_error`, `exceptAll_nil_left` (`exceptAll [] r = negate (consolidate r)` — bridges to `negate_consolidate`), `exceptAll_nil_right` (`exceptAll l [] = consolidate l`). @@ -64,6 +70,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `negate_consolidate` proves that negation commutes with consolidation: `negate (consolidate us) = consolidate (negate us)`. The proof recurses via `negate_consolidateInto` (private), which lifts the same property to the single-step insertion. Negation is additive (`neg_add`), so it slides through per-bucket sums. `intersectAll l r` realizes bag-intersection via lookup: consolidate both inputs, then per left-carrier emit `(uc, min(L_diff, R_diff))` if the carrier exists in the right's consolidate, else drop. 
Supported by `UnifiedStream.lookup` (return the diff for a carrier in a consolidated stream, or `none`), `UnifiedStream.lookup_isSome_of_mem` (lookup returns `some` when carrier present), `UnifiedStream.mem_of_lookup_eq_some` (converse: lookup success witnesses membership), `UnifiedStream.lookup_eq_of_mem_noDup` (returns the exact diff when the list has unique carriers), and `DiffWithError.min` (the bag-min combinator with `.error` absorbing). Theorems: `intersectAll_length_le` (≤ left.length), `intersectAll_preserves_error_diff_left`/`_right` (`.error` diff for a carrier present in both inputs survives — left case uses left-min absorption, right case uses the no-dup property of `consolidate r`), `intersectAll_no_error` (all-`.val` inputs yield all-`.val` outputs). `bagIntersectAll = clampPositive ∘ intersectAll` realizes the bag-semantics `INTERSECT ALL` — drops records with non-positive multiplicity, leaving `.error` records untouched. Theorems lift the signed flavor: `bagIntersectAll_length_le`, `bagIntersectAll_preserves_error_diff_left`/`_right`, `bagIntersectAll_no_error`, `bagIntersectAll_only_positive`. + Error-scope characterization of set ops. `unionAll_errCarriers` and `unionAll_errorDiffCarriers` (concat — direct from `errCarriers_append` / `errorDiffCarriers_append`). `negate_errCarriers` and `negate_errorDiffCarriers` (both scopes are invariants of negation since carriers are unchanged and `.error` absorbs negation). `consolidate_errCarriers_iff` (row-err set equal as set; multiplicity may shrink) and `consolidate_errorDiffCarriers_iff` (collection-err set equal exactly — forward from `consolidate_preserves_error`, reverse from `consolidate_error_inv`). `union_errCarriers_iff` / `union_errorDiffCarriers_iff` and `exceptAll_errCarriers_iff` / `exceptAll_errorDiffCarriers_iff` follow as compositions of `unionAll`, `negate`, and `consolidate` theorems — the error-scope sets of both ops are the union of the inputs'.
`join_errorDiffCarriers` (equals that of `cross`) and `join_errCarriers_mono` (filter is monotone on row-errs over `cross`) compose `cross` and `filter` results. * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. `groupByErrDistinct keyExpr rel` uses the spec-faithful `Datum.groupKeyEq`, which returns `false` whenever either side is `.err`, so every err key produces its own singleton group. From d891187a481407fe2ceafb98592c576020bb094a Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 20:25:17 +0200 Subject: [PATCH 092/127] doc/semantics: filter pushdown for cross products MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Mz/JoinPushdown.lean with the optimizer's canonical join-pushdown rewrite: filter pred (cross l r) = cross (filter pred l) r when pred references only left-input columns. Earlier abandoned attempt at this proof failed on the show-tactic pattern matching; this retry succeeds via: * IsPureData predicate on r (no .err carriers, no .error diffs) — required because combineCarrier's left-wins rule and the absorbing .error * d = .error rule disagree across the two sides without it. * cross_singleton reduction lemma with carrier/diff split out (avoids .fst/.snd projections that block later rfls under induction). * Per-record helpers for each left-record shape: - filter_map_error_diff: .error diff passes through. - filter_map_err_carrier: .err e left, .val diff. - filter_map_row_val_keep: .row la, eval = .bool true. - filter_map_row_val_err: .row la, eval = .err — cell-to-row promotion site. - filter_map_row_val_drop: .row la, eval rejects. Main theorem filter_cross_pushdown_left assembles these via structural induction on l plus eval_append_left_of_bounded. 
Caveats encoded in the hypotheses: * hLWidth (∀ row la in l, la.length ≥ N) — needed because eval_append_left_of_bounded requires pred's column references to be bounded by la.length. * hRPure (IsPureData r) — without it, attribution conflicts arise (left .row pred-errs vs right .err records; .error diff in r interacts with filter's first arm). Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/JoinPushdown.lean | 426 +++++++++++++++++++ 2 files changed, 427 insertions(+) create mode 100644 doc/developer/semantics/Mz/JoinPushdown.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 34881f81f647d..03a7c3d5cdfa9 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -20,5 +20,6 @@ import Mz.Aggregate import Mz.Consolidate import Mz.Triple import Mz.Join +import Mz.JoinPushdown import Mz.GroupBy import Mz.SetOps diff --git a/doc/developer/semantics/Mz/JoinPushdown.lean b/doc/developer/semantics/Mz/JoinPushdown.lean new file mode 100644 index 0000000000000..8c73a76170a4e --- /dev/null +++ b/doc/developer/semantics/Mz/JoinPushdown.lean @@ -0,0 +1,426 @@ +import Mz.Join +import Mz.ColRefs + +/-! +# Filter pushdown for cross products + +When a join predicate `pred` only references columns of the left +input (`pred.colReferencesBoundedBy N = true` where every +left row has width at least `N`), `filter pred ∘ cross` may be rewritten +as `cross ∘ (filter pred)` on the left input. This is the +optimizer's canonical join-pushdown rule: shrink the cross +product before the join filter runs. + +The full rewrite requires the right input to be free of +row-scoped errors (`.err` carriers) and collection-scoped errors +(`.error` diffs). Without this restriction, `combineCarrier`'s +left-wins rule and the absorbing `.error * d = .error` rule cause +the two sides to disagree on which err payload ends up on the +output (and on whether `.error`-diff rows survive past the +filter).
With a clean right input, the pushdown is exact. + +The proof is by structural induction on the left input, with +per-record helpers covering the three left-record shapes +(`.error` diff, `.err` carrier, `.row` with `.val` diff). +-/ + +namespace Mz + +/-- Predicate: every record in `us` has a `.row` carrier and a +`.val` diff. Captures "no row-scoped error, no collection-scoped +error" — the precondition for filter pushdown to commute with +cross. -/ +def UnifiedStream.IsPureData (us : UnifiedStream) : Prop := + ∀ ud ∈ us, (∃ rb, ud.1 = UnifiedRow.row rb) ∧ (∃ m : Int, ud.2 = DiffWithError.val m) + +theorem UnifiedStream.IsPureData.tail {hd : UnifiedRow × DiffWithError Int} + {tl : UnifiedStream} (h : UnifiedStream.IsPureData (hd :: tl)) : + UnifiedStream.IsPureData tl := + fun ud hMem => h ud (List.mem_cons_of_mem _ hMem) + +theorem UnifiedStream.IsPureData.head {hd : UnifiedRow × DiffWithError Int} + {tl : UnifiedStream} (h : UnifiedStream.IsPureData (hd :: tl)) : + (∃ rb, hd.1 = UnifiedRow.row rb) ∧ (∃ m : Int, hd.2 = DiffWithError.val m) := + h hd List.mem_cons_self + +/-- Reduction: `cross [(uc, d)] r` flattens to a per-record map +over `r`. The left carrier and diff are separate arguments so the +expanded body mentions `uc` and `d` directly, not `.fst`/`.snd` of a +tuple — keeps downstream `rfl`s working under induction. -/ +theorem UnifiedStream.cross_singleton + (uc : UnifiedRow) (d : DiffWithError Int) (r : UnifiedStream) : + UnifiedStream.cross [(uc, d)] r + = r.map fun rd => (combineCarrier uc rd.1, d * rd.2) := by + show ([(uc, d)] : UnifiedStream).flatMap _ = _ + simp only [List.flatMap_cons, List.flatMap_nil, List.append_nil] + +/-! ## Per-record helpers + +Each helper handles one shape of left record `(uc, d)` against a +pure-data right input. Combined in the main theorem via case +analysis. The helpers state the equation in the +`filter pred (r.map …) = cross … r` form directly, then use +`cross_singleton` to expand the cross side on demand.
-/ + +/-- For `(uc, .error)` left record: filter passes every cross +output through (`.error` diff absorbs and filter's first arm +matches). Equals the cross of the filtered singleton. -/ +private theorem filter_map_error_diff + (pred : Expr) (uc : UnifiedRow) (r : UnifiedStream) : + UnifiedStream.filter pred + (r.map fun rd => (combineCarrier uc rd.1, + (DiffWithError.error : DiffWithError Int) * rd.2)) + = UnifiedStream.cross + (UnifiedStream.filter pred + [(uc, (DiffWithError.error : DiffWithError Int))]) r := by + have hFilterL : UnifiedStream.filter pred + [(uc, (DiffWithError.error : DiffWithError Int))] + = [(uc, (DiffWithError.error : DiffWithError Int))] := by + cases uc <;> rfl + rw [hFilterL, UnifiedStream.cross_singleton] + induction r with + | nil => rfl + | cons hd tl ih => + obtain ⟨ru, rd⟩ := hd + show UnifiedStream.filter pred + ((combineCarrier uc ru, + (DiffWithError.error : DiffWithError Int) * rd) :: tl.map _) + = (combineCarrier uc ru, + (DiffWithError.error : DiffWithError Int) * rd) :: tl.map _ + have hCons : ((combineCarrier uc ru, + (DiffWithError.error : DiffWithError Int) * rd) + :: tl.map (fun rd' => (combineCarrier uc rd'.1, + (DiffWithError.error : DiffWithError Int) * rd'.2)) + : UnifiedStream) + = [(combineCarrier uc ru, + (DiffWithError.error : DiffWithError Int) * rd)] + ++ tl.map (fun rd' => (combineCarrier uc rd'.1, + (DiffWithError.error : DiffWithError Int) * rd'.2)) := rfl + rw [hCons, UnifiedStream.filter_append, ih] + rw [DiffWithError.error_mul_left] + rfl + +/-- For `(.err e, .val n)` left record on a pure-data right +input: cross output carriers are `.err e` (left wins), diffs are +`.val (n*m)`. Filter passes `.err`-carrier `.val`-diff records +through via second arm. Equals cross of the filtered singleton. 
-/ +private theorem filter_map_err_carrier + (pred : Expr) (e : EvalError) (n : Int) (r : UnifiedStream) + (hR : UnifiedStream.IsPureData r) : + UnifiedStream.filter pred + (r.map fun rd => (combineCarrier (UnifiedRow.err e) rd.1, + DiffWithError.val n * rd.2)) + = UnifiedStream.cross + (UnifiedStream.filter pred [(UnifiedRow.err e, DiffWithError.val n)]) r := by + have hFilterL : UnifiedStream.filter pred + [(UnifiedRow.err e, DiffWithError.val n)] + = [(UnifiedRow.err e, DiffWithError.val n)] := rfl + rw [hFilterL, UnifiedStream.cross_singleton] + induction r with + | nil => rfl + | cons hd tl ih => + obtain ⟨ru, rd⟩ := hd + obtain ⟨_, ⟨m, hM⟩⟩ := hR (ru, rd) List.mem_cons_self + have hTl : UnifiedStream.IsPureData tl := hR.tail + have hM' : rd = DiffWithError.val m := hM + subst hM' + have hComb : combineCarrier (UnifiedRow.err e) ru = UnifiedRow.err e := by + cases ru <;> rfl + show UnifiedStream.filter pred + ((combineCarrier (UnifiedRow.err e) ru, + DiffWithError.val n * DiffWithError.val m) :: tl.map _) + = (combineCarrier (UnifiedRow.err e) ru, + DiffWithError.val n * DiffWithError.val m) :: tl.map _ + rw [hComb] + have hCons : ((UnifiedRow.err e, + DiffWithError.val n * DiffWithError.val m) + :: tl.map (fun rd' => (combineCarrier (UnifiedRow.err e) rd'.1, + DiffWithError.val n * rd'.2)) + : UnifiedStream) + = [(UnifiedRow.err e, + DiffWithError.val n * DiffWithError.val m)] + ++ tl.map (fun rd' => (combineCarrier (UnifiedRow.err e) rd'.1, + DiffWithError.val n * rd'.2)) := rfl + rw [hCons, UnifiedStream.filter_append, ih hTl] + rfl + +/-- `(.row la, .val n)` left with `eval la pred = .bool true` on +a pure-data right input. Filter keeps every cross output: +`eval (la ++ rb) pred = eval la pred = .bool true` by +`eval_append_left_of_bounded`. 
-/ +private theorem filter_map_row_val_keep + (pred : Expr) (la : Row) (n : Int) (r : UnifiedStream) + (hBound' : pred.colReferencesBoundedBy la.length = true) + (hEval : eval la pred = Datum.bool true) + (hR : UnifiedStream.IsPureData r) : + UnifiedStream.filter pred + (r.map fun rd => (combineCarrier (UnifiedRow.row la) rd.1, + DiffWithError.val n * rd.2)) + = UnifiedStream.cross + (UnifiedStream.filter pred [(UnifiedRow.row la, DiffWithError.val n)]) r := by + have hFilterL : UnifiedStream.filter pred + [(UnifiedRow.row la, DiffWithError.val n)] + = [(UnifiedRow.row la, DiffWithError.val n)] := by + show (match eval la pred with + | .bool true => [(UnifiedRow.row la, DiffWithError.val n)] + | .err e => [(UnifiedRow.err e, DiffWithError.val n)] + | _ => []) ++ [] = _ + rw [hEval]; rfl + rw [hFilterL, UnifiedStream.cross_singleton] + induction r with + | nil => rfl + | cons hd tl ih => + obtain ⟨ru, rd⟩ := hd + obtain ⟨⟨rb, hRb⟩, ⟨m, hM⟩⟩ := hR (ru, rd) List.mem_cons_self + have hTl : UnifiedStream.IsPureData tl := hR.tail + subst hRb; subst hM + have hComb : combineCarrier (UnifiedRow.row la) (UnifiedRow.row rb) + = UnifiedRow.row (la ++ rb) := rfl + have hMul : (DiffWithError.val n : DiffWithError Int) * DiffWithError.val m + = DiffWithError.val (n * m) := rfl + show UnifiedStream.filter pred + (((UnifiedRow.row rb, DiffWithError.val m) :: tl).map _) + = ((UnifiedRow.row rb, DiffWithError.val m) :: tl).map _ + simp only [List.map_cons] + rw [hComb, hMul] + have hConsLhs : ((UnifiedRow.row (la ++ rb), DiffWithError.val (n * m)) + :: tl.map (fun rd' => + (combineCarrier (UnifiedRow.row la) rd'.1, + DiffWithError.val n * rd'.2)) + : UnifiedStream) + = [(UnifiedRow.row (la ++ rb), DiffWithError.val (n * m))] + ++ tl.map (fun rd' => + (combineCarrier (UnifiedRow.row la) rd'.1, + DiffWithError.val n * rd'.2)) := rfl + rw [hConsLhs, UnifiedStream.filter_append, ih hTl] + have hEvalApp : eval (la ++ rb) pred = Datum.bool true := by + rw [eval_append_left_of_bounded la 
rb pred hBound', hEval] + have hFilterHd : UnifiedStream.filter pred + [(UnifiedRow.row (la ++ rb), DiffWithError.val (n * m))] + = [(UnifiedRow.row (la ++ rb), DiffWithError.val (n * m))] := by + show (match eval (la ++ rb) pred with + | .bool true => [(UnifiedRow.row (la ++ rb), DiffWithError.val (n * m))] + | .err e => [(UnifiedRow.err e, DiffWithError.val (n * m))] + | _ => []) ++ [] = _ + rw [hEvalApp]; rfl + rw [hFilterHd] + +/-- `(.row la, .val n)` left with `eval la pred = .err e_pred`: +filter promotes every cross output to `.err e_pred` carrier. -/ +private theorem filter_map_row_val_err + (pred : Expr) (la : Row) (n : Int) (r : UnifiedStream) (e_pred : EvalError) + (hBound' : pred.colReferencesBoundedBy la.length = true) + (hEval : eval la pred = Datum.err e_pred) + (hR : UnifiedStream.IsPureData r) : + UnifiedStream.filter pred + (r.map fun rd => (combineCarrier (UnifiedRow.row la) rd.1, + DiffWithError.val n * rd.2)) + = UnifiedStream.cross + (UnifiedStream.filter pred [(UnifiedRow.row la, DiffWithError.val n)]) r := by + have hFilterL : UnifiedStream.filter pred + [(UnifiedRow.row la, DiffWithError.val n)] + = [(UnifiedRow.err e_pred, DiffWithError.val n)] := by + show (match eval la pred with + | .bool true => [(UnifiedRow.row la, DiffWithError.val n)] + | .err e => [(UnifiedRow.err e, DiffWithError.val n)] + | _ => []) ++ [] = _ + rw [hEval]; rfl + rw [hFilterL, UnifiedStream.cross_singleton] + induction r with + | nil => rfl + | cons hd tl ih => + obtain ⟨ru, rd⟩ := hd + obtain ⟨⟨rb, hRb⟩, ⟨m, hM⟩⟩ := hR (ru, rd) List.mem_cons_self + have hTl : UnifiedStream.IsPureData tl := hR.tail + subst hRb; subst hM + have hCombRow : combineCarrier (UnifiedRow.row la) (UnifiedRow.row rb) + = UnifiedRow.row (la ++ rb) := rfl + have hCombErr : combineCarrier (UnifiedRow.err e_pred) (UnifiedRow.row rb) + = UnifiedRow.err e_pred := rfl + have hMul : (DiffWithError.val n : DiffWithError Int) * DiffWithError.val m + = DiffWithError.val (n * m) := rfl + show 
UnifiedStream.filter pred + (((UnifiedRow.row rb, DiffWithError.val m) :: tl).map _) + = ((UnifiedRow.row rb, DiffWithError.val m) :: tl).map _ + simp only [List.map_cons] + rw [hCombRow, hCombErr, hMul] + have hConsLhs : ((UnifiedRow.row (la ++ rb), DiffWithError.val (n * m)) + :: tl.map (fun rd' => + (combineCarrier (UnifiedRow.row la) rd'.1, + DiffWithError.val n * rd'.2)) + : UnifiedStream) + = [(UnifiedRow.row (la ++ rb), DiffWithError.val (n * m))] + ++ tl.map (fun rd' => + (combineCarrier (UnifiedRow.row la) rd'.1, + DiffWithError.val n * rd'.2)) := rfl + rw [hConsLhs, UnifiedStream.filter_append, ih hTl] + have hEvalApp : eval (la ++ rb) pred = Datum.err e_pred := by + rw [eval_append_left_of_bounded la rb pred hBound', hEval] + have hFilterHd : UnifiedStream.filter pred + [(UnifiedRow.row (la ++ rb), DiffWithError.val (n * m))] + = [(UnifiedRow.err e_pred, DiffWithError.val (n * m))] := by + show (match eval (la ++ rb) pred with + | .bool true => [(UnifiedRow.row (la ++ rb), DiffWithError.val (n * m))] + | .err e => [(UnifiedRow.err e, DiffWithError.val (n * m))] + | _ => []) ++ [] = _ + rw [hEvalApp]; rfl + rw [hFilterHd] + rfl + +/-- `(.row la, .val n)` left when `eval la pred` is neither +`.bool true` nor `.err _`: filter drops both at the singleton +level and at every cross-output level. 
-/ +private theorem filter_map_row_val_drop + (pred : Expr) (la : Row) (n : Int) (r : UnifiedStream) + (hBound' : pred.colReferencesBoundedBy la.length = true) + (hDrop : UnifiedStream.filter pred [(UnifiedRow.row la, DiffWithError.val n)] = []) + (hR : UnifiedStream.IsPureData r) : + UnifiedStream.filter pred + (r.map fun rd => (combineCarrier (UnifiedRow.row la) rd.1, + DiffWithError.val n * rd.2)) + = UnifiedStream.cross + (UnifiedStream.filter pred [(UnifiedRow.row la, DiffWithError.val n)]) r := by + rw [hDrop] + show _ = ([] : UnifiedStream).flatMap _ + rw [List.flatMap_nil] + have hDropExpand : (match eval la pred with + | .bool true => [(UnifiedRow.row la, DiffWithError.val n)] + | .err e => [(UnifiedRow.err e, DiffWithError.val n)] + | _ => ([] : UnifiedStream)) + = [] := by + have : UnifiedStream.filter pred [(UnifiedRow.row la, DiffWithError.val n)] + = (match eval la pred with + | .bool true => [(UnifiedRow.row la, DiffWithError.val n)] + | .err e => [(UnifiedRow.err e, DiffWithError.val n)] + | _ => []) ++ [] := rfl + rw [this] at hDrop + rw [List.append_nil] at hDrop + exact hDrop + induction r with + | nil => rfl + | cons hd tl ih => + obtain ⟨ru, rd⟩ := hd + obtain ⟨⟨rb, hRb⟩, ⟨m, hM⟩⟩ := hR (ru, rd) List.mem_cons_self + have hTl : UnifiedStream.IsPureData tl := hR.tail + subst hRb; subst hM + have hCombRow : combineCarrier (UnifiedRow.row la) (UnifiedRow.row rb) + = UnifiedRow.row (la ++ rb) := rfl + have hMul : (DiffWithError.val n : DiffWithError Int) * DiffWithError.val m + = DiffWithError.val (n * m) := rfl + show UnifiedStream.filter pred + (((UnifiedRow.row rb, DiffWithError.val m) :: tl).map _) = [] + simp only [List.map_cons] + rw [hCombRow, hMul] + have hConsLhs : ((UnifiedRow.row (la ++ rb), DiffWithError.val (n * m)) + :: tl.map (fun rd' => + (combineCarrier (UnifiedRow.row la) rd'.1, + DiffWithError.val n * rd'.2)) + : UnifiedStream) + = [(UnifiedRow.row (la ++ rb), DiffWithError.val (n * m))] + ++ tl.map (fun rd' => + (combineCarrier 
(UnifiedRow.row la) rd'.1, + DiffWithError.val n * rd'.2)) := rfl + rw [hConsLhs, UnifiedStream.filter_append, ih hTl] + have hEvalApp : eval (la ++ rb) pred = eval la pred := + eval_append_left_of_bounded la rb pred hBound' + have hFilterHd : UnifiedStream.filter pred + [(UnifiedRow.row (la ++ rb), DiffWithError.val (n * m))] + = [] := by + show (match eval (la ++ rb) pred with + | .bool true => [(UnifiedRow.row (la ++ rb), DiffWithError.val (n * m))] + | .err e => [(UnifiedRow.err e, DiffWithError.val (n * m))] + | _ => []) ++ [] = _ + rw [hEvalApp, List.append_nil] + cases hC : eval la pred with + | bool b => + cases b with + | true => + exfalso + rw [hC] at hDropExpand + exact List.cons_ne_nil _ _ hDropExpand + | false => rfl + | err e => + exfalso + rw [hC] at hDropExpand + exact List.cons_ne_nil _ _ hDropExpand + | int _ => rfl + | null => rfl + rw [hFilterHd] + rfl + +/-! ## Main pushdown theorem -/ + +/-- Filter pushdown for cross products. When the join predicate +references only left-input columns (bounded by `N`, where every +left row's width is at least `N`), and the right input is pure +data (no row-errs, no collection-errs), filtering the cross +product equals crossing the filtered left input with the right. + +The right-pure hypothesis is essential: without it, +`combineCarrier`'s left-wins rule disagrees with filter's +cell-to-row promotion on which err payload to keep, and a +`.error` diff in `r` would interact with filter's first arm to +keep records that would be dropped post-pushdown. 
-/ +theorem UnifiedStream.filter_cross_pushdown_left + (pred : Expr) (N : Nat) (l r : UnifiedStream) + (hBound : pred.colReferencesBoundedBy N = true) + (hLWidth : ∀ ud ∈ l, ∀ la, ud.1 = UnifiedRow.row la → la.length ≥ N) + (hRPure : UnifiedStream.IsPureData r) : + UnifiedStream.filter pred (UnifiedStream.cross l r) + = UnifiedStream.cross (UnifiedStream.filter pred l) r := by + induction l with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hTlWidth : ∀ ud ∈ tl, ∀ la, ud.1 = UnifiedRow.row la → la.length ≥ N := + fun ud hMem la hUc => hLWidth ud (List.mem_cons_of_mem _ hMem) la hUc + have hHdWidth : ∀ la, uc = UnifiedRow.row la → la.length ≥ N := + fun la hUc => hLWidth (uc, d) List.mem_cons_self la hUc + rw [UnifiedStream.cross_cons_left, + UnifiedStream.filter_append, ih hTlWidth] + have hFilterCons : + UnifiedStream.filter pred ((uc, d) :: tl) + = UnifiedStream.filter pred [(uc, d)] + ++ UnifiedStream.filter pred tl := by + have : ((uc, d) :: tl : UnifiedStream) = [(uc, d)] ++ tl := rfl + rw [this, UnifiedStream.filter_append] + rw [hFilterCons, UnifiedStream.cross_append_left] + congr 1 + cases d with + | error => exact filter_map_error_diff pred uc r + | val n => + cases uc with + | err e => exact filter_map_err_carrier pred e n r hRPure + | row la => + have hBound' : pred.colReferencesBoundedBy la.length = true := + Expr.colReferencesBoundedBy_mono pred hBound (hHdWidth la rfl) + cases hEval : eval la pred with + | bool b => + cases b with + | true => + exact filter_map_row_val_keep pred la n r hBound' hEval hRPure + | false => + apply filter_map_row_val_drop pred la n r hBound' _ hRPure + show (match eval la pred with + | .bool true => [(UnifiedRow.row la, DiffWithError.val n)] + | .err e => [(UnifiedRow.err e, DiffWithError.val n)] + | _ => []) ++ [] = [] + rw [hEval]; rfl + | int _ => + apply filter_map_row_val_drop pred la n r hBound' _ hRPure + show (match eval la pred with + | .bool true => [(UnifiedRow.row la, DiffWithError.val n)] 
+ | .err e => [(UnifiedRow.err e, DiffWithError.val n)] + | _ => []) ++ [] = [] + rw [hEval]; rfl + | null => + apply filter_map_row_val_drop pred la n r hBound' _ hRPure + show (match eval la pred with + | .bool true => [(UnifiedRow.row la, DiffWithError.val n)] + | .err e => [(UnifiedRow.err e, DiffWithError.val n)] + | _ => []) ++ [] = [] + rw [hEval]; rfl + | err e_pred => + exact filter_map_row_val_err pred la n r e_pred hBound' hEval hRPure + +end Mz From 52a233a2721a52a553acd44752b8dd090c6b6f18 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 20:39:41 +0200 Subject: [PATCH 093/127] doc/semantics: negate_filter commutes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add UnifiedStream.negate_filter: negate (filter pred us) = filter pred (negate us) Earlier abandoned attempt failed on pattern-match shows; this retry succeeds via per-record case analysis with computed reduction lemmas for each filter branch. Holds unconditionally — no hypothesis on pred or stream content. Filter's row arm depends on eval r pred which is independent of diff sign, so the negate slides through. Cases handled: * (uc, .error): filter passes through, .error negates to .error. * (.err e, .val n): filter passes through, both orderings give (.err e, .val -n). * (.row r, .val n) with eval = .bool true: keep, diff flips. * (.row r, .val n) with eval = .err e_pred: cell-to-row promotion preserved by negate. * (.row r, .val n) with eval drops: both orderings give []. Enables optimizer rewrites that commute negation with filter (EXCEPT ALL paths, retraction reordering). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 170 +++++++++++++++++++++++++ 1 file changed, 170 insertions(+) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 99d49e9df12bb..49e444f286c8d 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -206,6 +206,176 @@ theorem UnifiedStream.negate_negate (us : UnifiedStream) : = (uc, d) :: tl rw [ih, DiffWithError.neg_neg_int] +/-- `negate` commutes with `filter`. Both orderings agree on +every record shape; filter's row arm depends on `eval r pred`, +which is independent of the diff sign, so the diff-flip slides +through. -/ +theorem UnifiedStream.negate_filter (pred : Expr) (us : UnifiedStream) : + UnifiedStream.negate (UnifiedStream.filter pred us) + = UnifiedStream.filter pred (UnifiedStream.negate us) := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hCons : ((uc, d) :: tl : UnifiedStream) = [(uc, d)] ++ tl := rfl + rw [hCons, UnifiedStream.filter_append, UnifiedStream.negate_append, + ih, UnifiedStream.negate_append, UnifiedStream.filter_append] + congr 1 + -- Per-record: negate (filter [(uc, d)]) = filter pred (negate [(uc, d)]). + -- Strategy: compute both filter results (original and post-negate), + -- compute negate of the original record, and equate via neg_{val,error}. 
+ cases d with + | error => + have hF : UnifiedStream.filter pred + [(uc, (DiffWithError.error : DiffWithError Int))] + = [(uc, (DiffWithError.error : DiffWithError Int))] := by + cases uc <;> rfl + have hN : UnifiedStream.negate + [(uc, (DiffWithError.error : DiffWithError Int))] + = [(uc, (DiffWithError.error : DiffWithError Int))] := by + show [(uc, -(DiffWithError.error : DiffWithError Int))] = _ + rw [DiffWithError.neg_error] + rw [hF, hN, hF] + | val n => + cases uc with + | err e => + have hF : UnifiedStream.filter pred + [(UnifiedRow.err e, DiffWithError.val n)] + = [(UnifiedRow.err e, DiffWithError.val n)] := rfl + have hFn : UnifiedStream.filter pred + [(UnifiedRow.err e, DiffWithError.val (-n))] + = [(UnifiedRow.err e, DiffWithError.val (-n))] := rfl + have hN : UnifiedStream.negate [(UnifiedRow.err e, DiffWithError.val n)] + = [(UnifiedRow.err e, DiffWithError.val (-n))] := by + show [(UnifiedRow.err e, -(DiffWithError.val n : DiffWithError Int))] + = [(UnifiedRow.err e, DiffWithError.val (-n))] + rw [DiffWithError.neg_val] + rw [hF, hN, hFn] + | row r => + cases hEval : eval r pred with + | bool b => + cases b with + | true => + have hF : UnifiedStream.filter pred + [(UnifiedRow.row r, DiffWithError.val n)] + = [(UnifiedRow.row r, DiffWithError.val n)] := by + show (match eval r pred with + | .bool true => [(UnifiedRow.row r, DiffWithError.val n)] + | .err e => [(UnifiedRow.err e, DiffWithError.val n)] + | _ => []) ++ [] = _ + rw [hEval]; rfl + have hFn : UnifiedStream.filter pred + [(UnifiedRow.row r, DiffWithError.val (-n))] + = [(UnifiedRow.row r, DiffWithError.val (-n))] := by + show (match eval r pred with + | .bool true => [(UnifiedRow.row r, DiffWithError.val (-n))] + | .err e => [(UnifiedRow.err e, DiffWithError.val (-n))] + | _ => []) ++ [] = _ + rw [hEval]; rfl + have hN : UnifiedStream.negate + [(UnifiedRow.row r, DiffWithError.val n)] + = [(UnifiedRow.row r, DiffWithError.val (-n))] := by + show [(UnifiedRow.row r, -(DiffWithError.val n : 
DiffWithError Int))] + = [(UnifiedRow.row r, DiffWithError.val (-n))] + rw [DiffWithError.neg_val] + rw [hF, hN, hFn] + | false => + have hF : UnifiedStream.filter pred + [(UnifiedRow.row r, DiffWithError.val n)] = [] := by + show (match eval r pred with + | .bool true => [(UnifiedRow.row r, DiffWithError.val n)] + | .err e => [(UnifiedRow.err e, DiffWithError.val n)] + | _ => []) ++ [] = _ + rw [hEval]; rfl + have hFn : UnifiedStream.filter pred + [(UnifiedRow.row r, DiffWithError.val (-n))] = [] := by + show (match eval r pred with + | .bool true => [(UnifiedRow.row r, DiffWithError.val (-n))] + | .err e => [(UnifiedRow.err e, DiffWithError.val (-n))] + | _ => []) ++ [] = _ + rw [hEval]; rfl + have hN : UnifiedStream.negate + [(UnifiedRow.row r, DiffWithError.val n)] + = [(UnifiedRow.row r, DiffWithError.val (-n))] := by + show [(UnifiedRow.row r, -(DiffWithError.val n : DiffWithError Int))] + = [(UnifiedRow.row r, DiffWithError.val (-n))] + rw [DiffWithError.neg_val] + rw [hF, hN, hFn]; rfl + | int _ => + have hF : UnifiedStream.filter pred + [(UnifiedRow.row r, DiffWithError.val n)] = [] := by + show (match eval r pred with + | .bool true => [(UnifiedRow.row r, DiffWithError.val n)] + | .err e => [(UnifiedRow.err e, DiffWithError.val n)] + | _ => []) ++ [] = _ + rw [hEval]; rfl + have hFn : UnifiedStream.filter pred + [(UnifiedRow.row r, DiffWithError.val (-n))] = [] := by + show (match eval r pred with + | .bool true => [(UnifiedRow.row r, DiffWithError.val (-n))] + | .err e => [(UnifiedRow.err e, DiffWithError.val (-n))] + | _ => []) ++ [] = _ + rw [hEval]; rfl + have hN : UnifiedStream.negate + [(UnifiedRow.row r, DiffWithError.val n)] + = [(UnifiedRow.row r, DiffWithError.val (-n))] := by + show [(UnifiedRow.row r, -(DiffWithError.val n : DiffWithError Int))] + = [(UnifiedRow.row r, DiffWithError.val (-n))] + rw [DiffWithError.neg_val] + rw [hF, hN, hFn]; rfl + | null => + have hF : UnifiedStream.filter pred + [(UnifiedRow.row r, DiffWithError.val n)] = [] 
:= by + show (match eval r pred with + | .bool true => [(UnifiedRow.row r, DiffWithError.val n)] + | .err e => [(UnifiedRow.err e, DiffWithError.val n)] + | _ => []) ++ [] = _ + rw [hEval]; rfl + have hFn : UnifiedStream.filter pred + [(UnifiedRow.row r, DiffWithError.val (-n))] = [] := by + show (match eval r pred with + | .bool true => [(UnifiedRow.row r, DiffWithError.val (-n))] + | .err e => [(UnifiedRow.err e, DiffWithError.val (-n))] + | _ => []) ++ [] = _ + rw [hEval]; rfl + have hN : UnifiedStream.negate + [(UnifiedRow.row r, DiffWithError.val n)] + = [(UnifiedRow.row r, DiffWithError.val (-n))] := by + show [(UnifiedRow.row r, -(DiffWithError.val n : DiffWithError Int))] + = [(UnifiedRow.row r, DiffWithError.val (-n))] + rw [DiffWithError.neg_val] + rw [hF, hN, hFn]; rfl + | err e_pred => + have hF : UnifiedStream.filter pred + [(UnifiedRow.row r, DiffWithError.val n)] + = [(UnifiedRow.err e_pred, DiffWithError.val n)] := by + show (match eval r pred with + | .bool true => [(UnifiedRow.row r, DiffWithError.val n)] + | .err e => [(UnifiedRow.err e, DiffWithError.val n)] + | _ => []) ++ [] = _ + rw [hEval]; rfl + have hFn : UnifiedStream.filter pred + [(UnifiedRow.row r, DiffWithError.val (-n))] + = [(UnifiedRow.err e_pred, DiffWithError.val (-n))] := by + show (match eval r pred with + | .bool true => [(UnifiedRow.row r, DiffWithError.val (-n))] + | .err e => [(UnifiedRow.err e, DiffWithError.val (-n))] + | _ => []) ++ [] = _ + rw [hEval]; rfl + have hN : UnifiedStream.negate + [(UnifiedRow.row r, DiffWithError.val n)] + = [(UnifiedRow.row r, DiffWithError.val (-n))] := by + show [(UnifiedRow.row r, -(DiffWithError.val n : DiffWithError Int))] + = [(UnifiedRow.row r, DiffWithError.val (-n))] + rw [DiffWithError.neg_val] + have hNE : UnifiedStream.negate + [(UnifiedRow.err e_pred, DiffWithError.val n)] + = [(UnifiedRow.err e_pred, DiffWithError.val (-n))] := by + show [(UnifiedRow.err e_pred, -(DiffWithError.val n : DiffWithError Int))] + = [(UnifiedRow.err 
e_pred, DiffWithError.val (-n))] + rw [DiffWithError.neg_val] + rw [hF, hN, hFn, hNE] + /-- `.error` diffs survive negation; the carrier is preserved. -/ theorem UnifiedStream.negate_preserves_error_diff (us : UnifiedStream) (uc : UnifiedRow) From d544af5d7099d18317c1cc9f8573e719cb18cd20 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 20:46:40 +0200 Subject: [PATCH 094/127] doc/semantics: clampPositive/clampToOne error-scope; distinct/bagExceptAll corollaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Partial error-scope characterization for the clamp operators: * clampPositive_errorDiffCarriers_iff: equality. .error records always pass through (isPositiveDiff .error = true), so the collection-err set is preserved exactly. * clampPositive_errCarriers_of_mem: reverse direction only. clampPositive may drop (.err e, .val 0) records, so the row-err set may shrink. Forward direction fails. Same shape for clampToOne, with the case analysis tracking clampToOne's three branches (.error keep, .val n>0 collapse, .val n≤0 drop). clampToOne_error_inv: a (uc, .error) in clampToOne output came from a (uc, .error) input. The .val arms never emit .error. Composed corollaries: * distinct_errorDiffCarriers_iff / distinct_errCarriers_of_mem: distinct = clampToOne ∘ consolidate. * bagExceptAll_errorDiffCarriers_iff / bagExceptAll_errCarriers_of_mem: bagExceptAll = clampPositive ∘ exceptAll. bagIntersectAll deferred — intersectAll's lookup-based definition needs separate analysis before composition. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 207 +++++++++++++++++++++++++ 1 file changed, 207 insertions(+) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 49e444f286c8d..99f1554e20801 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -1759,4 +1759,211 @@ theorem UnifiedStream.exceptAll_errorDiffCarriers_iff UnifiedStream.unionAll_errorDiffCarriers, UnifiedStream.negate_errorDiffCarriers, List.mem_append] +/-! ## `clampPositive` and error scopes + +`clampPositive` keeps `.error` records and `.val n` records with +`0 < n`; drops everything else. + +* Collection-err set: preserved exactly. `.error`-diff records + always pass through (`isPositiveDiff .error = true`). +* Row-err set: not preserved — a `(.err e, .val 0)` or + `(.err e, .val (-n))` record is dropped, so the row-err set + may shrink. Only the reverse direction (output ⊆ input) holds. -/ + +theorem UnifiedStream.clampPositive_errorDiffCarriers_iff + (us : UnifiedStream) (uc : UnifiedRow) : + uc ∈ UnifiedStream.errorDiffCarriers (UnifiedStream.clampPositive us) + ↔ uc ∈ UnifiedStream.errorDiffCarriers us := by + rw [UnifiedStream.mem_errorDiffCarriers, + UnifiedStream.mem_errorDiffCarriers] + constructor + · intro h + -- (uc, .error) ∈ clampPositive us = List.filter _ us ⇒ (uc, .error) ∈ us. + exact (List.mem_filter.mp h).1 + · intro h + exact UnifiedStream.clampPositive_preserves_error_diff us uc h + +/-- Reverse direction for row-err: every err in the clamped +output was in the input. The forward direction fails because +clampPositive can drop `(.err e, .val 0)` records. 
-/ +theorem UnifiedStream.clampPositive_errCarriers_of_mem + (us : UnifiedStream) (e : EvalError) + (h : e ∈ UnifiedStream.errCarriers (UnifiedStream.clampPositive us)) : + e ∈ UnifiedStream.errCarriers us := by + rw [UnifiedStream.mem_errCarriers] at h + rw [UnifiedStream.mem_errCarriers] + obtain ⟨d, hMem⟩ := h + exact ⟨d, (List.mem_filter.mp hMem).1⟩ + +/-! ## `clampToOne` and error scopes + +`clampToOne` collapses positive `.val n > 0` to `.val 1`, drops +non-positive `.val`, and preserves `.error`. Same error-scope +behavior as `clampPositive`. -/ + +private theorem clampToOne_preserves_record_carrier + (us : UnifiedStream) (rec : UnifiedRow × DiffWithError Int) + (h : rec ∈ UnifiedStream.clampToOne us) : + ∃ d, (rec.1, d) ∈ us := by + induction us with + | nil => exact absurd h List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc', d'⟩ := hd + cases d' with + | error => + -- clampToOne ((uc', .error) :: tl) = (uc', .error) :: clampToOne tl. + have hEq : UnifiedStream.clampToOne ((uc', DiffWithError.error) :: tl) + = (uc', DiffWithError.error) :: UnifiedStream.clampToOne tl := rfl + rw [hEq] at h + rcases List.mem_cons.mp h with hHead | hTail + · subst hHead + exact ⟨DiffWithError.error, List.mem_cons_self⟩ + · obtain ⟨d, hMem⟩ := ih hTail + exact ⟨d, List.mem_cons_of_mem _ hMem⟩ + | val n => + by_cases hPos : 0 < n + · have hEq : UnifiedStream.clampToOne ((uc', DiffWithError.val n) :: tl) + = (uc', DiffWithError.val 1) + :: UnifiedStream.clampToOne tl := by + show (if 0 < n then (uc', DiffWithError.val 1) + :: UnifiedStream.clampToOne tl + else UnifiedStream.clampToOne tl) + = _ + rw [if_pos hPos] + rw [hEq] at h + rcases List.mem_cons.mp h with hHead | hTail + · -- rec = (uc', .val 1); carrier uc' is in input head. 
+ have hUc : rec.1 = uc' := by rw [hHead] + rw [hUc] + exact ⟨DiffWithError.val n, List.mem_cons_self⟩ + · obtain ⟨d, hMem⟩ := ih hTail + exact ⟨d, List.mem_cons_of_mem _ hMem⟩ + · have hEq : UnifiedStream.clampToOne ((uc', DiffWithError.val n) :: tl) + = UnifiedStream.clampToOne tl := by + show (if 0 < n then (uc', DiffWithError.val 1) + :: UnifiedStream.clampToOne tl + else UnifiedStream.clampToOne tl) + = _ + rw [if_neg hPos] + rw [hEq] at h + obtain ⟨d, hMem⟩ := ih h + exact ⟨d, List.mem_cons_of_mem _ hMem⟩ + +theorem UnifiedStream.clampToOne_errCarriers_of_mem + (us : UnifiedStream) (e : EvalError) + (h : e ∈ UnifiedStream.errCarriers (UnifiedStream.clampToOne us)) : + e ∈ UnifiedStream.errCarriers us := by + rw [UnifiedStream.mem_errCarriers] at h + rw [UnifiedStream.mem_errCarriers] + obtain ⟨d, hMem⟩ := h + exact clampToOne_preserves_record_carrier us (UnifiedRow.err e, d) hMem + +/-- Reverse direction of clampToOne / `.error`-diff carriers: +a `(uc, .error)` in the clamped output came from a `(uc, .error)` +in the input. The `.val n` arms of clampToOne never emit `.error`, +so the `.error` output had to come from an `.error` input. 
-/ +private theorem clampToOne_error_inv + (us : UnifiedStream) (uc : UnifiedRow) + (h : (uc, (DiffWithError.error : DiffWithError Int)) + ∈ UnifiedStream.clampToOne us) : + (uc, (DiffWithError.error : DiffWithError Int)) ∈ us := by + induction us with + | nil => exact absurd h List.not_mem_nil + | cons hd tl ih => + obtain ⟨uc', d'⟩ := hd + cases d' with + | error => + have hEq : UnifiedStream.clampToOne ((uc', DiffWithError.error) :: tl) + = (uc', DiffWithError.error) + :: UnifiedStream.clampToOne tl := rfl + rw [hEq] at h + rcases List.mem_cons.mp h with hHead | hTail + · have huc : uc = uc' := (Prod.mk.injEq _ _ _ _).mp hHead |>.1 + subst huc + exact List.mem_cons_self + · exact List.mem_cons_of_mem _ (ih hTail) + | val n => + by_cases hPos : 0 < n + · have hEq : UnifiedStream.clampToOne ((uc', DiffWithError.val n) :: tl) + = (uc', DiffWithError.val 1) + :: UnifiedStream.clampToOne tl := by + show (if 0 < n then (uc', DiffWithError.val 1) + :: UnifiedStream.clampToOne tl + else UnifiedStream.clampToOne tl) = _ + rw [if_pos hPos] + rw [hEq] at h + rcases List.mem_cons.mp h with hHead | hTail + · -- (uc, .error) = (uc', .val 1) impossible. + have hDiff : (DiffWithError.error : DiffWithError Int) + = DiffWithError.val 1 := + (Prod.mk.injEq _ _ _ _).mp hHead |>.2 + cases hDiff + · exact List.mem_cons_of_mem _ (ih hTail) + · have hEq : UnifiedStream.clampToOne ((uc', DiffWithError.val n) :: tl) + = UnifiedStream.clampToOne tl := by + show (if 0 < n then (uc', DiffWithError.val 1) + :: UnifiedStream.clampToOne tl + else UnifiedStream.clampToOne tl) = _ + rw [if_neg hPos] + rw [hEq] at h + exact List.mem_cons_of_mem _ (ih h) + +/-- `clampToOne` preserves collection-err carriers. The +`(uc, .error)` records always survive the clamp (the `.error` +branch of clampToOne's recursion). Forward via the existing +`clampToOne_preserves_error_diff`; reverse via `clampToOne_error_inv`. 
-/ +theorem UnifiedStream.clampToOne_errorDiffCarriers_iff + (us : UnifiedStream) (uc : UnifiedRow) : + uc ∈ UnifiedStream.errorDiffCarriers (UnifiedStream.clampToOne us) + ↔ uc ∈ UnifiedStream.errorDiffCarriers us := by + rw [UnifiedStream.mem_errorDiffCarriers, + UnifiedStream.mem_errorDiffCarriers] + exact ⟨clampToOne_error_inv us uc, + UnifiedStream.clampToOne_preserves_error_diff us uc⟩ + +/-! ## `distinct`, `bagExceptAll`, `bagIntersectAll`: composed theorems + +`distinct = clampToOne ∘ consolidate`, `bagExceptAll = clampPositive ∘ exceptAll`, +`bagIntersectAll = clampPositive ∘ intersectAll`. Error-scope behavior +composes from the parts. For `distinct` and `bagExceptAll` the collection-err set is characterized exactly +(both parts are iffs); no such iff is stated for `bagIntersectAll`. The row-err set can only shrink. -/ + +theorem UnifiedStream.distinct_errorDiffCarriers_iff + (us : UnifiedStream) (uc : UnifiedRow) : + uc ∈ UnifiedStream.errorDiffCarriers (UnifiedStream.distinct us) + ↔ uc ∈ UnifiedStream.errorDiffCarriers us := by + show uc ∈ UnifiedStream.errorDiffCarriers + (UnifiedStream.clampToOne (UnifiedStream.consolidate us)) ↔ _ + rw [UnifiedStream.clampToOne_errorDiffCarriers_iff, + UnifiedStream.consolidate_errorDiffCarriers_iff] + +theorem UnifiedStream.distinct_errCarriers_of_mem + (us : UnifiedStream) (e : EvalError) + (h : e ∈ UnifiedStream.errCarriers (UnifiedStream.distinct us)) : + e ∈ UnifiedStream.errCarriers us := by + -- distinct us = clampToOne (consolidate us). + -- clampToOne_errCarriers_of_mem → in consolidate us. + -- consolidate_errCarriers_iff → in us.
+ have h1 : e ∈ UnifiedStream.errCarriers (UnifiedStream.consolidate us) := + UnifiedStream.clampToOne_errCarriers_of_mem _ e h + exact (UnifiedStream.consolidate_errCarriers_iff us e).mp h1 + +theorem UnifiedStream.bagExceptAll_errorDiffCarriers_iff + (l r : UnifiedStream) (uc : UnifiedRow) : + uc ∈ UnifiedStream.errorDiffCarriers (UnifiedStream.bagExceptAll l r) + ↔ uc ∈ UnifiedStream.errorDiffCarriers l + ∨ uc ∈ UnifiedStream.errorDiffCarriers r := by + show uc ∈ UnifiedStream.errorDiffCarriers + (UnifiedStream.clampPositive (UnifiedStream.exceptAll l r)) ↔ _ + rw [UnifiedStream.clampPositive_errorDiffCarriers_iff, + UnifiedStream.exceptAll_errorDiffCarriers_iff] + +theorem UnifiedStream.bagExceptAll_errCarriers_of_mem + (l r : UnifiedStream) (e : EvalError) + (h : e ∈ UnifiedStream.errCarriers (UnifiedStream.bagExceptAll l r)) : + e ∈ UnifiedStream.errCarriers l ∨ e ∈ UnifiedStream.errCarriers r := by + have h1 : e ∈ UnifiedStream.errCarriers (UnifiedStream.exceptAll l r) := + UnifiedStream.clampPositive_errCarriers_of_mem _ e h + exact (UnifiedStream.exceptAll_errCarriers_iff l r e).mp h1 + end Mz From cb07c354c4244cd8e310ce80abb2b67ef99bed24 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 20:54:14 +0200 Subject: [PATCH 095/127] doc/semantics: intersectAll error-scope reverse direction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three reverse-direction theorems for intersectAll-family: * intersectAll_errCarriers_of_mem: an err in the intersectAll output appears in BOTH inputs (consolidate carrier preservation for the left side; lookup-witness for the right). * intersectAll_errorDiffCarriers_of_mem: a .error-diff carrier in the output exists (with some diff) in BOTH inputs. The min combinator only yields .error when at least one side has .error, but the carrier must be present on both for intersectAll to emit anything. * bagIntersectAll_errCarriers_of_mem: composition with clampPositive. 
Forward direction (errors in both inputs propagate to output) holds for collection-err on left via existing intersectAll_preserves_error_diff_left, with side conditions. Full iff theorems for intersectAll are out of scope — would need to characterize min combinator results per-bucket. Proofs use a hF restate via defeq (.fst/.snd projection reduction) before cases on lookup; otherwise rw [hLookup] fails to find the lookup expression syntactically. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 100 +++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 99f1554e20801..26868f0f9857a 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -1966,4 +1966,104 @@ theorem UnifiedStream.bagExceptAll_errCarriers_of_mem UnifiedStream.clampPositive_errCarriers_of_mem _ e h exact (UnifiedStream.exceptAll_errCarriers_iff l r e).mp h1 +/-! ## `intersectAll` error scopes + +`intersectAll` emits a record only for carriers present on BOTH +sides (after consolidation). Both error scopes are bounded by the +intersection of inputs. Forward direction fails — having an err +on both sides does not guarantee output preservation when the +combine rule depends on diff arithmetic. + +Reverse: every err in `intersectAll`'s output came from both `l` +*and* `r`. 
-/ + +theorem UnifiedStream.intersectAll_errCarriers_of_mem + (l r : UnifiedStream) (e : EvalError) + (h : e ∈ UnifiedStream.errCarriers (UnifiedStream.intersectAll l r)) : + e ∈ UnifiedStream.errCarriers l ∧ e ∈ UnifiedStream.errCarriers r := by + rw [UnifiedStream.mem_errCarriers] at h + obtain ⟨d, hMem⟩ := h + have hFM : (UnifiedRow.err e, d) + ∈ (UnifiedStream.consolidate l).filterMap (fun ud => + match UnifiedStream.lookup ud.1 + (UnifiedStream.consolidate r) with + | none => none + | some d' => some (ud.1, DiffWithError.min ud.2 d')) := hMem + obtain ⟨ud0, hUd0Mem, hF⟩ := List.mem_filterMap.mp hFM + obtain ⟨uc0, d0⟩ := ud0 + -- Restate hF with projections reduced. + have hF' : (match UnifiedStream.lookup uc0 (UnifiedStream.consolidate r) with + | none => (none : Option (UnifiedRow × DiffWithError Int)) + | some d' => some (uc0, DiffWithError.min d0 d')) + = some (UnifiedRow.err e, d) := hF + cases hLookup : UnifiedStream.lookup uc0 (UnifiedStream.consolidate r) with + | none => + rw [hLookup] at hF' + cases hF' + | some d' => + rw [hLookup] at hF' + have hPair : (uc0, DiffWithError.min d0 d') = (UnifiedRow.err e, d) := by + injection hF' + have hUc0 : uc0 = UnifiedRow.err e := (Prod.mk.injEq _ _ _ _).mp hPair |>.1 + subst hUc0 + have hInL : ∃ d', (UnifiedRow.err e, d') ∈ l := + UnifiedStream.mem_of_mem_consolidate l (UnifiedRow.err e) + ⟨d0, hUd0Mem⟩ + have hLookupMem : (UnifiedRow.err e, d') + ∈ UnifiedStream.consolidate r := + UnifiedStream.mem_of_lookup_eq_some hLookup + have hInR : ∃ d'', (UnifiedRow.err e, d'') ∈ r := + UnifiedStream.mem_of_mem_consolidate r (UnifiedRow.err e) + ⟨d', hLookupMem⟩ + exact ⟨(UnifiedStream.mem_errCarriers l e).mpr hInL, + (UnifiedStream.mem_errCarriers r e).mpr hInR⟩ + +/-- Same shape for collection-err: an output `.error`-diff carrier +must appear (with any diff) in both inputs. The `.error` itself +arises from the `min` combinator absorbing on either side. 
-/ +theorem UnifiedStream.intersectAll_errorDiffCarriers_of_mem + (l r : UnifiedStream) (uc : UnifiedRow) + (h : uc ∈ UnifiedStream.errorDiffCarriers (UnifiedStream.intersectAll l r)) : + (∃ d, (uc, d) ∈ l) ∧ (∃ d, (uc, d) ∈ r) := by + rw [UnifiedStream.mem_errorDiffCarriers] at h + have hFM : (uc, (DiffWithError.error : DiffWithError Int)) + ∈ (UnifiedStream.consolidate l).filterMap (fun ud => + match UnifiedStream.lookup ud.1 + (UnifiedStream.consolidate r) with + | none => none + | some d' => some (ud.1, DiffWithError.min ud.2 d')) := h + obtain ⟨ud0, hUd0Mem, hF⟩ := List.mem_filterMap.mp hFM + obtain ⟨uc0, d0⟩ := ud0 + have hF' : (match UnifiedStream.lookup uc0 (UnifiedStream.consolidate r) with + | none => (none : Option (UnifiedRow × DiffWithError Int)) + | some d' => some (uc0, DiffWithError.min d0 d')) + = some (uc, DiffWithError.error) := hF + cases hLookup : UnifiedStream.lookup uc0 (UnifiedStream.consolidate r) with + | none => + rw [hLookup] at hF' + cases hF' + | some d' => + rw [hLookup] at hF' + have hPair : (uc0, DiffWithError.min d0 d') = (uc, DiffWithError.error) := by + injection hF' + have hUc0 : uc0 = uc := (Prod.mk.injEq _ _ _ _).mp hPair |>.1 + rw [hUc0] at hUd0Mem hLookup + have hInL : ∃ d'', (uc, d'') ∈ l := + UnifiedStream.mem_of_mem_consolidate l uc ⟨d0, hUd0Mem⟩ + have hLookupMem : (uc, d') ∈ UnifiedStream.consolidate r := + UnifiedStream.mem_of_lookup_eq_some hLookup + have hInR : ∃ d'', (uc, d'') ∈ r := + UnifiedStream.mem_of_mem_consolidate r uc ⟨d', hLookupMem⟩ + exact ⟨hInL, hInR⟩ + +/-- `bagIntersectAll = clampPositive ∘ intersectAll`. Row-err +reverse direction lifts from `intersectAll`. 
-/ +theorem UnifiedStream.bagIntersectAll_errCarriers_of_mem + (l r : UnifiedStream) (e : EvalError) + (h : e ∈ UnifiedStream.errCarriers (UnifiedStream.bagIntersectAll l r)) : + e ∈ UnifiedStream.errCarriers l ∧ e ∈ UnifiedStream.errCarriers r := by + have h1 : e ∈ UnifiedStream.errCarriers (UnifiedStream.intersectAll l r) := + UnifiedStream.clampPositive_errCarriers_of_mem _ e h + exact UnifiedStream.intersectAll_errCarriers_of_mem l r e h1 + end Mz From 65d722d101bb3de68c74d8aff46ee130226da35d Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 21:13:09 +0200 Subject: [PATCH 096/127] doc/semantics: timed error-scope extractors Lift error-scope extractors to TimedUnifiedStream: * TimedUnifiedStream.errCarriers / errorDiffCarriers * _nil / _append reductions matching UnifiedStream's Per-operator theorems: * advanceFrontier_errCarriers / advanceFrontier_errorDiffCarriers: exact equality. advanceFrontier only changes times; carriers and diffs untouched. * atTime_errCarriers_subset / atTime_errorDiffCarriers_subset: errs in atTime t slice came from input s (subset direction). Time-slice projection loses records at other times, so this cannot be an iff. * consolidateAtTime_errCarriers_subset / consolidateAtTime_errorDiffCarriers_subset: composition of atTime subset with consolidate iff. Restructured: moved consolidate_errCarriers_iff / consolidate_errorDiffCarriers_iff from Mz/SetOps.lean to Mz/UnifiedConsolidate.lean. Mz/TimedConsolidate.lean cites them but the import chain has SetOps downstream of TimedConsolidate. SetOps continues to use them via its UnifiedConsolidate import. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 28 +-- .../semantics/Mz/TimedConsolidate.lean | 218 ++++++++++++++++++ .../semantics/Mz/UnifiedConsolidate.lean | 31 +++ 3 files changed, 253 insertions(+), 24 deletions(-) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 26868f0f9857a..72debe8e11f75 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -1640,17 +1640,6 @@ every input carrier survives. Backward direction is from `mem_of_mem_consolidate`. For `.error`-diff carriers, the forward direction is `consolidate_preserves_error`. -/ -theorem UnifiedStream.consolidate_errCarriers_iff - (us : UnifiedStream) (e : EvalError) : - e ∈ UnifiedStream.errCarriers (UnifiedStream.consolidate us) - ↔ e ∈ UnifiedStream.errCarriers us := by - rw [UnifiedStream.mem_errCarriers, UnifiedStream.mem_errCarriers] - constructor - · intro h - exact UnifiedStream.mem_of_mem_consolidate us (UnifiedRow.err e) h - · intro ⟨d, hMem⟩ - exact UnifiedStream.mem_consolidate_of_mem us (UnifiedRow.err e) d hMem - /-- Forward direction for collection-scoped errors: every input `.error`-diff carrier shows up in the consolidated output. Direct consequence of `consolidate_preserves_error`. -/ @@ -1662,19 +1651,10 @@ theorem UnifiedStream.consolidate_errorDiffCarriers_mono rw [UnifiedStream.mem_errorDiffCarriers] exact UnifiedStream.consolidate_preserves_error us uc h -/-- Full equivalence: the collection-err set of the consolidated -stream equals the input's. Combines `consolidate_preserves_error` -(forward) with `consolidate_error_inv` (reverse — no spurious -`.error` emerges from `.val + .val`). 
-/ -theorem UnifiedStream.consolidate_errorDiffCarriers_iff - (us : UnifiedStream) (uc : UnifiedRow) : - uc ∈ UnifiedStream.errorDiffCarriers (UnifiedStream.consolidate us) - ↔ uc ∈ UnifiedStream.errorDiffCarriers us := by - rw [UnifiedStream.mem_errorDiffCarriers, - UnifiedStream.mem_errorDiffCarriers] - constructor - · intro h; exact UnifiedStream.consolidate_error_inv us uc h - · intro h; exact UnifiedStream.consolidate_preserves_error us uc h +-- `consolidate_errCarriers_iff` and `consolidate_errorDiffCarriers_iff` +-- now live in `Mz/UnifiedConsolidate.lean` so `Mz/TimedConsolidate.lean` +-- can cite them. SetOps imports UnifiedConsolidate, so downstream +-- theorems here still see them. /-! ## Join and error scopes diff --git a/doc/developer/semantics/Mz/TimedConsolidate.lean b/doc/developer/semantics/Mz/TimedConsolidate.lean index 1e71255db1f00..7278903c2c218 100644 --- a/doc/developer/semantics/Mz/TimedConsolidate.lean +++ b/doc/developer/semantics/Mz/TimedConsolidate.lean @@ -205,4 +205,222 @@ theorem TimedUnifiedStream.consolidateAtTime_length_le (UnifiedStream.consolidate_length_le _) (TimedUnifiedStream.atTime_length_le t s) +/-! ## Timed error-scope extractors + +Lift the row-err and collection-err extractors from `UnifiedStream` +to `TimedUnifiedStream`. The time component is irrelevant to error +classification — `.err` carriers and `.error` diffs are observed +the same way regardless of time. -/ + +/-- Row-scoped err payloads carried by the timed stream. -/ +def TimedUnifiedStream.errCarriers (s : TimedUnifiedStream) : List EvalError := + s.filterMap fun r => match r.1 with + | .err e => some e + | _ => none + +/-- Carriers whose diff is collection-scoped `.error`. 
-/ +def TimedUnifiedStream.errorDiffCarriers (s : TimedUnifiedStream) : + List UnifiedRow := + s.filterMap fun r => match r.2.2 with + | .error => some r.1 + | _ => none + +theorem TimedUnifiedStream.errCarriers_nil : + TimedUnifiedStream.errCarriers [] = [] := rfl + +theorem TimedUnifiedStream.errorDiffCarriers_nil : + TimedUnifiedStream.errorDiffCarriers [] = [] := rfl + +theorem TimedUnifiedStream.errCarriers_append (a b : TimedUnifiedStream) : + TimedUnifiedStream.errCarriers (a ++ b) + = TimedUnifiedStream.errCarriers a ++ TimedUnifiedStream.errCarriers b := by + show (a ++ b).filterMap _ = a.filterMap _ ++ b.filterMap _ + exact List.filterMap_append + +theorem TimedUnifiedStream.errorDiffCarriers_append (a b : TimedUnifiedStream) : + TimedUnifiedStream.errorDiffCarriers (a ++ b) + = TimedUnifiedStream.errorDiffCarriers a + ++ TimedUnifiedStream.errorDiffCarriers b := by + show (a ++ b).filterMap _ = a.filterMap _ ++ b.filterMap _ + exact List.filterMap_append + +/-! ## `advanceFrontier` preserves error scopes + +`advanceFrontier` only changes record times; carriers and diffs +are untouched. Both error scopes are preserved exactly. 
-/ + +theorem TimedUnifiedStream.advanceFrontier_errCarriers + (f : Nat) (s : TimedUnifiedStream) : + TimedUnifiedStream.errCarriers (TimedUnifiedStream.advanceFrontier f s) + = TimedUnifiedStream.errCarriers s := by + induction s with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, t, d⟩ := hd + cases uc with + | row r => + have hLhs : TimedUnifiedStream.errCarriers + ((UnifiedRow.row r, Nat.max t f, d) + :: TimedUnifiedStream.advanceFrontier f tl) + = TimedUnifiedStream.errCarriers + (TimedUnifiedStream.advanceFrontier f tl) := rfl + have hRhs : TimedUnifiedStream.errCarriers + ((UnifiedRow.row r, t, d) :: tl) + = TimedUnifiedStream.errCarriers tl := rfl + show TimedUnifiedStream.errCarriers + ((UnifiedRow.row r, Nat.max t f, d) + :: TimedUnifiedStream.advanceFrontier f tl) + = TimedUnifiedStream.errCarriers + ((UnifiedRow.row r, t, d) :: tl) + rw [hLhs, hRhs, ih] + | err e => + have hLhs : TimedUnifiedStream.errCarriers + ((UnifiedRow.err e, Nat.max t f, d) + :: TimedUnifiedStream.advanceFrontier f tl) + = e :: TimedUnifiedStream.errCarriers + (TimedUnifiedStream.advanceFrontier f tl) := rfl + have hRhs : TimedUnifiedStream.errCarriers + ((UnifiedRow.err e, t, d) :: tl) + = e :: TimedUnifiedStream.errCarriers tl := rfl + show TimedUnifiedStream.errCarriers + ((UnifiedRow.err e, Nat.max t f, d) + :: TimedUnifiedStream.advanceFrontier f tl) + = TimedUnifiedStream.errCarriers + ((UnifiedRow.err e, t, d) :: tl) + rw [hLhs, hRhs, ih] + +theorem TimedUnifiedStream.advanceFrontier_errorDiffCarriers + (f : Nat) (s : TimedUnifiedStream) : + TimedUnifiedStream.errorDiffCarriers (TimedUnifiedStream.advanceFrontier f s) + = TimedUnifiedStream.errorDiffCarriers s := by + induction s with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, t, d⟩ := hd + cases d with + | val n => + have hLhs : TimedUnifiedStream.errorDiffCarriers + ((uc, Nat.max t f, DiffWithError.val n) + :: TimedUnifiedStream.advanceFrontier f tl) + = TimedUnifiedStream.errorDiffCarriers + 
(TimedUnifiedStream.advanceFrontier f tl) := rfl + have hRhs : TimedUnifiedStream.errorDiffCarriers + ((uc, t, DiffWithError.val n) :: tl) + = TimedUnifiedStream.errorDiffCarriers tl := rfl + show TimedUnifiedStream.errorDiffCarriers + ((uc, Nat.max t f, DiffWithError.val n) + :: TimedUnifiedStream.advanceFrontier f tl) + = TimedUnifiedStream.errorDiffCarriers + ((uc, t, DiffWithError.val n) :: tl) + rw [hLhs, hRhs, ih] + | error => + have hLhs : TimedUnifiedStream.errorDiffCarriers + ((uc, Nat.max t f, DiffWithError.error) + :: TimedUnifiedStream.advanceFrontier f tl) + = uc :: TimedUnifiedStream.errorDiffCarriers + (TimedUnifiedStream.advanceFrontier f tl) := rfl + have hRhs : TimedUnifiedStream.errorDiffCarriers + ((uc, t, DiffWithError.error) :: tl) + = uc :: TimedUnifiedStream.errorDiffCarriers tl := rfl + show TimedUnifiedStream.errorDiffCarriers + ((uc, Nat.max t f, DiffWithError.error) + :: TimedUnifiedStream.advanceFrontier f tl) + = TimedUnifiedStream.errorDiffCarriers + ((uc, t, DiffWithError.error) :: tl) + rw [hLhs, hRhs, ih] + +/-! ## `atTime` projects error scopes per time slice + +`atTime t s` drops records at times other than `t` and forgets +the time component. An err carrier or `.error` diff at time `t` +appears in the time-slice's `UnifiedStream` extractor; one at any +other time does not. -/ + +theorem TimedUnifiedStream.atTime_errCarriers_subset + (t : Nat) (s : TimedUnifiedStream) (e : EvalError) + (h : e ∈ UnifiedStream.errCarriers (TimedUnifiedStream.atTime t s)) : + e ∈ TimedUnifiedStream.errCarriers s := by + rw [UnifiedStream.mem_errCarriers] at h + obtain ⟨d, hMem⟩ := h + -- hMem : (.err e, d) ∈ atTime t s = filterMap (...) 
s + have hMem' : (UnifiedRow.err e, d) + ∈ s.filterMap (fun r => + if r.2.1 = t then some (r.1, r.2.2) else none) := hMem + obtain ⟨r0, hRMem, hF⟩ := List.mem_filterMap.mp hMem' + obtain ⟨uc0, t0, d0⟩ := r0 + -- hF : (if t0 = t then some (uc0, d0) else none) = some (.err e, d) + by_cases hT : t0 = t + · rw [if_pos hT] at hF + -- hF : some (uc0, d0) = some (.err e, d) + have hPair : (uc0, d0) = (UnifiedRow.err e, d) := by injection hF + have hUc : uc0 = UnifiedRow.err e := (Prod.mk.injEq _ _ _ _).mp hPair |>.1 + subst hUc + -- (.err e, t0, d0) ∈ s. errCarriers contains e. + show e ∈ s.filterMap fun r => match r.1 with + | .err e' => some e' + | _ => none + refine List.mem_filterMap.mpr ⟨(UnifiedRow.err e, t0, d0), hRMem, ?_⟩ + show (match UnifiedRow.err e with + | .err e' => some e' + | _ => none) = some e + rfl + · rw [if_neg hT] at hF + cases hF + +theorem TimedUnifiedStream.atTime_errorDiffCarriers_subset + (t : Nat) (s : TimedUnifiedStream) (uc : UnifiedRow) + (h : uc ∈ UnifiedStream.errorDiffCarriers (TimedUnifiedStream.atTime t s)) : + uc ∈ TimedUnifiedStream.errorDiffCarriers s := by + rw [UnifiedStream.mem_errorDiffCarriers] at h + have hMem' : (uc, (DiffWithError.error : DiffWithError Int)) + ∈ s.filterMap (fun r => + if r.2.1 = t then some (r.1, r.2.2) else none) := h + obtain ⟨r0, hRMem, hF⟩ := List.mem_filterMap.mp hMem' + obtain ⟨uc0, t0, d0⟩ := r0 + by_cases hT : t0 = t + · rw [if_pos hT] at hF + have hPair : (uc0, d0) = (uc, DiffWithError.error) := by injection hF + have hUc : uc0 = uc := (Prod.mk.injEq _ _ _ _).mp hPair |>.1 + have hD : d0 = DiffWithError.error := (Prod.mk.injEq _ _ _ _).mp hPair |>.2 + rw [hUc] at hRMem + subst hD + show uc ∈ s.filterMap fun r => match r.2.2 with + | .error => some r.1 + | _ => none + refine List.mem_filterMap.mpr + ⟨(uc, t0, (DiffWithError.error : DiffWithError Int)), hRMem, ?_⟩ + show (match (DiffWithError.error : DiffWithError Int) with + | .error => some uc + | _ => none) = some uc + rfl + · rw [if_neg hT] at hF 
+ cases hF + +/-! ## `consolidateAtTime` error-scope behavior + +`consolidateAtTime t = consolidate ∘ atTime t`. Combines +`atTime`'s subset behavior with `consolidate`'s scope-preserving +properties. + +Row-err: subset (atTime drops non-`t` records; consolidate +preserves the err set exactly). Collection-err: subset, by the same argument. -/ + +theorem TimedUnifiedStream.consolidateAtTime_errCarriers_subset + (t : Nat) (s : TimedUnifiedStream) (e : EvalError) + (h : e ∈ UnifiedStream.errCarriers + (TimedUnifiedStream.consolidateAtTime t s)) : + e ∈ TimedUnifiedStream.errCarriers s := by + unfold TimedUnifiedStream.consolidateAtTime at h + rw [UnifiedStream.consolidate_errCarriers_iff] at h + exact TimedUnifiedStream.atTime_errCarriers_subset t s e h + +theorem TimedUnifiedStream.consolidateAtTime_errorDiffCarriers_subset + (t : Nat) (s : TimedUnifiedStream) (uc : UnifiedRow) + (h : uc ∈ UnifiedStream.errorDiffCarriers + (TimedUnifiedStream.consolidateAtTime t s)) : + uc ∈ TimedUnifiedStream.errorDiffCarriers s := by + unfold TimedUnifiedStream.consolidateAtTime at h + rw [UnifiedStream.consolidate_errorDiffCarriers_iff] at h + exact TimedUnifiedStream.atTime_errorDiffCarriers_subset t s uc h + end Mz diff --git a/doc/developer/semantics/Mz/UnifiedConsolidate.lean b/doc/developer/semantics/Mz/UnifiedConsolidate.lean index 8ba5980d6e95a..e0a3b9d0e3928 100644 --- a/doc/developer/semantics/Mz/UnifiedConsolidate.lean +++ b/doc/developer/semantics/Mz/UnifiedConsolidate.lean @@ -639,4 +639,35 @@ theorem UnifiedStream.consolidate_error_inv exact List.mem_cons_self · exact List.mem_cons_of_mem _ (ih hMem) +/-! ## Error-scope characterizations under `consolidate` + +Set-level invariance of row-err and collection-err extractors +under consolidation. Combines carrier-preservation +(`mem_consolidate_of_mem`, `mem_of_mem_consolidate`) and error +inversion (`consolidate_preserves_error`, `consolidate_error_inv`).
+ +These iffs are placed here (not in `Mz/SetOps.lean`) because +`Mz/TimedConsolidate.lean` cites them and the import chain forbids +SetOps as an upstream dependency. -/ + +theorem UnifiedStream.consolidate_errCarriers_iff + (us : UnifiedStream) (e : EvalError) : + e ∈ UnifiedStream.errCarriers (UnifiedStream.consolidate us) + ↔ e ∈ UnifiedStream.errCarriers us := by + rw [UnifiedStream.mem_errCarriers, UnifiedStream.mem_errCarriers] + constructor + · intro h + exact UnifiedStream.mem_of_mem_consolidate us (UnifiedRow.err e) h + · intro ⟨d, hMem⟩ + exact UnifiedStream.mem_consolidate_of_mem us (UnifiedRow.err e) d hMem + +theorem UnifiedStream.consolidate_errorDiffCarriers_iff + (us : UnifiedStream) (uc : UnifiedRow) : + uc ∈ UnifiedStream.errorDiffCarriers (UnifiedStream.consolidate us) + ↔ uc ∈ UnifiedStream.errorDiffCarriers us := by + rw [UnifiedStream.mem_errorDiffCarriers, + UnifiedStream.mem_errorDiffCarriers] + exact ⟨UnifiedStream.consolidate_error_inv us uc, + UnifiedStream.consolidate_preserves_error us uc⟩ + end Mz From 8bd4bbcba467253547cdc8d01b73cdc3db034354 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 21:15:54 +0200 Subject: [PATCH 097/127] doc/semantics: README updates for clamp/intersect/timed/JoinPushdown Document new error-scope coverage: * Clamp/clamp-distinct error-scope (clampPositive, clampToOne, distinct, bagExceptAll, intersectAll, bagIntersectAll): full iff for collection-err, reverse-only for row-err (clamp drops). * negate_filter commutativity. * Timed error-scope lift (errCarriers / errorDiffCarriers on TimedUnifiedStream; advanceFrontier exact; atTime / consolidateAtTime subset). * JoinPushdown.lean entry: filter_cross_pushdown_left, IsPureData hypothesis, cross_singleton reduction. * Note relocation of consolidate iff theorems to UnifiedConsolidate.lean (TimedConsolidate citation chain). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/developer/semantics/README.md b/doc/developer/semantics/README.md index 10191d637d5a5..c524e74ad45a6 100644 --- a/doc/developer/semantics/README.md +++ b/doc/developer/semantics/README.md @@ -47,6 +47,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four * `Mz/Consolidate.lean`: per-key diff summation over `List (DiffWithError α)`. The headline `sumAll_eq_error_of_mem` proves that an `error` diff anywhere in the list absorbs the consolidated sum to `error`, which is the property a differential dataflow `compact` operator cites when propagating global errors through consolidation. Companion `sumAll_val_of_all_val` says an all-`val` list sums to `val` of some base value. * `Mz/TimedConsolidate.lean`: per-`(row, time)` consolidation. `TimedUnifiedStream := List (UnifiedRow × Nat × DiffWithError Int)` carries records with time. `atTime t` projects to one time slice (dropping the time component); `consolidateAtTime t` chains it with `UnifiedStream.consolidate`. Theorems: `consolidateAtTime_preserves_error` (an `.error` diff at time `t` survives both filter and consolidation), `atTime_length_le` and `consolidateAtTime_length_le` (both non-expanding). Decomposes the joint key into "filter by time, then consolidate by row". `advanceFrontier f s` advances every record's time to `Nat.max time f` — the differential-dataflow `advance` operator on a scalar frontier. Records originally at time `< f` move to `f`; records at `≥ f` stay. Theorems: `advanceFrontier_nil`, `advanceFrontier_length` (length preserved), `advanceFrontier_zero` (zero frontier is identity), `advanceFrontier_idem` (idempotent on equal frontier), `advanceFrontier_advanceFrontier` (composing by `f` then `g` equals advancing by `Nat.max f g`). 
Real differential dataflow uses antichains of times; the scalar form is sufficient to state the algebraic laws. + Timed error-scope lift: `TimedUnifiedStream.errCarriers` and `TimedUnifiedStream.errorDiffCarriers` extract the row-err and collection-err sets from a timed stream (ignoring the time component). Reductions: `errCarriers_nil`/`_append`, `errorDiffCarriers_nil`/`_append`. Per-operator: `advanceFrontier_errCarriers` and `advanceFrontier_errorDiffCarriers` (exact — frontier advance only changes times, not carriers/diffs); `atTime_errCarriers_subset` and `atTime_errorDiffCarriers_subset` (errors at slice time `t` are a subset of input errors — time-slice projection loses non-`t` records); `consolidateAtTime_errCarriers_subset` and `consolidateAtTime_errorDiffCarriers_subset` (composition of `atTime` subset with `consolidate` iff). * `Mz/UnifiedConsolidate.lean`: row-keyed diff summation on `UnifiedStream`. `UnifiedStream.consolidate` buckets records by carrier (via `DecidableEq UnifiedRow`) and sums per-bucket diffs. Theorems cover three properties: *absorption* — `consolidate_preserves_error` proves an `.error` diff anywhere in the input gives an `.error` diff in the consolidated output for that carrier; *cardinality* — `consolidate_length_le` bounds the output by the input length (consolidation only merges, never expands); @@ -54,7 +55,7 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four *strict shrinkage* — `consolidate_strict_length_dup` proves that two adjacent records sharing a carrier compress to one in the output: `(consolidate ((uc, d) :: (uc, d') :: rest)).length ≤ rest.length + 1`, strictly less than the input's `rest.length + 2`. *carrier uniqueness* — `consolidate_noDup` proves that `consolidate` always produces a `NoDupCarriers` list (`Pairwise` on first-component inequality). Each carrier appears at most once. 
Supporting `consolidateInto_preserves_noDup` (and a `_general` variant that handles both fresh-key and matching-key cases) carries the invariant through single-step inserts. *carrier preservation* — `mem_consolidate_of_mem` (forward) and `mem_of_mem_consolidate` (reverse): every input carrier appears in the output, and every output carrier came from an input. The set of carriers is preserved exactly under consolidation (multiplicity may collapse). - *error inversion* — `consolidate_error_inv` (with helper `consolidateInto_error_inv` and DiffSemiring's `add_eq_error_left_or_right`): a `.error` diff in the output must come from a `.error` diff in the input at the same carrier. The reverse of `consolidate_preserves_error`; closes the loop for `consolidate_errorDiffCarriers_iff` in `Mz/SetOps.lean`. + *error inversion* — `consolidate_error_inv` (with helper `consolidateInto_error_inv` and DiffSemiring's `add_eq_error_left_or_right`): a `.error` diff in the output must come from a `.error` diff in the input at the same carrier. The reverse of `consolidate_preserves_error`; closes the loop for `consolidate_errorDiffCarriers_iff` (defined here, alongside `consolidate_errCarriers_iff`, so `Mz/TimedConsolidate.lean` can cite them). * `Mz/Triple.lean`: collection-wide and per-time *flat* consolidation views on `TimedUnifiedStream`. `consolidateAll` sums every diff in the stream; `consolidateAtTimeFlat t` sums every diff at time `t`. Both ignore the carrier — they collapse a time slice (or the whole stream) to one `DiffWithError Int`. Absorption: `consolidateAll_eq_error_of_mem` and `consolidateAtTimeFlat_eq_error_of_mem`. Complementary to `Mz/TimedConsolidate.lean`'s `consolidateAtTime t`, which buckets per `(row, time)` and returns a `UnifiedStream`. * `Mz/Join.lean`: relational joins on the diff-aware `UnifiedStream`. 
`cross` is the cartesian product — carriers combine via `combineCarrier` (rows concatenate; err on either side wins, left first), diffs multiply through `DiffWithError`'s `Mul` instance. A `.error` diff on either input therefore absorbs to `.error` on the output via `DiffWithError.error_mul_{left,right}`. `join pred l r` filters the product through a join predicate. Theorems: `cross_length` (`l.length * r.length`), `filter_length_le` (filter is non-expanding), `join_length_le` (corollary). Diff-propagation theorems: `cross_diff_error_{left,right}` (a `.error` diff on either side propagates through every output record), `filter_preserves_error_diff` (a record carrying `.error` diff is never dropped by `filter` — the absorbing marker cannot be filtered away). No-error preservation: `cross_no_error` and `filter_no_error` prove that all-`.val` input diffs yield all-`.val` output diffs, so `.error` is the only source of absorbing diffs in the joint output. Algebraic laws: `combineCarrier_assoc` (carrier combine is associative modulo `List.append_assoc`) and the headline `UnifiedStream.cross_assoc` (`(a × b) × c = a × (b × c)`). The proof rearranges nested `flatMap` / `map` via local list-monad lemmas and closes via `DiffWithError.mul_assoc` plus `combineCarrier_assoc`. Error-scope propagation through `cross`: `mem_cross_of_mems` is the carrier-level witness — every input pair contributes one output record. `cross_errCarriers_from_left` (left `.err e` paired with any right record yields `.err e` in output via combineCarrier's left-wins rule); `cross_errCarriers_from_right` (right `.err e` paired with a left `.row la` yields `.err e` — right propagates only when left is `.row`); `cross_errorDiffCarriers_from_{left,right}` (`.error` diff on either side absorbs through the product). cross grows both error scopes multiplicatively in the cardinality of the opposite input. 
@@ -70,7 +71,10 @@ The goal is to lock in the boolean truth tables for `AND` and `OR` over the four `negate_consolidate` proves that negation commutes with consolidation: `negate (consolidate us) = consolidate (negate us)`. The proof recurses via `negate_consolidateInto` (private), which lifts the same property to the single-step insertion. Negation is additive (`neg_add`), so it slides through per-bucket sums. `intersectAll l r` realizes bag-intersection via lookup: consolidate both inputs, then per left-carrier emit `(uc, min(L_diff, R_diff))` if the carrier exists in the right's consolidate, else drop. Supported by `UnifiedStream.lookup` (return the diff for a carrier in a consolidated stream, or `none`), `UnifiedStream.lookup_isSome_of_mem` (lookup returns `some` when carrier present), `UnifiedStream.mem_of_lookup_eq_some` (converse: lookup success witnesses membership), `UnifiedStream.lookup_eq_of_mem_noDup` (returns the exact diff when the list has unique carriers), and `DiffWithError.min` (the bag-min combinator with `.error` absorbing). Theorems: `intersectAll_length_le` (≤ left.length), `intersectAll_preserves_error_diff_left`/`_right` (`.error` diff for a carrier present in both inputs survives — left case uses left-min absorption, right case uses the no-dup property of `consolidate r`), `intersectAll_no_error` (all-`.val` inputs yield all-`.val` outputs). `bagIntersectAll = clampPositive ∘ intersectAll` realizes the bag-semantics `INTERSECT ALL` — drops records with non-positive multiplicity, leaving `.error` records untouched. Theorems lift the signed flavor: `bagIntersectAll_length_le`, `bagIntersectAll_preserves_error_diff_left`/`_right`, `bagIntersectAll_no_error`, `bagIntersectAll_only_positive`. - Error-scope characterization of set ops. `unionAll_errCarriers` and `unionAll_errorDiffCarriers` (concat — direct from `errCarriers_append` / `errorDiffCarriers_append`). 
`negate_errCarriers` and `negate_errorDiffCarriers` (both scopes are invariants of negation since carriers are unchanged and `.error` absorbs negation). `consolidate_errCarriers_iff` (row-err set equal as set; multiplicity may shrink) and `consolidate_errorDiffCarriers_iff` (collection-err set equal exactly — forward from `consolidate_preserves_error`, reverse from `consolidate_error_inv`). `union_errCarriers_iff` / `union_errorDiffCarriers_iff` and `exceptAll_errCarriers_iff` / `exceptAll_errorDiffCarriers_iff` follow as compositions of `unionAll`, `negate`, and `consolidate` theorems — the error-scope sets of both ops are the disjoint union of the inputs'. `join_errorDiffCarriers` (equals that of `cross`) and `join_errCarriers_mono` (filter is monotone on row-errs over `cross`) compose `cross` and `filter` results. + Error-scope characterization of set ops. `unionAll_errCarriers` and `unionAll_errorDiffCarriers` (concat — direct from `errCarriers_append` / `errorDiffCarriers_append`). `negate_errCarriers` and `negate_errorDiffCarriers` (both scopes are invariants of negation since carriers are unchanged and `.error` absorbs negation). Set-level invariance of `consolidate` lives in `Mz/UnifiedConsolidate.lean` (`consolidate_errCarriers_iff`, `consolidate_errorDiffCarriers_iff`) so `Mz/TimedConsolidate.lean` can cite it; `Mz/SetOps.lean` keeps the forward `consolidate_errorDiffCarriers_mono` corollary for symmetry with other set ops. `union_errCarriers_iff` / `union_errorDiffCarriers_iff` and `exceptAll_errCarriers_iff` / `exceptAll_errorDiffCarriers_iff` follow as compositions of `unionAll`, `negate`, and `consolidate` theorems — the error-scope sets of both ops are the disjoint union of the inputs'. `join_errorDiffCarriers` (equals that of `cross`) and `join_errCarriers_mono` (filter is monotone on row-errs over `cross`) compose `cross` and `filter` results. + Clamp / clamp-distinct error-scope. 
`clampPositive_errorDiffCarriers_iff` and `clampToOne_errorDiffCarriers_iff` are full equivalences — `.error` records always pass through the clamps. Row-err has only the reverse direction: `clampPositive_errCarriers_of_mem` and `clampToOne_errCarriers_of_mem`, because the clamps may drop `(.err e, .val 0)` or `(.err e, .val (-n))` records. Compositions: `distinct_errorDiffCarriers_iff` / `distinct_errCarriers_of_mem` (distinct = clampToOne ∘ consolidate), `bagExceptAll_errorDiffCarriers_iff` / `bagExceptAll_errCarriers_of_mem` (clampPositive ∘ exceptAll). For `intersectAll`, only the reverse direction is available — `intersectAll_errCarriers_of_mem` (an err in the output came from BOTH inputs) and `intersectAll_errorDiffCarriers_of_mem` (a collection-err carrier in the output exists with some diff in both inputs); composed with clamps to give `bagIntersectAll_errCarriers_of_mem`. + `negate` commutes with `filter`: `negate_filter` proves `negate (filter pred us) = filter pred (negate us)` unconditionally. Filter's row arm depends only on `eval r pred`, which is independent of the diff sign, so the negation slides through. Useful for optimizer rewrites that commute negation with filter (EXCEPT ALL paths, retraction reordering). +* `Mz/JoinPushdown.lean`: filter pushdown for cross products. The optimizer's canonical rewrite — `filter pred (cross l r) = cross (filter pred l) r` — when the predicate references only left-input columns. Headline theorem `UnifiedStream.filter_cross_pushdown_left` takes the column bound `pred.colReferencesBoundedBy N = true`, the per-row width invariant `∀ ud ∈ l, ∀ la, ud.1 = .row la → la.length ≥ N`, and the `IsPureData` predicate on `r` (no `.err` carriers, no `.error` diffs). The pure-data hypothesis is essential: without it, `combineCarrier`'s left-wins rule disagrees with filter's cell-to-row promotion on err attribution, and `.error` diff in `r` interacts with filter's first arm to keep records that would be dropped post-pushdown. 
Auxiliary reduction `UnifiedStream.cross_singleton` flattens `cross [(uc, d)] r` to a per-record map (carrier/diff split to avoid `.fst`/`.snd` projection issues under induction). Per-record helpers handle each left-record shape (`.error` diff, `.err` carrier, `.row` with eval keep/err/drop); main theorem assembles them via structural induction on `l` plus `eval_append_left_of_bounded`. * `Mz/GroupBy.lean`: two grouping primitives. `groupBy keyExpr rel` partitions a relation by evaluated key using Lean's derived `DecidableEq Datum` — two `Datum.err e` keys with the same payload collapse into one group. `groupByErrDistinct keyExpr rel` uses the spec-faithful `Datum.groupKeyEq`, which returns `false` whenever either side is `.err`, so every err key produces its own singleton group. From dc71e105d49fda52862f0cb3a5dedb2f1290d3f0 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 21:27:17 +0200 Subject: [PATCH 098/127] doc/semantics: sumAll error inversion + Triple reverse direction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DiffWithError.sumAll_error_inv: converse of absorption. A sum equal to .error witnesses at least one .error summand. Proof peels via add_eq_error_left_or_right (DiffSemiring inversion law) on each cons step; nil case rules out via .val 0 ≠ .error. Triple.lean reverse-direction theorems: * consolidateAll_error_inv: .error total → ∃ record in stream with .error diff. * consolidateAtTimeFlat_error_inv: .error total at time t → ∃ record at time t with .error diff. Both are direct corollaries — extract the map preimage and filter conditions from sumAll_error_inv's witness. The consolidateAtTimeFlat version decodes the decide-wrapped time predicate via of_decide_eq_true. Closes the round-trip on flat consolidations: forward absorption + reverse inversion makes consolidateAll = .error iff some record has .error diff (and similarly per-time). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/Consolidate.lean | 25 ++++++++++++++++++ doc/developer/semantics/Mz/Triple.lean | 29 +++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/doc/developer/semantics/Mz/Consolidate.lean b/doc/developer/semantics/Mz/Consolidate.lean index b957353a06909..6d85030c7ef98 100644 --- a/doc/developer/semantics/Mz/Consolidate.lean +++ b/doc/developer/semantics/Mz/Consolidate.lean @@ -82,6 +82,31 @@ theorem sumAll_val_of_all_val [Zero α] [Add α] rw [hh_eq, ht_eq] rfl +/-- Reverse direction: a `sumAll` equal to `.error` witnesses an +`.error` summand somewhere in the list. The `.val + .val` arm of +`+` never returns `.error`, so an `.error` total rules in at +least one `.error` input. -/ +theorem sumAll_error_inv [Zero α] [Add α] + {ds : List (DiffWithError α)} + (h : sumAll ds = error) : + ∃ d ∈ ds, d = error := by + induction ds with + | nil => + -- sumAll [] = 0 = val 0 ≠ error + show ∃ d ∈ ([] : List (DiffWithError α)), d = error + exfalso + have h0 : (0 : DiffWithError α) = (error : DiffWithError α) := h + have : (DiffWithError.val (0 : α) : DiffWithError α) + = (DiffWithError.error : DiffWithError α) := h0 + cases this + | cons hd tl ih => + show ∃ d ∈ hd :: tl, d = error + have hSum : hd + sumAll tl = error := h + rcases add_eq_error_left_or_right hd (sumAll tl) hSum with hHd | hTl + · exact ⟨hd, List.mem_cons_self, hHd⟩ + · obtain ⟨d, hMem, hD⟩ := ih hTl + exact ⟨d, List.mem_cons_of_mem _ hMem, hD⟩ + end DiffWithError end Mz diff --git a/doc/developer/semantics/Mz/Triple.lean b/doc/developer/semantics/Mz/Triple.lean index 96dae513d0439..3fd0c35546439 100644 --- a/doc/developer/semantics/Mz/Triple.lean +++ b/doc/developer/semantics/Mz/Triple.lean @@ -72,4 +72,33 @@ theorem TimedUnifiedStream.consolidateAtTimeFlat_eq_error_of_mem refine List.mem_map.mpr ⟨r, ?_, h_err⟩ exact List.mem_filter.mpr ⟨h_mem, by simp [h_time]⟩ +/-! 
## Reverse direction: from `.error` total to `.error` record -/ + +/-- If the collection-wide consolidation is `.error`, at least +one record in the stream carries an `.error` diff. The converse +of `consolidateAll_eq_error_of_mem`. -/ +theorem TimedUnifiedStream.consolidateAll_error_inv + {s : TimedUnifiedStream} + (h : TimedUnifiedStream.consolidateAll s = DiffWithError.error) : + ∃ r ∈ s, r.2.2 = (DiffWithError.error : DiffWithError Int) := by + unfold TimedUnifiedStream.consolidateAll at h + obtain ⟨d, hMem, hD⟩ := DiffWithError.sumAll_error_inv h + obtain ⟨r, hRMem, hRD⟩ := List.mem_map.mp hMem + exact ⟨r, hRMem, by rw [hRD]; exact hD⟩ + +/-- Time-slice version: an `.error` total at time `t` witnesses +an `.error` record at time `t`. -/ +theorem TimedUnifiedStream.consolidateAtTimeFlat_error_inv + {s : TimedUnifiedStream} (t : Nat) + (h : TimedUnifiedStream.consolidateAtTimeFlat t s = DiffWithError.error) : + ∃ r ∈ s, r.2.1 = t ∧ r.2.2 = (DiffWithError.error : DiffWithError Int) := by + unfold TimedUnifiedStream.consolidateAtTimeFlat at h + obtain ⟨d, hMem, hD⟩ := DiffWithError.sumAll_error_inv h + obtain ⟨r, hRMem, hRD⟩ := List.mem_map.mp hMem + have hRFilter : r ∈ s.filter (·.2.1 = t) := hRMem + rw [List.mem_filter] at hRFilter + refine ⟨r, hRFilter.1, ?_, ?_⟩ + · exact of_decide_eq_true hRFilter.2 + · rw [hRD]; exact hD + end Mz From 7361bdd0cfab0ca387498d98c45558c24b7573c8 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 21:30:39 +0200 Subject: [PATCH 099/127] doc/semantics: Triple round-trip iff + errorDiffCarriers bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three new theorems in Mz/Triple.lean: * consolidateAll_eq_error_iff: forward + reverse combined. consolidateAll s = .error ↔ ∃ r ∈ s, r.2.2 = .error. * consolidateAtTimeFlat_eq_error_iff: same shape, restricted to time-t slice. 
* consolidateAll_eq_error_iff_errorDiffCarriers: bridges the flat consolidation to the timed error-scope extractor. consolidateAll s = .error ↔ errorDiffCarriers s ≠ []. The bridge connects two views of the same observation: the diff-semiring total (single .error or .val value) and the extractor list (carrier-level collection-err set). They agree exactly on whether the stream has any .error diff anywhere. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/Triple.lean | 61 ++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/doc/developer/semantics/Mz/Triple.lean b/doc/developer/semantics/Mz/Triple.lean index 3fd0c35546439..04b71bddd64bb 100644 --- a/doc/developer/semantics/Mz/Triple.lean +++ b/doc/developer/semantics/Mz/Triple.lean @@ -101,4 +101,65 @@ theorem TimedUnifiedStream.consolidateAtTimeFlat_error_inv · exact of_decide_eq_true hRFilter.2 · rw [hRD]; exact hD +/-! ## Round-trip iff forms + +Combine forward absorption with reverse inversion. The flat +consolidations exactly characterize the presence of an `.error` +diff in the stream (per-time slice for the time-aware version). -/ + +theorem TimedUnifiedStream.consolidateAll_eq_error_iff + (s : TimedUnifiedStream) : + TimedUnifiedStream.consolidateAll s = DiffWithError.error + ↔ ∃ r ∈ s, r.2.2 = (DiffWithError.error : DiffWithError Int) := by + constructor + · exact TimedUnifiedStream.consolidateAll_error_inv + · intro ⟨r, hMem, hErr⟩ + exact TimedUnifiedStream.consolidateAll_eq_error_of_mem r hMem hErr + +theorem TimedUnifiedStream.consolidateAtTimeFlat_eq_error_iff + (s : TimedUnifiedStream) (t : Nat) : + TimedUnifiedStream.consolidateAtTimeFlat t s = DiffWithError.error + ↔ ∃ r ∈ s, r.2.1 = t + ∧ r.2.2 = (DiffWithError.error : DiffWithError Int) := by + constructor + · exact TimedUnifiedStream.consolidateAtTimeFlat_error_inv t + · intro ⟨r, hMem, hT, hErr⟩ + exact TimedUnifiedStream.consolidateAtTimeFlat_eq_error_of_mem t r hMem hT hErr + +/-! 
## Bridge to `errorDiffCarriers` + +`consolidateAll` is `.error` iff the stream's collection-err set +is non-empty. Direct via `mem` ↔ `∃ r, r.2.2 = .error`. -/ + +theorem TimedUnifiedStream.consolidateAll_eq_error_iff_errorDiffCarriers + (s : TimedUnifiedStream) : + TimedUnifiedStream.consolidateAll s = DiffWithError.error + ↔ TimedUnifiedStream.errorDiffCarriers s ≠ [] := by + rw [TimedUnifiedStream.consolidateAll_eq_error_iff] + constructor + · intro ⟨r, hMem, hErr⟩ hNil + obtain ⟨uc, t, d⟩ := r + have hD : d = DiffWithError.error := hErr + subst hD + have hUcMem : uc ∈ TimedUnifiedStream.errorDiffCarriers s := + List.mem_filterMap.mpr + ⟨(uc, t, (DiffWithError.error : DiffWithError Int)), hMem, rfl⟩ + rw [hNil] at hUcMem + exact absurd hUcMem List.not_mem_nil + · intro hNonEmpty + match h : TimedUnifiedStream.errorDiffCarriers s with + | [] => exact absurd h hNonEmpty + | uc :: _ => + have hMemList : uc ∈ TimedUnifiedStream.errorDiffCarriers s := by + rw [h]; exact List.mem_cons_self + have hFM : uc ∈ s.filterMap (fun r => match r.2.2 with + | .error => some r.1 + | _ => none) := hMemList + obtain ⟨r0, hRMem, hRF⟩ := List.mem_filterMap.mp hFM + obtain ⟨uc0, t0, d0⟩ := r0 + cases d0 with + | val n => cases hRF + | error => + refine ⟨(uc0, t0, DiffWithError.error), hRMem, rfl⟩ + end Mz From 89694c31a1e256345cca3ef0204dbc2e4439ed3d Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 21:45:53 +0200 Subject: [PATCH 100/127] doc/semantics: negate_project commute negate commutes with project unconditionally: negate (project es us) = project es (negate us) Mirror of negate_filter. project's row arm produces rowProjectRecords es d r which preserves the input diff d on every emitted record; negation flips d uniformly, sliding through. Helper negate_rowProjectRecords: negation slides through the row-project helper. Uses List.map_map for the err-branch case. 
rowProjectRecords promoted from private to public so the cross-file commute theorem can cite it directly. Enables optimizer rewrites that commute negation with projection (EXCEPT ALL through projection, retraction propagation). Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 85 +++++++++++++++++++ doc/developer/semantics/Mz/UnifiedStream.lean | 6 +- 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 72debe8e11f75..bcd7ee04ca07b 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -376,6 +376,91 @@ theorem UnifiedStream.negate_filter (pred : Expr) (us : UnifiedStream) : rw [DiffWithError.neg_val] rw [hF, hN, hFn, hNE] +/-- `negate` commutes through `rowProjectRecords`: flipping the +diff before or after the per-row project gives the same result. +`rowProjectRecords` only places the input diff `d` on every +emitted record, so swapping `d` for `-d` slides through. -/ +private theorem negate_rowProjectRecords + (es : List Expr) (d : DiffWithError Int) (r : Row) : + UnifiedStream.negate (rowProjectRecords es d r) + = rowProjectRecords es (-d) r := by + unfold rowProjectRecords + by_cases h : rowAllSafe es r + · rw [if_pos h, if_pos h] + show ([(UnifiedRow.row (es.map (eval r)), -d)] : UnifiedStream) = _ + rfl + · rw [if_neg h, if_neg h] + show ((rowErrs es r).map (fun e => (UnifiedRow.err e, d))).map + (fun ud => (ud.1, -ud.2)) + = (rowErrs es r).map (fun e => (UnifiedRow.err e, -d)) + rw [List.map_map] + rfl + +/-- `negate` commutes with `project`. Per record, project's row +arm produces `rowProjectRecords es d r` which preserves the input +diff; negation flips the diff uniformly on every output record. 
-/ +theorem UnifiedStream.negate_project (es : List Expr) (us : UnifiedStream) : + UnifiedStream.negate (UnifiedStream.project es us) + = UnifiedStream.project es (UnifiedStream.negate us) := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hCons : ((uc, d) :: tl : UnifiedStream) = [(uc, d)] ++ tl := rfl + rw [hCons, UnifiedStream.project_append, UnifiedStream.negate_append, + ih, UnifiedStream.negate_append, UnifiedStream.project_append] + congr 1 + cases d with + | error => + have hP : UnifiedStream.project es + [(uc, (DiffWithError.error : DiffWithError Int))] + = [(uc, (DiffWithError.error : DiffWithError Int))] := by + cases uc <;> rfl + have hN : UnifiedStream.negate + [(uc, (DiffWithError.error : DiffWithError Int))] + = [(uc, (DiffWithError.error : DiffWithError Int))] := by + show [(uc, -(DiffWithError.error : DiffWithError Int))] = _ + rw [DiffWithError.neg_error] + rw [hP, hN, hP] + | val n => + cases uc with + | err e => + have hP : UnifiedStream.project es + [(UnifiedRow.err e, DiffWithError.val n)] + = [(UnifiedRow.err e, DiffWithError.val n)] := rfl + have hPn : UnifiedStream.project es + [(UnifiedRow.err e, DiffWithError.val (-n))] + = [(UnifiedRow.err e, DiffWithError.val (-n))] := rfl + have hN : UnifiedStream.negate + [(UnifiedRow.err e, DiffWithError.val n)] + = [(UnifiedRow.err e, DiffWithError.val (-n))] := by + show [(UnifiedRow.err e, -(DiffWithError.val n : DiffWithError Int))] + = [(UnifiedRow.err e, DiffWithError.val (-n))] + rw [DiffWithError.neg_val] + rw [hP, hN, hPn] + | row r => + have hP : UnifiedStream.project es + [(UnifiedRow.row r, DiffWithError.val n)] + = rowProjectRecords es (DiffWithError.val n) r := by + show List.flatMap _ _ = _ + simp only [List.flatMap_cons, List.flatMap_nil, List.append_nil] + have hPn : UnifiedStream.project es + [(UnifiedRow.row r, DiffWithError.val (-n))] + = rowProjectRecords es (DiffWithError.val (-n)) r := by + show List.flatMap _ _ = _ + simp only 
[List.flatMap_cons, List.flatMap_nil, List.append_nil] + have hN : UnifiedStream.negate + [(UnifiedRow.row r, DiffWithError.val n)] + = [(UnifiedRow.row r, DiffWithError.val (-n))] := by + show [(UnifiedRow.row r, -(DiffWithError.val n : DiffWithError Int))] + = [(UnifiedRow.row r, DiffWithError.val (-n))] + rw [DiffWithError.neg_val] + rw [hP, hN, hPn, negate_rowProjectRecords] + -- After rewrites: rowProjectRecords es -(.val n) r = rowProjectRecords es (.val -n) r. + show rowProjectRecords es (-(DiffWithError.val n : DiffWithError Int)) r + = rowProjectRecords es (DiffWithError.val (-n)) r + rw [DiffWithError.neg_val] + /-- `.error` diffs survive negation; the carrier is preserved. -/ theorem UnifiedStream.negate_preserves_error_diff (us : UnifiedStream) (uc : UnifiedRow) diff --git a/doc/developer/semantics/Mz/UnifiedStream.lean b/doc/developer/semantics/Mz/UnifiedStream.lean index 189ee7fe013fb..62f58d4a616dd 100644 --- a/doc/developer/semantics/Mz/UnifiedStream.lean +++ b/doc/developer/semantics/Mz/UnifiedStream.lean @@ -144,8 +144,10 @@ holds both, distinguished by the carrier tag. -/ /-- Project a single `.row` record through `es`, returning the list of unified records the row contributes. Diff is preserved on -every produced record (rows-share-diff, errs-share-diff). -/ -private def rowProjectRecords (es : List Expr) (d : DiffWithError Int) (r : Row) : +every produced record (rows-share-diff, errs-share-diff). Exposed +(not `private`) so cross-file commute theorems like +`negate_project` in `Mz/SetOps.lean` can reason about it directly.
-/ +def rowProjectRecords (es : List Expr) (d : DiffWithError Int) (r : Row) : UnifiedStream := if rowAllSafe es r then [(UnifiedRow.row (es.map (eval r)), d)] From cb42aa3de6af31e182e0c526f6d3812f349c8913 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 21:50:44 +0200 Subject: [PATCH 101/127] doc/semantics: cross_negate_right symmetric law Symmetric counterpart to cross_negate_left: cross l (negate r) = negate (cross l r) Uses DiffWithError.mul_neg_int (a * -b = -(a * b)) on the diff arithmetic; combineCarrier carrier is unchanged by negation. Proof structure mirrors cross_negate_left: induction on l with hL/hR reduction lemmas to expose flatMap output, then map_congr on the per-record body. Together with cross_negate_left, gives the bilinear law for cross: negate slides through from either side. Useful for optimizer rewrites where retractions on either input propagate through cross. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 37 ++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index bcd7ee04ca07b..2d129afbcb95b 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -1047,6 +1047,43 @@ theorem UnifiedStream.cross_negate_left (l r : UnifiedStream) : = (combineCarrier uc rd.1, -(d * rd.2)) rw [DiffWithError.neg_mul_int] +/-- Symmetric law: negating the right input of a cross product +equals negating the cross output. Uses `a * (-b) = -(a * b)` +(`DiffWithError.mul_neg_int`). 
-/ +theorem UnifiedStream.cross_negate_right (l r : UnifiedStream) : + UnifiedStream.cross l (UnifiedStream.negate r) + = UnifiedStream.negate (UnifiedStream.cross l r) := by + induction l with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hL : UnifiedStream.cross ((uc, d) :: tl) (UnifiedStream.negate r) + = (UnifiedStream.negate r).map + (fun rd => (combineCarrier uc rd.1, d * rd.2)) + ++ UnifiedStream.cross tl (UnifiedStream.negate r) := by + show ((uc, d) :: tl).flatMap _ = _ + simp only [List.flatMap_cons] + rfl + have hR : UnifiedStream.cross ((uc, d) :: tl) r + = r.map (fun rd => (combineCarrier uc rd.1, d * rd.2)) + ++ UnifiedStream.cross tl r := by + show ((uc, d) :: tl).flatMap _ = _ + simp only [List.flatMap_cons] + rfl + rw [hL, hR, UnifiedStream.negate_append, ih] + congr 1 + show (UnifiedStream.negate r).map + (fun rd => (combineCarrier uc rd.1, d * rd.2)) + = UnifiedStream.negate + (r.map (fun rd => (combineCarrier uc rd.1, d * rd.2))) + unfold UnifiedStream.negate + rw [List.map_map, List.map_map] + apply List.map_congr_left + intro rd _ + show (combineCarrier uc rd.1, d * -rd.2) + = (combineCarrier uc rd.1, -(d * rd.2)) + rw [DiffWithError.mul_neg_int] + /-- Empty left input: `exceptAll [] r = negate (consolidate r)`. Reduces via `unionAll_nil_left` and `negate_consolidate`. -/ theorem UnifiedStream.exceptAll_nil_left (r : UnifiedStream) : From 0e4c950d345ded268b80694a5ed15735ed222509 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 21:53:05 +0200 Subject: [PATCH 102/127] doc/semantics: transforms catalog Single index of every equational / inclusion law on UnifiedStream and TimedUnifiedStream. 
Grouped by algebraic shape: * Append / unionAll distribution * Commutativity (negate slides through) * Pushdown (filter_cross_pushdown_left) * Bilinearity (cross-negate) * Associativity (cross, unionAll, combineCarrier) * Involution / idempotence (negate_negate, clamp/escalate/advance idem) * Length / cardinality bounds * Trivial cases (nil / singleton reductions) * Cons / step reductions (cross_cons, consolidateInto match/skip) * Error-scope: row-err (errCarriers) * Error-scope: collection-err (errorDiffCarriers) * Error-diff record absorption (forward) and inversion (reverse) * No-error preservation * Multiplicity / shape constraints * NoDup carrier-uniqueness * Membership bridges * Round-trip / iff * DiffWithError underlying laws * Column-reference analyzers Each row links theorem name to source file. Optimizer rewrites should index here first; ~190 theorems total. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/transforms.md | 338 ++++++++++++++++++++++++++ 1 file changed, 338 insertions(+) create mode 100644 doc/developer/semantics/transforms.md diff --git a/doc/developer/semantics/transforms.md b/doc/developer/semantics/transforms.md new file mode 100644 index 0000000000000..48d1019d23090 --- /dev/null +++ b/doc/developer/semantics/transforms.md @@ -0,0 +1,338 @@ +# Transform catalog + +Mechanized equational and inclusion laws for `UnifiedStream` / `TimedUnifiedStream` operators. +Grouped by algebraic shape so optimizer rewrites have a single index. + +Each entry links theorem name to source file. +`L = R` denotes equality, `L ⊆ R` denotes one-direction membership (forward), `L = R ↔ P` denotes a logical iff over membership. + +## Append / unionAll distribution + +Operators that distribute over concatenation. +Each is direct from `List.flatMap_append` or `List.map_append` on the underlying carrier. 
+ +| Theorem | Statement | File | +| --- | --- | --- | +| `unionAll_assoc` | `(a ⊎ b) ⊎ c = a ⊎ (b ⊎ c)` | `Mz/SetOps.lean` | +| `filter_unionAll` | `filter p (a ⊎ b) = filter p a ⊎ filter p b` | `Mz/SetOps.lean` | +| `project_unionAll` | `project es (a ⊎ b) = project es a ⊎ project es b` | `Mz/SetOps.lean` | +| `negate_unionAll` | `negate (a ⊎ b) = negate a ⊎ negate b` | `Mz/SetOps.lean` | +| `cross_unionAll_left` | `cross (a ⊎ b) r = cross a r ⊎ cross b r` | `Mz/SetOps.lean` | +| `filter_append` | `filter p (a ++ b) = filter p a ++ filter p b` | `Mz/UnifiedStream.lean` | +| `project_append` | `project es (a ++ b) = project es a ++ project es b` | `Mz/UnifiedStream.lean` | +| `negate_append` | `negate (a ++ b) = negate a ++ negate b` | `Mz/SetOps.lean` | +| `cross_append_left` | `cross (a ++ b) r = cross a r ++ cross b r` | `Mz/Join.lean` | +| `errCarriers_append` | `errCarriers (a ++ b) = errCarriers a ++ errCarriers b` | `Mz/UnifiedStream.lean` | +| `errorDiffCarriers_append` | symmetric for collection-err | `Mz/UnifiedStream.lean` | +| `unionAll_errCarriers` | corollary | `Mz/SetOps.lean` | +| `unionAll_errorDiffCarriers` | corollary | `Mz/SetOps.lean` | +| `TimedUnifiedStream.errCarriers_append` | timed lift | `Mz/TimedConsolidate.lean` | +| `TimedUnifiedStream.errorDiffCarriers_append` | timed lift | `Mz/TimedConsolidate.lean` | + +## Commutativity / sliding through + +Two operators commute as `f ∘ g = g ∘ f`. + +| Theorem | Statement | File | +| --- | --- | --- | +| `negate_filter` | `negate (filter p us) = filter p (negate us)` | `Mz/SetOps.lean` | +| `negate_project` | `negate (project es us) = project es (negate us)` | `Mz/SetOps.lean` | +| `negate_consolidate` | `negate (consolidate us) = consolidate (negate us)` | `Mz/SetOps.lean` | +| `negate_consolidateInto` (private) | step lemma for above | `Mz/SetOps.lean` | + +## Pushdown + +Rewrites that move an outer operator inside a join or product. 
+ +| Theorem | Statement | File | +| --- | --- | --- | +| `filter_cross_pushdown_left` | `filter p (cross l r) = cross (filter p l) r` when `p` bounded by left widths and `r` `IsPureData` | `Mz/JoinPushdown.lean` | + +## Bilinearity (cross with negate) + +Negation slides through cross from either side. + +| Theorem | Statement | File | +| --- | --- | --- | +| `cross_negate_left` | `cross (negate l) r = negate (cross l r)` via `(-a) * b = -(a * b)` | `Mz/SetOps.lean` | +| `cross_negate_right` | `cross l (negate r) = negate (cross l r)` via `a * (-b) = -(a * b)` | `Mz/SetOps.lean` | + +## Associativity + +| Theorem | Statement | File | +| --- | --- | --- | +| `cross_assoc` | `cross (cross a b) c = cross a (cross b c)` modulo carrier-append associativity | `Mz/Join.lean` | +| `unionAll_assoc` | concat associative | `Mz/SetOps.lean` | +| `combineCarrier_assoc` | carrier-side associativity for cross | `Mz/Join.lean` | + +## Involution / Idempotence + +Applying an operator twice equals once (or zero times). + +| Theorem | Statement | File | +| --- | --- | --- | +| `negate_negate` | `negate (negate us) = us` (involution) | `Mz/SetOps.lean` | +| `clampPositive_idem` | `clampPositive (clampPositive us) = clampPositive us` | `Mz/SetOps.lean` | +| `clampToOne_idem` | `clampToOne (clampToOne us) = clampToOne us` | `Mz/SetOps.lean` | +| `escalateRowErrs_idem` | `escalateRowErrs (escalateRowErrs us) = escalateRowErrs us` | `Mz/UnifiedStream.lean` | +| `advanceFrontier_idem` | `advanceFrontier f (advanceFrontier f s) = advanceFrontier f s` | `Mz/TimedConsolidate.lean` | +| `advanceFrontier_zero` | `advanceFrontier 0 s = s` (zero-frontier identity) | `Mz/TimedConsolidate.lean` | +| `advanceFrontier_advanceFrontier` | `advanceFrontier g (advanceFrontier f s) = advanceFrontier (max f g) s` | `Mz/TimedConsolidate.lean` | +| `clampPositive_clampToOne` | `clampPositive ∘ clampToOne = clampToOne` | `Mz/SetOps.lean` | + +## Length / cardinality + +Bounds on output cardinality. 
`_length` is equality, `_length_le` is upper bound. + +| Theorem | Statement | File | +| --- | --- | --- | +| `unionAll_length` | `|a ⊎ b| = |a| + |b|` | `Mz/SetOps.lean` | +| `negate_length` | `|negate us| = |us|` | `Mz/SetOps.lean` | +| `cross_length` | `|cross l r| = |l| * |r|` | `Mz/Join.lean` | +| `filter_length_le` | filter non-expanding | `Mz/Join.lean` | +| `join_length_le` | `|join p l r| ≤ |l| * |r|` | `Mz/Join.lean` | +| `union_length_le` | bound via consolidate | `Mz/SetOps.lean` | +| `exceptAll_length_le` | `≤ |l| + |r|` | `Mz/SetOps.lean` | +| `clampPositive_length_le` | non-expanding | `Mz/SetOps.lean` | +| `clampToOne_length_le` | non-expanding | `Mz/SetOps.lean` | +| `distinct_length_le` | non-expanding | `Mz/SetOps.lean` | +| `intersectAll_length_le` | `≤ |l|` | `Mz/SetOps.lean` | +| `bagExceptAll_length_le` | composed bound | `Mz/SetOps.lean` | +| `bagIntersectAll_length_le` | composed bound | `Mz/SetOps.lean` | +| `consolidate_length_le` | merging never expands | `Mz/UnifiedConsolidate.lean` | +| `consolidate_strict_length_dup` | strict shrink on adjacent duplicate | `Mz/UnifiedConsolidate.lean` | +| `escalateRowErrs_length` | length-preserving | `Mz/UnifiedStream.lean` | +| `advanceFrontier_length` | length-preserving | `Mz/TimedConsolidate.lean` | +| `atTime_length_le` | non-expanding | `Mz/TimedConsolidate.lean` | +| `consolidateAtTime_length_le` | non-expanding | `Mz/TimedConsolidate.lean` | + +## Trivial cases + +Reductions on empty / singleton inputs. 
+ +| Theorem | Statement | File | +| --- | --- | --- | +| `*_nil` | operator applied to `[]` returns `[]` | various | +| `filter_nil`, `project_nil_stream`, `cross_nil_left`, `cross_nil_right`, `negate_nil` (implicit), `consolidate_nil`, `errCarriers_nil`, `errorDiffCarriers_nil`, `clampToOne_nil`, `escalateRowErrs_nil`, `advanceFrontier_nil`, `atTime_nil`, `consolidateAtTime_nil`, `consolidateInto_nil`, `unionAll_nil_left`, `unionAll_nil_right`, `union_nil_left`, `union_nil_right`, `bagExceptAll_nil_left`, `bagExceptAll_nil_right`, `exceptAll_nil_left`, `exceptAll_nil_right`, `lookup_nil` | | various | +| `consolidate_singleton` | `consolidate [(uc, d)] = [(uc, d)]` | `Mz/UnifiedConsolidate.lean` | +| `cross_singleton` | `cross [(uc, d)] r = r.map (combineCarrier uc rd.1, d * rd.2)` | `Mz/JoinPushdown.lean` | +| `project_nil_es` | empty projection list collapses rows | `Mz/UnifiedStream.lean` | + +## Cons / step reductions + +Named per-shape reductions of recursive operators. + +| Theorem | Statement | File | +| --- | --- | --- | +| `cross_cons_left` | cross unfolding on `(hd :: tl)` left | `Mz/Join.lean` | +| `consolidateInto_match` | matching head folds into bucket | `Mz/UnifiedConsolidate.lean` | +| `consolidateInto_skip` | non-matching head recurses | `Mz/UnifiedConsolidate.lean` | +| `consolidateInto_nil` | trivial | `Mz/UnifiedConsolidate.lean` | + +## Error-scope: row-err (`errCarriers`) + +Set of row-scoped error payloads (carrier = `.err e`). +Iff = preserved exactly as set. Mono = forward inclusion only (one direction). `_of_mem` = reverse inclusion only. 
+ +| Theorem | Direction | File | +| --- | --- | --- | +| `unionAll_errCarriers` | `=` concat | `Mz/SetOps.lean` | +| `negate_errCarriers` | `=` | `Mz/SetOps.lean` | +| `consolidate_errCarriers_iff` | `↔` (set) | `Mz/UnifiedConsolidate.lean` | +| `union_errCarriers_iff` | disjoint union | `Mz/SetOps.lean` | +| `exceptAll_errCarriers_iff` | disjoint union | `Mz/SetOps.lean` | +| `filter_errCarriers_mono` | mono (cell→row promotion adds) | `Mz/SetOps.lean` | +| `project_errCarriers_mono` | mono (scalar errs add) | `Mz/UnifiedStream.lean` | +| `cross_errCarriers_from_left` | mono propagation | `Mz/Join.lean` | +| `cross_errCarriers_from_right` | mono propagation (left = `.row`) | `Mz/Join.lean` | +| `join_errCarriers_mono` | mono (filter of cross) | `Mz/SetOps.lean` | +| `clampPositive_errCarriers_of_mem` | reverse (clamps drop) | `Mz/SetOps.lean` | +| `clampToOne_errCarriers_of_mem` | reverse | `Mz/SetOps.lean` | +| `distinct_errCarriers_of_mem` | reverse | `Mz/SetOps.lean` | +| `bagExceptAll_errCarriers_of_mem` | reverse | `Mz/SetOps.lean` | +| `intersectAll_errCarriers_of_mem` | reverse (and: in both) | `Mz/SetOps.lean` | +| `bagIntersectAll_errCarriers_of_mem` | reverse (and: in both) | `Mz/SetOps.lean` | +| `escalateRowErrs_errCarriers` | `=` (carriers untouched) | `Mz/UnifiedStream.lean` | +| `TimedUnifiedStream.advanceFrontier_errCarriers` | `=` (time-only op) | `Mz/TimedConsolidate.lean` | +| `TimedUnifiedStream.atTime_errCarriers_subset` | reverse (slice drops) | `Mz/TimedConsolidate.lean` | +| `TimedUnifiedStream.consolidateAtTime_errCarriers_subset` | reverse | `Mz/TimedConsolidate.lean` | + +## Error-scope: collection-err (`errorDiffCarriers`) + +Carriers whose diff is `.error`. 
+ +| Theorem | Direction | File | +| --- | --- | --- | +| `unionAll_errorDiffCarriers` | `=` concat | `Mz/SetOps.lean` | +| `negate_errorDiffCarriers` | `=` | `Mz/SetOps.lean` | +| `consolidate_errorDiffCarriers_iff` | `↔` exact | `Mz/UnifiedConsolidate.lean` | +| `consolidate_errorDiffCarriers_mono` | forward (corollary) | `Mz/SetOps.lean` | +| `union_errorDiffCarriers_iff` | disjoint union | `Mz/SetOps.lean` | +| `exceptAll_errorDiffCarriers_iff` | disjoint union | `Mz/SetOps.lean` | +| `filter_errorDiffCarriers` | `=` (.error passes through) | `Mz/SetOps.lean` | +| `project_errorDiffCarriers` | `=` | `Mz/UnifiedStream.lean` | +| `cross_errorDiffCarriers_from_left` | forward propagation | `Mz/Join.lean` | +| `cross_errorDiffCarriers_from_right` | forward propagation | `Mz/Join.lean` | +| `join_errorDiffCarriers` | `=` (= cross) | `Mz/SetOps.lean` | +| `clampPositive_errorDiffCarriers_iff` | `↔` exact | `Mz/SetOps.lean` | +| `clampToOne_errorDiffCarriers_iff` | `↔` exact | `Mz/SetOps.lean` | +| `distinct_errorDiffCarriers_iff` | `↔` exact | `Mz/SetOps.lean` | +| `bagExceptAll_errorDiffCarriers_iff` | disjoint union | `Mz/SetOps.lean` | +| `intersectAll_errorDiffCarriers_of_mem` | reverse (in both) | `Mz/SetOps.lean` | +| `escalateRowErrs_errCarriers_in_errorDiff` | forward (promotion) | `Mz/UnifiedStream.lean` | +| `TimedUnifiedStream.advanceFrontier_errorDiffCarriers` | `=` | `Mz/TimedConsolidate.lean` | +| `TimedUnifiedStream.atTime_errorDiffCarriers_subset` | reverse | `Mz/TimedConsolidate.lean` | +| `TimedUnifiedStream.consolidateAtTime_errorDiffCarriers_subset` | reverse | `Mz/TimedConsolidate.lean` | + +## Error-diff record-level absorption (forward) + +`.error` diff survives the operator on the same carrier. 
+ +| Theorem | File | +| --- | --- | +| `consolidate_preserves_error` | `Mz/UnifiedConsolidate.lean` | +| `project_preserves_error_diff` | `Mz/UnifiedStream.lean` | +| `filter_preserves_error_diff` | `Mz/Join.lean` | +| `cross_diff_error_left` / `cross_diff_error_right` | `Mz/Join.lean` | +| `unionAll_preserves_error_diff_left` / `_right` | `Mz/SetOps.lean` | +| `union_preserves_error_diff_left` / `_right` | `Mz/SetOps.lean` | +| `exceptAll_preserves_error_diff_left` / `_right` | `Mz/SetOps.lean` | +| `intersectAll_preserves_error_diff_left` / `_right` | `Mz/SetOps.lean` | +| `bagExceptAll_preserves_error_diff_left` / `_right` | `Mz/SetOps.lean` | +| `bagIntersectAll_preserves_error_diff_left` / `_right` | `Mz/SetOps.lean` | +| `clampPositive_preserves_error_diff` | `Mz/SetOps.lean` | +| `clampToOne_preserves_error_diff` | `Mz/SetOps.lean` | +| `distinct_preserves_error_diff` | `Mz/SetOps.lean` | +| `negate_preserves_error_diff` | `Mz/SetOps.lean` | +| `TimedUnifiedStream.consolidateAtTime_preserves_error` | `Mz/TimedConsolidate.lean` | + +## Error-diff inversion (reverse) + +If output has `.error`, input had `.error` at that carrier. + +| Theorem | File | +| --- | --- | +| `consolidate_error_inv` | `Mz/UnifiedConsolidate.lean` | +| `consolidateInto_error_inv` (private) | `Mz/UnifiedConsolidate.lean` | + +## No-error preservation + +All-`.val` inputs yield all-`.val` outputs (collection-err free). 
+ +| Theorem | File | +| --- | --- | +| `filter_no_error` | `Mz/Join.lean` | +| `project_no_error` | `Mz/UnifiedStream.lean` | +| `cross_no_error` | `Mz/Join.lean` | +| `consolidate_no_error` | `Mz/UnifiedConsolidate.lean` | +| `negate_no_error` | `Mz/SetOps.lean` | +| `unionAll_no_error` | `Mz/SetOps.lean` | +| `union_no_error` | `Mz/SetOps.lean` | +| `exceptAll_no_error` | `Mz/SetOps.lean` | +| `intersectAll_no_error` | `Mz/SetOps.lean` | +| `clampPositive_no_error` | `Mz/SetOps.lean` | +| `clampToOne_no_error` | `Mz/SetOps.lean` | +| `distinct_no_error` | `Mz/SetOps.lean` | +| `bagExceptAll_no_error` | `Mz/SetOps.lean` | +| `bagIntersectAll_no_error` | `Mz/SetOps.lean` | + +## Multiplicity / shape constraints + +| Theorem | Statement | File | +| --- | --- | --- | +| `clampPositive_only_positive` | output `.val` is strictly positive | `Mz/SetOps.lean` | +| `clampToOne_only_one_or_error` | output diff is `.val 1` or `.error` | `Mz/SetOps.lean` | +| `bagExceptAll_only_positive` | composed | `Mz/SetOps.lean` | +| `bagIntersectAll_only_positive` | composed | `Mz/SetOps.lean` | +| `distinct_only_one_or_error` | composed | `Mz/SetOps.lean` | + +## Carrier uniqueness (NoDup) + +| Theorem | File | +| --- | --- | +| `NoDupCarriers.nil` | `Mz/UnifiedConsolidate.lean` | +| `consolidate_noDup` | `Mz/UnifiedConsolidate.lean` | +| `union_noDup` | `Mz/SetOps.lean` | +| `exceptAll_noDup` | `Mz/SetOps.lean` | +| `bagExceptAll_noDup` | `Mz/SetOps.lean` | +| `intersectAll_noDup` | `Mz/SetOps.lean` | +| `bagIntersectAll_noDup` | `Mz/SetOps.lean` | +| `distinct_noDup` | `Mz/SetOps.lean` | +| `clampPositive_noDup` | `Mz/SetOps.lean` | +| `clampToOne_noDup` | `Mz/SetOps.lean` | +| `negate_noDup` | `Mz/SetOps.lean` | + +## Membership bridges + +Convert extractor / structural membership. 
+ +| Theorem | Statement | File | +| --- | --- | --- | +| `mem_errCarriers` | | `Mz/UnifiedStream.lean` | +| `mem_errorDiffCarriers` | | `Mz/UnifiedStream.lean` | +| `mem_consolidate_of_mem` | forward carrier preservation | `Mz/UnifiedConsolidate.lean` | +| `mem_of_mem_consolidate` | reverse | `Mz/UnifiedConsolidate.lean` | +| `mem_cross_of_mems` | pair-membership in cross | `Mz/Join.lean` | +| `lookup_isSome_of_mem` | lookup characterization | `Mz/SetOps.lean` | +| `mem_of_lookup_eq_some` | lookup converse | `Mz/SetOps.lean` | +| `lookup_eq_of_mem_noDup` | exact diff under NoDup | `Mz/SetOps.lean` | +| `TimedUnifiedStream.mem_atTime_of_mem` | timed lift | `Mz/TimedConsolidate.lean` | + +## Round-trip / iff + +Combine forward and reverse direction. + +| Theorem | Statement | File | +| --- | --- | --- | +| `split_ofBag` | `BagStream` round-trip | `Mz/UnifiedStream.lean` | +| `split_data_ofBag`, `split_errors_ofBag` | components | `Mz/UnifiedStream.lean` | +| `TimedUnifiedStream.consolidateAll_eq_error_iff` | flat absorption | `Mz/Triple.lean` | +| `TimedUnifiedStream.consolidateAll_eq_error_iff_errorDiffCarriers` | extractor bridge | `Mz/Triple.lean` | +| `TimedUnifiedStream.consolidateAtTimeFlat_eq_error_iff` | per-time | `Mz/Triple.lean` | +| `TimedUnifiedStream.consolidateAll_error_inv` | reverse half | `Mz/Triple.lean` | +| `TimedUnifiedStream.consolidateAtTimeFlat_error_inv` | reverse half | `Mz/Triple.lean` | + +## DiffWithError underlying laws + +The semiring layer that operator proofs cite.
+ +| Theorem | Statement | File | +| --- | --- | --- | +| `error_add_left` / `error_add_right` | | `Mz/DiffSemiring.lean` | +| `error_mul_left` / `error_mul_right` | | `Mz/DiffSemiring.lean` | +| `error_min_left` / `error_min_right` | | `Mz/DiffSemiring.lean` | +| `add_eq_error_left_or_right` | inversion | `Mz/DiffSemiring.lean` | +| `neg_error`, `neg_val`, `neg_neg_val` | negation laws | `Mz/DiffSemiring.lean` | +| `val_add_neg_val` | self-cancellation | `Mz/DiffSemiring.lean` | +| `neg_mul`, `mul_neg`, `neg_add` | distributive negation | `Mz/DiffSemiring.lean` | +| `min_val_val` | min on `.val` | `Mz/DiffSemiring.lean` | +| `mul_add`, `mul_assoc`, `mul_comm` | semiring laws | `Mz/DiffSemiring.lean` | +| `add_comm`, `add_assoc`, `zero_add_val`, `val_add_zero` | additive laws | `Mz/DiffSemiring.lean` | +| `*_int` specializations | base hypotheses discharged at `Int` | `Mz/DiffSemiring.lean` | +| `sumAll_eq_error_of_mem` | forward absorption | `Mz/Consolidate.lean` | +| `sumAll_error_inv` | reverse inversion | `Mz/Consolidate.lean` | +| `sumAll_val_of_all_val` | all-`.val` total | `Mz/Consolidate.lean` | + +## Column-reference analyzers + +Static analyses used by pushdown. + +| Theorem | Statement | File | +| --- | --- | --- | +| `colReferencesBoundedBy_mono` | bound is monotone | `Mz/ColRefs.lean` | +| `eval_append_left_of_bounded` | eval-on-left agreement | `Mz/ColRefs.lean` | +| `eval_append_right_shift` | shifted eval on right | `Mz/ColRefs.lean` | +| `colShift` monoid laws | various | `Mz/ColRefs.lean` | + +## Notes + +* `⊎` denotes `UnifiedStream.unionAll` (defined as `++` on the carrier). +* Extractors / scopes: + * `errCarriers us` — list of row-scoped `.err e` payloads. + * `errorDiffCarriers us` — list of carriers whose diff is `.error`. +* Iff vs forward vs reverse: many operators preserve the *set* of errors but not the multiset. + Forward-only theorems hold when the operator can introduce new errs (cell-to-row promotion in `filter`, `project`).
+ Reverse-only theorems hold when the operator can drop errs (the clamps). +* The pushdown / commutativity laws are the consumable API for an optimizer. + Length and NoDup laws are invariants needed by cost models and uniqueness reasoning. + Error-scope laws are observable-behavior guarantees for the error model. From 3b02085a4c5af4e5a43d5dc8ec1021a04da84390 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 22:02:41 +0200 Subject: [PATCH 103/127] doc/semantics: map Mz optimizer passes to Lean coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add 'Materialize optimizer passes → Lean coverage' section in transforms.md. Inventories all 66 passes in src/transform/ and classifies each as: * Modeled — equivalent theorem shipped (6 passes). * Modelable — current UnifiedStream / DiffWithError infra is sufficient; just write the theorem (11 passes incl. filter/map fusion, right-side pushdown, threshold elision, redundant_join, semijoin_idempotence, demand, canonicalize_mfp, equivalence_propagation use sites). * Infra gap — needs new operator or analysis (10 passes: Reduce family, TopK, FlatMap, IndexedFilter, monotonicity, column lattice). * Out of scope — physical planning, syntactic canonicalization, user-facing notices, type checking (~14 passes). Closes with priority recommendations: filter_cross_pushdown_right, filter/map fusion, threshold_elision as easy next ships. Reduce operator is the largest single infra gap (unlocks 4+ passes). Catalog derived via Explore-agent enumeration of src/transform/. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/transforms.md | 79 +++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/doc/developer/semantics/transforms.md b/doc/developer/semantics/transforms.md index 48d1019d23090..c9c6f5b3edec8 100644 --- a/doc/developer/semantics/transforms.md +++ b/doc/developer/semantics/transforms.md @@ -324,6 +324,85 @@ Static analyses used by pushdown. | `eval_append_right_shift` | shifted eval on right | `Mz/ColRefs.lean` | | `colShift` monoid laws | various | `Mz/ColRefs.lean` | +## Materialize optimizer passes → Lean coverage + +The Rust optimizer in `src/transform/` has 66 passes (41 algebraic rewrites, 13 analyses, 11 stateful planning steps, 1 framework). +This section maps each pass to its status in the Lean spec. + +Status legend: + +* **Modeled** — equivalent theorem (or strong proxy) shipped here. +* **Modelable** — current `UnifiedStream` / `DiffWithError` infra is enough; just write the theorem. +* **Infra gap** — needs a new operator (`Reduce`, `TopK`, `FlatMap`, …) or a new analysis (equivalence classes, monotonicity, column lattice). +* **Out of scope** — physical planning, syntactic plumbing, or user-facing metadata; no place in a denotational spec. 
+ +### Algebraic rewrites — modeled + +| Rust pass | Lean correspondent | +| --- | --- | +| `predicate_pushdown.rs` | `filter_cross_pushdown_left` (left-only; right-side pending; bound and pure-data hypotheses) | +| `compound/union.rs` (UnionNegateFusion) | `negate_unionAll` + `unionAll_assoc` | +| `fusion/union.rs` (Union fusion) | `unionAll_assoc` + nil identities | +| `fusion/negate.rs` (Negate fusion) | `negate_negate` (involution) | +| `fusion/join.rs` (Join fusion / associativity) | `cross_assoc` | +| `union_cancel.rs` (partial) | `consolidate (unionAll a (negate a))` reduces to `.val 0` records via diff arithmetic; no theorem yet, but ingredients in place | + +### Algebraic rewrites — modelable (worth shipping) + +| Rust pass | Lean approach | +| --- | --- | +| `fusion/filter.rs` (filter ∘ filter) | `filter p ∘ filter q = filter (p ∧ q)`. Holds under `evalAnd` if we exclude the `err`-on-left + `false`-on-right corner. State with err-free / null-free hypothesis, or use a stratified `andErrStrict` variant. | +| `fusion/map.rs` (map fusion) | `project es ∘ project es' = project (es' ∘ es)`. Uses `Expr.subst` and `eval_subst` (already exist in `Mz/Pushdown.lean`); needs a `UnifiedStream`-level statement. | +| `fusion/project.rs` / `movement/projection_lifting.rs` / `projection_pushdown.rs` | We have `project_unionAll`. Add `project_filter` (commutes when no scalar errors collide with predicate), `project_cross_pushdown` (push project through cross when columns split cleanly). | +| `predicate_pushdown.rs` (right side) | `filter_cross_pushdown_right` mirror — needs uniform left-row-width hypothesis (so `colShift` arithmetic is well-defined) plus left pure-data. | +| `threshold_elision.rs` | `clampPositive` is a no-op when every diff is already `.val n > 0`. Lemma: `clampPositive us = us` under `∀ rec ∈ us, ∃ n > 0, rec.2 = .val n`. | +| `redundant_join.rs` (distinct + join) | Express `distinct` + `cross` commutation when right side is already key-unique. 
Requires `intersectAll`-style lookup invariants we already have. | +| `semijoin_idempotence.rs` (partial) | A semijoin is `cross` + project + distinct. Idempotence via `distinct_idem` (provable; we have `clampToOne_idem`). | +| `non_null_requirements.rs` (model the strict-null laws) | We already have `evalAnd` / `evalOr` / arithmetic err-/null-strictness. State as `NullPropagatingBinary` / `ErrPropagatingBinary` instances; some exist in `Mz/Strict.lean`. Lift to `UnifiedStream.filter` to characterize when predicates drop vs promote. | +| `demand.rs` (column-projection analysis) | We have `colReferencesBoundedBy`, `colReferencesUnused`, `eval_replaceAt_of_unused` in `Mz/ColRefs.lean`. Add a `UnifiedStream`-level theorem: replacing unused columns in every row leaves the operator output equal. | +| `canonicalize_mfp.rs` | Establish a canonical form `Project ∘ Filter ∘ Map` and prove every MFP-like composition has a unique canonical equivalent. Needs Map (`UnifiedStream.project` is the analog of MapFilterProject's Project part; we don't have an MFP wrapper). | +| `equivalence_propagation.rs` (use sites only) | The *use* of equivalence is `if a = b then replace a with b`. With a proved `evalEq` characterization we can show `filter (a = b) us` preserves a row iff substituting `b` for `a` in the rest of the predicate gives the same evaluation. Substitution machinery is in `Expr.subst`. | + +### Algebraic rewrites — infra gap + +These need a new operator or analysis before they can be expressed. + +| Rust pass | Missing infra | +| --- | --- | +| `fold_constants.rs` | Constant collections at the `UnifiedStream` level. Could be a singleton `.row r, .val 1` literal stream. Would unlock evaluating constant subqueries during proof. | +| `fusion/reduce.rs`, `reduce_reduction.rs`, `reduce_elision.rs`, `reduction_pushdown.rs` | `Reduce` operator on `UnifiedStream`. Aggregate is at `Mz/Aggregate.lean` but only on `List Datum`; lift to `UnifiedStream` with group-by interface. 
| +| `fusion/top_k.rs`, `canonicalization/topk_elision.rs` | `TopK` operator (sort + limit-offset) on `UnifiedStream`. Needs an ordering on rows. | +| `canonicalization/flat_map_elimination.rs` | `FlatMap` (table-valued function) operator. The constant-arg elimination piece reduces to existing `cross` with a literal stream. | +| `literal_constraints.rs` | `IndexedFilter` operator (semi-join with constant collection). | +| `literal_lifting.rs` | Map-with-literal-columns recognition. Modelable once `project` distinguishes literal vs computed columns. | +| `column_knowledge.rs` | Per-column lattice (`{literal, nullable, type}`) + propagation. Would be a separate analysis file. | +| `equivalence_propagation.rs` (full pass) | Equivalence-class lattice. | +| `monotonic.rs` (top-level) and `analysis/monotonic.rs` | Logical monotonicity analysis: streams that never retract. Requires a `NoRetraction` predicate on `UnifiedStream` (`∀ rec, ∃ n ≥ 0, rec.2 = .val n` modulo `.error`). | +| `case_literal.rs`, `coalesce_case.rs` | Scalar `Expr` rewrites; we model `Expr` semantics but no rewrite-rule infrastructure. Easy to add as `eval`-equivalence theorems but currently unused. | + +### Out of scope (intentional) + +| Rust pass | Reason | +| --- | --- | +| `join_implementation.rs`, `will_distinct.rs`, `dataflow.rs`, `ordering.rs`, `normalize_lets.rs`, `normalize_ops.rs` | Physical planning, downstream-info-driven, or syntactic canonicalization. The denotational spec is invariant under these. | +| `cse/anf.rs`, `cse/relation_cse.rs` | Common-subexpression / ANF transformations are pure syntactic. The spec treats `MirRelationExpr` modulo CSE by definition. | +| `collect_notices.rs`, `notice/*.rs` | User-facing diagnostics; not part of the semantics. | +| `typecheck.rs` | Type preservation across passes. Our spec is intrinsically typed (Lean's type system), so this property holds by construction. | +| `analysis.rs` | Analysis framework (trait infrastructure). 
Lean uses theorems directly; no analogue. | +| `canonicalization/projection_extraction.rs` | Identifies projections hiding inside `Map` / `Reduce`. Syntactic, no semantic content. | + +### Priority recommendations + +If a single pass should be modeled next, the highest-value candidates by API consumption density: + +1. **`filter_cross_pushdown_right`** — completes pushdown bilaterally; mirror of left-side proof. +2. **`fusion/filter.rs` (filter ∘ filter)** — direct equational rewrite, frequently quoted by the optimizer. +3. **`fusion/map.rs` (project ∘ project)** — uses existing `Expr.subst` machinery; would also document substitution at the relation level. +4. **`threshold_elision.rs`** — small theorem (`clampPositive` is identity on positive-only streams); good warm-up before introducing `Reduce`. + +Beyond those, the cluster `{Reduce + reduce_elision + reduce_reduction + reduction_pushdown}` is the largest dependency gap. +A `UnifiedStream.reduce` operator would unlock four passes plus the GroupBy semantics already partially in `Mz/GroupBy.lean`. + ## Notes * `⊎` denotes `UnifiedStream.unionAll` (defined as `++` on the carrier). From 11593022449b1c343dcfe50180762f84a34b55d0 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 22:11:25 +0200 Subject: [PATCH 104/127] doc/semantics: filter_cross_pushdown_right Mirror of filter_cross_pushdown_left. Predicate references only right-input columns (pred = pred'.colShift M, every left row of width M), left input is pure data. Bridges via eval_append_right_shift; symmetric to the left version's colReferencesBoundedBy + width-monotone setup. Single per-record helper filter_map_pure_data_right covers the .row+.val case; .error diff / .err carrier rows on the left are ruled out by IsPureData l. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/JoinPushdown.lean | 123 +++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/doc/developer/semantics/Mz/JoinPushdown.lean b/doc/developer/semantics/Mz/JoinPushdown.lean index 8c73a76170a4e..72899549d619d 100644 --- a/doc/developer/semantics/Mz/JoinPushdown.lean +++ b/doc/developer/semantics/Mz/JoinPushdown.lean @@ -423,4 +423,127 @@ theorem UnifiedStream.filter_cross_pushdown_left | err e_pred => exact filter_map_row_val_err pred la n r e_pred hBound' hEval hRPure +/-! ## Right-side pushdown + +Mirror of `filter_cross_pushdown_left`. Predicate references only +right-input columns — encoded as `pred = pred'.colShift M`, where +every left row has width exactly `M`. Left input is pure data (no +row-errs, no `.error` diffs), the symmetric counterpart to +`IsPureData r` on the left version. + +Width is `= M` rather than `≥ N`: `eval_append_right_shift` +strips the prefix only when the shift amount matches the left +row's length. The left version's monotone slack does not apply +here. -/ + +/-- Per-record helper for right pushdown. With a pure-data left +record `(.row la, .val n)` of width `M`, filtering the cross +output against `pred'.colShift M` commutes with crossing against +the right input filtered by `pred'`. Each `r` record's +contribution lines up under `eval_append_right_shift`.
-/ +private theorem filter_map_pure_data_right + (pred' : Expr) (M : Nat) (la : Row) (n : Int) (r : UnifiedStream) + (hLen : la.length = M) : + UnifiedStream.filter (pred'.colShift M) + (r.map fun rd => (combineCarrier (UnifiedRow.row la) rd.1, + DiffWithError.val n * rd.2)) + = (UnifiedStream.filter pred' r).map + fun rd => (combineCarrier (UnifiedRow.row la) rd.1, + DiffWithError.val n * rd.2) := by + induction r with + | nil => rfl + | cons hd tl ih => + obtain ⟨ru, rd⟩ := hd + have hMapCons : + ((ru, rd) :: tl).map (fun rd' => (combineCarrier (UnifiedRow.row la) rd'.1, + DiffWithError.val n * rd'.2)) + = [(combineCarrier (UnifiedRow.row la) ru, DiffWithError.val n * rd)] + ++ tl.map (fun rd' => (combineCarrier (UnifiedRow.row la) rd'.1, + DiffWithError.val n * rd'.2)) := rfl + rw [hMapCons, UnifiedStream.filter_append, ih] + have hFilterCons : + UnifiedStream.filter pred' ((ru, rd) :: tl) + = UnifiedStream.filter pred' [(ru, rd)] + ++ UnifiedStream.filter pred' tl := by + have : ((ru, rd) :: tl : UnifiedStream) = [(ru, rd)] ++ tl := rfl + rw [this, UnifiedStream.filter_append] + rw [hFilterCons, List.map_append] + congr 1 -- tails already agree after rewriting with `ih`; only the head singletons remain + cases rd with + | error => + -- Both heads reduce to [(combineCarrier (.row la) ru, .error)]. + rfl + | val m => + cases ru with + | err e' => + -- combineCarrier (.row la) (.err e') = .err e'; both heads + -- reduce via err-carrier arm to [(.err e', .val (n*m))]. + rfl + | row rb => + have hEvalShift : eval (la ++ rb) (pred'.colShift M) = eval rb pred' := by + rw [← hLen] + exact eval_append_right_shift la rb pred' + -- combineCarrier (.row la) (.row rb) = .row (la ++ rb). + -- val n * val m = val (n*m). Both sides land in the .row arm + -- of filter; bridge by hEvalShift.
+ show (match eval (la ++ rb) (pred'.colShift M) with + | .bool true => [(UnifiedRow.row (la ++ rb), DiffWithError.val (n * m))] + | .err e => [(UnifiedRow.err e, DiffWithError.val (n * m))] + | _ => []) ++ [] + = ((match eval rb pred' with + | .bool true => [(UnifiedRow.row rb, DiffWithError.val m)] + | .err e => [(UnifiedRow.err e, DiffWithError.val m)] + | _ => []) ++ []).map + (fun rd' => (combineCarrier (UnifiedRow.row la) rd'.1, + DiffWithError.val n * rd'.2)) + rw [hEvalShift] + cases eval rb pred' with + | bool b => + cases b with + | true => rfl + | false => rfl + | err _ => rfl + | int _ => rfl + | null => rfl + +/-- Filter pushdown for cross products, right-side version. When +the join predicate references only right-input columns (encoded as +`pred = pred'.colShift M`, with every left row of width exactly `M`), +and the left input is pure data, filtering the cross product +equals crossing the left input with the filtered right. + +The left-pure hypothesis is the symmetric counterpart of the +right-pure requirement in `filter_cross_pushdown_left`: row-scoped +errors or `.error` diffs on the un-filtered side would interact +with the filter's promotion rules and break the commute.
-/ +theorem UnifiedStream.filter_cross_pushdown_right + (pred' : Expr) (M : Nat) (l r : UnifiedStream) + (hLPure : UnifiedStream.IsPureData l) + (hLWidth : ∀ ud ∈ l, ∀ la, ud.1 = UnifiedRow.row la → la.length = M) : + UnifiedStream.filter (pred'.colShift M) (UnifiedStream.cross l r) + = UnifiedStream.cross l (UnifiedStream.filter pred' r) := by + induction l with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hTlPure : UnifiedStream.IsPureData tl := hLPure.tail + have hTlWidth : ∀ ud ∈ tl, ∀ la, ud.1 = UnifiedRow.row la → la.length = M := + fun ud hMem la hUc => hLWidth ud (List.mem_cons_of_mem _ hMem) la hUc + obtain ⟨⟨rb_uc, hUc⟩, ⟨n, hD⟩⟩ := hLPure.head + have hHdLen : rb_uc.length = M := hLWidth (uc, d) List.mem_cons_self rb_uc hUc + subst hUc; subst hD -- head record is now `(.row rb_uc, .val n)` + rw [UnifiedStream.cross_cons_left, UnifiedStream.filter_append, + ih hTlPure hTlWidth] + have hCrossRConsLeft : + UnifiedStream.cross ((UnifiedRow.row rb_uc, DiffWithError.val n) :: tl) + (UnifiedStream.filter pred' r) + = (UnifiedStream.filter pred' r).map + (fun rd => (combineCarrier (UnifiedRow.row rb_uc) rd.1, + DiffWithError.val n * rd.2)) + ++ UnifiedStream.cross tl (UnifiedStream.filter pred' r) := by + rw [UnifiedStream.cross_cons_left] + rw [hCrossRConsLeft] + congr 1 -- tails agree after rewriting with `ih`; the head maps are the per-record helper + exact filter_map_pure_data_right pred' M rb_uc n r hHdLen + end Mz From b261a40c455f91d15b2a36cdfa377f7366ffcf0e Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 22:12:08 +0200 Subject: [PATCH 105/127] doc/semantics: transforms.md, mark right pushdown shipped MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move predicate_pushdown.rs entry from "left-only pending" to bilateral. Drop redundant Modelable row. Rotate the priority list — top-of-list slot now goes to filter ∘ filter fusion.
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/transforms.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/developer/semantics/transforms.md b/doc/developer/semantics/transforms.md index c9c6f5b3edec8..c28fd0795ec32 100644 --- a/doc/developer/semantics/transforms.md +++ b/doc/developer/semantics/transforms.md @@ -340,7 +340,7 @@ Status legend: | Rust pass | Lean correspondent | | --- | --- | -| `predicate_pushdown.rs` | `filter_cross_pushdown_left` (left-only; right-side pending; bound and pure-data hypotheses) | +| `predicate_pushdown.rs` | `filter_cross_pushdown_left` + `filter_cross_pushdown_right` (bilateral; bound / `colShift` and pure-data hypotheses) | | `compound/union.rs` (UnionNegateFusion) | `negate_unionAll` + `unionAll_assoc` | | `fusion/union.rs` (Union fusion) | `unionAll_assoc` + nil identities | | `fusion/negate.rs` (Negate fusion) | `negate_negate` (involution) | @@ -354,7 +354,6 @@ Status legend: | `fusion/filter.rs` (filter ∘ filter) | `filter p ∘ filter q = filter (p ∧ q)`. Holds under `evalAnd` if we exclude the `err`-on-left + `false`-on-right corner. State with err-free / null-free hypothesis, or use a stratified `andErrStrict` variant. | | `fusion/map.rs` (map fusion) | `project es ∘ project es' = project (es' ∘ es)`. Uses `Expr.subst` and `eval_subst` (already exist in `Mz/Pushdown.lean`); needs a `UnifiedStream`-level statement. | | `fusion/project.rs` / `movement/projection_lifting.rs` / `projection_pushdown.rs` | We have `project_unionAll`. Add `project_filter` (commutes when no scalar errors collide with predicate), `project_cross_pushdown` (push project through cross when columns split cleanly). | -| `predicate_pushdown.rs` (right side) | `filter_cross_pushdown_right` mirror — needs uniform left-row-width hypothesis (so `colShift` arithmetic is well-defined) plus left pure-data. | | `threshold_elision.rs` | `clampPositive` is a no-op when every diff is already `.val n > 0`. 
Lemma: `clampPositive us = us` under `∀ rec ∈ us, ∃ n > 0, rec.2 = .val n`. | | `redundant_join.rs` (distinct + join) | Express `distinct` + `cross` commutation when right side is already key-unique. Requires `intersectAll`-style lookup invariants we already have. | | `semijoin_idempotence.rs` (partial) | A semijoin is `cross` + project + distinct. Idempotence via `distinct_idem` (provable; we have `clampToOne_idem`). | @@ -395,10 +394,10 @@ These need a new operator or analysis before they can be expressed. If a single pass should be modeled next, the highest-value candidates by API consumption density: -1. **`filter_cross_pushdown_right`** — completes pushdown bilaterally; mirror of left-side proof. -2. **`fusion/filter.rs` (filter ∘ filter)** — direct equational rewrite, frequently quoted by the optimizer. -3. **`fusion/map.rs` (project ∘ project)** — uses existing `Expr.subst` machinery; would also document substitution at the relation level. -4. **`threshold_elision.rs`** — small theorem (`clampPositive` is identity on positive-only streams); good warm-up before introducing `Reduce`. +1. **`fusion/filter.rs` (filter ∘ filter)** — direct equational rewrite, frequently quoted by the optimizer. +2. **`fusion/map.rs` (project ∘ project)** — uses existing `Expr.subst` machinery; would also document substitution at the relation level. +3. **`threshold_elision.rs`** — small theorem (`clampPositive` is identity on positive-only streams); good warm-up before introducing `Reduce`. +4. **`semijoin_idempotence.rs`** — distinct + cross + project commutation; uses `clampToOne_idem` already in `Mz/SetOps.lean`. Beyond those, the cluster `{Reduce + reduce_elision + reduce_reduction + reduction_pushdown}` is the largest dependency gap. A `UnifiedStream.reduce` operator would unlock four passes plus the GroupBy semantics already partially in `Mz/GroupBy.lean`. 
From 893cb8cc4d52ddcb983e8fb2a45c898ae53e935b Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 22:38:42 +0200 Subject: [PATCH 106/127] =?UTF-8?q?doc/semantics:=20filter=20=E2=88=98=20f?= =?UTF-8?q?ilter=20fusion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `filter p (filter q us) = filter (.and q p) us` under per-row err-freedom: neither predicate may evaluate to .err on any data row of the input. Side condition is sharp — without it, the ordering between filter's err-promotion and evalAnd's `.bool false` absorption clause disagrees, e.g. on `(eval r q, eval r p) = (.err e, .bool false)`. Side condition lives in new `UnifiedStream.predNoRowErr`. Carrier- err records and `.error`-diff records pass through both pipelines unchanged; the hypothesis is only needed at `.row` records. Mirrors fusion/filter.rs in the Rust optimizer. New module Mz/FilterFusion.lean. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/FilterFusion.lean | 168 +++++++++++++++++++ 2 files changed, 169 insertions(+) create mode 100644 doc/developer/semantics/Mz/FilterFusion.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 03a7c3d5cdfa9..842b6a473054b 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -21,5 +21,6 @@ import Mz.Consolidate import Mz.Triple import Mz.Join import Mz.JoinPushdown +import Mz.FilterFusion import Mz.GroupBy import Mz.SetOps diff --git a/doc/developer/semantics/Mz/FilterFusion.lean b/doc/developer/semantics/Mz/FilterFusion.lean new file mode 100644 index 0000000000000..c9e582ed5db01 --- /dev/null +++ b/doc/developer/semantics/Mz/FilterFusion.lean @@ -0,0 +1,168 @@ +import Mz.UnifiedStream +import Mz.Boolean +import Mz.Expr +import Mz.Eval + +/-! 
+# Filter fusion + +The Rust optimizer's `fusion/filter.rs` pass collapses adjacent +filters: `filter p ∘ filter q ↝ filter (q ∧ p)`. The denotational +statement holds under a row-level err-freedom side condition: +neither predicate may evaluate to `.err _` on any data row of the +input stream. + +The side condition is forced by an interaction between filter's +err-promotion rule (a predicate `err` on a `.row r` record routes +the err into the carrier, keeping the record) and `evalAnd`'s +clause ordering (a `.bool false` argument absorbs everything, +including `.err`). Without err-freedom the fusion fails on, e.g., +`(eval r q, eval r p) = (.err e, .bool false)`: `filter q` first +produces `(.err e, _)`, which `filter p` keeps via the err-carrier +arm; whereas `filter (.and q p)` reduces to `.bool false` and +drops the record. + +Carrier-err records (`.err`) and collection-err records +(`.error` diff) flow through both pipelines unchanged — the err- +freedom hypothesis is only required at `.row` records. -/ + +namespace Mz + +/-- Predicate err-freedom on a stream's data rows. A predicate is +*data-err-free* on `us` when, for every `.row r` carrier in `us`, +`eval r e` is not an `.err _`. This is the precise hypothesis the +filter-fusion theorem needs at the `.row` arm: it rules out the +ordering mismatch between filter's err-promotion and `evalAnd`'s +`.bool false`-absorbs-everything clause. 
-/ +def UnifiedStream.predNoRowErr (e : Expr) (us : UnifiedStream) : Prop := + ∀ ud ∈ us, ∀ r, ud.1 = UnifiedRow.row r → ∀ ev, eval r e ≠ Datum.err ev + +theorem UnifiedStream.predNoRowErr.tail {e : Expr} + {hd : UnifiedRow × DiffWithError Int} {tl : UnifiedStream} + (h : UnifiedStream.predNoRowErr e (hd :: tl)) : + UnifiedStream.predNoRowErr e tl := + fun ud hMem => h ud (List.mem_cons_of_mem _ hMem) + +theorem UnifiedStream.predNoRowErr.head {e : Expr} + {hd : UnifiedRow × DiffWithError Int} {tl : UnifiedStream} + (h : UnifiedStream.predNoRowErr e (hd :: tl)) : + ∀ r, hd.1 = UnifiedRow.row r → ∀ ev, eval r e ≠ Datum.err ev := + h hd List.mem_cons_self + +/-! ## Per-record fusion at a data row + +Single-record filter pipelines line up with `filter (.and q p)` +when err-freedom holds at the row. The proof walks every non-err +`Datum` shape `eval r q` can produce and, inside each of those +arms, every non-err shape `eval r p` can produce. -/ + +/-- Filter reduces on a `.row` singleton to a `match eval r _`. 
-/ +private theorem filter_row_singleton (e : Expr) (r : Row) (n : Int) : + UnifiedStream.filter e [(UnifiedRow.row r, DiffWithError.val n)] + = (match eval r e with + | .bool true => [(UnifiedRow.row r, DiffWithError.val n)] + | .err ev => [(UnifiedRow.err ev, DiffWithError.val n)] + | _ => []) := by + show (match eval r e with + | .bool true => [(UnifiedRow.row r, DiffWithError.val n)] + | .err ev => [(UnifiedRow.err ev, DiffWithError.val n)] + | _ => []) ++ [] = _ + rw [List.append_nil] + +private theorem filter_fusion_row + (q p : Expr) (r : Row) (n : Int) + (hQ : ∀ ev, eval r q ≠ Datum.err ev) + (hP : ∀ ev, eval r p ≠ Datum.err ev) : + UnifiedStream.filter p + (UnifiedStream.filter q [(UnifiedRow.row r, DiffWithError.val n)]) + = UnifiedStream.filter (Expr.and q p) + [(UnifiedRow.row r, DiffWithError.val n)] := by + rw [filter_row_singleton q r n, filter_row_singleton (Expr.and q p) r n] + have hEvalAnd : eval r (Expr.and q p) = evalAnd (eval r q) (eval r p) := by + simp only [eval] + rw [hEvalAnd] + cases hQ' : eval r q with + | err e => exact absurd hQ' (hQ e) + | bool b => + cases b with + | true => + -- LHS becomes `filter p [(.row r, .val n)]`. + rw [filter_row_singleton p r n] + cases hP' : eval r p with + | err e => exact absurd hP' (hP e) + | bool b' => cases b' with | true => rfl | false => rfl + | null => rfl + | int _ => rfl + | false => + -- LHS becomes `filter p []` which is `[]`. + -- RHS match on `evalAnd .bool false _ = .bool false` → `[]`. + cases hP' : eval r p with + | err e => exact absurd hP' (hP e) + | bool _ => rfl + | null => rfl + | int _ => rfl + | null => + cases hP' : eval r p with + | err e => exact absurd hP' (hP e) + | bool b => + cases b with + | true => rfl + | false => rfl + | null => rfl + | int _ => rfl + | int k => + cases hP' : eval r p with + | err e => exact absurd hP' (hP e) + | bool b => + cases b with + | true => + -- `evalAnd (.int k) (.bool true) = .int k`; match drops via `_`. 
+ rfl + | false => rfl + | null => rfl + | int m => + by_cases hKM : k = m + · have hEA : evalAnd (Datum.int k) (Datum.int m) = Datum.int k := by + show (if k = m then Datum.int k else Datum.null) = Datum.int k + rw [if_pos hKM] + rw [hEA]; rfl + · have hEA : evalAnd (Datum.int k) (Datum.int m) = Datum.null := by + show (if k = m then Datum.int k else Datum.null) = Datum.null + rw [if_neg hKM] + rw [hEA]; rfl + +/-! ## Main fusion theorem -/ + +/-- Adjacent filters fuse: `filter p ∘ filter q = filter (.and q p)` +when neither predicate triggers an `.err` on any data row of the +input stream. The hypothesis is sharp — see file docstring for the +ordering corner where err-freedom is required. -/ +theorem UnifiedStream.filter_filter_fuse + (q p : Expr) (us : UnifiedStream) + (hQ : UnifiedStream.predNoRowErr q us) + (hP : UnifiedStream.predNoRowErr p us) : + UnifiedStream.filter p (UnifiedStream.filter q us) + = UnifiedStream.filter (Expr.and q p) us := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hTlQ : UnifiedStream.predNoRowErr q tl := hQ.tail + have hTlP : UnifiedStream.predNoRowErr p tl := hP.tail + have hConsAsApp : ((uc, d) :: tl : UnifiedStream) = [(uc, d)] ++ tl := rfl + rw [hConsAsApp, UnifiedStream.filter_append, + UnifiedStream.filter_append, UnifiedStream.filter_append, ih hTlQ hTlP] + congr 1 + cases d with + | error => rfl + | val n => + cases uc with + | err e => rfl + | row r => + have hQr : ∀ ev, eval r q ≠ Datum.err ev := + hQ.head r rfl + have hPr : ∀ ev, eval r p ≠ Datum.err ev := + hP.head r rfl + exact filter_fusion_row q p r n hQr hPr + +end Mz From 89e2236202e5d619bbda6d929f98b80b20995a09 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 22:39:15 +0200 Subject: [PATCH 107/127] doc/semantics: transforms.md, filter fusion shipped Move fusion/filter.rs from Modelable to Modeled, cite filter_filter_fuse in Mz/FilterFusion.lean. Rotate priority list: project fusion now top of queue. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/transforms.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/developer/semantics/transforms.md b/doc/developer/semantics/transforms.md index c28fd0795ec32..9861d796325b3 100644 --- a/doc/developer/semantics/transforms.md +++ b/doc/developer/semantics/transforms.md @@ -346,12 +346,12 @@ Status legend: | `fusion/negate.rs` (Negate fusion) | `negate_negate` (involution) | | `fusion/join.rs` (Join fusion / associativity) | `cross_assoc` | | `union_cancel.rs` (partial) | `consolidate (unionAll a (negate a))` reduces to `.val 0` records via diff arithmetic; no theorem yet, but ingredients in place | +| `fusion/filter.rs` (filter ∘ filter) | `UnifiedStream.filter_filter_fuse` in `Mz/FilterFusion.lean`. Holds under per-row err-freedom (`predNoRowErr`). Excludes the `err`-on-left + `evalAnd` ordering corner. | ### Algebraic rewrites — modelable (worth shipping) | Rust pass | Lean approach | | --- | --- | -| `fusion/filter.rs` (filter ∘ filter) | `filter p ∘ filter q = filter (p ∧ q)`. Holds under `evalAnd` if we exclude the `err`-on-left + `false`-on-right corner. State with err-free / null-free hypothesis, or use a stratified `andErrStrict` variant. | | `fusion/map.rs` (map fusion) | `project es ∘ project es' = project (es' ∘ es)`. Uses `Expr.subst` and `eval_subst` (already exist in `Mz/Pushdown.lean`); needs a `UnifiedStream`-level statement. | | `fusion/project.rs` / `movement/projection_lifting.rs` / `projection_pushdown.rs` | We have `project_unionAll`. Add `project_filter` (commutes when no scalar errors collide with predicate), `project_cross_pushdown` (push project through cross when columns split cleanly). | | `threshold_elision.rs` | `clampPositive` is a no-op when every diff is already `.val n > 0`. Lemma: `clampPositive us = us` under `∀ rec ∈ us, ∃ n > 0, rec.2 = .val n`. 
| @@ -394,10 +394,10 @@ These need a new operator or analysis before they can be expressed. If a single pass should be modeled next, the highest-value candidates by API consumption density: -1. **`fusion/filter.rs` (filter ∘ filter)** — direct equational rewrite, frequently quoted by the optimizer. -2. **`fusion/map.rs` (project ∘ project)** — uses existing `Expr.subst` machinery; would also document substitution at the relation level. -3. **`threshold_elision.rs`** — small theorem (`clampPositive` is identity on positive-only streams); good warm-up before introducing `Reduce`. -4. **`semijoin_idempotence.rs`** — distinct + cross + project commutation; uses `clampToOne_idem` already in `Mz/SetOps.lean`. +1. **`fusion/map.rs` (project ∘ project)** — uses existing `Expr.subst` machinery; would also document substitution at the relation level. +2. **`threshold_elision.rs`** — small theorem (`clampPositive` is identity on positive-only streams); good warm-up before introducing `Reduce`. +3. **`semijoin_idempotence.rs`** — distinct + cross + project commutation; uses `clampToOne_idem` already in `Mz/SetOps.lean`. +4. **`non_null_requirements.rs`** — lift `Strict.lean` propagation classes to `UnifiedStream.filter` to characterize drop vs promote. Beyond those, the cluster `{Reduce + reduce_elision + reduce_reduction + reduction_pushdown}` is the largest dependency gap. A `UnifiedStream.reduce` operator would unlock four passes plus the GroupBy semantics already partially in `Mz/GroupBy.lean`. From 6f8f367da97b0a4c0b6bdd5d4ac1b860a49d5929 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 22:40:27 +0200 Subject: [PATCH 108/127] doc/semantics: threshold_elision MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit clampPositive_id_of_positive: clampPositive is identity on streams where every record's diff is .error or .val n with n > 0. 
Models threshold_elision.rs — drop the post-pass when input is already sign-normalized. transforms.md: move to Modeled, rotate priority list. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 21 +++++++++++++++++++++ doc/developer/semantics/transforms.md | 8 ++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 2d129afbcb95b..87663c52c7df1 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -618,6 +618,27 @@ theorem UnifiedStream.clampPositive_no_error unfold UnifiedStream.clampPositive at hMem exact h x (List.mem_filter.mp hMem).1 +/-- Threshold elision: `clampPositive` is the identity on streams +where every `.val` diff is already strictly positive (`.error` +diffs pass through `clampPositive` unconditionally, so they need +no hypothesis). Models the Rust optimizer's `threshold_elision.rs` +pass: drop the `clampPositive` post-pass when the input is already +sign-normalized. -/ +theorem UnifiedStream.clampPositive_id_of_positive + (us : UnifiedStream) + (h : ∀ ud ∈ us, ud.2 = DiffWithError.error + ∨ ∃ n : Int, ud.2 = DiffWithError.val n ∧ 0 < n) : + UnifiedStream.clampPositive us = us := by + unfold UnifiedStream.clampPositive + apply List.filter_eq_self.mpr + intro ud hMem + show isPositiveDiff ud.2 = true + rcases h ud hMem with hErr | ⟨n, hN, hPos⟩ + · rw [hErr]; rfl + · rw [hN] + show decide (0 < n) = true + exact decide_eq_true hPos + /-- The output of `clampPositive` never contains a `.val n` with `n ≤ 0`. Equivalently, every surviving `.val` diff is strictly positive. 
-/ diff --git a/doc/developer/semantics/transforms.md b/doc/developer/semantics/transforms.md index 9861d796325b3..dec5602ec6960 100644 --- a/doc/developer/semantics/transforms.md +++ b/doc/developer/semantics/transforms.md @@ -347,6 +347,7 @@ Status legend: | `fusion/join.rs` (Join fusion / associativity) | `cross_assoc` | | `union_cancel.rs` (partial) | `consolidate (unionAll a (negate a))` reduces to `.val 0` records via diff arithmetic; no theorem yet, but ingredients in place | | `fusion/filter.rs` (filter ∘ filter) | `UnifiedStream.filter_filter_fuse` in `Mz/FilterFusion.lean`. Holds under per-row err-freedom (`predNoRowErr`). Excludes the `err`-on-left + `evalAnd` ordering corner. | +| `threshold_elision.rs` | `UnifiedStream.clampPositive_id_of_positive` in `Mz/SetOps.lean`. `clampPositive` is identity when every record's diff is `.error` or a strictly-positive `.val`. | ### Algebraic rewrites — modelable (worth shipping) @@ -354,7 +355,6 @@ Status legend: | --- | --- | | `fusion/map.rs` (map fusion) | `project es ∘ project es' = project (es' ∘ es)`. Uses `Expr.subst` and `eval_subst` (already exist in `Mz/Pushdown.lean`); needs a `UnifiedStream`-level statement. | | `fusion/project.rs` / `movement/projection_lifting.rs` / `projection_pushdown.rs` | We have `project_unionAll`. Add `project_filter` (commutes when no scalar errors collide with predicate), `project_cross_pushdown` (push project through cross when columns split cleanly). | -| `threshold_elision.rs` | `clampPositive` is a no-op when every diff is already `.val n > 0`. Lemma: `clampPositive us = us` under `∀ rec ∈ us, ∃ n > 0, rec.2 = .val n`. | | `redundant_join.rs` (distinct + join) | Express `distinct` + `cross` commutation when right side is already key-unique. Requires `intersectAll`-style lookup invariants we already have. | | `semijoin_idempotence.rs` (partial) | A semijoin is `cross` + project + distinct. Idempotence via `distinct_idem` (provable; we have `clampToOne_idem`). 
| | `non_null_requirements.rs` (model the strict-null laws) | We already have `evalAnd` / `evalOr` / arithmetic err-/null-strictness. State as `NullPropagatingBinary` / `ErrPropagatingBinary` instances; some exist in `Mz/Strict.lean`. Lift to `UnifiedStream.filter` to characterize when predicates drop vs promote. | @@ -395,9 +395,9 @@ These need a new operator or analysis before they can be expressed. If a single pass should be modeled next, the highest-value candidates by API consumption density: 1. **`fusion/map.rs` (project ∘ project)** — uses existing `Expr.subst` machinery; would also document substitution at the relation level. -2. **`threshold_elision.rs`** — small theorem (`clampPositive` is identity on positive-only streams); good warm-up before introducing `Reduce`. -3. **`semijoin_idempotence.rs`** — distinct + cross + project commutation; uses `clampToOne_idem` already in `Mz/SetOps.lean`. -4. **`non_null_requirements.rs`** — lift `Strict.lean` propagation classes to `UnifiedStream.filter` to characterize drop vs promote. +2. **`semijoin_idempotence.rs`** — distinct + cross + project commutation; uses `clampToOne_idem` already in `Mz/SetOps.lean`. +3. **`non_null_requirements.rs`** — lift `Strict.lean` propagation classes to `UnifiedStream.filter` to characterize drop vs promote. +4. **`demand.rs`** — uses `colReferencesUnused` + `eval_replaceAt_of_unused`; lift to a `UnifiedStream`-level replacement-invariance theorem. Beyond those, the cluster `{Reduce + reduce_elision + reduce_reduction + reduction_pushdown}` is the largest dependency gap. A `UnifiedStream.reduce` operator would unlock four passes plus the GroupBy semantics already partially in `Mz/GroupBy.lean`. 
From d08e791859cb74e94700f6d32a9ad8d911941eb7 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 22:57:05 +0200 Subject: [PATCH 109/127] =?UTF-8?q?doc/semantics:=20project=20=E2=88=98=20?= =?UTF-8?q?project=20fusion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `project es' ∘ project es = project (es'.map (·.subst es))` when `es` is safe on every data row of the input. Side condition is sharp: if `es` errors on a row, step 1 emits one err record per erroring scalar of `es`; the fused form instead emits one err record per `e' ∈ es'` whose substituted form errors. Those err sets only agree in the safe-input case. Row-level fusion lemma `rowProjectRecords_substList` is unconditional; the stream-level theorem `project_project_fuse` lifts it under `projsAllSafe`. Bridges via `eval_subst` from `Mz/Pushdown.lean`. Mirrors fusion/map.rs in the Rust optimizer. New module Mz/ProjectFusion.lean. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/ProjectFusion.lean | 150 ++++++++++++++++++ 2 files changed, 151 insertions(+) create mode 100644 doc/developer/semantics/Mz/ProjectFusion.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 842b6a473054b..e1442798e13d8 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -22,5 +22,6 @@ import Mz.Triple import Mz.Join import Mz.JoinPushdown import Mz.FilterFusion +import Mz.ProjectFusion import Mz.GroupBy import Mz.SetOps diff --git a/doc/developer/semantics/Mz/ProjectFusion.lean b/doc/developer/semantics/Mz/ProjectFusion.lean new file mode 100644 index 0000000000000..70e41cd16b5c8 --- /dev/null +++ b/doc/developer/semantics/Mz/ProjectFusion.lean @@ -0,0 +1,150 @@ +import Mz.UnifiedStream +import Mz.Pushdown + +/-! 
+# Project fusion + +The Rust optimizer's `fusion/map.rs` pass collapses adjacent +projections: `project es' ∘ project es ↝ project (es'[es])`, where +`e' ∈ es'` is rewritten by substituting each `col i` in `e'` with +`es[i]`. + +The denotational statement holds when every projection scalar in +`es` succeeds on every data row of the input (`rowAllSafe es r = +true`). The hypothesis is sharp: if `es` errors on some row, +`project es` emits one err record per erroring scalar of `es`; +the fused `project (es'.map (·.subst es))` instead emits one err +record per `e' ∈ es'` whose substituted form errors. Those err +sets agree only in the safe-input case. + +Carrier-err records (`.err`) and collection-err records (`.error` +diff) flow through both pipelines unchanged — the safety +hypothesis is only required at `.row` records. + +Substitution machinery (`Expr.subst`, `eval_subst`) is imported +from `Mz/Pushdown.lean`. -/ + +namespace Mz + +/-- Safety of `es` on every data row of `us`. The exact hypothesis +the projection-fusion theorem needs at the `.row` arm. -/ +def UnifiedStream.projsAllSafe (es : List Expr) (us : UnifiedStream) : Prop := + ∀ ud ∈ us, ∀ r, ud.1 = UnifiedRow.row r → rowAllSafe es r = true + +theorem UnifiedStream.projsAllSafe.tail {es : List Expr} + {hd : UnifiedRow × DiffWithError Int} {tl : UnifiedStream} + (h : UnifiedStream.projsAllSafe es (hd :: tl)) : + UnifiedStream.projsAllSafe es tl := + fun ud hMem => h ud (List.mem_cons_of_mem _ hMem) + +theorem UnifiedStream.projsAllSafe.head {es : List Expr} + {hd : UnifiedRow × DiffWithError Int} {tl : UnifiedStream} + (h : UnifiedStream.projsAllSafe es (hd :: tl)) : + ∀ r, hd.1 = UnifiedRow.row r → rowAllSafe es r = true := + h hd List.mem_cons_self + +/-! ## Substitution preserves the row-projection record set + +Under no hypothesis, projecting `es'` against the row produced by +projecting `es` first agrees with projecting the substituted +expression list `es'.map (·.subst es)` against the original row. 
+This is the row-level fusion law; the stream-level fusion lifts it +under the safety hypothesis. -/ + +private theorem rowAllSafe_substList (es es' : List Expr) (r : Row) : + rowAllSafe (es'.map (·.subst es)) r + = rowAllSafe es' (es.map (eval r)) := by + unfold rowAllSafe + rw [List.all_map] + congr 1 + funext e' + show (match eval r (e'.subst es) with | .err _ => false | _ => true) + = (match eval (es.map (eval r)) e' with | .err _ => false | _ => true) + rw [eval_subst r es e'] + +private theorem rowErrs_substList (es es' : List Expr) (r : Row) : + rowErrs (es'.map (·.subst es)) r + = rowErrs es' (es.map (eval r)) := by + unfold rowErrs + rw [List.filterMap_map] + congr 1 + funext e' + show (match eval r (e'.subst es) with | .err err => some err | _ => none) + = (match eval (es.map (eval r)) e' with | .err err => some err | _ => none) + rw [eval_subst r es e'] + +private theorem evalList_substList (es es' : List Expr) (r : Row) : + (es'.map (·.subst es)).map (eval r) = es'.map (eval (es.map (eval r))) := by + rw [List.map_map] + apply List.map_congr_left + intro e' _ + exact eval_subst r es e' + +private theorem rowProjectRecords_substList + (es es' : List Expr) (d : DiffWithError Int) (r : Row) : + rowProjectRecords (es'.map (·.subst es)) d r + = rowProjectRecords es' d (es.map (eval r)) := by + unfold rowProjectRecords + rw [rowAllSafe_substList, rowErrs_substList] + by_cases hSafe : rowAllSafe es' (es.map (eval r)) = true + · rw [if_pos hSafe, if_pos hSafe, evalList_substList] + · rw [if_neg hSafe, if_neg hSafe] + +/-! ## Main fusion theorem -/ + +/-- Adjacent projections fuse: `project es' ∘ project es = +project (es'.map (·.subst es))` when `es` is safe on every data +row of the input (no scalar in `es` errors). The hypothesis is +sharp — see file docstring for the err-set divergence. 
-/ +theorem UnifiedStream.project_project_fuse + (es es' : List Expr) (us : UnifiedStream) + (hSafe : UnifiedStream.projsAllSafe es us) : + UnifiedStream.project es' (UnifiedStream.project es us) + = UnifiedStream.project (es'.map (·.subst es)) us := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hTl : UnifiedStream.projsAllSafe es tl := hSafe.tail + have hConsAsApp : ((uc, d) :: tl : UnifiedStream) = [(uc, d)] ++ tl := rfl + rw [hConsAsApp, UnifiedStream.project_append, + UnifiedStream.project_append, UnifiedStream.project_append, + ih hTl] + congr 1 + cases d with + | error => + -- Both pipelines pass `.error`-diff through unchanged. + rfl + | val n => + cases uc with + | err e => + -- Both pipelines pass `.err` carrier through unchanged. + rfl + | row r => + have hHd : rowAllSafe es r = true := hSafe.head r rfl + -- Step 1: `project es [(.row r, .val n)]` reduces via the + -- safe branch to `[(.row (es.map (eval r)), .val n)]`. + have hStep1 : UnifiedStream.project es [(UnifiedRow.row r, DiffWithError.val n)] + = [(UnifiedRow.row (es.map (eval r)), DiffWithError.val n)] := by + show rowProjectRecords es (DiffWithError.val n) r ++ [] = _ + unfold rowProjectRecords + rw [if_pos hHd, List.append_nil] + -- Step 2: `project es'` of step 1 = rowProjectRecords es' against + -- the projected row. + have hStep2 : UnifiedStream.project es' + [(UnifiedRow.row (es.map (eval r)), DiffWithError.val n)] + = rowProjectRecords es' (DiffWithError.val n) + (es.map (eval r)) := by + show rowProjectRecords es' (DiffWithError.val n) (es.map (eval r)) ++ [] = _ + rw [List.append_nil] + -- Fused: same value via the row-level fusion lemma. 
+ have hFused : UnifiedStream.project (es'.map (·.subst es)) + [(UnifiedRow.row r, DiffWithError.val n)] + = rowProjectRecords (es'.map (·.subst es)) + (DiffWithError.val n) r := by + show rowProjectRecords (es'.map (·.subst es)) (DiffWithError.val n) r ++ [] = _ + rw [List.append_nil] + rw [hStep1, hStep2, hFused, + rowProjectRecords_substList es es' (DiffWithError.val n) r] + +end Mz From 7132f4db707731a9eac33ee4ec20a68e6c8531ff Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 22:57:35 +0200 Subject: [PATCH 110/127] doc/semantics: transforms.md, project fusion shipped Move fusion/map.rs from Modelable to Modeled. Rotate priority list: semijoin_idempotence now top. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/transforms.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/developer/semantics/transforms.md b/doc/developer/semantics/transforms.md index dec5602ec6960..ff001264aa1dc 100644 --- a/doc/developer/semantics/transforms.md +++ b/doc/developer/semantics/transforms.md @@ -348,12 +348,12 @@ Status legend: | `union_cancel.rs` (partial) | `consolidate (unionAll a (negate a))` reduces to `.val 0` records via diff arithmetic; no theorem yet, but ingredients in place | | `fusion/filter.rs` (filter ∘ filter) | `UnifiedStream.filter_filter_fuse` in `Mz/FilterFusion.lean`. Holds under per-row err-freedom (`predNoRowErr`). Excludes the `err`-on-left + `evalAnd` ordering corner. | | `threshold_elision.rs` | `UnifiedStream.clampPositive_id_of_positive` in `Mz/SetOps.lean`. `clampPositive` is identity when every record's diff is `.error` or a strictly-positive `.val`. | +| `fusion/map.rs` (project ∘ project) | `UnifiedStream.project_project_fuse` in `Mz/ProjectFusion.lean`. Holds under `projsAllSafe` (`es` is safe on every data row). Bridges via `eval_subst`. 
| ### Algebraic rewrites — modelable (worth shipping) | Rust pass | Lean approach | | --- | --- | -| `fusion/map.rs` (map fusion) | `project es ∘ project es' = project (es' ∘ es)`. Uses `Expr.subst` and `eval_subst` (already exist in `Mz/Pushdown.lean`); needs a `UnifiedStream`-level statement. | | `fusion/project.rs` / `movement/projection_lifting.rs` / `projection_pushdown.rs` | We have `project_unionAll`. Add `project_filter` (commutes when no scalar errors collide with predicate), `project_cross_pushdown` (push project through cross when columns split cleanly). | | `redundant_join.rs` (distinct + join) | Express `distinct` + `cross` commutation when right side is already key-unique. Requires `intersectAll`-style lookup invariants we already have. | | `semijoin_idempotence.rs` (partial) | A semijoin is `cross` + project + distinct. Idempotence via `distinct_idem` (provable; we have `clampToOne_idem`). | @@ -394,10 +394,10 @@ These need a new operator or analysis before they can be expressed. If a single pass should be modeled next, the highest-value candidates by API consumption density: -1. **`fusion/map.rs` (project ∘ project)** — uses existing `Expr.subst` machinery; would also document substitution at the relation level. -2. **`semijoin_idempotence.rs`** — distinct + cross + project commutation; uses `clampToOne_idem` already in `Mz/SetOps.lean`. -3. **`non_null_requirements.rs`** — lift `Strict.lean` propagation classes to `UnifiedStream.filter` to characterize drop vs promote. -4. **`demand.rs`** — uses `colReferencesUnused` + `eval_replaceAt_of_unused`; lift to a `UnifiedStream`-level replacement-invariance theorem. +1. **`semijoin_idempotence.rs`** — distinct + cross + project commutation; uses `clampToOne_idem` already in `Mz/SetOps.lean`. +2. **`non_null_requirements.rs`** — lift `Strict.lean` propagation classes to `UnifiedStream.filter` to characterize drop vs promote. +3. 
**`demand.rs`** — uses `colReferencesUnused` + `eval_replaceAt_of_unused`; lift to a `UnifiedStream`-level replacement-invariance theorem. +4. **`redundant_join.rs`** — distinct + cross commutation when right side is key-unique; uses existing intersect/lookup invariants. Beyond those, the cluster `{Reduce + reduce_elision + reduce_reduction + reduction_pushdown}` is the largest dependency gap. A `UnifiedStream.reduce` operator would unlock four passes plus the GroupBy semantics already partially in `Mz/GroupBy.lean`. From 84483834967cbf1aeea7886d56127e0c9560d40f Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 23:03:43 +0200 Subject: [PATCH 111/127] doc/semantics: demand, unused-column invariance for filter `UnifiedStream.filter_replaceAtRow_of_unused`: filter and `replaceAtRow n v` commute when the predicate doesn't reference column `n`. Lifts `eval_replaceAt_of_unused` from `Mz/ColRefs.lean` to the stream level. Models the row-side of demand.rs: a column that nothing downstream reads can be overwritten arbitrarily without changing the operator's output. New module Mz/Demand.lean. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/Demand.lean | 100 +++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 doc/developer/semantics/Mz/Demand.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index e1442798e13d8..240c50a69309a 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -23,5 +23,6 @@ import Mz.Join import Mz.JoinPushdown import Mz.FilterFusion import Mz.ProjectFusion +import Mz.Demand import Mz.GroupBy import Mz.SetOps diff --git a/doc/developer/semantics/Mz/Demand.lean b/doc/developer/semantics/Mz/Demand.lean new file mode 100644 index 0000000000000..15f2fb21e3492 --- /dev/null +++ b/doc/developer/semantics/Mz/Demand.lean @@ -0,0 +1,100 @@ +import Mz.UnifiedStream +import Mz.ColRefs + +/-! +# Demand: unused-column invariance + +The Rust optimizer's `demand.rs` pass tracks which columns are +required downstream and rewrites the plan to avoid materializing +unused columns. The denotational counterpart: if a column is +unused by an operator, replacing the column with any value in +every row of the input leaves the operator output unchanged +(modulo the same replacement on the output). + +`Mz/ColRefs.lean` supplies the row-level invariance +(`eval_replaceAt_of_unused`). This file lifts it to +`UnifiedStream` for the two operators that consume a row at the +predicate / expression level: `filter` and `project`. + +`replaceAtRow n v` is the column-`n` substitution applied to every +`.row` carrier in the stream. `.err` carriers and `.error` diffs +flow through unchanged. -/ + +namespace Mz + +/-- Replace column `n` of every `.row` carrier in `us` with `v`. +`.err` carriers pass through unchanged; the diff is preserved on +every record. 
-/ +def UnifiedStream.replaceAtRow (n : Nat) (v : Datum) (us : UnifiedStream) : UnifiedStream := + us.map fun ud => match ud.1 with + | UnifiedRow.row r => (UnifiedRow.row (Env.replaceAt r n v), ud.2) + | UnifiedRow.err _ => ud + +theorem UnifiedStream.replaceAtRow_nil (n : Nat) (v : Datum) : + UnifiedStream.replaceAtRow n v [] = [] := rfl + +theorem UnifiedStream.replaceAtRow_append + (n : Nat) (v : Datum) (a b : UnifiedStream) : + UnifiedStream.replaceAtRow n v (a ++ b) + = UnifiedStream.replaceAtRow n v a + ++ UnifiedStream.replaceAtRow n v b := by + unfold UnifiedStream.replaceAtRow + exact List.map_append + +/-! ## Filter invariance under unused-column replacement -/ + +/-- Filter commutes with `replaceAtRow n v` when the predicate +does not reference column `n`. Replacing the unused column on the +input then filtering equals filtering then replacing on the +output. Models `demand.rs`: an unused column is free to be +overwritten without affecting the filter result. -/ +theorem UnifiedStream.filter_replaceAtRow_of_unused + (pred : Expr) (n : Nat) (v : Datum) (us : UnifiedStream) + (h : pred.colReferencesUnused n = true) : + UnifiedStream.filter pred (UnifiedStream.replaceAtRow n v us) + = UnifiedStream.replaceAtRow n v (UnifiedStream.filter pred us) := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hConsAsApp : ((uc, d) :: tl : UnifiedStream) = [(uc, d)] ++ tl := rfl + rw [hConsAsApp, UnifiedStream.replaceAtRow_append, + UnifiedStream.filter_append, UnifiedStream.filter_append, + UnifiedStream.replaceAtRow_append, ih] + congr 1 + cases d with + | error => + -- `.error` diff: filter keeps; replaceAtRow ignores diff; + -- carrier handled identically on both pipelines. + cases uc with + | row r => rfl + | err _ => rfl + | val n' => + cases uc with + | err _ => rfl + | row r => + -- Both pipelines reduce to `(match eval … pred with …) ++ []`. + -- `eval_replaceAt_of_unused` bridges the inner match. 
+ have hEval : eval (Env.replaceAt r n v) pred = eval r pred := + eval_replaceAt_of_unused r n v pred h + show (match eval (Env.replaceAt r n v) pred with + | .bool true => [(UnifiedRow.row (Env.replaceAt r n v), + DiffWithError.val n')] + | .err e => [(UnifiedRow.err e, DiffWithError.val n')] + | _ => []) ++ [] + = UnifiedStream.replaceAtRow n v + ((match eval r pred with + | .bool true => [(UnifiedRow.row r, DiffWithError.val n')] + | .err e => [(UnifiedRow.err e, DiffWithError.val n')] + | _ => []) ++ []) + rw [hEval] + cases eval r pred with + | bool b => + cases b with + | true => rfl + | false => rfl + | err _ => rfl + | int _ => rfl + | null => rfl + +end Mz From 005c4d9e571aa05583298b929ba6106a34f79b27 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 23:07:10 +0200 Subject: [PATCH 112/127] doc/semantics: demand for project under IsPureData MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `project_replaceAtRow_eq_of_unused`: when every expression in `es` has column `n` unused and the input is pure data, replacing column `n` of every input row leaves the project output equal. Asymmetry vs filter: filter preserves the row content of surviving records, so demand admits replacement on both input and output sides; project rewrites row content via `es.map (eval r)`, so column `n` of the output is unrelated to column `n` of the input, and the simpler invariance form requires ruling out `.err`/ `.error` passthrough — hence the `IsPureData` hypothesis. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/Demand.lean | 97 ++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/doc/developer/semantics/Mz/Demand.lean b/doc/developer/semantics/Mz/Demand.lean index 15f2fb21e3492..ad23caea3d912 100644 --- a/doc/developer/semantics/Mz/Demand.lean +++ b/doc/developer/semantics/Mz/Demand.lean @@ -1,5 +1,6 @@ import Mz.UnifiedStream import Mz.ColRefs +import Mz.JoinPushdown /-! # Demand: unused-column invariance @@ -97,4 +98,100 @@ theorem UnifiedStream.filter_replaceAtRow_of_unused | int _ => rfl | null => rfl +/-! ## Project invariance under unused-column replacement -/ + +/-- Every expression in `es` has column `n` unused. -/ +def Expr.argsColRefUnusedList (n : Nat) (es : List Expr) : Prop := + ∀ e ∈ es, e.colReferencesUnused n = true + +private theorem rowAllSafe_replaceAt_of_unused + (es : List Expr) (n : Nat) (v : Datum) (r : Row) + (h : Expr.argsColRefUnusedList n es) : + rowAllSafe es (Env.replaceAt r n v) = rowAllSafe es r := by + induction es with + | nil => rfl + | cons hd tl ih => + have hHd : hd.colReferencesUnused n = true := h hd List.mem_cons_self + have hTl : Expr.argsColRefUnusedList n tl := + fun e hMem => h e (List.mem_cons_of_mem _ hMem) + have hEval : eval (Env.replaceAt r n v) hd = eval r hd := + eval_replaceAt_of_unused r n v hd hHd + unfold rowAllSafe at ih ⊢ + rw [List.all_cons, List.all_cons, hEval, ih hTl] + +private theorem rowErrs_replaceAt_of_unused + (es : List Expr) (n : Nat) (v : Datum) (r : Row) + (h : Expr.argsColRefUnusedList n es) : + rowErrs es (Env.replaceAt r n v) = rowErrs es r := by + induction es with + | nil => rfl + | cons hd tl ih => + have hHd : hd.colReferencesUnused n = true := h hd List.mem_cons_self + have hTl : Expr.argsColRefUnusedList n tl := + fun e hMem => h e (List.mem_cons_of_mem _ hMem) + have hEval : eval (Env.replaceAt r n v) hd = eval r hd := + eval_replaceAt_of_unused r n v hd hHd + unfold rowErrs at ih ⊢ + rw 
[List.filterMap_cons, List.filterMap_cons, hEval, ih hTl] + +private theorem evalMap_replaceAt_of_unused + (es : List Expr) (n : Nat) (v : Datum) (r : Row) + (h : Expr.argsColRefUnusedList n es) : + es.map (eval (Env.replaceAt r n v)) = es.map (eval r) := by + induction es with + | nil => rfl + | cons hd tl ih => + have hHd : hd.colReferencesUnused n = true := h hd List.mem_cons_self + have hTl : Expr.argsColRefUnusedList n tl := + fun e hMem => h e (List.mem_cons_of_mem _ hMem) + have hEval : eval (Env.replaceAt r n v) hd = eval r hd := + eval_replaceAt_of_unused r n v hd hHd + show eval (Env.replaceAt r n v) hd :: tl.map _ + = eval r hd :: tl.map _ + rw [hEval, ih hTl] + +/-- Project is invariant under input replacement of an unused +column, when the input is pure data (no `.err` carriers, no +`.error` diffs). The hypothesis rules out the err / error +passthrough that *would* preserve the replaced carrier on the +output and break the simpler invariance form. + +This is the project-side counterpart of +`filter_replaceAtRow_of_unused`. The reason the filter version +admits replacement on both sides (no purity needed) and this one +doesn't: filter preserves the row content of surviving records, +so column `n` of the output equals column `n` of the input; +project rewrites the row content via `es.map (eval r)`, so column +`n` of the output is unrelated to column `n` of the input. 
-/ +theorem UnifiedStream.project_replaceAtRow_eq_of_unused + (es : List Expr) (n : Nat) (v : Datum) (us : UnifiedStream) + (hPure : UnifiedStream.IsPureData us) + (h : Expr.argsColRefUnusedList n es) : + UnifiedStream.project es (UnifiedStream.replaceAtRow n v us) + = UnifiedStream.project es us := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hTlPure : UnifiedStream.IsPureData tl := hPure.tail + obtain ⟨⟨r, hUc⟩, ⟨n', hD⟩⟩ := hPure.head + subst hUc; subst hD + have hConsAsApp : ((UnifiedRow.row r, DiffWithError.val n') :: tl : UnifiedStream) + = [(UnifiedRow.row r, DiffWithError.val n')] ++ tl := rfl + rw [hConsAsApp, UnifiedStream.replaceAtRow_append, + UnifiedStream.project_append, UnifiedStream.project_append, + ih hTlPure] + congr 1 + -- LHS: project es [(.row (Env.replaceAt r n v), .val n')]. + -- RHS: project es [(.row r, .val n')]. Bottoms out at + -- `rowProjectRecords es (.val n') (...)`. The two agree under + -- the unused-column hypothesis. + show rowProjectRecords es (DiffWithError.val n') (Env.replaceAt r n v) ++ [] + = rowProjectRecords es (DiffWithError.val n') r ++ [] + rw [List.append_nil, List.append_nil] + unfold rowProjectRecords + rw [rowAllSafe_replaceAt_of_unused es n v r h, + rowErrs_replaceAt_of_unused es n v r h, + evalMap_replaceAt_of_unused es n v r h] + end Mz From deca34ea81e37373d6159a5c06cc5264411bd34e Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 23:07:54 +0200 Subject: [PATCH 113/127] doc/semantics: transforms.md, demand shipped Move demand.rs from Modelable to Modeled. Cite both filter_replaceAtRow_of_unused (full input) and project_replaceAtRow_eq_of_unused (under IsPureData). Rotate priority list; canonicalize_mfp now reachable on top of three fusion theorems. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/transforms.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/developer/semantics/transforms.md b/doc/developer/semantics/transforms.md index ff001264aa1dc..5699cdff9b94a 100644 --- a/doc/developer/semantics/transforms.md +++ b/doc/developer/semantics/transforms.md @@ -349,6 +349,7 @@ Status legend: | `fusion/filter.rs` (filter ∘ filter) | `UnifiedStream.filter_filter_fuse` in `Mz/FilterFusion.lean`. Holds under per-row err-freedom (`predNoRowErr`). Excludes the `err`-on-left + `evalAnd` ordering corner. | | `threshold_elision.rs` | `UnifiedStream.clampPositive_id_of_positive` in `Mz/SetOps.lean`. `clampPositive` is identity when every record's diff is `.error` or a strictly-positive `.val`. | | `fusion/map.rs` (project ∘ project) | `UnifiedStream.project_project_fuse` in `Mz/ProjectFusion.lean`. Holds under `projsAllSafe` (`es` is safe on every data row). Bridges via `eval_subst`. | +| `demand.rs` | `filter_replaceAtRow_of_unused` (any input) + `project_replaceAtRow_eq_of_unused` (under `IsPureData`) in `Mz/Demand.lean`. Lifts `eval_replaceAt_of_unused` to the stream level. | ### Algebraic rewrites — modelable (worth shipping) @@ -358,7 +359,6 @@ Status legend: | `redundant_join.rs` (distinct + join) | Express `distinct` + `cross` commutation when right side is already key-unique. Requires `intersectAll`-style lookup invariants we already have. | | `semijoin_idempotence.rs` (partial) | A semijoin is `cross` + project + distinct. Idempotence via `distinct_idem` (provable; we have `clampToOne_idem`). | | `non_null_requirements.rs` (model the strict-null laws) | We already have `evalAnd` / `evalOr` / arithmetic err-/null-strictness. State as `NullPropagatingBinary` / `ErrPropagatingBinary` instances; some exist in `Mz/Strict.lean`. Lift to `UnifiedStream.filter` to characterize when predicates drop vs promote. 
| -| `demand.rs` (column-projection analysis) | We have `colReferencesBoundedBy`, `colReferencesUnused`, `eval_replaceAt_of_unused` in `Mz/ColRefs.lean`. Add a `UnifiedStream`-level theorem: replacing unused columns in every row leaves the operator output equal. | | `canonicalize_mfp.rs` | Establish a canonical form `Project ∘ Filter ∘ Map` and prove every MFP-like composition has a unique canonical equivalent. Needs Map (`UnifiedStream.project` is the analog of MapFilterProject's Project part; we don't have an MFP wrapper). | | `equivalence_propagation.rs` (use sites only) | The *use* of equivalence is `if a = b then replace a with b`. With a proved `evalEq` characterization we can show `filter (a = b) us` preserves a row iff substituting `b` for `a` in the rest of the predicate gives the same evaluation. Substitution machinery is in `Expr.subst`. | @@ -396,8 +396,8 @@ If a single pass should be modeled next, the highest-value candidates by API con 1. **`semijoin_idempotence.rs`** — distinct + cross + project commutation; uses `clampToOne_idem` already in `Mz/SetOps.lean`. 2. **`non_null_requirements.rs`** — lift `Strict.lean` propagation classes to `UnifiedStream.filter` to characterize drop vs promote. -3. **`demand.rs`** — uses `colReferencesUnused` + `eval_replaceAt_of_unused`; lift to a `UnifiedStream`-level replacement-invariance theorem. -4. **`redundant_join.rs`** — distinct + cross commutation when right side is key-unique; uses existing intersect/lookup invariants. +3. **`redundant_join.rs`** — distinct + cross commutation when right side is key-unique; uses existing intersect/lookup invariants. +4. **`canonicalize_mfp.rs`** — canonical `project ∘ filter ∘ map` form. Builds on the three fusion theorems already shipped (filter, map, predicate pushdown). Beyond those, the cluster `{Reduce + reduce_elision + reduce_reduction + reduction_pushdown}` is the largest dependency gap. 
A `UnifiedStream.reduce` operator would unlock four passes plus the GroupBy semantics already partially in `Mz/GroupBy.lean`. From c800d0422dd97b7796e709c8d93ef8f5a6d2ba15 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 23:10:24 +0200 Subject: [PATCH 114/127] doc/semantics: filter idempotence (unconditional) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `UnifiedStream.filter_idem`: filter pred (filter pred us) = filter pred us. Holds without err-freedom — when both filters share a predicate, the err-promotion / evalAnd ordering mismatch that forces predNoRowErr in filter_filter_fuse cannot arise. Special case of filter_filter_fuse + evalAnd_idem, but proved directly to avoid the unnecessary hypothesis. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/FilterFusion.lean | 53 ++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/doc/developer/semantics/Mz/FilterFusion.lean b/doc/developer/semantics/Mz/FilterFusion.lean index c9e582ed5db01..79e994dc9b97c 100644 --- a/doc/developer/semantics/Mz/FilterFusion.lean +++ b/doc/developer/semantics/Mz/FilterFusion.lean @@ -165,4 +165,57 @@ theorem UnifiedStream.filter_filter_fuse hP.head r rfl exact filter_fusion_row q p r n hQr hPr +/-! ## Idempotence (no hypothesis required) + +The general fusion theorem needs err-freedom because `evalAnd`'s +clause ordering can disagree with filter's err-promotion when the +two predicates differ. For the special case `q = p`, the two +pipelines bottom out at the same `eval r p` value, so the err +case lines up and no err-freedom hypothesis is needed. -/ + +/-- Filter is idempotent: applying the same predicate twice equals +applying it once. Holds unconditionally — unlike `filter_filter_fuse`, +no err-freedom hypothesis is needed when both filters share a +predicate. 
-/ +theorem UnifiedStream.filter_idem (pred : Expr) (us : UnifiedStream) : + UnifiedStream.filter pred (UnifiedStream.filter pred us) + = UnifiedStream.filter pred us := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hConsAsApp : ((uc, d) :: tl : UnifiedStream) = [(uc, d)] ++ tl := rfl + rw [hConsAsApp, UnifiedStream.filter_append, + UnifiedStream.filter_append, ih] + congr 1 + cases d with + | error => + -- `(uc, .error)` survives once via first arm; second pass + -- sees the same pair and keeps it again. + rfl + | val n => + cases uc with + | err e => + -- `(.err e, .val n)` survives once via err-arm; second pass + -- still sees an err carrier and keeps it. + rfl + | row r => + rw [filter_row_singleton pred r n] + cases hEval : eval r pred with + | bool b => + cases b with + | true => + -- First filter keeps `(.row r, .val n)`; second filter + -- evaluates pred again on the same row. + rw [filter_row_singleton pred r n, hEval] + | false => + -- First filter drops; second filter on `[]` is `[]`. + rfl + | err e => + -- First filter promotes to `(.err e, .val n)`; second + -- filter keeps via err-arm. + rfl + | int _ => rfl + | null => rfl + end Mz From f742335b0e503d79358b767b58992b8a1c8b1db8 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 23:12:06 +0200 Subject: [PATCH 115/127] doc/semantics: filter commutativity (under err-freedom) `UnifiedStream.filter_comm`: filter p (filter q us) = filter q (filter p us) when neither predicate errors on the input's data rows. Reduces to `filter_filter_fuse` applied both ways, then equates `.and q p` and `.and p q` via `evalAnd_comm_of_no_err`. New helper `filter_eval_eq`: two filters with eval-equivalent predicates on every data row produce equal outputs. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/FilterFusion.lean | 71 ++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/doc/developer/semantics/Mz/FilterFusion.lean b/doc/developer/semantics/Mz/FilterFusion.lean index 79e994dc9b97c..e0b400b11796f 100644 --- a/doc/developer/semantics/Mz/FilterFusion.lean +++ b/doc/developer/semantics/Mz/FilterFusion.lean @@ -2,6 +2,7 @@ import Mz.UnifiedStream import Mz.Boolean import Mz.Expr import Mz.Eval +import Mz.Laws /-! # Filter fusion @@ -218,4 +219,74 @@ theorem UnifiedStream.filter_idem (pred : Expr) (us : UnifiedStream) : | int _ => rfl | null => rfl +/-! ## Filter commutativity (under err-freedom) + +Two filters can swap order when neither errors on the input's +data rows. The proof reduces to `filter_filter_fuse` on both +sides, then equates the fused predicates via `evalAnd_comm_of_no_err` +applied row by row. -/ + +/-- Two filters with eval-equivalent predicates on every data row +of the input produce equal outputs. Useful for re-associating / +re-ordering fused predicates without re-running the full +filter analysis. 
-/ +theorem UnifiedStream.filter_eval_eq + (p q : Expr) (us : UnifiedStream) + (h : ∀ ud ∈ us, ∀ r, ud.1 = UnifiedRow.row r → eval r p = eval r q) : + UnifiedStream.filter p us = UnifiedStream.filter q us := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hTl : ∀ ud ∈ tl, ∀ r, ud.1 = UnifiedRow.row r → eval r p = eval r q := + fun ud hMem => h ud (List.mem_cons_of_mem _ hMem) + have hConsAsApp : ((uc, d) :: tl : UnifiedStream) = [(uc, d)] ++ tl := rfl + rw [hConsAsApp, UnifiedStream.filter_append, + UnifiedStream.filter_append, ih hTl] + congr 1 + cases d with + | error => rfl + | val n => + cases uc with + | err _ => rfl + | row r => + have hRow : eval r p = eval r q := h (UnifiedRow.row r, DiffWithError.val n) + List.mem_cons_self r rfl + rw [filter_row_singleton p r n, filter_row_singleton q r n, hRow] + +/-- `Datum` is not an err iff none of its `err _` matches. The +predicate `¬d.IsErr` from `Datum.IsErr` is what `evalAnd_comm_of_no_err` +takes; restate via the err-freedom hypothesis used here. -/ +private theorem datum_not_isErr_of_no_err {d : Datum} + (h : ∀ ev, d ≠ Datum.err ev) : ¬d.IsErr := by + cases d with + | err e => exact absurd rfl (h e) + | bool _ => exact id + | int _ => exact id + | null => exact id + +/-- Filters commute when neither predicate errors on any data row +of the input. Reduces to `filter_filter_fuse` applied both ways, +then equates `.and q p` with `.and p q` via `evalAnd_comm_of_no_err`. 
-/ +theorem UnifiedStream.filter_comm + (q p : Expr) (us : UnifiedStream) + (hQ : UnifiedStream.predNoRowErr q us) + (hP : UnifiedStream.predNoRowErr p us) : + UnifiedStream.filter p (UnifiedStream.filter q us) + = UnifiedStream.filter q (UnifiedStream.filter p us) := by + rw [UnifiedStream.filter_filter_fuse q p us hQ hP, + UnifiedStream.filter_filter_fuse p q us hP hQ] + apply UnifiedStream.filter_eval_eq + intro ud hMem r hUc + have hQr : ∀ ev, eval r q ≠ Datum.err ev := hQ ud hMem r hUc + have hPr : ∀ ev, eval r p ≠ Datum.err ev := hP ud hMem r hUc + have hEvalAndQP : eval r (Expr.and q p) = evalAnd (eval r q) (eval r p) := by + simp only [eval] + have hEvalAndPQ : eval r (Expr.and p q) = evalAnd (eval r p) (eval r q) := by + simp only [eval] + rw [hEvalAndQP, hEvalAndPQ] + exact evalAnd_comm_of_no_err + (datum_not_isErr_of_no_err hQr) + (datum_not_isErr_of_no_err hPr) + end Mz From 7f098a1ddf7dadee649b33f812ab09436e86e672 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 23:12:20 +0200 Subject: [PATCH 116/127] doc/semantics: transforms.md, idem + comm filter laws Cite filter_idem (unconditional) and filter_comm (under err-freedom) alongside filter_filter_fuse. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/transforms.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/developer/semantics/transforms.md b/doc/developer/semantics/transforms.md index 5699cdff9b94a..a091bd93e6739 100644 --- a/doc/developer/semantics/transforms.md +++ b/doc/developer/semantics/transforms.md @@ -346,7 +346,7 @@ Status legend: | `fusion/negate.rs` (Negate fusion) | `negate_negate` (involution) | | `fusion/join.rs` (Join fusion / associativity) | `cross_assoc` | | `union_cancel.rs` (partial) | `consolidate (unionAll a (negate a))` reduces to `.val 0` records via diff arithmetic; no theorem yet, but ingredients in place | -| `fusion/filter.rs` (filter ∘ filter) | `UnifiedStream.filter_filter_fuse` in `Mz/FilterFusion.lean`. Holds under per-row err-freedom (`predNoRowErr`). Excludes the `err`-on-left + `evalAnd` ordering corner. | +| `fusion/filter.rs` (filter ∘ filter) | `UnifiedStream.filter_filter_fuse` in `Mz/FilterFusion.lean` (holds under per-row err-freedom, `predNoRowErr`), plus `filter_idem` (unconditional) and `filter_comm` (under the same err-freedom hypothesis on both predicates). | | `threshold_elision.rs` | `UnifiedStream.clampPositive_id_of_positive` in `Mz/SetOps.lean`. `clampPositive` is identity when every record's diff is `.error` or a strictly-positive `.val`. | | `fusion/map.rs` (project ∘ project) | `UnifiedStream.project_project_fuse` in `Mz/ProjectFusion.lean`. Holds under `projsAllSafe` (`es` is safe on every data row). Bridges via `eval_subst`. | | `demand.rs` | `filter_replaceAtRow_of_unused` (any input) + `project_replaceAtRow_eq_of_unused` (under `IsPureData`) in `Mz/Demand.lean`. Lifts `eval_replaceAt_of_unused` to the stream level.
| From fe3d7fcb15a23a53aa24965d489b77b0f4028051 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 23:13:27 +0200 Subject: [PATCH 117/127] =?UTF-8?q?doc/semantics:=20filter=20=E2=88=98=20p?= =?UTF-8?q?roject=20pushdown=20at=20UnifiedStream=20level?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `UnifiedStream.filter_project_pushdown`: filter p ∘ project es = project es ∘ filter (p.subst es), under projsAllSafe es us. Lifts BagStream.project_filter_pushdown_data (data-side only) to the unified stream — errs flow through both pipelines symmetrically under the safety hypothesis, so no data/error asymmetry remains. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/ProjectFusion.lean | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/doc/developer/semantics/Mz/ProjectFusion.lean b/doc/developer/semantics/Mz/ProjectFusion.lean index 70e41cd16b5c8..05ae9ed612d52 100644 --- a/doc/developer/semantics/Mz/ProjectFusion.lean +++ b/doc/developer/semantics/Mz/ProjectFusion.lean @@ -147,4 +147,105 @@ theorem UnifiedStream.project_project_fuse rw [hStep1, hStep2, hFused, rowProjectRecords_substList es es' (DiffWithError.val n) r] +/-! ## Filter ∘ project pushdown + +Lifts `filterRel_pushdown_project` to `UnifiedStream` under the +same safety hypothesis (`projsAllSafe`) the fusion theorem uses. +Filtering after projecting equals substituting through the +projection and filtering before projecting. -/ + +/-- Per-`.row` helper: filter applied to a single projected row +record equals project applied to a filter-substituted singleton. +Under safety, both sides reduce through the `.row` arm and +`eval_subst` bridges the predicates. 
-/ +private theorem filter_project_pushdown_row + (p : Expr) (es : List Expr) (n : Int) (r : Row) + (hSafe : rowAllSafe es r = true) : + UnifiedStream.filter p + (UnifiedStream.project es [(UnifiedRow.row r, DiffWithError.val n)]) + = UnifiedStream.project es + (UnifiedStream.filter (p.subst es) + [(UnifiedRow.row r, DiffWithError.val n)]) := by + have hStep1 : UnifiedStream.project es [(UnifiedRow.row r, DiffWithError.val n)] + = [(UnifiedRow.row (es.map (eval r)), DiffWithError.val n)] := by + show rowProjectRecords es (DiffWithError.val n) r ++ [] = _ + unfold rowProjectRecords + rw [if_pos hSafe, List.append_nil] + have hFilterSubstSingleton : + UnifiedStream.filter (p.subst es) [(UnifiedRow.row r, DiffWithError.val n)] + = (match eval r (p.subst es) with + | .bool true => [(UnifiedRow.row r, DiffWithError.val n)] + | .err ev => [(UnifiedRow.err ev, DiffWithError.val n)] + | _ => []) := by + show (match eval r (p.subst es) with + | .bool true => [(UnifiedRow.row r, DiffWithError.val n)] + | .err ev => [(UnifiedRow.err ev, DiffWithError.val n)] + | _ => []) ++ [] = _ + rw [List.append_nil] + have hFilterProjSingleton : + UnifiedStream.filter p + [(UnifiedRow.row (es.map (eval r)), DiffWithError.val n)] + = (match eval (es.map (eval r)) p with + | .bool true => + [(UnifiedRow.row (es.map (eval r)), DiffWithError.val n)] + | .err ev => [(UnifiedRow.err ev, DiffWithError.val n)] + | _ => []) := by + show (match eval (es.map (eval r)) p with + | .bool true => + [(UnifiedRow.row (es.map (eval r)), DiffWithError.val n)] + | .err ev => [(UnifiedRow.err ev, DiffWithError.val n)] + | _ => []) ++ [] = _ + rw [List.append_nil] + have hEvalSubst : eval r (p.subst es) = eval (es.map (eval r)) p := + eval_subst r es p + rw [hStep1, hFilterProjSingleton, hFilterSubstSingleton, hEvalSubst] + -- Both sides branch on the same `eval (es.map (eval r)) p` value. 
+ cases eval (es.map (eval r)) p with + | bool b => + cases b with + | true => + -- LHS: [(.row (es.map (eval r)), .val n)]. + -- RHS: project es [(.row r, .val n)] = same. + rw [hStep1] + | false => rfl + | err ev => + -- LHS: [(.err ev, .val n)]. + -- RHS: project es [(.err ev, .val n)] = [(.err ev, .val n)]. + rfl + | int _ => rfl + | null => rfl + +/-- Filter pushes through project: `filter p ∘ project es = +project es ∘ filter (p.subst es)`, under `projsAllSafe es us`. +Models the data-side relational pushdown rewrite at the +`UnifiedStream` level. Mirrors `BagStream.project_filter_pushdown_data` +without the data-only restriction (errs flow through both +pipelines symmetrically under the safety hypothesis). -/ +theorem UnifiedStream.filter_project_pushdown + (p : Expr) (es : List Expr) (us : UnifiedStream) + (hSafe : UnifiedStream.projsAllSafe es us) : + UnifiedStream.filter p (UnifiedStream.project es us) + = UnifiedStream.project es (UnifiedStream.filter (p.subst es) us) := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hTl : UnifiedStream.projsAllSafe es tl := hSafe.tail + have hConsAsApp : ((uc, d) :: tl : UnifiedStream) = [(uc, d)] ++ tl := rfl + rw [hConsAsApp, UnifiedStream.project_append, + UnifiedStream.filter_append, UnifiedStream.filter_append, + UnifiedStream.project_append, ih hTl] + congr 1 + cases d with + | error => + cases uc with + | row r => rfl + | err _ => rfl + | val n => + cases uc with + | err _ => rfl + | row r => + have hHd : rowAllSafe es r = true := hSafe.head r rfl + exact filter_project_pushdown_row p es n r hHd + end Mz From 6b64c221ea59c2e0bea05780c0fcf4efd37724d4 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 23:13:40 +0200 Subject: [PATCH 118/127] doc/semantics: transforms.md, filter/project pushdown shipped MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Note filter_project_pushdown in the Modelable row for 
projection-lifting passes. Still missing: project_cross_pushdown (harder — depends on column-split semantics). Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/transforms.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/developer/semantics/transforms.md b/doc/developer/semantics/transforms.md index a091bd93e6739..72fb8335d4b1b 100644 --- a/doc/developer/semantics/transforms.md +++ b/doc/developer/semantics/transforms.md @@ -355,7 +355,7 @@ Status legend: | Rust pass | Lean approach | | --- | --- | -| `fusion/project.rs` / `movement/projection_lifting.rs` / `projection_pushdown.rs` | We have `project_unionAll`. Add `project_filter` (commutes when no scalar errors collide with predicate), `project_cross_pushdown` (push project through cross when columns split cleanly). | +| `fusion/project.rs` / `movement/projection_lifting.rs` / `projection_pushdown.rs` | Have `project_unionAll` + `filter_project_pushdown` (in `Mz/ProjectFusion.lean`). Still need `project_cross_pushdown` (push project through cross when columns split cleanly). | | `redundant_join.rs` (distinct + join) | Express `distinct` + `cross` commutation when right side is already key-unique. Requires `intersectAll`-style lookup invariants we already have. | | `semijoin_idempotence.rs` (partial) | A semijoin is `cross` + project + distinct. Idempotence via `distinct_idem` (provable; we have `clampToOne_idem`). | | `non_null_requirements.rs` (model the strict-null laws) | We already have `evalAnd` / `evalOr` / arithmetic err-/null-strictness. State as `NullPropagatingBinary` / `ErrPropagatingBinary` instances; some exist in `Mz/Strict.lean`. Lift to `UnifiedStream.filter` to characterize when predicates drop vs promote. 
| From 7a2d5417d375a9dd15a832cda3662cccc28242d6 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Mon, 18 May 2026 23:24:21 +0200 Subject: [PATCH 119/127] doc/semantics: clampToOne identity on .val 1/.error inputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `clampToOne_id_of_one`: clampToOne is identity when every record's diff is already .val 1 or .error. Companion to clampPositive_id_of_positive — same threshold-elision pattern, but for the set-semantics post-pass instead of the bag-semantics one. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz/SetOps.lean | 38 ++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/doc/developer/semantics/Mz/SetOps.lean b/doc/developer/semantics/Mz/SetOps.lean index 87663c52c7df1..f7f3b46c228c0 100644 --- a/doc/developer/semantics/Mz/SetOps.lean +++ b/doc/developer/semantics/Mz/SetOps.lean @@ -822,6 +822,44 @@ theorem UnifiedStream.clampToOne_idem (us : UnifiedStream) : rw [ih] · exact ih +/-- `clampToOne` is the identity on streams where every record's +diff is already `.val 1` or `.error`. Companion to +`clampPositive_id_of_positive`: the threshold-elision-flavored +result for the set-semantics post-pass. -/ +theorem UnifiedStream.clampToOne_id_of_one + (us : UnifiedStream) + (h : ∀ ud ∈ us, ud.2 = DiffWithError.error + ∨ ud.2 = DiffWithError.val 1) : + UnifiedStream.clampToOne us = us := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨uc, d⟩ := hd + have hHd := h (uc, d) List.mem_cons_self + have hTl : ∀ ud ∈ tl, ud.2 = DiffWithError.error + ∨ ud.2 = DiffWithError.val 1 := + fun ud hMem => h ud (List.mem_cons_of_mem _ hMem) + cases d with + | error => + show (uc, DiffWithError.error) :: UnifiedStream.clampToOne tl + = (uc, DiffWithError.error) :: tl + rw [ih hTl] + | val n => + rcases hHd with hErr | hOne + · -- `.val n = .error` is impossible. 
+ exact absurd hErr (by intro hEq; cases hEq) + · -- `.val n = .val 1`, so `n = 1`. + have hN : n = 1 := by + have := hOne + cases this + rfl + subst hN + show (if 0 < (1 : Int) + then (uc, DiffWithError.val 1) :: UnifiedStream.clampToOne tl + else UnifiedStream.clampToOne tl) + = (uc, DiffWithError.val 1) :: tl + rw [if_pos (by decide : (0 : Int) < 1), ih hTl] + /-- All-`.val` inputs yield all-`.val` outputs through `clampToOne`: no `.error` is introduced (records with non-positive `.val` are dropped, positive `.val` become `.val 1`). -/ From 461a7beedbdd6ffd2153d468a256934227259a5c Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Wed, 20 May 2026 11:45:49 -0400 Subject: [PATCH 120/127] doc/design: row-scoped diff encoding (non-absorbing err counts) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an Int × ErrCount component to the diff so row-scoped errs preserve row content AND retract through normal diff arithmetic. Replaces the previous design where row-scoped errs went through DataflowError-with-row-drop (lossy) or absorbing-marker (terminal). Two-layer encoding: * val (Int × ErrCount): retractable, used for row-scoped errs. * global: absorbing, used for collection-scoped errs only. Covers WHERE / join-predicate err semantics, multiplication rule under cross/join, retraction symmetry, and the rejected alternatives (carrier replacement, absorbing-only). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../20260517_error_handling_semantics.md | 76 +++++++++++++++---- 1 file changed, 63 insertions(+), 13 deletions(-) diff --git a/doc/developer/design/20260517_error_handling_semantics.md b/doc/developer/design/20260517_error_handling_semantics.md index f7edcb1060c9f..1b4e8d8c7b977 100644 --- a/doc/developer/design/20260517_error_handling_semantics.md +++ b/doc/developer/design/20260517_error_handling_semantics.md @@ -75,21 +75,58 @@ The type system treats `Datum::Error` as inhabiting every `ScalarType`. 
This mirrors the way `NULL` inhabits every nullable type. The variant carries an `EvalError`, not a string, so that error introspection functions can be added later without a format break. -### Row-scoped errors: `DataflowError` (unchanged) +### Row-scoped errors: `DataflowError` plus diff-encoded multiplicities `DataflowError` continues to carry row-scoped errors through the existing error collection. -The semantics are unchanged. -An operator that wishes to escalate a `Datum::Error` to row scope does so by emitting a `DataflowError::EvalError` and dropping the row from the data collection. +The semantics of the existing collection are unchanged. +An operator that wishes to escalate a `Datum::Error` to row scope does so by emitting a `DataflowError::EvalError` and dropping the row from the data collection, as today. -### Global-scoped errors: diff-field encoding (specification only) +Operators that produce row-scoped errors from per-row evaluation — `WHERE`, join predicates, projection scalars — face a tension that the existing `DataflowError` pathway does not resolve. +The erroring expression's result (for example `1/(a+b)` with `a+b = 0`) has no column in the row to live in. +Replacing the row with an opaque error marker loses the row's content, which downstream operators may still need to count, consolidate, or retract. +Routing the failure to `DataflowError` and dropping the row from the data collection has the same effect: the data row is gone, even though the row's data was otherwise well-defined. -A global error at time `t` is encoded as a distinguished record in the error collection whose `diff` field carries a special marker. -The intent is that any downstream operator observing such a record at time `t` treats the entire input collection at `t` as invalid, propagating the global error to its own output. -The natural encoding in differential dataflow uses the `diff` field because the data field is per-row and the time field is per-update. 
-A monoid extension of the `diff` semiring that adds an absorbing "error" element captures the propagation rule: any sum involving the absorbing element is itself the absorbing element, which is exactly the semantics required. +The spec therefore extends the `diff` field with a per-error-kind multiplicity component so that row-scoped errors retain row content and participate in normal diff arithmetic. +A diff is the pair + +``` +Diff = Int × ErrCount +ErrCount = EvalError -fin-> Int +``` + +where `Int` is the valid-copy multiplicity and `ErrCount` is the finite-support map from error payloads to error-copy multiplicities. +Addition is pointwise on both axes. +Multiplication (used by join and cross) is `(a, m) * (b, n) = (a*b, a • n + b • m)` where `a • n` scales every err-count in `n` by the scalar `a`; this is the unique extension that distributes over addition and preserves the absorbing role of `0`. +Negation is pointwise: `-(a, m) = (-a, -m)`, where `-m` is the pointwise negation of the err-count map. +Zero is `(0, ∅)`; one is `(1, ∅)`. + +Under this encoding, a `WHERE` predicate that errors with `EvalError::DivisionByZero` on a row carrying diff `(a, m)` produces an output record with the row content preserved and diff `(0, m + {DivisionByZero ↦ a})`. +A later retraction of the same row produces the negated diff and the err-count cancels through ordinary differential-dataflow consolidation. +Two distinct rows that happen to produce the same `EvalError` accumulate in the same err-count slot; an `IS DISTINCT FROM`-style query that distinguishes them can still do so via the row content, which the carrier has not touched. + +`DataflowError` and the diff-encoded multiplicities are complementary, not redundant. +A sink that cannot emit error rows still escalates to `DataflowError` and drops the row. +An intermediate operator that wants to keep counting through an erroring evaluation uses the diff-encoded component and leaves `DataflowError` alone. 
+The escalation rule is explicit: an operator opts into `DataflowError` when its downstream contract requires it. + +### Global-scoped errors: absorbing diff marker (specification only) + +A global error at time `t` is encoded as a distinguished record whose diff carries an absorbing element. +The absorbing element sits outside the `Int × ErrCount` pair, layered as + +``` +DiffWithGlobal = val(Diff) | global +``` + +with `global` absorbing both addition and multiplication: `global + d = global`, `d + global = global`, `global * d = global`, `d * global = global`. +Any sum involving `global` is itself `global`, which is exactly the propagation rule a downstream operator needs. +Unlike the row-scoped err-count component, `global` is terminal: a `global` marker at time `t` cannot be retracted, because the claim "this collection is invalid at `t`" is one-way. + +The intent is that any downstream operator observing a `global` record at time `t` treats the entire input collection at `t` as invalid, propagating the global error to its own output. +The natural locus is still the diff field, because the data field is per-row and the time field is per-update, and an absorbing monoid extension of the diff semiring captures exactly the propagation rule required. Implementation is out of scope. -The spec exists so that future operator work targets this encoding rather than inventing alternates. +The spec exists so that future operator work targets this two-layer encoding (`Int × ErrCount` for retractable row-scoped, `global` for terminal collection-scoped) rather than inventing alternates. ### SQL error semantics @@ -132,8 +169,8 @@ The extension is conservative: any cell that PostgreSQL would have produced as ` **Predicates.** A `WHERE` clause emits a row when its predicate evaluates to `TRUE`. It drops the row when the predicate is `FALSE` or `NULL`, as today. -When the predicate is `ERROR`, the row is escalated to a row-scoped error and surfaced via `DataflowError`. 
-This preserves "predicates are total" externally — the user sees either matching rows or row errors, never silently dropped errors. +When the predicate is `ERROR`, the row is recorded with diff `(0, m + {e ↦ a})` where `e` is the error, `a` is the input row's valid-copy multiplicity, and `m` is the err-count the input row already carried; the row content is preserved and the err-count participates in retraction via the row-scoped diff encoding described above. +A downstream operator that must escalate to `DataflowError` does so explicitly; the default behavior preserves the row. **Comparison.** `=`, `<`, `>`, etc., applied to `Datum::Error` return `Datum::Error`. @@ -152,8 +189,8 @@ An explicit opt-out is provided by future `try_sum`-style aggregates. This avoids accidentally collapsing unrelated failures into a single aggregate output. **Joins.** -A join predicate evaluating to `ERROR` escalates the candidate pair to a row-scoped error. -This is symmetric with the `WHERE` rule. +A join predicate evaluating to `ERROR` produces an output row whose diff carries the err-count component, symmetric with the `WHERE` rule and using the multiplicative rule on diffs to combine the contributions of the two input sides. +A downstream operator that must escalate to `DataflowError` does so explicitly. Join keys containing `Datum::Error` do not match any other key, including identical `Datum::Error` values, mirroring the grouping rule. **Casts and `try_cast`.** @@ -224,6 +261,17 @@ Carry global errors in a separate timely stream. Works, but requires every operator to be aware of two error inputs. The `diff`-field encoding leverages differential dataflow's existing fan-in and is the natural extension of the semiring. +**Row-scoped errors via carrier replacement rather than a diff component.** +Encode a row-scoped error by replacing the row with an opaque error marker in the data field while keeping the diff a plain `Int`. 
+This is the simplest extension but loses the row's content at the carrier flip, so any downstream operator that needed the original row data (for join, group-by on other columns, or counting) cannot recover it. +The `diff`-encoded `ErrCount` keeps the row and counts the error as a parallel multiplicity, which preserves both pieces of information and stays retractable through ordinary diff arithmetic. + +**Row-scoped errors via an absorbing diff marker like global errors.** +Use the same absorbing-monoid extension as the global-scoped case. +Rejected because per-row evaluation errors must be retractable: a row that errs on `WHERE 1/(a+b) > 5` at time `t` and is retracted at time `t'` must net to zero in the consolidated view. +An absorbing marker cannot retract. +The two-layer encoding (`Int × ErrCount` for row-scoped, `global` for collection-scoped) keeps the algebra of each scope faithful to its semantics. + ## Open questions * What is the exact set of `EvalError` payloads that operators may produce as `Datum::Error`? @@ -234,3 +282,5 @@ A reader on an older binary that encounters `Datum::Error` must have a defined b PostgreSQL has no precedent; candidates are "errors sort last", "errors sort like `NULL`", and "errors are unordered and produce a sort-key error". * What is the storage cost of widening `Row` encoding to include the new tag, and how does it compare to the current cost of routing failures through the error collection? * For global errors, what is the precise specification of the absorbing element in the `diff` semiring, and how does it interact with consolidation and arrangement? +* For row-scoped errors carried in the diff's `ErrCount` component, what is the storage / wire cost of a finite map from `EvalError` to `Int` versus the existing single-`Int` diff, and is the cost acceptable for collections in which errors are rare? 
+* For row-scoped errors carried in the diff, how does the encoding interact with existing arrangement layouts that assume `Int` diffs, and what is the minimum change required for the runtime to round-trip the new diff type? From 6f99f923a0c640b88ea81a38c299b324a69a688a Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Wed, 20 May 2026 11:54:51 -0400 Subject: [PATCH 121/127] doc/semantics: ErrCount + Diff types (row-scoped retractable diff) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Foundation for the design-doc refactor toward non-absorbing row-scoped errs. ErrCount := EvalError → Int (pointwise add/neg, scalar mul, single). Diff := { val : Int, errs : ErrCount } with full commutative-ring-like algebra: * additive: assoc / comm / zero / neg. * multiplicative: (a, m) * (b, n) = (a*b, a • n + b • m). * distributive: mul_add / add_mul. * neg_mul / mul_neg. Zero (0, 0) absorbs *. One (1, 0) is identity for *. New file Mz/DiffErrCount.lean. Adds 33rd build job. The earlier DiffWithError (absorbing) stays in place; the next commit repurposes it as DiffWithGlobal (collection-scoped only). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/DiffErrCount.lean | 425 +++++++++++++++++++ 2 files changed, 426 insertions(+) create mode 100644 doc/developer/semantics/Mz/DiffErrCount.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 240c50a69309a..854b57a3b1e92 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -13,6 +13,7 @@ import Mz.ErrStream import Mz.Pushdown import Mz.ColRefs import Mz.DiffSemiring +import Mz.DiffErrCount import Mz.UnifiedStream import Mz.UnifiedConsolidate import Mz.TimedConsolidate diff --git a/doc/developer/semantics/Mz/DiffErrCount.lean b/doc/developer/semantics/Mz/DiffErrCount.lean new file mode 100644 index 0000000000000..8d465db1a647d --- /dev/null +++ b/doc/developer/semantics/Mz/DiffErrCount.lean @@ -0,0 +1,425 @@ +import Mz.Datum + +/-! +# Row-scoped retractable diff + +The skeleton's earlier `DiffWithError α` used an absorbing `error` +marker for *all* error scopes. That encoding can only express +collection-scoped (global) errors faithfully — the marker is +terminal under both addition and multiplication, so it cannot +retract. + +Row-scoped errors raised inside a `WHERE` predicate or a join +condition (e.g., `1/(a+b)` on a row with `a+b = 0`) must retract: +a row that errs at time `t` and is retracted at time `t'` should +net to zero in the consolidated view. The earlier encoding's only +recourse was to drop the row from the data collection and route +the error through `DataflowError`, which loses the row's content. + +The design doc (`doc/developer/design/20260517_error_handling_semantics.md`) +extends the diff with a per-`EvalError` multiplicity component: + +``` +Diff = Int × ErrCount +ErrCount = EvalError -fin-> Int +``` + +`Int` carries the valid-copy multiplicity; `ErrCount` carries the +per-`EvalError`-payload error-copy multiplicities. Addition is +pointwise on both axes. 
Multiplication is +`(a, m) * (b, n) = (a*b, a • n + b • m)` — the unique extension +that distributes over addition and keeps `(0, ∅)` absorbing under +`*`. Negation is pointwise, so both valid and err counts are +retractable through ordinary diff arithmetic. + +This file defines `ErrCount` and `Diff` and proves the algebraic +laws. The absorbing collection-scoped marker layered on top of +`Diff` lives in `Mz/DiffWithGlobal.lean`. -/ + +namespace Mz + +/-- Finite per-`EvalError` multiplicity. Modeled as a total +function with the understanding that we only ever construct +instances with finite support (`zero` plus finitely many +`single`-style additions). Decidable equality on `EvalError` +makes `single` definable. + +A `Finsupp`-style finite-support carrier would be cleaner but +requires mathlib; the function representation suffices for the +algebra here — point-wise equality is reached via `funext` and +the laws below. -/ +def ErrCount : Type := EvalError → Int + +namespace ErrCount + +/-- Identity for `add`: every err has count zero. -/ +def zero : ErrCount := fun _ => 0 + +/-- Pointwise addition. -/ +def add (m n : ErrCount) : ErrCount := fun e => m e + n e + +/-- Pointwise negation. -/ +def neg (m : ErrCount) : ErrCount := fun e => -(m e) + +/-- Scalar multiplication: scale every err count by `a`. -/ +def smul (a : Int) (m : ErrCount) : ErrCount := fun e => a * m e + +/-- Concentrate `n` copies of err `e` and nothing else. -/ +def single (e : EvalError) (n : Int) : ErrCount := + fun e' => if e = e' then n else 0 + +instance : Zero ErrCount := ⟨zero⟩ +instance : Add ErrCount := ⟨add⟩ +instance : Neg ErrCount := ⟨neg⟩ + +/-! 
## Pointwise laws -/ + +theorem add_zero (m : ErrCount) : m + 0 = m := by + funext e + show m e + 0 = m e + exact Int.add_zero _ + +theorem zero_add (m : ErrCount) : 0 + m = m := by + funext e + show 0 + m e = m e + exact Int.zero_add _ + +theorem add_comm (m n : ErrCount) : m + n = n + m := by + funext e + show m e + n e = n e + m e + exact Int.add_comm _ _ + +theorem add_assoc (m n p : ErrCount) : (m + n) + p = m + (n + p) := by + funext e + show (m e + n e) + p e = m e + (n e + p e) + exact Int.add_assoc _ _ _ + +theorem neg_add_self (m : ErrCount) : -m + m = 0 := by + funext e + show -(m e) + m e = 0 + exact Int.add_left_neg _ + +theorem add_neg_self (m : ErrCount) : m + -m = 0 := by + funext e + show m e + -(m e) = 0 + exact Int.add_right_neg _ + +theorem neg_zero : -(0 : ErrCount) = 0 := by + funext e + show -(0 : Int) = 0 + rfl + +theorem neg_add (m n : ErrCount) : -(m + n) = -m + -n := by + funext e + show -(m e + n e) = -(m e) + -(n e) + exact Int.neg_add + +/-! ## Scalar multiplication laws -/ + +theorem smul_zero (a : Int) : smul a 0 = 0 := by + funext e + show a * 0 = 0 + exact Int.mul_zero _ + +theorem zero_smul (m : ErrCount) : smul 0 m = 0 := by + funext e + show 0 * m e = 0 + exact Int.zero_mul _ + +theorem one_smul (m : ErrCount) : smul 1 m = m := by + funext e + show 1 * m e = m e + exact Int.one_mul _ + +theorem smul_add (a : Int) (m n : ErrCount) : + smul a (m + n) = smul a m + smul a n := by + funext e + show a * (m e + n e) = a * m e + a * n e + exact Int.mul_add _ _ _ + +theorem add_smul (a b : Int) (m : ErrCount) : + smul (a + b) m = smul a m + smul b m := by + funext e + show (a + b) * m e = a * m e + b * m e + exact Int.add_mul _ _ _ + +theorem smul_smul (a b : Int) (m : ErrCount) : + smul a (smul b m) = smul (a * b) m := by + funext e + show a * (b * m e) = (a * b) * m e + rw [Int.mul_assoc] + +theorem smul_neg (a : Int) (m : ErrCount) : + smul a (-m) = -(smul a m) := by + funext e + show a * -(m e) = -(a * m e) + exact Int.mul_neg _ _ + 
+theorem neg_smul (a : Int) (m : ErrCount) : + smul (-a) m = -(smul a m) := by + funext e + show (-a) * m e = -(a * m e) + exact Int.neg_mul _ _ + +end ErrCount + +/-! ## `Diff`: pair of valid count and err counts + +The carrier of row-scoped retractable multiplicities. -/ + +structure Diff where + val : Int + errs : ErrCount + +namespace Diff + +instance : Inhabited Diff := ⟨{ val := 0, errs := fun _ => 0 }⟩ + +def zero : Diff := { val := 0, errs := 0 } +def one : Diff := { val := 1, errs := 0 } + +def add (a b : Diff) : Diff := + { val := a.val + b.val, errs := a.errs + b.errs } + +def neg (a : Diff) : Diff := + { val := -a.val, errs := -a.errs } + +/-- Multiplicative combine. The cross/join rule: valid copies on +each side multiply normally; an err-count on one side is scaled by +the valid count on the other side and summed. The unique extension +of `*` that distributes over `+` and keeps `(0, ∅)` absorbing. -/ +def mul (a b : Diff) : Diff := + { val := a.val * b.val + , errs := ErrCount.add (ErrCount.smul a.val b.errs) + (ErrCount.smul b.val a.errs) } + +instance : Zero Diff := ⟨zero⟩ +instance : One Diff := ⟨one⟩ +instance : Add Diff := ⟨add⟩ +instance : Neg Diff := ⟨neg⟩ +instance : Mul Diff := ⟨mul⟩ + +/-! ## Convenience constructors -/ + +/-- Pure valid count, no errors. -/ +@[inline] def pure (n : Int) : Diff := { val := n, errs := 0 } + +/-- Pure err count for a single err payload. -/ +@[inline] def err (e : EvalError) (n : Int) : Diff := + { val := 0, errs := ErrCount.single e n } + +/-! 
## Component reduction lemmas -/ + +theorem val_zero : (0 : Diff).val = 0 := rfl +theorem errs_zero : (0 : Diff).errs = (0 : ErrCount) := rfl +theorem val_one : (1 : Diff).val = 1 := rfl +theorem errs_one : (1 : Diff).errs = (0 : ErrCount) := rfl + +theorem val_add (a b : Diff) : (a + b).val = a.val + b.val := rfl +theorem errs_add (a b : Diff) : (a + b).errs = a.errs + b.errs := rfl + +theorem val_neg (a : Diff) : (-a).val = -a.val := rfl +theorem errs_neg (a : Diff) : (-a).errs = -a.errs := rfl + +theorem val_mul (a b : Diff) : (a * b).val = a.val * b.val := rfl + +theorem errs_mul (a b : Diff) : + (a * b).errs + = ErrCount.add (ErrCount.smul a.val b.errs) + (ErrCount.smul b.val a.errs) := rfl + +/-! ## Additive monoid laws -/ + +theorem add_zero (a : Diff) : a + 0 = a := by + rcases a with ⟨v, m⟩ + show (⟨v + 0, ErrCount.add m 0⟩ : Diff) = ⟨v, m⟩ + congr 1 + · exact Int.add_zero _ + · exact ErrCount.add_zero _ + +theorem zero_add (a : Diff) : 0 + a = a := by + rcases a with ⟨v, m⟩ + show (⟨0 + v, ErrCount.add 0 m⟩ : Diff) = ⟨v, m⟩ + congr 1 + · exact Int.zero_add _ + · exact ErrCount.zero_add _ + +theorem add_comm (a b : Diff) : a + b = b + a := by + rcases a with ⟨v, m⟩ + rcases b with ⟨w, n⟩ + show (⟨v + w, ErrCount.add m n⟩ : Diff) = ⟨w + v, ErrCount.add n m⟩ + congr 1 + · exact Int.add_comm _ _ + · exact ErrCount.add_comm _ _ + +theorem add_assoc (a b c : Diff) : (a + b) + c = a + (b + c) := by + rcases a with ⟨v, m⟩ + rcases b with ⟨w, n⟩ + rcases c with ⟨u, p⟩ + show (⟨(v + w) + u, ErrCount.add (ErrCount.add m n) p⟩ : Diff) + = ⟨v + (w + u), ErrCount.add m (ErrCount.add n p)⟩ + congr 1 + · exact Int.add_assoc _ _ _ + · exact ErrCount.add_assoc _ _ _ + +theorem add_neg_self (a : Diff) : a + -a = 0 := by + rcases a with ⟨v, m⟩ + show (⟨v + -v, ErrCount.add m (-m)⟩ : Diff) = ⟨0, 0⟩ + congr 1 + · exact Int.add_right_neg _ + · exact ErrCount.add_neg_self _ + +theorem neg_add_self (a : Diff) : -a + a = 0 := by + rcases a with ⟨v, m⟩ + show (⟨-v + v, ErrCount.add 
(-m) m⟩ : Diff) = ⟨0, 0⟩ + congr 1 + · exact Int.add_left_neg _ + · exact ErrCount.neg_add_self _ + +theorem neg_neg (a : Diff) : - -a = a := by + rcases a with ⟨v, m⟩ + show (⟨- -v, ErrCount.neg (ErrCount.neg m)⟩ : Diff) = ⟨v, m⟩ + congr 1 + · exact Int.neg_neg _ + · funext e; exact Int.neg_neg _ + +theorem neg_zero : -(0 : Diff) = 0 := rfl + +theorem neg_add (a b : Diff) : -(a + b) = -a + -b := by + rcases a with ⟨v, m⟩ + rcases b with ⟨w, n⟩ + show (Diff.neg (Diff.add ⟨v, m⟩ ⟨w, n⟩) : Diff) = Diff.add (Diff.neg ⟨v, m⟩) (Diff.neg ⟨w, n⟩) + show (⟨-(v + w), ErrCount.neg (ErrCount.add m n)⟩ : Diff) + = ⟨-v + -w, ErrCount.add (-m) (-n)⟩ + congr 1 + · exact Int.neg_add + · exact ErrCount.neg_add _ _ + +/-! ## Multiplicative laws -/ + +theorem mul_zero (a : Diff) : a * 0 = 0 := by + rcases a with ⟨v, m⟩ + show (⟨v * 0, ErrCount.add (ErrCount.smul v (0 : ErrCount)) + (ErrCount.smul 0 m)⟩ : Diff) + = ⟨0, 0⟩ + congr 1 + · exact Int.mul_zero _ + · rw [ErrCount.smul_zero, ErrCount.zero_smul] + exact ErrCount.add_zero _ + +theorem zero_mul (a : Diff) : 0 * a = 0 := by + rcases a with ⟨v, m⟩ + show (⟨0 * v, ErrCount.add (ErrCount.smul 0 m) (ErrCount.smul v (0 : ErrCount))⟩ + : Diff) = ⟨0, 0⟩ + congr 1 + · exact Int.zero_mul _ + · rw [ErrCount.zero_smul, ErrCount.smul_zero] + exact ErrCount.add_zero _ + +theorem mul_one (a : Diff) : a * 1 = a := by + rcases a with ⟨v, m⟩ + show (⟨v * 1, ErrCount.add (ErrCount.smul v (0 : ErrCount)) + (ErrCount.smul 1 m)⟩ : Diff) + = ⟨v, m⟩ + congr 1 + · exact Int.mul_one _ + · rw [ErrCount.smul_zero, ErrCount.one_smul] + exact ErrCount.zero_add _ + +theorem one_mul (a : Diff) : 1 * a = a := by + rcases a with ⟨v, m⟩ + show (⟨1 * v, ErrCount.add (ErrCount.smul 1 m) (ErrCount.smul v (0 : ErrCount))⟩ + : Diff) = ⟨v, m⟩ + congr 1 + · exact Int.one_mul _ + · rw [ErrCount.one_smul, ErrCount.smul_zero] + exact ErrCount.add_zero _ + +theorem mul_comm (a b : Diff) : a * b = b * a := by + rcases a with ⟨v, m⟩ + rcases b with ⟨w, n⟩ + show (⟨v * w, 
ErrCount.add (ErrCount.smul v n) (ErrCount.smul w m)⟩ : Diff) + = ⟨w * v, ErrCount.add (ErrCount.smul w m) (ErrCount.smul v n)⟩ + congr 1 + · exact Int.mul_comm _ _ + · exact ErrCount.add_comm _ _ + +theorem mul_assoc (a b c : Diff) : (a * b) * c = a * (b * c) := by + rcases a with ⟨v, m⟩ + rcases b with ⟨w, n⟩ + rcases c with ⟨u, p⟩ + show (Diff.mul (Diff.mul ⟨v, m⟩ ⟨w, n⟩) ⟨u, p⟩) + = Diff.mul ⟨v, m⟩ (Diff.mul ⟨w, n⟩ ⟨u, p⟩) + show (⟨(v * w) * u, + ErrCount.add (ErrCount.smul (v * w) p) + (ErrCount.smul u (ErrCount.add (ErrCount.smul v n) + (ErrCount.smul w m)))⟩ : Diff) + = ⟨v * (w * u), + ErrCount.add (ErrCount.smul v (ErrCount.add (ErrCount.smul w p) + (ErrCount.smul u n))) + (ErrCount.smul (w * u) m)⟩ + congr 1 + · exact Int.mul_assoc _ _ _ + · funext e + show (v * w) * p e + u * (v * n e + w * m e) + = v * (w * p e + u * n e) + (w * u) * m e + have h1 : u * (v * n e + w * m e) = (u * v) * n e + (u * w) * m e := by + rw [Int.mul_add, ← Int.mul_assoc, ← Int.mul_assoc] + have h2 : v * (w * p e + u * n e) = (v * w) * p e + (v * u) * n e := by + rw [Int.mul_add, ← Int.mul_assoc, ← Int.mul_assoc] + rw [h1, h2] + have hUV : u * v = v * u := Int.mul_comm _ _ + have hUW : u * w = w * u := Int.mul_comm _ _ + rw [hUV, hUW] + -- LHS: (v*w)*p e + ((v*u)*n e + (w*u)*m e) + -- RHS: (v*w)*p e + (v*u)*n e + (w*u)*m e = ((v*w)*p e + (v*u)*n e) + (w*u)*m e + -- Need: a + (b + c) = (a + b) + c, i.e., ← Int.add_assoc on RHS. 
+ rw [← Int.add_assoc] + +theorem mul_add (a b c : Diff) : a * (b + c) = a * b + a * c := by + rcases a with ⟨v, m⟩ + rcases b with ⟨w, n⟩ + rcases c with ⟨u, p⟩ + show Diff.mul ⟨v, m⟩ (Diff.add ⟨w, n⟩ ⟨u, p⟩) + = Diff.add (Diff.mul ⟨v, m⟩ ⟨w, n⟩) (Diff.mul ⟨v, m⟩ ⟨u, p⟩) + show (⟨v * (w + u), + ErrCount.add (ErrCount.smul v (ErrCount.add n p)) + (ErrCount.smul (w + u) m)⟩ : Diff) + = ⟨v * w + v * u, + ErrCount.add (ErrCount.add (ErrCount.smul v n) (ErrCount.smul w m)) + (ErrCount.add (ErrCount.smul v p) (ErrCount.smul u m))⟩ + congr 1 + · exact Int.mul_add _ _ _ + · funext e + show v * (n e + p e) + (w + u) * m e + = (v * n e + w * m e) + (v * p e + u * m e) + rw [Int.mul_add, Int.add_mul] + -- (v*n e + v*p e) + (w*m e + u*m e) = (v*n e + w*m e) + (v*p e + u*m e) + rw [Int.add_assoc, ← Int.add_assoc (v * p e), + Int.add_comm (v * p e) (w * m e), + Int.add_assoc, ← Int.add_assoc] + +theorem add_mul (a b c : Diff) : (a + b) * c = a * c + b * c := by + rw [mul_comm, mul_add, mul_comm c a, mul_comm c b] + +theorem neg_mul (a b : Diff) : -a * b = -(a * b) := by + rcases a with ⟨v, m⟩ + rcases b with ⟨w, n⟩ + show Diff.mul (Diff.neg ⟨v, m⟩) ⟨w, n⟩ = Diff.neg (Diff.mul ⟨v, m⟩ ⟨w, n⟩) + show (⟨(-v) * w, + ErrCount.add (ErrCount.smul (-v) n) (ErrCount.smul w (-m))⟩ : Diff) + = ⟨-(v * w), + -(ErrCount.add (ErrCount.smul v n) (ErrCount.smul w m))⟩ + congr 1 + · exact Int.neg_mul _ _ + · rw [ErrCount.neg_smul, ErrCount.smul_neg] + funext e + show -(v * n e) + -(w * m e) = -(v * n e + w * m e) + rw [Int.neg_add] + +theorem mul_neg (a b : Diff) : a * -b = -(a * b) := by + rw [mul_comm, neg_mul, mul_comm] + +end Diff + +end Mz From a54c8c0a0d561a37e57bae453bd47e22cf555104 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Wed, 20 May 2026 14:54:45 -0400 Subject: [PATCH 122/127] doc/semantics: DiffWithGlobal absorbing layer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `DiffWithGlobal := val (Diff) | global` — absorbing only on 
`global`. Used for collection-scoped errs that cannot retract. The val side wraps the row-scoped `Diff` (Int × ErrCount) from DiffErrCount.lean, which is fully retractable. Mirrors DiffSemiring.lean's absorbing API at the `global` element: absorption laws on +/* (both sides), commutativity, associativity, distributivity, negation, the converse `add_eq_global_left_or_right`. The existing absorbing `DiffWithError α` stays in place; downstream files migrate to DiffWithGlobal in subsequent commits. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + .../semantics/Mz/DiffWithGlobal.lean | 316 ++++++++++++++++++ 2 files changed, 317 insertions(+) create mode 100644 doc/developer/semantics/Mz/DiffWithGlobal.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 854b57a3b1e92..c112e7797de4d 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -14,6 +14,7 @@ import Mz.Pushdown import Mz.ColRefs import Mz.DiffSemiring import Mz.DiffErrCount +import Mz.DiffWithGlobal import Mz.UnifiedStream import Mz.UnifiedConsolidate import Mz.TimedConsolidate diff --git a/doc/developer/semantics/Mz/DiffWithGlobal.lean b/doc/developer/semantics/Mz/DiffWithGlobal.lean new file mode 100644 index 0000000000000..0de56c75860ea --- /dev/null +++ b/doc/developer/semantics/Mz/DiffWithGlobal.lean @@ -0,0 +1,316 @@ +import Mz.DiffErrCount + +/-! +# Diff extended with an absorbing global-error marker + +`Mz/DiffErrCount.lean` defines the row-scoped retractable diff +`Diff = Int × ErrCount`. Collection-scoped (global) errors require a +strictly stronger encoding: a marker that absorbs both addition and +multiplication and cannot be retracted. 
+ +The design doc (`doc/developer/design/20260517_error_handling_semantics.md`, +section "Global-scoped errors: absorbing diff marker") layers an +absorbing `global` marker over `Diff`: + +``` +DiffWithGlobal = val(Diff) | global +``` + +with `global + d = global`, `d + global = global`, `global * d = global`, +`d * global = global`. The `val` branch reuses `Diff`'s full +`Add`/`Mul`/`Neg` algebra; the `global` branch is terminal under every +operation. + +This is the two-layer encoding the design doc prescribes: row-scoped +errors live in `Diff.errs` (retractable), collection-scoped errors live +in `DiffWithGlobal.global` (terminal). The older `DiffWithError α` in +`Mz/DiffSemiring.lean` conflated the two scopes; downstream files still +use it pending migration. + +The mechanization mirrors `Mz/DiffSemiring.lean`'s proof style but +specializes the payload to `Diff`, so the laws can cite +`Diff.add_comm`, `Diff.mul_assoc`, etc., directly rather than taking +`h_*` hypotheses on a generic base. -/ + +namespace Mz + +/-- `Diff`-valued diff augmented with an absorbing `global` marker. +The `global` element encodes "this collection is invalid at this +time". Unlike the row-scoped err-count axis inside `Diff`, `global` +is terminal: addition and multiplication preserve it, and there is +no inverse. -/ +inductive DiffWithGlobal where + | val (d : Diff) + | global + deriving Inhabited + +namespace DiffWithGlobal + +/-- Lifted addition. `global` absorbs from either side; `val`s add +pointwise via `Diff`'s `+`. -/ +def add : DiffWithGlobal → DiffWithGlobal → DiffWithGlobal + | .global, _ => .global + | _, .global => .global + | .val a, .val b => .val (a + b) + +/-- Lifted multiplication. Joins/crosses on the absorbing extension. -/ +def mul : DiffWithGlobal → DiffWithGlobal → DiffWithGlobal + | .global, _ => .global + | _, .global => .global + | .val a, .val b => .val (a * b) + +/-- Lifted negation. 
`global` absorbs (the "collection invalid" +marker cannot be negated away); `val a` negates pointwise via +`Diff`'s `Neg`. Required for bag-difference flavors of set +operations (`EXCEPT ALL`). -/ +def neg : DiffWithGlobal → DiffWithGlobal + | .global => .global + | .val a => .val (-a) + +instance : Add DiffWithGlobal := ⟨add⟩ +instance : Mul DiffWithGlobal := ⟨mul⟩ +instance : Neg DiffWithGlobal := ⟨neg⟩ + +/-- Lifted zero (identity for `+`). -/ +instance : Zero DiffWithGlobal := ⟨.val 0⟩ + +/-- Lifted one (identity for `*`). -/ +instance : One DiffWithGlobal := ⟨.val 1⟩ + +/-! ## Absorption laws + +The defining property of the `global` marker: any sum or product +involving it is itself `global`. -/ + +theorem global_add_left (y : DiffWithGlobal) : + (global : DiffWithGlobal) + y = global := rfl + +theorem global_add_right (x : DiffWithGlobal) : + x + (global : DiffWithGlobal) = global := by + cases x with + | val _ => rfl + | global => rfl + +/-- Converse of absorption: `.global` can only emerge from `+` if at +least one summand was `.global`. The `.val + .val` branch always +returns `.val`, so the result `.global` rules it out. -/ +theorem add_eq_global_left_or_right (a b : DiffWithGlobal) + (h : a + b = global) : + a = global ∨ b = global := by + cases a with + | global => exact Or.inl rfl + | val x => + cases b with + | global => exact Or.inr rfl + | val y => + have hEq : (DiffWithGlobal.val x : DiffWithGlobal) + DiffWithGlobal.val y + = DiffWithGlobal.val (x + y) := rfl + rw [hEq] at h + cases h + +theorem global_mul_left (y : DiffWithGlobal) : + (global : DiffWithGlobal) * y = global := rfl + +theorem global_mul_right (x : DiffWithGlobal) : + x * (global : DiffWithGlobal) = global := by + cases x with + | val _ => rfl + | global => rfl + +/-! 
## Commutativity / associativity of `+` -/ + +theorem add_comm (a b : DiffWithGlobal) : a + b = b + a := by + cases a with + | val x => + cases b with + | val y => + show (val (x + y) : DiffWithGlobal) = val (y + x) + rw [Diff.add_comm] + | global => rfl + | global => + cases b with + | val _ => rfl + | global => rfl + +theorem add_assoc (a b c : DiffWithGlobal) : (a + b) + c = a + (b + c) := by + cases a with + | val x => + cases b with + | val y => + cases c with + | val z => + show (val ((x + y) + z) : DiffWithGlobal) = val (x + (y + z)) + rw [Diff.add_assoc] + | global => rfl + | global => + cases c with + | val _ => rfl + | global => rfl + | global => + cases b with + | val _ => + cases c with + | val _ => rfl + | global => rfl + | global => + cases c with + | val _ => rfl + | global => rfl + +/-! ## Zero identity -/ + +theorem zero_add_val (x : Diff) : + (0 : DiffWithGlobal) + val x = val x := by + show (val (0 + x) : DiffWithGlobal) = val x + rw [Diff.zero_add] + +theorem val_add_zero (x : Diff) : + (val x : DiffWithGlobal) + 0 = val x := by + show (val (x + 0) : DiffWithGlobal) = val x + rw [Diff.add_zero] + +/-! ## Distributivity + +Left distributivity says `a * (b + c) = a * b + a * c`. With the +absorbing `global`, the law holds unconditionally: any `global` in the +inputs forces every sub-expression containing it to `global`, and +`global + global = global` restores the equality on the right. -/ + +theorem mul_add (a b c : DiffWithGlobal) : a * (b + c) = a * b + a * c := by + cases a with + | val x => + cases b with + | val y => + cases c with + | val z => + show (val (x * (y + z)) : DiffWithGlobal) = val (x * y + x * z) + rw [Diff.mul_add] + | global => rfl + | global => + cases c with + | val _ => rfl + | global => rfl + | global => + cases b with + | val _ => + cases c with + | val _ => rfl + | global => rfl + | global => + cases c with + | val _ => rfl + | global => rfl + +/-! 
## Associativity / commutativity of `*` -/ + +theorem mul_assoc (a b c : DiffWithGlobal) : (a * b) * c = a * (b * c) := by + cases a with + | val x => + cases b with + | val y => + cases c with + | val z => + show (val ((x * y) * z) : DiffWithGlobal) = val (x * (y * z)) + rw [Diff.mul_assoc] + | global => rfl + | global => + cases c with + | val _ => rfl + | global => rfl + | global => + cases b with + | val _ => + cases c with + | val _ => rfl + | global => rfl + | global => + cases c with + | val _ => rfl + | global => rfl + +theorem mul_comm (a b : DiffWithGlobal) : a * b = b * a := by + cases a with + | val x => + cases b with + | val y => + show (val (x * y) : DiffWithGlobal) = val (y * x) + rw [Diff.mul_comm] + | global => rfl + | global => + cases b with + | val _ => rfl + | global => rfl + +/-! ## Negation laws + +`global` absorbs negation, and double-negation is the identity on +`val` (lifted from `Diff.neg_neg`). -/ + +theorem neg_global : + -(global : DiffWithGlobal) = global := rfl + +theorem neg_val (x : Diff) : + -(val x : DiffWithGlobal) = val (-x) := rfl + +theorem neg_neg (a : DiffWithGlobal) : - -a = a := by + cases a with + | val x => + show (val (- -x) : DiffWithGlobal) = val x + rw [Diff.neg_neg] + | global => rfl + +/-- Right-inverse on `val`: lifted from `Diff.add_neg_self`. `global` +has no inverse — the absorber is unrecoverable, which is exactly the +spec property: a collection-scoped error cannot be retracted. -/ +theorem val_add_neg_val (x : Diff) : + (val x : DiffWithGlobal) + -val x = 0 := by + show (val (x + -x) : DiffWithGlobal) = val 0 + rw [Diff.add_neg_self] + +/-- Negation distributes over addition, lifted from `Diff.neg_add`. 
-/ +theorem neg_add (a b : DiffWithGlobal) : -(a + b) = -a + -b := by + cases a with + | val x => + cases b with + | val y => + show (val (-(x + y)) : DiffWithGlobal) = val (-x + -y) + rw [Diff.neg_add] + | global => rfl + | global => + cases b with + | val _ => rfl + | global => rfl + +/-- Negation distributes over multiplication on the left, lifted from +`Diff.neg_mul`. -/ +theorem neg_mul (a b : DiffWithGlobal) : (-a) * b = -(a * b) := by + cases a with + | val x => + cases b with + | val y => + show (val ((-x) * y) : DiffWithGlobal) = val (-(x * y)) + rw [Diff.neg_mul] + | global => rfl + | global => + cases b with + | val _ => rfl + | global => rfl + +/-- Negation distributes over multiplication on the right, lifted +from `Diff.mul_neg`. -/ +theorem mul_neg (a b : DiffWithGlobal) : a * (-b) = -(a * b) := by + cases a with + | val x => + cases b with + | val y => + show (val (x * (-y)) : DiffWithGlobal) = val (-(x * y)) + rw [Diff.mul_neg] + | global => rfl + | global => + cases b with + | val _ => rfl + | global => rfl + +end DiffWithGlobal + +end Mz From 1895bb42ed3af67eb3ebd1d52cdfb4432d02cab1 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Wed, 20 May 2026 15:55:06 -0400 Subject: [PATCH 123/127] doc/semantics: UnifiedStream2 module (new diff-based err encoding) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parallel module to UnifiedStream.lean. Carrier := Row (no err constructor); row-scoped errs live in the diff's ErrCount component, collection-scoped errs in DiffWithGlobal.global. * filter: predicate err converts row's valid-count copies into err-count copies via ErrCount.single. Row content preserved. * project: rowAllSafe → row replaced by es.map (eval r), diff unchanged. Unsafe → row preserved, valid count zeroed, per- erroring-scalar err counts via rowErrCount helper. * global diff passes through both operators unchanged. 
Includes filter_nil, filter_append, project_nil_stream, project_append reduction lemmas. Old UnifiedStream module retained; downstream operators migrate incrementally. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + .../semantics/Mz/UnifiedStream2.lean | 157 ++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 doc/developer/semantics/Mz/UnifiedStream2.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index c112e7797de4d..91f7b1dfc35e3 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -16,6 +16,7 @@ import Mz.DiffSemiring import Mz.DiffErrCount import Mz.DiffWithGlobal import Mz.UnifiedStream +import Mz.UnifiedStream2 import Mz.UnifiedConsolidate import Mz.TimedConsolidate import Mz.Aggregate diff --git a/doc/developer/semantics/Mz/UnifiedStream2.lean b/doc/developer/semantics/Mz/UnifiedStream2.lean new file mode 100644 index 0000000000000..32aa8e4956188 --- /dev/null +++ b/doc/developer/semantics/Mz/UnifiedStream2.lean @@ -0,0 +1,157 @@ +import Mz.DiffErrCount +import Mz.DiffWithGlobal +import Mz.Expr +import Mz.Eval +import Mz.Bag +import Mz.ErrStream + +/-! +# Unified stream, take two + +`Mz/UnifiedStream.lean` carries a sum-typed `UnifiedRow := row | err` +carrier. Filter and project replace the row with `.err e` on +evaluation error, which loses the row's content — a row that errs +inside a `WHERE` predicate no longer exists as data even though its +columns were otherwise well-defined. + +The design doc +(`doc/developer/design/20260517_error_handling_semantics.md`, +sections "Row-scoped errors" and "Predicates") fixes this by moving +row-scoped errs out of the carrier and into the diff's `ErrCount` +component: + +* the carrier is just `Row` — no `err` constructor; +* row-scoped errs live in `Diff.errs : ErrCount` (retractable); +* collection-scoped errs live in `DiffWithGlobal.global` (terminal). + +This module is the new model. 
The old `UnifiedStream` is retained +unchanged because downstream files (Join, SetOps, Consolidate, etc.) +still cite it; migration is incremental. + +## Encoding + +`UnifiedStream2 := List (Row × DiffWithGlobal)`. Operators consume +and produce `UnifiedStream2`s. A `WHERE` predicate that errors on a +row carrying diff `(a, m)` produces the same row with diff +`(0, m + {e ↦ a})` — the row content is preserved and the err-count +participates in retraction through ordinary diff arithmetic. -/ + +namespace Mz + +/-- New unified stream. Carrier is just `Row` — no err marker. +Row-scoped errs live in the diff's `ErrCount` component; +collection-scoped errs live in the `DiffWithGlobal.global` marker. -/ +abbrev UnifiedStream2 := List (Row × DiffWithGlobal) + +namespace UnifiedStream2 + +/-! ## Filter + +Per the design doc: + +* `eval r pred = .bool true` → diff unchanged. +* `eval r pred = .bool false / .null / .int _` + → diff valid count zeroed, + errs kept. +* `eval r pred = .err e` → diff valid count zeroed, + errs gets `e` added with + count `d.val`. +* `diff = .global` → passes through unchanged. + +For `.bool false / .null / .int` we always emit the record even +when the diff's errs are zero — semantic clarity over output-size +optimization. A downstream consolidator may drop zero-diff records +if it cares. -/ + +/-- Per-`Diff` filter action. Splits on the predicate result on `r`. +Always returns exactly one record (with the same row, possibly a +zeroed-out diff). -/ +@[inline] private def filterOne (pred : Expr) (r : Row) (d : Diff) : + Row × DiffWithGlobal := + match eval r pred with + | .bool true => + -- predicate true: pass row through with its original diff. + (r, DiffWithGlobal.val d) + | .err e => + -- predicate errs: zero the valid count, add e ↦ d.val to errs. + (r, DiffWithGlobal.val { val := 0 + , errs := d.errs + ErrCount.single e d.val }) + | _ => + -- predicate false / null / int: zero the valid count, errs kept. 
+ (r, DiffWithGlobal.val { val := 0, errs := d.errs }) + +/-- Diff-aware filter on the new unified stream. -/ +def filter (pred : Expr) (us : UnifiedStream2) : UnifiedStream2 := + us.map fun rd => match rd with + | (r, DiffWithGlobal.val d) => filterOne pred r d + | (_, DiffWithGlobal.global) => rd + +/-! ### Reduction lemmas + +Named per-list-shape reductions for `filter`. Downstream proofs +cite these instead of unfolding `map` inline. -/ + +theorem filter_nil (pred : Expr) : + filter pred [] = [] := rfl + +theorem filter_append (pred : Expr) (a b : UnifiedStream2) : + filter pred (a ++ b) = filter pred a ++ filter pred b := by + show (a ++ b).map _ = a.map _ ++ b.map _ + exact List.map_append + +/-! ## Project + +Per the design doc: + +* `rowAllSafe es r = true` → row replaced by + `es.map (eval r)`, + diff unchanged. +* `rowAllSafe es r = false` → row preserved, diff valid + count zeroed, errs gets one + entry per erroring scalar + with count `d.val`. +* `diff = .global` → passes through unchanged. + +This mirrors the old module's `rowProjectRecords` shape but stays in +the new diff encoding: a per-row evaluation error keeps the row in +the carrier and routes the failure through the err-count component +of `Diff` rather than replacing the row with an `.err` carrier. -/ + +/-- Build an `ErrCount` accumulating each erroring scalar's payload +with count `n`. Order matches the expression order within the row; +duplicates accumulate via `ErrCount.add`. -/ +def rowErrCount (es : List Expr) (r : Row) (n : Int) : ErrCount := + (rowErrs es r).foldr (fun e acc => ErrCount.single e n + acc) 0 + +/-- Per-`Diff` project action on a single row. -/ +@[inline] private def projectOne (es : List Expr) (r : Row) (d : Diff) : + Row × DiffWithGlobal := + if rowAllSafe es r then + -- every scalar evaluates cleanly: replace row by projected row, + -- keep the diff. 
+ (es.map (eval r), DiffWithGlobal.val d) + else + -- some scalar errs: preserve the row, zero the valid count, add + -- one err-count entry per erroring scalar with count d.val. + (r, DiffWithGlobal.val { val := 0 + , errs := d.errs + rowErrCount es r d.val }) + +/-- Diff-aware projection on the new unified stream. -/ +def project (es : List Expr) (us : UnifiedStream2) : UnifiedStream2 := + us.map fun rd => match rd with + | (r, DiffWithGlobal.val d) => projectOne es r d + | (_, DiffWithGlobal.global) => rd + +/-! ### Reduction lemmas -/ + +theorem project_nil_stream (es : List Expr) : + project es [] = [] := rfl + +theorem project_append (es : List Expr) (a b : UnifiedStream2) : + project es (a ++ b) = project es a ++ project es b := by + show (a ++ b).map _ = a.map _ ++ b.map _ + exact List.map_append + +end UnifiedStream2 + +end Mz From dfc0d98c25babd698d3230d72a1ae48ad29ef4f3 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Wed, 20 May 2026 15:59:24 -0400 Subject: [PATCH 124/127] doc/semantics: Join2 + SetOps2 + UnifiedConsolidate2 Three parallel operator modules over UnifiedStream2 (new diff- based err encoding): * Join2: cross, join. Carrier-row concat replaces combineCarrier. Diff via DiffWithGlobal.mul. Includes cross_nil_left/right, cross_append_left, cross_cons_left, cross_length. * SetOps2: unionAll, negate, exceptAll. Subset of old SetOps; clampPositive/clampToOne/distinct/intersectAll defer to later. Includes unionAll_assoc, negate_negate (via DiffWithGlobal.neg_neg), negate_append. * UnifiedConsolidate2: bucket-by-row consolidation. Row carrier is plain Row (no err marker), so consolidateInto keys on Row's DecidableEq. Includes consolidateInto_match/skip and consolidate_singleton. 38 lean build jobs all green. Old modules untouched. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 3 + doc/developer/semantics/Mz/Join2.lean | 101 ++++++++++++++++ doc/developer/semantics/Mz/SetOps2.lean | 108 ++++++++++++++++++ .../semantics/Mz/UnifiedConsolidate2.lean | 81 +++++++++++++ 4 files changed, 293 insertions(+) create mode 100644 doc/developer/semantics/Mz/Join2.lean create mode 100644 doc/developer/semantics/Mz/SetOps2.lean create mode 100644 doc/developer/semantics/Mz/UnifiedConsolidate2.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 91f7b1dfc35e3..72847371ceb37 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -18,14 +18,17 @@ import Mz.DiffWithGlobal import Mz.UnifiedStream import Mz.UnifiedStream2 import Mz.UnifiedConsolidate +import Mz.UnifiedConsolidate2 import Mz.TimedConsolidate import Mz.Aggregate import Mz.Consolidate import Mz.Triple import Mz.Join +import Mz.Join2 import Mz.JoinPushdown import Mz.FilterFusion import Mz.ProjectFusion import Mz.Demand import Mz.GroupBy import Mz.SetOps +import Mz.SetOps2 diff --git a/doc/developer/semantics/Mz/Join2.lean b/doc/developer/semantics/Mz/Join2.lean new file mode 100644 index 0000000000000..5b2c75887e020 --- /dev/null +++ b/doc/developer/semantics/Mz/Join2.lean @@ -0,0 +1,101 @@ +import Mz.UnifiedStream2 + +/-! +# Joins on `UnifiedStream2` + +Two-input relational join on the new diff-aware stream +(`Mz/UnifiedStream2.lean`). The cartesian product `cross l r` is the +building block; `join pred l r` filters the product through a join +predicate. + +The carrier is now plain `Row` (no `.err` constructor), so the +combinator collapses to ordinary list concatenation: there is no +carrier-err to absorb, no left-wins rule to apply. 
Error propagation +moves entirely into the diff component: + +* row-scoped errs ride along inside each side's `Diff.errs`; on the + product, `Diff.mul` carries them through linearly (see + `Mz/DiffErrCount.lean`); +* collection-scoped errs live in `DiffWithGlobal.global`, which + absorbs on multiplication via `DiffWithGlobal.global_mul_left` / + `global_mul_right`. + +`cross` makes no commitment to row schema beyond list concatenation, +matching the old `Mz/Join.lean` `cross` on the `.row/.row` branch. +Schema-aware joins (equi-joins on named columns) would lift to this +with a column-substitution layer. +-/ + +namespace Mz + +namespace UnifiedStream2 + +/-- Cartesian product of two new unified streams. For each pair +`((rL, dL), (rR, dR))`: +* row is `rL ++ rR` (plain concatenation; no carrier-err to handle); +* diff is `dL * dR` via `DiffWithGlobal.mul`, which uses `Diff.mul` + on the `val`/`val` case and absorbs on `.global`. -/ +def cross (l r : UnifiedStream2) : UnifiedStream2 := + l.flatMap fun rd => + r.map fun rd' => (rd.1 ++ rd'.1, rd.2 * rd'.2) + +/-- Theta-join: cross product filtered by a predicate. The predicate +evaluates against the concatenated row; existing +`UnifiedStream2.filter` semantics apply (predicate `.err` zeros the +valid count and routes the failure into the diff's err-count). -/ +def join (pred : Expr) (l r : UnifiedStream2) : UnifiedStream2 := + filter pred (cross l r) + +/-! ## Empty cases -/ + +theorem cross_nil_left (r : UnifiedStream2) : + cross [] r = [] := rfl + +theorem cross_nil_right (l : UnifiedStream2) : + cross l [] = [] := by + induction l with + | nil => rfl + | cons _ tl _ih => simp [cross, List.map_nil, List.flatMap_cons] + +/-! ### Reduction lemmas on the left input + +Named per-list-shape reductions so downstream proofs cite these +instead of unfolding `flatMap` inline. 
-/ + +theorem cross_append_left (a b r : UnifiedStream2) : + cross (a ++ b) r = cross a r ++ cross b r := by + show (a ++ b).flatMap _ = a.flatMap _ ++ b.flatMap _ + exact List.flatMap_append + +theorem cross_cons_left + (hd : Row × DiffWithGlobal) (tl r : UnifiedStream2) : + cross (hd :: tl) r + = (r.map (fun rd => (hd.1 ++ rd.1, hd.2 * rd.2))) + ++ cross tl r := by + show (hd :: tl).flatMap _ = _ ++ tl.flatMap _ + simp [List.flatMap_cons] + +/-! ## Cardinality -/ + +/-- Cross product cardinality. `cross l r` produces exactly one +output record per pair of one record from `l` and one from `r`. Unlike the old `Mz/Join.lean`, +there is no carrier-err short-circuit; the count argument is the +same combinatorial induction. -/ +theorem cross_length (l r : UnifiedStream2) : + (cross l r).length = l.length * r.length := by + induction l with + | nil => simp [cross] + | cons hd tl ih => + show (cross (hd :: tl) r).length = (tl.length + 1) * r.length + rw [Nat.succ_mul] + show ((hd :: tl).flatMap fun ld => + r.map fun rd => (ld.1 ++ rd.1, ld.2 * rd.2)).length + = tl.length * r.length + r.length + rw [List.flatMap_cons, List.length_append, List.length_map] + show r.length + (cross tl r).length = tl.length * r.length + r.length + rw [ih] + exact Nat.add_comm _ _ + +end UnifiedStream2 + +end Mz diff --git a/doc/developer/semantics/Mz/SetOps2.lean b/doc/developer/semantics/Mz/SetOps2.lean new file mode 100644 index 0000000000000..c8a96aa9d55f8 --- /dev/null +++ b/doc/developer/semantics/Mz/SetOps2.lean @@ -0,0 +1,108 @@ +import Mz.UnifiedStream2 + +/-! +# Set operations on `UnifiedStream2` + +Parallel to `Mz/SetOps.lean`, but specialized to the new +`UnifiedStream2` encoding from `Mz/UnifiedStream2.lean`, where the +carrier is `Row` and the diff is `DiffWithGlobal` (a `Diff` payload +plus an absorbing `global` marker). + +This file lands only `UNION ALL`, signed negation, and the signed +flavor of `EXCEPT ALL`. 
The remaining clamp/distinct/intersect +flavors require inspecting `Diff.val` through the `DiffWithGlobal` +wrapper and are deferred to a follow-up. + +## `UNION ALL` + +Bag union is list concatenation. Each record passes through with its +diff (whether `.val _` or `.global`) unchanged. Both row-scoped errs +(carried inside `Diff.errs`) and the collection-scoped `.global` +marker propagate verbatim through the union. + +## Negation + +Pointwise diff negation through `DiffWithGlobal`'s `Neg` instance. +`.val d` becomes `.val (-d)`; `.global` absorbs negation per +`DiffWithGlobal.neg_global`, so a collection-scoped error survives a +sign flip — it cannot be retracted away. + +## `EXCEPT ALL` + +Differential dataflow's signed-diff bag-difference: negate every +diff of the right input and concatenate. The clamp-to-nonnegative +step that pure bag semantics demands is deferred along with the +other clamp operators; this file states the signed flavor only. +-/ + +namespace Mz + +namespace UnifiedStream2 + +/-! ## `UNION ALL` -/ + +/-- Bag union: concatenate two unified streams. Order is left input +first, then right input. Every record passes through with its diff +unchanged. -/ +def unionAll (l r : UnifiedStream2) : UnifiedStream2 := l ++ r + +/-! ### Reduction lemmas -/ + +theorem unionAll_nil_left (r : UnifiedStream2) : + unionAll [] r = r := List.nil_append r + +theorem unionAll_nil_right (l : UnifiedStream2) : + unionAll l [] = l := List.append_nil l + +theorem unionAll_assoc (a b c : UnifiedStream2) : + unionAll (unionAll a b) c = unionAll a (unionAll b c) := + List.append_assoc a b c + +/-! ## Negation + +Negate every diff in the stream via `DiffWithGlobal`'s `Neg`. `.val +d` becomes `.val (-d)`; `.global` is absorbing under negation. -/ + +def negate (us : UnifiedStream2) : UnifiedStream2 := + us.map fun rd => (rd.1, -rd.2) + +/-! ### Reduction lemmas -/ + +theorem negate_nil : negate [] = [] := rfl + +/-- `negate` distributes over `++`. 
-/ +theorem negate_append (a b : UnifiedStream2) : + negate (a ++ b) = negate a ++ negate b := by + show (a ++ b).map _ = a.map _ ++ b.map _ + exact List.map_append + +/-- Double negation is the identity. Lifted from +`DiffWithGlobal.neg_neg`. -/ +theorem negate_negate (us : UnifiedStream2) : + negate (negate us) = us := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨r, d⟩ := hd + show ((r, - -d) :: negate (negate tl)) = (r, d) :: tl + rw [ih, DiffWithGlobal.neg_neg] + +/-! ## `EXCEPT ALL` (signed) + +Bag difference via signed diffs: left ∪ (negation of right). The +clamp-to-nonnegative normalize step is deferred — this is the signed +flavor only. -/ + +def exceptAll (l r : UnifiedStream2) : UnifiedStream2 := + unionAll l (negate r) + +/-! ### Reduction lemmas -/ + +theorem exceptAll_nil_right (l : UnifiedStream2) : + exceptAll l [] = l := by + show unionAll l (negate []) = l + rw [negate_nil, unionAll_nil_right] + +end UnifiedStream2 + +end Mz diff --git a/doc/developer/semantics/Mz/UnifiedConsolidate2.lean b/doc/developer/semantics/Mz/UnifiedConsolidate2.lean new file mode 100644 index 0000000000000..a252abf83252f --- /dev/null +++ b/doc/developer/semantics/Mz/UnifiedConsolidate2.lean @@ -0,0 +1,81 @@ +import Mz.UnifiedStream2 + +/-! +# Row-keyed consolidation on `UnifiedStream2` + +The new diff encoding from `Mz/UnifiedStream2.lean` carries row-scoped +errors in `Diff.errs` and collection-scoped errors in +`DiffWithGlobal.global`. Bucketing for that encoding is structurally +identical to `Mz/UnifiedConsolidate.lean` — same per-key fold, same +encounter-order placement — but the carrier is just `Row` and the diff +is `DiffWithGlobal`. + +The headline laws (absorption of `global`, cardinality, carrier +uniqueness, retraction of row-scoped errs) are left for follow-up +modules; this file ships only the definition and the per-shape +reduction lemmas downstream proofs need to cite. 
-/ + +namespace Mz + +namespace UnifiedStream2 + +/-- Insert `(r, d)` into a consolidated stream. If a record with the +same `Row` carrier already exists, add `d` to its diff via +`DiffWithGlobal`'s `+`. Otherwise append a new record at the end of +the list. Exposed (not `private`) so downstream files can state laws +about it. -/ +def consolidateInto (r : Row) (d : DiffWithGlobal) : + UnifiedStream2 → UnifiedStream2 + | [] => [(r, d)] + | (r', d') :: rest => + if r = r' then (r', d + d') :: rest + else (r', d') :: consolidateInto r d rest + +/-- Sum diffs per `Row` carrier across the stream. Order of distinct +carriers follows encounter order from the right, mirroring the old +`UnifiedStream.consolidate`. -/ +def consolidate : UnifiedStream2 → UnifiedStream2 + | [] => [] + | (r, d) :: rest => consolidateInto r d (consolidate rest) + +/-! ### `consolidateInto` reduction lemmas + +Named per-shape reductions so proofs cite a single lemma instead of +unfolding the `if`-then-else by hand. -/ + +theorem consolidateInto_nil (r : Row) (d : DiffWithGlobal) : + consolidateInto r d [] = [(r, d)] := rfl + +/-- Inserting `(r, d)` into a list whose head carrier equals `r` +folds `d` into the head bucket and leaves the tail untouched. -/ +theorem consolidateInto_match + (r : Row) (d d' : DiffWithGlobal) (tl : UnifiedStream2) : + consolidateInto r d ((r, d') :: tl) = (r, d + d') :: tl := by + show (if r = r then (r, d + d') :: tl + else (r, d') :: consolidateInto r d tl) + = (r, d + d') :: tl + rw [if_pos rfl] + +/-- Inserting `(r, d)` into a list whose head carrier differs from +`r` keeps the head unchanged and recurses on the tail. -/ +theorem consolidateInto_skip + (r r' : Row) (d d' : DiffWithGlobal) (tl : UnifiedStream2) + (h : r ≠ r') : + consolidateInto r d ((r', d') :: tl) + = (r', d') :: consolidateInto r d tl := by + show (if r = r' then (r', d + d') :: tl + else (r', d') :: consolidateInto r d tl) + = (r', d') :: consolidateInto r d tl + rw [if_neg h] + +/-! 
## Trivial cases -/ + +theorem consolidate_nil : + consolidate [] = [] := rfl + +theorem consolidate_singleton (r : Row) (d : DiffWithGlobal) : + consolidate [(r, d)] = [(r, d)] := rfl + +end UnifiedStream2 + +end Mz From 800f2b4186699b603ec9022a9783e8b057ac9fd8 Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Wed, 20 May 2026 16:08:48 -0400 Subject: [PATCH 125/127] doc/semantics: TimedConsolidate2 + Consolidate2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * TimedConsolidate2: TimedUnifiedStream2 := List (Row × Nat × DiffWithGlobal). advanceFrontier (max-based), atTime (filterMap drop), consolidateAtTime composing UnifiedStream2.consolidate. * Consolidate2: DiffWithGlobal.sumAll right-fold from 0. Plus sumAll_eq_global_of_mem (absorption), sumAll_val_of_all_val (no-escalation), sumAll_global_inv (reverse direction). 40 lean build jobs green. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 2 + doc/developer/semantics/Mz/Consolidate2.lean | 114 ++++++++++++++++++ .../semantics/Mz/TimedConsolidate2.lean | 86 +++++++++++++ 3 files changed, 202 insertions(+) create mode 100644 doc/developer/semantics/Mz/Consolidate2.lean create mode 100644 doc/developer/semantics/Mz/TimedConsolidate2.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 72847371ceb37..57e05dcee34b5 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -20,8 +20,10 @@ import Mz.UnifiedStream2 import Mz.UnifiedConsolidate import Mz.UnifiedConsolidate2 import Mz.TimedConsolidate +import Mz.TimedConsolidate2 import Mz.Aggregate import Mz.Consolidate +import Mz.Consolidate2 import Mz.Triple import Mz.Join import Mz.Join2 diff --git a/doc/developer/semantics/Mz/Consolidate2.lean b/doc/developer/semantics/Mz/Consolidate2.lean new file mode 100644 index 0000000000000..1b840d839f228 --- /dev/null +++ b/doc/developer/semantics/Mz/Consolidate2.lean @@ -0,0 +1,114 
@@ +import Mz.DiffWithGlobal + +/-! +# Consolidation under the two-layer diff + +`Mz/Consolidate.lean` consolidates a list of `DiffWithError α` +values, where a single `error` absorbs the entire sum. That model +conflates row-scoped errors (retractable, kept in `Diff.errs`) with +collection-scoped errors (terminal, encoded as +`DiffWithGlobal.global`). See +`doc/developer/design/20260517_error_handling_semantics.md`, section +"Global-scoped errors: absorbing diff marker". + +This file mirrors `Mz/Consolidate.lean` for the new +`DiffWithGlobal` carrier: the absorbing element is now +`DiffWithGlobal.global`, and the row-scoped err counts inside +`Diff` are summed pointwise rather than absorbing. The absorption +proofs cite `DiffWithGlobal.global_add_left/right` and +`add_eq_global_left_or_right` directly, the same way the old file +cited the corresponding `DiffWithError` laws. + +The honest sum / converse-absorption pair argues that downstream +operators may treat a `global` consolidated diff as proof that some +input record contributed `global`, and conversely that an all-`val` +input list stays in the `val` branch — i.e. row-scoped err counts +alone never escalate consolidation to `global`. +-/ + +namespace Mz + +namespace DiffWithGlobal + +/-- Sum a list of diffs, right-associative fold from `0`. Mirrors +`DiffWithError.sumAll`; `+` on `DiffWithGlobal` is commutative and associative +(`add_comm`, `add_assoc`), so the result is order-independent (not proved here). -/ +def sumAll : List DiffWithGlobal → DiffWithGlobal + | [] => 0 + | d :: rest => d + sumAll rest + +/-! ## Absorption -/ + +/-- If `global` appears anywhere in the diff list, the consolidated +sum is `global`. The proof walks the list and uses the absorption +laws from `Mz/DiffWithGlobal.lean` at the matching cons. 
-/ +theorem sumAll_eq_global_of_mem + {ds : List DiffWithGlobal} (h : DiffWithGlobal.global ∈ ds) : + sumAll ds = DiffWithGlobal.global := by + induction ds with + | nil => cases h + | cons hd tl ih => + cases h with + | head _ => + -- hd = global; first cons step is `global + sumAll tl`, which is `global`. + show (global : DiffWithGlobal) + sumAll tl = global + exact global_add_left _ + | tail _ h_tl => + -- global is in tl. By IH `sumAll tl = global`, then `hd + global = global`. + show hd + sumAll tl = global + rw [ih h_tl] + exact global_add_right hd + +/-! ## No-global preservation + +If every diff in the list is an honest `val d`, the consolidated +sum is also `val` of *some* `Diff`. The exact value depends on +`Diff`'s addition, which this file does not commit to here. -/ + +theorem sumAll_val_of_all_val + {ds : List DiffWithGlobal} + (h : ∀ d ∈ ds, ∃ x : Diff, d = val x) : + ∃ x : Diff, sumAll ds = val x := by + induction ds with + | nil => + show ∃ x : Diff, (0 : DiffWithGlobal) = val x + refine ⟨0, ?_⟩ + rfl + | cons hd tl ih => + obtain ⟨xh, hh_eq⟩ := h hd (List.Mem.head _) + have htl : ∀ d ∈ tl, ∃ x : Diff, d = val x := + fun d hd_mem => h d (List.Mem.tail _ hd_mem) + obtain ⟨xt, ht_eq⟩ := ih htl + refine ⟨xh + xt, ?_⟩ + show hd + sumAll tl = val (xh + xt) + rw [hh_eq, ht_eq] + rfl + +/-- Reverse direction: a `sumAll` equal to `.global` witnesses a +`.global` summand somewhere in the list. The `.val + .val` arm of +`+` never returns `.global`, so a `.global` total rules in at +least one `.global` input. 
-/ +theorem sumAll_global_inv + {ds : List DiffWithGlobal} + (h : sumAll ds = DiffWithGlobal.global) : + ∃ d ∈ ds, d = DiffWithGlobal.global := by + induction ds with + | nil => + -- sumAll [] = 0 = val 0 ≠ global + show ∃ d ∈ ([] : List DiffWithGlobal), d = global + exfalso + have h0 : (0 : DiffWithGlobal) = (global : DiffWithGlobal) := h + have : (DiffWithGlobal.val (0 : Diff) : DiffWithGlobal) + = (DiffWithGlobal.global : DiffWithGlobal) := h0 + cases this + | cons hd tl ih => + show ∃ d ∈ hd :: tl, d = global + have hSum : hd + sumAll tl = global := h + rcases add_eq_global_left_or_right hd (sumAll tl) hSum with hHd | hTl + · exact ⟨hd, List.mem_cons_self, hHd⟩ + · obtain ⟨d, hMem, hD⟩ := ih hTl + exact ⟨d, List.mem_cons_of_mem _ hMem, hD⟩ + +end DiffWithGlobal + +end Mz diff --git a/doc/developer/semantics/Mz/TimedConsolidate2.lean b/doc/developer/semantics/Mz/TimedConsolidate2.lean new file mode 100644 index 0000000000000..f92b93ea259d9 --- /dev/null +++ b/doc/developer/semantics/Mz/TimedConsolidate2.lean @@ -0,0 +1,86 @@ +import Mz.UnifiedStream2 +import Mz.UnifiedConsolidate2 +import Mz.DiffWithGlobal + +/-! +# Per-`(row, time)` consolidation on `UnifiedStream2` + +Mirror of `Mz/TimedConsolidate.lean` over the new diff encoding from +`Mz/UnifiedStream2.lean`. The carrier is just `Row` (no `.err` +constructor) and the diff is `DiffWithGlobal`, so a timed record is +the triple `(Row, Nat, DiffWithGlobal)`. The per-time slice +`atTime t` keeps only records at time `t` and forgets the time +component, producing a plain `UnifiedStream2`. Composing with +`UnifiedStream2.consolidate` gives `consolidateAtTime`. + +The frontier operator `advanceFrontier f` lifts each record's time +to at least `f` via `Nat.max`, identical in shape to the old module. + +Only the definition and the trivial per-shape reductions ship here; +absorption, cardinality, and retraction laws are left for follow-up +modules in keeping with `Mz/UnifiedConsolidate2.lean`. 
-/ + +namespace Mz + +/-- A timed record on the new unified stream: carrier, time, diff. +The carrier is the plain `Row` from `Mz/UnifiedStream2.lean`; the +diff is `DiffWithGlobal`, which separates row-scoped errs (inside +`Diff.errs`) from collection-scoped errs (`DiffWithGlobal.global`). -/ +abbrev TimedUnifiedRecord2 := Row × Nat × DiffWithGlobal + +/-- Differential-dataflow-style stream of timed records over the new +diff encoding. -/ +abbrev TimedUnifiedStream2 := List TimedUnifiedRecord2 + +/-! ## Frontier advance + +Records with time strictly before frontier `f` are "advanced" to +`f` (their time is updated to `f`), making the past immutable. +Records at or past `f` are left untouched. The encoding tracks +frontiers as a single `Nat`, sufficient to state the algebraic +laws. -/ + +/-- Advance every record's time to at least `f`. Records originally +at time `< f` move to `f`; records already at `≥ f` stay. -/ +def TimedUnifiedStream2.advanceFrontier (f : Nat) (s : TimedUnifiedStream2) : + TimedUnifiedStream2 := + s.map fun r => (r.1, Nat.max r.2.1 f, r.2.2) + +/-! ## Time-slice projection -/ + +/-- Project a timed stream to the time slice at `t`. Records at +other times are dropped; the time component is forgotten, producing +an ordinary `UnifiedStream2`. -/ +def TimedUnifiedStream2.atTime (t : Nat) (s : TimedUnifiedStream2) : + UnifiedStream2 := + s.filterMap fun r => + if r.2.1 = t then some (r.1, r.2.2) else none + +/-- Bucket records at time `t` by carrier and sum their diffs. -/ +def TimedUnifiedStream2.consolidateAtTime (t : Nat) (s : TimedUnifiedStream2) : + UnifiedStream2 := + UnifiedStream2.consolidate (TimedUnifiedStream2.atTime t s) + +/-! ## Reduction lemmas + +Named per-shape reductions so downstream proofs cite a single +lemma instead of unfolding `map` / `filterMap` inline. 
-/ + +theorem TimedUnifiedStream2.advanceFrontier_nil (f : Nat) : + TimedUnifiedStream2.advanceFrontier f [] = [] := rfl + +theorem TimedUnifiedStream2.advanceFrontier_append + (f : Nat) (a b : TimedUnifiedStream2) : + TimedUnifiedStream2.advanceFrontier f (a ++ b) + = TimedUnifiedStream2.advanceFrontier f a + ++ TimedUnifiedStream2.advanceFrontier f b := by + show (a ++ b).map _ = a.map _ ++ b.map _ + exact List.map_append + +theorem TimedUnifiedStream2.atTime_nil (t : Nat) : + TimedUnifiedStream2.atTime t [] = [] := rfl + +theorem TimedUnifiedStream2.consolidateAtTime_nil (t : Nat) : + TimedUnifiedStream2.consolidateAtTime t [] = [] := rfl + +end Mz From 3f435c7aeb1d8a4ba6a8b5d422dd32a98f9629bf Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Wed, 20 May 2026 16:19:16 -0400 Subject: [PATCH 126/127] doc/semantics: Triple2 collection-wide consolidation TimedUnifiedStream2.consolidateAll + consolidateAtTimeFlat. Cite DiffWithGlobal.sumAll_eq_global_of_mem and sumAll_global_inv for absorption + inversion. Iff forms wrap both directions. 41 lean build jobs green. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 1 + doc/developer/semantics/Mz/Triple2.lean | 126 ++++++++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 doc/developer/semantics/Mz/Triple2.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index 57e05dcee34b5..e8143e222a432 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -25,6 +25,7 @@ import Mz.Aggregate import Mz.Consolidate import Mz.Consolidate2 import Mz.Triple +import Mz.Triple2 import Mz.Join import Mz.Join2 import Mz.JoinPushdown diff --git a/doc/developer/semantics/Mz/Triple2.lean b/doc/developer/semantics/Mz/Triple2.lean new file mode 100644 index 0000000000000..31fb5d04da914 --- /dev/null +++ b/doc/developer/semantics/Mz/Triple2.lean @@ -0,0 +1,126 @@ +import Mz.UnifiedStream2 +import Mz.TimedConsolidate2 +import Mz.DiffWithGlobal +import Mz.Consolidate2 + +/-! +# Collection-wide diff sum on the new timed unified stream + +Mirror of `Mz/Triple.lean` over the two-layer diff encoding from +`Mz/DiffWithGlobal.lean`. The carrier here is `TimedUnifiedRecord2` +from `Mz/TimedConsolidate2.lean`, which pairs a `Row` (the new +err-free carrier) with a `Nat` time and a `DiffWithGlobal` diff. +Row-scoped errs now live inside `Diff.errs`; collection-scoped errs +live in the diff via the absorbing `.global` marker. + +The two consolidation views below do *not* bucket by row: + +* `consolidateAll`: sum every diff in the stream, ignoring row and + time. The collection-wide diff. +* `consolidateAtTimeFlat`: sum every diff at time `t`, ignoring row. + The per-time collection diff. + +Both reduce to `DiffWithGlobal.sumAll`, so the absorption laws from +`Mz/Consolidate2.lean` transport directly: a `.global` diff anywhere +in the consolidated range forces the consolidated total to +`.global`. 
+ +For per-`(row, time)` bucketing — where the output is itself a +`UnifiedStream2`, one record per surviving carrier with the bucket's +summed diff — use `TimedUnifiedStream2.consolidateAtTime` in +`Mz/TimedConsolidate2.lean`. The two views are complementary: this +file collapses time slices to a single diff value; the +TimedConsolidate2 view collapses each `(row, time)` bucket +separately. +-/ + +namespace Mz + +/-- Sum every diff in the stream, ignoring row and time. -/ +def TimedUnifiedStream2.consolidateAll (s : TimedUnifiedStream2) : DiffWithGlobal := + DiffWithGlobal.sumAll (s.map (·.2.2)) + +/-- Sum every diff at a given time, ignoring row. -/ +def TimedUnifiedStream2.consolidateAtTimeFlat + (t : Nat) (s : TimedUnifiedStream2) : DiffWithGlobal := + DiffWithGlobal.sumAll ((s.filter (·.2.1 = t)).map (·.2.2)) + +/-! ## Absorption -/ + +/-- A `.global` diff anywhere in the stream forces the +collection-wide consolidation to `.global`. -/ +theorem TimedUnifiedStream2.consolidateAll_eq_global_of_mem + {s : TimedUnifiedStream2} (r : TimedUnifiedRecord2) + (h_mem : r ∈ s) (h_g : r.2.2 = DiffWithGlobal.global) : + TimedUnifiedStream2.consolidateAll s = DiffWithGlobal.global := by + unfold TimedUnifiedStream2.consolidateAll + apply DiffWithGlobal.sumAll_eq_global_of_mem + refine List.mem_map.mpr ⟨r, h_mem, ?_⟩ + exact h_g + +/-- Restricted to a time slice: a `.global` record at time `t` +forces the per-time flat consolidation at `t` to `.global`. -/ +theorem TimedUnifiedStream2.consolidateAtTimeFlat_eq_global_of_mem + {s : TimedUnifiedStream2} (t : Nat) (r : TimedUnifiedRecord2) + (h_mem : r ∈ s) (h_time : r.2.1 = t) + (h_g : r.2.2 = DiffWithGlobal.global) : + TimedUnifiedStream2.consolidateAtTimeFlat t s = DiffWithGlobal.global := by + unfold TimedUnifiedStream2.consolidateAtTimeFlat + apply DiffWithGlobal.sumAll_eq_global_of_mem + refine List.mem_map.mpr ⟨r, ?_, h_g⟩ + exact List.mem_filter.mpr ⟨h_mem, by simp [h_time]⟩ + +/-! 
## Reverse direction: from `.global` total to `.global` record -/ + +/-- If the collection-wide consolidation is `.global`, at least one +record in the stream carries a `.global` diff. The converse of +`consolidateAll_eq_global_of_mem`. -/ +theorem TimedUnifiedStream2.consolidateAll_global_inv + {s : TimedUnifiedStream2} + (h : TimedUnifiedStream2.consolidateAll s = DiffWithGlobal.global) : + ∃ r ∈ s, r.2.2 = DiffWithGlobal.global := by + unfold TimedUnifiedStream2.consolidateAll at h + obtain ⟨d, hMem, hD⟩ := DiffWithGlobal.sumAll_global_inv h + obtain ⟨r, hRMem, hRD⟩ := List.mem_map.mp hMem + exact ⟨r, hRMem, by rw [hRD]; exact hD⟩ + +/-- Time-slice version: a `.global` total at time `t` implies that some +record in the stream has time `t` and carries a `.global` diff. -/ +theorem TimedUnifiedStream2.consolidateAtTimeFlat_global_inv + {s : TimedUnifiedStream2} (t : Nat) + (h : TimedUnifiedStream2.consolidateAtTimeFlat t s = DiffWithGlobal.global) : + ∃ r ∈ s, r.2.1 = t ∧ r.2.2 = DiffWithGlobal.global := by + unfold TimedUnifiedStream2.consolidateAtTimeFlat at h + obtain ⟨d, hMem, hD⟩ := DiffWithGlobal.sumAll_global_inv h + obtain ⟨r, hRMem, hRD⟩ := List.mem_map.mp hMem + have hRFilter : r ∈ s.filter (·.2.1 = t) := hRMem + rw [List.mem_filter] at hRFilter + refine ⟨r, hRFilter.1, ?_, ?_⟩ + · exact of_decide_eq_true hRFilter.2 + · rw [hRD]; exact hD + +/-! ## Round-trip iff forms + +Combine forward absorption with reverse inversion. The flat +consolidations exactly characterize the presence of a `.global` diff +in the stream (per-time slice for the time-aware version).
-/ + +theorem TimedUnifiedStream2.consolidateAll_eq_global_iff + (s : TimedUnifiedStream2) : + TimedUnifiedStream2.consolidateAll s = DiffWithGlobal.global + ↔ ∃ r ∈ s, r.2.2 = DiffWithGlobal.global := by + constructor + · exact TimedUnifiedStream2.consolidateAll_global_inv + · intro ⟨r, hMem, hG⟩ + exact TimedUnifiedStream2.consolidateAll_eq_global_of_mem r hMem hG + +theorem TimedUnifiedStream2.consolidateAtTimeFlat_eq_global_iff + (s : TimedUnifiedStream2) (t : Nat) : + TimedUnifiedStream2.consolidateAtTimeFlat t s = DiffWithGlobal.global + ↔ ∃ r ∈ s, r.2.1 = t ∧ r.2.2 = DiffWithGlobal.global := by + constructor + · exact TimedUnifiedStream2.consolidateAtTimeFlat_global_inv t + · intro ⟨r, hMem, hT, hG⟩ + exact TimedUnifiedStream2.consolidateAtTimeFlat_eq_global_of_mem t r hMem hT hG + +end Mz From 81359980bd88f13470de9b0362a04c72113c0a7e Mon Sep 17 00:00:00 2001 From: Moritz Hoffmann Date: Wed, 20 May 2026 17:08:00 -0400 Subject: [PATCH 127/127] doc/semantics: FilterFusion2 + Demand2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FilterFusion2 (331 lines): * predNoRowErr — quantifies over val records only (.global unconstrained). * filter_filter_fuse, filter_idem (unconditional!), filter_comm, filter_eval_eq. * filter_idem now needs no hypothesis: filter operates only on the diff, so a second pass sees the same row and same eval result; ErrCount.single _ 0 absorbs cleanly into prior errs. Demand2 (292 lines): * replaceAtRow always touches the row (no err-carrier carve-out). * filter_replaceAtRow_of_unused — full commute under colReferencesUnused. * project_replaceAtRow_of_unused — full commute under argsColRefUnusedList2 + es.length ≤ n. Width side condition handles project's row-rewrite asymmetry: when safe, output row is es-mapped and replaceAtRow on output touches a different position than on input. Putting n out-of-bounds in the projected row makes Env.replaceAt the identity there.
argsColRefUnusedList2 renamed to avoid clash with Demand.lean's identically-named predicate. 43 lean build jobs green. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/developer/semantics/Mz.lean | 2 + doc/developer/semantics/Mz/Demand2.lean | 292 +++++++++++++++ doc/developer/semantics/Mz/FilterFusion2.lean | 331 ++++++++++++++++++ 3 files changed, 625 insertions(+) create mode 100644 doc/developer/semantics/Mz/Demand2.lean create mode 100644 doc/developer/semantics/Mz/FilterFusion2.lean diff --git a/doc/developer/semantics/Mz.lean b/doc/developer/semantics/Mz.lean index e8143e222a432..c4aa5eacb5fee 100644 --- a/doc/developer/semantics/Mz.lean +++ b/doc/developer/semantics/Mz.lean @@ -30,8 +30,10 @@ import Mz.Join import Mz.Join2 import Mz.JoinPushdown import Mz.FilterFusion +import Mz.FilterFusion2 import Mz.ProjectFusion import Mz.Demand +import Mz.Demand2 import Mz.GroupBy import Mz.SetOps import Mz.SetOps2 diff --git a/doc/developer/semantics/Mz/Demand2.lean b/doc/developer/semantics/Mz/Demand2.lean new file mode 100644 index 0000000000000..ea6476a5f8661 --- /dev/null +++ b/doc/developer/semantics/Mz/Demand2.lean @@ -0,0 +1,292 @@ +import Mz.UnifiedStream2 +import Mz.ColRefs + +/-! +# Demand for UnifiedStream2: unused-column invariance + +Port of `Mz/Demand.lean` to the new `UnifiedStream2` carrier +(`Row × DiffWithGlobal`). The model is simpler than the old +`UnifiedStream`: + +* The carrier is just `Row` — there is no `.err` constructor to + carve around. `replaceAtRow` always applies `Env.replaceAt` to + the row component. +* `project` keeps the row on evaluation failure (the failure is + routed into the diff's `errs` component), so the + `IsPureData`-style hypothesis that gated the old project proof + is no longer required. 
+ +The headline theorems mirror the old file: `filter` commutes with +`replaceAtRow` under `colReferencesUnused`, and `project` +commutes with `replaceAtRow` under +`Expr.argsColRefUnusedList2` together with the projection-width +side condition `es.length ≤ n` (so the output row has no column +`n` to disturb). +-/ + +namespace Mz + +/-- Replace column `n` of every row in `us` with `v`. With the new +carrier there is no `.err` branch — every record's first +component is a `Row`, so the substitution applies uniformly. The +`DiffWithGlobal` component (whether `.val` or `.global`) is +preserved on every record. -/ +def UnifiedStream2.replaceAtRow (n : Nat) (v : Datum) (us : UnifiedStream2) : + UnifiedStream2 := + us.map fun rd => (Env.replaceAt rd.1 n v, rd.2) + +theorem UnifiedStream2.replaceAtRow_nil (n : Nat) (v : Datum) : + UnifiedStream2.replaceAtRow n v [] = [] := rfl + +theorem UnifiedStream2.replaceAtRow_append + (n : Nat) (v : Datum) (a b : UnifiedStream2) : + UnifiedStream2.replaceAtRow n v (a ++ b) + = UnifiedStream2.replaceAtRow n v a + ++ UnifiedStream2.replaceAtRow n v b := by + unfold UnifiedStream2.replaceAtRow + exact List.map_append + +/-! ## Filter invariance under unused-column replacement -/ + +/-- Filter commutes with `replaceAtRow n v` when the predicate +does not reference column `n`. Replacing the unused column on the +input then filtering equals filtering then replacing on the +output. Models `demand.rs`: an unused column is free to be +overwritten without affecting the filter result. + +Compared with the old `UnifiedStream` proof this version is +strictly simpler — every record has carrier `Row`, so there is no +`.err` carve-out, and the `.global` diff case becomes a clean +pass-through (the carrier is still a row that `replaceAtRow` +touches on both sides). 
-/ +theorem UnifiedStream2.filter_replaceAtRow_of_unused + (pred : Expr) (n : Nat) (v : Datum) (us : UnifiedStream2) + (h : pred.colReferencesUnused n = true) : + UnifiedStream2.filter pred (UnifiedStream2.replaceAtRow n v us) + = UnifiedStream2.replaceAtRow n v + (UnifiedStream2.filter pred us) := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨r, d⟩ := hd + have hConsAsApp : ((r, d) :: tl : UnifiedStream2) = [(r, d)] ++ tl := rfl + rw [hConsAsApp, UnifiedStream2.replaceAtRow_append, + UnifiedStream2.filter_append, UnifiedStream2.filter_append, + UnifiedStream2.replaceAtRow_append, ih] + congr 1 + cases d with + | global => + -- `.global` diff: filter passes through unchanged; the + -- carrier on both sides is `Env.replaceAt r n v`. + rfl + | val d => + -- Both pipelines reduce to a single record produced by + -- `filter`'s inner `match eval _ pred with ...`. The + -- `eval_replaceAt_of_unused` lemma bridges the input + -- substitution to the predicate result. + have hEval : eval (Env.replaceAt r n v) pred = eval r pred := + eval_replaceAt_of_unused r n v pred h + show [match eval (Env.replaceAt r n v) pred with + | .bool true => + (Env.replaceAt r n v, DiffWithGlobal.val d) + | .err e => + (Env.replaceAt r n v, DiffWithGlobal.val + { val := 0 + , errs := d.errs + ErrCount.single e d.val }) + | _ => + (Env.replaceAt r n v, DiffWithGlobal.val + { val := 0, errs := d.errs })] ++ [] + = UnifiedStream2.replaceAtRow n v + ([match eval r pred with + | .bool true => + (r, DiffWithGlobal.val d) + | .err e => + (r, DiffWithGlobal.val + { val := 0 + , errs := d.errs + ErrCount.single e d.val }) + | _ => + (r, DiffWithGlobal.val + { val := 0, errs := d.errs })] ++ []) + rw [hEval] + -- Match on `eval r pred`: each branch produces a single + -- record whose row component on the LHS is already + -- `Env.replaceAt r n v` and on the RHS becomes so after + -- `replaceAtRow n v`. 
+ cases eval r pred with + | bool b => + cases b with + | true => rfl + | false => rfl + | err _ => rfl + | int _ => rfl + | null => rfl + +/-! ## Project invariance under unused-column replacement -/ + +/-- Every expression in `es` has column `n` unused. Same predicate +as in the old `Mz/Demand.lean`. -/ +def Expr.argsColRefUnusedList2 (n : Nat) (es : List Expr) : Prop := + ∀ e ∈ es, e.colReferencesUnused n = true + +private theorem rowAllSafe_replaceAt_of_unused + (es : List Expr) (n : Nat) (v : Datum) (r : Row) + (h : Expr.argsColRefUnusedList2 n es) : + rowAllSafe es (Env.replaceAt r n v) = rowAllSafe es r := by + induction es with + | nil => rfl + | cons hd tl ih => + have hHd : hd.colReferencesUnused n = true := h hd List.mem_cons_self + have hTl : Expr.argsColRefUnusedList2 n tl := + fun e hMem => h e (List.mem_cons_of_mem _ hMem) + have hEval : eval (Env.replaceAt r n v) hd = eval r hd := + eval_replaceAt_of_unused r n v hd hHd + unfold rowAllSafe at ih ⊢ + rw [List.all_cons, List.all_cons, hEval, ih hTl] + +private theorem rowErrs_replaceAt_of_unused + (es : List Expr) (n : Nat) (v : Datum) (r : Row) + (h : Expr.argsColRefUnusedList2 n es) : + rowErrs es (Env.replaceAt r n v) = rowErrs es r := by + induction es with + | nil => rfl + | cons hd tl ih => + have hHd : hd.colReferencesUnused n = true := h hd List.mem_cons_self + have hTl : Expr.argsColRefUnusedList2 n tl := + fun e hMem => h e (List.mem_cons_of_mem _ hMem) + have hEval : eval (Env.replaceAt r n v) hd = eval r hd := + eval_replaceAt_of_unused r n v hd hHd + unfold rowErrs at ih ⊢ + rw [List.filterMap_cons, List.filterMap_cons, hEval, ih hTl] + +private theorem evalMap_replaceAt_of_unused + (es : List Expr) (n : Nat) (v : Datum) (r : Row) + (h : Expr.argsColRefUnusedList2 n es) : + es.map (eval (Env.replaceAt r n v)) = es.map (eval r) := by + induction es with + | nil => rfl + | cons hd tl ih => + have hHd : hd.colReferencesUnused n = true := h hd List.mem_cons_self + have hTl : 
Expr.argsColRefUnusedList2 n tl := + fun e hMem => h e (List.mem_cons_of_mem _ hMem) + have hEval : eval (Env.replaceAt r n v) hd = eval r hd := + eval_replaceAt_of_unused r n v hd hHd + show eval (Env.replaceAt r n v) hd :: tl.map _ + = eval r hd :: tl.map _ + rw [hEval, ih hTl] + +private theorem rowErrCount_replaceAt_of_unused + (es : List Expr) (n : Nat) (v : Datum) (r : Row) (k : Int) + (h : Expr.argsColRefUnusedList2 n es) : + UnifiedStream2.rowErrCount es (Env.replaceAt r n v) k + = UnifiedStream2.rowErrCount es r k := by + unfold UnifiedStream2.rowErrCount + rw [rowErrs_replaceAt_of_unused es n v r h] + +/-- Out-of-bounds replacement is a no-op. -/ +private theorem replaceAt_of_length_le : + ∀ (env : Env) (n : Nat) (v : Datum), env.length ≤ n → + Env.replaceAt env n v = env + | [], _, _, _ => rfl + | _ :: _, 0, _, h => by + -- `(_ :: _).length = _ + 1`, which cannot be `≤ 0`. + exact absurd h (Nat.not_succ_le_zero _) + | hd :: tl, n + 1, v, h => by + show hd :: Env.replaceAt tl n v = hd :: tl + have hTl : tl.length ≤ n := Nat.le_of_succ_le_succ h + rw [replaceAt_of_length_le tl n v hTl] + +/-- Length of `List.map` is the length of the underlying list. -/ +private theorem length_map_eval (es : List Expr) (r : Row) : + (es.map (eval r)).length = es.length := List.length_map _ + +/-- Project commutes with `replaceAtRow n v` when every expression +in `es` has column `n` unused and the projected row is too short +to contain a column at index `n` (`es.length ≤ n`). + +The width side condition handles project's row-rewrite asymmetry: +when `rowAllSafe es r = true` the output row is `es.map (eval r)`, +and `replaceAtRow n v` on the *output* side touches position `n` of +that projected row. The unused-column hypothesis makes the +projected row independent of the input's column `n` but says +nothing about what value happens to live at the *output's* column +`n`. 
Requiring `es.length ≤ n` puts the would-be substitution out +of bounds, where `Env.replaceAt` is the identity. + +The unsafe branch needs no width side condition: project preserves +the row there, so `replaceAtRow` on input and output coincide. -/ +theorem UnifiedStream2.project_replaceAtRow_of_unused + (es : List Expr) (n : Nat) (v : Datum) (us : UnifiedStream2) + (h : Expr.argsColRefUnusedList2 n es) (hLen : es.length ≤ n) : + UnifiedStream2.project es (UnifiedStream2.replaceAtRow n v us) + = UnifiedStream2.replaceAtRow n v + (UnifiedStream2.project es us) := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨r, d⟩ := hd + have hConsAsApp : ((r, d) :: tl : UnifiedStream2) = [(r, d)] ++ tl := rfl + rw [hConsAsApp, UnifiedStream2.replaceAtRow_append, + UnifiedStream2.project_append, UnifiedStream2.project_append, + UnifiedStream2.replaceAtRow_append, ih] + congr 1 + cases d with + | global => + -- `.global`: project passes the record through. Both + -- pipelines apply `replaceAtRow n v` exactly once. + rfl + | val d => + -- Both pipelines reduce to a single record produced by + -- `project`'s inner `if rowAllSafe es _ then ... else ...`. + show [if rowAllSafe es (Env.replaceAt r n v) then + (es.map (eval (Env.replaceAt r n v)), DiffWithGlobal.val d) + else + (Env.replaceAt r n v, DiffWithGlobal.val + { val := 0 + , errs := d.errs + + UnifiedStream2.rowErrCount es + (Env.replaceAt r n v) d.val })] ++ [] + = UnifiedStream2.replaceAtRow n v + ([if rowAllSafe es r then + (es.map (eval r), DiffWithGlobal.val d) + else + (r, DiffWithGlobal.val + { val := 0 + , errs := d.errs + + UnifiedStream2.rowErrCount es r d.val })] + ++ []) + rw [rowAllSafe_replaceAt_of_unused es n v r h] + by_cases hSafe : rowAllSafe es r = true + · -- Safe branch: output row is `es.map (eval _)`, diff kept. 
+ rw [if_pos hSafe, if_pos hSafe] + show [(es.map (eval (Env.replaceAt r n v)), DiffWithGlobal.val d)] + ++ [] + = UnifiedStream2.replaceAtRow n v + ([(es.map (eval r), DiffWithGlobal.val d)] ++ []) + rw [List.append_nil, List.append_nil] + show [(es.map (eval (Env.replaceAt r n v)), DiffWithGlobal.val d)] + = [(Env.replaceAt (es.map (eval r)) n v, DiffWithGlobal.val d)] + rw [evalMap_replaceAt_of_unused es n v r h] + -- `Env.replaceAt` on the projected row is a no-op because + -- the row is `es.length`-long and `n ≥ es.length`. + have hOob : (es.map (eval r)).length ≤ n := by + rw [length_map_eval]; exact hLen + rw [replaceAt_of_length_le (es.map (eval r)) n v hOob] + · -- Unsafe branch: row is preserved, diff valid count zeroed, + -- err-count entries added per erroring scalar. + rw [if_neg hSafe, if_neg hSafe] + show [(Env.replaceAt r n v, + DiffWithGlobal.val + { val := 0 + , errs := d.errs + + UnifiedStream2.rowErrCount es + (Env.replaceAt r n v) d.val })] ++ [] + = UnifiedStream2.replaceAtRow n v + ([(r, DiffWithGlobal.val + { val := 0 + , errs := d.errs + + UnifiedStream2.rowErrCount es r d.val })] + ++ []) + rw [rowErrCount_replaceAt_of_unused es n v r d.val h] + rfl + +end Mz diff --git a/doc/developer/semantics/Mz/FilterFusion2.lean b/doc/developer/semantics/Mz/FilterFusion2.lean new file mode 100644 index 0000000000000..8bb8cc3d86baf --- /dev/null +++ b/doc/developer/semantics/Mz/FilterFusion2.lean @@ -0,0 +1,331 @@ +import Mz.UnifiedStream2 +import Mz.DiffErrCount +import Mz.DiffWithGlobal +import Mz.Boolean +import Mz.Expr +import Mz.Eval +import Mz.Laws + +/-! +# Filter fusion on the new unified stream + +Mirror of `Mz/FilterFusion.lean` over `UnifiedStream2`. The new stream +keeps the row in the carrier and routes row-scoped predicate errors +through the `Diff.errs` axis (see +`doc/developer/design/20260517_error_handling_semantics.md`). +Collection-scoped errors live in the absorbing `.global` branch of +`DiffWithGlobal`. 
+ +The Rust optimizer's `fusion/filter.rs` pass collapses adjacent +filters: `filter p ∘ filter q ↝ filter (q ∧ p)`. As in the old model, +the denotational statement holds under a row-level err-freedom side +condition: neither predicate may evaluate to `.err _` on any data row +whose diff is `.val _`. + +The new filter never drops a record — it preserves the row and only +mutates the diff's `val` count (zeroing it on non-`.bool true` +results) and `errs` count (adding `e ↦ d.val` on `.err e`). +Consequently the side condition is purely about lining up the err +case with `evalAnd`'s `.bool false`-absorbs-everything clause: a +predicate `.err e` followed by `.bool false` (or vice versa) would +disagree with the fused form, which collapses to `.bool false` and +loses the err entry. + +`.global` diffs pass through both pipelines unchanged, so the err- +freedom hypothesis is only required on records whose diff is +`.val _`. -/ + +namespace Mz + +namespace UnifiedStream2 + +/-- Predicate err-freedom on a stream's data rows. A predicate is +*data-err-free* on `us` when, for every record `(r, .val d)` in `us`, +`eval r e` is not an `.err _`. Records with a `.global` diff carry no +constraint — the collection is already invalid, so the row's err +status is irrelevant to downstream consolidation. -/ +def predNoRowErr (e : Expr) (us : UnifiedStream2) : Prop := + ∀ rd ∈ us, ∀ d, rd.2 = DiffWithGlobal.val d → ∀ ev, eval rd.1 e ≠ Datum.err ev + +theorem predNoRowErr.tail {e : Expr} + {hd : Row × DiffWithGlobal} {tl : UnifiedStream2} + (h : predNoRowErr e (hd :: tl)) : + predNoRowErr e tl := + fun rd hMem => h rd (List.mem_cons_of_mem _ hMem) + +theorem predNoRowErr.head {e : Expr} + {hd : Row × DiffWithGlobal} {tl : UnifiedStream2} + (h : predNoRowErr e (hd :: tl)) : + ∀ d, hd.2 = DiffWithGlobal.val d → ∀ ev, eval hd.1 e ≠ Datum.err ev := + h hd List.mem_cons_self + +/-! 
## Per-record fusion at a data row + +Single-record filter pipelines line up with `filter (.and q p)` when +err-freedom holds at the row. The proof walks every non-err `Datum` +shape `eval r q` can produce; in each branch it walks every non-err +shape `eval r p` can produce. Unlike the old filter, the new filter +keeps the row in every branch — only the diff's `val` count and +`errs` count change — so the per-record case analysis only has to +match diff components, not list shapes. -/ + +/-- On a singleton `.val` record, `filter` yields a one-element list +whose record is chosen by a `match` on `eval r e`. -/ +private theorem filter_val_singleton (e : Expr) (r : Row) (d : Diff) : + filter e [(r, DiffWithGlobal.val d)] + = [(match eval r e with + | .bool true => (r, DiffWithGlobal.val d) + | .err ev => (r, DiffWithGlobal.val + { val := 0 + , errs := d.errs + ErrCount.single ev d.val }) + | _ => (r, DiffWithGlobal.val { val := 0, errs := d.errs }))] := by + show (match eval r e with + | .bool true => _ + | .err ev => _ + | _ => _) :: [] = _ + rfl + +/-- Filter passes a singleton `.global` record through unchanged. -/ +private theorem filter_global_singleton (e : Expr) (r : Row) : + filter e [(r, DiffWithGlobal.global)] = [(r, DiffWithGlobal.global)] := rfl + +private theorem filter_fusion_val + (q p : Expr) (r : Row) (d : Diff) + (hQ : ∀ ev, eval r q ≠ Datum.err ev) + (hP : ∀ ev, eval r p ≠ Datum.err ev) : + filter p (filter q [(r, DiffWithGlobal.val d)]) + = filter (Expr.and q p) [(r, DiffWithGlobal.val d)] := by + rw [filter_val_singleton q r d, filter_val_singleton (Expr.and q p) r d] + have hEvalAnd : eval r (Expr.and q p) = evalAnd (eval r q) (eval r p) := by + simp only [eval] + rw [hEvalAnd] + -- The RHS is now a singleton list whose head is determined by + -- `evalAnd (eval r q) (eval r p)`. The LHS still needs `filter p` + -- to reduce on the singleton produced by the inner `filter q`.
+ cases hQ' : eval r q with + | err e => exact absurd hQ' (hQ e) + | bool b => + cases b with + | true => + -- Inner result: `(r, .val d)`. Outer filter on the same row. + rw [filter_val_singleton p r d] + cases hP' : eval r p with + | err e => exact absurd hP' (hP e) + | bool b' => cases b' with | true => rfl | false => rfl + | null => rfl + | int _ => rfl + | false => + -- Inner result: `(r, .val { val := 0, errs := d.errs })`. + -- Outer filter sees the same row again. + rw [filter_val_singleton p r { val := 0, errs := d.errs }] + cases hP' : eval r p with + | err e => exact absurd hP' (hP e) + | bool b' => cases b' with | true => rfl | false => rfl + | null => rfl + | int _ => rfl + | null => + rw [filter_val_singleton p r { val := 0, errs := d.errs }] + cases hP' : eval r p with + | err e => exact absurd hP' (hP e) + | bool b => cases b with | true => rfl | false => rfl + | null => rfl + | int _ => rfl + | int k => + rw [filter_val_singleton p r { val := 0, errs := d.errs }] + cases hP' : eval r p with + | err e => exact absurd hP' (hP e) + | bool b => + cases b with + | true => + -- `evalAnd (.int k) (.bool true) = .int k`; falls in the `_` arm. + rfl + | false => rfl + | null => rfl + | int m => + by_cases hKM : k = m + · have hEA : evalAnd (Datum.int k) (Datum.int m) = Datum.int k := by + show (if k = m then Datum.int k else Datum.null) = Datum.int k + rw [if_pos hKM] + rw [hEA] + · have hEA : evalAnd (Datum.int k) (Datum.int m) = Datum.null := by + show (if k = m then Datum.int k else Datum.null) = Datum.null + rw [if_neg hKM] + rw [hEA] + +/-! ## Main fusion theorem -/ + +/-- Adjacent filters fuse: `filter p ∘ filter q = filter (.and q p)` +when neither predicate triggers an `.err` on any `.val`-diff record +of the input stream. `.global`-diff records pass through both +pipelines unchanged. 
-/ +theorem filter_filter_fuse + (q p : Expr) (us : UnifiedStream2) + (hQ : predNoRowErr q us) + (hP : predNoRowErr p us) : + filter p (filter q us) = filter (Expr.and q p) us := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨r, dg⟩ := hd + have hTlQ : predNoRowErr q tl := hQ.tail + have hTlP : predNoRowErr p tl := hP.tail + have hConsAsApp : ((r, dg) :: tl : UnifiedStream2) = [(r, dg)] ++ tl := rfl + rw [hConsAsApp, filter_append, filter_append, filter_append, ih hTlQ hTlP] + congr 1 + cases dg with + | global => + -- `.global` passes through both pipelines, including the + -- fused-predicate one. + rfl + | val d => + have hQr : ∀ ev, eval r q ≠ Datum.err ev := + hQ.head d rfl + have hPr : ∀ ev, eval r p ≠ Datum.err ev := + hP.head d rfl + exact filter_fusion_val q p r d hQr hPr + +/-! ## Idempotence (no hypothesis required) + +Unlike the old `UnifiedStream.filter`, the new filter never drops a +record — every input record produces exactly one output record with +the same row. Idempotence holds unconditionally: the second pass +sees the same `eval r pred`, so it takes the same match arm, and that +arm leaves the already-transformed diff unchanged. -/ + +/-- Helper: `ErrCount.single e 0 = 0` pointwise. The conditional +returns `0` on both branches. -/ +private theorem errCount_single_zero (e : EvalError) : + ErrCount.single e 0 = (0 : ErrCount) := by + funext e' + show (if e = e' then (0 : Int) else 0) = 0 + split <;> rfl + +/-- Helper: the err-arm output of `filterOne` is a fixed point of +the err-arm of `filterOne`. After one application, the diff's +`val` is zero, so the next err-arm adds `ErrCount.single e 0 = 0`.
-/ +private theorem filterOne_err_idem (r : Row) (d : Diff) (e : EvalError) : + (r, DiffWithGlobal.val + { val := 0 + , errs := (d.errs + ErrCount.single e d.val) + ErrCount.single e 0 }) + = (r, DiffWithGlobal.val + { val := 0 + , errs := d.errs + ErrCount.single e d.val }) := by + congr 2 + rw [errCount_single_zero, ErrCount.add_zero] + +/-- Filter is idempotent: applying the same predicate twice equals +applying it once. Holds unconditionally because the new filter +preserves rows in every branch and the diff transformation is +fixed by re-application. -/ +theorem filter_idem (pred : Expr) (us : UnifiedStream2) : + filter pred (filter pred us) = filter pred us := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨r, dg⟩ := hd + have hConsAsApp : ((r, dg) :: tl : UnifiedStream2) = [(r, dg)] ++ tl := rfl + rw [hConsAsApp, filter_append, filter_append, ih] + congr 1 + cases dg with + | global => rfl + | val d => + rw [filter_val_singleton pred r d] + cases hEval : eval r pred with + | bool b => + cases b with + | true => + -- First filter keeps `(r, .val d)`; second filter sees the + -- same `eval r pred = .bool true` and keeps it again. + rw [filter_val_singleton pred r d, hEval] + | false => + -- First filter outputs `(r, .val { val := 0, errs := d.errs })`; + -- second filter on `eval r pred = .bool false` zeros the val + -- again, leaving the errs unchanged. + rw [filter_val_singleton pred r { val := 0, errs := d.errs }, hEval] + | err e => + -- First filter outputs `(r, .val { val := 0, errs := d.errs + single e d.val })`; + -- second filter sees `eval r pred = .err e` and adds + -- `single e 0 = 0`, leaving the diff unchanged. 
+ rw [filter_val_singleton pred r { val := 0 + , errs := d.errs + ErrCount.single e d.val }, + hEval] + show [_] = [_] + congr 1 + exact filterOne_err_idem r d e + | null => + rw [filter_val_singleton pred r { val := 0, errs := d.errs }, hEval] + | int _ => + rw [filter_val_singleton pred r { val := 0, errs := d.errs }, hEval] + +/-! ## Filter under eval-equivalent predicates + +Two predicates that agree on every `.val`-diff row produce equal +outputs. The proof walks the stream and reduces to per-record +equality via `filter_val_singleton` for `.val` and definitional +equality for `.global`. -/ + +/-- Filters with eval-equivalent predicates on every `.val`-diff +row produce equal outputs. Useful for re-associating or re-ordering +fused predicates without re-running the full filter analysis. -/ +theorem filter_eval_eq + (p q : Expr) (us : UnifiedStream2) + (h : ∀ rd ∈ us, ∀ d, rd.2 = DiffWithGlobal.val d → eval rd.1 p = eval rd.1 q) : + filter p us = filter q us := by + induction us with + | nil => rfl + | cons hd tl ih => + obtain ⟨r, dg⟩ := hd + have hTl : ∀ rd ∈ tl, ∀ d, rd.2 = DiffWithGlobal.val d → + eval rd.1 p = eval rd.1 q := + fun rd hMem => h rd (List.mem_cons_of_mem _ hMem) + have hConsAsApp : ((r, dg) :: tl : UnifiedStream2) = [(r, dg)] ++ tl := rfl + rw [hConsAsApp, filter_append, filter_append, ih hTl] + congr 1 + cases dg with + | global => rfl + | val d => + have hRow : eval r p = eval r q := + h (r, DiffWithGlobal.val d) List.mem_cons_self d rfl + rw [filter_val_singleton p r d, filter_val_singleton q r d, hRow] + +/-! ## Filter commutativity (under err-freedom) + +Two filters commute when neither predicate errors on any `.val`-diff +row. Reduces to `filter_filter_fuse` applied both ways, then equates +`.and q p` with `.and p q` via `evalAnd_comm_of_no_err`. -/ + +/-- A `Datum` that differs from every `.err ev` is not `IsErr`. Restates +the err-freedom hypothesis used here in the form +`evalAnd_comm_of_no_err` expects (`¬d.IsErr`).
-/ +private theorem datum_not_isErr_of_no_err {d : Datum} + (h : ∀ ev, d ≠ Datum.err ev) : ¬d.IsErr := by + cases d with + | err e => exact absurd rfl (h e) + | bool _ => exact id + | int _ => exact id + | null => exact id + +/-- Filters commute when neither predicate errors on any `.val`-diff +row of the input. -/ +theorem filter_comm + (q p : Expr) (us : UnifiedStream2) + (hQ : predNoRowErr q us) + (hP : predNoRowErr p us) : + filter p (filter q us) = filter q (filter p us) := by + rw [filter_filter_fuse q p us hQ hP, filter_filter_fuse p q us hP hQ] + apply filter_eval_eq + intro rd hMem d hVal + have hQr : ∀ ev, eval rd.1 q ≠ Datum.err ev := hQ rd hMem d hVal + have hPr : ∀ ev, eval rd.1 p ≠ Datum.err ev := hP rd hMem d hVal + have hEvalAndQP : eval rd.1 (Expr.and q p) = evalAnd (eval rd.1 q) (eval rd.1 p) := by + simp only [eval] + have hEvalAndPQ : eval rd.1 (Expr.and p q) = evalAnd (eval rd.1 p) (eval rd.1 q) := by + simp only [eval] + rw [hEvalAndQP, hEvalAndPQ] + exact evalAnd_comm_of_no_err + (datum_not_isErr_of_no_err hQr) + (datum_not_isErr_of_no_err hPr) + +end UnifiedStream2 + +end Mz