From e9c9a5a2331d9fd55f6249ba6b6f45f1203049b5 Mon Sep 17 00:00:00 2001
From: Richard Iannone <riannone@me.com>
Date: Mon, 23 Feb 2026 12:25:25 -0500
Subject: [PATCH 01/11] Support governance, aggregates, and refs in YAML

---
 pointblank/yaml.py | 84 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 80 insertions(+), 4 deletions(-)

diff --git a/pointblank/yaml.py b/pointblank/yaml.py
index 6ccd05ef2..3ab325f1b 100644
--- a/pointblank/yaml.py
+++ b/pointblank/yaml.py
@@ -6,6 +6,7 @@
 
 import yaml
 
+from pointblank._agg import is_valid_agg
 from pointblank._utils import _is_lib_present
 from pointblank.thresholds import Actions
 from pointblank.validate import Validate, load_dataset
@@ -241,11 +242,13 @@ class YAMLValidator:
         "col_vals_increasing": "col_vals_increasing",
         "col_vals_decreasing": "col_vals_decreasing",
         "col_vals_within_spec": "col_vals_within_spec",
+        "col_pct_null": "col_pct_null",
         "rows_distinct": "rows_distinct",
         "rows_complete": "rows_complete",
         "col_count_match": "col_count_match",
         "row_count_match": "row_count_match",
         "col_schema_match": "col_schema_match",
+        "data_freshness": "data_freshness",
         "tbl_match": "tbl_match",
         "prompt": "prompt",
         "conjointly": "conjointly",
@@ -620,9 +623,13 @@ def _parse_validation_step(
         else:
             raise YAMLValidationError(f"Invalid step configuration type: {type(step_config)}")
 
-        # Validate that we know this method
-        if method_name not in self.validation_method_map:
-            available_methods = list(self.validation_method_map.keys())
+        # Validate that we know this method (static map or dynamic aggregate method)
+        if method_name not in self.validation_method_map and not is_valid_agg(method_name):
+            available_methods = list(self.validation_method_map.keys()) + [
+                "col_sum_*",
+                "col_avg_*",
+                "col_sd_*",
+            ]
             raise YAMLValidationError(
                 f"Unknown validation method '{method_name}'. Available methods: {available_methods}"
             )
@@ -693,7 +700,14 @@ def _parse_validation_step(
         if "inclusive" in parameters and isinstance(parameters["inclusive"], list):
             parameters["inclusive"] = tuple(parameters["inclusive"])
 
-        return self.validation_method_map[method_name], parameters
+        # Resolve the method name: static map takes priority, then dynamic aggregate methods
+        if method_name in self.validation_method_map:
+            resolved_name = self.validation_method_map[method_name]
+        else:
+            # Dynamic aggregate method (col_sum_gt, col_avg_eq, etc.)
+            resolved_name = method_name
+
+        return resolved_name, parameters
 
     def build_validation(
         self, config: dict, namespaces: Optional[Union[Iterable[str], Mapping[str, str]]] = None
@@ -750,6 +764,41 @@ def build_validation(
         if "brief" in config:
             validate_kwargs["brief"] = config["brief"]
 
+        # Set owner if provided (governance)
+        if "owner" in config:
+            validate_kwargs["owner"] = config["owner"]
+
+        # Set consumers if provided (governance)
+        if "consumers" in config:
+            validate_kwargs["consumers"] = config["consumers"]
+
+        # Set version if provided (governance)
+        if "version" in config:
+            validate_kwargs["version"] = config["version"]
+
+        # Set final_actions if provided
+        if "final_actions" in config:
+            from pointblank.thresholds import FinalActions
+
+            final_actions_spec = config["final_actions"]
+            # Process Python expressions in final_actions
+            processed_final_actions = _process_python_expressions(
+                final_actions_spec, namespaces=namespaces
+            )
+            if isinstance(processed_final_actions, list):
+                validate_kwargs["final_actions"] = FinalActions(*processed_final_actions)
+            elif callable(processed_final_actions):
+                validate_kwargs["final_actions"] = FinalActions(processed_final_actions)
+            else:
+                validate_kwargs["final_actions"] = processed_final_actions
+
+        # Set reference data if provided
+        if "reference" in config:
+            ref_spec = config["reference"]
+            # Process as a data source (could be a dataset name, file path, or python expression)
+            ref_data = self._load_data_source(ref_spec, df_library)
+            validate_kwargs["reference"] = ref_data
+
         validation = Validate(data, **validate_kwargs)
 
         # Add validation steps
@@ -1496,6 +1545,26 @@ def extract_python_expressions(obj, path=""):
         else:
             validate_args.append(f'brief="{config["brief"]}"')
 
+    # Add owner if present (governance)
+    if "owner" in config:
+        validate_args.append(f'owner="{config["owner"]}"')
+
+    # Add consumers if present (governance)
+    if "consumers" in config:
+        consumers = config["consumers"]
+        if isinstance(consumers, list):
+            if len(consumers) == 1:
+                validate_args.append(f'consumers="{consumers[0]}"')
+            else:
+                consumers_str = "[" + ", ".join([f'"{c}"' for c in consumers]) + "]"
+                validate_args.append(f"consumers={consumers_str}")
+        elif isinstance(consumers, str):
+            validate_args.append(f'consumers="{consumers}"')
+
+    # Add version if present (governance)
+    if "version" in config:
+        validate_args.append(f'version="{config["version"]}"')
+
     # Create the `pb.Validate()` call
     if len(validate_args) == 1:
         # Single argument fits on one line
@@ -1514,6 +1583,13 @@ def extract_python_expressions(obj, path=""):
     for step_index, step_config in enumerate(config["steps"]):
         # Get original expressions before parsing
         original_expressions = {}
+
+        # Handle string steps (parameterless methods like "rows_distinct")
+        if isinstance(step_config, str):
+            method_name, parameters = validator._parse_validation_step(step_config, namespaces=None)
+            code_lines.append(f"    .{method_name}()")
+            continue
+
         step_method = list(step_config.keys())[
             0
         ]  # Get the method name (conjointly, specially, etc.)

From 451b6c2ee6ab0df0ec31a8eb95105aee5da29aff Mon Sep 17 00:00:00 2001
From: Richard Iannone <riannone@me.com>
Date: Mon, 23 Feb 2026 12:25:40 -0500
Subject: [PATCH 02/11] Add YAML tests for new validations and governance

---
 tests/test_yaml.py | 376 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 376 insertions(+)

diff --git a/tests/test_yaml.py b/tests/test_yaml.py
index 8420772e4..7604c03f4 100644
--- a/tests/test_yaml.py
+++ b/tests/test_yaml.py
@@ -4838,3 +4838,379 @@ def test_new_methods_yaml_to_python_comprehensive():
     assert "increasing_tol=0.2" in python_code
     assert 'spec="url"' in python_code
     assert 'model="openai:gpt-4"' in python_code
+
+
+def test_yaml_col_pct_null_basic():
+    """Test col_pct_null validation via YAML"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_pct_null:
+        columns: c
+        p: 0.15
+    """
+    result = yaml_interrogate(yaml_content)
+    assert result is not None
+    assert len(result.validation_info) == 1
+    assert result.validation_info[0].assertion_type == "col_pct_null"
+
+
+def test_yaml_col_pct_null_with_tolerance():
+    """Test col_pct_null with tolerance parameter"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_pct_null:
+        columns: [c]
+        p: 0.15
+        tol: 0.1
+    """
+    result = yaml_interrogate(yaml_content)
+    assert result is not None
+    assert len(result.validation_info) == 1
+
+
+def test_yaml_data_freshness_basic():
+    """Test data_freshness validation via YAML"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - data_freshness:
+        column: date_time
+        max_age: "100000 hours"
+    """
+    result = yaml_interrogate(yaml_content)
+    assert result is not None
+    assert len(result.validation_info) == 1
+    assert result.validation_info[0].assertion_type == "data_freshness"
+
+
+def test_yaml_data_freshness_with_options():
+    """Test data_freshness with timezone and other options"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - data_freshness:
+        column: date_time
+        max_age: "876000 hours"
+        timezone: "UTC"
+        allow_tz_mismatch: true
+    """
+    result = yaml_interrogate(yaml_content)
+    assert result is not None
+    assert len(result.validation_info) == 1
+
+
+def test_yaml_aggregate_methods_basic():
+    """Test that aggregate methods (col_sum_*, col_avg_*, col_sd_*) work in YAML"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_sum_gt:
+        columns: a
+        value: 0
+    - col_avg_gt:
+        columns: a
+        value: 0
+    - col_sd_gt:
+        columns: d
+        value: 0
+    """
+    result = yaml_interrogate(yaml_content)
+    assert result is not None
+    assert len(result.validation_info) == 3
+    assert result.validation_info[0].assertion_type == "col_sum_gt"
+    assert result.validation_info[1].assertion_type == "col_avg_gt"
+    assert result.validation_info[2].assertion_type == "col_sd_gt"
+
+
+def test_yaml_aggregate_methods_all_comparators():
+    """Test all aggregate comparator variants"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_sum_eq:
+        columns: a
+        value: 6
+    - col_sum_ge:
+        columns: a
+        value: 0
+    - col_sum_gt:
+        columns: a
+        value: -1
+    - col_sum_lt:
+        columns: a
+        value: 100000
+    - col_sum_le:
+        columns: a
+        value: 100000
+    """
+    result = yaml_interrogate(yaml_content)
+    assert result is not None
+    assert len(result.validation_info) == 5
+
+
+def test_yaml_aggregate_methods_with_tolerance():
+    """Test aggregate methods with tolerance parameter"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_sum_eq:
+        columns: a
+        value: 6
+        tol: 1
+    - col_avg_eq:
+        columns: d
+        value: 5000
+        tol: 5000
+    """
+    result = yaml_interrogate(yaml_content)
+    assert result is not None
+    assert len(result.validation_info) == 2
+
+
+def test_yaml_aggregate_methods_with_thresholds():
+    """Test aggregate methods with thresholds"""
+    yaml_content = """
+    tbl: small_table
+    thresholds:
+      warning: 0.1
+    steps:
+    - col_sum_gt:
+        columns: a
+        value: 0
+        thresholds:
+          warning: 1
+    """
+    result = yaml_interrogate(yaml_content)
+    assert result is not None
+    assert len(result.validation_info) == 1
+
+
+def test_yaml_unknown_method_still_errors():
+    """Test that truly unknown methods still raise errors"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_totally_fake:
+        columns: a
+        value: 0
+    """
+    with pytest.raises(YAMLValidationError, match="Unknown validation method"):
+        yaml_interrogate(yaml_content)
+
+
+def test_yaml_governance_owner_param():
+    """Test that owner parameter is forwarded through YAML"""
+    yaml_content = """
+    tbl: small_table
+    owner: "data-platform-team"
+    steps:
+    - col_vals_not_null:
+        columns: a
+    """
+    result = yaml_interrogate(yaml_content)
+    assert result is not None
+    assert result.owner == "data-platform-team"
+
+
+def test_yaml_governance_consumers_param():
+    """Test that consumers parameter is forwarded through YAML"""
+    yaml_content = """
+    tbl: small_table
+    consumers:
+      - ml-team
+      - analytics
+    steps:
+    - col_vals_not_null:
+        columns: a
+    """
+    result = yaml_interrogate(yaml_content)
+    assert result is not None
+    assert result.consumers == ["ml-team", "analytics"]
+
+
+def test_yaml_governance_consumers_single():
+    """Test that a single consumer string is forwarded"""
+    yaml_content = """
+    tbl: small_table
+    consumers: analytics
+    steps:
+    - col_vals_not_null:
+        columns: a
+    """
+    result = yaml_interrogate(yaml_content)
+    assert result is not None
+    # Validate normalizes single string to list
+    assert result.consumers == ["analytics"]
+
+
+def test_yaml_governance_version_param():
+    """Test that version parameter is forwarded through YAML"""
+    yaml_content = """
+    tbl: small_table
+    version: "2.1.0"
+    steps:
+    - col_vals_not_null:
+        columns: a
+    """
+    result = yaml_interrogate(yaml_content)
+    assert result is not None
+    assert result.version == "2.1.0"
+
+
+def test_yaml_governance_all_params():
+    """Test all governance params together"""
+    yaml_content = """
+    tbl: small_table
+    owner: "data-team"
+    consumers:
+      - team-a
+      - team-b
+    version: "1.0.0"
+    steps:
+    - col_vals_not_null:
+        columns: a
+    """
+    result = yaml_interrogate(yaml_content)
+    assert result is not None
+    assert result.owner == "data-team"
+    assert result.consumers == ["team-a", "team-b"]
+    assert result.version == "1.0.0"
+
+
+def test_yaml_to_python_col_pct_null():
+    """Test yaml_to_python for col_pct_null"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_pct_null:
+        columns: c
+        p: 0.15
+        tol: 0.05
+    """
+    python_code = yaml_to_python(yaml_content)
+    assert ".col_pct_null(" in python_code
+    assert 'columns="c"' in python_code
+    assert "p=0.15" in python_code
+    assert "tol=0.05" in python_code
+
+
+def test_yaml_to_python_data_freshness():
+    """Test yaml_to_python for data_freshness"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - data_freshness:
+        column: date_time
+        max_age: "24 hours"
+        timezone: "UTC"
+    """
+    python_code = yaml_to_python(yaml_content)
+    assert ".data_freshness(" in python_code
+    assert 'column="date_time"' in python_code
+    assert 'max_age="24 hours"' in python_code
+    assert 'timezone="UTC"' in python_code
+
+
+def test_yaml_to_python_aggregate_methods():
+    """Test yaml_to_python for aggregate methods"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_sum_gt:
+        columns: a
+        value: 0
+    - col_avg_le:
+        columns: d
+        value: 10000
+    - col_sd_eq:
+        columns: d
+        value: 100
+        tol: 200
+    """
+    python_code = yaml_to_python(yaml_content)
+    assert ".col_sum_gt(" in python_code
+    assert ".col_avg_le(" in python_code
+    assert ".col_sd_eq(" in python_code
+    assert "tol=200" in python_code
+
+
+def test_yaml_to_python_governance_params():
+    """Test yaml_to_python includes governance parameters"""
+    yaml_content = """
+    tbl: small_table
+    owner: "data-team"
+    consumers:
+      - ml-team
+      - analytics
+    version: "1.0.0"
+    steps:
+    - col_vals_not_null:
+        columns: a
+    """
+    python_code = yaml_to_python(yaml_content)
+    assert 'owner="data-team"' in python_code
+    assert 'consumers=["ml-team", "analytics"]' in python_code
+    assert 'version="1.0.0"' in python_code
+
+
+def test_yaml_to_python_governance_single_consumer():
+    """Test yaml_to_python with single consumer string"""
+    yaml_content = """
+    tbl: small_table
+    consumers: analytics
+    steps:
+    - col_vals_not_null:
+        columns: a
+    """
+    python_code = yaml_to_python(yaml_content)
+    assert 'consumers="analytics"' in python_code
+
+
+def test_yaml_to_python_parameterless_step():
+    """Test yaml_to_python handles parameterless string steps (e.g., rows_distinct)"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - rows_distinct
+    - rows_complete
+    """
+    python_code = yaml_to_python(yaml_content)
+    assert ".rows_distinct()" in python_code
+    assert ".rows_complete()" in python_code
+
+
+def test_yaml_combined_new_and_existing_methods():
+    """Test YAML with a mix of existing and newly-added methods"""
+    yaml_content = """
+    tbl: small_table
+    owner: "team"
+    version: "1.0.0"
+    steps:
+    - col_vals_gt:
+        columns: a
+        value: 0
+    - col_pct_null:
+        columns: c
+        p: 0.15
+    - col_sum_gt:
+        columns: a
+        value: 0
+    - col_avg_gt:
+        columns: d
+        value: 0
+    - rows_distinct
+    """
+    result = yaml_interrogate(yaml_content)
+    assert result is not None
+    assert len(result.validation_info) == 5
+    assert result.owner == "team"
+    assert result.version == "1.0.0"
+
+    types = [vi.assertion_type for vi in result.validation_info]
+    assert "col_vals_gt" in types
+    assert "col_pct_null" in types
+    assert "col_sum_gt" in types
+    assert "col_avg_gt" in types
+    assert "rows_distinct" in types

From 39a5f8e5eeddd1d945d6e061f044e1e1a43d01d0 Mon Sep 17 00:00:00 2001
From: Richard Iannone <riannone@me.com>
Date: Mon, 23 Feb 2026 12:50:31 -0500
Subject: [PATCH 03/11] Validate YAML keys; add reference support

---
 pointblank/yaml.py | 47 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/pointblank/yaml.py b/pointblank/yaml.py
index 3ab325f1b..9a528ab82 100644
--- a/pointblank/yaml.py
+++ b/pointblank/yaml.py
@@ -326,6 +326,33 @@ def _validate_schema(self, config: dict) -> None:
         YAMLValidationError
             If the schema is invalid.
         """
+        # Define known top-level keys
+        known_keys = {
+            "tbl",
+            "steps",
+            "tbl_name",
+            "label",
+            "thresholds",
+            "actions",
+            "final_actions",
+            "brief",
+            "lang",
+            "locale",
+            "df_library",
+            "owner",
+            "consumers",
+            "version",
+            "reference",
+        }
+
+        # Reject unknown top-level keys (likely typos)
+        unknown_keys = set(config.keys()) - known_keys
+        if unknown_keys:
+            raise YAMLValidationError(
+                f"Unknown top-level key(s): {sorted(unknown_keys)}. "
+                f"Valid keys are: {sorted(known_keys)}"
+            )
+
         # Check required fields
         if "tbl" not in config:
             raise YAMLValidationError("YAML must contain 'tbl' field")
@@ -1420,6 +1447,11 @@ def yaml_to_python(yaml: Union[str, Path]) -> str:
     if isinstance(raw_config.get("tbl"), dict) and "python" in raw_config["tbl"]:
         original_tbl_expression = raw_config["tbl"]["python"].strip()
 
+    # Extract the original reference python expression if it exists
+    original_reference_expression = None
+    if isinstance(raw_config.get("reference"), dict) and "python" in raw_config["reference"]:
+        original_reference_expression = raw_config["reference"]["python"].strip()
+
     # Extract original Actions expressions if they exist
     original_actions_expressions = {}
     if "actions" in raw_config:
@@ -1565,6 +1597,21 @@ def extract_python_expressions(obj, path=""):
     if "version" in config:
         validate_args.append(f'version="{config["version"]}"')
 
+    # Add reference if present
+    if "reference" in config:
+        ref_spec = config["reference"]
+        if original_reference_expression:
+            # A `python:` block is emitted verbatim (whitespace-stripped)
+            validate_args.append(f"reference={original_reference_expression}")
+        elif isinstance(ref_spec, str):
+            # String specs (dataset names and `.csv`/`.parquet` paths alike) are
+            # all rendered as the same `pb.load_dataset()` call
+            # NOTE(review): confirm file paths should not use a different loader
+            # than `pb.load_dataset()` in the generated code
+            validate_args.append(
+                f'reference=pb.load_dataset("{ref_spec}", tbl_type="{df_library}")'
+            )
+
     # Create the `pb.Validate()` call
     if len(validate_args) == 1:
         # Single argument fits on one line

From 6c0192fef36f233ea4a2705209ae540d736dbfd0 Mon Sep 17 00:00:00 2001
From: Richard Iannone <riannone@me.com>
Date: Mon, 23 Feb 2026 12:50:45 -0500
Subject: [PATCH 04/11] Add tests for YAML unknown keys and references

---
 tests/test_yaml.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/tests/test_yaml.py b/tests/test_yaml.py
index 7604c03f4..1ba5d529d 100644
--- a/tests/test_yaml.py
+++ b/tests/test_yaml.py
@@ -5214,3 +5214,60 @@ def test_yaml_combined_new_and_existing_methods():
     assert "col_sum_gt" in types
     assert "col_avg_gt" in types
     assert "rows_distinct" in types
+
+
+def test_yaml_unknown_top_level_key_rejected():
+    """Test that unknown top-level keys (likely typos) are caught"""
+    yaml_content = """
+    tbl: small_table
+    tbl_nmae: "Typo Table Name"
+    steps:
+    - col_vals_not_null:
+        columns: a
+    """
+    with pytest.raises(YAMLValidationError, match="Unknown top-level key"):
+        yaml_interrogate(yaml_content)
+
+
+def test_yaml_unknown_top_level_key_error_message():
+    """Test that the error message includes the bad key and valid keys"""
+    yaml_content = """
+    tbl: small_table
+    labell: "typo"
+    steps:
+    - rows_distinct
+    """
+    with pytest.raises(YAMLValidationError, match="labell") as exc_info:
+        validate_yaml(yaml_content)
+    # Should also mention valid keys
+    assert "label" in str(exc_info.value)
+
+
+def test_yaml_to_python_reference():
+    """Test yaml_to_python renders reference parameter"""
+    yaml_content = """
+    tbl: small_table
+    reference: small_table
+    steps:
+    - col_vals_not_null:
+        columns: a
+    """
+    python_code = yaml_to_python(yaml_content)
+    assert "reference=" in python_code
+    assert 'pb.load_dataset("small_table"' in python_code
+
+
+def test_yaml_to_python_reference_with_python_expression():
+    """Test yaml_to_python renders reference with python: expression"""
+    yaml_content = """
+    tbl: small_table
+    reference:
+      python: |
+        pb.load_dataset("small_table", tbl_type="polars")
+    steps:
+    - col_vals_not_null:
+        columns: a
+    """
+    python_code = yaml_to_python(yaml_content)
+    assert "reference=" in python_code
+    assert 'pb.load_dataset("small_table", tbl_type="polars")' in python_code

From 8e8e3a464375ad04d9528ab93785f3381db8b859 Mon Sep 17 00:00:00 2001
From: Richard Iannone <riannone@me.com>
Date: Mon, 23 Feb 2026 13:43:03 -0500
Subject: [PATCH 05/11] Support shortcut syntax for `active` in YAML

---
 pointblank/yaml.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/pointblank/yaml.py b/pointblank/yaml.py
index 9a528ab82..d57683048 100644
--- a/pointblank/yaml.py
+++ b/pointblank/yaml.py
@@ -672,6 +672,10 @@ def _parse_validation_step(
             elif key == "pre" and isinstance(value, str):
                 # Treat string directly as Python code (shortcut syntax)
                 processed_parameters[key] = _safe_eval_python_code(value, namespaces=namespaces)
+            # Special case: `active=` parameter can use shortcut syntax for callables
+            # (e.g., `active: pb.has_columns("col_a")`); booleans bypass this branch
+            elif key == "active" and isinstance(value, str):
+                processed_parameters[key] = _safe_eval_python_code(value, namespaces=namespaces)
             else:
                 # Normal processing (requires python: block syntax)
                 processed_parameters[key] = _process_python_expressions(
@@ -1468,9 +1472,9 @@ def extract_python_expressions(obj, path=""):
             else:
                 for key, value in obj.items():
                     new_path = f"{path}.{key}" if path else key
-                    # Special handling for `expr=` and `pre=` parameters that
-                    # can use shortcut syntax
-                    if key in ["expr", "pre"] and isinstance(value, str):
+                    # Special handling for `expr=`, `pre=`, and `active=` parameters
+                    # that can use shortcut syntax
+                    if key in ["expr", "pre", "active"] and isinstance(value, str):
                         expressions[new_path] = value.strip()
                     # Special handling for actions that might contain python: expressions
                     elif key == "actions" and isinstance(value, dict):

From 8a401ab89e6290f05490ad96db58409e27e93f57 Mon Sep 17 00:00:00 2001
From: Richard Iannone <riannone@me.com>
Date: Mon, 23 Feb 2026 13:43:27 -0500
Subject: [PATCH 06/11] Add tests for YAML 'active' handling

---
 tests/test_yaml.py | 123 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)

diff --git a/tests/test_yaml.py b/tests/test_yaml.py
index 1ba5d529d..9f2562b99 100644
--- a/tests/test_yaml.py
+++ b/tests/test_yaml.py
@@ -5271,3 +5271,126 @@ def test_yaml_to_python_reference_with_python_expression():
     python_code = yaml_to_python(yaml_content)
     assert "reference=" in python_code
     assert 'pb.load_dataset("small_table", tbl_type="polars")' in python_code
+
+
+def test_yaml_active_boolean_false():
+    """Test active=false disables a step via YAML"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_vals_gt:
+        columns: d
+        value: 100
+        active: false
+    - col_vals_not_null:
+        columns: a
+    """
+    result = yaml_interrogate(yaml_content)
+    assert result is not None
+    assert len(result.validation_info) == 2
+    # First step should be inactive
+    assert result.validation_info[0].active is False
+    # Second step should be active (default)
+    assert result.validation_info[1].active is True
+
+
+def test_yaml_active_boolean_true():
+    """Test active=true keeps a step active via YAML"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_vals_gt:
+        columns: d
+        value: 100
+        active: true
+    """
+    result = yaml_interrogate(yaml_content)
+    assert result is not None
+    assert result.validation_info[0].active is True
+
+
+def test_yaml_active_callable_shortcut():
+    """Test active with callable shortcut syntax (e.g., has_columns)"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_vals_gt:
+        columns: d
+        value: 100
+        active: "pb.has_columns('d')"
+    - col_vals_gt:
+        columns: nonexistent_column
+        value: 0
+        active: "pb.has_columns('nonexistent_column')"
+    """
+    result = yaml_interrogate(yaml_content)
+    assert result is not None
+    assert len(result.validation_info) == 2
+
+    # First step: column 'd' exists, so active should be True and step should run
+    assert result.validation_info[0].active is not False
+
+    # Second step: column 'nonexistent_column' doesn't exist,
+    # so active callable returns False and step is inactive
+    assert result.validation_info[1].active is not True
+
+
+def test_yaml_active_python_block():
+    """Test active with python: block syntax"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_vals_gt:
+        columns: d
+        value: 100
+        active:
+          python: "pb.has_columns('d')"
+    """
+    result = yaml_interrogate(yaml_content)
+    assert result is not None
+    assert len(result.validation_info) == 1
+    # Column 'd' exists, so step should be active/run
+    assert result.validation_info[0].active is not False
+
+
+def test_yaml_to_python_active_boolean():
+    """Test yaml_to_python renders active=False correctly"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_vals_gt:
+        columns: d
+        value: 100
+        active: false
+    """
+    python_code = yaml_to_python(yaml_content)
+    assert "active=False" in python_code
+
+
+def test_yaml_to_python_active_callable_shortcut():
+    """Test yaml_to_python preserves active callable shortcut expression"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_vals_gt:
+        columns: d
+        value: 100
+        active: "pb.has_columns('d')"
+    """
+    python_code = yaml_to_python(yaml_content)
+    assert "active=pb.has_columns('d')" in python_code
+
+
+def test_yaml_to_python_active_python_block():
+    """Test yaml_to_python preserves active python: block expression"""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_vals_gt:
+        columns: d
+        value: 100
+        active:
+          python: "pb.has_columns('d')"
+    """
+    python_code = yaml_to_python(yaml_content)
+    assert "active=pb.has_columns('d')" in python_code

From 026e753478c3b12986f354566be6f52b63b6abfb Mon Sep 17 00:00:00 2001
From: Richard Iannone <riannone@me.com>
Date: Mon, 23 Feb 2026 15:36:17 -0500
Subject: [PATCH 07/11] Add YAML-to-Python roundtrip tests

---
 tests/test_yaml.py | 257 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 257 insertions(+)

diff --git a/tests/test_yaml.py b/tests/test_yaml.py
index 9f2562b99..ce9f8c212 100644
--- a/tests/test_yaml.py
+++ b/tests/test_yaml.py
@@ -5394,3 +5394,260 @@ def test_yaml_to_python_active_python_block():
     """
     python_code = yaml_to_python(yaml_content)
     assert "active=pb.has_columns('d')" in python_code
+
+
+def _exec_yaml_to_python(yaml_content: str) -> Validate:
+    """Helper: convert YAML to Python code via yaml_to_python, then exec it and return result."""
+    import pointblank as pb
+
+    python_code = yaml_to_python(yaml_content)
+
+    # Strip markdown code fences
+    code = python_code.strip()
+    if code.startswith("```python"):
+        code = code[len("```python") :].strip()
+    if code.endswith("```"):
+        code = code[: -len("```")].strip()
+
+    # Execute the generated code: split imports from expression
+    # The code has imports on top, then a blank line, then a parenthesized expression
+    parts = code.split("\n\n", 1)
+    imports_section = parts[0]
+    expression_section = parts[1] if len(parts) > 1 else ""
+
+    code_with_capture = imports_section + "\n\nresult = " + expression_section
+    exec_globals = {"__builtins__": __builtins__}
+    exec(code_with_capture, exec_globals)
+    return exec_globals["result"]
+
+
+def _compare_validation_results(yaml_result: Validate, python_result: Validate):
+    """Helper: compare two Validate objects for equivalence."""
+    # Same number of validation steps
+    assert len(yaml_result.validation_info) == len(python_result.validation_info)
+
+    for i, (y_step, p_step) in enumerate(
+        zip(yaml_result.validation_info, python_result.validation_info)
+    ):
+        # Same validation method
+        assert y_step.assertion_type == p_step.assertion_type, (
+            f"Step {i}: method mismatch: {y_step.assertion_type} vs {p_step.assertion_type}"
+        )
+        # Same pass/fail outcome
+        assert y_step.all_passed == p_step.all_passed, (
+            f"Step {i} ({y_step.assertion_type}): "
+            f"all_passed mismatch: {y_step.all_passed} vs {p_step.all_passed}"
+        )
+        # Same test unit counts
+        assert y_step.n == p_step.n, f"Step {i}: n mismatch: {y_step.n} vs {p_step.n}"
+        assert y_step.n_passed == p_step.n_passed, (
+            f"Step {i}: n_passed mismatch: {y_step.n_passed} vs {p_step.n_passed}"
+        )
+        assert y_step.n_failed == p_step.n_failed, (
+            f"Step {i}: n_failed mismatch: {y_step.n_failed} vs {p_step.n_failed}"
+        )
+
+
+def test_roundtrip_basic():
+    """Round-trip: basic YAML with parameterless and parameterized steps."""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - rows_distinct
+    - col_exists:
+        columns: [date, a, b]
+    """
+    yaml_result = yaml_interrogate(yaml_content)
+    python_result = _exec_yaml_to_python(yaml_content)
+    _compare_validation_results(yaml_result, python_result)
+
+
+def test_roundtrip_with_thresholds():
+    """Round-trip: YAML with global thresholds."""
+    yaml_content = """
+    tbl: small_table
+    tbl_name: threshold_test
+    label: Test thresholds
+    thresholds:
+      warning: 0.1
+      error: 0.25
+    steps:
+    - col_vals_gt:
+        columns: [d]
+        value: 100
+    - col_vals_not_null:
+        columns: [date, a]
+    """
+    yaml_result = yaml_interrogate(yaml_content)
+    python_result = _exec_yaml_to_python(yaml_content)
+    _compare_validation_results(yaml_result, python_result)
+    assert python_result.tbl_name == "threshold_test"
+    assert python_result.label == "Test thresholds"
+
+
+def test_roundtrip_column_validations():
+    """Round-trip: various column validation methods."""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_vals_gt:
+        columns: [d]
+        value: 0
+    - col_vals_lt:
+        columns: [a]
+        value: 100
+    - col_vals_between:
+        columns: [c]
+        left: 1
+        right: 10
+    - col_vals_not_null:
+        columns: [date]
+    - col_vals_in_set:
+        columns: [f]
+        set: [low, mid, high]
+    """
+    yaml_result = yaml_interrogate(yaml_content)
+    python_result = _exec_yaml_to_python(yaml_content)
+    _compare_validation_results(yaml_result, python_result)
+
+
+def test_roundtrip_regex_and_schema():
+    """Round-trip: regex and column existence validation."""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_vals_regex:
+        columns: [b]
+        pattern: '[0-9]-[a-z]{3}-[0-9]{3}'
+    - col_exists:
+        columns: [date, a, b, c, d, f]
+    """
+    yaml_result = yaml_interrogate(yaml_content)
+    python_result = _exec_yaml_to_python(yaml_content)
+    _compare_validation_results(yaml_result, python_result)
+
+
+def test_roundtrip_governance_metadata():
+    """Round-trip: governance metadata (owner, consumers, version) preserved."""
+    yaml_content = """
+    tbl: small_table
+    tbl_name: governance_test
+    owner: Data Engineering
+    consumers: [Analytics, Finance]
+    version: "2.1.0"
+    steps:
+    - col_vals_not_null:
+        columns: [a]
+    """
+    yaml_result = yaml_interrogate(yaml_content)
+    python_result = _exec_yaml_to_python(yaml_content)
+    _compare_validation_results(yaml_result, python_result)
+    assert python_result.owner == "Data Engineering"
+    assert python_result.consumers == ["Analytics", "Finance"]
+    assert python_result.version == "2.1.0"
+
+
+def test_roundtrip_aggregate_methods():
+    """Round-trip: aggregate validation methods (col_sum_gt, col_avg_le)."""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_sum_gt:
+        columns: [d]
+        value: 0
+    - col_avg_le:
+        columns: [a]
+        value: 10
+    """
+    yaml_result = yaml_interrogate(yaml_content)
+    python_result = _exec_yaml_to_python(yaml_content)
+    _compare_validation_results(yaml_result, python_result)
+
+
+def test_roundtrip_mixed_workflow():
+    """Round-trip: comprehensive workflow mixing multiple method types."""
+    yaml_content = """
+    tbl: small_table
+    tbl_name: mixed_test
+    label: Comprehensive test
+    thresholds:
+      warning: 0.1
+      error: 0.25
+      critical: 0.35
+    steps:
+    - rows_distinct
+    - col_exists:
+        columns: [date, a, b]
+    - col_vals_gt:
+        columns: [d]
+        value: 100
+    - col_vals_regex:
+        columns: [b]
+        pattern: '[0-9]-[a-z]{3}-[0-9]{3}'
+    - col_vals_not_null:
+        columns: [date, a]
+    - col_vals_between:
+        columns: [c]
+        left: 1
+        right: 10
+    - col_vals_in_set:
+        columns: [f]
+        set: [low, mid, high]
+    """
+    yaml_result = yaml_interrogate(yaml_content)
+    python_result = _exec_yaml_to_python(yaml_content)
+    _compare_validation_results(yaml_result, python_result)
+
+
+def test_roundtrip_with_brief():
+    """Round-trip: validation steps with brief descriptions."""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_vals_gt:
+        columns: [d]
+        value: 100
+        brief: "Values must exceed 100"
+    - col_vals_not_null:
+        columns: [a]
+        brief: true
+    """
+    yaml_result = yaml_interrogate(yaml_content)
+    python_result = _exec_yaml_to_python(yaml_content)
+    _compare_validation_results(yaml_result, python_result)
+
+
+def test_roundtrip_row_validations():
+    """Round-trip: row-level validations."""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - rows_distinct
+    - rows_complete
+    - row_count_match:
+        count: 13
+    """
+    yaml_result = yaml_interrogate(yaml_content)
+    python_result = _exec_yaml_to_python(yaml_content)
+    _compare_validation_results(yaml_result, python_result)
+
+
+def test_roundtrip_step_level_thresholds():
+    """Round-trip: step-level thresholds."""
+    yaml_content = """
+    tbl: small_table
+    steps:
+    - col_vals_gt:
+        columns: [d]
+        value: 100
+        thresholds:
+          warning: 0.05
+          error: 0.15
+    - col_vals_not_null:
+        columns: [a]
+        thresholds:
+          warning: 0.01
+    """
+    yaml_result = yaml_interrogate(yaml_content)
+    python_result = _exec_yaml_to_python(yaml_content)
+    _compare_validation_results(yaml_result, python_result)

From 0695afdb833c5cfa128bc72ba99d5bc5d6685815 Mon Sep 17 00:00:00 2001
From: Richard Iannone <riannone@me.com>
Date: Mon, 23 Feb 2026 15:39:50 -0500
Subject: [PATCH 08/11] Include governance and aggregates in YAML docs

---
 pointblank/yaml.py | 143 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)

diff --git a/pointblank/yaml.py b/pointblank/yaml.py
index d57683048..0e03441a1 100644
--- a/pointblank/yaml.py
+++ b/pointblank/yaml.py
@@ -1082,6 +1082,94 @@ def my_custom_action():
     pipeline or version control system, allowing you to maintain validation rules alongside your
     code.
 
+    ### Governance Metadata
+
+    YAML workflows support governance metadata via `owner`, `consumers`, and `version` top-level
+    keys. These are forwarded to the `Validate` constructor and embedded in the validation report:
+
+    ```{python}
+    yaml_config = '''
+    tbl: small_table
+    tbl_name: sales_pipeline
+    owner: Data Engineering
+    consumers: [Analytics, Finance, Compliance]
+    version: "2.1.0"
+    steps:
+    - col_vals_not_null:
+        columns: [a, b]
+    '''
+
+    result = pb.yaml_interrogate(yaml_config)
+    print(f"Owner: {result.owner}")
+    print(f"Consumers: {result.consumers}")
+    print(f"Version: {result.version}")
+    ```
+
+    ### Aggregate Validations
+
+    YAML supports aggregate validation methods for checking column-level statistics. These methods
+    validate that a column's sum, average, or standard deviation meets a threshold:
+
+    ```{python}
+    yaml_config = '''
+    tbl: small_table
+    steps:
+    - col_sum_gt:
+        columns: [d]
+        value: 0
+    - col_avg_le:
+        columns: [a]
+        value: 10
+    '''
+
+    result = pb.yaml_interrogate(yaml_config)
+    result
+    ```
+
+    The 15 available aggregate methods follow the pattern `col_{stat}_{comparator}` where
+    `{stat}` is `sum`, `avg`, or `sd` and `{comparator}` is `gt`, `lt`, `ge`,
+    `le`, or `eq`.
+
+    ### Data Freshness
+
+    Check that a date/datetime column has recent data using `data_freshness`:
+
+    ```yaml
+    tbl: events.csv
+    steps:
+    - data_freshness:
+        columns: event_date
+        freshness: "24h"
+    ```
+
+    ### Active Parameter Shortcut
+
+    The `active=` parameter controls whether a validation step runs. It accepts boolean values
+    (shown below) or a Python expression shortcut such as `active: "pb.has_columns('d')"`:
+
+    ```yaml
+    steps:
+    - col_vals_gt:
+        columns: [d]
+        value: 100
+        active: false            # Skip this step
+
+    - col_vals_not_null:
+        columns: [a]
+        active: true             # Always run (default)
+    ```
+
+    ### Null Percentage Check
+
+    Use `col_pct_null` to validate that the percentage of null values in a column is within bounds:
+
+    ```yaml
+    steps:
+    - col_pct_null:
+        columns: [a, b]
+        value: 0.05
+    ```
+
     ### Using `set_tbl=` to Override the Table
 
     The `set_tbl=` parameter allows you to override the table specified in the YAML configuration.
@@ -1314,6 +1402,39 @@ def safe_yaml_interrogate(yaml_config):
     source ('tbl') exists or is accessible. Data source validation occurs during execution with
     `yaml_interrogate()`.
 
+    Supported Top-level Keys
+    ------------------------
+    The following top-level keys are recognized in the YAML configuration:
+
+    - `tbl`: data source specification (required)
+    - `steps`: list of validation steps (required)
+    - `tbl_name`: human-readable table name
+    - `label`: validation description
+    - `df_library`: DataFrame library (`"polars"`, `"pandas"`, `"duckdb"`)
+    - `lang`: language code
+    - `locale`: locale setting
+    - `brief`: global brief template
+    - `thresholds`: global failure thresholds
+    - `actions`: global failure actions
+    - `final_actions`: actions triggered after all steps complete
+    - `owner`: data owner (governance metadata)
+    - `consumers`: data consumers (governance metadata)
+    - `version`: validation version string (governance metadata)
+    - `reference`: reference table for comparison-based validations
+
+    Unknown top-level keys are rejected, which catches typos like `tbl_nmae` or `step`.
+
+    Supported Validation Methods
+    ----------------------------
+    In addition to all standard validation methods (e.g., `col_vals_gt`, `rows_distinct`,
+    `col_schema_match`), the following methods are also supported:
+
+    - `col_pct_null`: check the percentage of null values in a column
+    - `data_freshness`: check that data is recent
+    - aggregate methods: `col_sum_gt`, `col_sum_lt`, `col_sum_ge`, `col_sum_le`,
+      `col_sum_eq`, `col_avg_gt`, `col_avg_lt`, `col_avg_ge`, `col_avg_le`,
+      `col_avg_eq`, `col_sd_gt`, `col_sd_lt`, `col_sd_ge`, `col_sd_le`, `col_sd_eq`
+
     See Also
     --------
     yaml_interrogate : execute YAML-based validation workflows
@@ -1416,6 +1537,28 @@ def yaml_to_python(yaml: Union[str, Path]) -> str:
     The generated code includes all configuration parameters, thresholds, and maintains the exact
     same validation logic as the original YAML workflow.
 
+    Governance metadata (`owner`, `consumers`, `version`) and `reference` are also rendered
+    in the generated Python code:
+
+    ```{python}
+    yaml_config = '''
+    tbl: small_table
+    tbl_name: Sales Pipeline
+    owner: Data Engineering
+    consumers: [Analytics, Finance]
+    version: "2.1.0"
+    steps:
+    - col_vals_not_null:
+        columns: [a]
+    - col_sum_gt:
+        columns: [d]
+        value: 0
+    '''
+
+    python_code = pb.yaml_to_python(yaml_config)
+    print(python_code)
+    ```
+
     This function is also useful for educational purposes, helping users understand how YAML
     configurations map to the underlying Python API calls.
     """

From aaeeea6645875c2138b2cf4e2db4c08cb7998d80 Mon Sep 17 00:00:00 2001
From: Richard Iannone <riannone@me.com>
Date: Mon, 23 Feb 2026 15:40:08 -0500
Subject: [PATCH 09/11] Add optional metadata and validation examples

---
 pointblank/cli.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/pointblank/cli.py b/pointblank/cli.py
index 3cebc4024..e20b6d2a2 100644
--- a/pointblank/cli.py
+++ b/pointblank/cli.py
@@ -3944,12 +3944,20 @@ def make_template(output_file: str | None):
 tbl: small_table  # Replace with your data source
                   # Can be: dataset name, CSV file, Parquet file, database connection, etc.
 
+# Optional: DataFrame library ("polars", "pandas", "duckdb")
+# df_library: polars
+
 # Optional: Table name for reporting (defaults to filename if not specified)
 tbl_name: "Example Validation"
 
 # Optional: Label for this validation run
 label: "Validation Template"
 
+# Optional: Governance metadata
+# owner: "Data Engineering"
+# consumers: [Analytics, Finance]
+# version: "1.0.0"
+
 # Optional: Validation thresholds (defaults shown below)
 # thresholds:
 #   warning: 0.05   # 5% failure rate triggers warning
@@ -3993,6 +4001,27 @@ def make_template(output_file: str | None):
   #     columns: status
   #     set: [active, inactive, pending]
 
+  # Aggregate validations (uncomment and modify as needed)
+  # - col_sum_gt:
+  #     columns: revenue
+  #     value: 0
+  #     brief: "Total revenue is positive"
+
+  # - col_avg_le:
+  #     columns: rating
+  #     value: 5
+  #     brief: "Average rating is at most 5"
+
+  # Check null percentage (uncomment and modify as needed)
+  # - col_pct_null:
+  #     columns: [email, phone]
+  #     value: 0.05
+
+  # Data freshness check (uncomment and modify as needed)
+  # - data_freshness:
+  #     columns: event_date
+  #     freshness: "24h"
+
 # Add more validation steps as needed
 # See the Pointblank documentation for the full list of available validation functions
 """

From 37261e953064513cbe50360a0a800818b82f5728 Mon Sep 17 00:00:00 2001
From: Richard Iannone <riannone@me.com>
Date: Mon, 23 Feb 2026 15:40:41 -0500
Subject: [PATCH 10/11] Extend YAML reference with metadata and methods

---
 docs/user-guide/yaml-reference.qmd | 90 ++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)

diff --git a/docs/user-guide/yaml-reference.qmd b/docs/user-guide/yaml-reference.qmd
index d34f9bb2b..cbe3341b2 100644
--- a/docs/user-guide/yaml-reference.qmd
+++ b/docs/user-guide/yaml-reference.qmd
@@ -22,6 +22,12 @@ label: "Validation Description"        # OPTIONAL: Description for the validatio
 lang: "en"                             # OPTIONAL: Language code (default: "en")
 locale: "en"                           # OPTIONAL: Locale setting (default: "en")
 brief: "Global brief: {auto}"          # OPTIONAL: Global brief template
+owner: "Data Engineering"              # OPTIONAL: Data owner (governance metadata)
+consumers: [Analytics, Finance]        # OPTIONAL: Data consumers (governance metadata)
+version: "1.0.0"                       # OPTIONAL: Validation version (governance metadata)
+reference:                             # OPTIONAL: Reference table for comparison validations
+  python: |
+    pb.load_dataset("ref_table")
 thresholds:                            # OPTIONAL: Global failure thresholds
   warning: 0.1
   error: 0.2
@@ -31,6 +37,9 @@ actions:                               # OPTIONAL: Global failure actions
   error: "Error message template"
   critical: "Critical message template"
   highest_only: false
+final_actions:                         # OPTIONAL: Actions triggered after all steps complete
+  warning: "Post-validation warning"
+  error: "Post-validation error"
 steps:                                 # REQUIRED: List of validation steps
   - validation_method_name
   - validation_method_name:
@@ -838,6 +847,68 @@ Examples:
 - Performance-critical validations with large datasets
 - When deterministic results are required
 
+### Data Quality Methods
+
+`col_pct_null`: is the percentage of null values in a column within bounds?
+
+```yaml
+- col_pct_null:
+    columns: [column_name]             # REQUIRED: Column(s) to validate
+    value: 0.05                        # REQUIRED: Maximum allowed null fraction
+    thresholds:                        # OPTIONAL: Step-level thresholds
+      warning: 0.1
+    actions:                           # OPTIONAL: Step-level actions
+      warning: "Custom message"
+    brief: "Null rate check"           # OPTIONAL: Step description
+```
+
+`data_freshness`: is the data in a date/datetime column recent?
+
+```yaml
+- data_freshness:
+    columns: [date_column]             # REQUIRED: Date/datetime column
+    freshness: "24h"                   # REQUIRED: Maximum age of data
+    thresholds:                        # OPTIONAL: Step-level thresholds
+      warning: 0.1
+    actions:                           # OPTIONAL: Step-level actions
+      warning: "Custom message"
+    brief: "Data is recent"            # OPTIONAL: Step description
+```
+
+### Aggregate Validations
+
+Aggregate methods validate column-level statistics (sum, average, standard deviation) against a
+threshold. They follow the pattern `col_{stat}_{comparator}`:
+
+```yaml
+# Sum validations
+- col_sum_gt:
+    columns: [revenue]
+    value: 0
+    brief: "Total revenue is positive"
+
+# Average validations
+- col_avg_le:
+    columns: [rating]
+    value: 5
+    brief: "Average rating at most 5"
+
+# Standard deviation validations
+- col_sd_lt:
+    columns: [temperature]
+    value: 10
+    brief: "Temperature variation is bounded"
+```
+
+Available aggregate methods:
+
+- **Sum**: `col_sum_gt`, `col_sum_lt`, `col_sum_ge`, `col_sum_le`, `col_sum_eq`
+- **Average**: `col_avg_gt`, `col_avg_lt`, `col_avg_ge`, `col_avg_le`, `col_avg_eq`
+- **Standard deviation**: `col_sd_gt`, `col_sd_lt`, `col_sd_ge`, `col_sd_le`, `col_sd_eq`
+
+All aggregate methods accept these common parameters: `columns`, `value`, `thresholds`, `actions`,
+`brief`, `active`, and `pre`.
+
 ## Column Selection Patterns
 
 All validation methods that accept a `columns` parameter support these selection patterns:
@@ -871,6 +942,25 @@ These parameters are available for most validation methods:
 - `thresholds`: step-level failure thresholds (dict)
 - `actions`: step-level failure actions (dict)
 - `brief`: step description (string, boolean, or template)
+- `active`: whether the step is active (boolean or Python expression, default: true)
+
+### Active Parameter
+
+The `active` parameter controls whether a validation step runs. It defaults to `true`; set it to
+`false` to skip a step without removing it from the configuration:
+
+```yaml
+steps:
+  # This step will be skipped
+  - col_vals_gt:
+      columns: [amount]
+      value: 0
+      active: false
+
+  # This step runs normally (default active: true)
+  - col_vals_not_null:
+      columns: [customer_id]
+```
 
 ### Brief Parameter Options
 

From a149a45a55869e90a04eb610fd4ed9f3da0ae997 Mon Sep 17 00:00:00 2001
From: Richard Iannone <riannone@me.com>
Date: Mon, 23 Feb 2026 15:40:57 -0500
Subject: [PATCH 11/11] Add YAML governance and validation docs

---
 docs/user-guide/yaml-validation-workflows.qmd | 105 ++++++++++++++++++
 1 file changed, 105 insertions(+)

diff --git a/docs/user-guide/yaml-validation-workflows.qmd b/docs/user-guide/yaml-validation-workflows.qmd
index 521ab5f17..c0e25262e 100644
--- a/docs/user-guide/yaml-validation-workflows.qmd
+++ b/docs/user-guide/yaml-validation-workflows.qmd
@@ -722,6 +722,111 @@ Brief Templating Options:
   - `{value}`: the comparison value used in the validation (for single-value comparisons)
   - `{pattern}`: for regex validations, the pattern being matched
 
+### Governance Metadata
+
+YAML workflows support governance metadata that identifies ownership and usage of validation
+workflows. These fields are embedded in the validation report:
+
+```yaml
+tbl: sales_data.csv
+tbl_name: "Sales Pipeline"
+owner: "Data Engineering"
+consumers: [Analytics Team, Finance, Compliance]
+version: "2.1.0"
+steps:
+  - col_vals_not_null:
+      columns: [customer_id, revenue]
+  - col_vals_gt:
+      columns: [revenue]
+      value: 0
+```
+
+The `owner`, `consumers`, and `version` fields are forwarded to the `Validate` constructor and
+appear in the validation report header. These fields are optional and do not affect validation
+behavior.
+
+### Data Freshness and Null Percentage
+
+Two additional validation methods support common data quality checks:
+
+**`data_freshness`**: Validate that a date/datetime column has recent data:
+
+```yaml
+steps:
+  - data_freshness:
+      columns: event_date
+      freshness: "24h"
+```
+
+**`col_pct_null`**: Validate that the percentage of null values is within bounds:
+
+```yaml
+steps:
+  - col_pct_null:
+      columns: [email, phone]
+      value: 0.05
+```
+
+### Aggregate Validations
+
+Aggregate methods validate column-level statistics like sum, average, and standard deviation:
+
+```yaml
+steps:
+  # Check that total revenue is positive
+  - col_sum_gt:
+      columns: [revenue]
+      value: 0
+
+  # Validate average rating is at most 5
+  - col_avg_le:
+      columns: [rating]
+      value: 5
+
+  # Ensure temperature variation is bounded
+  - col_sd_lt:
+      columns: [temperature]
+      value: 10
+```
+
+Available methods follow the `col_{stat}_{comparator}` pattern where `{stat}` is `sum`, `avg`, or
+`sd`, and `{comparator}` is `gt`, `lt`, `ge`, `le`, or `eq`.
+
+### Step Activation Control
+
+The `active` parameter allows you to temporarily disable validation steps without removing them
+from the configuration:
+
+```yaml
+steps:
+  # This step is disabled
+  - col_vals_gt:
+      columns: [amount]
+      value: 0
+      active: false
+
+  # This step runs normally (active: true is the default)
+  - col_vals_not_null:
+      columns: [customer_id]
+```
+
+This is useful for debugging, phased rollouts, or temporarily skipping steps that are known to fail.
+
+### Reference Tables
+
+The `reference` top-level key specifies a reference table for comparison-based validations:
+
+```yaml
+tbl: current_data.csv
+reference:
+  python: |
+    pb.load_dataset("baseline_data", tbl_type="polars")
+steps:
+  - tbl_match:
+      tbl_compare:
+        python: |
+          pb.load_dataset("baseline_data", tbl_type="polars")
+```
 
 ## Working with YAML Files