From e9c9a5a2331d9fd55f6249ba6b6f45f1203049b5 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Mon, 23 Feb 2026 12:25:25 -0500 Subject: [PATCH 01/11] Support governance, aggregates, and refs in YAML --- pointblank/yaml.py | 84 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 4 deletions(-) diff --git a/pointblank/yaml.py b/pointblank/yaml.py index 6ccd05ef2..3ab325f1b 100644 --- a/pointblank/yaml.py +++ b/pointblank/yaml.py @@ -6,6 +6,7 @@ import yaml +from pointblank._agg import is_valid_agg from pointblank._utils import _is_lib_present from pointblank.thresholds import Actions from pointblank.validate import Validate, load_dataset @@ -241,11 +242,13 @@ class YAMLValidator: "col_vals_increasing": "col_vals_increasing", "col_vals_decreasing": "col_vals_decreasing", "col_vals_within_spec": "col_vals_within_spec", + "col_pct_null": "col_pct_null", "rows_distinct": "rows_distinct", "rows_complete": "rows_complete", "col_count_match": "col_count_match", "row_count_match": "row_count_match", "col_schema_match": "col_schema_match", + "data_freshness": "data_freshness", "tbl_match": "tbl_match", "prompt": "prompt", "conjointly": "conjointly", @@ -620,9 +623,13 @@ def _parse_validation_step( else: raise YAMLValidationError(f"Invalid step configuration type: {type(step_config)}") - # Validate that we know this method - if method_name not in self.validation_method_map: - available_methods = list(self.validation_method_map.keys()) + # Validate that we know this method (static map or dynamic aggregate method) + if method_name not in self.validation_method_map and not is_valid_agg(method_name): + available_methods = list(self.validation_method_map.keys()) + [ + "col_sum_*", + "col_avg_*", + "col_sd_*", + ] raise YAMLValidationError( f"Unknown validation method '{method_name}'. 
Available methods: {available_methods}" ) @@ -693,7 +700,14 @@ def _parse_validation_step( if "inclusive" in parameters and isinstance(parameters["inclusive"], list): parameters["inclusive"] = tuple(parameters["inclusive"]) - return self.validation_method_map[method_name], parameters + # Resolve the method name: static map takes priority, then dynamic aggregate methods + if method_name in self.validation_method_map: + resolved_name = self.validation_method_map[method_name] + else: + # Dynamic aggregate method (col_sum_gt, col_avg_eq, etc.) + resolved_name = method_name + + return resolved_name, parameters def build_validation( self, config: dict, namespaces: Optional[Union[Iterable[str], Mapping[str, str]]] = None @@ -750,6 +764,41 @@ def build_validation( if "brief" in config: validate_kwargs["brief"] = config["brief"] + # Set owner if provided (governance) + if "owner" in config: + validate_kwargs["owner"] = config["owner"] + + # Set consumers if provided (governance) + if "consumers" in config: + validate_kwargs["consumers"] = config["consumers"] + + # Set version if provided (governance) + if "version" in config: + validate_kwargs["version"] = config["version"] + + # Set final_actions if provided + if "final_actions" in config: + from pointblank.thresholds import FinalActions + + final_actions_spec = config["final_actions"] + # Process Python expressions in final_actions + processed_final_actions = _process_python_expressions( + final_actions_spec, namespaces=namespaces + ) + if isinstance(processed_final_actions, list): + validate_kwargs["final_actions"] = FinalActions(*processed_final_actions) + elif callable(processed_final_actions): + validate_kwargs["final_actions"] = FinalActions(processed_final_actions) + else: + validate_kwargs["final_actions"] = processed_final_actions + + # Set reference data if provided + if "reference" in config: + ref_spec = config["reference"] + # Process as a data source (could be a dataset name, file path, or python expression) 
+ ref_data = self._load_data_source(ref_spec, df_library) + validate_kwargs["reference"] = ref_data + validation = Validate(data, **validate_kwargs) # Add validation steps @@ -1496,6 +1545,26 @@ def extract_python_expressions(obj, path=""): else: validate_args.append(f'brief="{config["brief"]}"') + # Add owner if present (governance) + if "owner" in config: + validate_args.append(f'owner="{config["owner"]}"') + + # Add consumers if present (governance) + if "consumers" in config: + consumers = config["consumers"] + if isinstance(consumers, list): + if len(consumers) == 1: + validate_args.append(f'consumers="{consumers[0]}"') + else: + consumers_str = "[" + ", ".join([f'"{c}"' for c in consumers]) + "]" + validate_args.append(f"consumers={consumers_str}") + elif isinstance(consumers, str): + validate_args.append(f'consumers="{consumers}"') + + # Add version if present (governance) + if "version" in config: + validate_args.append(f'version="{config["version"]}"') + # Create the `pb.Validate()` call if len(validate_args) == 1: # Single argument fits on one line @@ -1514,6 +1583,13 @@ def extract_python_expressions(obj, path=""): for step_index, step_config in enumerate(config["steps"]): # Get original expressions before parsing original_expressions = {} + + # Handle string steps (parameterless methods like "rows_distinct") + if isinstance(step_config, str): + method_name, parameters = validator._parse_validation_step(step_config, namespaces=None) + code_lines.append(f" .{method_name}()") + continue + step_method = list(step_config.keys())[ 0 ] # Get the method name (conjointly, specially, etc.) 
From 451b6c2ee6ab0df0ec31a8eb95105aee5da29aff Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Mon, 23 Feb 2026 12:25:40 -0500 Subject: [PATCH 02/11] Add YAML tests for new validations and governance --- tests/test_yaml.py | 376 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 376 insertions(+) diff --git a/tests/test_yaml.py b/tests/test_yaml.py index 8420772e4..7604c03f4 100644 --- a/tests/test_yaml.py +++ b/tests/test_yaml.py @@ -4838,3 +4838,379 @@ def test_new_methods_yaml_to_python_comprehensive(): assert "increasing_tol=0.2" in python_code assert 'spec="url"' in python_code assert 'model="openai:gpt-4"' in python_code + + +def test_yaml_col_pct_null_basic(): + """Test col_pct_null validation via YAML""" + yaml_content = """ + tbl: small_table + steps: + - col_pct_null: + columns: c + p: 0.15 + """ + result = yaml_interrogate(yaml_content) + assert result is not None + assert len(result.validation_info) == 1 + assert result.validation_info[0].assertion_type == "col_pct_null" + + +def test_yaml_col_pct_null_with_tolerance(): + """Test col_pct_null with tolerance parameter""" + yaml_content = """ + tbl: small_table + steps: + - col_pct_null: + columns: [c] + p: 0.15 + tol: 0.1 + """ + result = yaml_interrogate(yaml_content) + assert result is not None + assert len(result.validation_info) == 1 + + +def test_yaml_data_freshness_basic(): + """Test data_freshness validation via YAML""" + yaml_content = """ + tbl: small_table + steps: + - data_freshness: + column: date_time + max_age: "100000 hours" + """ + result = yaml_interrogate(yaml_content) + assert result is not None + assert len(result.validation_info) == 1 + assert result.validation_info[0].assertion_type == "data_freshness" + + +def test_yaml_data_freshness_with_options(): + """Test data_freshness with timezone and other options""" + yaml_content = """ + tbl: small_table + steps: + - data_freshness: + column: date_time + max_age: "876000 hours" + timezone: "UTC" + allow_tz_mismatch: true + 
""" + result = yaml_interrogate(yaml_content) + assert result is not None + assert len(result.validation_info) == 1 + + +def test_yaml_aggregate_methods_basic(): + """Test that aggregate methods (col_sum_*, col_avg_*, col_sd_*) work in YAML""" + yaml_content = """ + tbl: small_table + steps: + - col_sum_gt: + columns: a + value: 0 + - col_avg_gt: + columns: a + value: 0 + - col_sd_gt: + columns: d + value: 0 + """ + result = yaml_interrogate(yaml_content) + assert result is not None + assert len(result.validation_info) == 3 + assert result.validation_info[0].assertion_type == "col_sum_gt" + assert result.validation_info[1].assertion_type == "col_avg_gt" + assert result.validation_info[2].assertion_type == "col_sd_gt" + + +def test_yaml_aggregate_methods_all_comparators(): + """Test all aggregate comparator variants""" + yaml_content = """ + tbl: small_table + steps: + - col_sum_eq: + columns: a + value: 6 + - col_sum_ge: + columns: a + value: 0 + - col_sum_gt: + columns: a + value: -1 + - col_sum_lt: + columns: a + value: 100000 + - col_sum_le: + columns: a + value: 100000 + """ + result = yaml_interrogate(yaml_content) + assert result is not None + assert len(result.validation_info) == 5 + + +def test_yaml_aggregate_methods_with_tolerance(): + """Test aggregate methods with tolerance parameter""" + yaml_content = """ + tbl: small_table + steps: + - col_sum_eq: + columns: a + value: 6 + tol: 1 + - col_avg_eq: + columns: d + value: 5000 + tol: 5000 + """ + result = yaml_interrogate(yaml_content) + assert result is not None + assert len(result.validation_info) == 2 + + +def test_yaml_aggregate_methods_with_thresholds(): + """Test aggregate methods with thresholds""" + yaml_content = """ + tbl: small_table + thresholds: + warning: 0.1 + steps: + - col_sum_gt: + columns: a + value: 0 + thresholds: + warning: 1 + """ + result = yaml_interrogate(yaml_content) + assert result is not None + assert len(result.validation_info) == 1 + + +def 
test_yaml_unknown_method_still_errors(): + """Test that truly unknown methods still raise errors""" + yaml_content = """ + tbl: small_table + steps: + - col_totally_fake: + columns: a + value: 0 + """ + with pytest.raises(YAMLValidationError, match="Unknown validation method"): + yaml_interrogate(yaml_content) + + +def test_yaml_governance_owner_param(): + """Test that owner parameter is forwarded through YAML""" + yaml_content = """ + tbl: small_table + owner: "data-platform-team" + steps: + - col_vals_not_null: + columns: a + """ + result = yaml_interrogate(yaml_content) + assert result is not None + assert result.owner == "data-platform-team" + + +def test_yaml_governance_consumers_param(): + """Test that consumers parameter is forwarded through YAML""" + yaml_content = """ + tbl: small_table + consumers: + - ml-team + - analytics + steps: + - col_vals_not_null: + columns: a + """ + result = yaml_interrogate(yaml_content) + assert result is not None + assert result.consumers == ["ml-team", "analytics"] + + +def test_yaml_governance_consumers_single(): + """Test that a single consumer string is forwarded""" + yaml_content = """ + tbl: small_table + consumers: analytics + steps: + - col_vals_not_null: + columns: a + """ + result = yaml_interrogate(yaml_content) + assert result is not None + # Validate normalizes single string to list + assert result.consumers == ["analytics"] + + +def test_yaml_governance_version_param(): + """Test that version parameter is forwarded through YAML""" + yaml_content = """ + tbl: small_table + version: "2.1.0" + steps: + - col_vals_not_null: + columns: a + """ + result = yaml_interrogate(yaml_content) + assert result is not None + assert result.version == "2.1.0" + + +def test_yaml_governance_all_params(): + """Test all governance params together""" + yaml_content = """ + tbl: small_table + owner: "data-team" + consumers: + - team-a + - team-b + version: "1.0.0" + steps: + - col_vals_not_null: + columns: a + """ + result = 
yaml_interrogate(yaml_content) + assert result is not None + assert result.owner == "data-team" + assert result.consumers == ["team-a", "team-b"] + assert result.version == "1.0.0" + + +def test_yaml_to_python_col_pct_null(): + """Test yaml_to_python for col_pct_null""" + yaml_content = """ + tbl: small_table + steps: + - col_pct_null: + columns: c + p: 0.15 + tol: 0.05 + """ + python_code = yaml_to_python(yaml_content) + assert ".col_pct_null(" in python_code + assert 'columns="c"' in python_code + assert "p=0.15" in python_code + assert "tol=0.05" in python_code + + +def test_yaml_to_python_data_freshness(): + """Test yaml_to_python for data_freshness""" + yaml_content = """ + tbl: small_table + steps: + - data_freshness: + column: date_time + max_age: "24 hours" + timezone: "UTC" + """ + python_code = yaml_to_python(yaml_content) + assert ".data_freshness(" in python_code + assert 'column="date_time"' in python_code + assert 'max_age="24 hours"' in python_code + assert 'timezone="UTC"' in python_code + + +def test_yaml_to_python_aggregate_methods(): + """Test yaml_to_python for aggregate methods""" + yaml_content = """ + tbl: small_table + steps: + - col_sum_gt: + columns: a + value: 0 + - col_avg_le: + columns: d + value: 10000 + - col_sd_eq: + columns: d + value: 100 + tol: 200 + """ + python_code = yaml_to_python(yaml_content) + assert ".col_sum_gt(" in python_code + assert ".col_avg_le(" in python_code + assert ".col_sd_eq(" in python_code + assert "tol=200" in python_code + + +def test_yaml_to_python_governance_params(): + """Test yaml_to_python includes governance parameters""" + yaml_content = """ + tbl: small_table + owner: "data-team" + consumers: + - ml-team + - analytics + version: "1.0.0" + steps: + - col_vals_not_null: + columns: a + """ + python_code = yaml_to_python(yaml_content) + assert 'owner="data-team"' in python_code + assert 'consumers=["ml-team", "analytics"]' in python_code + assert 'version="1.0.0"' in python_code + + +def 
test_yaml_to_python_governance_single_consumer(): + """Test yaml_to_python with single consumer string""" + yaml_content = """ + tbl: small_table + consumers: analytics + steps: + - col_vals_not_null: + columns: a + """ + python_code = yaml_to_python(yaml_content) + assert 'consumers="analytics"' in python_code + + +def test_yaml_to_python_parameterless_step(): + """Test yaml_to_python handles parameterless string steps (e.g., rows_distinct)""" + yaml_content = """ + tbl: small_table + steps: + - rows_distinct + - rows_complete + """ + python_code = yaml_to_python(yaml_content) + assert ".rows_distinct()" in python_code + assert ".rows_complete()" in python_code + + +def test_yaml_combined_new_and_existing_methods(): + """Test YAML with a mix of existing and newly-added methods""" + yaml_content = """ + tbl: small_table + owner: "team" + version: "1.0.0" + steps: + - col_vals_gt: + columns: a + value: 0 + - col_pct_null: + columns: c + p: 0.15 + - col_sum_gt: + columns: a + value: 0 + - col_avg_gt: + columns: d + value: 0 + - rows_distinct + """ + result = yaml_interrogate(yaml_content) + assert result is not None + assert len(result.validation_info) == 5 + assert result.owner == "team" + assert result.version == "1.0.0" + + types = [vi.assertion_type for vi in result.validation_info] + assert "col_vals_gt" in types + assert "col_pct_null" in types + assert "col_sum_gt" in types + assert "col_avg_gt" in types + assert "rows_distinct" in types From 39a5f8e5eeddd1d945d6e061f044e1e1a43d01d0 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Mon, 23 Feb 2026 12:50:31 -0500 Subject: [PATCH 03/11] Validate YAML keys; add reference support --- pointblank/yaml.py | 47 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/pointblank/yaml.py b/pointblank/yaml.py index 3ab325f1b..9a528ab82 100644 --- a/pointblank/yaml.py +++ b/pointblank/yaml.py @@ -326,6 +326,33 @@ def _validate_schema(self, config: dict) -> None: YAMLValidationError 
If the schema is invalid. """ + # Define known top-level keys + known_keys = { + "tbl", + "steps", + "tbl_name", + "label", + "thresholds", + "actions", + "final_actions", + "brief", + "lang", + "locale", + "df_library", + "owner", + "consumers", + "version", + "reference", + } + + # Warn about unknown top-level keys (likely typos) + unknown_keys = set(config.keys()) - known_keys + if unknown_keys: + raise YAMLValidationError( + f"Unknown top-level key(s): {sorted(unknown_keys)}. " + f"Valid keys are: {sorted(known_keys)}" + ) + # Check required fields if "tbl" not in config: raise YAMLValidationError("YAML must contain 'tbl' field") @@ -1420,6 +1447,11 @@ def yaml_to_python(yaml: Union[str, Path]) -> str: if isinstance(raw_config.get("tbl"), dict) and "python" in raw_config["tbl"]: original_tbl_expression = raw_config["tbl"]["python"].strip() + # Extract the original reference python expression if it exists + original_reference_expression = None + if isinstance(raw_config.get("reference"), dict) and "python" in raw_config["reference"]: + original_reference_expression = raw_config["reference"]["python"].strip() + # Extract original Actions expressions if they exist original_actions_expressions = {} if "actions" in raw_config: @@ -1565,6 +1597,21 @@ def extract_python_expressions(obj, path=""): if "version" in config: validate_args.append(f'version="{config["version"]}"') + # Add reference if present + if "reference" in config: + ref_spec = config["reference"] + if original_reference_expression: + validate_args.append(f"reference={original_reference_expression}") + elif isinstance(ref_spec, str): + if ref_spec.endswith((".csv", ".parquet")): + validate_args.append( + f'reference=pb.load_dataset("{ref_spec}", tbl_type="{df_library}")' + ) + else: + validate_args.append( + f'reference=pb.load_dataset("{ref_spec}", tbl_type="{df_library}")' + ) + # Create the `pb.Validate()` call if len(validate_args) == 1: # Single argument fits on one line From 
6c0192fef36f233ea4a2705209ae540d736dbfd0 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Mon, 23 Feb 2026 12:50:45 -0500 Subject: [PATCH 04/11] Add tests for YAML unknown keys and references --- tests/test_yaml.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tests/test_yaml.py b/tests/test_yaml.py index 7604c03f4..1ba5d529d 100644 --- a/tests/test_yaml.py +++ b/tests/test_yaml.py @@ -5214,3 +5214,60 @@ def test_yaml_combined_new_and_existing_methods(): assert "col_sum_gt" in types assert "col_avg_gt" in types assert "rows_distinct" in types + + +def test_yaml_unknown_top_level_key_rejected(): + """Test that unknown top-level keys (likely typos) are caught""" + yaml_content = """ + tbl: small_table + tbl_nmae: "Typo Table Name" + steps: + - col_vals_not_null: + columns: a + """ + with pytest.raises(YAMLValidationError, match="Unknown top-level key"): + yaml_interrogate(yaml_content) + + +def test_yaml_unknown_top_level_key_error_message(): + """Test that the error message includes the bad key and valid keys""" + yaml_content = """ + tbl: small_table + labell: "typo" + steps: + - rows_distinct + """ + with pytest.raises(YAMLValidationError, match="labell") as exc_info: + validate_yaml(yaml_content) + # Should also mention valid keys + assert "label" in str(exc_info.value) + + +def test_yaml_to_python_reference(): + """Test yaml_to_python renders reference parameter""" + yaml_content = """ + tbl: small_table + reference: small_table + steps: + - col_vals_not_null: + columns: a + """ + python_code = yaml_to_python(yaml_content) + assert "reference=" in python_code + assert 'pb.load_dataset("small_table"' in python_code + + +def test_yaml_to_python_reference_with_python_expression(): + """Test yaml_to_python renders reference with python: expression""" + yaml_content = """ + tbl: small_table + reference: + python: | + pb.load_dataset("small_table", tbl_type="polars") + steps: + - col_vals_not_null: + columns: a + 
""" + python_code = yaml_to_python(yaml_content) + assert "reference=" in python_code + assert 'pb.load_dataset("small_table", tbl_type="polars")' in python_code From 8e8e3a464375ad04d9528ab93785f3381db8b859 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Mon, 23 Feb 2026 13:43:03 -0500 Subject: [PATCH 05/11] Support shortcut syntax for `active` in YAML --- pointblank/yaml.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pointblank/yaml.py b/pointblank/yaml.py index 9a528ab82..d57683048 100644 --- a/pointblank/yaml.py +++ b/pointblank/yaml.py @@ -672,6 +672,10 @@ def _parse_validation_step( elif key == "pre" and isinstance(value, str): # Treat string directly as Python code (shortcut syntax) processed_parameters[key] = _safe_eval_python_code(value, namespaces=namespaces) + # Special case: `active=` parameter can use shortcut syntax for callables + # (e.g., `active: pb.has_columns("col_a")` or `active: false`) + elif key == "active" and isinstance(value, str): + processed_parameters[key] = _safe_eval_python_code(value, namespaces=namespaces) else: # Normal processing (requires python: block syntax) processed_parameters[key] = _process_python_expressions( @@ -1468,9 +1472,9 @@ def extract_python_expressions(obj, path=""): else: for key, value in obj.items(): new_path = f"{path}.{key}" if path else key - # Special handling for `expr=` and `pre=` parameters that - # can use shortcut syntax - if key in ["expr", "pre"] and isinstance(value, str): + # Special handling for `expr=`, `pre=`, and `active=` parameters + # that can use shortcut syntax + if key in ["expr", "pre", "active"] and isinstance(value, str): expressions[new_path] = value.strip() # Special handling for actions that might contain python: expressions elif key == "actions" and isinstance(value, dict): From 8a401ab89e6290f05490ad96db58409e27e93f57 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Mon, 23 Feb 2026 13:43:27 -0500 Subject: [PATCH 06/11] Add tests for YAML 
'active' handling --- tests/test_yaml.py | 123 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/tests/test_yaml.py b/tests/test_yaml.py index 1ba5d529d..9f2562b99 100644 --- a/tests/test_yaml.py +++ b/tests/test_yaml.py @@ -5271,3 +5271,126 @@ def test_yaml_to_python_reference_with_python_expression(): python_code = yaml_to_python(yaml_content) assert "reference=" in python_code assert 'pb.load_dataset("small_table", tbl_type="polars")' in python_code + + +def test_yaml_active_boolean_false(): + """Test active=false disables a step via YAML""" + yaml_content = """ + tbl: small_table + steps: + - col_vals_gt: + columns: d + value: 100 + active: false + - col_vals_not_null: + columns: a + """ + result = yaml_interrogate(yaml_content) + assert result is not None + assert len(result.validation_info) == 2 + # First step should be inactive + assert result.validation_info[0].active is False + # Second step should be active (default) + assert result.validation_info[1].active is True + + +def test_yaml_active_boolean_true(): + """Test active=true keeps a step active via YAML""" + yaml_content = """ + tbl: small_table + steps: + - col_vals_gt: + columns: d + value: 100 + active: true + """ + result = yaml_interrogate(yaml_content) + assert result is not None + assert result.validation_info[0].active is True + + +def test_yaml_active_callable_shortcut(): + """Test active with callable shortcut syntax (e.g., has_columns)""" + yaml_content = """ + tbl: small_table + steps: + - col_vals_gt: + columns: d + value: 100 + active: "pb.has_columns('d')" + - col_vals_gt: + columns: nonexistent_column + value: 0 + active: "pb.has_columns('nonexistent_column')" + """ + result = yaml_interrogate(yaml_content) + assert result is not None + assert len(result.validation_info) == 2 + + # First step: column 'd' exists, so active should be True and step should run + assert result.validation_info[0].active is not False + + # Second step: column 
'nonexistent_column' doesn't exist, + # so active callable returns False and step is inactive + assert result.validation_info[1].active is not True + + +def test_yaml_active_python_block(): + """Test active with python: block syntax""" + yaml_content = """ + tbl: small_table + steps: + - col_vals_gt: + columns: d + value: 100 + active: + python: "pb.has_columns('d')" + """ + result = yaml_interrogate(yaml_content) + assert result is not None + assert len(result.validation_info) == 1 + # Column 'd' exists, so step should be active/run + assert result.validation_info[0].active is not False + + +def test_yaml_to_python_active_boolean(): + """Test yaml_to_python renders active=False correctly""" + yaml_content = """ + tbl: small_table + steps: + - col_vals_gt: + columns: d + value: 100 + active: false + """ + python_code = yaml_to_python(yaml_content) + assert "active=False" in python_code + + +def test_yaml_to_python_active_callable_shortcut(): + """Test yaml_to_python preserves active callable shortcut expression""" + yaml_content = """ + tbl: small_table + steps: + - col_vals_gt: + columns: d + value: 100 + active: "pb.has_columns('d')" + """ + python_code = yaml_to_python(yaml_content) + assert "active=pb.has_columns('d')" in python_code + + +def test_yaml_to_python_active_python_block(): + """Test yaml_to_python preserves active python: block expression""" + yaml_content = """ + tbl: small_table + steps: + - col_vals_gt: + columns: d + value: 100 + active: + python: "pb.has_columns('d')" + """ + python_code = yaml_to_python(yaml_content) + assert "active=pb.has_columns('d')" in python_code From 026e753478c3b12986f354566be6f52b63b6abfb Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Mon, 23 Feb 2026 15:36:17 -0500 Subject: [PATCH 07/11] Add YAML-to-Python roundtrip tests --- tests/test_yaml.py | 257 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 257 insertions(+) diff --git a/tests/test_yaml.py b/tests/test_yaml.py index 9f2562b99..ce9f8c212 
100644 --- a/tests/test_yaml.py +++ b/tests/test_yaml.py @@ -5394,3 +5394,260 @@ def test_yaml_to_python_active_python_block(): """ python_code = yaml_to_python(yaml_content) assert "active=pb.has_columns('d')" in python_code + + +def _exec_yaml_to_python(yaml_content: str) -> Validate: + """Helper: convert YAML to Python code via yaml_to_python, then exec it and return result.""" + import pointblank as pb + + python_code = yaml_to_python(yaml_content) + + # Strip markdown code fences + code = python_code.strip() + if code.startswith("```python"): + code = code[len("```python") :].strip() + if code.endswith("```"): + code = code[: -len("```")].strip() + + # Execute the generated code: split imports from expression + # The code has imports on top, then a blank line, then a parenthesized expression + parts = code.split("\n\n", 1) + imports_section = parts[0] + expression_section = parts[1] if len(parts) > 1 else "" + + code_with_capture = imports_section + "\n\nresult = " + expression_section + exec_globals = {"__builtins__": __builtins__} + exec(code_with_capture, exec_globals) + return exec_globals["result"] + + +def _compare_validation_results(yaml_result: Validate, python_result: Validate): + """Helper: compare two Validate objects for equivalence.""" + # Same number of validation steps + assert len(yaml_result.validation_info) == len(python_result.validation_info) + + for i, (y_step, p_step) in enumerate( + zip(yaml_result.validation_info, python_result.validation_info) + ): + # Same validation method + assert y_step.assertion_type == p_step.assertion_type, ( + f"Step {i}: method mismatch: {y_step.assertion_type} vs {p_step.assertion_type}" + ) + # Same pass/fail outcome + assert y_step.all_passed == p_step.all_passed, ( + f"Step {i} ({y_step.assertion_type}): " + f"all_passed mismatch: {y_step.all_passed} vs {p_step.all_passed}" + ) + # Same test unit counts + assert y_step.n == p_step.n, f"Step {i}: n mismatch: {y_step.n} vs {p_step.n}" + assert 
y_step.n_passed == p_step.n_passed, ( + f"Step {i}: n_passed mismatch: {y_step.n_passed} vs {p_step.n_passed}" + ) + assert y_step.n_failed == p_step.n_failed, ( + f"Step {i}: n_failed mismatch: {y_step.n_failed} vs {p_step.n_failed}" + ) + + +def test_roundtrip_basic(): + """Round-trip: basic YAML with parameterless and parameterized steps.""" + yaml_content = """ + tbl: small_table + steps: + - rows_distinct + - col_exists: + columns: [date, a, b] + """ + yaml_result = yaml_interrogate(yaml_content) + python_result = _exec_yaml_to_python(yaml_content) + _compare_validation_results(yaml_result, python_result) + + +def test_roundtrip_with_thresholds(): + """Round-trip: YAML with global thresholds.""" + yaml_content = """ + tbl: small_table + tbl_name: threshold_test + label: Test thresholds + thresholds: + warning: 0.1 + error: 0.25 + steps: + - col_vals_gt: + columns: [d] + value: 100 + - col_vals_not_null: + columns: [date, a] + """ + yaml_result = yaml_interrogate(yaml_content) + python_result = _exec_yaml_to_python(yaml_content) + _compare_validation_results(yaml_result, python_result) + assert python_result.tbl_name == "threshold_test" + assert python_result.label == "Test thresholds" + + +def test_roundtrip_column_validations(): + """Round-trip: various column validation methods.""" + yaml_content = """ + tbl: small_table + steps: + - col_vals_gt: + columns: [d] + value: 0 + - col_vals_lt: + columns: [a] + value: 100 + - col_vals_between: + columns: [c] + left: 1 + right: 10 + - col_vals_not_null: + columns: [date] + - col_vals_in_set: + columns: [f] + set: [low, mid, high] + """ + yaml_result = yaml_interrogate(yaml_content) + python_result = _exec_yaml_to_python(yaml_content) + _compare_validation_results(yaml_result, python_result) + + +def test_roundtrip_regex_and_schema(): + """Round-trip: regex and column existence validation.""" + yaml_content = """ + tbl: small_table + steps: + - col_vals_regex: + columns: [b] + pattern: '[0-9]-[a-z]{3}-[0-9]{3}' + - 
col_exists: + columns: [date, a, b, c, d, f] + """ + yaml_result = yaml_interrogate(yaml_content) + python_result = _exec_yaml_to_python(yaml_content) + _compare_validation_results(yaml_result, python_result) + + +def test_roundtrip_governance_metadata(): + """Round-trip: governance metadata (owner, consumers, version) preserved.""" + yaml_content = """ + tbl: small_table + tbl_name: governance_test + owner: Data Engineering + consumers: [Analytics, Finance] + version: "2.1.0" + steps: + - col_vals_not_null: + columns: [a] + """ + yaml_result = yaml_interrogate(yaml_content) + python_result = _exec_yaml_to_python(yaml_content) + _compare_validation_results(yaml_result, python_result) + assert python_result.owner == "Data Engineering" + assert python_result.consumers == ["Analytics", "Finance"] + assert python_result.version == "2.1.0" + + +def test_roundtrip_aggregate_methods(): + """Round-trip: aggregate validation methods (col_sum_gt, col_avg_le).""" + yaml_content = """ + tbl: small_table + steps: + - col_sum_gt: + columns: [d] + value: 0 + - col_avg_le: + columns: [a] + value: 10 + """ + yaml_result = yaml_interrogate(yaml_content) + python_result = _exec_yaml_to_python(yaml_content) + _compare_validation_results(yaml_result, python_result) + + +def test_roundtrip_mixed_workflow(): + """Round-trip: comprehensive workflow mixing multiple method types.""" + yaml_content = """ + tbl: small_table + tbl_name: mixed_test + label: Comprehensive test + thresholds: + warning: 0.1 + error: 0.25 + critical: 0.35 + steps: + - rows_distinct + - col_exists: + columns: [date, a, b] + - col_vals_gt: + columns: [d] + value: 100 + - col_vals_regex: + columns: [b] + pattern: '[0-9]-[a-z]{3}-[0-9]{3}' + - col_vals_not_null: + columns: [date, a] + - col_vals_between: + columns: [c] + left: 1 + right: 10 + - col_vals_in_set: + columns: [f] + set: [low, mid, high] + """ + yaml_result = yaml_interrogate(yaml_content) + python_result = _exec_yaml_to_python(yaml_content) + 
_compare_validation_results(yaml_result, python_result) + + +def test_roundtrip_with_brief(): + """Round-trip: validation steps with brief descriptions.""" + yaml_content = """ + tbl: small_table + steps: + - col_vals_gt: + columns: [d] + value: 100 + brief: "Values must exceed 100" + - col_vals_not_null: + columns: [a] + brief: true + """ + yaml_result = yaml_interrogate(yaml_content) + python_result = _exec_yaml_to_python(yaml_content) + _compare_validation_results(yaml_result, python_result) + + +def test_roundtrip_row_validations(): + """Round-trip: row-level validations.""" + yaml_content = """ + tbl: small_table + steps: + - rows_distinct + - rows_complete + - row_count_match: + count: 13 + """ + yaml_result = yaml_interrogate(yaml_content) + python_result = _exec_yaml_to_python(yaml_content) + _compare_validation_results(yaml_result, python_result) + + +def test_roundtrip_step_level_thresholds(): + """Round-trip: step-level thresholds.""" + yaml_content = """ + tbl: small_table + steps: + - col_vals_gt: + columns: [d] + value: 100 + thresholds: + warning: 0.05 + error: 0.15 + - col_vals_not_null: + columns: [a] + thresholds: + warning: 0.01 + """ + yaml_result = yaml_interrogate(yaml_content) + python_result = _exec_yaml_to_python(yaml_content) + _compare_validation_results(yaml_result, python_result) From 0695afdb833c5cfa128bc72ba99d5bc5d6685815 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Mon, 23 Feb 2026 15:39:50 -0500 Subject: [PATCH 08/11] Include governance and aggregates in YAML docs --- pointblank/yaml.py | 143 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) diff --git a/pointblank/yaml.py b/pointblank/yaml.py index d57683048..0e03441a1 100644 --- a/pointblank/yaml.py +++ b/pointblank/yaml.py @@ -1082,6 +1082,94 @@ def my_custom_action(): pipeline or version control system, allowing you to maintain validation rules alongside your code. 
+ ### Governance Metadata + + YAML workflows support governance metadata via `owner`, `consumers`, and `version` top-level + keys. These are forwarded to the `Validate` constructor and embedded in the validation report: + + ```{python} + yaml_config = ''' + tbl: small_table + tbl_name: sales_pipeline + owner: Data Engineering + consumers: [Analytics, Finance, Compliance] + version: "2.1.0" + steps: + - col_vals_not_null: + columns: [a, b] + ''' + + result = pb.yaml_interrogate(yaml_config) + print(f"Owner: {result.owner}") + print(f"Consumers: {result.consumers}") + print(f"Version: {result.version}") + ``` + + ### Aggregate Validations + + YAML supports aggregate validation methods for checking column-level statistics. These methods + validate that a column's sum, average, or standard deviation meets a threshold: + + ```{python} + yaml_config = ''' + tbl: small_table + steps: + - col_sum_gt: + columns: [d] + value: 0 + - col_avg_le: + columns: [a] + value: 10 + ''' + + result = pb.yaml_interrogate(yaml_config) + result + ``` + + The 15 available aggregate methods follow the pattern `col_{stat}_{comparator}` where + `{stat}` is `sum`, `avg`, or `sd` and `{comparator}` is `gt`, `lt`, `ge`, + `le`, or `eq`. + + ### Data Freshness + + Check that a date/datetime column has recent data using `data_freshness`: + + ```yaml + tbl: events.csv + steps: + - data_freshness: + columns: event_date + freshness: "24h" + ``` + + ### Active Parameter Shortcut + + The `active=` parameter controls whether a validation step runs. 
It supports boolean values + and Python expression shortcuts: + + ```yaml + steps: + - col_vals_gt: + columns: [d] + value: 100 + active: false # Skip this step + + - col_vals_not_null: + columns: [a] + active: true # Always run (default) + ``` + + ### Null Percentage Check + + Use `col_pct_null` to validate that the percentage of null values in a column is within bounds: + + ```yaml + steps: + - col_pct_null: + columns: [a, b] + value: 0.05 + ``` + ### Using `set_tbl=` to Override the Table The `set_tbl=` parameter allows you to override the table specified in the YAML configuration. @@ -1314,6 +1402,39 @@ def safe_yaml_interrogate(yaml_config): source ('tbl') exists or is accessible. Data source validation occurs during execution with `yaml_interrogate()`. + Supported Top-level Keys + ------------------------ + The following top-level keys are recognized in the YAML configuration: + + - `tbl`: data source specification (required) + - `steps`: list of validation steps (required) + - `tbl_name`: human-readable table name + - `label`: validation description + - `df_library`: DataFrame library (`"polars"`, `"pandas"`, `"duckdb"`) + - `lang`: language code + - `locale`: locale setting + - `brief`: global brief template + - `thresholds`: global failure thresholds + - `actions`: global failure actions + - `final_actions`: actions triggered after all steps complete + - `owner`: data owner (governance metadata) + - `consumers`: data consumers (governance metadata) + - `version`: validation version string (governance metadata) + - `reference`: reference table for comparison-based validations + + Unknown top-level keys are rejected, which catches typos like `tbl_nmae` or `step`. 
+ + Supported Validation Methods + ---------------------------- + In addition to all standard validation methods (e.g., `col_vals_gt`, `rows_distinct`, + `col_schema_match`), the following methods are also supported: + + - `col_pct_null`: check the percentage of null values in a column + - `data_freshness`: check that data is recent + - aggregate methods: `col_sum_gt`, `col_sum_lt`, `col_sum_ge`, `col_sum_le`, + `col_sum_eq`, `col_avg_gt`, `col_avg_lt`, `col_avg_ge`, `col_avg_le`, + `col_avg_eq`, `col_sd_gt`, `col_sd_lt`, `col_sd_ge`, `col_sd_le`, `col_sd_eq` + See Also -------- yaml_interrogate : execute YAML-based validation workflows @@ -1416,6 +1537,28 @@ def yaml_to_python(yaml: Union[str, Path]) -> str: The generated code includes all configuration parameters, thresholds, and maintains the exact same validation logic as the original YAML workflow. + Governance metadata (`owner`, `consumers`, `version`) and `reference` are also rendered + in the generated Python code: + + ```{python} + yaml_config = ''' + tbl: small_table + tbl_name: Sales Pipeline + owner: Data Engineering + consumers: [Analytics, Finance] + version: "2.1.0" + steps: + - col_vals_not_null: + columns: [a] + - col_sum_gt: + columns: [d] + value: 0 + ''' + + python_code = pb.yaml_to_python(yaml_config) + print(python_code) + ``` + This function is also useful for educational purposes, helping users understand how YAML configurations map to the underlying Python API calls. 
""" From aaeeea6645875c2138b2cf4e2db4c08cb7998d80 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Mon, 23 Feb 2026 15:40:08 -0500 Subject: [PATCH 09/11] Add optional metadata and validation examples --- pointblank/cli.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/pointblank/cli.py b/pointblank/cli.py index 3cebc4024..e20b6d2a2 100644 --- a/pointblank/cli.py +++ b/pointblank/cli.py @@ -3944,12 +3944,20 @@ def make_template(output_file: str | None): tbl: small_table # Replace with your data source # Can be: dataset name, CSV file, Parquet file, database connection, etc. +# Optional: DataFrame library ("polars", "pandas", "duckdb") +# df_library: polars + # Optional: Table name for reporting (defaults to filename if not specified) tbl_name: "Example Validation" # Optional: Label for this validation run label: "Validation Template" +# Optional: Governance metadata +# owner: "Data Engineering" +# consumers: [Analytics, Finance] +# version: "1.0.0" + # Optional: Validation thresholds (defaults shown below) # thresholds: # warning: 0.05 # 5% failure rate triggers warning @@ -3993,6 +4001,27 @@ def make_template(output_file: str | None): # columns: status # set: [active, inactive, pending] + # Aggregate validations (uncomment and modify as needed) + # - col_sum_gt: + # columns: revenue + # value: 0 + # brief: "Total revenue is positive" + + # - col_avg_between: + # columns: rating + # left: 1 + # right: 5 + + # Check null percentage (uncomment and modify as needed) + # - col_pct_null: + # columns: [email, phone] + # value: 0.05 + + # Data freshness check (uncomment and modify as needed) + # - data_freshness: + # columns: event_date + # freshness: "24h" + # Add more validation steps as needed # See the Pointblank documentation for the full list of available validation functions """ From 37261e953064513cbe50360a0a800818b82f5728 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Mon, 23 Feb 2026 15:40:41 -0500 Subject: [PATCH 
10/11] Extend YAML reference with metadata and methods --- docs/user-guide/yaml-reference.qmd | 90 ++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/docs/user-guide/yaml-reference.qmd b/docs/user-guide/yaml-reference.qmd index d34f9bb2b..cbe3341b2 100644 --- a/docs/user-guide/yaml-reference.qmd +++ b/docs/user-guide/yaml-reference.qmd @@ -22,6 +22,12 @@ label: "Validation Description" # OPTIONAL: Description for the validatio lang: "en" # OPTIONAL: Language code (default: "en") locale: "en" # OPTIONAL: Locale setting (default: "en") brief: "Global brief: {auto}" # OPTIONAL: Global brief template +owner: "Data Engineering" # OPTIONAL: Data owner (governance metadata) +consumers: [Analytics, Finance] # OPTIONAL: Data consumers (governance metadata) +version: "1.0.0" # OPTIONAL: Validation version (governance metadata) +reference: # OPTIONAL: Reference table for comparison validations + python: | + pb.load_dataset("ref_table") thresholds: # OPTIONAL: Global failure thresholds warning: 0.1 error: 0.2 @@ -31,6 +37,9 @@ actions: # OPTIONAL: Global failure actions error: "Error message template" critical: "Critical message template" highest_only: false +final_actions: # OPTIONAL: Actions triggered after all steps complete + warning: "Post-validation warning" + error: "Post-validation error" steps: # REQUIRED: List of validation steps - validation_method_name - validation_method_name: @@ -838,6 +847,68 @@ Examples: - Performance-critical validations with large datasets - When deterministic results are required +### Data Quality Methods + +`col_pct_null`: is the percentage of null values in a column within bounds? 
+ +```yaml +- col_pct_null: + columns: [column_name] # REQUIRED: Column(s) to validate + value: 0.05 # REQUIRED: Maximum allowed null fraction + thresholds: # OPTIONAL: Step-level thresholds + warning: 0.1 + actions: # OPTIONAL: Step-level actions + warning: "Custom message" + brief: "Null rate check" # OPTIONAL: Step description +``` + +`data_freshness`: is the data in a date/datetime column recent? + +```yaml +- data_freshness: + columns: [date_column] # REQUIRED: Date/datetime column + freshness: "24h" # REQUIRED: Maximum age of data + thresholds: # OPTIONAL: Step-level thresholds + warning: 0.1 + actions: # OPTIONAL: Step-level actions + warning: "Custom message" + brief: "Data is recent" # OPTIONAL: Step description +``` + +### Aggregate Validations + +Aggregate methods validate column-level statistics (sum, average, standard deviation) against a +threshold. They follow the pattern `col_{stat}_{comparator}`: + +```yaml +# Sum validations +- col_sum_gt: + columns: [revenue] + value: 0 + brief: "Total revenue is positive" + +# Average validations +- col_avg_le: + columns: [rating] + value: 5 + brief: "Average rating at most 5" + +# Standard deviation validations +- col_sd_lt: + columns: [temperature] + value: 10 + brief: "Temperature variation is bounded" +``` + +Available aggregate methods: + +- **Sum**: `col_sum_gt`, `col_sum_lt`, `col_sum_ge`, `col_sum_le`, `col_sum_eq` +- **Average**: `col_avg_gt`, `col_avg_lt`, `col_avg_ge`, `col_avg_le`, `col_avg_eq` +- **Standard deviation**: `col_sd_gt`, `col_sd_lt`, `col_sd_ge`, `col_sd_le`, `col_sd_eq` + +All aggregate methods accept these common parameters: `columns`, `value`, `thresholds`, `actions`, +`brief`, `active`, and `pre`. 
+ ## Column Selection Patterns All validation methods that accept a `columns` parameter support these selection patterns: @@ -871,6 +942,25 @@ These parameters are available for most validation methods: - `thresholds`: step-level failure thresholds (dict) - `actions`: step-level failure actions (dict) - `brief`: step description (string, boolean, or template) +- `active`: whether the step is active (boolean, default: true) + +### Active Parameter + +The `active` parameter controls whether a validation step runs. It defaults to `true`; set it to +`false` to skip a step without removing it from the configuration: + +```yaml +steps: + # This step will be skipped + - col_vals_gt: + columns: [amount] + value: 0 + active: false + + # This step runs normally (default active: true) + - col_vals_not_null: + columns: [customer_id] +``` ### Brief Parameter Options From a149a45a55869e90a04eb610fd4ed9f3da0ae997 Mon Sep 17 00:00:00 2001 From: Richard Iannone Date: Mon, 23 Feb 2026 15:40:57 -0500 Subject: [PATCH 11/11] Add YAML governance and validation docs --- docs/user-guide/yaml-validation-workflows.qmd | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/docs/user-guide/yaml-validation-workflows.qmd b/docs/user-guide/yaml-validation-workflows.qmd index 521ab5f17..c0e25262e 100644 --- a/docs/user-guide/yaml-validation-workflows.qmd +++ b/docs/user-guide/yaml-validation-workflows.qmd @@ -722,6 +722,111 @@ Brief Templating Options: - `{value}`: the comparison value used in the validation (for single-value comparisons) - `{pattern}`: for regex validations, the pattern being matched +### Governance Metadata + +YAML workflows support governance metadata that identifies ownership and usage of validation +workflows. 
These fields are embedded in the validation report: + +```yaml +tbl: sales_data.csv +tbl_name: "Sales Pipeline" +owner: "Data Engineering" +consumers: [Analytics Team, Finance, Compliance] +version: "2.1.0" +steps: + - col_vals_not_null: + columns: [customer_id, revenue] + - col_vals_gt: + columns: [revenue] + value: 0 +``` + +The `owner`, `consumers`, and `version` fields are forwarded to the `Validate` constructor and +appear in the validation report header. These fields are optional and do not affect validation +behavior. + +### Data Freshness and Null Percentage + +Two additional validation methods support common data quality checks: + +**`data_freshness`**: Validate that a date/datetime column has recent data: + +```yaml +steps: + - data_freshness: + columns: event_date + freshness: "24h" +``` + +**`col_pct_null`**: Validate that the percentage of null values is within bounds: + +```yaml +steps: + - col_pct_null: + columns: [email, phone] + value: 0.05 +``` + +### Aggregate Validations + +Aggregate methods validate column-level statistics like sum, average, and standard deviation: + +```yaml +steps: + # Check that total revenue is positive + - col_sum_gt: + columns: [revenue] + value: 0 + + # Validate average rating is at most 5 + - col_avg_le: + columns: [rating] + value: 5 + + # Ensure temperature variation is bounded + - col_sd_lt: + columns: [temperature] + value: 10 +``` + +Available methods follow the `col_{stat}_{comparator}` pattern where `{stat}` is `sum`, `avg`, or +`sd`, and `{comparator}` is `gt`, `lt`, `ge`, `le`, or `eq` (15 methods in total).
+ +### Step Activation Control + +The `active` parameter allows you to temporarily disable validation steps without removing them +from the configuration: + +```yaml +steps: + # This step is disabled + - col_vals_gt: + columns: [amount] + value: 0 + active: false + + # This step runs normally (active: true is the default) + - col_vals_not_null: + columns: [customer_id] +``` + +This is useful for debugging, phased rollouts, or temporarily skipping steps that are known to fail. + +### Reference Tables + +The `reference` top-level key specifies a reference table for comparison-based validations: + +```yaml +tbl: current_data.csv +reference: + python: | + pb.load_dataset("baseline_data", tbl_type="polars") +steps: + - tbl_match: + tbl_compare: + python: | + pb.load_dataset("baseline_data", tbl_type="polars") +``` ## Working with YAML Files