From 7486594ce90d110b2a77eeec45da8fc629a52b86 Mon Sep 17 00:00:00 2001 From: Meghansaha Date: Wed, 7 Jan 2026 21:24:34 -0500 Subject: [PATCH 01/11] add note to find spot later --- pointblank/validate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pointblank/validate.py b/pointblank/validate.py index b77260c21..ea70f49a7 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -16369,6 +16369,7 @@ def get_tabular_report( # (it is either Polars or Pandas) df = df_lib.DataFrame(validation_info_dict) + # !!!!!!!! Meghan: NTS this may be the breakpoint here to extract out to df, investigate # Return the DataFrame as a Great Tables table gt_tbl = ( GT(df, id="pb_tbl") From db2b70a86f5d76bce7482789c82cd77601688793 Mon Sep 17 00:00:00 2001 From: Meghansaha Date: Sat, 24 Jan 2026 22:54:32 -0500 Subject: [PATCH 02/11] add in polars lib check; prep for dict handling and schema creation --- pointblank/validate.py | 89 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-) diff --git a/pointblank/validate.py b/pointblank/validate.py index ea70f49a7..63f6a1c0a 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -16369,7 +16369,6 @@ def get_tabular_report( # (it is either Polars or Pandas) df = df_lib.DataFrame(validation_info_dict) - # !!!!!!!! Meghan: NTS this may be the breakpoint here to extract out to df, investigate # Return the DataFrame as a Great Tables table gt_tbl = ( GT(df, id="pb_tbl") @@ -16872,6 +16871,94 @@ def get_step_report( return step_report + def get_dataframe( + self, + tbl_type: Literal["polars", "pandas", "duckdb"] = "polars", + keep_extracts: bool = False, + ): + """ + Validation results as a dataframe + + The `get_dataframe()` method returns a dataframe that represents the validation + report. This dataframe provides a summary of the validation results, including the + validation steps, the number of test units, the number of failing test units, and the + fraction of failing test units. This can be particularly helpful for logging purposes + and enables write validation summaries to CSVs and other on-disk formats. + + Parameters + ---------- + tbl_type : + The output backend for the dataframe. The named options are `"polars"`, + `"pandas"`, and `"duckdb"`. Default is 'polars'. + + keep_extracts: + An option to keep any collected extract data for failing rows from validation steps. By + default, this is `False` (i.e., extract data is removed to save space). + + Supported DataFrame Types + ------------------------- + The `tbl_type=` parameter can be set to one of the following: + + - `"polars"`: A Polars DataFrame. + - `"pandas"`: A Pandas DataFrame. + - `"duckdb"`: An Ibis table for a DuckDB database. + + Examples + -------- + In a + ```{python} + import pointblank as pb + + # Create a validation + validation = ( + pb.Validate(data=pb.load_dataset("small_table", tbl_type = "duckdb"), label="My validation") + .col_vals_gt(columns="d", value=100) + .col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}") + .interrogate() + ) + + # Get a dataframe of the validation summary results + df_validation = validation.get_dataframe() + + ``` + + """ + + # Raise an error if tbl_type is not one of the supported types + if tbl_type not in ["polars", "pandas", "duckdb"]: + raise ValueError( + f"The DataFrame type `{tbl_type}` is not valid. Choose one of the following:\n" + "- `polars`\n" + "- `pandas`\n" + "- `duckdb`" + ) + + # Grab the summary data from validation info helper function + report = _validation_info_as_dict(self.validation_info) + + # Pop the extracts off unless specified to keep + if keep_extracts is False and "extract" in report: + report.pop("extract") + + # Remove keys to be dropped + # MEGHAN pick up Here!!! + # I need to + # 1) assess which keys should be turned innto columns + # 2) Determine which keys are used for conditionals (active = data from step shows, inactive replace with "-") + # 3) Create a schema + # 4) return the df + + # Check for polars, raise if not installed + if tbl_type == "polars": + if not _is_lib_present(lib_name="polars"): + raise ImportError( + "The Polars library is not installed but is required when specifying " + '`tbl_type="polars".' + ) + + # Create the schema for the df + schema = pl.Schema({}) + def _add_validation(self, validation_info): """ Add a validation to the list of validations. From 3f37b6140e308d0c77cf233be6d65e9cce072667 Mon Sep 17 00:00:00 2001 From: Meghansaha Date: Sun, 1 Feb 2026 11:03:53 -0500 Subject: [PATCH 03/11] add schema --- pointblank/validate.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/pointblank/validate.py b/pointblank/validate.py index 63f6a1c0a..ee5996dd1 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -16905,7 +16905,7 @@ def get_dataframe( Examples -------- - In a + ```{python} import pointblank as pb @@ -16943,9 +16943,9 @@ def get_dataframe( # Remove keys to be dropped # MEGHAN pick up Here!!! # I need to - # 1) assess which keys should be turned innto columns + # 1) assess which keys should be turned innto columns - DONE # 2) Determine which keys are used for conditionals (active = data from step shows, inactive replace with "-") - # 3) Create a schema + # 3) Create a schema - DONE # 4) return the df # Check for polars, raise if not installed @@ -16956,8 +16956,24 @@ def get_dataframe( '`tbl_type="polars".' ) - # Create the schema for the df - schema = pl.Schema({}) + # Create the schema for the df + schema = pl.Schema( + { + "step": pl.String, # assertion_type + "columns": pl.String, # column + "values": pl.String, # values + "tbl": pl.String, # pre + "eval": pl.String, # active + "units": pl.Int64, # n + "pass_n": pl.Int64, # n_passed + "pass_pct": pl.Float64, # f_passed + "fail_n": pl.Int64, # n_failed + "fail_pct": pl.Float64, # f_failed + "warning": pl.Boolean, # warning + "error": pl.Boolean, # error + "critical": pl.Boolean, # critical + } + ) def _add_validation(self, validation_info): """ From fc2b74509ed120d057ad28f4b9c15cc7a5ffe757 Mon Sep 17 00:00:00 2001 From: Meghansaha Date: Tue, 3 Feb 2026 23:33:19 -0500 Subject: [PATCH 04/11] proto df --- pointblank/validate.py | 49 +++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/pointblank/validate.py b/pointblank/validate.py index ee5996dd1..8c8ea4360 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -16934,11 +16934,11 @@ def get_dataframe( ) # Grab the summary data from validation info helper function - report = _validation_info_as_dict(self.validation_info) + report_original = _validation_info_as_dict(self.validation_info) # Pop the extracts off unless specified to keep - if keep_extracts is False and "extract" in report: - report.pop("extract") + if keep_extracts is False and "extract" in report_original: + report_original.pop("extract") # Remove keys to be dropped # MEGHAN pick up Here!!! @@ -16955,26 +16955,49 @@ def get_dataframe( "The Polars library is not installed but is required when specifying " '`tbl_type="polars".' ) + import polars as pl # Create the schema for the df schema = pl.Schema( { - "step": pl.String, # assertion_type - "columns": pl.String, # column - "values": pl.String, # values - "tbl": pl.String, # pre - "eval": pl.String, # active - "units": pl.Int64, # n - "pass_n": pl.Int64, # n_passed - "pass_pct": pl.Float64, # f_passed - "fail_n": pl.Int64, # n_failed - "fail_pct": pl.Float64, # f_failed + "assertion_type": pl.String, # assertion_type + "column": pl.String, # column + "values": pl.Unknown, # values + "pre": pl.Unknown, # pre + "active": pl.String, # active + "n": pl.Int64, # n + "n_passed": pl.Int64, # n_passed + "f_passed": pl.Float64, # f_passed + "n_failed": pl.Int64, # n_failed + "f_failed": pl.Float64, # f_failed "warning": pl.Boolean, # warning "error": pl.Boolean, # error "critical": pl.Boolean, # critical } ) + names_dict = { + "assertion_type": "step", + "column": "columns", + "values": "values", + "pre": "tbl", + "active": "eval", + "n": "units", + "n_passed": "pass_n", + "f_passed": "pass_pct", + "n_failed": "failed_n", + "f_failed": "failed_pct", + "warning": "warning", + "error": "error", + "critical": "critical", + } + + report = {key: report_original[key] for key in names_dict.keys() if key in report_original} + + df_validation_results = pl.DataFrame(data=report, schema=schema, strict=False).rename( + names_dict + ) + def _add_validation(self, validation_info): """ Add a validation to the list of validations. From 698fc483af333d80d0233cba5c68b05f9006c6e9 Mon Sep 17 00:00:00 2001 From: Meghansaha Date: Mon, 9 Feb 2026 19:03:00 -0500 Subject: [PATCH 05/11] Created draft df to be returned --- pointblank/validate.py | 1376 +++++++++++++++++++++++++++++----------- 1 file changed, 1010 insertions(+), 366 deletions(-) diff --git a/pointblank/validate.py b/pointblank/validate.py index 8c8ea4360..88f303669 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -1,3 +1,4 @@ +# ruff: noqa from __future__ import annotations import base64 @@ -25,7 +26,11 @@ from great_tables.vals import fmt_integer, fmt_number from importlib_resources import files -from pointblank._agg import is_valid_agg, load_validation_method_grid, resolve_agg_registries +from pointblank._agg import ( + is_valid_agg, + load_validation_method_grid, + resolve_agg_registries, +) from pointblank._constants import ( ASSERTION_TYPE_METHOD_MAP, CHECK_MARK_SPAN, @@ -132,7 +137,12 @@ import polars as pl from narwhals.typing import IntoDataFrame, IntoFrame - from pointblank._typing import AbsoluteBounds, Tolerance, _CompliantValue, _CompliantValues + from pointblank._typing import ( + AbsoluteBounds, + Tolerance, + _CompliantValue, + _CompliantValues, + ) __all__ = [ @@ -439,14 +449,20 @@ def config( global global_config global_config.report_incl_header = report_incl_header # pragma: no cover global_config.report_incl_footer = report_incl_footer # pragma: no cover - global_config.report_incl_footer_timings = report_incl_footer_timings # pragma: no cover - global_config.report_incl_footer_notes = report_incl_footer_notes # pragma: no cover + global_config.report_incl_footer_timings = ( + report_incl_footer_timings # pragma: no cover + ) + global_config.report_incl_footer_notes = ( + report_incl_footer_notes # pragma: no cover + ) global_config.preview_incl_header = preview_incl_header # pragma: no cover return global_config # pragma: no cover def load_dataset( - dataset: Literal["small_table", "game_revenue", "nycflights", "global_sales"] = "small_table", + dataset: Literal[ + "small_table", "game_revenue", "nycflights", "global_sales" + ] = "small_table", tbl_type: Literal["polars", "pandas", "duckdb"] = "polars", ) -> Any: """ @@ -581,7 +597,9 @@ def load_dataset( import polars as pl - dataset = pl.read_csv(ZipFile(data_path).read(f"{dataset}.csv"), try_parse_dates=True) + dataset = pl.read_csv( + ZipFile(data_path).read(f"{dataset}.csv"), try_parse_dates=True + ) if tbl_type == "pandas": if not _is_lib_present(lib_name="pandas"): @@ -739,7 +757,9 @@ def read_file(filepath: str | Path) -> Validate: pass # pragma: no cover # Execute the function source code with the enhanced namespace - exec(source_code, execution_namespace, execution_namespace) # pragma: no cover + exec( + source_code, execution_namespace, execution_namespace + ) # pragma: no cover # The function should now be in the execution namespace if func_name in execution_namespace: # pragma: no cover @@ -761,13 +781,17 @@ def read_file(filepath: str | Path) -> Validate: and validation_info._pb_function_name in restored_functions ): func_name = validation_info._pb_function_name # pragma: no cover - validation_info.pre = restored_functions[func_name] # pragma: no cover + validation_info.pre = restored_functions[ + func_name + ] # pragma: no cover # Clean up the temporary attribute delattr(validation_info, "_pb_function_name") # pragma: no cover # Verify that we loaded a Validate object if not isinstance(validation, Validate): # pragma: no cover - raise RuntimeError(f"File does not contain a valid Validate object: {file_path}") + raise RuntimeError( + f"File does not contain a valid Validate object: {file_path}" + ) return validation @@ -775,7 +799,9 @@ def read_file(filepath: str | Path) -> Validate: raise RuntimeError(f"Failed to read validation object from {file_path}: {e}") -def _check_for_unpicklable_objects(validation: Validate) -> tuple[dict[str, str], list[int]]: +def _check_for_unpicklable_objects( + validation: Validate, +) -> tuple[dict[str, str], list[int]]: """ Check for functions and capture source code for preservation across sessions. @@ -828,19 +854,27 @@ def _check_for_unpicklable_objects(validation: Validate) -> tuple[dict[str, str] except (OSError, TypeError): # pragma: no cover # If we can't get source, check if it's at least picklable try: # pragma: no cover - pickle.dumps(func, protocol=pickle.HIGHEST_PROTOCOL) # pragma: no cover + pickle.dumps( + func, protocol=pickle.HIGHEST_PROTOCOL + ) # pragma: no cover # It's picklable but no source: this might cause issues across sessions print( # pragma: no cover f"Warning: Function '{func_name}' is picklable but source code could not be captured. " f"It may not be available when loading in a different session." ) - except (pickle.PicklingError, AttributeError, TypeError): # pragma: no cover + except ( + pickle.PicklingError, + AttributeError, + TypeError, + ): # pragma: no cover # Not picklable and no source: treat as problematic print( # pragma: no cover f"Warning: Function '{func_name}' is not picklable and source could not be captured. " f"It will not be available after saving/loading." ) - unpicklable_lambda_steps.append((i, validation_info)) # pragma: no cover + unpicklable_lambda_steps.append( + (i, validation_info) + ) # pragma: no cover # Only raise error for lambda functions now if unpicklable_lambda_steps: @@ -924,7 +958,11 @@ def _provide_serialization_guidance(validation: Validate) -> None: try: # pragma: no cover pickle.dumps(func, protocol=pickle.HIGHEST_PROTOCOL) # pragma: no cover can_pickle = True # pragma: no cover - except (pickle.PicklingError, AttributeError, TypeError): # pragma: no cover + except ( + pickle.PicklingError, + AttributeError, + TypeError, + ): # pragma: no cover can_pickle = False # pragma: no cover functions_analysis["unpicklable_functions"].append( (i, func_name, func_module) @@ -933,7 +971,9 @@ def _provide_serialization_guidance(validation: Validate) -> None: # Check if it's likely to work across sessions if ( - func_module == "__main__" or not func_module or func_module == "" + func_module == "__main__" + or not func_module + or func_module == "" ): # pragma: no cover # Function defined interactively - risky for cross-session use functions_analysis["interactive_functions"].append( @@ -1319,7 +1359,10 @@ def add_computed_column(df): function_sources, lambda_steps = _check_for_unpicklable_objects(validation_copy) # Create a validation package that includes both the object and function sources - validation_package = {"validation": validation_copy, "function_sources": function_sources} + validation_package = { + "validation": validation_copy, + "function_sources": function_sources, + } # Serialize to disk using pickle try: @@ -1335,7 +1378,9 @@ def add_computed_column(df): ) for func_name in function_sources.keys(): print(f" • {func_name}") - print(" 📥 These functions will be automatically restored when loading") + print( + " 📥 These functions will be automatically restored when loading" + ) # Provide loading instructions preprocessing_funcs = [ @@ -1353,7 +1398,9 @@ def add_computed_column(df): for info in preprocessing_funcs if hasattr(info, "pre") and info.pre ): - print(" # Import any preprocessing functions from their modules") + print( + " # Import any preprocessing functions from their modules" + ) modules_mentioned = set() for info in preprocessing_funcs: if ( @@ -1368,7 +1415,11 @@ def add_computed_column(df): modules_mentioned.add(info.pre.__module__) print(f" validation = pb.read_file('{file_path.name}')") else: - print(" 📖 To load: validation = pb.read_file('{}')".format(file_path.name)) + print( + " 📖 To load: validation = pb.read_file('{}')".format( + file_path.name + ) + ) except Exception as e: # pragma: no cover raise RuntimeError( @@ -1377,7 +1428,9 @@ def add_computed_column(df): def get_data_path( - dataset: Literal["small_table", "game_revenue", "nycflights", "global_sales"] = "small_table", + dataset: Literal[ + "small_table", "game_revenue", "nycflights", "global_sales" + ] = "small_table", file_type: Literal["csv", "parquet", "duckdb"] = "csv", ) -> str: """ @@ -1501,7 +1554,9 @@ def get_data_path( # For CSV files, we need to extract from zip to a temporary location # since most libraries expect actual file paths, not zip contents - with tempfile.NamedTemporaryFile(mode="wb", suffix=".csv", delete=False) as tmp_file: + with tempfile.NamedTemporaryFile( + mode="wb", suffix=".csv", delete=False + ) as tmp_file: with ZipFile(data_path) as zip_file: csv_content = zip_file.read(f"{dataset}.csv") tmp_file.write(csv_content) @@ -1512,12 +1567,16 @@ def get_data_path( data_path = files("pointblank.data") / f"{dataset}.zip" # We'll need to convert CSV to Parquet temporarily - with tempfile.NamedTemporaryFile(mode="wb", suffix=".parquet", delete=False) as tmp_file: + with tempfile.NamedTemporaryFile( + mode="wb", suffix=".parquet", delete=False + ) as tmp_file: # Load CSV data and save as Parquet if _is_lib_present(lib_name="polars"): import polars as pl - df = pl.read_csv(ZipFile(data_path).read(f"{dataset}.csv"), try_parse_dates=True) + df = pl.read_csv( + ZipFile(data_path).read(f"{dataset}.csv"), try_parse_dates=True + ) df.write_parquet(tmp_file.name) elif _is_lib_present(lib_name="pandas"): import pandas as pd @@ -1535,7 +1594,9 @@ def get_data_path( data_path = files("pointblank.data") / f"{dataset}-duckdb.zip" # Extract DuckDB file to temporary location - with tempfile.NamedTemporaryFile(mode="wb", suffix=".ddb", delete=False) as tmp_file: + with tempfile.NamedTemporaryFile( + mode="wb", suffix=".ddb", delete=False + ) as tmp_file: with ZipFile(data_path) as zip_file: ddb_content = zip_file.read(f"{dataset}.ddb") tmp_file.write(ddb_content) @@ -1668,7 +1729,9 @@ def _process_github_url(data: Any) -> Any: return data user, repo, branch, file_path = match.groups() - raw_url = f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/{file_path}" + raw_url = ( + f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/{file_path}" + ) # Download the file content to a temporary file try: @@ -1679,7 +1742,9 @@ def _process_github_url(data: Any) -> Any: file_ext = ".csv" if path_lower.endswith(".csv") else ".parquet" # Create a temporary file - with tempfile.NamedTemporaryFile(mode="wb", suffix=file_ext, delete=False) as tmp_file: + with tempfile.NamedTemporaryFile( + mode="wb", suffix=file_ext, delete=False + ) as tmp_file: tmp_file.write(content) tmp_file_path = tmp_file.name @@ -1913,7 +1978,9 @@ def _process_parquet_input(data: Any) -> Any: dfs = [pd.read_parquet(path) for path in parquet_paths] return pd.concat(dfs, ignore_index=True) except Exception as e: - raise RuntimeError(f"Failed to read Parquet file(s) with Pandas: {e}") from e + raise RuntimeError( + f"Failed to read Parquet file(s) with Pandas: {e}" + ) from e else: raise ImportError( "Neither Polars nor Pandas is available for reading Parquet files. " @@ -2217,7 +2284,9 @@ def _generate_display_table( # Check that the n_head and n_tail aren't greater than the limit if n_head + n_tail > limit: - raise ValueError(f"The sum of `n_head=` and `n_tail=` cannot exceed the limit ({limit}).") + raise ValueError( + f"The sum of `n_head=` and `n_tail=` cannot exceed the limit ({limit})." + ) # Do we have a DataFrame library to work with? We need at least one to display # the table using Great Tables @@ -2261,7 +2330,9 @@ def _generate_display_table( # If `columns_subset=` is not None, resolve the columns to display if columns_subset is not None: - col_names = _get_column_names(data, ibis_tbl=ibis_tbl, df_lib_name_gt=df_lib_name_gt) + col_names = _get_column_names( + data, ibis_tbl=ibis_tbl, df_lib_name_gt=df_lib_name_gt + ) resolved_columns = _validate_columns_subset( columns_subset=columns_subset, col_names=col_names @@ -2278,7 +2349,10 @@ def _generate_display_table( # Select the columns to display in the table with the `resolved_columns` value data = _select_columns( - data, resolved_columns=resolved_columns, ibis_tbl=ibis_tbl, tbl_type=tbl_type + data, + resolved_columns=resolved_columns, + ibis_tbl=ibis_tbl, + tbl_type=tbl_type, ) # From an Ibis table: @@ -2295,7 +2369,11 @@ def _generate_display_table( # Get the row count for the table # Note: ibis tables have count(), to_polars(), to_pandas() methods ibis_rows = data.count() # type: ignore[union-attr] - n_rows = ibis_rows.to_polars() if df_lib_name_gt == "polars" else int(ibis_rows.to_pandas()) + n_rows = ( + ibis_rows.to_polars() + if df_lib_name_gt == "polars" + else int(ibis_rows.to_pandas()) + ) # If n_head + n_tail is greater than the row count, display the entire table if n_head + n_tail > n_rows: @@ -2437,7 +2515,9 @@ def _generate_display_table( # method can be used none_values = {k: data[k].isnull() for k in col_names} - none_values = [(k, i) for k, v in none_values.items() for i, val in enumerate(v) if val] + none_values = [ + (k, i) for k, v in none_values.items() for i, val in enumerate(v) if val + ] # Import Great Tables to get preliminary renders of the columns import great_tables as gt @@ -2456,8 +2536,12 @@ def _generate_display_table( data_col = data.select([column]) # Using Great Tables, render the columns and get the list of values as formatted strings - built_gt = GT(data=data_col).fmt_markdown(columns=column)._build_data(context="html") - column_values = gt.gt._get_column_of_values(built_gt, column_name=column, context="html") + built_gt = ( + GT(data=data_col).fmt_markdown(columns=column)._build_data(context="html") + ) + column_values = gt.gt._get_column_of_values( + built_gt, column_name=column, context="html" + ) # Get the maximum number of characters in the column if column_values: # Check if column_values is not empty @@ -2493,7 +2577,9 @@ def _generate_display_table( if sum_col_widths < min_tbl_width: remaining_width = min_tbl_width - sum_col_widths n_remaining_cols = len(col_widths) - col_widths = [width + remaining_width // n_remaining_cols for width in col_widths] + col_widths = [ + width + remaining_width // n_remaining_cols for width in col_widths + ] # Add the `px` suffix to each of the column widths, stringifying them col_widths = [f"{width}px" for width in col_widths] @@ -2552,9 +2638,13 @@ def _generate_display_table( col_dtype_labels_dict = {"_row_num_": ""} | col_dtype_labels_dict # Create the label, table type, and thresholds HTML fragments - table_type_html = _create_table_type_html(tbl_type=tbl_type, tbl_name=None, font_size="10px") + table_type_html = _create_table_type_html( + tbl_type=tbl_type, tbl_name=None, font_size="10px" + ) - tbl_dims_html = _create_table_dims_html(columns=n_columns, rows=n_rows, font_size="10px") + tbl_dims_html = _create_table_dims_html( + columns=n_columns, rows=n_rows, font_size="10px" + ) # Compose the subtitle HTML fragment combined_subtitle = ( @@ -2579,11 +2669,15 @@ def _generate_display_table( locations=loc.body(), ) .tab_style( - style=style.text(color="black", font=google_font(name="IBM Plex Mono"), size="12px"), + style=style.text( + color="black", font=google_font(name="IBM Plex Mono"), size="12px" + ), locations=loc.body(), ) .tab_style( - style=style.text(color="gray20", font=google_font(name="IBM Plex Mono"), size="12px"), + style=style.text( + color="gray20", font=google_font(name="IBM Plex Mono"), size="12px" + ), locations=loc.column_labels(), ) .tab_style( @@ -2623,15 +2717,21 @@ def _generate_display_table( if not full_dataset: gt_tbl = gt_tbl.tab_style( - style=style.borders(sides="bottom", color="#6699CC80", style="solid", weight="2px"), + style=style.borders( + sides="bottom", color="#6699CC80", style="solid", weight="2px" + ), locations=loc.body(rows=n_head - 1), ) if show_row_numbers: gt_tbl = gt_tbl.tab_style( style=[ - style.text(color="gray", font=google_font(name="IBM Plex Mono"), size="10px"), - style.borders(sides="right", color="#6699CC80", style="solid", weight="2px"), + style.text( + color="gray", font=google_font(name="IBM Plex Mono"), size="10px" + ), + style.borders( + sides="right", color="#6699CC80", style="solid", weight="2px" + ), ], locations=loc.body(columns="_row_num_"), ) @@ -2807,9 +2907,13 @@ def _calculate_missing_proportions(use_polars_conversion: bool = False): # Apply the appropriate conversion method if use_polars_conversion: - null_sum_converted = null_sum.to_polars() # pragma: no cover + null_sum_converted = ( + null_sum.to_polars() + ) # pragma: no cover else: - null_sum_converted = null_sum.to_pandas() # pragma: no cover + null_sum_converted = ( + null_sum.to_pandas() + ) # pragma: no cover missing_prop = (null_sum_converted / sector_size) * 100 col_missing_props.append(missing_prop) @@ -2880,7 +2984,9 @@ def _calculate_missing_proportions_dataframe(is_polars=False): col: [ ( getattr( - data[(cut_points[i - 1] if i > 0 else 0) : cut_points[i]][col], + data[(cut_points[i - 1] if i > 0 else 0) : cut_points[i]][ + col + ], null_method, )().sum() / (cut_points[i] - (cut_points[i - 1] if i > 0 else 0)) @@ -2952,13 +3058,17 @@ def _calculate_missing_proportions_dataframe(is_polars=False): window = Window.orderBy( pyspark_col(data.columns[0]) ) # Order by first column - sector_data = data.withColumn("row_num", row_number().over(window)).filter( + sector_data = data.withColumn( + "row_num", row_number().over(window) + ).filter( (pyspark_col("row_num") > start_row) & (pyspark_col("row_num") <= end_row) ) # Count nulls in this sector - null_count = sector_data.filter(pyspark_col(col_name).isNull()).count() + null_count = sector_data.filter( + pyspark_col(col_name).isNull() + ).count() missing_prop = (null_count / sector_size) * 100 col_missing_props.append(missing_prop) else: @@ -2974,11 +3084,13 @@ def _calculate_missing_proportions_dataframe(is_polars=False): from pyspark.sql.window import Window window = Window.orderBy(pyspark_col(data.columns[0])) - sector_data = data.withColumn("row_num", row_number().over(window)).filter( - pyspark_col("row_num") > start_row - ) + sector_data = data.withColumn( + "row_num", row_number().over(window) + ).filter(pyspark_col("row_num") > start_row) - null_count = sector_data.filter(pyspark_col(col_name).isNull()).count() + null_count = sector_data.filter( + pyspark_col(col_name).isNull() + ).count() missing_prop = (null_count / sector_size) * 100 col_missing_props.append(missing_prop) else: @@ -3021,9 +3133,13 @@ def _calculate_missing_proportions_dataframe(is_polars=False): n_missing_total_fmt = _format_to_integer_value(n_missing_total) # Create the label, table type, and thresholds HTML fragments - table_type_html = _create_table_type_html(tbl_type=tbl_type, tbl_name=None, font_size="10px") + table_type_html = _create_table_type_html( + tbl_type=tbl_type, tbl_name=None, font_size="10px" + ) - tbl_dims_html = _create_table_dims_html(columns=len(col_names), rows=n_rows, font_size="10px") + tbl_dims_html = _create_table_dims_html( + columns=len(col_names), rows=n_rows, font_size="10px" + ) check_mark = '' @@ -3052,7 +3168,10 @@ def _calculate_missing_proportions_dataframe(is_polars=False): row_ranges_html = ( "
    " + "".join( - [f"
  1. {row_range[0]} – {row_range[1]}
  2. " for row_range in zip(*row_ranges)] + [ + f"
  3. {row_range[0]} – {row_range[1]}
  4. " + for row_range in zip(*row_ranges) + ] ) + "
" ) @@ -3120,7 +3239,9 @@ def _calculate_missing_proportions_dataframe(is_polars=False): locations=loc.body(), ) .tab_style( - style=style.text(color="black", font=google_font(name="IBM Plex Mono"), size="12px"), + style=style.text( + color="black", font=google_font(name="IBM Plex Mono"), size="12px" + ), locations=loc.body(), ) .tab_style( @@ -3139,7 +3260,8 @@ def _calculate_missing_proportions_dataframe(is_polars=False): import polars.selectors as cs missing_vals_tbl = missing_vals_tbl.tab_style( - style=style.fill(color="lightblue"), locations=loc.body(mask=cs.numeric().eq(0)) + style=style.fill(color="lightblue"), + locations=loc.body(mask=cs.numeric().eq(0)), ) if df_lib_name_gt == "pandas": @@ -3257,7 +3379,9 @@ def _validate_columns_subset( ) -> list[str]: if isinstance(columns_subset, str): if columns_subset not in col_names: - raise ValueError("The `columns_subset=` value doesn't match any columns in the table.") + raise ValueError( + "The `columns_subset=` value doesn't match any columns in the table." + ) return [columns_subset] if isinstance(columns_subset, list): @@ -3271,7 +3395,9 @@ def _validate_columns_subset( return columns_subset.resolve(columns=col_names) # type: ignore[union-attr] -def _select_columns(data: Any, resolved_columns: list[str], ibis_tbl: bool, tbl_type: str) -> Any: +def _select_columns( + data: Any, resolved_columns: list[str], ibis_tbl: bool, tbl_type: str +) -> Any: if ibis_tbl: return data[resolved_columns] if tbl_type == "polars": @@ -3446,7 +3572,9 @@ def get_column_count(data: Any) -> int: if "pandas" in str(type(data)): return data.shape[1] # pragma: no cover else: - raise ValueError("The input table type supplied in `data=` is not supported.") + raise ValueError( + "The input table type supplied in `data=` is not supported." + ) def _extract_enum_values(set_values: Any) -> list[Any]: @@ -3666,7 +3794,9 @@ def get_row_count(data: Any) -> int: elif "pyspark" in str(type(data)): # pragma: no cover return data.count() else: - raise ValueError("The input table type supplied in `data=` is not supported.") + raise ValueError( + "The input table type supplied in `data=` is not supported." + ) @dataclass @@ -3861,7 +3991,9 @@ def _add_note(self, key: str, markdown: str, text: str | None = None) -> None: # Add the note entry self.notes[key] = {"markdown": markdown, "text": text} - def _get_notes(self, format: str = "dict") -> dict[str, dict[str, str]] | list[str] | None: + def _get_notes( + self, format: str = "dict" + ) -> dict[str, dict[str, str]] | list[str] | None: """ Get notes associated with this validation step. @@ -4005,7 +4137,9 @@ def _handle_connection_errors(e: Exception, connection_string: str) -> NoReturn: # Check if this is a missing backend dependency for backend, install_cmd in backend_install_map.items(): - if backend in error_str and ("not found" in error_str or "no module" in error_str): + if backend in error_str and ( + "not found" in error_str or "no module" in error_str + ): raise ConnectionError( f"Missing {backend.upper()} backend for Ibis. Install it with:\n" f" {install_cmd}\n\n" @@ -4176,7 +4310,9 @@ def connect_to_table(connection_string: str) -> Any: # Check if this is a "table not found" error if "table" in error_str and ( - "not found" in error_str or "does not exist" in error_str or "not exist" in error_str + "not found" in error_str + or "does not exist" in error_str + or "not exist" in error_str ): # Try to get available tables for a helpful error message try: # pragma: no cover @@ -4853,7 +4989,9 @@ def __post_init__(self): # Check that `actions` is an Actions object if provided # TODO: allow string, callable, of list of either and upgrade to Actions object - if self.actions is not None and not isinstance(self.actions, Actions): # pragma: no cover + if self.actions is not None and not isinstance( + self.actions, Actions + ): # pragma: no cover raise TypeError( "The `actions=` parameter must be an `Actions` object. " "Please use `Actions()` to wrap your actions." @@ -5382,7 +5520,9 @@ def col_vals_gt( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) columns = _resolve_columns(columns) @@ -5666,7 +5806,9 @@ def col_vals_lt( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later @@ -5953,11 +6095,15 @@ def col_vals_eq( # If value is a string-based date or datetime, convert it to the appropriate type # Allow regular strings to pass through for string comparisons - value = _conditional_string_date_dttm_conversion(value=value, allow_regular_strings=True) + value = _conditional_string_date_dttm_conversion( + value=value, allow_regular_strings=True + ) # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later @@ -6242,11 +6388,15 @@ def col_vals_ne( # If value is a string-based date or datetime, convert it to the appropriate type # Allow regular strings to pass through for string comparisons - value = _conditional_string_date_dttm_conversion(value=value, allow_regular_strings=True) + value = _conditional_string_date_dttm_conversion( + value=value, allow_regular_strings=True + ) # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later @@ -6538,7 +6688,9 @@ def col_vals_ge( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later @@ -6830,7 +6982,9 @@ def col_vals_le( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later @@ -7149,7 +7303,9 @@ def col_vals_between( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later @@ -7469,7 +7625,9 @@ def col_vals_outside( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later @@ -7778,7 +7936,9 @@ class Color(Enum): if val is None: continue if not isinstance(val, (float, int, str)): - raise ValueError("`set=` must be a list of floats, integers, or strings.") + raise ValueError( + "`set=` must be a list of floats, integers, or strings." + ) _check_pre(pre=pre) # TODO: add check for segments @@ -7788,7 +7948,9 @@ class Color(Enum): # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later @@ -8077,7 +8239,9 @@ class InvalidStatus(Enum): # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later @@ -8260,7 +8424,9 @@ def col_vals_increasing( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later @@ -8448,7 +8614,9 @@ def col_vals_decreasing( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later @@ -8697,7 +8865,9 @@ def col_vals_null( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later @@ -8940,7 +9110,9 @@ def col_vals_not_null( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later @@ -9199,7 +9371,9 @@ def col_vals_regex( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later @@ -9484,7 +9658,9 @@ def col_vals_within_spec( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later @@ -9719,7 +9895,9 @@ def col_vals_expr( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # Determine brief to use (global or local) and transform any shorthands of `brief=` @@ -9884,7 +10062,9 @@ def col_exists( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later @@ -10178,7 +10358,9 @@ def col_pct_null( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later @@ -10426,7 +10608,9 @@ def rows_distinct( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) if columns_subset is not None and isinstance(columns_subset, str): @@ -10667,10 +10851,14 @@ def rows_complete( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) - if columns_subset is not None and isinstance(columns_subset, str): # pragma: no cover + if columns_subset is not None and isinstance( + columns_subset, str + ): # pragma: no cover columns_subset = [columns_subset] # pragma: no cover # TODO: incorporate Column object @@ -11033,7 +11221,9 @@ def prompt( try: provider, model_name = model.split(sep=":", maxsplit=1) except ValueError: - raise ValueError(f"Model must be in format 'provider:model_name', got: {model}") + raise ValueError( + f"Model must be in format 'provider:model_name', got: {model}" + ) # Error if an unsupported provider is used if provider not in MODEL_PROVIDERS: @@ -11057,7 +11247,9 @@ def prompt( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # Determine brief to use (global or local) and transform any shorthands of `brief=` @@ -11275,13 +11467,19 @@ def col_schema_match( _check_boolean_input(param=active, param_name="active") _check_boolean_input(param=complete, param_name="complete") _check_boolean_input(param=in_order, param_name="in_order") - _check_boolean_input(param=case_sensitive_colnames, param_name="case_sensitive_colnames") - _check_boolean_input(param=case_sensitive_dtypes, param_name="case_sensitive_dtypes") + _check_boolean_input( + param=case_sensitive_colnames, param_name="case_sensitive_colnames" + ) + _check_boolean_input( + param=case_sensitive_dtypes, param_name="case_sensitive_dtypes" + ) _check_boolean_input(param=full_match_dtypes, param_name="full_match_dtypes") # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # Package up the `schema=` and boolean params into a dictionary for later interrogation @@ -11499,12 +11697,16 @@ def row_count_match( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `count` is a DataFrame or table then use the row count of the DataFrame as # the expected count - if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)): + if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str( + type(count) + ): count = get_row_count(count) # Check the integrity of tolerance @@ -11678,12 +11880,16 @@ def col_count_match( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # If `count` is a DataFrame or table then use the column count of the DataFrame as # the expected count - if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str(type(count)): + if _is_value_a_df(count) or "ibis.expr.types.relations.Table" in str( + type(count) + ): count = get_column_count(count) # Package up the `count=` and boolean params into a dictionary for later interrogation @@ -11952,7 +12158,9 @@ def tbl_match( # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # Package up the `tbl_compare` into a dictionary for later interrogation @@ -12199,7 +12407,9 @@ def filter_by_c_gt_5(df): # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # Determine brief to use (global or local) and transform any shorthands of `brief=` @@ -12547,7 +12757,9 @@ def validate_pointblank_version(): # Determine threshold to use (global or local) and normalize a local `thresholds=` value thresholds = ( - self.thresholds if thresholds is None else _normalize_thresholds_creation(thresholds) + self.thresholds + if thresholds is None + else _normalize_thresholds_creation(thresholds) ) # Determine brief to use (global or local) and transform any shorthands of `brief=` @@ -12678,7 +12890,9 @@ def interrogate( """ # Raise if `get_first_n` and either or `sample_n` or `sample_frac` arguments are provided - if get_first_n is not None and (sample_n is not None or sample_frac is not None): + if get_first_n is not None and ( + sample_n is not None or sample_frac is not None + ): raise ValueError( "The `get_first_n=` argument cannot be provided with the `sample_n=` or " "`sample_frac=` arguments." @@ -12721,7 +12935,9 @@ def interrogate( segment = validation.segments # Get compatible data types for this assertion type - assertion_method = ASSERTION_TYPE_METHOD_MAP.get(assertion_type, assertion_type) + assertion_method = ASSERTION_TYPE_METHOD_MAP.get( + assertion_type, assertion_type + ) compatible_dtypes = COMPATIBLE_DTYPES.get(assertion_method, []) # Process the `brief` text for the validation step by including template variables to @@ -12843,7 +13059,10 @@ def interrogate( processed_cols = get_column_count(data_tbl_step) # Always add a note when preprocessing is applied - if original_rows != processed_rows or original_cols != processed_cols: + if ( + original_rows != processed_rows + or original_cols != processed_cols + ): # Dimensions changed - show the change note_html = _create_preprocessing_note_html( original_rows=original_rows, @@ -12860,7 +13079,9 @@ def interrogate( ) else: # No dimension change - just indicate preprocessing was applied - note_html = _create_preprocessing_no_change_note_html(locale=self.locale) + note_html = _create_preprocessing_no_change_note_html( + locale=self.locale + ) note_text = _create_preprocessing_no_change_note_text() validation._add_note( @@ -12879,7 +13100,10 @@ def interrogate( ) # Check if the target column is in the processed table but not in original - if column in processed_column_names and column not in original_column_names: + if ( + column in processed_column_names + and column not in original_column_names + ): note_html = _create_synthetic_target_column_note_html( column_name=column, locale=self.locale, @@ -12898,7 +13122,9 @@ def interrogate( validation.eval_error = True end_time = datetime.datetime.now(datetime.timezone.utc) validation.proc_duration_s = (end_time - start_time).total_seconds() - validation.time_processed = end_time.isoformat(timespec="milliseconds") + validation.time_processed = end_time.isoformat( + timespec="milliseconds" + ) validation.active = False continue @@ -12928,9 +13154,9 @@ def interrogate( # Set the number of test units # ------------------------------------------------ - validation.n = NumberOfTestUnits(df=data_tbl_step, column=column).get_test_units( - tbl_type=tbl_type - ) + validation.n = NumberOfTestUnits( + df=data_tbl_step, column=column + ).get_test_units(tbl_type=tbl_type) # Check if preprocessing or segmentation resulted in zero rows # Only apply this check to row-based validations, not table-level validations @@ -12980,7 +13206,9 @@ def interrogate( ]: # Process table for column validation tbl = _column_test_prep( - df=data_tbl_step, column=column, allowed_types=compatible_dtypes + df=data_tbl_step, + column=column, + allowed_types=compatible_dtypes, ) if assertion_method == "gt": @@ -13016,8 +13244,12 @@ def interrogate( from pointblank._interrogation import interrogate_increasing # Extract direction options from val_info - allow_stationary = validation.val_info.get("allow_stationary", False) - decreasing_tol = validation.val_info.get("decreasing_tol", 0.0) + allow_stationary = validation.val_info.get( + "allow_stationary", False + ) + decreasing_tol = validation.val_info.get( + "decreasing_tol", 0.0 + ) results_tbl = interrogate_increasing( tbl=tbl, @@ -13031,8 +13263,12 @@ def interrogate( from pointblank._interrogation import interrogate_decreasing # Extract direction options from val_info - allow_stationary = validation.val_info.get("allow_stationary", False) - increasing_tol = validation.val_info.get("increasing_tol", 0.0) + allow_stationary = validation.val_info.get( + "allow_stationary", False + ) + increasing_tol = validation.val_info.get( + "increasing_tol", 0.0 + ) results_tbl = interrogate_decreasing( tbl=tbl, @@ -13063,7 +13299,9 @@ def interrogate( ) elif assertion_type == "col_vals_in_set": - results_tbl = interrogate_isin(tbl=tbl, column=column, set_values=value) + results_tbl = interrogate_isin( + tbl=tbl, column=column, set_values=value + ) elif assertion_type == "col_vals_not_in_set": results_tbl = interrogate_notin( @@ -13076,7 +13314,9 @@ def interrogate( ) elif assertion_type == "col_vals_within_spec": - from pointblank._interrogation import interrogate_within_spec + from pointblank._interrogation import ( + interrogate_within_spec, + ) results_tbl = interrogate_within_spec( tbl=tbl, column=column, values=value, na_pass=na_pass @@ -13108,7 +13348,9 @@ def interrogate( ) elif assertion_type == "rows_complete": - results_tbl = rows_complete(data_tbl=data_tbl_step, columns_subset=column) + results_tbl = rows_complete( + data_tbl=data_tbl_step, columns_subset=column + ) elif assertion_type == "prompt": from pointblank._interrogation import interrogate_prompt @@ -13164,7 +13406,9 @@ def interrogate( schema_info=schema_validation_info ) validation._add_note( - key="schema_check", markdown=schema_note_html, text=schema_note_text + key="schema_check", + markdown=schema_note_html, + text=schema_note_text, ) validation.all_passed = result_bool @@ -13191,7 +13435,9 @@ def interrogate( elif assertion_type == "col_count_match": result_bool = col_count_match( - data_tbl=data_tbl_step, count=value["count"], inverse=value["inverse"] + data_tbl=data_tbl_step, + count=value["count"], + inverse=value["inverse"], ) validation.all_passed = result_bool @@ -13211,7 +13457,9 @@ def interrogate( if callable(tbl_compare): tbl_compare = tbl_compare() - result_bool = tbl_match(data_tbl=data_tbl_step, tbl_compare=tbl_compare) + result_bool = tbl_match( + data_tbl=data_tbl_step, tbl_compare=tbl_compare + ) validation.all_passed = result_bool validation.n = 1 @@ -13247,9 +13495,9 @@ def interrogate( "setting reference data on the Validate object. " "Use Validate(data=..., reference=...) to set reference data." ) - ref_vec: nw.DataFrame = nw.from_native(self.reference).select( - raw_value.column_name - ) + ref_vec: nw.DataFrame = nw.from_native( + self.reference + ).select(raw_value.column_name) target: float | int = agg(ref_vec) else: target = raw_value @@ -13285,14 +13533,19 @@ def interrogate( or ("dtype" in error_msg and "compare" in error_msg) ) - is_column_not_found = "column" in error_msg and "not found" in error_msg + is_column_not_found = ( + "column" in error_msg and "not found" in error_msg + ) is_comparison_column_not_found = ( - "unable to find column" in error_msg and "valid columns" in error_msg + "unable to find column" in error_msg + and "valid columns" in error_msg ) if ( - is_comparison_error or is_column_not_found or is_comparison_column_not_found + is_comparison_error + or is_column_not_found + or is_comparison_column_not_found ): # pragma: no cover # If data quality comparison fails or column not found, mark as eval_error validation.eval_error = True # pragma: no cover @@ -13301,16 +13554,20 @@ def interrogate( if is_column_not_found: note_html = _create_column_not_found_note_html( column_name=column, - available_columns=list(data_tbl_step.columns) - if hasattr(data_tbl_step, "columns") - else [], + available_columns=( + list(data_tbl_step.columns) + if hasattr(data_tbl_step, "columns") + else [] + ), locale=self.locale, ) note_text = _create_column_not_found_note_text( column_name=column, - available_columns=list(data_tbl_step.columns) - if hasattr(data_tbl_step, "columns") - else [], + available_columns=( + list(data_tbl_step.columns) + if hasattr(data_tbl_step, "columns") + else [] + ), ) validation._add_note( key="column_not_found", @@ -13322,14 +13579,19 @@ def interrogate( elif is_comparison_column_not_found: # Extract column name from error message # Error format: 'unable to find column "col_name"; valid columns: ...' - match = re.search(r'unable to find column "([^"]+)"', str(e)) + match = re.search( + r'unable to find column "([^"]+)"', str(e) + ) if match: missing_col_name = match.group(1) # Determine position for between/outside validations position = None - if assertion_type in ["col_vals_between", "col_vals_outside"]: + if assertion_type in [ + "col_vals_between", + "col_vals_outside", + ]: # Check if missing column is in left or right position from pointblank.column import Column @@ -13344,20 +13606,28 @@ def interrogate( ): position = "right" - note_html = _create_comparison_column_not_found_note_html( - column_name=missing_col_name, - position=position, - available_columns=list(data_tbl_step.columns) - if hasattr(data_tbl_step, "columns") - else [], - locale=self.locale, + note_html = ( + _create_comparison_column_not_found_note_html( + column_name=missing_col_name, + position=position, + available_columns=( + list(data_tbl_step.columns) + if hasattr(data_tbl_step, "columns") + else [] + ), + locale=self.locale, + ) ) - note_text = _create_comparison_column_not_found_note_text( - column_name=missing_col_name, - position=position, - available_columns=list(data_tbl_step.columns) - if hasattr(data_tbl_step, "columns") - else [], + note_text = ( + _create_comparison_column_not_found_note_text( + column_name=missing_col_name, + position=position, + available_columns=( + list(data_tbl_step.columns) + if hasattr(data_tbl_step, "columns") + else [] + ), + ) ) validation._add_note( key="comparison_column_not_found", @@ -13365,7 +13635,9 @@ def interrogate( text=note_text, ) - end_time = datetime.datetime.now(datetime.timezone.utc) # pragma: no cover + end_time = datetime.datetime.now( + datetime.timezone.utc + ) # pragma: no cover validation.proc_duration_s = ( end_time - start_time @@ -13427,7 +13699,9 @@ def interrogate( # Solely for the col_vals_in_set assertion type, any Null values in the # `pb_is_good_` column are counted as failing test units if assertion_type == "col_vals_in_set": - null_count = _count_null_values_in_column(tbl=results_tbl, column="pb_is_good_") + null_count = _count_null_values_in_column( + tbl=results_tbl, column="pb_is_good_" + ) validation.n_failed += null_count # For column-value validations, the number of test units is the number of rows @@ -13458,7 +13732,9 @@ def interrogate( validation, level, threshold._threshold_result( - fraction_failing=validation.f_failed, test_units=validation.n, level=level + fraction_failing=validation.f_failed, + test_units=validation.n, + level=level, ), ) @@ -13469,7 +13745,9 @@ def interrogate( threshold_note_html = _create_local_threshold_note_html( thresholds=threshold, locale=self.locale ) - threshold_note_text = _create_local_threshold_note_text(thresholds=threshold) + threshold_note_text = _create_local_threshold_note_text( + thresholds=threshold + ) # Add the note to the validation step validation._add_note( @@ -13480,7 +13758,9 @@ def interrogate( elif self.thresholds != Thresholds(): # Thresholds explicitly reset to empty when global thresholds exist - reset_note_html = _create_threshold_reset_note_html(locale=self.locale) + reset_note_html = _create_threshold_reset_note_html( + locale=self.locale + ) reset_note_text = _create_threshold_reset_note_text() # Add the note to the validation step @@ -13627,7 +13907,9 @@ def interrogate( # Handle LazyFrame row indexing which requires order_by parameter try: # Try without order_by first (for DataFrames) - validation_extract_nw = validation_extract_nw.with_row_index(name="_row_num_") + validation_extract_nw = validation_extract_nw.with_row_index( + name="_row_num_" + ) except TypeError: # LazyFrames require order_by parameter: use first column for ordering first_col = validation_extract_nw.columns[0] @@ -13635,12 +13917,16 @@ def interrogate( name="_row_num_", order_by=first_col ) - validation_extract_nw = validation_extract_nw.filter(~nw.col("pb_is_good_")).drop( + validation_extract_nw = validation_extract_nw.filter( + ~nw.col("pb_is_good_") + ).drop( "pb_is_good_" ) # noqa # Add 1 to the row numbers to make them 1-indexed - validation_extract_nw = validation_extract_nw.with_columns(nw.col("_row_num_") + 1) + validation_extract_nw = validation_extract_nw.with_columns( + nw.col("_row_num_") + 1 + ) # Apply any sampling or limiting to the number of rows to extract if get_first_n is not None: @@ -13651,13 +13937,18 @@ def interrogate( validation_extract_nw = validation_extract_nw.sample(n=sample_n) except AttributeError: # For LazyFrames without sample method, collect first then sample - validation_extract_native = validation_extract_nw.collect().to_native() - if hasattr(validation_extract_native, "sample"): # pragma: no cover + validation_extract_native = ( + validation_extract_nw.collect().to_native() + ) + if hasattr( + validation_extract_native, "sample" + ): # pragma: no cover # PySpark DataFrame has sample method validation_extract_native = ( validation_extract_native.sample( # pragma: no cover fraction=min( - 1.0, sample_n / validation_extract_native.count() + 1.0, + sample_n / validation_extract_native.count(), ) # pragma: no cover ).limit(sample_n) ) # pragma: no cover @@ -13666,32 +13957,40 @@ def interrogate( ) # pragma: no cover else: # Fallback: just take first n rows after collecting - validation_extract_nw = validation_extract_nw.collect().head( - sample_n + validation_extract_nw = ( + validation_extract_nw.collect().head(sample_n) ) # pragma: no cover elif sample_frac is not None: try: - validation_extract_nw = validation_extract_nw.sample(fraction=sample_frac) + validation_extract_nw = validation_extract_nw.sample( + fraction=sample_frac + ) except AttributeError: # pragma: no cover # For LazyFrames without sample method, collect first then sample validation_extract_native = ( validation_extract_nw.collect().to_native() ) # pragma: no cover - if hasattr(validation_extract_native, "sample"): # pragma: no cover + if hasattr( + validation_extract_native, "sample" + ): # pragma: no cover # PySpark DataFrame has sample method - validation_extract_native = validation_extract_native.sample( - fraction=sample_frac + validation_extract_native = ( + validation_extract_native.sample(fraction=sample_frac) ) # pragma: no cover validation_extract_nw = nw.from_native( validation_extract_native ) # pragma: no cover else: # Fallback: use fraction to calculate head size - collected = validation_extract_nw.collect() # pragma: no cover + collected = ( + validation_extract_nw.collect() + ) # pragma: no cover sample_size = max( 1, int(len(collected) * sample_frac) ) # pragma: no cover - validation_extract_nw = collected.head(sample_size) # pragma: no cover + validation_extract_nw = collected.head( + sample_size + ) # pragma: no cover # Ensure a limit is set on the number of rows to extract try: @@ -13720,7 +14019,9 @@ def interrogate( if column is not None: column_names = column column_names_subset = ["_row_num_"] + column - validation_extract_nw = validation_extract_nw.select(column_names_subset) + validation_extract_nw = validation_extract_nw.select( + column_names_subset + ) validation_extract_nw = ( validation_extract_nw.with_columns( @@ -13729,8 +14030,9 @@ def interrogate( # First sort by the columns to group duplicates and by row numbers # within groups; this type of sorting will preserve the original order in a # single operation - .sort(by=["group_min_row"] + column_names + ["_row_num_"]) - .drop("group_min_row") + .sort(by=["group_min_row"] + column_names + ["_row_num_"]).drop( + "group_min_row" + ) ) # Ensure that the extract is collected and set to its native format @@ -14025,7 +14327,8 @@ def assert_below_threshold( validation_step = self.validation_info[step_num - 1] step_descriptor = ( validation_step.autobrief - if hasattr(validation_step, "autobrief") and validation_step.autobrief + if hasattr(validation_step, "autobrief") + and validation_step.autobrief else f"Validation step {step_num}" ) failures.append(f"Step {step_num}: {step_descriptor}") @@ -14035,8 +14338,9 @@ def assert_below_threshold( if message: msg = message else: - msg = f"The following steps exceeded the {level} threshold level:\n" + "\n".join( - failures + msg = ( + f"The following steps exceeded the {level} threshold level:\n" + + "\n".join(failures) ) raise AssertionError(msg) @@ -14181,7 +14485,9 @@ def process_data(validation_obj): # Return True if any steps exceeded the threshold return any(status.values()) - def n(self, i: int | list[int] | None = None, scalar: bool = False) -> dict[int, int] | int: + def n( + self, i: int | list[int] | None = None, scalar: bool = False + ) -> dict[int, int] | int: """ Provides a dictionary of the number of test units for each validation step. @@ -15094,7 +15400,9 @@ def get_data_extracts( return result def get_json_report( - self, use_fields: list[str] | None = None, exclude_fields: list[str] | None = None + self, + use_fields: list[str] | None = None, + exclude_fields: list[str] | None = None, ) -> str: """ Get a report of the validation results as a JSON-formatted string. @@ -15248,7 +15556,8 @@ def get_json_report( for validation_info in self.validation_info: report_entry = { - field: getattr(validation_info, field) for field in VALIDATION_REPORT_FIELDS + field: getattr(validation_info, field) + for field in VALIDATION_REPORT_FIELDS } # If preprocessing functions are included in the report, convert them to strings @@ -15355,14 +15664,17 @@ def get_sundered_data(self, type="pass") -> Any: validation_info = [ validation for validation in self.validation_info - if validation.assertion_type in ROW_BASED_VALIDATION_TYPES and validation.active + if validation.assertion_type in ROW_BASED_VALIDATION_TYPES + and validation.active ] # TODO: ensure that the stored evaluation tables across all steps have not been mutated # from the original table (via any `pre=` functions) # Obtain the validation steps that are to be used for sundering - validation_steps_i = [validation.assertion_type for validation in validation_info] + validation_steps_i = [ + validation.assertion_type for validation in validation_info + ] if len(validation_steps_i) == 0: if type == "pass": @@ -15414,7 +15726,9 @@ def get_sundered_data(self, type="pass") -> Any: if labeled_tbl_nw is None: labeled_tbl_nw = results_tbl else: - labeled_tbl_nw = labeled_tbl_nw.join(results_tbl, on=index_name, how="left") + labeled_tbl_nw = labeled_tbl_nw.join( + results_tbl, on=index_name, how="left" + ) # Get list of columns that are the `pb_is_good_` columns pb_is_good_cols = [f"pb_is_good_{i}" for i in range(len(validation_steps_i))] @@ -15513,7 +15827,9 @@ def get_notes( # Step not found return None - def get_note(self, i: int, key: str, format: str = "dict") -> dict[str, str] | str | None: + def get_note( + self, i: int, key: str, format: str = "dict" + ) -> dict[str, str] | str | None: """ Get a specific note from a validation step by its step number and note key. @@ -15764,7 +16080,9 @@ def get_tabular_report( # Create the label, table type, and thresholds HTML fragments label_html = _create_label_html(label=self.label, start_time="") - table_type_html = _create_table_type_html(tbl_type=tbl_info, tbl_name=self.tbl_name) + table_type_html = _create_table_type_html( + tbl_type=tbl_info, tbl_name=self.tbl_name + ) thresholds_html = _create_thresholds_html( thresholds=thresholds, locale=locale, df_lib=df_lib ) @@ -15784,7 +16102,9 @@ def get_tabular_report( { "status_color": "", "i": "", - "type_upd": VALIDATION_REPORT_TEXT["no_validation_steps_text"][lang], + "type_upd": VALIDATION_REPORT_TEXT["no_validation_steps_text"][ + lang + ], "columns_upd": "", "values_upd": "", "tbl": "", @@ -15806,10 +16126,13 @@ def get_tabular_report( .opt_align_table_header(align=before) .tab_style(style=style.css("height: 20px;"), locations=loc.body()) .tab_style( - style=style.text(weight="bold", color="#666666"), locations=loc.column_labels() + style=style.text(weight="bold", color="#666666"), + locations=loc.column_labels(), ) .tab_style( - style=style.text(size="28px", weight="bold", align=before, color="#444444"), + style=style.text( + size="28px", weight="bold", align=before, color="#444444" + ), locations=loc.title(), ) .tab_style( @@ -15829,7 +16152,9 @@ def get_tabular_report( "status_color": "", "i": "", "type_upd": VALIDATION_REPORT_TEXT["report_col_step"][lang], - "columns_upd": VALIDATION_REPORT_TEXT["report_col_columns"][lang], + "columns_upd": VALIDATION_REPORT_TEXT["report_col_columns"][ + lang + ], "values_upd": VALIDATION_REPORT_TEXT["report_col_values"][lang], "tbl": "TBL", "eval": "EVAL", @@ -15865,11 +16190,15 @@ def get_tabular_report( columns=["tbl", "eval", "w_upd", "e_upd", "c_upd", "extract_upd"], ) .cols_align(align="right", columns=["test_units", "pass", "fail"]) - .cols_align(align=before, columns=["type_upd", "columns_upd", "values_upd"]) + .cols_align( + align=before, columns=["type_upd", "columns_upd", "values_upd"] + ) .cols_move_to_start(columns=column_order) .tab_options(table_font_size="90%") .tab_source_note( - source_note=VALIDATION_REPORT_TEXT["use_validation_methods_text"][lang] + source_note=VALIDATION_REPORT_TEXT["use_validation_methods_text"][ + lang + ] ) ) @@ -15879,7 +16208,9 @@ def get_tabular_report( ) # pragma: no cover if incl_header: - gt_tbl = gt_tbl.tab_header(title=html(title_text), subtitle=html(combined_subtitle)) + gt_tbl = gt_tbl.tab_header( + title=html(title_text), subtitle=html(combined_subtitle) + ) # If the version of `great_tables` is `>=0.17.0` then disable Quarto table processing if version("great_tables") >= "0.17.0": @@ -15888,12 +16219,16 @@ def get_tabular_report( return gt_tbl # Convert the `validation_info` object to a dictionary - validation_info_dict = _validation_info_as_dict(validation_info=self.validation_info) + validation_info_dict = _validation_info_as_dict( + validation_info=self.validation_info + ) # Has the validation been performed? We can check the first `time_processed` entry in the # dictionary to see if it is `None` or not; The output of many cells in the reporting table # will be made blank if the validation has not been performed - interrogation_performed = validation_info_dict.get("proc_duration_s", [None])[0] is not None + interrogation_performed = ( + validation_info_dict.get("proc_duration_s", [None])[0] is not None + ) # Determine which steps are those using segmented data segmented_steps = [ @@ -15936,7 +16271,9 @@ def get_tabular_report( for i, column in enumerate(columns): # Check if this validation has a synthetic target column note has_synthetic_column = ( - notes[i] is not None and isinstance(notes[i], dict) and "syn_target_col" in notes[i] + notes[i] is not None + and isinstance(notes[i], dict) + and "syn_target_col" in notes[i] ) column_text = None @@ -16013,7 +16350,9 @@ def get_tabular_report( elif assertion_type[i] in ["col_vals_between", "col_vals_outside"]: left_bracket = "[" if inclusive[i][0] else "(" right_bracket = "]" if inclusive[i][1] else ")" - values_upd.append(f"{left_bracket}{value[0]}, {value[1]}{right_bracket}") + values_upd.append( + f"{left_bracket}{value[0]}, {value[1]}{right_bracket}" + ) # If the assertion type is a comparison of a set of values; strip the leading and # trailing square brackets and single quotes @@ -16213,7 +16552,9 @@ def get_tabular_report( status_color_list.append(SEVERITY_LEVEL_COLORS["warning"]) # CASE 4 else: # No threshold exceeded for {W, E, C} and NOT `all_passed` - status_color_list.append(SEVERITY_LEVEL_COLORS["green"] + "66") # CASE 5 + status_color_list.append( + SEVERITY_LEVEL_COLORS["green"] + "66" + ) # CASE 5 # Add the `status_color` entry to the dictionary validation_info_dict["status_color"] = status_color_list @@ -16265,7 +16606,9 @@ def get_tabular_report( csv_text = extract_nw.write_csv() # Use Base64 encoding to encode the CSV text - csv_text_encoded = base64.b64encode(csv_text.encode("utf-8")).decode("utf-8") + csv_text_encoded = base64.b64encode(csv_text.encode("utf-8")).decode( + "utf-8" + ) output_file_name = f"extract_{format(step_num, '04d')}.csv" @@ -16334,10 +16677,14 @@ def get_tabular_report( # If no interrogation performed, populate the `i` entry with a sequence of integers # from `1` to the number of validation steps if not interrogation_performed: - validation_info_dict["i"] = list(range(1, len(validation_info_dict["type_upd"]) + 1)) + validation_info_dict["i"] = list( + range(1, len(validation_info_dict["type_upd"]) + 1) + ) # Create a table time string - table_time = _create_table_time_html(time_start=self.time_start, time_end=self.time_end) + table_time = _create_table_time_html( + time_start=self.time_start, time_end=self.time_end + ) # Create the title text title_text = _get_title_text( @@ -16349,7 +16696,9 @@ def get_tabular_report( # Create the label, table type, and thresholds HTML fragments label_html = _create_label_html(label=self.label, start_time=self.time_start) - table_type_html = _create_table_type_html(tbl_type=tbl_info, tbl_name=self.tbl_name) + table_type_html = _create_table_type_html( + tbl_type=tbl_info, tbl_name=self.tbl_name + ) thresholds_html = _create_thresholds_html( thresholds=thresholds, locale=locale, df_lib=df_lib ) @@ -16381,10 +16730,13 @@ def get_tabular_report( locations=loc.body(columns="i"), ) .tab_style( - style=style.text(weight="bold", color="#666666"), locations=loc.column_labels() + style=style.text(weight="bold", color="#666666"), + locations=loc.column_labels(), ) .tab_style( - style=style.text(size="28px", weight="bold", align=before, color="#444444"), + style=style.text( + size="28px", weight="bold", align=before, color="#444444" + ), locations=loc.title(), ) .tab_style( @@ -16392,7 +16744,14 @@ def get_tabular_report( color="black", font=google_font(name="IBM Plex Mono"), size="11px" ), locations=loc.body( - columns=["type_upd", "columns_upd", "values_upd", "test_units", "pass", "fail"] + columns=[ + "type_upd", + "columns_upd", + "values_upd", + "test_units", + "pass", + "fail", + ] ), ) .tab_style( @@ -16400,11 +16759,15 @@ def get_tabular_report( locations=loc.body(columns="type_upd", rows=segmented_steps), ) .tab_style( - style=style.fill(color="#FCFCFC" if interrogation_performed else "white"), + style=style.fill( + color="#FCFCFC" if interrogation_performed else "white" + ), locations=loc.body(columns=["w_upd", "e_upd", "c_upd"]), ) .tab_style( - style=style.fill(color="#FCFCFC" if interrogation_performed else "white"), + style=style.fill( + color="#FCFCFC" if interrogation_performed else "white" + ), locations=loc.body(columns=["tbl", "eval"]), ) .tab_style( @@ -16453,7 +16816,11 @@ def get_tabular_report( ) .tab_style( style=style.fill( - color=from_column(column="status_color") if interrogation_performed else "white" + color=( + from_column(column="status_color") + if interrogation_performed + else "white" + ) ), locations=loc.body(columns="status_color"), ) @@ -16462,7 +16829,9 @@ def get_tabular_report( locations=loc.body(columns="status_color"), ) .tab_style( - style=style.css("white-space: nowrap; text-overflow: ellipsis; overflow: hidden;"), + style=style.css( + "white-space: nowrap; text-overflow: ellipsis; overflow: hidden;" + ), locations=loc.body(columns=["columns_upd", "values_upd"]), ) .cols_label( @@ -16502,7 +16871,8 @@ def get_tabular_report( } ) .cols_align( - align="center", columns=["tbl", "eval", "w_upd", "e_upd", "c_upd", "extract_upd"] + align="center", + columns=["tbl", "eval", "w_upd", "e_upd", "c_upd", "extract_upd"], ) .cols_align(align="right", columns=["test_units", "pass", "fail"]) .cols_align(align=before, columns=["type_upd", "columns_upd", "values_upd"]) @@ -16511,7 +16881,9 @@ def get_tabular_report( ) if incl_header: - gt_tbl = gt_tbl.tab_header(title=html(title_text), subtitle=html(combined_subtitle)) + gt_tbl = gt_tbl.tab_header( + title=html(title_text), subtitle=html(combined_subtitle) + ) if incl_footer: # Add table time as HTML source note if enabled @@ -16530,7 +16902,16 @@ def get_tabular_report( gt_tbl = gt_tbl.tab_style( style=style.fill(color="#F2F2F2"), locations=loc.body( - columns=["tbl", "eval", "test_units", "pass", "fail", "w_upd", "e_upd", "c_upd"] + columns=[ + "tbl", + "eval", + "test_units", + "pass", + "fail", + "w_upd", + "e_upd", + "c_upd", + ] ), ) @@ -16550,12 +16931,15 @@ def get_tabular_report( if eval_error: gt_tbl = gt_tbl.tab_style( style=style.fill(color="#FFC1C159"), - locations=loc.body(rows=[i for i, error in enumerate(eval_error) if error]), + locations=loc.body( + rows=[i for i, error in enumerate(eval_error) if error] + ), ) gt_tbl = gt_tbl.tab_style( style=style.text(color="#B22222"), locations=loc.body( - columns="columns_upd", rows=[i for i, error in enumerate(eval_error) if error] + columns="columns_upd", + rows=[i for i, error in enumerate(eval_error) if error], ), ) @@ -16743,7 +17127,9 @@ def get_step_report( raise ValueError("The limit must be an integer value greater than 0.") # Convert the `validation_info` object to a dictionary - validation_info_dict = _validation_info_as_dict(validation_info=self.validation_info) + validation_info_dict = _validation_info_as_dict( + validation_info=self.validation_info + ) # Obtain the language and locale lang = self.lang @@ -16751,7 +17137,9 @@ def get_step_report( # Filter the dictionary to include only the information for the selected step validation_step = { - key: value[i - 1] for key, value in validation_info_dict.items() if key != "i" + key: value[i - 1] + for key, value in validation_info_dict.items() + if key != "i" } # From `validation_step` pull out key values for the report @@ -16773,7 +17161,9 @@ def get_step_report( column_list = list(self.data.columns) column_position = column_list.index(column) + 1 elif isinstance(column, list): - column_position = [list(self.data.columns).index(col) + 1 for col in column] + column_position = [ + list(self.data.columns).index(col) + 1 for col in column + ] else: column_position = None # pragma: no cover else: @@ -16923,7 +17313,6 @@ def get_dataframe( ``` """ - # Raise an error if tbl_type is not one of the supported types if tbl_type not in ["polars", "pandas", "duckdb"]: raise ValueError( @@ -16992,11 +17381,17 @@ def get_dataframe( "critical": "critical", } - report = {key: report_original[key] for key in names_dict.keys() if key in report_original} + report = { + key: report_original[key] + for key in names_dict.keys() + if key in report_original + } - df_validation_results = pl.DataFrame(data=report, schema=schema, strict=False).rename( - names_dict - ) + df_validation_results = pl.DataFrame( + data=report, schema=schema, strict=False + ).rename(names_dict) + + return df_validation_results def _add_validation(self, validation_info): """ @@ -17009,7 +17404,9 @@ def _add_validation(self, validation_info): """ # Get the largest value of `i_o` in the `validation_info` - max_i_o = max([validation.i_o for validation in self.validation_info], default=0) + max_i_o = max( + [validation.i_o for validation in self.validation_info], default=0 + ) # Set the `i_o` attribute to the largest value of `i_o` plus 1 validation_info.i_o = max_i_o + 1 @@ -17060,7 +17457,9 @@ def _evaluate_column_exprs(self, validation_info): # Evaluate the column expression if isinstance(column_expr, ColumnSelectorNarwhals): - columns_resolved = ColumnSelectorNarwhals(column_expr).resolve(table=table) + columns_resolved = ColumnSelectorNarwhals(column_expr).resolve( + table=table + ) else: columns_resolved = column_expr.resolve(columns=columns, table=table) @@ -17154,7 +17553,9 @@ def _evaluate_segments(self, validation_info): # If the `segments` expression is a string, that string is taken as a column name # for which segmentation should occur across unique values in the column if isinstance(segments_expr, str): - seg_tuples = _seg_expr_from_string(data_tbl=table, segments_expr=segments_expr) + seg_tuples = _seg_expr_from_string( + data_tbl=table, segments_expr=segments_expr + ) # If the 'segments' expression is a tuple, then normalize it to a list of tuples # - ("col", "value") -> [("col", "value")] @@ -17201,7 +17602,9 @@ def _evaluate_segments(self, validation_info): return self - def _get_validation_dict(self, i: int | list[int] | None, attr: str) -> dict[int, int]: + def _get_validation_dict( + self, i: int | list[int] | None, attr: str + ) -> dict[int, int]: """ Utility function to get a dictionary of validation attributes for each validation step. @@ -17222,7 +17625,10 @@ def _get_validation_dict(self, i: int | list[int] | None, attr: str) -> dict[int i = [i] if i is None: - return {validation.i: getattr(validation, attr) for validation in self.validation_info} + return { + validation.i: getattr(validation, attr) + for validation in self.validation_info + } return { validation.i: getattr(validation, attr) @@ -17252,13 +17658,23 @@ def _execute_final_actions(self): # Create a summary of validation results as a dictionary summary = { "n_steps": len(self.validation_info), - "n_passing_steps": sum(1 for step in self.validation_info if step.all_passed), - "n_failing_steps": sum(1 for step in self.validation_info if not step.all_passed), + "n_passing_steps": sum( + 1 for step in self.validation_info if step.all_passed + ), + "n_failing_steps": sum( + 1 for step in self.validation_info if not step.all_passed + ), "n_warning_steps": sum(1 for step in self.validation_info if step.warning), "n_error_steps": sum(1 for step in self.validation_info if step.error), - "n_critical_steps": sum(1 for step in self.validation_info if step.critical), - "list_passing_steps": [step.i for step in self.validation_info if step.all_passed], - "list_failing_steps": [step.i for step in self.validation_info if not step.all_passed], + "n_critical_steps": sum( + 1 for step in self.validation_info if step.critical + ), + "list_passing_steps": [ + step.i for step in self.validation_info if step.all_passed + ], + "list_failing_steps": [ + step.i for step in self.validation_info if not step.all_passed + ], "dict_n": {step.i: step.n for step in self.validation_info}, "dict_n_passed": {step.i: step.n_passed for step in self.validation_info}, "dict_n_failed": {step.i: step.n_failed for step in self.validation_info}, @@ -17572,7 +17988,9 @@ def _process_brief( segment_value_str = ", ".join(str(v) for v in segments[0]) else: # Multiple segments: join each segment with commas, separate segments with " | " - segment_value_str = " | ".join([", ".join(str(v) for v in seg) for seg in segments]) + segment_value_str = " | ".join( + [", ".join(str(v) for v in seg) for seg in segments] + ) else: # For regular lists or other types, convert to string if isinstance(segment_value, list): @@ -17802,9 +18220,11 @@ def _create_autobrief_or_failure_text( if assertion_type == "prompt": return _create_text_prompt( lang=lang, - prompt=values["prompt"] - if isinstance(values, dict) and "prompt" in values - else str(values), + prompt=( + values["prompt"] + if isinstance(values, dict) and "prompt" in values + else str(values) + ), for_failure=for_failure, ) @@ -17874,7 +18294,11 @@ def _create_text_between( def _create_text_set( - lang: str, column: str, values: list[Any], not_: bool = False, for_failure: bool = False + lang: str, + column: str, + values: list[Any], + not_: bool = False, + for_failure: bool = False, ) -> str: type_ = _expect_failure_type(for_failure=for_failure) @@ -17896,7 +18320,9 @@ def _create_text_set( return text -def _create_text_null(lang: str, column: str, not_: bool = False, for_failure: bool = False) -> str: +def _create_text_null( + lang: str, column: str, not_: bool = False, for_failure: bool = False +) -> str: type_ = _expect_failure_type(for_failure=for_failure) column_text = _prep_column_text(column=column) @@ -17913,7 +18339,9 @@ def _create_text_null(lang: str, column: str, not_: bool = False, for_failure: b return text -def _create_text_regex(lang: str, column: str, pattern: str, for_failure: bool = False) -> str: +def _create_text_regex( + lang: str, column: str, pattern: str, for_failure: bool = False +) -> str: type_ = _expect_failure_type(for_failure=for_failure) column_text = _prep_column_text(column=column) @@ -17950,7 +18378,9 @@ def _create_text_col_exists(lang: str, column: str, for_failure: bool = False) - column_text = _prep_column_text(column=column) - return EXPECT_FAIL_TEXT[f"col_exists_{type_}_text"][lang].format(column_text=column_text) + return EXPECT_FAIL_TEXT[f"col_exists_{type_}_text"][lang].format( + column_text=column_text + ) def _create_text_col_schema_match(lang: str, for_failure: bool) -> str: @@ -17995,20 +18425,28 @@ def _create_text_rows_complete( return text -def _create_text_row_count_match(lang: str, value: dict, for_failure: bool = False) -> str: +def _create_text_row_count_match( + lang: str, value: dict, for_failure: bool = False +) -> str: type_ = _expect_failure_type(for_failure=for_failure) values_text = _prep_values_text(value["count"], lang=lang) - return EXPECT_FAIL_TEXT[f"row_count_match_n_{type_}_text"][lang].format(values_text=values_text) + return EXPECT_FAIL_TEXT[f"row_count_match_n_{type_}_text"][lang].format( + values_text=values_text + ) -def _create_text_col_count_match(lang: str, value: dict, for_failure: bool = False) -> str: +def _create_text_col_count_match( + lang: str, value: dict, for_failure: bool = False +) -> str: type_ = _expect_failure_type(for_failure=for_failure) values_text = _prep_values_text(value["count"], lang=lang) - return EXPECT_FAIL_TEXT[f"col_count_match_n_{type_}_text"][lang].format(values_text=values_text) + return EXPECT_FAIL_TEXT[f"col_count_match_n_{type_}_text"][lang].format( + values_text=values_text + ) def _create_text_col_pct_null( @@ -18087,7 +18525,9 @@ def _create_text_col_pct_null( # Format numbers with locale-aware formatting p_formatted = _format_number_safe(p_value, decimals=1, locale=fmt_locale) - p_original_formatted = _format_number_safe(p_value_original, decimals=2, locale=fmt_locale) + p_original_formatted = _format_number_safe( + p_value_original, decimals=2, locale=fmt_locale + ) # Choose the appropriate translation key based on tolerance if not has_tolerance: @@ -18099,10 +18539,16 @@ def _create_text_col_pct_null( elif is_asymmetric or isinstance(tol_value, tuple): # Use deviation format for tuple tolerances (including symmetric ones) # Format the deviation values with signs (using proper minus sign U+2212) - lower_dev = f"−{_format_number_safe(lower_pct_delta, decimals=1, locale=fmt_locale)}%" - upper_dev = f"+{_format_number_safe(upper_pct_delta, decimals=1, locale=fmt_locale)}%" + lower_dev = ( + f"−{_format_number_safe(lower_pct_delta, decimals=1, locale=fmt_locale)}%" + ) + upper_dev = ( + f"+{_format_number_safe(upper_pct_delta, decimals=1, locale=fmt_locale)}%" + ) - text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text_tol_deviation"][lang].format( + text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text_tol_deviation"][ + lang + ].format( column_text=column_text, lower_dev=lower_dev, upper_dev=upper_dev, @@ -18110,7 +18556,9 @@ def _create_text_col_pct_null( ) else: # Single value tolerance - use the symmetric ± format - tol_formatted = _format_number_safe(lower_pct_delta, decimals=1, locale=fmt_locale) + tol_formatted = _format_number_safe( + lower_pct_delta, decimals=1, locale=fmt_locale + ) text = EXPECT_FAIL_TEXT[f"col_pct_null_{type_}_text_tol"][lang].format( column_text=column_text, p=p_formatted, @@ -18292,7 +18740,9 @@ def _seg_expr_from_tuple(segments_expr: tuple) -> list[tuple[str, Any]]: seg_tuples = [(column, segment)] # If the first element is not a string, raise an error else: # pragma: no cover - raise ValueError("The first element of the segments expression must be a string.") + raise ValueError( + "The first element of the segments expression must be a string." + ) return seg_tuples @@ -18402,16 +18852,22 @@ def _apply_segments(data_tbl: Any, segments_expr: tuple[str, str]) -> Any: content = segment_str[2:-2] # pragma: no cover # Try parsing as date first - if len(content) == 10 and content.count("-") == 2: # pragma: no cover + if ( + len(content) == 10 and content.count("-") == 2 + ): # pragma: no cover try: # pragma: no cover - parsed_value = date.fromisoformat(content) # pragma: no cover + parsed_value = date.fromisoformat( + content + ) # pragma: no cover except ValueError: # pragma: no cover pass # pragma: no cover # Try parsing as datetime if parsed_value is None: # pragma: no cover try: # pragma: no cover - parsed_dt = datetime.fromisoformat(content.replace(" UTC", "")) + parsed_dt = datetime.fromisoformat( + content.replace(" UTC", "") + ) if parsed_dt.time() == datetime.min.time(): parsed_value = parsed_dt.date() else: @@ -18460,7 +18916,9 @@ def _apply_segments(data_tbl: Any, segments_expr: tuple[str, str]) -> Any: if segment is None: data_tbl = data_tbl.filter(data_tbl[column].isnull()) # pragma: no cover elif isinstance(segment, list): - data_tbl = data_tbl.filter(data_tbl[column].isin(segment)) # pragma: no cover + data_tbl = data_tbl.filter( + data_tbl[column].isin(segment) + ) # pragma: no cover else: data_tbl = data_tbl.filter(data_tbl[column] == segment) @@ -18552,7 +19010,9 @@ def _get_title_text( if interrogation_performed: return title - no_interrogation_text = VALIDATION_REPORT_TEXT["no_interrogation_performed_text"][lang] + no_interrogation_text = VALIDATION_REPORT_TEXT["no_interrogation_performed_text"][ + lang + ] # If no interrogation was performed, return title text indicating that if lang not in RTL_LANGUAGES: @@ -18586,7 +19046,9 @@ def _get_title_text( def _process_title_text(title: str | None, tbl_name: str | None, lang: str) -> str: - default_title_text = VALIDATION_REPORT_TEXT["pointblank_validation_title_text"][lang] + default_title_text = VALIDATION_REPORT_TEXT["pointblank_validation_title_text"][ + lang + ] if title is None: title_text = "" @@ -18605,7 +19067,9 @@ def _process_title_text(title: str | None, tbl_name: str | None, lang: str) -> s return title_text -def _transform_tbl_preprocessed(pre: Any, seg: Any, interrogation_performed: bool) -> list[str]: +def _transform_tbl_preprocessed( + pre: Any, seg: Any, interrogation_performed: bool +) -> list[str]: # If no interrogation was performed, return a list of empty strings if not interrogation_performed: return ["" for _ in range(len(pre))] @@ -18631,7 +19095,10 @@ def _get_preprocessed_table_icon(icon: list[str]) -> list[str]: def _transform_eval( - n: list[int], interrogation_performed: bool, eval_error: list[bool], active: list[bool] + n: list[int], + interrogation_performed: bool, + eval_error: list[bool], + active: list[bool], ) -> list[str]: # If no interrogation was performed, return a list of empty strings if not interrogation_performed: @@ -18679,10 +19146,14 @@ def _format_single_number_with_gt( df = df_lib.DataFrame({"value": [value]}) # Create GT object and format the column - gt_obj = GT(df).fmt_number(columns="value", n_sigfig=n_sigfig, compact=compact, locale=locale) + gt_obj = GT(df).fmt_number( + columns="value", n_sigfig=n_sigfig, compact=compact, locale=locale + ) # Extract the formatted value using _get_column_of_values - formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html") + formatted_values = _get_column_of_values( + gt_obj, column_name="value", context="html" + ) return formatted_values[0] # Return the single formatted value @@ -18711,7 +19182,11 @@ def _format_number_safe(value: int) -> str: return [ ( - (str(test_units[i]) if test_units[i] < 10000 else _format_number_safe(test_units[i])) + ( + str(test_units[i]) + if test_units[i] < 10000 + else _format_number_safe(test_units[i]) + ) if active[i] else "—" ) @@ -18755,7 +19230,9 @@ def _format_single_float_with_gt( gt_obj = GT(df).fmt_number(columns="value", decimals=decimals, locale=locale) # Extract the formatted value using _get_column_of_values - formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html") + formatted_values = _get_column_of_values( + gt_obj, column_name="value", context="html" + ) return formatted_values[0] # Return the single formatted value @@ -18775,10 +19252,14 @@ def _transform_passed_failed( def _format_float_safe(value: float) -> str: if df_lib is not None: # Use GT-based formatting to avoid Pandas dependency completely - return _format_single_float_with_gt(value, decimals=2, locale=locale, df_lib=df_lib) + return _format_single_float_with_gt( + value, decimals=2, locale=locale, df_lib=df_lib + ) else: # Fallback to the original behavior - return vals.fmt_number(value, decimals=2, locale=locale)[0] # pragma: no cover + return vals.fmt_number(value, decimals=2, locale=locale)[ + 0 + ] # pragma: no cover passed_failed = [ ( @@ -18805,9 +19286,11 @@ def _transform_w_e_c(values, color, interrogation_performed): else ( f'' if value is True - else f'' - if value is False - else value + else ( + f'' + if value is False + else value + ) ) ) for value in values @@ -18839,9 +19322,11 @@ def _transform_assertion_str( # If the template text `{auto}` is in the `brief_str` then replace it with # the corresponding `autobrief_str` entry brief_str = [ - brief_str[i].replace("{auto}", autobrief_str[i]) - if "{auto}" in brief_str[i] - else brief_str[i] + ( + brief_str[i].replace("{auto}", autobrief_str[i]) + if "{auto}" in brief_str[i] + else brief_str[i] + ) for i in range(len(brief_str)) ] @@ -18852,7 +19337,11 @@ def _transform_assertion_str( # In some sandboxed HTML environments (e.g., Streamlit),

tags don't inherit # font-size from parent divs, so we add inline styles directly to the

tags brief_str = [ - re.sub(r"

", r'

', x) if x.strip() else x + ( + re.sub(r"

", r'

', x) + if x.strip() + else x + ) for x in brief_str ] @@ -18868,9 +19357,11 @@ def _transform_assertion_str( # Define the brief's HTML div tag for each row brief_divs = [ - f"

{brief}
" - if brief.strip() - else "" + ( + f"
{brief}
" + if brief.strip() + else "" + ) for brief in brief_str ] @@ -18885,7 +19376,9 @@ def _transform_assertion_str( {brief_div} """ - for assertion, svg, size, brief_div in zip(assertion_str, svg_icon, text_size, brief_divs) + for assertion, svg, size, brief_div in zip( + assertion_str, svg_icon, text_size, brief_divs + ) ] # If the `segments` list is not empty, prepend a segmentation div to the `type_upd` strings @@ -19106,7 +19599,9 @@ def _format_single_integer_with_gt(value: int, locale: str = "en", df_lib=None) gt_obj = GT(df).fmt_integer(columns="value", locale=locale) # Extract the formatted value using _get_column_of_values - formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html") + formatted_values = _get_column_of_values( + gt_obj, column_name="value", context="html" + ) return formatted_values[0] # Return the single formatted value @@ -19139,17 +19634,26 @@ def _format_single_float_with_gt_custom( # Create GT object and format the column gt_obj = GT(df).fmt_number( - columns="value", decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale + columns="value", + decimals=decimals, + drop_trailing_zeros=drop_trailing_zeros, + locale=locale, ) # Extract the formatted value using _get_column_of_values - formatted_values = _get_column_of_values(gt_obj, column_name="value", context="html") + formatted_values = _get_column_of_values( + gt_obj, column_name="value", context="html" + ) return formatted_values[0] # Return the single formatted value def _format_number_safe( - value: float, decimals: int, drop_trailing_zeros: bool = False, locale: str = "en", df_lib=None + value: float, + decimals: int, + drop_trailing_zeros: bool = False, + locale: str = "en", + df_lib=None, ) -> str: """ Safely format a float value with locale support. @@ -19235,7 +19739,9 @@ def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) -> ) if thresholds.critical_fraction is not None else ( - _format_integer_safe(thresholds.critical_count, locale=locale, df_lib=df_lib) + _format_integer_safe( + thresholds.critical_count, locale=locale, df_lib=df_lib + ) if thresholds.critical_count is not None else "—" ) @@ -19281,7 +19787,9 @@ def _create_thresholds_html(thresholds: Thresholds, locale: str, df_lib=None) -> ) -def _create_local_threshold_note_html(thresholds: Thresholds, locale: str = "en") -> str: +def _create_local_threshold_note_html( + thresholds: Thresholds, locale: str = "en" +) -> str: """ Create a miniature HTML representation of local thresholds for display in notes. @@ -19323,12 +19831,18 @@ def _format_threshold_value(fraction: float | None, count: int | None) -> str: return "0" elif fraction < 0.01: # For very small fractions, show "<0.01" with locale formatting - formatted = _format_number_safe(0.01, decimals=2, locale=locale, df_lib=df_lib) + formatted = _format_number_safe( + 0.01, decimals=2, locale=locale, df_lib=df_lib + ) return f"<{formatted}" else: # Use shared formatting function with drop_trailing_zeros formatted = _format_number_safe( - fraction, decimals=2, drop_trailing_zeros=True, locale=locale, df_lib=df_lib + fraction, + decimals=2, + drop_trailing_zeros=True, + locale=locale, + df_lib=df_lib, ) return formatted elif count is not None: @@ -19337,9 +19851,13 @@ def _format_threshold_value(fraction: float | None, count: int | None) -> str: else: return "—" - warning = _format_threshold_value(thresholds.warning_fraction, thresholds.warning_count) + warning = _format_threshold_value( + thresholds.warning_fraction, thresholds.warning_count + ) error = _format_threshold_value(thresholds.error_fraction, thresholds.error_count) - critical = _format_threshold_value(thresholds.critical_fraction, thresholds.critical_count) + critical = _format_threshold_value( + thresholds.critical_fraction, thresholds.critical_count + ) warning_color = SEVERITY_LEVEL_COLORS["warning"] error_color = SEVERITY_LEVEL_COLORS["error"] @@ -19367,10 +19885,14 @@ def _format_threshold_value(fraction: float | None, count: int | None) -> str: ) # Join with "|" separator (only between multiple thresholds) - thresholds_html = f'{"|".join(threshold_parts)}' + thresholds_html = ( + f'{"|".join(threshold_parts)}' + ) # Get localized text and format with threshold HTML - localized_text = NOTES_TEXT["local_threshold"].get(locale, NOTES_TEXT["local_threshold"]["en"]) + localized_text = NOTES_TEXT["local_threshold"].get( + locale, NOTES_TEXT["local_threshold"]["en"] + ) note_html = localized_text.replace("{thresholds}", thresholds_html) return note_html @@ -19413,15 +19935,21 @@ def _format_threshold_value(fraction: float | None, count: int | None) -> str: parts = [] if thresholds.warning is not None: - warning = _format_threshold_value(thresholds.warning_fraction, thresholds.warning_count) + warning = _format_threshold_value( + thresholds.warning_fraction, thresholds.warning_count + ) parts.append(f"W: {warning}") if thresholds.error is not None: - error = _format_threshold_value(thresholds.error_fraction, thresholds.error_count) + error = _format_threshold_value( + thresholds.error_fraction, thresholds.error_count + ) parts.append(f"E: {error}") if thresholds.critical is not None: - critical = _format_threshold_value(thresholds.critical_fraction, thresholds.critical_count) + critical = _format_threshold_value( + thresholds.critical_fraction, thresholds.critical_count + ) parts.append(f"C: {critical}") if parts: @@ -19484,7 +20012,8 @@ def _create_no_columns_resolved_note_html( """ # Get translated strings intro = NOTES_TEXT.get("column_not_found_intro", {}).get( - locale, NOTES_TEXT.get("column_not_found_intro", {}).get("en", "The column expression") + locale, + NOTES_TEXT.get("column_not_found_intro", {}).get("en", "The column expression"), ) no_resolve = NOTES_TEXT.get("column_not_found_no_resolve", {}).get( locale, @@ -19494,7 +20023,9 @@ def _create_no_columns_resolved_note_html( ) # Format the column expression with monospace font - col_expr_html = f"{column_expr}" + col_expr_html = ( + f"{column_expr}" + ) # Build the HTML note html = f"{intro} {col_expr_html} {no_resolve}." @@ -19502,7 +20033,9 @@ def _create_no_columns_resolved_note_html( return html -def _create_no_columns_resolved_note_text(column_expr: str, available_columns: list[str]) -> str: +def _create_no_columns_resolved_note_text( + column_expr: str, available_columns: list[str] +) -> str: """ Create a plain text note explaining that a column expression resolved to no columns. @@ -19543,7 +20076,10 @@ def _create_column_not_found_note_html( """ # Get translated strings intro = NOTES_TEXT.get("target_column_provided", {}).get( - locale, NOTES_TEXT.get("target_column_provided", {}).get("en", "The target column provided") + locale, + NOTES_TEXT.get("target_column_provided", {}).get( + "en", "The target column provided" + ), ) not_found = NOTES_TEXT.get("does_not_match_any_columns", {}).get( locale, @@ -19553,7 +20089,9 @@ def _create_column_not_found_note_html( ) # Format the column name with monospace font - col_name_html = f"{column_name}" + col_name_html = ( + f"{column_name}" + ) # Build the HTML note html = f"{intro} ({col_name_html}) {not_found}." @@ -19561,7 +20099,9 @@ def _create_column_not_found_note_html( return html -def _create_column_not_found_note_text(column_name: str, available_columns: list[str]) -> str: +def _create_column_not_found_note_text( + column_name: str, available_columns: list[str] +) -> str: """ Create a plain text note explaining that a specific column was not found. @@ -19581,7 +20121,10 @@ def _create_column_not_found_note_text(column_name: str, available_columns: list def _create_comparison_column_not_found_note_html( - column_name: str, position: str | None, available_columns: list[str], locale: str = "en" + column_name: str, + position: str | None, + available_columns: list[str], + locale: str = "en", ) -> str: """ Create an HTML note explaining that a comparison column was not found. @@ -19611,7 +20154,9 @@ def _create_comparison_column_not_found_note_html( ) intro_with_for = NOTES_TEXT.get("comparison_column_for", {}).get( locale, - NOTES_TEXT.get("comparison_column_for", {}).get("en", "The comparison column provided for"), + NOTES_TEXT.get("comparison_column_for", {}).get( + "en", "The comparison column provided for" + ), ) not_found = NOTES_TEXT.get("does_not_match_any_columns", {}).get( locale, @@ -19621,14 +20166,14 @@ def _create_comparison_column_not_found_note_html( ) # Format the column name with monospace font - col_name_html = f"{column_name}" + col_name_html = ( + f"{column_name}" + ) # Add position if provided (for between/outside validations) if position: # Format position parameter with monospace font (e.g., "left=", "right=") - position_param = ( - f"{position}=" - ) + position_param = f"{position}=" # Use the "for" version of the intro text html = f"{intro_with_for} {position_param} ({col_name_html}) {not_found}." else: @@ -19699,7 +20244,8 @@ def _create_preprocessing_note_html( """ # Get translated strings precondition_text = NOTES_TEXT.get("precondition_applied", {}).get( - locale, NOTES_TEXT.get("precondition_applied", {}).get("en", "Precondition applied") + locale, + NOTES_TEXT.get("precondition_applied", {}).get("en", "Precondition applied"), ) table_dims_text = NOTES_TEXT.get("table_dimensions", {}).get( locale, NOTES_TEXT.get("table_dimensions", {}).get("en", "table dimensions") @@ -19708,8 +20254,12 @@ def _create_preprocessing_note_html( # Helper function to get singular or plural form def get_row_text(count: int) -> str: if count == 1: - return NOTES_TEXT.get("row", {}).get(locale, NOTES_TEXT.get("row", {}).get("en", "row")) - return NOTES_TEXT.get("rows", {}).get(locale, NOTES_TEXT.get("rows", {}).get("en", "rows")) + return NOTES_TEXT.get("row", {}).get( + locale, NOTES_TEXT.get("row", {}).get("en", "row") + ) + return NOTES_TEXT.get("rows", {}).get( + locale, NOTES_TEXT.get("rows", {}).get("en", "rows") + ) def get_col_text(count: int) -> str: if count == 1: @@ -19747,7 +20297,9 @@ def get_col_text(count: int) -> str: else: cols_display = f"{processed_cols} {processed_cols_text}" - processed_dim = f'[{rows_display}, {cols_display}]' + processed_dim = ( + f'[{rows_display}, {cols_display}]' + ) # Build the HTML note html = f"{precondition_text}: {table_dims_text} {original_dim} → {processed_dim}." @@ -19830,7 +20382,9 @@ def _create_preprocessing_no_change_note_text() -> str: return "Precondition applied: no table dimension change." -def _create_synthetic_target_column_note_html(column_name: str, locale: str = "en") -> str: +def _create_synthetic_target_column_note_html( + column_name: str, locale: str = "en" +) -> str: """ Create an HTML note indicating that the target column was created via preprocessing. @@ -19848,15 +20402,22 @@ def _create_synthetic_target_column_note_html(column_name: str, locale: str = "e """ # Get translated strings synthetic_text = NOTES_TEXT.get("synthetic_target_column", {}).get( - locale, NOTES_TEXT.get("synthetic_target_column", {}).get("en", "Synthetic target column") + locale, + NOTES_TEXT.get("synthetic_target_column", {}).get( + "en", "Synthetic target column" + ), ) created_via_text = NOTES_TEXT.get("created_via_preprocessing", {}).get( locale, - NOTES_TEXT.get("created_via_preprocessing", {}).get("en", "created via preprocessing"), + NOTES_TEXT.get("created_via_preprocessing", {}).get( + "en", "created via preprocessing" + ), ) # Format the column name with monospace font - col_name_html = f"{column_name}" + col_name_html = ( + f"{column_name}" + ) # Build the HTML note html = f"{synthetic_text} {col_name_html} {created_via_text}." @@ -19918,8 +20479,11 @@ def _create_col_schema_match_note_html(schema_info: dict, locale: str = "en") -> disclosure_text = VALIDATION_REPORT_TEXT["note_schema_comparison_disclosure"].get( locale, VALIDATION_REPORT_TEXT["note_schema_comparison_disclosure"]["en"] ) - settings_title_text = VALIDATION_REPORT_TEXT["note_schema_comparison_match_settings_title"].get( - locale, VALIDATION_REPORT_TEXT["note_schema_comparison_match_settings_title"]["en"] + settings_title_text = VALIDATION_REPORT_TEXT[ + "note_schema_comparison_match_settings_title" + ].get( + locale, + VALIDATION_REPORT_TEXT["note_schema_comparison_match_settings_title"]["en"], ) # Build summary message @@ -19933,17 +20497,24 @@ def _create_col_schema_match_note_html(schema_info: dict, locale: str = "en") -> n_expect = len(expect_schema) n_target = len(target_schema) if n_expect != n_target: - count_mismatch_text = VALIDATION_REPORT_TEXT["note_schema_column_count_mismatch"].get( - locale, VALIDATION_REPORT_TEXT["note_schema_column_count_mismatch"]["en"] + count_mismatch_text = VALIDATION_REPORT_TEXT[ + "note_schema_column_count_mismatch" + ].get( + locale, + VALIDATION_REPORT_TEXT["note_schema_column_count_mismatch"]["en"], + ) + failures.append( + count_mismatch_text.format(n_expect=n_expect, n_target=n_target) ) - failures.append(count_mismatch_text.format(n_expect=n_expect, n_target=n_target)) # Check for unmatched columns - unmatched_cols = [col for col, info in columns_dict.items() if not info["colname_matched"]] + unmatched_cols = [ + col for col, info in columns_dict.items() if not info["colname_matched"] + ] if unmatched_cols: - unmatched_text = VALIDATION_REPORT_TEXT["note_schema_unmatched_columns"].get( - locale, VALIDATION_REPORT_TEXT["note_schema_unmatched_columns"]["en"] - ) + unmatched_text = VALIDATION_REPORT_TEXT[ + "note_schema_unmatched_columns" + ].get(locale, VALIDATION_REPORT_TEXT["note_schema_unmatched_columns"]["en"]) failures.append(unmatched_text.format(n=len(unmatched_cols))) # Check for wrong order (if in_order=True) @@ -19954,26 +20525,30 @@ def _create_col_schema_match_note_html(schema_info: dict, locale: str = "en") -> if info["colname_matched"] and not info["index_matched"] ] if wrong_order: - wrong_order_text = VALIDATION_REPORT_TEXT["note_schema_wrong_order"].get( - locale, VALIDATION_REPORT_TEXT["note_schema_wrong_order"]["en"] - ) + wrong_order_text = VALIDATION_REPORT_TEXT[ + "note_schema_wrong_order" + ].get(locale, VALIDATION_REPORT_TEXT["note_schema_wrong_order"]["en"]) failures.append(wrong_order_text.format(n=len(wrong_order))) # Check for dtype mismatches dtype_mismatches = [ col for col, info in columns_dict.items() - if info["colname_matched"] and info["dtype_present"] and not info["dtype_matched"] + if info["colname_matched"] + and info["dtype_present"] + and not info["dtype_matched"] ] if dtype_mismatches: - dtype_mismatch_text = VALIDATION_REPORT_TEXT["note_schema_dtype_mismatch"].get( - locale, VALIDATION_REPORT_TEXT["note_schema_dtype_mismatch"]["en"] - ) + dtype_mismatch_text = VALIDATION_REPORT_TEXT[ + "note_schema_dtype_mismatch" + ].get(locale, VALIDATION_REPORT_TEXT["note_schema_dtype_mismatch"]["en"]) failures.append(dtype_mismatch_text.format(n=len(dtype_mismatches))) if failures: summary = ( - f' {failed_text}: ' + ", ".join(failures) + "." + f' {failed_text}: ' + + ", ".join(failures) + + "." ) else: summary = f' {failed_text}.' @@ -19983,11 +20558,19 @@ def _create_col_schema_match_note_html(schema_info: dict, locale: str = "en") -> # depending on the in_order parameter if in_order: step_report_gt = _step_report_schema_in_order( - step=1, schema_info=schema_info, header=None, lang=locale, debug_return_df=False + step=1, + schema_info=schema_info, + header=None, + lang=locale, + debug_return_df=False, ) else: step_report_gt = _step_report_schema_any_order( - step=1, schema_info=schema_info, header=None, lang=locale, debug_return_df=False + step=1, + schema_info=schema_info, + header=None, + lang=locale, + debug_return_df=False, ) # Generate the settings HTML using the existing function @@ -20001,7 +20584,9 @@ def _create_col_schema_match_note_html(schema_info: dict, locale: str = "en") -> ) # Remove the inner div containing column_schema_match_str - settings_html = re.sub(r'
.*?
', "", settings_html, count=1) + settings_html = re.sub( + r'
.*?
', "", settings_html, count=1 + ) # Change padding-top from 7px to 2px settings_html = settings_html.replace("padding-top: 7px;", "padding-top: 2px;") @@ -20105,7 +20690,9 @@ def _step_report_row_based( assert inclusive is not None symbol_left = "<" if inclusive[0] else "≤" symbol_right = ">" if inclusive[1] else "≥" - text = f"{column} {symbol_left} {values[0]}, {column} {symbol_right} {values[1]}" + text = ( + f"{column} {symbol_left} {values[0]}, {column} {symbol_right} {values[1]}" + ) elif assertion_type == "col_vals_in_set": elements = ", ".join(map(str, values)) text = f"{column} ∈ {{{elements}}}" @@ -20114,7 +20701,9 @@ def _step_report_row_based( text = f"{column} ∉ {{{elements}}}" elif assertion_type == "col_vals_regex": pattern = values["pattern"] - text = STEP_REPORT_TEXT["column_matches_regex"][lang].format(column=column, values=pattern) + text = STEP_REPORT_TEXT["column_matches_regex"][lang].format( + column=column, values=pattern + ) elif assertion_type == "col_vals_null": text = STEP_REPORT_TEXT["column_is_null"][lang].format(column=column) elif assertion_type == "col_vals_not_null": @@ -20128,9 +20717,7 @@ def _step_report_row_based( text = STEP_REPORT_TEXT["rows_complete_subset"][lang] # Wrap assertion text in a tag - text = ( - f"{text}" - ) + text = f"{text}" if all_passed: # Style the target column in green and add borders but only if that column is present @@ -20153,7 +20740,10 @@ def _step_report_row_based( locations=loc.body(columns=column), ).tab_style( style=style.borders( - sides=["left", "right"], color="#1B4D3E80", style="solid", weight="2px" + sides=["left", "right"], + color="#1B4D3E80", + style="solid", + weight="2px", ), locations=loc.column_labels(columns=column), ) @@ -20164,7 +20754,11 @@ def _step_report_row_based( if header is None: return step_report - title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + " " + CHECK_MARK_SPAN + title = ( + STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + + " " + + CHECK_MARK_SPAN + ) assertion_header_text = STEP_REPORT_TEXT["assertion_header_text"][lang] # Use 'success_statement_no_column' for col_vals_expr() since it doesn't target @@ -20235,7 +20829,10 @@ def _step_report_row_based( style.text(color="#B22222"), style.fill(color="#FFC1C159"), style.borders( - sides=["left", "right"], color="black", style="solid", weight="2px" + sides=["left", "right"], + color="black", + style="solid", + weight="2px", ), ], locations=loc.body(columns=column), @@ -20259,9 +20856,9 @@ def _step_report_row_based( # Use failure_rate_summary_no_column for col_vals_expr since it doesn't target a specific column if assertion_type == "col_vals_expr": - failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary_no_column"][lang].format( - failure_rate=failure_rate_metrics - ) + failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary_no_column"][ + lang + ].format(failure_rate=failure_rate_metrics) else: failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary"][lang].format( failure_rate=failure_rate_metrics, @@ -20271,13 +20868,15 @@ def _step_report_row_based( if limit < extract_length: extract_length_resolved = limit extract_text = STEP_REPORT_TEXT["extract_text_first"][lang].format( - extract_length_resolved=extract_length_resolved, shown_failures=shown_failures + extract_length_resolved=extract_length_resolved, + shown_failures=shown_failures, ) else: extract_length_resolved = extract_length extract_text = STEP_REPORT_TEXT["extract_text_all"][lang].format( - extract_length_resolved=extract_length_resolved, shown_failures=shown_failures + extract_length_resolved=extract_length_resolved, + shown_failures=shown_failures, ) details = ( @@ -20340,7 +20939,9 @@ def _step_report_rows_distinct( text = STEP_REPORT_TEXT["rows_distinct_all"][lang].format(column=column) else: columns_list = ", ".join(column) - text = STEP_REPORT_TEXT["rows_distinct_subset"][lang].format(columns_subset=columns_list) + text = STEP_REPORT_TEXT["rows_distinct_subset"][lang].format( + columns_subset=columns_list + ) if all_passed: step_report = tbl_preview @@ -20348,7 +20949,11 @@ def _step_report_rows_distinct( if header is None: return step_report - title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + " " + CHECK_MARK_SPAN + title = ( + STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) + + " " + + CHECK_MARK_SPAN + ) success_stmt = STEP_REPORT_TEXT["success_statement_no_column"][lang].format( n=n, @@ -20400,22 +21005,24 @@ def _step_report_rows_distinct( title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=i) failure_rate_metrics = f"{n_failed} / {n}" - failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary_rows_distinct"][lang].format( + failure_rate_stmt = STEP_REPORT_TEXT["failure_rate_summary_rows_distinct"][ + lang + ].format( failure_rate=failure_rate_metrics, column_position=column_position, ) if limit < extract_length: # pragma: no cover extract_length_resolved = limit - extract_text = STEP_REPORT_TEXT["extract_text_first_rows_distinct"][lang].format( - extract_length_resolved=extract_length_resolved - ) + extract_text = STEP_REPORT_TEXT["extract_text_first_rows_distinct"][ + lang + ].format(extract_length_resolved=extract_length_resolved) else: extract_length_resolved = extract_length - extract_text = STEP_REPORT_TEXT["extract_text_all_rows_distinct"][lang].format( - extract_length_resolved=extract_length_resolved - ) + extract_text = STEP_REPORT_TEXT["extract_text_all_rows_distinct"][ + lang + ].format(extract_length_resolved=extract_length_resolved) details = ( f"
" @@ -20450,7 +21057,11 @@ def _step_report_rows_distinct( def _step_report_schema_in_order( - step: int, schema_info: dict, header: str | None, lang: str, debug_return_df: bool = False + step: int, + schema_info: dict, + header: str | None, + lang: str, + debug_return_df: bool = False, ) -> GT | Any: """ This is the case for schema validation where the schema is supposed to have the same column @@ -20666,7 +21277,9 @@ def _step_report_schema_in_order( } ) .tab_style( - style=style.text(color="black", font=google_font(name="IBM Plex Mono"), size="13px"), + style=style.text( + color="black", font=google_font(name="IBM Plex Mono"), size="13px" + ), locations=loc.body( columns=["col_name_target", "dtype_target", "col_name_exp", "dtype_exp"] ), @@ -20676,11 +21289,15 @@ def _step_report_schema_in_order( locations=loc.body(columns=["index_target", "index_exp"]), ) .tab_style( - style=style.borders(sides="left", color="#E5E5E5", style="double", weight="3px"), + style=style.borders( + sides="left", color="#E5E5E5", style="double", weight="3px" + ), locations=loc.body(columns="index_exp"), ) .tab_style( - style=style.css("white-space: nowrap; text-overflow: ellipsis; overflow: hidden;"), + style=style.css( + "white-space: nowrap; text-overflow: ellipsis; overflow: hidden;" + ), locations=loc.body( columns=["col_name_target", "dtype_target", "col_name_exp", "dtype_exp"] ), @@ -20756,9 +21373,12 @@ def _step_report_schema_in_order( # Add a border below the row that terminates the target table schema step_report = step_report.tab_style( - style=style.borders(sides="bottom", color="#6699CC80", style="solid", weight="1px"), + style=style.borders( + sides="bottom", color="#6699CC80", style="solid", weight="1px" + ), locations=loc.body( - rows=len(colnames_tgt) - 1 # ty: ignore (bug in GT, should allow an int) + rows=len(colnames_tgt) + - 1 # ty: ignore (bug in GT, should allow an int) ), ) @@ -20779,7 +21399,11 @@ def _step_report_schema_in_order( passing_symbol = CHECK_MARK_SPAN if all_passed else CROSS_MARK_SPAN # Generate the title for the step report - title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=step) + " " + passing_symbol + title = ( + STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=step) + + " " + + passing_symbol + ) # Generate the details for the step report details = _create_col_schema_match_params_html( @@ -20808,7 +21432,11 @@ def _step_report_schema_in_order( def _step_report_schema_any_order( - step: int, schema_info: dict, header: str | None, lang: str, debug_return_df: bool = False + step: int, + schema_info: dict, + header: str | None, + lang: str, + debug_return_df: bool = False, ) -> GT | pl.DataFrame: """ This is the case for schema validation where the schema is permitted to not have to be in the @@ -20997,7 +21625,9 @@ def _step_report_schema_any_order( dtype_exp.append("") # pragma: no cover elif len(exp_columns_dict[column_name_exp_i]["dtype_input"]) > 1: - dtype = exp_columns_dict[column_name_exp_i]["dtype_input"] # pragma: no cover + dtype = exp_columns_dict[column_name_exp_i][ + "dtype_input" + ] # pragma: no cover if ( exp_columns_dict[column_name_exp_i]["dtype_matched_pos"] is not None @@ -21034,9 +21664,13 @@ def _step_report_schema_any_order( if not exp_columns_dict[column_name_exp_i]["colname_matched"]: dtype_exp_correct.append("—") - elif not exp_columns_dict[column_name_exp_i]["dtype_present"]: # pragma: no cover + elif not exp_columns_dict[column_name_exp_i][ + "dtype_present" + ]: # pragma: no cover dtype_exp_correct.append("") # pragma: no cover - elif exp_columns_dict[column_name_exp_i]["dtype_matched"]: # pragma: no cover + elif exp_columns_dict[column_name_exp_i][ + "dtype_matched" + ]: # pragma: no cover dtype_exp_correct.append(CHECK_MARK_SPAN) # pragma: no cover else: # pragma: no cover dtype_exp_correct.append(CROSS_MARK_SPAN) # pragma: no cover @@ -21051,7 +21685,9 @@ def _step_report_schema_any_order( # Generate the range and convert to strings index_exp = [ str(i + len(columns_found) - 1) - for i in range(last_index_int, last_index_int + len(colnames_exp_unmatched)) + for i in range( + last_index_int, last_index_int + len(colnames_exp_unmatched) + ) ] else: @@ -21116,7 +21752,9 @@ def _step_report_schema_any_order( ) .cols_align(align="right", columns="index_exp") .tab_style( - style=style.text(color="black", font=google_font(name="IBM Plex Mono"), size="13px"), + style=style.text( + color="black", font=google_font(name="IBM Plex Mono"), size="13px" + ), locations=loc.body( columns=["col_name_target", "dtype_target", "col_name_exp", "dtype_exp"] ), @@ -21126,11 +21764,15 @@ def _step_report_schema_any_order( locations=loc.body(columns=["index_target", "index_exp"]), ) .tab_style( - style=style.borders(sides="left", color="#E5E5E5", style="double", weight="3px"), + style=style.borders( + sides="left", color="#E5E5E5", style="double", weight="3px" + ), locations=loc.body(columns="index_exp"), ) .tab_style( - style=style.css("white-space: nowrap; text-overflow: ellipsis; overflow: hidden;"), + style=style.css( + "white-space: nowrap; text-overflow: ellipsis; overflow: hidden;" + ), locations=loc.body( columns=["col_name_target", "dtype_target", "col_name_exp", "dtype_exp"] ), @@ -21205,7 +21847,11 @@ def _step_report_schema_any_order( passing_symbol = CHECK_MARK_SPAN if all_passed else CROSS_MARK_SPAN # Generate the title for the step report - title = STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=step) + " " + passing_symbol + title = ( + STEP_REPORT_TEXT["report_for_step_i"][lang].format(i=step) + + " " + + passing_symbol + ) # Generate the details for the step report details = _create_col_schema_match_params_html( @@ -21247,9 +21893,7 @@ def _create_label_text_html( margin_top: str = "2px", ) -> str: if strikethrough: - strikethrough_rules = ( - f" text-decoration: line-through; text-decoration-color: {strikethrough_color};" - ) + strikethrough_rules = f" text-decoration: line-through; text-decoration-color: {strikethrough_color};" else: strikethrough_rules = "" From a958dedf51199be8a8b8b498c5b8fe1c6708c24f Mon Sep 17 00:00:00 2001 From: Meghansaha Date: Tue, 24 Feb 2026 21:52:24 -0500 Subject: [PATCH 06/11] first draft of polars table complete. --- pointblank/validate.py | 97 ++++++++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 37 deletions(-) diff --git a/pointblank/validate.py b/pointblank/validate.py index 998de4b91..83d73a157 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -17843,29 +17843,16 @@ def get_step_report( return step_report - def get_dataframe( - self, - tbl_type: Literal["polars", "pandas", "duckdb"] = "polars", - keep_extracts: bool = False, - ): + def get_dataframe(self, tbl_type: Literal["polars", "pandas", "duckdb"] = "polars"): """ Validation results as a dataframe - The `get_dataframe()` method returns a dataframe that represents the validation - report. This dataframe provides a summary of the validation results, including the - validation steps, the number of test units, the number of failing test units, and the - fraction of failing test units. This can be particularly helpful for logging purposes - and enables write validation summaries to CSVs and other on-disk formats. + The `get_dataframe()` method returns a dataframe that represents the validation report. This dataframe provides a summary of the validation results, including the validation steps, the number of test units, the number of failing test units, and the fraction of failing test units. This can be particularly helpful for logging purposes and enables writing validation summaries to CSVs and other on-disk formats. Parameters ---------- tbl_type : - The output backend for the dataframe. The named options are `"polars"`, - `"pandas"`, and `"duckdb"`. Default is 'polars'. - - keep_extracts: - An option to keep any collected extract data for failing rows from validation steps. By - default, this is `False` (i.e., extract data is removed to save space). + The output backend for the dataframe. The named options are `"polars"`, `"pandas"`, and `"duckdb"`. Default is 'polars'. Supported DataFrame Types ------------------------- @@ -17883,7 +17870,7 @@ def get_dataframe( # Create a validation validation = ( - pb.Validate(data=pb.load_dataset("small_table", tbl_type = "duckdb"), label="My validation") + pb.Validate(data=pb.load_dataset("small_table", tbl_type = "polars"), label="My validation") .col_vals_gt(columns="d", value=100) .col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}") .interrogate() @@ -17907,8 +17894,8 @@ def get_dataframe( # Grab the summary data from validation info helper function report_original = _validation_info_as_dict(self.validation_info) - # Pop the extracts off unless specified to keep - if keep_extracts is False and "extract" in report_original: + # Pop the extracts off if present + if "extract" in report_original: report_original.pop("extract") # Remove keys to be dropped @@ -17931,29 +17918,39 @@ def get_dataframe( # Create the schema for the df schema = pl.Schema( { - "assertion_type": pl.String, # assertion_type - "column": pl.String, # column - "values": pl.Unknown, # values - "pre": pl.Unknown, # pre - "active": pl.String, # active - "n": pl.Int64, # n - "n_passed": pl.Int64, # n_passed - "f_passed": pl.Float64, # f_passed - "n_failed": pl.Int64, # n_failed - "f_failed": pl.Float64, # f_failed - "warning": pl.Boolean, # warning - "error": pl.Boolean, # error - "critical": pl.Boolean, # critical + "active": pl.Boolean, + "i": pl.Int64, + "assertion_type": pl.String, + "column": pl.String, + "values": pl.Object, + "pre": pl.Object, + "segments": pl.String, + "eval_error": pl.Boolean, + "n": pl.Int64, + "all_passed": pl.Boolean, + "n_passed": pl.Int64, + "f_passed": pl.Float64, + "n_failed": pl.Int64, + "f_failed": pl.Float64, + "warning": pl.Boolean, + "error": pl.Boolean, + "critical": pl.Boolean, + "brief": pl.String, + "autobrief": pl.String, # Default brief if none found } ) names_dict = { - "assertion_type": "step", + "active": "active", + "i": "step_number", + "assertion_type": "step_decription", "column": "columns", "values": "values", - "pre": "tbl", - "active": "eval", + "pre": "original_pre", + "segments": "original_segments", + "eval_error": "step_evaluated", "n": "units", + "all_passed": "all_units_passed", "n_passed": "pass_n", "f_passed": "pass_pct", "n_failed": "failed_n", @@ -17961,12 +17958,38 @@ def get_dataframe( "warning": "warning", "error": "error", "critical": "critical", + "brief": "input_brief", + "autobrief": "autobrief", } report = {key: report_original[key] for key in names_dict.keys() if key in report_original} - df_validation_results = pl.DataFrame(data=report, schema=schema, strict=False).rename( - names_dict + df_validation_results = ( + pl.DataFrame(data=report, schema=schema) + .rename(names_dict) + .with_columns( + brief=pl.coalesce("input_brief", "autobrief"), + preprocessed=pl.when(pl.col("original_pre").is_not_null()) + .then(pl.lit(True)) + .otherwise(pl.lit(False)), + segmented=pl.when(pl.col("original_segments").is_not_null()) + .then(pl.lit(True)) + .otherwise(pl.lit(False)), + # Extract pattern from values if it's a dict, otherwise keep as-is + values=pl.col("values").map_elements( + lambda x: x.get("pattern") if isinstance(x, dict) and "pattern" in x else x, + return_dtype=pl.Object + ) + ) + .with_columns( + pl.when(pl.col("active") == False) + .then(pl.lit("-")) + .otherwise(pl.col(col)) + .alias(col) + for col in ["step_evaluated", "units", "all_units_passed", "pass_n", "pass_pct", + "failed_n", "failed_pct", "warning", "error", "critical"] + ) + .drop(["input_brief", "autobrief", "original_pre", "original_segments"]) ) return df_validation_results From 44f41c7bc45b1df44892f0866666675ce12a9eab Mon Sep 17 00:00:00 2001 From: Meghansaha Date: Fri, 6 Mar 2026 21:42:43 -0500 Subject: [PATCH 07/11] pandas done --- pointblank/validate.py | 202 +++++++++++++++++++++++++++-------------- 1 file changed, 133 insertions(+), 69 deletions(-) diff --git a/pointblank/validate.py b/pointblank/validate.py index 83d73a157..22467725d 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -17898,48 +17898,7 @@ def get_dataframe(self, tbl_type: Literal["polars", "pandas", "duckdb"] = "polar if "extract" in report_original: report_original.pop("extract") - # Remove keys to be dropped - # MEGHAN pick up Here!!! - # I need to - # 1) assess which keys should be turned innto columns - DONE - # 2) Determine which keys are used for conditionals (active = data from step shows, inactive replace with "-") - # 3) Create a schema - DONE - # 4) return the df - - # Check for polars, raise if not installed - if tbl_type == "polars": - if not _is_lib_present(lib_name="polars"): - raise ImportError( - "The Polars library is not installed but is required when specifying " - '`tbl_type="polars".' - ) - import polars as pl - - # Create the schema for the df - schema = pl.Schema( - { - "active": pl.Boolean, - "i": pl.Int64, - "assertion_type": pl.String, - "column": pl.String, - "values": pl.Object, - "pre": pl.Object, - "segments": pl.String, - "eval_error": pl.Boolean, - "n": pl.Int64, - "all_passed": pl.Boolean, - "n_passed": pl.Int64, - "f_passed": pl.Float64, - "n_failed": pl.Int64, - "f_failed": pl.Float64, - "warning": pl.Boolean, - "error": pl.Boolean, - "critical": pl.Boolean, - "brief": pl.String, - "autobrief": pl.String, # Default brief if none found - } - ) - + # Set a dictionary for converting column names for df names_dict = { "active": "active", "i": "step_number", @@ -17962,37 +17921,142 @@ def get_dataframe(self, tbl_type: Literal["polars", "pandas", "duckdb"] = "polar "autobrief": "autobrief", } - report = {key: report_original[key] for key in names_dict.keys() if key in report_original} - - df_validation_results = ( - pl.DataFrame(data=report, schema=schema) - .rename(names_dict) - .with_columns( - brief=pl.coalesce("input_brief", "autobrief"), - preprocessed=pl.when(pl.col("original_pre").is_not_null()) - .then(pl.lit(True)) - .otherwise(pl.lit(False)), - segmented=pl.when(pl.col("original_segments").is_not_null()) - .then(pl.lit(True)) - .otherwise(pl.lit(False)), - # Extract pattern from values if it's a dict, otherwise keep as-is - values=pl.col("values").map_elements( - lambda x: x.get("pattern") if isinstance(x, dict) and "pattern" in x else x, - return_dtype=pl.Object + final_report = { + key: report_original[key] for key in names_dict.keys() if key in report_original + } + + # Check for polars, raise if not installed + if tbl_type == "polars": + if not _is_lib_present(lib_name="polars"): + raise ImportError( + "The Polars library is not installed but is required when specifying " + '`tbl_type="polars".' + ) + + import polars as pl + + # Create the schema for the df + pl_schema = pl.Schema( + { + "active": pl.Boolean, + "i": pl.Int64, + "assertion_type": pl.String, + "column": pl.String, + "values": pl.Object, + "pre": pl.Object, + "segments": pl.String, + "eval_error": pl.Boolean, + "n": pl.Int64, + "all_passed": pl.Boolean, + "n_passed": pl.Int64, + "f_passed": pl.Float64, + "n_failed": pl.Int64, + "f_failed": pl.Float64, + "warning": pl.Boolean, + "error": pl.Boolean, + "critical": pl.Boolean, + "brief": pl.String, + "autobrief": pl.String, # Default brief if none found + } + ) + + df_validation_results = ( + pl.DataFrame(data=final_report, schema=pl_schema) + .rename(names_dict) + .with_columns( + brief=pl.coalesce("input_brief", "autobrief"), + preprocessed=pl.when(pl.col("original_pre").is_not_null()) + .then(pl.lit(True)) + .otherwise(pl.lit(False)), + segmented=pl.when(pl.col("original_segments").is_not_null()) + .then(pl.lit(True)) + .otherwise(pl.lit(False)), + # Extract pattern from values if it's a dict, otherwise keep as-is + values=pl.col("values").map_elements( + lambda x: x.get("pattern") if isinstance(x, dict) and "pattern" in x else x, + return_dtype=pl.Object, + ), ) + .with_columns( + pl.when(pl.col("active") == False) + .then(pl.lit("-")) + .otherwise(pl.col(col)) + .alias(col) + for col in [ + "step_evaluated", + "units", + "all_units_passed", + "pass_n", + "pass_pct", + "failed_n", + "failed_pct", + "warning", + "error", + "critical", + ] + ) + .drop(["input_brief", "autobrief", "original_pre", "original_segments"]) ) - .with_columns( - pl.when(pl.col("active") == False) - .then(pl.lit("-")) - .otherwise(pl.col(col)) - .alias(col) - for col in ["step_evaluated", "units", "all_units_passed", "pass_n", "pass_pct", - "failed_n", "failed_pct", "warning", "error", "critical"] + + return df_validation_results + + if tbl_type == "pandas": + if not _is_lib_present(lib_name="pandas"): + raise ImportError( + "The Pandas library is not installed but is required when specifying " + '`tbl_type="pandas".' + ) + + import pandas as pd + + def transform_validation_results(df): + # Coalesce: use fillna for first occurrence + df = df.assign(brief=df["input_brief"].fillna(df["autobrief"])) + + # Boolean columns based on null checks + df = df.assign( + preprocessed=df["original_pre"].notna(), + segmented=df["original_segments"].notna(), + ) + + # Extract pattern from dict + df = df.assign( + values=df["values"].apply( + lambda x: x.get("pattern") if isinstance(x, dict) and "pattern" in x else x + ) + ) + + # Create conditional columns in a loop + conditional_cols = [ + "step_evaluated", + "units", + "all_units_passed", + "pass_n", + "pass_pct", + "failed_n", + "failed_pct", + "warning", + "error", + "critical", + ] + + for col in conditional_cols: + df[col] = df[col].where(df["active"] != False, "-") + + # Drop columns + df_validation_results = df.drop( + columns=["input_brief", "autobrief", "original_pre", "original_segments"] + ) + + return df + + df_validation_results = ( + pd.DataFrame(data=final_report) + .rename(columns=names_dict) + .pipe(transform_validation_results) ) - .drop(["input_brief", "autobrief", "original_pre", "original_segments"]) - ) - return df_validation_results + return df_validation_results def _add_validation(self, validation_info): """ From b3099a8c4f19b815db939ab81223fef26e567b28 Mon Sep 17 00:00:00 2001 From: Meghansaha Date: Wed, 11 Mar 2026 14:35:24 -0400 Subject: [PATCH 08/11] add duckdb/ibis workflow --- pointblank/validate.py | 76 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 2 deletions(-) diff --git a/pointblank/validate.py b/pointblank/validate.py index 22467725d..112456094 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -1,4 +1,3 @@ -# ruff: noqa from __future__ import annotations import base64 @@ -18044,7 +18043,7 @@ def transform_validation_results(df): df[col] = df[col].where(df["active"] != False, "-") # Drop columns - df_validation_results = df.drop( + df = df.drop( columns=["input_brief", "autobrief", "original_pre", "original_segments"] ) @@ -18058,6 +18057,79 @@ def transform_validation_results(df): return df_validation_results + if tbl_type == "duckdb": + if not _is_lib_present(lib_name="ibis"): + raise ImportError( + "The Ibis library is not installed but is required when specifying " + '`tbl_type="duckdb".' + ) + + import ibis + import ibis.expr.datatypes as dt + + ibis_schema = { + "active": dt.Boolean(), + "i": dt.Int64(), + "assertion_type": dt.String(), + "column": dt.String(), + "values": dt.json(), + "pre": dt.json(), + "segments": dt.String(), + "eval_error": dt.Boolean(), + "n": dt.Int64(), + "all_passed": dt.Boolean(), + "n_passed": dt.Int64(), + "f_passed": dt.Float64(), + "n_failed": dt.Int64(), + "f_failed": dt.Float64(), + "warning": dt.Boolean(), + "error": dt.Boolean(), + "critical": dt.Boolean(), + "brief": dt.String(), + "autobrief": dt.String(), + } + + # Pulling out clean regex pattern if needed + final_report["values"] = [ + values.get("pattern") + if isinstance(values, dict) and "pattern" in values + else values + for values in final_report["values"] + ] + + report_table = ibis.memtable(final_report, schema=ibis_schema).rename( + {values: keys for keys, values in names_dict.items()} + ) + + conditional_cols = [ + "step_evaluated", + "units", + "all_units_passed", + "pass_n", + "pass_pct", + "failed_n", + "failed_pct", + "warning", + "error", + "critical", + ] + + df_validation_results = report_table.mutate( + brief=ibis.coalesce(report_table.input_brief, report_table.autobrief), + preprocessed=report_table.original_pre.notnull(), + segmented=report_table.original_segments.notnull(), + **{ + col: ibis.ifelse( + report_table.active == False, + ibis.literal("-"), + report_table[col].cast("string"), + ) + for col in conditional_cols + }, + ).drop("input_brief", "autobrief", "original_pre", "original_segments") + + return df_validation_results + def _add_validation(self, validation_info): """ Add a validation to the list of validations. From c53e3ed704b981f3c2e53fa59882fc69064fdcdb Mon Sep 17 00:00:00 2001 From: Meghansaha Date: Wed, 11 Mar 2026 14:49:23 -0400 Subject: [PATCH 09/11] fix ruff errors --- pointblank/validate.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pointblank/validate.py b/pointblank/validate.py index 112456094..40052c406 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -17977,7 +17977,7 @@ def get_dataframe(self, tbl_type: Literal["polars", "pandas", "duckdb"] = "polar ), ) .with_columns( - pl.when(pl.col("active") == False) + pl.when(~pl.col("active")) .then(pl.lit("-")) .otherwise(pl.col(col)) .alias(col) @@ -18040,7 +18040,7 @@ def transform_validation_results(df): ] for col in conditional_cols: - df[col] = df[col].where(df["active"] != False, "-") + df[col] = df[col].where(df["active"], "-") # Drop columns df = df.drop( @@ -18120,7 +18120,7 @@ def transform_validation_results(df): segmented=report_table.original_segments.notnull(), **{ col: ibis.ifelse( - report_table.active == False, + ~report_table.active, ibis.literal("-"), report_table[col].cast("string"), ) From 9f47df9e9b5474b21363f6ec27b5e1a97c47aba1 Mon Sep 17 00:00:00 2001 From: Meghansaha Date: Wed, 11 Mar 2026 14:54:39 -0400 Subject: [PATCH 10/11] reformat w/ ruff --- pointblank/validate.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pointblank/validate.py b/pointblank/validate.py index 2b3f46b4d..a80d02b18 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -17999,10 +17999,7 @@ def get_dataframe(self, tbl_type: Literal["polars", "pandas", "duckdb"] = "polar ), ) .with_columns( - pl.when(~pl.col("active")) - .then(pl.lit("-")) - .otherwise(pl.col(col)) - .alias(col) + pl.when(~pl.col("active")).then(pl.lit("-")).otherwise(pl.col(col)).alias(col) for col in [ "step_evaluated", "units", From 3b5f38e65424616ffbb4c8272faa04f0c845aed7 Mon Sep 17 00:00:00 2001 From: Meghansaha Date: Wed, 11 Mar 2026 15:05:00 -0400 Subject: [PATCH 11/11] pull in updates from main --- pointblank/validate.py | 282 ++++++++++------------------------------- 1 file changed, 67 insertions(+), 215 deletions(-) diff --git a/pointblank/validate.py b/pointblank/validate.py index a80d02b18..84eba2156 100644 --- a/pointblank/validate.py +++ b/pointblank/validate.py @@ -27,11 +27,7 @@ from great_tables.vals import fmt_integer, fmt_number from importlib_resources import files -from pointblank._agg import ( - is_valid_agg, - load_validation_method_grid, - resolve_agg_registries, -) +from pointblank._agg import is_valid_agg, load_validation_method_grid, resolve_agg_registries from pointblank._constants import ( ASSERTION_TYPE_METHOD_MAP, CHECK_MARK_SPAN, @@ -140,12 +136,7 @@ import polars as pl from narwhals.typing import IntoDataFrame, IntoFrame - from pointblank._typing import ( - AbsoluteBounds, - Tolerance, - _CompliantValue, - _CompliantValues, - ) + from pointblank._typing import AbsoluteBounds, Tolerance, _CompliantValue, _CompliantValues __all__ = [ @@ -788,9 +779,7 @@ def read_file(filepath: str | Path) -> Validate: raise RuntimeError(f"Failed to read validation object from {file_path}: {e}") -def _check_for_unpicklable_objects( - validation: Validate, -) -> tuple[dict[str, str], list[int]]: +def _check_for_unpicklable_objects(validation: Validate) -> tuple[dict[str, str], list[int]]: """ Check for functions and capture source code for preservation across sessions. @@ -849,11 +838,7 @@ def _check_for_unpicklable_objects( f"Warning: Function '{func_name}' is picklable but source code could not be captured. " f"It may not be available when loading in a different session." ) - except ( - pickle.PicklingError, - AttributeError, - TypeError, - ): # pragma: no cover + except (pickle.PicklingError, AttributeError, TypeError): # pragma: no cover # Not picklable and no source: treat as problematic print( # pragma: no cover f"Warning: Function '{func_name}' is not picklable and source could not be captured. " @@ -943,11 +928,7 @@ def _provide_serialization_guidance(validation: Validate) -> None: try: # pragma: no cover pickle.dumps(func, protocol=pickle.HIGHEST_PROTOCOL) # pragma: no cover can_pickle = True # pragma: no cover - except ( - pickle.PicklingError, - AttributeError, - TypeError, - ): # pragma: no cover + except (pickle.PicklingError, AttributeError, TypeError): # pragma: no cover can_pickle = False # pragma: no cover functions_analysis["unpicklable_functions"].append( (i, func_name, func_module) @@ -1342,10 +1323,7 @@ def add_computed_column(df): function_sources, lambda_steps = _check_for_unpicklable_objects(validation_copy) # Create a validation package that includes both the object and function sources - validation_package = { - "validation": validation_copy, - "function_sources": function_sources, - } + validation_package = {"validation": validation_copy, "function_sources": function_sources} # Serialize to disk using pickle try: @@ -2304,10 +2282,7 @@ def _generate_display_table( # Select the columns to display in the table with the `resolved_columns` value data = _select_columns( - data, - resolved_columns=resolved_columns, - ibis_tbl=ibis_tbl, - tbl_type=tbl_type, + data, resolved_columns=resolved_columns, ibis_tbl=ibis_tbl, tbl_type=tbl_type ) # From an Ibis table: @@ -3189,8 +3164,7 @@ def _calculate_missing_proportions_dataframe(is_polars=False): import polars.selectors as cs missing_vals_tbl = missing_vals_tbl.tab_style( - style=style.fill(color="lightblue"), - locations=loc.body(mask=cs.numeric().eq(0)), + style=style.fill(color="lightblue"), locations=loc.body(mask=cs.numeric().eq(0)) ) if df_lib_name_gt == "pandas": @@ -13703,9 +13677,7 @@ def interrogate( ]: # Process table for column validation tbl = _column_test_prep( - df=data_tbl_step, - column=column, - allowed_types=compatible_dtypes, + df=data_tbl_step, column=column, allowed_types=compatible_dtypes ) if assertion_method == "gt": @@ -13801,9 +13773,7 @@ def interrogate( ) elif assertion_type == "col_vals_within_spec": - from pointblank._interrogation import ( - interrogate_within_spec, - ) + from pointblank._interrogation import interrogate_within_spec results_tbl = interrogate_within_spec( tbl=tbl, column=column, values=value, na_pass=na_pass @@ -13891,9 +13861,7 @@ def interrogate( schema_info=schema_validation_info ) validation._add_note( - key="schema_check", - markdown=schema_note_html, - text=schema_note_text, + key="schema_check", markdown=schema_note_html, text=schema_note_text ) validation.all_passed = result_bool @@ -13920,9 +13888,7 @@ def interrogate( elif assertion_type == "col_count_match": result_bool = col_count_match( - data_tbl=data_tbl_step, - count=value["count"], - inverse=value["inverse"], + data_tbl=data_tbl_step, count=value["count"], inverse=value["inverse"] ) validation.all_passed = result_bool @@ -14140,20 +14106,16 @@ def interrogate( if is_column_not_found: note_html = _create_column_not_found_note_html( column_name=column, - available_columns=( - list(data_tbl_step.columns) - if hasattr(data_tbl_step, "columns") - else [] - ), + available_columns=list(data_tbl_step.columns) + if hasattr(data_tbl_step, "columns") + else [], locale=self.locale, ) note_text = _create_column_not_found_note_text( column_name=column, - available_columns=( - list(data_tbl_step.columns) - if hasattr(data_tbl_step, "columns") - else [] - ), + available_columns=list(data_tbl_step.columns) + if hasattr(data_tbl_step, "columns") + else [], ) validation._add_note( key="column_not_found", @@ -14172,10 +14134,7 @@ def interrogate( # Determine position for between/outside validations position = None - if assertion_type in [ - "col_vals_between", - "col_vals_outside", - ]: + if assertion_type in ["col_vals_between", "col_vals_outside"]: # Check if missing column is in left or right position from pointblank.column import Column @@ -14193,21 +14152,17 @@ def interrogate( note_html = _create_comparison_column_not_found_note_html( column_name=missing_col_name, position=position, - available_columns=( - list(data_tbl_step.columns) - if hasattr(data_tbl_step, "columns") - else [] - ), + available_columns=list(data_tbl_step.columns) + if hasattr(data_tbl_step, "columns") + else [], locale=self.locale, ) note_text = _create_comparison_column_not_found_note_text( column_name=missing_col_name, position=position, - available_columns=( - list(data_tbl_step.columns) - if hasattr(data_tbl_step, "columns") - else [] - ), + available_columns=list(data_tbl_step.columns) + if hasattr(data_tbl_step, "columns") + else [], ) validation._add_note( key="comparison_column_not_found", @@ -14308,9 +14263,7 @@ def interrogate( validation, level, threshold._threshold_result( - fraction_failing=validation.f_failed, - test_units=validation.n, - level=level, + fraction_failing=validation.f_failed, test_units=validation.n, level=level ), ) @@ -14509,8 +14462,7 @@ def interrogate( validation_extract_native = ( validation_extract_native.sample( # pragma: no cover fraction=min( - 1.0, - sample_n / validation_extract_native.count(), + 1.0, sample_n / validation_extract_native.count() ) # pragma: no cover ).limit(sample_n) ) # pragma: no cover @@ -15947,9 +15899,7 @@ def get_data_extracts( return result def get_json_report( - self, - use_fields: list[str] | None = None, - exclude_fields: list[str] | None = None, + self, use_fields: list[str] | None = None, exclude_fields: list[str] | None = None ) -> str: """ Get a report of the validation results as a JSON-formatted string. @@ -16661,8 +16611,7 @@ def get_tabular_report( .opt_align_table_header(align=before) .tab_style(style=style.css("height: 20px;"), locations=loc.body()) .tab_style( - style=style.text(weight="bold", color="#666666"), - locations=loc.column_labels(), + style=style.text(weight="bold", color="#666666"), locations=loc.column_labels() ) .tab_style( style=style.text(size="28px", weight="bold", align=before, color="#444444"), @@ -17332,8 +17281,7 @@ def get_tabular_report( locations=loc.body(columns="i"), ) .tab_style( - style=style.text(weight="bold", color="#666666"), - locations=loc.column_labels(), + style=style.text(weight="bold", color="#666666"), locations=loc.column_labels() ) .tab_style( style=style.text(size="28px", weight="bold", align=before, color="#444444"), @@ -17344,14 +17292,7 @@ def get_tabular_report( color="black", font=google_font(name="IBM Plex Mono"), size="11px" ), locations=loc.body( - columns=[ - "type_upd", - "columns_upd", - "values_upd", - "test_units", - "pass", - "fail", - ] + columns=["type_upd", "columns_upd", "values_upd", "test_units", "pass", "fail"] ), ) .tab_style( @@ -17412,9 +17353,7 @@ def get_tabular_report( ) .tab_style( style=style.fill( - color=( - from_column(column="status_color") if interrogation_performed else "white" - ) + color=from_column(column="status_color") if interrogation_performed else "white" ), locations=loc.body(columns="status_color"), ) @@ -17463,8 +17402,7 @@ def get_tabular_report( } ) .cols_align( - align="center", - columns=["tbl", "eval", "w_upd", "e_upd", "c_upd", "extract_upd"], + align="center", columns=["tbl", "eval", "w_upd", "e_upd", "c_upd", "extract_upd"] ) .cols_align(align="right", columns=["test_units", "pass", "fail"]) .cols_align(align=before, columns=["type_upd", "columns_upd", "values_upd"]) @@ -17501,16 +17439,7 @@ def get_tabular_report( gt_tbl = gt_tbl.tab_style( style=style.fill(color="#F2F2F2"), locations=loc.body( - columns=[ - "tbl", - "eval", - "test_units", - "pass", - "fail", - "w_upd", - "e_upd", - "c_upd", - ] + columns=["tbl", "eval", "test_units", "pass", "fail", "w_upd", "e_upd", "c_upd"] ), ) @@ -17535,8 +17464,7 @@ def get_tabular_report( gt_tbl = gt_tbl.tab_style( style=style.text(color="#B22222"), locations=loc.body( - columns="columns_upd", - rows=[i for i, error in enumerate(eval_error) if error], + columns="columns_upd", rows=[i for i, error in enumerate(eval_error) if error] ), ) @@ -19220,9 +19148,9 @@ def _create_autobrief_or_failure_text( if assertion_type == "prompt": return _create_text_prompt( lang=lang, - prompt=( - values["prompt"] if isinstance(values, dict) and "prompt" in values else str(values) - ), + prompt=values["prompt"] + if isinstance(values, dict) and "prompt" in values + else str(values), for_failure=for_failure, ) @@ -19292,11 +19220,7 @@ def _create_text_between( def _create_text_set( - lang: str, - column: str, - values: list[Any], - not_: bool = False, - for_failure: bool = False, + lang: str, column: str, values: list[Any], not_: bool = False, for_failure: bool = False ) -> str: type_ = _expect_failure_type(for_failure=for_failure) @@ -19460,33 +19384,6 @@ def _create_text_data_freshness( ) -def _create_text_data_freshness( - lang: str, - column: str | None, - value: dict, - for_failure: bool = False, -) -> str: - """Create text for data_freshness validation.""" - type_ = _expect_failure_type(for_failure=for_failure) - - column_text = _prep_column_text(column=column) - max_age_text = _format_timedelta(value.get("max_age")) - - if for_failure: - age = value.get("age") - age_text = _format_timedelta(age) if age else "unknown" - return EXPECT_FAIL_TEXT[f"data_freshness_{type_}_text"][lang].format( - column_text=column_text, - max_age_text=max_age_text, - age_text=age_text, - ) - else: - return EXPECT_FAIL_TEXT[f"data_freshness_{type_}_text"][lang].format( - column_text=column_text, - max_age_text=max_age_text, - ) - - def _create_text_col_pct_null( lang: str, column: str | None, @@ -20107,10 +20004,7 @@ def _get_preprocessed_table_icon(icon: list[str]) -> list[str]: def _transform_eval( - n: list[int], - interrogation_performed: bool, - eval_error: list[bool], - active: list[bool], + n: list[int], interrogation_performed: bool, eval_error: list[bool], active: list[bool] ) -> list[str]: # If no interrogation was performed, return a list of empty strings if not interrogation_performed: @@ -20284,7 +20178,9 @@ def _transform_w_e_c(values, color, interrogation_performed): else ( f'' if value is True - else (f'' if value is False else value) + else f'' + if value is False + else value ) ) for value in values @@ -20316,11 +20212,9 @@ def _transform_assertion_str( # If the template text `{auto}` is in the `brief_str` then replace it with # the corresponding `autobrief_str` entry brief_str = [ - ( - brief_str[i].replace("{auto}", autobrief_str[i]) - if "{auto}" in brief_str[i] - else brief_str[i] - ) + brief_str[i].replace("{auto}", autobrief_str[i]) + if "{auto}" in brief_str[i] + else brief_str[i] for i in range(len(brief_str)) ] @@ -20331,7 +20225,7 @@ def _transform_assertion_str( # In some sandboxed HTML environments (e.g., Streamlit),

tags don't inherit # font-size from parent divs, so we add inline styles directly to the

tags brief_str = [ - (re.sub(r"

", r'

', x) if x.strip() else x) + re.sub(r"

", r'

', x) if x.strip() else x for x in brief_str ] @@ -20347,11 +20241,9 @@ def _transform_assertion_str( # Define the brief's HTML div tag for each row brief_divs = [ - ( - f"

{brief}
" - if brief.strip() - else "" - ) + f"
{brief}
" + if brief.strip() + else "" for brief in brief_str ] @@ -20685,10 +20577,7 @@ def _format_single_float_with_gt_custom( # Create GT object and format the column gt_obj = GT(df).fmt_number( - columns="value", - decimals=decimals, - drop_trailing_zeros=drop_trailing_zeros, - locale=locale, + columns="value", decimals=decimals, drop_trailing_zeros=drop_trailing_zeros, locale=locale ) # Extract the formatted value using _get_column_of_values @@ -20698,11 +20587,7 @@ def _format_single_float_with_gt_custom( def _format_number_safe( - value: float, - decimals: int, - drop_trailing_zeros: bool = False, - locale: str = "en", - df_lib=None, + value: float, decimals: int, drop_trailing_zeros: bool = False, locale: str = "en", df_lib=None ) -> str: """ Safely format a float value with locale support. @@ -20883,11 +20768,7 @@ def _format_threshold_value(fraction: float | None, count: int | None) -> str: else: # Use shared formatting function with drop_trailing_zeros formatted = _format_number_safe( - fraction, - decimals=2, - drop_trailing_zeros=True, - locale=locale, - df_lib=df_lib, + fraction, decimals=2, drop_trailing_zeros=True, locale=locale, df_lib=df_lib ) return formatted elif count is not None: @@ -21043,8 +20924,7 @@ def _create_no_columns_resolved_note_html( """ # Get translated strings intro = NOTES_TEXT.get("column_not_found_intro", {}).get( - locale, - NOTES_TEXT.get("column_not_found_intro", {}).get("en", "The column expression"), + locale, NOTES_TEXT.get("column_not_found_intro", {}).get("en", "The column expression") ) no_resolve = NOTES_TEXT.get("column_not_found_no_resolve", {}).get( locale, @@ -21103,8 +20983,7 @@ def _create_column_not_found_note_html( """ # Get translated strings intro = NOTES_TEXT.get("target_column_provided", {}).get( - locale, - NOTES_TEXT.get("target_column_provided", {}).get("en", "The target column provided"), + locale, NOTES_TEXT.get("target_column_provided", {}).get("en", "The target column provided") ) not_found = NOTES_TEXT.get("does_not_match_any_columns", {}).get( locale, @@ -21142,10 +21021,7 @@ def _create_column_not_found_note_text(column_name: str, available_columns: list def _create_comparison_column_not_found_note_html( - column_name: str, - position: str | None, - available_columns: list[str], - locale: str = "en", + column_name: str, position: str | None, available_columns: list[str], locale: str = "en" ) -> str: """ Create an HTML note explaining that a comparison column was not found. @@ -21263,8 +21139,7 @@ def _create_preprocessing_note_html( """ # Get translated strings precondition_text = NOTES_TEXT.get("precondition_applied", {}).get( - locale, - NOTES_TEXT.get("precondition_applied", {}).get("en", "Precondition applied"), + locale, NOTES_TEXT.get("precondition_applied", {}).get("en", "Precondition applied") ) table_dims_text = NOTES_TEXT.get("table_dimensions", {}).get( locale, NOTES_TEXT.get("table_dimensions", {}).get("en", "table dimensions") @@ -21413,8 +21288,7 @@ def _create_synthetic_target_column_note_html(column_name: str, locale: str = "e """ # Get translated strings synthetic_text = NOTES_TEXT.get("synthetic_target_column", {}).get( - locale, - NOTES_TEXT.get("synthetic_target_column", {}).get("en", "Synthetic target column"), + locale, NOTES_TEXT.get("synthetic_target_column", {}).get("en", "Synthetic target column") ) created_via_text = NOTES_TEXT.get("created_via_preprocessing", {}).get( locale, @@ -21485,8 +21359,7 @@ def _create_col_schema_match_note_html(schema_info: dict, locale: str = "en") -> locale, VALIDATION_REPORT_TEXT["note_schema_comparison_disclosure"]["en"] ) settings_title_text = VALIDATION_REPORT_TEXT["note_schema_comparison_match_settings_title"].get( - locale, - VALIDATION_REPORT_TEXT["note_schema_comparison_match_settings_title"]["en"], + locale, VALIDATION_REPORT_TEXT["note_schema_comparison_match_settings_title"]["en"] ) # Build summary message @@ -21501,8 +21374,7 @@ def _create_col_schema_match_note_html(schema_info: dict, locale: str = "en") -> n_target = len(target_schema) if n_expect != n_target: count_mismatch_text = VALIDATION_REPORT_TEXT["note_schema_column_count_mismatch"].get( - locale, - VALIDATION_REPORT_TEXT["note_schema_column_count_mismatch"]["en"], + locale, VALIDATION_REPORT_TEXT["note_schema_column_count_mismatch"]["en"] ) failures.append(count_mismatch_text.format(n_expect=n_expect, n_target=n_target)) @@ -21555,11 +21427,7 @@ def _create_col_schema_match_note_html(schema_info: dict, locale: str = "en") -> ) else: step_report_gt = _step_report_schema_any_order( - step=1, - schema_info=schema_info, - header=None, - lang=locale, - debug_return_df=False, + step=1, schema_info=schema_info, header=None, lang=locale, debug_return_df=False ) # Generate the settings HTML using the existing function @@ -21725,10 +21593,7 @@ def _step_report_row_based( locations=loc.body(columns=column), ).tab_style( style=style.borders( - sides=["left", "right"], - color="#1B4D3E80", - style="solid", - weight="2px", + sides=["left", "right"], color="#1B4D3E80", style="solid", weight="2px" ), locations=loc.column_labels(columns=column), ) @@ -21810,10 +21675,7 @@ def _step_report_row_based( style.text(color="#B22222"), style.fill(color="#FFC1C159"), style.borders( - sides=["left", "right"], - color="black", - style="solid", - weight="2px", + sides=["left", "right"], color="black", style="solid", weight="2px" ), ], locations=loc.body(columns=column), @@ -21849,15 +21711,13 @@ def _step_report_row_based( if limit < extract_length: extract_length_resolved = limit extract_text = STEP_REPORT_TEXT["extract_text_first"][lang].format( - extract_length_resolved=extract_length_resolved, - shown_failures=shown_failures, + extract_length_resolved=extract_length_resolved, shown_failures=shown_failures ) else: extract_length_resolved = extract_length extract_text = STEP_REPORT_TEXT["extract_text_all"][lang].format( - extract_length_resolved=extract_length_resolved, - shown_failures=shown_failures, + extract_length_resolved=extract_length_resolved, shown_failures=shown_failures ) details = ( @@ -22320,11 +22180,7 @@ def format_value(v): def _step_report_schema_in_order( - step: int, - schema_info: dict, - header: str | None, - lang: str, - debug_return_df: bool = False, + step: int, schema_info: dict, header: str | None, lang: str, debug_return_df: bool = False ) -> GT | Any: """ This is the case for schema validation where the schema is supposed to have the same column @@ -22682,11 +22538,7 @@ def _step_report_schema_in_order( def _step_report_schema_any_order( - step: int, - schema_info: dict, - header: str | None, - lang: str, - debug_return_df: bool = False, + step: int, schema_info: dict, header: str | None, lang: str, debug_return_df: bool = False ) -> GT | pl.DataFrame: """ This is the case for schema validation where the schema is permitted to not have to be in the