Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
285 changes: 285 additions & 0 deletions pointblank/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -17792,6 +17792,291 @@ def get_step_report(

return step_report

def get_dataframe(self, tbl_type: Literal["polars", "pandas", "duckdb"] = "polars"):
    """
    Validation results as a dataframe.

    The `get_dataframe()` method returns a dataframe that represents the validation report.
    This dataframe provides a summary of the validation results, including the validation
    steps, the number of test units, the number of failing test units, and the fraction of
    failing test units. This can be particularly helpful for logging purposes and enables
    writing validation summaries to CSVs and other on-disk formats.

    Parameters
    ----------
    tbl_type :
        The output backend for the dataframe. The named options are `"polars"`, `"pandas"`,
        and `"duckdb"`. Default is `"polars"`.

    Returns
    -------
    A Polars DataFrame, a Pandas DataFrame, or an Ibis (DuckDB-backed) table, depending on
    `tbl_type=`.

    Raises
    ------
    ValueError
        If `tbl_type=` is not one of the supported backends.
    ImportError
        If the library required for the chosen backend is not installed.

    Supported DataFrame Types
    -------------------------
    The `tbl_type=` parameter can be set to one of the following:

    - `"polars"`: A Polars DataFrame.
    - `"pandas"`: A Pandas DataFrame.
    - `"duckdb"`: An Ibis table for a DuckDB database.

    Examples
    --------

    ```{python}
    import pointblank as pb

    # Create a validation
    validation = (
        pb.Validate(data=pb.load_dataset("small_table", tbl_type = "polars"), label="My validation")
        .col_vals_gt(columns="d", value=100)
        .col_vals_regex(columns="b", pattern=r"[0-9]-[a-z]{3}-[0-9]{3}")
        .interrogate()
    )

    # Get a dataframe of the validation summary results
    df_validation = validation.get_dataframe()

    ```

    """
    # Validate the requested backend up front, before touching any validation state
    if tbl_type not in ["polars", "pandas", "duckdb"]:
        raise ValueError(
            f"The DataFrame type `{tbl_type}` is not valid. Choose one of the following:\n"
            "- `polars`\n"
            "- `pandas`\n"
            "- `duckdb`"
        )

    # Grab the summary data from validation info helper function
    report_original = _validation_info_as_dict(self.validation_info)

    # Pop the extracts off if present: row-level extracts don't belong in a tabular summary
    if "extract" in report_original:
        report_original.pop("extract")

    # Mapping of internal column names -> user-facing column names; also fixes the
    # column order of the resulting dataframe
    names_dict = {
        "active": "active",
        "i": "step_number",
        "assertion_type": "step_description",
        "column": "columns",
        "values": "values",
        "pre": "original_pre",
        "segments": "original_segments",
        "eval_error": "step_evaluated",
        "n": "units",
        "all_passed": "all_units_passed",
        "n_passed": "pass_n",
        "f_passed": "pass_pct",
        "n_failed": "failed_n",
        "f_failed": "failed_pct",
        "warning": "warning",
        "error": "error",
        "critical": "critical",
        "brief": "input_brief",
        "autobrief": "autobrief",
    }

    # Keep only the columns we intend to surface (and only those actually present)
    final_report = {
        key: report_original[key] for key in names_dict.keys() if key in report_original
    }

    # Columns that are only meaningful for steps that were actually evaluated;
    # inactive steps get a "-" placeholder in each backend below
    conditional_cols = [
        "step_evaluated",
        "units",
        "all_units_passed",
        "pass_n",
        "pass_pct",
        "failed_n",
        "failed_pct",
        "warning",
        "error",
        "critical",
    ]

    # Check for polars, raise if not installed
    if tbl_type == "polars":
        if not _is_lib_present(lib_name="polars"):
            raise ImportError(
                "The Polars library is not installed but is required when specifying "
                '`tbl_type="polars"`.'
            )

        import polars as pl

        # Create the schema for the df
        pl_schema = pl.Schema(
            {
                "active": pl.Boolean,
                "i": pl.Int64,
                "assertion_type": pl.String,
                "column": pl.String,
                "values": pl.Object,
                "pre": pl.Object,
                "segments": pl.String,
                "eval_error": pl.Boolean,
                "n": pl.Int64,
                "all_passed": pl.Boolean,
                "n_passed": pl.Int64,
                "f_passed": pl.Float64,
                "n_failed": pl.Int64,
                "f_failed": pl.Float64,
                "warning": pl.Boolean,
                "error": pl.Boolean,
                "critical": pl.Boolean,
                "brief": pl.String,
                "autobrief": pl.String,  # Default brief if none found
            }
        )

        df_validation_results = (
            pl.DataFrame(data=final_report, schema=pl_schema)
            .rename(names_dict)
            .with_columns(
                # Prefer the user-supplied brief, falling back to the generated one
                brief=pl.coalesce("input_brief", "autobrief"),
                preprocessed=pl.col("original_pre").is_not_null(),
                segmented=pl.col("original_segments").is_not_null(),
                # Extract pattern from values if it's a dict, otherwise keep as-is
                values=pl.col("values").map_elements(
                    lambda x: x.get("pattern") if isinstance(x, dict) and "pattern" in x else x,
                    return_dtype=pl.Object,
                ),
            )
            .with_columns(
                # NOTE(review): cast to String so the "-" literal has a compatible dtype
                # with the column in `when/then/otherwise`; this mirrors the explicit
                # `.cast("string")` used in the DuckDB branch below
                pl.when(~pl.col("active"))
                .then(pl.lit("-"))
                .otherwise(pl.col(col).cast(pl.String))
                .alias(col)
                for col in conditional_cols
            )
            .drop(["input_brief", "autobrief", "original_pre", "original_segments"])
        )

        return df_validation_results

    if tbl_type == "pandas":
        if not _is_lib_present(lib_name="pandas"):
            raise ImportError(
                "The Pandas library is not installed but is required when specifying "
                '`tbl_type="pandas"`.'
            )

        import pandas as pd

        def transform_validation_results(df):
            # Coalesce: user-supplied brief wins, generated autobrief is the fallback
            df = df.assign(brief=df["input_brief"].fillna(df["autobrief"]))

            # Boolean columns based on null checks
            df = df.assign(
                preprocessed=df["original_pre"].notna(),
                segmented=df["original_segments"].notna(),
            )

            # Extract pattern from dict
            df = df.assign(
                values=df["values"].apply(
                    lambda x: x.get("pattern") if isinstance(x, dict) and "pattern" in x else x
                )
            )

            # Mask result columns for inactive steps with a "-" placeholder
            for col in conditional_cols:
                df[col] = df[col].where(df["active"], "-")

            # Drop the intermediate columns used to derive brief/preprocessed/segmented
            df = df.drop(
                columns=["input_brief", "autobrief", "original_pre", "original_segments"]
            )

            return df

        df_validation_results = (
            pd.DataFrame(data=final_report)
            .rename(columns=names_dict)
            .pipe(transform_validation_results)
        )

        return df_validation_results

    if tbl_type == "duckdb":
        if not _is_lib_present(lib_name="ibis"):
            raise ImportError(
                "The Ibis library is not installed but is required when specifying "
                '`tbl_type="duckdb"`.'
            )

        import ibis
        import ibis.expr.datatypes as dt

        ibis_schema = {
            "active": dt.Boolean(),
            "i": dt.Int64(),
            "assertion_type": dt.String(),
            "column": dt.String(),
            "values": dt.json(),
            "pre": dt.json(),
            "segments": dt.String(),
            "eval_error": dt.Boolean(),
            "n": dt.Int64(),
            "all_passed": dt.Boolean(),
            "n_passed": dt.Int64(),
            "f_passed": dt.Float64(),
            "n_failed": dt.Int64(),
            "f_failed": dt.Float64(),
            "warning": dt.Boolean(),
            "error": dt.Boolean(),
            "critical": dt.Boolean(),
            "brief": dt.String(),
            "autobrief": dt.String(),
        }

        # Pulling out clean regex pattern if needed (must happen before memtable
        # creation since Ibis expressions can't unpack Python dicts)
        final_report["values"] = [
            values.get("pattern")
            if isinstance(values, dict) and "pattern" in values
            else values
            for values in final_report["values"]
        ]

        # Ibis `rename()` takes a {new: old} mapping, hence the inversion
        report_table = ibis.memtable(final_report, schema=ibis_schema).rename(
            {values: keys for keys, values in names_dict.items()}
        )

        df_validation_results = report_table.mutate(
            brief=ibis.coalesce(report_table.input_brief, report_table.autobrief),
            preprocessed=report_table.original_pre.notnull(),
            segmented=report_table.original_segments.notnull(),
            **{
                # Mask result columns for inactive steps; cast to string so the
                # "-" placeholder is type-compatible with numeric/boolean columns
                col: ibis.ifelse(
                    ~report_table.active,
                    ibis.literal("-"),
                    report_table[col].cast("string"),
                )
                for col in conditional_cols
            },
        ).drop("input_brief", "autobrief", "original_pre", "original_segments")

        return df_validation_results

def _add_validation(self, validation_info):
"""
Add a validation to the list of validations.
Expand Down
Loading