Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions python-pointblank/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Validating Data With Pointblank in Python

Supporting code and sample data for the Real Python tutorial
"Validating Data With Pointblank in Python".

## Requirements

The Python scripts use PEP 723 dependency metadata and run with
[uv](https://docs.astral.sh/uv/):

```console
$ uv run pointblank_quickstart.py
$ uv run pointblank_thresholds.py
$ uv run pointblank_atoms.py
```

The command-line examples can run without a project environment. Run them
from this directory so the relative file names resolve:

```console
$ uv run --no-project --with 'pointblank[pl]' -- pb scan pointblank_atoms.csv
$ uv run --no-project --with 'pointblank[pl]' -- pb missing pointblank_atoms.csv
$ uvx --from 'pointblank[pl]' pb run pointblank_atoms.yaml --output-html pointblank_report.html
```

14 changes: 14 additions & 0 deletions python-pointblank/pointblank_atoms.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
atom_id,symbol,x,y,z,fx,fy,fz
0,Cu,1.0,0.5,0.1,0.1,0.0,0.0
1,Pt,2.1,1.5,0.2,-0.2,0.1,-0.1
2,Cu,3.2,2.5,0.3,0.3,-0.1,0.1
3,Pt,4.3,3.5,0.4,-0.1,0.0,0.0
4,Cu,5.4,4.5,0.5,0.2,0.1,-0.1
5,Pt,6.5,5.5,0.6,-0.3,-0.1,0.1
6,Cu,7.6,6.5,0.7,0.1,0.0,0.0
7,Pt,8.7,7.5,0.8,-0.2,0.1,-0.1
8,Cu,9.8,8.5,0.9,0.3,-0.1,0.1
9,Pt,10.9,9.5,1.0,-0.1,0.0,0.0
10,Zz,0.5,0.5,0.1,0.0,0.0,0.0
11,Cu,,1.5,0.2,0.0,0.0,0.0
12,Pt,12.1,2.5,0.3,1500.0,0.0,0.0
43 changes: 43 additions & 0 deletions python-pointblank/pointblank_atoms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "pointblank[pl]",
# ]
# ///

import polars as pl
import pointblank as pb

# Element symbols accepted by the `symbol` membership check in main().
VALID_ELEMENTS = ["Cu", "Pt"]


def main() -> None:
    """Validate the sample atoms CSV and print its passing and failing rows.

    The CSV is located next to this script instead of in the current
    working directory, so the script behaves the same no matter which
    directory it is launched from.
    """
    # Local import: keeps this fix self-contained within the block.
    from pathlib import Path

    csv_path = Path(__file__).resolve().parent / "pointblank_atoms.csv"
    atoms = pl.read_csv(csv_path)

    validation = (
        pb.Validate(
            data=atoms,
            tbl_name="atoms_from_parser",
            label="Round-trip validation before re-export",
            thresholds=pb.Thresholds(warning=0.02, error=0.05, critical=0.07),
        )
        .col_vals_in_set(columns="symbol", set=VALID_ELEMENTS)
        .col_vals_not_null(columns=["x", "y", "z"])
        .col_vals_between(columns=["x", "y", "z"], left=0, right=20)
        .col_vals_between(columns="fx", left=-1000, right=1000)
        .interrogate()
    )

    # Split the interrogated table into its "pass" and "fail" pieces.
    clean = validation.get_sundered_data(type="pass")
    dirty = validation.get_sundered_data(type="fail")

    # Same column subset is shown for both previews.
    preview_columns = ["atom_id", "symbol", "x", "fx"]

    print(f"Safe to re-export: {len(clean)} rows")
    print(f"Needs review: {len(dirty)} rows")
    print("\nClean rows")
    print(clean.select(preview_columns))
    print("\nDirty rows")
    print(dirty.select(preview_columns))


if __name__ == "__main__":
    main()
22 changes: 22 additions & 0 deletions python-pointblank/pointblank_atoms.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
tbl: pointblank_atoms.csv
df_library: polars
tbl_name: "Atom Validation"
label: "Tutorial YAML validation"
thresholds:
warning: 0.02
error: 0.05
critical: 0.07
steps:
- col_vals_in_set:
columns: symbol
set: [Cu, Pt]
- col_vals_not_null:
columns: [x, y, z]
- col_vals_between:
columns: [x, y, z]
left: 0
right: 20
- col_vals_between:
columns: fx
left: -1000
right: 1000
39 changes: 39 additions & 0 deletions python-pointblank/pointblank_quickstart.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "pointblank[pl]",
# ]
# ///

import pointblank as pb


def main() -> None:
    """Run three column checks on ``small_table`` and print per-step counts.

    Loads the ``small_table`` dataset through Pointblank as a Polars table,
    interrogates it, and prints one line per validation step with the
    step description, the pass count, and the failure count.
    """
    table = pb.load_dataset("small_table", tbl_type="polars")

    checks = pb.Validate(
        data=table,
        tbl_name="small_table",
        label="Quickstart validation",
    )
    checks = checks.col_vals_between(columns="d", left=0, right=5000)
    checks = checks.col_vals_in_set(columns="f", set=["low", "mid", "high"])
    checks = checks.col_vals_not_null(columns="c")
    results = checks.interrogate()

    # Keep only the columns needed for the printed summary.
    wanted = ["step_description", "pass_n", "failed_n"]
    rows = results.get_dataframe_report().select(wanted).iter_rows(named=True)

    print("Validation summary:\n")
    for row in rows:
        description = row["step_description"]
        passed = row["pass_n"]
        failed = row["failed_n"]
        # Description padded to 20 columns, pass count left-aligned in 4.
        print(f"{description:20}passed={passed:<4}failed={failed}")


if __name__ == "__main__":
    main()
443 changes: 443 additions & 0 deletions python-pointblank/pointblank_report.html

Large diffs are not rendered by default.

Binary file added python-pointblank/pointblank_report.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
24 changes: 24 additions & 0 deletions python-pointblank/pointblank_starter_validation.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Starter Pointblank template for adapting to your own pipeline.
# Replace the column names and allowed values under `steps` with ones
# that match your data.
#
# You can run this template against a real file with:
# uv run --no-project --with 'pointblank[pl]' pb run pointblank_starter_validation.yaml --data your_data.csv --fail-on critical

tbl: small_table
df_library: polars
tbl_name: "Starter Validation"
label: "Adapt this template to your data"
thresholds:
warning: 0.02
error: 0.05
critical: 0.10
steps:
- col_exists:
columns: [record_id, status, amount]
- col_vals_not_null:
columns: record_id
- col_vals_in_set:
columns: status
set: [pending, shipped, delivered]
- col_vals_gt:
columns: amount
value: 0
48 changes: 48 additions & 0 deletions python-pointblank/pointblank_thresholds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "pointblank[pl]",
# ]
# ///

import pointblank as pb


def main() -> None:
    """Interrogate ``small_table`` with thresholds and report the outcome.

    Configures warning/error/critical thresholds plus message templates for
    the warning and critical levels, runs three checks, prints whether all
    checks passed and whether anything is above the error threshold, and
    finally reports (instead of propagating) a failed critical-level
    assertion.
    """
    table = pb.load_dataset("small_table", tbl_type="polars")
    limits = pb.Thresholds(warning=0.05, error=0.10, critical=0.15)
    # Only the warning and critical levels carry a message template.
    messages = pb.Actions(
        warning=(
            "Warning: step {step} reached {level} "
            "severity during {type}."
        ),
        critical=(
            "Critical: step {step} reached {level} "
            "severity during {type}."
        ),
    )

    plan = pb.Validate(
        data=table,
        tbl_name="small_table",
        label="Threshold-driven validation",
        thresholds=limits,
        actions=messages,
    )
    plan = plan.col_vals_between(columns="d", left=0, right=5000)
    plan = plan.col_vals_not_null(columns="c")
    plan = plan.rows_distinct()
    outcome = plan.interrogate()

    everything_passed = outcome.all_passed()
    print("All checks passed perfectly:", everything_passed)

    over_error = outcome.above_threshold(level="error")
    print("Anything above the error threshold:", over_error)

    # Catch the assertion so the script prints the message rather than
    # ending with a traceback.
    try:
        outcome.assert_below_threshold(level="critical")
    except AssertionError as failure:
        print("CI gate tripped:", failure)


if __name__ == "__main__":
    main()
Loading