Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Continuous-integration workflow: lint and test on every supported Python.
name: CI

on:
  push:
    branches:
      - main
      - "feat/**"
  pull_request:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Versions quoted so 3.10 is read as a string, not the float 3.1.
        python-version:
          - "3.10"
          - "3.11"
          - "3.12"
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: python -m pip install --upgrade pip && pip install -e ".[dev]"
      - name: Lint
        run: python -m ruff check nullwatch/ tests/
      - name: Test
        run: python -m pytest tests/ -v
55 changes: 55 additions & 0 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Release workflow: build the distributions once, publish the identical
# artifact to TestPyPI first, and only then to PyPI. Triggered by pushing
# a version tag such as v1.2.3.
name: Publish to PyPI

on:
  push:
    tags:
      - "v*"

jobs:
  # Build sdist + wheel once; both publish jobs reuse this artifact so the
  # files uploaded to TestPyPI and PyPI are byte-identical.
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install build tooling
        run: python -m pip install --upgrade pip build twine
      - name: Build distributions
        run: python -m build
      # twine check catches metadata problems before anything is uploaded.
      - name: Check distributions
        run: python -m twine check dist/*
      - uses: actions/upload-artifact@v4
        with:
          name: dist
          path: dist/*

  # Rehearsal upload to TestPyPI. Uses OIDC trusted publishing
  # (permissions: id-token: write) rather than a stored API token; the
  # `testpypi` environment must be configured as a trusted publisher.
  publish-testpypi:
    needs: build
    runs-on: ubuntu-latest
    permissions:
      id-token: write
    environment: testpypi
    steps:
      - uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist
      - name: Publish to TestPyPI
        uses: pypa/gh-action-pypi-publish@release/v1.12
        with:
          repository-url: https://test.pypi.org/legacy/

  # Real publish — gated on the TestPyPI upload having succeeded.
  publish-pypi:
    needs: publish-testpypi
    runs-on: ubuntu-latest
    permissions:
      id-token: write
    environment: pypi
    steps:
      - uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist
      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1.12
38 changes: 38 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Python bytecode and packaging artifacts
__pycache__/
*.py[cod]
*.pyo
*.pyd
.Python
*.egg-info/
dist/
build/
.eggs/
*.egg

# Virtual environments
venv/
.venv/
env/

# pytest / coverage output
.pytest_cache/
htmlcov/
.coverage
coverage.xml

# Type-checker and linter caches (mypy, ruff)
.mypy_cache/
.ruff_cache/

# HuggingFace model cache (can be large)
.cache/

# IDE / editor files
.idea/
.vscode/
*.swp
*.swo

# macOS
.DS_Store
20 changes: 20 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Developer task runner. All targets are phony (no files are produced
# under these names). Recipes must be indented with tabs.
.PHONY: install lint fmt test build check-package

# Match CI: install the package editable with its dev extras
# (CI runs `pip install -e ".[dev]"`, so the Makefile should too).
install:
	pip install -e ".[dev]"

# Lint the same trees that `fmt` formats, so examples/ is checked too.
lint:
	ruff check nullwatch/ tests/ examples/

fmt:
	ruff format nullwatch/ tests/ examples/

test:
	pytest

build:
	python -m build

# Build (as a prerequisite) and validate the distribution metadata.
check-package: build
	python -m twine check dist/*
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,8 @@ metadata Structured details for downstream analysis.

The client covers the common lifecycle for Python agents and RAG services:

By default the scorer is strict: if it finds any unsupported answer span above the confidence threshold, the eval verdict is `fail`. You can relax this by passing a larger `fail_threshold` if you want to tolerate small unsupported fragments.

```python
client = NullwatchClient()

Expand Down Expand Up @@ -296,6 +298,11 @@ does not require an ML model.

Compact Nullwatch schema:

You can pass either:

- the compact `nullwatch-py` schema format shown below, or
- the same OpenAI-style `tools=[...]` JSON schema you send to the model

```python
from nullwatch.scorers import ToolCallScorer

Expand Down
94 changes: 94 additions & 0 deletions examples/basic_usage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""End-to-end tour of the nullwatch Python client.

Demonstrates span ingestion (manual and context-manager), eval
ingestion, the tool-call validity scorer, and run-summary queries.
With raise_on_error=False the script degrades gracefully when no
nullwatch server is listening.
"""

import time

from nullwatch import NullwatchClient, Span, Eval
from nullwatch.scorers import ToolCallScorer

# 1. Connect to nullwatch
client = NullwatchClient(
    base_url="http://127.0.0.1:7710",
    raise_on_error=False,  # won't raise if server is not running
)

print("Server alive:", client.is_alive())

# 2. Manual span ingestion
span = Span(
    run_id="run-demo-001",
    operation="llm.call",
    model="gpt-4o",
    input_tokens=420,
    output_tokens=96,
    cost_usd=0.018,
)
span.finish()
client.ingest_span(span)
print("Span ingested:", span.span_id)

# 3. Context-manager span (auto-finish + auto-ingest)
with client.span("run-demo-001", "tool.call", tool_name="search_web") as s:
    time.sleep(0.05)  # simulate work
    # you can mutate `s` inside the block
    s.status = "ok"

print("Tool span done, duration_ms:", s.duration_ms)

# 4. Manual eval ingestion
eval_ = Eval(
    run_id="run-demo-001",
    eval_key="helpfulness",
    scorer="llm-judge",
    score=0.94,
    verdict="pass",
    dataset="prod-shadow",
)
client.ingest_eval(eval_)
print("Eval ingested:", eval_.eval_key)

# 5. Tool-call validity scorer
tools = [
    {
        "name": "search_web",
        "parameters": {
            "query": {"type": "string", "required": True},
            "max_results": {"type": "integer", "required": False},
        },
    },
    {
        "name": "read_file",
        "parameters": {
            "path": {"type": "string", "required": True},
        },
    },
]

scorer = ToolCallScorer(tools=tools, dataset="prod-shadow")

# Valid call
eval_valid = scorer.score(
    run_id="run-demo-001",
    tool_call={"name": "search_web", "arguments": {"query": "open source Zig"}},
)
print(f"\nValid tool call → verdict={eval_valid.verdict}, score={eval_valid.score}")
print("Notes:", eval_valid.notes)

# Hallucinated / invalid call — "querY" is deliberately misspelled so the
# scorer flags an argument the schema does not define. Do not "fix" it.
eval_invalid = scorer.score(
    run_id="run-demo-001",
    tool_call={"name": "search_web", "arguments": {"querY": "open source Zig"}},
)
print(f"\nBad tool call → verdict={eval_invalid.verdict}, score={eval_invalid.score}")
print("Notes:", eval_invalid.notes)

# Send the evals
client.ingest_eval(eval_valid)
client.ingest_eval(eval_invalid)

# 6. Query runs
summary = client.get_run("run-demo-001")
if summary:
    print(
        f"\nRun summary: spans={summary.span_count}, evals={summary.eval_count}, verdict={summary.verdict}"
    )
else:
    print("\n(nullwatch server not running — skipping run summary query)")
Loading
Loading