Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Continuous-integration workflow: lint and test on every supported Python.
name: CI

on:
  push:
    branches:
      - main
      - "feat/**"
  pull_request:
    branches:
      - main

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Versions quoted so 3.10 is read as a string, not the float 3.1.
        python-version:
          - "3.10"
          - "3.11"
          - "3.12"
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: python -m pip install --upgrade pip && pip install -e ".[dev]"
      - name: Lint
        run: python -m ruff check nullwatch/ tests/
      - name: Test
        run: python -m pytest tests/ -v
55 changes: 55 additions & 0 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Release workflow: build the distributions once, publish the identical
# artifact to TestPyPI first, and only then to PyPI. Triggered by pushing
# a version tag such as v1.2.3.
name: Publish to PyPI

on:
  push:
    tags:
      - "v*"

jobs:
  # Build sdist + wheel once; both publish jobs reuse this artifact so the
  # files uploaded to TestPyPI and PyPI are byte-identical.
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install build tooling
        run: python -m pip install --upgrade pip build twine
      - name: Build distributions
        run: python -m build
      # twine check catches metadata problems before anything is uploaded.
      - name: Check distributions
        run: python -m twine check dist/*
      - uses: actions/upload-artifact@v4
        with:
          name: dist
          path: dist/*

  # Rehearsal upload to TestPyPI. Uses OIDC trusted publishing
  # (permissions: id-token: write) rather than a stored API token; the
  # `testpypi` environment must be configured as a trusted publisher.
  publish-testpypi:
    needs: build
    runs-on: ubuntu-latest
    permissions:
      id-token: write
    environment: testpypi
    steps:
      - uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist
      - name: Publish to TestPyPI
        uses: pypa/gh-action-pypi-publish@release/v1.12
        with:
          repository-url: https://test.pypi.org/legacy/

  # Real publish — gated on the TestPyPI upload having succeeded.
  publish-pypi:
    needs: publish-testpypi
    runs-on: ubuntu-latest
    permissions:
      id-token: write
    environment: pypi
    steps:
      - uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist
      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1.12
38 changes: 38 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Python bytecode and packaging artifacts
__pycache__/
*.py[cod]
*.pyo
*.pyd
.Python
*.egg-info/
dist/
build/
.eggs/
*.egg

# Virtual environments
venv/
.venv/
env/

# pytest / coverage output
.pytest_cache/
htmlcov/
.coverage
coverage.xml

# Type-checker and linter caches (mypy, ruff)
.mypy_cache/
.ruff_cache/

# HuggingFace model cache (can be large)
.cache/

# IDE / editor files
.idea/
.vscode/
*.swp
*.swo

# macOS
.DS_Store
20 changes: 20 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Developer task runner. All targets are phony (no files are produced
# under these names). Recipes must be indented with tabs.
.PHONY: install lint fmt test build check-package

# Match CI: install the package editable with its dev extras
# (CI runs `pip install -e ".[dev]"`, so the Makefile should too).
install:
	pip install -e ".[dev]"

# Lint the same trees that `fmt` formats, so examples/ is checked too.
lint:
	ruff check nullwatch/ tests/ examples/

fmt:
	ruff format nullwatch/ tests/ examples/

test:
	pytest

build:
	python -m build

# Build (as a prerequisite) and validate the distribution metadata.
check-package: build
	python -m twine check dist/*
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,8 @@ metadata Structured details for downstream analysis.

The client covers the common lifecycle for Python agents and RAG services:

By default the scorer is strict: if it finds any unsupported answer span above the confidence threshold, the eval verdict is `fail`. You can relax this by passing a larger `fail_threshold` if you want to tolerate small unsupported fragments.

```python
client = NullwatchClient()

Expand Down Expand Up @@ -296,6 +298,11 @@ does not require an ML model.

Compact Nullwatch schema:

You can pass either:

- the compact `nullwatch-py` schema format shown below, or
- the same OpenAI-style `tools=[...]` JSON schema you send to the model

```python
from nullwatch.scorers import ToolCallScorer

Expand Down
94 changes: 94 additions & 0 deletions examples/basic_usage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""End-to-end tour of the nullwatch Python client.

Demonstrates span ingestion (manual and context-manager), eval
ingestion, the tool-call validity scorer, and run-summary queries.
With raise_on_error=False the script degrades gracefully when no
nullwatch server is listening.
"""

import time

from nullwatch import NullwatchClient, Span, Eval
from nullwatch.scorers import ToolCallScorer

# 1. Connect to nullwatch
client = NullwatchClient(
    base_url="http://127.0.0.1:7710",
    raise_on_error=False,  # won't raise if server is not running
)

print("Server alive:", client.is_alive())

# 2. Manual span ingestion
span = Span(
    run_id="run-demo-001",
    operation="llm.call",
    model="gpt-4o",
    input_tokens=420,
    output_tokens=96,
    cost_usd=0.018,
)
span.finish()
client.ingest_span(span)
print("Span ingested:", span.span_id)

# 3. Context-manager span (auto-finish + auto-ingest)
with client.span("run-demo-001", "tool.call", tool_name="search_web") as s:
    time.sleep(0.05)  # simulate work
    # you can mutate `s` inside the block
    s.status = "ok"

print("Tool span done, duration_ms:", s.duration_ms)

# 4. Manual eval ingestion
eval_ = Eval(
    run_id="run-demo-001",
    eval_key="helpfulness",
    scorer="llm-judge",
    score=0.94,
    verdict="pass",
    dataset="prod-shadow",
)
client.ingest_eval(eval_)
print("Eval ingested:", eval_.eval_key)

# 5. Tool-call validity scorer
tools = [
    {
        "name": "search_web",
        "parameters": {
            "query": {"type": "string", "required": True},
            "max_results": {"type": "integer", "required": False},
        },
    },
    {
        "name": "read_file",
        "parameters": {
            "path": {"type": "string", "required": True},
        },
    },
]

scorer = ToolCallScorer(tools=tools, dataset="prod-shadow")

# Valid call
eval_valid = scorer.score(
    run_id="run-demo-001",
    tool_call={"name": "search_web", "arguments": {"query": "open source Zig"}},
)
print(f"\nValid tool call → verdict={eval_valid.verdict}, score={eval_valid.score}")
print("Notes:", eval_valid.notes)

# Hallucinated / invalid call — "querY" is deliberately misspelled so the
# scorer flags an argument the schema does not define. Do not "fix" it.
eval_invalid = scorer.score(
    run_id="run-demo-001",
    tool_call={"name": "search_web", "arguments": {"querY": "open source Zig"}},
)
print(f"\nBad tool call → verdict={eval_invalid.verdict}, score={eval_invalid.score}")
print("Notes:", eval_invalid.notes)

# Send the evals
client.ingest_eval(eval_valid)
client.ingest_eval(eval_invalid)

# 6. Query runs
summary = client.get_run("run-demo-001")
if summary:
    print(
        f"\nRun summary: spans={summary.span_count}, evals={summary.eval_count}, verdict={summary.verdict}"
    )
else:
    print("\n(nullwatch server not running — skipping run summary query)")
Loading
Loading