Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions .github/workflows/feature-testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,27 @@ jobs:
version: "latest"

- name: Install dependencies
run: uv sync
run: uv sync --all-groups

- name: Start API for testing
run: |
uv run api.py &
echo "API_PID=$!" >> $GITHUB_ENV
sleep 15 # wait for SpaCy model load + pre-warm

# Wait for API to be ready (with healthcheck loop, max 2 minutes)
for i in {1..120}; do
if curl -s http://localhost:8080/api/v1/health | grep -q pong; then
echo "API is ready"
exit 0
fi
echo "Waiting for API... ($i/120)"
sleep 1
done
echo "API failed to start within 120 seconds"
exit 1

- name: Run test suite
run: pytest tests/ -q
run: uv run pytest tests/ -q

- name: Stop API
if: always()
Expand Down
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@ Alle belangrijke wijzigingen in dit project worden in dit bestand gedocumenteerd

De opmaak is gebaseerd op [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) en dit project maakt gebruik van [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.4.0] - 2026-04-03

### Changed
- Presidio GLiNER Recognizer toegevoegd als module
- Custom pattern recognizers (BSN/IBAN/etc.) uitgeschakeld (`enabled: false` in plugins.yaml)
- Tests voor pattern recognizers worden overgeslagen (skip)

## [1.3.0] - 2026-03-02

### Removed
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@ dependencies = [
"presidio-anonymizer>=2.2.358",
"nl_core_news_lg",
"python-dotenv>=1.1.0",
"gliner>=0.1.13"
]

authors = [
{ name = "Mark Westerweel", email = "mark.westerweel@conduction.nl" },
{ name = "Razo van Berkel", email = "razo.van.berkel@centric.eu" },
{ name = "Nena Meijer", email = "nena.meijer@centric.eu" },
]

[tool.uv.sources]
Expand Down
37 changes: 26 additions & 11 deletions src/api/plugins.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,51 +20,66 @@ ner:
recognizers:
- name: DutchPhoneNumberRecognizer
type: pattern
enabled: true
enabled: false

- name: DutchIBANRecognizer
type: pattern
enabled: true
enabled: false

- name: DutchBSNRecognizer
type: pattern
enabled: true
enabled: false

- name: DutchDateRecognizer
type: pattern
enabled: true
enabled: false

- name: EmailRecognizer
type: pattern
enabled: true
enabled: false

- name: DutchPassportIdRecognizer
type: pattern
enabled: true
enabled: false

- name: DutchDriversLicenseRecognizer
type: pattern
enabled: true
enabled: false

- name: DutchVATRecognizer
type: pattern
enabled: true
enabled: false

- name: DutchKvKRecognizer
type: pattern
enabled: true
enabled: false

- name: DutchLicensePlateRecognizer
type: pattern
enabled: true
enabled: false

- name: IPv4Recognizer
type: pattern
enabled: true
enabled: false

- name: CaseNumberRecognizer
type: pattern
enabled: false

- name: GLiNERRecognizer
type: gliner
enabled: true
model: "urchade/gliner_multi_pii-v1"
entity_mapping:
person: PERSON
name: PERSON
organization: ORGANIZATION
location: LOCATION
gpe: LOCATION
flat_ner: false
multi_label: true
map_location: "cpu"
supported_language: "nl"

# Optionele modules — standaard uitgeschakeld.
# Schakel in door enabled: true te zetten en de vereiste packages te installeren.
Expand Down
9 changes: 9 additions & 0 deletions src/api/services/text_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,15 @@ def _build_analyzer() -> AnalyzerEngine:
for recognizer in plugin_cfg.recognizers:
registry.add_recognizer(recognizer)

# If GLiNER is enabled, remove spaCy recognizer to avoid NER coming from spaCy
has_gliner = any(
recognizer.__class__.__name__ == "GLiNERRecognizer"
for recognizer in plugin_cfg.recognizers
)
if has_gliner:
registry.remove_recognizer("SpacyRecognizer")


engine = AnalyzerEngine(
nlp_engine=nlp_engine,
registry=registry,
Expand Down
23 changes: 23 additions & 0 deletions src/api/utils/plugin_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,24 @@ def _load_llm_recognizer(cfg: dict[str, Any]) -> EntityRecognizer:
)


def _load_gliner_recognizer(cfg: dict[str, Any]) -> EntityRecognizer:
    """Build a GLiNER recognizer from one plugin configuration entry.

    The recognizer class is imported inside the function (lazy import), so
    the import is only attempted when a GLiNER plugin is actually loaded.

    Args:
        cfg: Plugin configuration mapping. ``model`` and
            ``supported_language`` are required; ``entity_mapping``
            (default ``{}``), ``flat_ner`` (default ``False``),
            ``multi_label`` (default ``True``) and ``map_location``
            (default ``"cpu"``) are optional. ``name`` is read only to
            build the error message when the import fails.

    Returns:
        The configured ``GLiNERRecognizer`` instance.

    Raises:
        ImportError: If ``GLiNERRecognizer`` cannot be imported from
            ``presidio_analyzer.predefined_recognizers``.
        KeyError: If ``model`` or ``supported_language`` is missing.
    """
    try:
        from presidio_analyzer.predefined_recognizers import GLiNERRecognizer
    except ImportError as exc:
        # Re-raise with the plugin name and the package extra to install.
        message = (
            f"GLiNER plugin '{cfg['name']}' vereist package: 'presidio-analyzer[gliner]'. "
            f"Fout: {exc}"
        )
        raise ImportError(message) from exc

    # Required keys are looked up first (model, then language) so a missing
    # key surfaces as KeyError before any optional default is applied.
    options = {
        "model_name": cfg["model"],
        "supported_language": cfg["supported_language"],
        "entity_mapping": cfg.get("entity_mapping", {}),
        "flat_ner": cfg.get("flat_ner", False),
        "multi_label": cfg.get("multi_label", True),
        "map_location": cfg.get("map_location", "cpu"),
    }
    return GLiNERRecognizer(**options)


def load_plugins(plugins_path: Path | None = None) -> PluginConfig:
"""Laad plugin-configuratie en instantieer alle actieve recognizers.

Expand Down Expand Up @@ -163,6 +181,11 @@ def load_plugins(plugins_path: Path | None = None) -> PluginConfig:
recognizer = _load_llm_recognizer(plugin)
recognizers.append(recognizer)
logger.debug("LLM plugin geladen: %s", name)

elif plugin_type == "gliner":
recognizer = _load_gliner_recognizer(plugin)
recognizers.append(recognizer)
logger.debug("GLiNER plugin geladen: %s", name)

else:
logger.warning("Onbekend plugin type '%s' voor '%s', overgeslagen.", plugin_type, name)
Expand Down
5 changes: 5 additions & 0 deletions tests/test_string_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""

import httpx
import pytest


class TestHealth:
Expand Down Expand Up @@ -140,6 +141,7 @@ def test_entity_filter_preserves_non_targeted(self, client: httpx.Client) -> Non


class TestAnonymizeStrategies:
@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_replace_uses_entity_placeholder(self, client: httpx.Client) -> None:
r = client.post(
"/api/v1/anonymize",
Expand All @@ -155,6 +157,7 @@ def test_replace_uses_entity_placeholder(self, client: httpx.Client) -> None:
assert "<PHONE_NUMBER>" in data["anonymized_text"]
assert "0612345678" not in data["anonymized_text"]

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_redact_removes_value(self, client: httpx.Client) -> None:
r = client.post(
"/api/v1/anonymize",
Expand All @@ -168,6 +171,7 @@ def test_redact_removes_value(self, client: httpx.Client) -> None:
assert r.status_code == 200
assert "0612345678" not in r.json()["anonymized_text"]

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_hash_replaces_with_hex_string(self, client: httpx.Client) -> None:
r = client.post(
"/api/v1/anonymize",
Expand All @@ -183,6 +187,7 @@ def test_hash_replaces_with_hex_string(self, client: httpx.Client) -> None:
assert "NL91ABNA0417164300" not in data["anonymized_text"]
assert data["anonymized_text"] != data["original_text"]

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_mask_inserts_asterisks(self, client: httpx.Client) -> None:
r = client.post(
"/api/v1/anonymize",
Expand Down
21 changes: 21 additions & 0 deletions tests/test_supported_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"""

import httpx
import pytest


def _detected_types(client: httpx.Client, text: str, entities: list[str]) -> set[str]:
Expand Down Expand Up @@ -70,19 +71,22 @@ def test_ner_scores_are_floats(self, client: httpx.Client) -> None:
class TestPatternEntities:
"""Regex pattern recognizer entities."""

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_phone_mobile(self, client: httpx.Client) -> None:
types = _detected_types(
client, "Bel ons op 0612345678.", ["PHONE_NUMBER"]
)
assert "PHONE_NUMBER" in types

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_phone_international_0031(self, client: httpx.Client) -> None:
# +31 after a space doesn't trigger \b (both non-word chars) — use 0031 form.
types = _detected_types(
client, "Bereikbaar op 0031612345678.", ["PHONE_NUMBER"]
)
assert "PHONE_NUMBER" in types

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_email_detected(self, client: httpx.Client) -> None:
texts = _detected_texts(
client,
Expand All @@ -91,28 +95,33 @@ def test_email_detected(self, client: httpx.Client) -> None:
)
assert "support@example.nl" in texts

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_iban_nl(self, client: httpx.Client) -> None:
texts = _detected_texts(
client, "IBAN: NL91ABNA0417164300.", ["IBAN"]
)
assert "NL91ABNA0417164300" in texts

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_iban_nl_spaced(self, client: httpx.Client) -> None:
texts = _detected_texts(
client, "Rekeningnummer: NL91 ABNA 0417 1643 00.", ["IBAN"]
)
assert any("NL91" in t for t in texts)

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_bsn_nine_digits(self, client: httpx.Client) -> None:
types = _detected_types(client, "BSN: 111222333.", ["BSN"])
assert "BSN" in types

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_date_dd_mm_yyyy(self, client: httpx.Client) -> None:
types = _detected_types(
client, "Geboortedatum: 15-03-1990.", ["DATE_TIME"]
)
assert "DATE_TIME" in types

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_date_spelled_out(self, client: httpx.Client) -> None:
types = _detected_types(
client,
Expand All @@ -121,30 +130,35 @@ def test_date_spelled_out(self, client: httpx.Client) -> None:
)
assert "DATE_TIME" in types

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_id_no_passport(self, client: httpx.Client) -> None:
types = _detected_types(
client, "Paspoortnummer: AB1234561.", ["ID_NO"]
)
assert "ID_NO" in types

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_vat_number(self, client: httpx.Client) -> None:
texts = _detected_texts(
client, "BTW-nummer: NL123456789B01.", ["VAT_NUMBER"]
)
assert "NL123456789B01" in texts

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_license_plate(self, client: httpx.Client) -> None:
types = _detected_types(
client, "Kenteken AB-12-CD werd gesignaleerd.", ["LICENSE_PLATE"]
)
assert "LICENSE_PLATE" in types

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_ip_address(self, client: httpx.Client) -> None:
texts = _detected_texts(
client, "Verbonden via IP 192.168.1.1.", ["IP_ADDRESS"]
)
assert "192.168.1.1" in texts

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_case_no_z_format(self, client: httpx.Client) -> None:
types = _detected_types(
client,
Expand All @@ -153,18 +167,21 @@ def test_case_no_z_format(self, client: httpx.Client) -> None:
)
assert "CASE_NO" in types

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_case_no_awb_format(self, client: httpx.Client) -> None:
types = _detected_types(
client, "Bezwaar AWB 21/12345 ingediend.", ["CASE_NO"]
)
assert "CASE_NO" in types

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_drivers_license(self, client: httpx.Client) -> None:
types = _detected_types(
client, "Rijbewijsnummer: 1234567890.", ["DRIVERS_LICENSE"]
)
assert "DRIVERS_LICENSE" in types

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_kvk_number(self, client: httpx.Client) -> None:
types = _detected_types(
client, "KvK-nummer: 12345678.", ["KVK_NUMBER"]
Expand All @@ -175,6 +192,7 @@ def test_kvk_number(self, client: httpx.Client) -> None:
class TestEmailNotOrganization:
"""Regression: SpaCy NER must not tag emails as ORGANIZATION."""

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_no_organization_overlapping_email(self, client: httpx.Client) -> None:
r = client.post(
"/api/v1/analyze",
Expand Down Expand Up @@ -202,6 +220,7 @@ def test_no_organization_overlapping_email(self, client: httpx.Client) -> None:
"SpaCy NER false positive should be filtered"
)

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_email_entity_is_detected(self, client: httpx.Client) -> None:
"""The email itself must still be detected as EMAIL."""
r = client.post(
Expand All @@ -226,6 +245,7 @@ class TestPhoneVsDriversLicense:
so both are returned. This is expected behavior.
"""

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_phone_number_is_detected(self, client: httpx.Client) -> None:
r = client.post(
"/api/v1/analyze",
Expand All @@ -239,6 +259,7 @@ def test_phone_number_is_detected(self, client: httpx.Client) -> None:
types = {e["entity_type"] for e in r.json()["pii_entities"]}
assert "PHONE_NUMBER" in types

@pytest.mark.skip(reason="Pattern recognizers disabled in plugins.yaml")
def test_phone_has_higher_score_than_drivers_license(
self, client: httpx.Client
) -> None:
Expand Down
Loading
Loading