From c4d9b473553bc50286d10558ef51d3ad74a921e0 Mon Sep 17 00:00:00 2001 From: Fred Reiss Date: Fri, 1 May 2026 21:37:36 +0000 Subject: [PATCH 01/13] Fix tests that fail on machines without GPUs Signed-off-by: Fred Reiss --- mellea/formatters/granite/base/util.py | 6 ++- pyproject.toml | 2 +- .../granite/test_intrinsics_formatters.py | 53 +++++++++++++++++++ .../formatters/granite/test_output/.gitignore | 4 ++ .../answerability_answerable.json | 1 + .../answerability_simple.json | 3 +- .../answerability_unanswerable.json | 1 + .../test_canned_input/context_relevance.json | 3 +- 8 files changed, 69 insertions(+), 4 deletions(-) create mode 100644 test/formatters/granite/test_output/.gitignore diff --git a/mellea/formatters/granite/base/util.py b/mellea/formatters/granite/base/util.py index 3ca3587aa..ec4b746e1 100644 --- a/mellea/formatters/granite/base/util.py +++ b/mellea/formatters/granite/base/util.py @@ -162,9 +162,13 @@ def chat_completion_request_to_transformers_inputs( tokenizer_input = { "conversation": request["messages"], "add_generation_prompt": True, - "tools": request["tools"], } + # Copy tools if present. 
Do this carefully, because we can't guarantee that + # "tools is None" is the same as "tools not present" across all tokenizers + if "tools" in request: + tokenizer_input["tools"] = request["tools"] + # pylint: disable=unsupported-membership-test if ( request.get("extra_body") is not None diff --git a/pyproject.toml b/pyproject.toml index b13604a1b..5f5130a1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -302,7 +302,7 @@ split-on-trailing-comma = false install_types = true non_interactive = true disable_error_code = ["empty-body", "import-untyped"] -python_version = "3.11" +python_version = "3.12" exclude = [ "^tooling/", "^scratchpad/", diff --git a/test/formatters/granite/test_intrinsics_formatters.py b/test/formatters/granite/test_intrinsics_formatters.py index d1ae3618d..de6de43b4 100644 --- a/test/formatters/granite/test_intrinsics_formatters.py +++ b/test/formatters/granite/test_intrinsics_formatters.py @@ -38,6 +38,9 @@ def _read_file(name): _TEST_DATA_DIR = pathlib.Path(os.path.dirname(__file__)) / "testdata" +_TEST_OUTPUT_DIR = pathlib.Path(os.path.dirname(__file__)) / "test_output" +"""Directory where strings compared against files in _TEST_DATA_DIR are written""" + # Location from which our tests download adapters and YAML files _RAG_INTRINSICS_REPO_NAME = "ibm-granite/granitelib-rag-r1.0" _CORE_R1_REPO_NAME = "ibm-granite/granitelib-core-r1.0" @@ -48,6 +51,55 @@ def _read_file(name): _INPUT_ARGS_DIR = _TEST_DATA_DIR / "input_args" +def _substitute_root( + child_path: pathlib.Path, old_root: pathlib.Path, new_root: pathlib.Path +): + """Change a path rooted in one root directory to the same path rooted in another. + + Handles common corner cases such as when a given path has multiple equivalent + string representations. 
+ + :param child_path: A path that is a descendant of a known root + :param old_root: Root directory that is an ancestor of ``child_path`` + :param new_root: Root directory to substitute for ``old_root`` + + :returns: A version of ``child_path`` in which the prefix corresponding to + ``old_root`` has been replaced with ``new_root`` + """ + # Resolve paths to handle symlinks, relative components, and other corner cases + child_path = child_path.resolve() + old_root = old_root.resolve() + new_root = new_root.resolve() + + # Get the relative path from old_root to child_path + try: + relative_path = child_path.relative_to(old_root) + except ValueError: + raise ValueError(f"{child_path} is not a descendant of {old_root}") + + # Construct new path with the new root + return new_root / relative_path + + +def _dump_output(expected_file: pathlib.Path, actual_string: str): + """Dump outputs to disk to aid debugging. + + Given the string representation of something that the current test is about to + compare against a canned output and the location of said canned output, write + the string to a controlled place on the filesystem to aid debugging. 
+ + :param expected_file: Location of the file we're going to compare against + :param actual_string: String that the current test case produced + """ + actual_file = _substitute_root(expected_file, _TEST_DATA_DIR, _TEST_OUTPUT_DIR) + + if not os.path.exists(actual_file.parent): + os.makedirs(actual_file.parent) + + with open(actual_file, "w", encoding="utf-8") as f: + f.write(actual_string) + + class YamlJsonCombo(pydantic.BaseModel): """Dataclass that drives configuration for most tests in this file.""" @@ -428,6 +480,7 @@ def test_canned_input(yaml_json_combo_no_alora): after_json = after.model_dump_json(indent=2) expected_file = _CANNED_INPUT_EXPECTED_DIR / f"{cfg.short_name}.json" + _dump_output(expected_file, after_json) with open(expected_file, encoding="utf-8") as f: expected_json = f.read() diff --git a/test/formatters/granite/test_output/.gitignore b/test/formatters/granite/test_output/.gitignore new file mode 100644 index 000000000..07c5555a0 --- /dev/null +++ b/test/formatters/granite/test_output/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything... 
+* +# ...except this file +!.gitignore \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_input/answerability_answerable.json b/test/formatters/granite/testdata/test_canned_input/answerability_answerable.json index c8f127d67..5843e43e5 100644 --- a/test/formatters/granite/testdata/test_canned_input/answerability_answerable.json +++ b/test/formatters/granite/testdata/test_canned_input/answerability_answerable.json @@ -26,5 +26,6 @@ } } }, + "temperature": 0.0, "max_completion_tokens": 6 } \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_input/answerability_simple.json b/test/formatters/granite/testdata/test_canned_input/answerability_simple.json index 3a42fc67e..d247c43d3 100644 --- a/test/formatters/granite/testdata/test_canned_input/answerability_simple.json +++ b/test/formatters/granite/testdata/test_canned_input/answerability_simple.json @@ -16,5 +16,6 @@ } } }, - "max_completion_tokens": 6 + "max_completion_tokens": 6, + "temperature": 0.0 } \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_input/answerability_unanswerable.json b/test/formatters/granite/testdata/test_canned_input/answerability_unanswerable.json index 8475fd979..4d390b598 100644 --- a/test/formatters/granite/testdata/test_canned_input/answerability_unanswerable.json +++ b/test/formatters/granite/testdata/test_canned_input/answerability_unanswerable.json @@ -30,5 +30,6 @@ } } }, + "temperature": 0.0, "max_completion_tokens": 6 } \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_input/context_relevance.json b/test/formatters/granite/testdata/test_canned_input/context_relevance.json index 05ecc0562..9ce036c51 100644 --- a/test/formatters/granite/testdata/test_canned_input/context_relevance.json +++ b/test/formatters/granite/testdata/test_canned_input/context_relevance.json @@ -30,5 +30,6 @@ ] } } - } + }, + "temperature": 0.0 } \ No newline at end of file 
From 0140a1f71dde1c77af03e6a85a4fae827af020b2 Mon Sep 17 00:00:00 2001 From: Fred Reiss Date: Mon, 4 May 2026 18:56:33 +0000 Subject: [PATCH 02/13] Fix warning Signed-off-by: Fred Reiss --- mellea/formatters/template_formatter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mellea/formatters/template_formatter.py b/mellea/formatters/template_formatter.py index d6c17796a..e21066cc7 100644 --- a/mellea/formatters/template_formatter.py +++ b/mellea/formatters/template_formatter.py @@ -145,7 +145,7 @@ def _render_representation( if expected_vars: unused_keys = set(stringified_template_args.keys()) - expected_vars if unused_keys: - MelleaLogger.get_logger().warn( + MelleaLogger.get_logger().warning( f"TemplateRepresentation for {representation.obj.__class__.__name__} " f"provides keys not referenced by template '{template.name}': {sorted(unused_keys)}" ) From ca6a74f045804ef5a83912da1a9a37b19f29d02f Mon Sep 17 00:00:00 2001 From: Fred Reiss Date: Mon, 4 May 2026 19:03:34 +0000 Subject: [PATCH 03/13] Fix broken tests Signed-off-by: Fred Reiss --- .../granite/test_intrinsics_formatters.py | 14 ++--- .../hallucination_detection.json | 2 +- .../intrinsic/test_output/.gitignore | 4 ++ test/stdlib/components/intrinsic/test_rag.py | 57 +++++++++++++------ .../testdata/output_json/citations.json | 31 ++++++---- .../output_json/hallucination_detection.json | 27 +++++---- 6 files changed, 88 insertions(+), 47 deletions(-) create mode 100644 test/stdlib/components/intrinsic/test_output/.gitignore diff --git a/test/formatters/granite/test_intrinsics_formatters.py b/test/formatters/granite/test_intrinsics_formatters.py index 7b51a9db8..ddadfb6f1 100644 --- a/test/formatters/granite/test_intrinsics_formatters.py +++ b/test/formatters/granite/test_intrinsics_formatters.py @@ -767,15 +767,13 @@ def test_run_transformers(yaml_json_combo_with_model, gh_run): # Output processing transformed_responses = result_processor.transform(responses, transformed_input) - - # 
Pull this string out of the debugger to create a fresh expected file. transformed_str = transformed_responses.model_dump_json(indent=4) - print(transformed_str) - with open( - _TEST_DATA_DIR / f"test_run_transformers/{cfg.short_name}.json", - encoding="utf-8", - ) as f: + # If you are certain that the output is correct, you can use the file written here + # to create a fresh expected file. + expected_file = _TEST_DATA_DIR / f"test_run_transformers/{cfg.short_name}.json" + _dump_output(expected_file, transformed_str) + with open(expected_file, encoding="utf-8") as f: expected = ChatCompletionResponse.model_validate_json(f.read()) # expected_str = expected.model_dump_json(indent=4) @@ -802,7 +800,7 @@ def test_run_transformers(yaml_json_combo_with_model, gh_run): assert t_json == pytest.approx(e_json, abs=0.1) except AssertionError as e: - # Known intermittent failure under Transformers 5.0 + # Known intermittent failure under Transformers 5.0. if cfg.short_name == "hallucination_detection": pytest.xfail("Known failure due to Transformers 5.0") raise e diff --git a/test/formatters/granite/testdata/test_run_transformers/hallucination_detection.json b/test/formatters/granite/testdata/test_run_transformers/hallucination_detection.json index 6448ad68a..4460bdc98 100644 --- a/test/formatters/granite/testdata/test_run_transformers/hallucination_detection.json +++ b/test/formatters/granite/testdata/test_run_transformers/hallucination_detection.json @@ -3,7 +3,7 @@ { "index": 0, "message": { - "content": "[{\"response_begin\": 0, \"response_end\": 31, \"response_text\": \"Purple bumble fish are yellow. \", \"faithfulness_likelihood\": 0.5825082205614557, \"explanation\": \"This sentence makes a factual claim about the color of purple bumble fish. The document states 'The only type of fish that is yellow is the purple bumble fish.' 
This directly supports the claim in the sentence.\"}, {\"response_begin\": 31, \"response_end\": 65, \"response_text\": \"Green bumble fish are also yellow.\", \"faithfulness_likelihood\": 0.09613224257737445, \"explanation\": \"This sentence makes a factual claim about the color of green bumble fish. However, the document does not mention green bumble fish at all. Therefore, this claim cannot be verified from the provided context.\"}]", + "content": "[{\"response_begin\": 0, \"response_end\": 31, \"response_text\": \"Purple bumble fish are yellow. \", \"faithfulness\": \"faithful\", \"explanation\": \"This sentence makes a factual claim about the color of purple bumble fish. The provided context states: 'The only type of fish that is yellow is the purple bumble fish.' This directly supports the claim in the sentence.\"}, {\"response_begin\": 31, \"response_end\": 65, \"response_text\": \"Green bumble fish are also yellow.\", \"faithfulness\": \"unfaithful\", \"explanation\": \"This sentence makes a factual claim about the color of green bumble fish. However, the provided context does not mention green bumble fish or their color. Therefore, the claim cannot be verified based on the context.\"}]", "role": "assistant" } } diff --git a/test/stdlib/components/intrinsic/test_output/.gitignore b/test/stdlib/components/intrinsic/test_output/.gitignore new file mode 100644 index 000000000..07c5555a0 --- /dev/null +++ b/test/stdlib/components/intrinsic/test_output/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything... 
+* +# ...except this file +!.gitignore \ No newline at end of file diff --git a/test/stdlib/components/intrinsic/test_rag.py b/test/stdlib/components/intrinsic/test_rag.py index 14d7df06b..205f2c30b 100644 --- a/test/stdlib/components/intrinsic/test_rag.py +++ b/test/stdlib/components/intrinsic/test_rag.py @@ -31,6 +31,9 @@ DATA_ROOT = pathlib.Path(os.path.dirname(__file__)) / "testdata" """Location of data files for the tests in this file.""" +TEST_OUTPUT_ROOT = pathlib.Path(os.path.dirname(__file__)) / "test_output" +"""Location where the tests in this file dump internal outputs for debugging.""" + @pytest.fixture(name="backend", scope="module") def _backend(): @@ -73,16 +76,30 @@ def _read_input_json(file_name: str): def _read_output_json(file_name: str): """Shared code for reading canned outputs stored in JSON files and converting to Mellea types. + + By convention, canned outputs hold the contents of + ``completion["choices"][0]["message"]["content"]``, + where ``completion`` is a JSON chat completion after post-processing. """ with open(DATA_ROOT / "output_json" / file_name, encoding="utf-8") as f: json_data = json.load(f) + return json_data + - # Output is in OpenAI chat completion response format. Assume only one choice. - result_str = json_data["choices"][0]["message"]["content"] +def _dump_output_json(file_name: str, to_write): + """Shared code for dumping a test's generated JSON data. + + Dump the Python data structures that will be compared against canned + JSON output files. Outputs go to the local directory ``test_output``. + + If you are sure the current output is correct, you can use this output to update + the contents of the ``testdata`` directory. 
+ """ + target_path = TEST_OUTPUT_ROOT / "output_json" / file_name + if not os.path.exists(target_path.parent): + os.makedirs(target_path.parent) + with open(target_path, "w", encoding="utf-8") as f: + json.dump(to_write, f, indent=2) @pytest.mark.qualitative @@ -116,7 +133,6 @@ def test_query_rewrite(backend): assert result == expected -@pytest.mark.xfail(reason="Non-deterministic citation boundaries across environments") @pytest.mark.qualitative def test_citations(backend): """Verify that the citations intrinsic functions properly.""" @@ -125,7 +141,14 @@ def test_citations(backend): # First call triggers adapter loading result = rag.find_citations(assistant_response, docs, context, backend) - assert result == expected + _dump_output_json("citations.json", result) + # There are some known differences between GPU and CPU output due to different + # matrix multiply implementations. Ignore those differences but attempt to complete + # the test when they are not present. + try: + assert result == expected + except AssertionError as ae: + pytest.xfail(f"Known differences across platforms. Diff was: {ae}") # Second call hits a different code path from the first one result = rag.find_citations(assistant_response, docs, context, backend) @@ -157,14 +180,12 @@ def test_hallucination_detection(backend): # First call triggers adapter loading result = rag.flag_hallucinated_content(assistant_response, docs, context, backend) - # pytest.approx() chokes on lists of records, so we do this complicated dance. 
- for r, e in zip(result, expected, strict=True): # type: ignore - assert r == e + _dump_output_json("hallucination_detection.json", result) + assert result == expected # Second call hits a different code path from the first one result = rag.flag_hallucinated_content(assistant_response, docs, context, backend) - for r, e in zip(result, expected, strict=True): # type: ignore - assert r == e + assert result == expected @pytest.mark.qualitative @@ -231,7 +252,6 @@ def test_query_rewrite_resolve(backend): assert result == expected -@pytest.mark.xfail(reason="Non-deterministic citation boundaries across environments") @pytest.mark.qualitative def test_citations_resolve(backend): """Verify citations when response is resolved from context.""" @@ -240,7 +260,13 @@ def test_citations_resolve(backend): expected = _read_output_json("citations.json") result = rag.find_citations(None, docs, context, backend) - assert result == expected + # There are some known differences between GPU and CPU output due to different + # matrix multiply implementations. Ignore those differences but attempt to complete + # the test when they are not present. + try: + assert result == expected + except AssertionError as ae: + pytest.xfail(f"Known differences across platforms. 
Diff was: {ae}") @pytest.mark.qualitative @@ -262,8 +288,7 @@ def test_hallucination_detection_resolve(backend): expected = _read_output_json("hallucination_detection.json") result = rag.flag_hallucinated_content(None, docs, context, backend) - for r, e in zip(result, expected, strict=True): # type: ignore - assert r == e + assert result == expected @pytest.mark.qualitative diff --git a/test/stdlib/components/intrinsic/testdata/output_json/citations.json b/test/stdlib/components/intrinsic/testdata/output_json/citations.json index 67dc2bb51..d62175369 100644 --- a/test/stdlib/components/intrinsic/testdata/output_json/citations.json +++ b/test/stdlib/components/intrinsic/testdata/output_json/citations.json @@ -1,11 +1,20 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "[{\"response_begin\": 0, \"response_end\": 91, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 692, \"citation_end\": 1030, \"citation_text\": \"He began to direct his attention to acquisition and expansion, buying the troubled Sunday Times in Perth, Western Australia (1956) and over the next few years acquiring suburban and provincial newspapers in New South Wales, Queensland, Victoria and the Northern Territory, including the Sydney afternoon tabloid, The Daily Mirror (1960). \"}, {\"response_begin\": 0, \"response_end\": 91, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 1219, \"citation_end\": 1346, \"citation_text\": \"Murdoch's first foray outside Australia involved the purchase of a controlling interest in the New Zealand daily The Dominion. 
\"}]", - "role": "assistant" - } - } - ] -} \ No newline at end of file +[ + { + "response_begin": 0, + "response_end": 91, + "response_text": "Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. ", + "citation_doc_id": "0", + "citation_begin": 598, + "citation_end": 1030, + "citation_text": "Rupert Murdoch turned its Adelaide newspaper, The News, its main asset, into a major success. He began to direct his attention to acquisition and expansion, buying the troubled Sunday Times in Perth, Western Australia (1956) and over the next few years acquiring suburban and provincial newspapers in New South Wales, Queensland, Victoria and the Northern Territory, including the Sydney afternoon tabloid, The Daily Mirror (1960). " + }, + { + "response_begin": 0, + "response_end": 91, + "response_text": "Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. ", + "citation_doc_id": "0", + "citation_begin": 1219, + "citation_end": 1346, + "citation_text": "Murdoch's first foray outside Australia involved the purchase of a controlling interest in the New Zealand daily The Dominion. " + } +] \ No newline at end of file diff --git a/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json b/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json index c8763bbec..21924c116 100644 --- a/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json +++ b/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json @@ -1,11 +1,16 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "[{\"response_begin\": 0, \"response_end\": 31, \"response_text\": \"Purple bumble fish are yellow. \", \"faithfulness\": \"faithful\", \"explanation\": \"This sentence makes a factual claim about the color of purple bumble fish. The document states 'The only type of fish that is yellow is the purple bumble fish.' 
This directly supports the claim in the sentence.\"}, {\"response_begin\": 31, \"response_end\": 65, \"response_text\": \"Green bumble fish are also yellow.\", \"faithfulness\": \"unfaithful\", \"explanation\": \"This sentence makes a factual claim about the color of green bumble fish. However, the document does not mention green bumble fish at all. Therefore, this claim cannot be verified from the provided context.\"}]", - "role": "assistant" - } - } - ] -} +[ + { + "response_begin": 0, + "response_end": 31, + "response_text": "Purple bumble fish are yellow. ", + "faithfulness": "faithful", + "explanation": "This sentence makes a factual claim about the color of purple bumble fish. The provided context states: 'The only type of fish that is yellow is the purple bumble fish.' This directly supports the claim in the sentence." + }, + { + "response_begin": 31, + "response_end": 65, + "response_text": "Green bumble fish are also yellow.", + "faithfulness": "unfaithful", + "explanation": "This sentence makes a factual claim about the color of green bumble fish. However, the provided context does not mention green bumble fish, so the claim cannot be verified based on the context." 
+ } +] \ No newline at end of file From 46b556d3ab4224a9f4942f55701eeb5b36e70d8f Mon Sep 17 00:00:00 2001 From: Fred Reiss Date: Mon, 4 May 2026 21:40:53 +0000 Subject: [PATCH 04/13] Remove references to deprecated models in tests Signed-off-by: Fred Reiss --- AGENTS.md | 3 +- docs/docs/advanced/intrinsics.md | 24 - .../tutorials/04-making-agents-reliable.md | 4 +- docs/examples/granite-switch/README.md | 2 +- docs/examples/intrinsics/README.md | 6 +- docs/examples/intrinsics/context_relevance.py | 32 - mellea/backends/adapters/catalog.py | 1 - .../stdlib/components/intrinsic/guardian.py | 8 - mellea/stdlib/components/intrinsic/rag.py | 39 -- .../granite/test_intrinsics_formatters.py | 66 --- .../test_canned_input/context_relevance.json | 35 -- .../test_canned_input/requirement_check.json | 40 -- .../test_canned_input/uncertainty.json | 54 -- .../expected_result/context_relevance.json | 11 - .../expected_result/requirement_check.json | 29 - .../expected_result/uncertainty.json | 11 - .../model_output/context_relevance.json | 11 - .../model_output/requirement_check.json | 373 ------------ .../model_output/uncertainty.json | 560 ------------------ .../test_run_ollama/context_relevance.json | 11 - .../context_relevance.json | 11 - .../context_relevance_alora.json | 11 - .../requirement_check.json | 11 - .../requirement_check_alora.json | 11 - .../test_run_transformers/uncertainty.json | 11 - .../uncertainty_alora.json | 11 - test/stdlib/components/intrinsic/test_rag.py | 28 - 27 files changed, 5 insertions(+), 1409 deletions(-) delete mode 100644 docs/examples/intrinsics/context_relevance.py delete mode 100644 test/formatters/granite/testdata/test_canned_input/context_relevance.json delete mode 100644 test/formatters/granite/testdata/test_canned_input/requirement_check.json delete mode 100644 test/formatters/granite/testdata/test_canned_input/uncertainty.json delete mode 100644 test/formatters/granite/testdata/test_canned_output/expected_result/context_relevance.json 
delete mode 100644 test/formatters/granite/testdata/test_canned_output/expected_result/requirement_check.json delete mode 100644 test/formatters/granite/testdata/test_canned_output/expected_result/uncertainty.json delete mode 100644 test/formatters/granite/testdata/test_canned_output/model_output/context_relevance.json delete mode 100644 test/formatters/granite/testdata/test_canned_output/model_output/requirement_check.json delete mode 100644 test/formatters/granite/testdata/test_canned_output/model_output/uncertainty.json delete mode 100644 test/formatters/granite/testdata/test_run_ollama/context_relevance.json delete mode 100644 test/formatters/granite/testdata/test_run_transformers/context_relevance.json delete mode 100644 test/formatters/granite/testdata/test_run_transformers/context_relevance_alora.json delete mode 100644 test/formatters/granite/testdata/test_run_transformers/requirement_check.json delete mode 100644 test/formatters/granite/testdata/test_run_transformers/requirement_check_alora.json delete mode 100644 test/formatters/granite/testdata/test_run_transformers/uncertainty.json delete mode 100644 test/formatters/granite/testdata/test_run_transformers/uncertainty_alora.json diff --git a/AGENTS.md b/AGENTS.md index cb07d5b31..42bde8be4 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -178,7 +178,6 @@ Intrinsics are specialized LoRA adapters that add task-specific capabilities (RA | `rag` | `rewrite_question(question, context, backend)` | Rewrite question into a retrieval query | | `rag` | `clarify_query(question, documents, context, backend)` | Generate clarification or return "CLEAR" | | `rag` | `find_citations(response, documents, context, backend)` | Document sentences supporting the response | -| `rag` | `check_context_relevance(question, document, context, backend)` | Whether a document is relevant (0–1); only supported for granite-4.0, not granite-4.1 | | `rag` | `flag_hallucinated_content(response, documents, context, backend)` | Flag potentially 
hallucinated sentences | ```python @@ -212,7 +211,7 @@ When adding support for a new intrinsic (not just using an existing one), fetch | Repo | Purpose | Intrinsics | |------|---------|------------| -| [`ibm-granite/granitelib-rag-r1.0`](https://huggingface.co/ibm-granite/granitelib-rag-r1.0) | RAG pipeline | answerability, citations, context_relevance, hallucination_detection, query_rewrite, query_clarification | +| [`ibm-granite/granitelib-rag-r1.0`](https://huggingface.co/ibm-granite/granitelib-rag-r1.0) | RAG pipeline | answerability, citations, hallucination_detection, query_rewrite, query_clarification | | [`ibm-granite/granitelib-core-r1.0`](https://huggingface.co/ibm-granite/granitelib-core-r1.0) | Core capabilities | context-attribution, requirement-check, uncertainty | | [`ibm-granite/granitelib-guardian-r1.0`](https://huggingface.co/ibm-granite/granitelib-guardian-r1.0) | Safety & compliance | guardian-core, policy-guardrails, factuality-detection, factuality-correction | diff --git a/docs/docs/advanced/intrinsics.md b/docs/docs/advanced/intrinsics.md index 317295cb7..e6b03dd31 100644 --- a/docs/docs/advanced/intrinsics.md +++ b/docs/docs/advanced/intrinsics.md @@ -71,30 +71,6 @@ print(rag.check_answerability(question, docs_answerable, context, backend)) # print(rag.check_answerability(question, docs_not_answerable, context, backend)) # False ``` -## Context relevance - -Assess whether a document is relevant to a question: - -```python -# Requires: mellea[hf] -# Returns: float -from mellea.backends.huggingface import LocalHFBackend -from mellea.stdlib.components import Document -from mellea.stdlib.components.intrinsic import rag -from mellea.stdlib.context import ChatContext - -backend = LocalHFBackend(model_id="ibm-granite/granite-4.0-micro") -context = ChatContext() -question = "Who is the CEO of Microsoft?" -document = Document( - "Microsoft Corporation is an American multinational corporation " - "headquartered in Redmond, Washington." 
-) - -result = rag.check_context_relevance(question, document, context, backend) -print(result) # False — the document does not mention the CEO -``` - ## Hallucination detection Flag sentences in an assistant response that are not grounded in the source documents: diff --git a/docs/docs/tutorials/04-making-agents-reliable.md b/docs/docs/tutorials/04-making-agents-reliable.md index 81a21d1a4..73dd2a756 100644 --- a/docs/docs/tutorials/04-making-agents-reliable.md +++ b/docs/docs/tutorials/04-making-agents-reliable.md @@ -397,7 +397,7 @@ and dynamic applications with ease. The word "Mellea" consists of Scores are floats between 0.0 (safe) and 1.0 (risk detected); 0.5 is the threshold. The available criteria are: `"harm"`, `"jailbreak"`, `"social_bias"`, `"profanity"`, `"violence"`, `"sexual_content"`, `"unethical_behavior"`, `"groundedness"`, -`"answer_relevance"`, `"context_relevance"`, and `"function_call"`. +`"answer_relevance"`, and `"function_call"`. --- @@ -474,7 +474,7 @@ for criterion in criteria: The available criteria are: `"harm"`, `"jailbreak"`, `"social_bias"`, `"profanity"`, `"violence"`, `"sexual_content"`, `"unethical_behavior"`, `"groundedness"`, -`"answer_relevance"`, `"context_relevance"`, and `"function_call"`. +`"answer_relevance"`, and `"function_call"`. --- diff --git a/docs/examples/granite-switch/README.md b/docs/examples/granite-switch/README.md index 30f4bfece..374548d30 100644 --- a/docs/examples/granite-switch/README.md +++ b/docs/examples/granite-switch/README.md @@ -28,7 +28,7 @@ python -m vllm.entrypoints.openai.api_server \ Not all intrinsics are embedded in every Granite Switch model. Check the model's `adapter_index.json` for the list of available adapters. The current model -includes: `answerability`, `citations`, `context_relevance`, `guardian-core`, +includes: `answerability`, `citations`, `guardian-core`, `hallucination_detection`, `query_clarification`, `query_rewrite`, and `requirement-check`. 
diff --git a/docs/examples/intrinsics/README.md b/docs/examples/intrinsics/README.md index bf50c914b..df0bfcfe0 100644 --- a/docs/examples/intrinsics/README.md +++ b/docs/examples/intrinsics/README.md @@ -19,9 +19,6 @@ Checks if a question can be answered given the context. ### citations.py Validates and extracts citations from generated text. -### context_relevance.py -Assesses if retrieved context is relevant to a query. - ### hallucination_detection.py Detects when model outputs contain hallucinated information. @@ -104,7 +101,7 @@ For complete runnable examples using the OpenAI backend with Granite Switch, see [`../granite-switch/`](../granite-switch/). > **Note:** Not all intrinsics are embedded in every Granite Switch model. The -> current model includes: `answerability`, `citations`, `context_relevance`, +> current model includes: `answerability`, `citations`, > `guardian-core`, `hallucination_detection`, `query_clarification`, > `query_rewrite`, and `requirement-check`. Check the model's > `adapter_index.json` for the full list. @@ -114,7 +111,6 @@ see [`../granite-switch/`](../granite-switch/). - **requirement_check**: Validate requirements (used by ALoraRequirement) - **answerability**: Determine if question is answerable - **citations**: Extract and validate citations -- **context_relevance**: Assess context-query relevance - **hallucination_detection**: Detect hallucinated content - **query_rewrite**: Improve query formulation - **uncertainty**: Estimate certainty about answering a question diff --git a/docs/examples/intrinsics/context_relevance.py b/docs/examples/intrinsics/context_relevance.py deleted file mode 100644 index 2080f8141..000000000 --- a/docs/examples/intrinsics/context_relevance.py +++ /dev/null @@ -1,32 +0,0 @@ -# pytest: huggingface, e2e - -"""Example usage of the context relevance intrinsic for RAG applications. 
- -To run this script from the root of the Mellea source tree, use the command: -``` -uv run python docs/examples/intrinsics/context_relevance.py -``` -""" - -from mellea import model_ids, start_backend -from mellea.stdlib.components.intrinsic import rag - -ctx, backend = start_backend( - "hf", model_id=model_ids.IBM_GRANITE_4_MICRO_3B, context_type="chat" -) -# NOTE: this example uses Granite 4.0 micro because there is no context_relevance intrinsic for Graniet 4.1 - -question = "Who is the CEO of Microsoft?" -document = ( - # Document text does not say who is the CEO. - "Microsoft Corporation is an American multinational corporation and technology " - "conglomerate headquartered in Redmond, Washington.[2] Founded in 1975, the " - "company became influential in the rise of personal computers through software " - "like Windows, and the company has since expanded to Internet services, cloud " - "computing, video gaming and other fields. Microsoft is the largest software " - "maker, one of the most valuable public U.S. companies,[a] and one of the most " - "valuable brands globally." 
-) - -result = rag.check_context_relevance(question, document, ctx, backend) -print(f"Result of context relevance check with irrelevant document: {result}") diff --git a/mellea/backends/adapters/catalog.py b/mellea/backends/adapters/catalog.py index 9edeea2e4..807376dc5 100644 --- a/mellea/backends/adapters/catalog.py +++ b/mellea/backends/adapters/catalog.py @@ -76,7 +76,6 @@ class IntriniscsCatalogEntry(pydantic.BaseModel): ############################################ IntriniscsCatalogEntry(name="answerability", repo_id=_RAG_REPO), IntriniscsCatalogEntry(name="citations", repo_id=_RAG_REPO), - IntriniscsCatalogEntry(name="context_relevance", repo_id=_RAG_REPO), IntriniscsCatalogEntry(name="hallucination_detection", repo_id=_RAG_REPO), IntriniscsCatalogEntry(name="query_clarification", repo_id=_RAG_REPO), IntriniscsCatalogEntry(name="query_rewrite", repo_id=_RAG_REPO), diff --git a/mellea/stdlib/components/intrinsic/guardian.py b/mellea/stdlib/components/intrinsic/guardian.py index 3dcc843a9..5da8ca8ab 100644 --- a/mellea/stdlib/components/intrinsic/guardian.py +++ b/mellea/stdlib/components/intrinsic/guardian.py @@ -107,14 +107,6 @@ def policy_guardrails( "irrelevant if it fails to directly answer or meet the specific " "intent of the query." ), - "context_relevance": ( - "A document is deemed irrelevant when it doesn't contain information " - "pertinent to the query's specific needs. This means the retrieved or " - "provided content fails to adequately address the question at hand. " - "Irrelevant information could be on a different topic, originate from " - "an unrelated field, or simply not offer any valuable insights for " - "crafting a suitable response." 
- ), "function_call": ( "Function call hallucination occurs when a text includes function " "calls that either don't adhere to the correct format defined by the " diff --git a/mellea/stdlib/components/intrinsic/rag.py b/mellea/stdlib/components/intrinsic/rag.py index ea77422b8..f390c4ddc 100644 --- a/mellea/stdlib/components/intrinsic/rag.py +++ b/mellea/stdlib/components/intrinsic/rag.py @@ -154,45 +154,6 @@ def find_citations( return result_json -def check_context_relevance( - question: str | None, - document: str | Document, - context: ChatContext, - backend: AdapterMixin, -) -> str: - """Test whether a document is relevant to a user's question. - - Intrinsic function that checks whether a single document contains part or all of - the answer to a user's question. Does not consider the context in which the - question was asked. - - Args: - question: Question that the user has posed. When ``None``, the question - is extracted from the last user message in ``context``. - document: A retrieved document snippet. May be a ``Document`` or a plain - string (automatically wrapped in ``Document``). - context: The chat up to the point where the user asked a question. - backend: Backend instance that supports the adapters that implement this - intrinsic. 
- - Returns: - Context relevance judgement as one of the following strings: - - "relevant" - - "irrelevant" - - "partially relevant" - """ - question, context = _resolve_question(question, context, backend) - document = _coerce_to_document(document) - result_json = call_intrinsic( - "context_relevance", - context.add(Message("user", question)), - backend, - # Target document is passed as an argument - kwargs={"document_content": document.text}, - ) - return result_json["context_relevance"] - - def flag_hallucinated_content( response: str | None, documents: collections.abc.Iterable[str | Document], diff --git a/test/formatters/granite/test_intrinsics_formatters.py b/test/formatters/granite/test_intrinsics_formatters.py index ddadfb6f1..53e4582b1 100644 --- a/test/formatters/granite/test_intrinsics_formatters.py +++ b/test/formatters/granite/test_intrinsics_formatters.py @@ -183,13 +183,6 @@ def _resolve_yaml(self): inputs_file=_INPUT_JSON_DIR / "hallucination_detection.json", task="hallucination_detection", ), - # aLoRA adapter for this intrinsic not currently available - # YamlJsonCombo( - # short_name="hallucination_detection_alora", - # inputs_file=_INPUT_JSON_DIR / "hallucination_detection.json", - # task="hallucination_detection", - # is_alora=True - # ), YamlJsonCombo( short_name="query_clarification", inputs_file=_INPUT_JSON_DIR / "query_clarification.json", @@ -200,70 +193,11 @@ def _resolve_yaml(self): inputs_file=_INPUT_JSON_DIR / "query_rewrite.json", task="query_rewrite", ), - # NOTE for the following two entries: - # The "requirement_check" intrinsic has not yet been ported to the latest format - # or to Granite 4.0. 
- YamlJsonCombo( - short_name="requirement_check", - inputs_file=_INPUT_JSON_DIR / "requirement_check.json", - arguments_file=_INPUT_ARGS_DIR / "requirement_check.json", - task="requirement_check", - # Granite 4.0 adapters not currently available - repo_id="ibm-granite/rag-intrinsics-lib", - base_model_id="ibm-granite/granite-3.3-2b-instruct", - ), - YamlJsonCombo( - short_name="requirement_check_alora", - inputs_file=_INPUT_JSON_DIR / "requirement_check.json", - arguments_file=_INPUT_ARGS_DIR / "requirement_check.json", - task="requirement_check", - is_alora=True, - # Granite 4.0 adapters not currently available - repo_id="ibm-granite/rag-intrinsics-lib", - base_model_id="ibm-granite/granite-3.3-2b-instruct", - ), - YamlJsonCombo( - short_name="uncertainty", - inputs_file=_INPUT_JSON_DIR / "uncertainty.json", - task="uncertainty", - # Granite 4.0 adapters not currently available - repo_id="ibm-granite/granitelib-core-r1.0", - revision="c9c189f5ad0b2890660397070613fda46d6ceb80", - ), - # aLoRA adapter for this intrinsic not currently available - # YamlJsonCombo( - # short_name="uncertainty_alora", - # inputs_file=_INPUT_JSON_DIR / "uncertainty.json", - # task="uncertainty", - # is_alora=True, - # # Granite 4.0 adapters not currently available - # repo_id="ibm-granite/granitelib-core-r1.0", - # ), - YamlJsonCombo( - short_name="context_relevance", - inputs_file=_INPUT_JSON_DIR / "context_relevance.json", - arguments_file=_INPUT_ARGS_DIR / "context_relevance.json", - task="context_relevance", - ), - YamlJsonCombo( - short_name="context_relevance_alora", - inputs_file=_INPUT_JSON_DIR / "context_relevance.json", - arguments_file=_INPUT_ARGS_DIR / "context_relevance.json", - task="context_relevance", - is_alora=True, - ), YamlJsonCombo( short_name="citations", inputs_file=_INPUT_JSON_DIR / "citations.json", task="citations", ), - # aLoRA adapter for this intrinsic not currently available - # YamlJsonCombo( - # short_name="citations_alora", - # 
inputs_file=_INPUT_JSON_DIR / "citations.json", - # task="citations", - # is_alora=True, - # ), YamlJsonCombo( short_name="context-attribution", inputs_file=_INPUT_JSON_DIR / "context-attribution.json", diff --git a/test/formatters/granite/testdata/test_canned_input/context_relevance.json b/test/formatters/granite/testdata/test_canned_input/context_relevance.json deleted file mode 100644 index 9ce036c51..000000000 --- a/test/formatters/granite/testdata/test_canned_input/context_relevance.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "messages": [ - { - "content": "Who is the CEO of Microsoft?", - "role": "user" - }, - { - "content": "DOCUMENT: Microsoft Corporation is an American multinational corporation and technology conglomerate headquartered in Redmond, Washington.[2] Founded in 1975, the company became influential in the rise of personal computers through software like Windows, and the company has since expanded to Internet services, cloud computing, video gaming and other fields. Microsoft is the largest software maker, one of the most valuable public U.S. 
companies,[a] and one of the most valuable brands globally.\n", - "role": "user" - } - ], - "extra_body": { - "structured_outputs": { - "json": { - "title": "ContextRelevanceOutput", - "type": "object", - "properties": { - "context_relevance": { - "type": "string", - "description": "Context relevancy judgment.", - "enum": [ - "relevant", - "irrelevant", - "partially relevant" - ] - } - }, - "required": [ - "context_relevance" - ] - } - } - }, - "temperature": 0.0 -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_input/requirement_check.json b/test/formatters/granite/testdata/test_canned_input/requirement_check.json deleted file mode 100644 index f110d3901..000000000 --- a/test/formatters/granite/testdata/test_canned_input/requirement_check.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "messages": [ - { - "content": "Hello there, welcome to math questions!", - "role": "assistant" - }, - { - "content": "What is the square root of 4?", - "role": "user" - }, - { - "content": "The square root of 4 is 2.", - "role": "assistant" - }, - { - "content": "What is six times seven?", - "role": "user" - }, - { - "content": "Check if this requirement was satisfied:\n The user's question is not one of the homework questions given in the provided documents.\n", - "role": "user" - } - ], - "extra_body": { - "documents": [ - { - "text": "\nHere's a list of math problems that are on the homework assignment:\n\n1) 12+5=17\n2) 20-8=12\n3) 6*7=42\n4) 45/9=5\n5) 13*2=26\n6) 99+11=110\n7) 100-35=65\n\n", - "doc_id": "1" - } - ], - "structured_outputs": { - "json": { - "type": "boolean" - } - } - }, - "max_completion_tokens": 5, - "logprobs": true, - "top_logprobs": 10 -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_input/uncertainty.json b/test/formatters/granite/testdata/test_canned_input/uncertainty.json deleted file mode 100644 index 70d584a6e..000000000 --- 
a/test/formatters/granite/testdata/test_canned_input/uncertainty.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "messages": [ - { - "content": "Welcome to pet questions!", - "role": "assistant" - }, - { - "content": "Which of my pets have fleas?", - "role": "user" - } - ], - "extra_body": { - "documents": [ - { - "text": "My dog has fleas.", - "doc_id": "1" - }, - { - "text": "My cat does not have fleas.", - "doc_id": "2" - } - ], - "structured_outputs": { - "json": { - "type": "object", - "properties": { - "score": { - "type": "string", - "enum": [ - "0", - "1", - "2", - "3", - "4", - "5", - "6", - "7", - "8", - "9" - ] - } - }, - "required": [ - "score" - ], - "additionalProperties": false - } - } - }, - "max_completion_tokens": 15, - "temperature": 0.0, - "logprobs": true, - "top_logprobs": 10 -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_output/expected_result/context_relevance.json b/test/formatters/granite/testdata/test_canned_output/expected_result/context_relevance.json deleted file mode 100644 index 645d51516..000000000 --- a/test/formatters/granite/testdata/test_canned_output/expected_result/context_relevance.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "{\"context_relevance\": \"irrelevant\"}", - "role": "assistant" - } - } - ] -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_output/expected_result/requirement_check.json b/test/formatters/granite/testdata/test_canned_output/expected_result/requirement_check.json deleted file mode 100644 index 6ea04fe57..000000000 --- a/test/formatters/granite/testdata/test_canned_output/expected_result/requirement_check.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "{\"requirement_likelihood\": 1.0}", - "role": "assistant", - "tool_calls": [], - "reasoning_content": null - }, - "finish_reason": "stop" - } - ], - "prompt_logprobs": null, - 
"id": "chatcmpl-161ba1db8d6b4d05bbd5658b6d27c798", - "created": 1758304397, - "model": "requirement_check", - "object": "chat.completion", - "service_tier": null, - "system_fingerprint": null, - "usage": { - "completion_tokens": 2, - "prompt_tokens": 293, - "total_tokens": 295, - "completion_tokens_details": null, - "prompt_tokens_details": null - }, - "kv_transfer_params": null -} diff --git a/test/formatters/granite/testdata/test_canned_output/expected_result/uncertainty.json b/test/formatters/granite/testdata/test_canned_output/expected_result/uncertainty.json deleted file mode 100644 index 71426909b..000000000 --- a/test/formatters/granite/testdata/test_canned_output/expected_result/uncertainty.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "{\"certainty\": 0.4597152663768006}", - "role": "assistant" - } - } - ] -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_output/model_output/context_relevance.json b/test/formatters/granite/testdata/test_canned_output/model_output/context_relevance.json deleted file mode 100644 index ce3e97dfe..000000000 --- a/test/formatters/granite/testdata/test_canned_output/model_output/context_relevance.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "{\n \"context_relevance\": \"irrelevant\"\n}", - "role": "assistant" - } - } - ] -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_output/model_output/requirement_check.json b/test/formatters/granite/testdata/test_canned_output/model_output/requirement_check.json deleted file mode 100644 index 613baa490..000000000 --- a/test/formatters/granite/testdata/test_canned_output/model_output/requirement_check.json +++ /dev/null @@ -1,373 +0,0 @@ -{ - "id": "chatcmpl-161ba1db8d6b4d05bbd5658b6d27c798", - "choices": [ - { - "finish_reason": "stop", - "index": 0, - "logprobs": { - "content": [ - { - "token": "true", - 
"bytes": [ - 116, - 114, - 117, - 101 - ], - "logprob": -0.0052339909598231316, - "top_logprobs": [ - { - "token": "true", - "bytes": [ - 116, - 114, - 117, - 101 - ], - "logprob": -0.0052339909598231316 - }, - { - "token": "false", - "bytes": [ - 102, - 97, - 108, - 115, - 101 - ], - "logprob": -5.2552337646484375 - }, - { - "token": "tr", - "bytes": [ - 116, - 114 - ], - "logprob": -16.192733764648438 - }, - { - "token": "f", - "bytes": [ - 102 - ], - "logprob": -16.692733764648438 - }, - { - "token": "t", - "bytes": [ - 116 - ], - "logprob": -16.880233764648438 - }, - { - "token": "fa", - "bytes": [ - 102, - 97 - ], - "logprob": -18.505233764648438 - }, - { - "token": "fal", - "bytes": [ - 102, - 97, - 108 - ], - "logprob": -19.786483764648438 - }, - { - "token": "", - "bytes": [ - 60, - 102, - 105, - 109, - 95, - 109, - 105, - 100, - 100, - 108, - 101, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "", - "bytes": [ - 60, - 102, - 105, - 109, - 95, - 112, - 114, - 101, - 102, - 105, - 120, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "<|end_of_text|>", - "bytes": [ - 60, - 124, - 101, - 110, - 100, - 95, - 111, - 102, - 95, - 116, - 101, - 120, - 116, - 124, - 62 - ], - "logprob": -9999.0 - } - ] - }, - { - "token": "<|end_of_text|>", - "bytes": [ - 60, - 124, - 101, - 110, - 100, - 95, - 111, - 102, - 95, - 116, - 101, - 120, - 116, - 124, - 62 - ], - "logprob": 0.0, - "top_logprobs": [ - { - "token": "<|end_of_text|>", - "bytes": [ - 60, - 124, - 101, - 110, - 100, - 95, - 111, - 102, - 95, - 116, - 101, - 120, - 116, - 124, - 62 - ], - "logprob": 0.0 - }, - { - "token": "", - "bytes": [ - 60, - 102, - 105, - 109, - 95, - 109, - 105, - 100, - 100, - 108, - 101, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "", - "bytes": [ - 60, - 102, - 105, - 109, - 95, - 112, - 97, - 100, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "", - "bytes": [ - 60, - 105, - 115, - 115, - 117, - 101, - 95, - 115, - 116, - 97, - 114, - 116, - 62 - ], - "logprob": 
-9999.0 - }, - { - "token": "", - "bytes": [ - 60, - 105, - 115, - 115, - 117, - 101, - 95, - 99, - 111, - 109, - 109, - 101, - 110, - 116, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "", - "bytes": [ - 60, - 102, - 105, - 109, - 95, - 112, - 114, - 101, - 102, - 105, - 120, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "", - "bytes": [ - 60, - 102, - 105, - 109, - 95, - 115, - 117, - 102, - 102, - 105, - 120, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "", - "bytes": [ - 60, - 103, - 104, - 95, - 115, - 116, - 97, - 114, - 115, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "", - "bytes": [ - 60, - 105, - 115, - 115, - 117, - 101, - 95, - 99, - 108, - 111, - 115, - 101, - 100, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "", - "bytes": [ - 60, - 102, - 105, - 108, - 101, - 110, - 97, - 109, - 101, - 62 - ], - "logprob": -9999.0 - } - ] - } - ], - "refusal": null - }, - "message": { - "content": "true", - "refusal": null, - "role": "assistant", - "annotations": null, - "audio": null, - "function_call": null, - "tool_calls": [], - "reasoning_content": null - }, - "stop_reason": null - } - ], - "created": 1758304397, - "model": "requirement_check", - "object": "chat.completion", - "service_tier": null, - "system_fingerprint": null, - "usage": { - "completion_tokens": 2, - "prompt_tokens": 293, - "total_tokens": 295, - "completion_tokens_details": null, - "prompt_tokens_details": null - }, - "prompt_logprobs": null, - "kv_transfer_params": null -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_output/model_output/uncertainty.json b/test/formatters/granite/testdata/test_canned_output/model_output/uncertainty.json deleted file mode 100644 index cd88187a1..000000000 --- a/test/formatters/granite/testdata/test_canned_output/model_output/uncertainty.json +++ /dev/null @@ -1,560 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "{\"score\": \"4\"}", - "role": "assistant" - }, - 
"logprobs": { - "content": [ - { - "token": "{\"", - "logprob": -4.0531076592742465e-06, - "bytes": [ - 123, - 34 - ], - "top_logprobs": [ - { - "token": "{\"", - "logprob": -4.0531076592742465e-06, - "bytes": [ - 123, - 34 - ] - }, - { - "token": "{\n", - "logprob": -12.511640548706055, - "bytes": [ - 123, - 10 - ] - }, - { - "token": "{", - "logprob": -14.91580581665039, - "bytes": [ - 123 - ] - }, - { - "token": "{\n\n", - "logprob": -18.89350128173828, - "bytes": [ - 123, - 10, - 10 - ] - }, - { - "token": "{\n\n\n", - "logprob": -25.736167907714844, - "bytes": [ - 123, - 10, - 10, - 10 - ] - }, - { - "token": "(", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 40 - ] - }, - { - "token": "$", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 36 - ] - }, - { - "token": ")", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 41 - ] - }, - { - "token": "\"", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 34 - ] - }, - { - "token": "%", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 37 - ] - } - ] - }, - { - "token": "score", - "logprob": 0.0, - "bytes": [ - 115, - 99, - 111, - 114, - 101 - ], - "top_logprobs": [ - { - "token": "score", - "logprob": 0.0, - "bytes": [ - 115, - 99, - 111, - 114, - 101 - ] - }, - { - "token": "s", - "logprob": -22.850893020629883, - "bytes": [ - 115 - ] - }, - { - "token": "sc", - "logprob": -24.523799896240234, - "bytes": [ - 115, - 99 - ] - }, - { - "token": ")", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 41 - ] - }, - { - "token": "(", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 40 - ] - }, - { - "token": "$", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 36 - ] - }, - { - "token": "\"", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 34 - ] - }, - { - "token": "%", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 37 - ] - }, - { - "token": "*", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 42 - ] - }, - { - "token": "&", - "logprob": -3.4028234663852886e+38, - 
"bytes": [ - 38 - ] - } - ] - }, - { - "token": "\":", - "logprob": 0.0, - "bytes": [ - 34, - 58 - ], - "top_logprobs": [ - { - "token": "\":", - "logprob": 0.0, - "bytes": [ - 34, - 58 - ] - }, - { - "token": "\":\"", - "logprob": -23.256229400634766, - "bytes": [ - 34, - 58, - 34 - ] - }, - { - "token": "\":\n", - "logprob": -26.574241638183594, - "bytes": [ - 34, - 58, - 10 - ] - }, - { - "token": "\"", - "logprob": -27.456998825073242, - "bytes": [ - 34 - ] - }, - { - "token": "\":\n\n", - "logprob": -30.049991607666016, - "bytes": [ - 34, - 58, - 10, - 10 - ] - }, - { - "token": "\"\n\n", - "logprob": -31.085031509399414, - "bytes": [ - 34, - 10, - 10 - ] - }, - { - "token": "\"\n", - "logprob": -31.665332794189453, - "bytes": [ - 34, - 10 - ] - }, - { - "token": "\"\n\n\n\n", - "logprob": -37.62274932861328, - "bytes": [ - 34, - 10, - 10, - 10, - 10 - ] - }, - { - "token": "\"\n\n\n", - "logprob": -37.79510498046875, - "bytes": [ - 34, - 10, - 10, - 10 - ] - }, - { - "token": "(", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 40 - ] - } - ] - }, - { - "token": " \"", - "logprob": 0.0, - "bytes": [ - 32, - 34 - ], - "top_logprobs": [ - { - "token": " \"", - "logprob": 0.0, - "bytes": [ - 32, - 34 - ] - }, - { - "token": "\"", - "logprob": -21.91130828857422, - "bytes": [ - 34 - ] - }, - { - "token": " ", - "logprob": -22.00535011291504, - "bytes": [ - 32 - ] - }, - { - "token": "\t", - "logprob": -25.931697845458984, - "bytes": [ - 9 - ] - }, - { - "token": " ", - "logprob": -26.31612205505371, - "bytes": [ - 32, - 32 - ] - }, - { - "token": " ", - "logprob": -27.29484748840332, - "bytes": [ - 32, - 32, - 32 - ] - }, - { - "token": " \n", - "logprob": -27.30437660217285, - "bytes": [ - 32, - 10 - ] - }, - { - "token": " ", - "logprob": -27.461917877197266, - "bytes": [ - 32, - 32, - 32, - 32 - ] - }, - { - "token": " ", - "logprob": -28.473848342895508, - "bytes": [ - 32, - 32, - 32, - 32, - 32 - ] - }, - { - "token": " ", - "logprob": 
-29.2908992767334, - "bytes": [ - 32, - 32, - 32, - 32, - 32, - 32, - 32, - 32, - 32, - 32, - 32 - ] - } - ] - }, - { - "token": "4", - "logprob": -1.2333626747131348, - "bytes": [ - 52 - ], - "top_logprobs": [ - { - "token": "4", - "logprob": -1.2333626747131348, - "bytes": [ - 52 - ] - }, - { - "token": "5", - "logprob": -1.3502278327941895, - "bytes": [ - 53 - ] - }, - { - "token": "3", - "logprob": -1.4603333473205566, - "bytes": [ - 51 - ] - }, - { - "token": "6", - "logprob": -2.1628003120422363, - "bytes": [ - 54 - ] - }, - { - "token": "2", - "logprob": -2.558149814605713, - "bytes": [ - 50 - ] - }, - { - "token": "7", - "logprob": -4.393080234527588, - "bytes": [ - 55 - ] - }, - { - "token": "0", - "logprob": -4.83385705947876, - "bytes": [ - 48 - ] - }, - { - "token": "1", - "logprob": -5.500879764556885, - "bytes": [ - 49 - ] - }, - { - "token": "8", - "logprob": -7.648081302642822, - "bytes": [ - 56 - ] - }, - { - "token": "9", - "logprob": -10.840843200683594, - "bytes": [ - 57 - ] - } - ] - }, - { - "token": "\"}", - "logprob": 0.0, - "bytes": [ - 34, - 125 - ], - "top_logprobs": [ - { - "token": "\"}", - "logprob": 0.0, - "bytes": [ - 34, - 125 - ] - }, - { - "token": "\"", - "logprob": -27.13329315185547, - "bytes": [ - 34 - ] - }, - { - "token": "\"\n\n", - "logprob": -35.55782699584961, - "bytes": [ - 34, - 10, - 10 - ] - }, - { - "token": "\"\n", - "logprob": -36.93128204345703, - "bytes": [ - 34, - 10 - ] - }, - { - "token": "\"\n\n\n\n", - "logprob": -41.27574157714844, - "bytes": [ - 34, - 10, - 10, - 10, - 10 - ] - }, - { - "token": "\"\n\n\n", - "logprob": -43.764984130859375, - "bytes": [ - 34, - 10, - 10, - 10 - ] - }, - { - "token": "(", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 40 - ] - }, - { - "token": "$", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 36 - ] - }, - { - "token": ")", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 41 - ] - }, - { - "token": "%", - "logprob": -3.4028234663852886e+38, - "bytes": [ 
- 37 - ] - } - ] - } - ] - } - } - ] -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_run_ollama/context_relevance.json b/test/formatters/granite/testdata/test_run_ollama/context_relevance.json deleted file mode 100644 index 645d51516..000000000 --- a/test/formatters/granite/testdata/test_run_ollama/context_relevance.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "{\"context_relevance\": \"irrelevant\"}", - "role": "assistant" - } - } - ] -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_run_transformers/context_relevance.json b/test/formatters/granite/testdata/test_run_transformers/context_relevance.json deleted file mode 100644 index 645d51516..000000000 --- a/test/formatters/granite/testdata/test_run_transformers/context_relevance.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "{\"context_relevance\": \"irrelevant\"}", - "role": "assistant" - } - } - ] -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_run_transformers/context_relevance_alora.json b/test/formatters/granite/testdata/test_run_transformers/context_relevance_alora.json deleted file mode 100644 index 645d51516..000000000 --- a/test/formatters/granite/testdata/test_run_transformers/context_relevance_alora.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "{\"context_relevance\": \"irrelevant\"}", - "role": "assistant" - } - } - ] -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_run_transformers/requirement_check.json b/test/formatters/granite/testdata/test_run_transformers/requirement_check.json deleted file mode 100644 index 1012eaf73..000000000 --- a/test/formatters/granite/testdata/test_run_transformers/requirement_check.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": 
"{\"requirement_likelihood\": 0.0}", - "role": "assistant" - } - } - ] -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_run_transformers/requirement_check_alora.json b/test/formatters/granite/testdata/test_run_transformers/requirement_check_alora.json deleted file mode 100644 index 1012eaf73..000000000 --- a/test/formatters/granite/testdata/test_run_transformers/requirement_check_alora.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "{\"requirement_likelihood\": 0.0}", - "role": "assistant" - } - } - ] -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_run_transformers/uncertainty.json b/test/formatters/granite/testdata/test_run_transformers/uncertainty.json deleted file mode 100644 index 6b423fcc4..000000000 --- a/test/formatters/granite/testdata/test_run_transformers/uncertainty.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "{\"certainty\": 0.4597152663768006}", - "role": "assistant" - } - } - ] -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_run_transformers/uncertainty_alora.json b/test/formatters/granite/testdata/test_run_transformers/uncertainty_alora.json deleted file mode 100644 index 9a7742fd8..000000000 --- a/test/formatters/granite/testdata/test_run_transformers/uncertainty_alora.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "{\"certainty\": 0.5784631010929226}", - "role": "assistant" - } - } - ] -} \ No newline at end of file diff --git a/test/stdlib/components/intrinsic/test_rag.py b/test/stdlib/components/intrinsic/test_rag.py index 205f2c30b..56f96219c 100644 --- a/test/stdlib/components/intrinsic/test_rag.py +++ b/test/stdlib/components/intrinsic/test_rag.py @@ -155,23 +155,6 @@ def test_citations(backend): assert result == expected -@pytest.mark.qualitative -def test_context_relevance(backend): 
- """Verify that the context relevance intrinsic functions properly.""" - context, question, docs = _read_input_json("context_relevance.json") - - # Context relevance can only check against a single document at a time. - document = docs[0] - - # First call triggers adapter loading - result = rag.check_context_relevance(question, document, context, backend) - assert result == "irrelevant" - - # Second call hits a different code path from the first one - result = rag.check_context_relevance(question, document, context, backend) - assert result == "irrelevant" - - @pytest.mark.qualitative def test_hallucination_detection(backend): """Verify that the hallucination detection intrinsic functions properly.""" @@ -269,17 +252,6 @@ def test_citations_resolve(backend): pytest.xfail(f"Known differences across platforms. Diff was: {ae}") -@pytest.mark.qualitative -def test_context_relevance_resolve(backend): - """Verify context relevance when question is resolved from context.""" - context, question, docs = _read_input_json("context_relevance.json") - context = context.add(Message("user", question)) - document = docs[0] - - result = rag.check_context_relevance(None, document, context, backend) - assert result == "irrelevant" - - @pytest.mark.qualitative def test_hallucination_detection_resolve(backend): """Verify hallucination detection when response is resolved from context.""" From 85c3e2249e36781ff5d3f7b7641f36c1fb967260 Mon Sep 17 00:00:00 2001 From: Fred Reiss Date: Mon, 4 May 2026 23:15:11 +0000 Subject: [PATCH 05/13] Remove unused test files Signed-off-by: Fred Reiss --- .../input_json/context_relevance.json | 8 ------ .../input_json/requirement_check.json | 28 ------------------- .../testdata/input_json/uncertainty.json | 25 ----------------- 3 files changed, 61 deletions(-) delete mode 100644 test/formatters/granite/testdata/input_json/context_relevance.json delete mode 100644 test/formatters/granite/testdata/input_json/requirement_check.json delete mode 100644 
test/formatters/granite/testdata/input_json/uncertainty.json diff --git a/test/formatters/granite/testdata/input_json/context_relevance.json b/test/formatters/granite/testdata/input_json/context_relevance.json deleted file mode 100644 index f55807a33..000000000 --- a/test/formatters/granite/testdata/input_json/context_relevance.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "messages": [ - { - "content": "Who is the CEO of Microsoft?", - "role": "user" - } - ] -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/input_json/requirement_check.json b/test/formatters/granite/testdata/input_json/requirement_check.json deleted file mode 100644 index 37fcef83b..000000000 --- a/test/formatters/granite/testdata/input_json/requirement_check.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "messages": [ - { - "role": "assistant", - "content": "Hello there, welcome to math questions!" - }, - { - "content": "What is the square root of 4?", - "role": "user" - }, - { - "role": "assistant", - "content": "The square root of 4 is 2." - }, - { - "content": "What is six times seven?", - "role": "user" - } - ], - "extra_body": { - "documents": [ - { - "doc_id": "1", - "text": "\nHere's a list of math problems that are on the homework assignment:\n\n1) 12+5=17\n2) 20-8=12\n3) 6*7=42\n4) 45/9=5\n5) 13*2=26\n6) 99+11=110\n7) 100-35=65\n\n" - } - ] - } -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/input_json/uncertainty.json b/test/formatters/granite/testdata/input_json/uncertainty.json deleted file mode 100644 index 69f817ba0..000000000 --- a/test/formatters/granite/testdata/input_json/uncertainty.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "messages": [ - { - "role": "assistant", - "content": "Welcome to pet questions!" - }, - { - "role": "user", - "content": "Which of my pets have fleas?" - } - ], - "max_completion_tokens": 1024, - "extra_body": { - "documents": [ - { - "doc_id": "1", - "text": "My dog has fleas." 
- }, - { - "doc_id": "2", - "text": "My cat does not have fleas." - } - ] - } -} \ No newline at end of file From ff5fbfdb5344a655567d9a25eb8f035c988c9a96 Mon Sep 17 00:00:00 2001 From: Fred Reiss Date: Tue, 5 May 2026 10:48:52 -0700 Subject: [PATCH 06/13] Selectively revert changes --- AGENTS.md | 3 +- docs/docs/advanced/intrinsics.md | 24 ++++++++++++ docs/examples/intrinsics/README.md | 4 ++ docs/examples/intrinsics/context_relevance.py | 32 +++++++++++++++ mellea/backends/adapters/catalog.py | 1 + mellea/stdlib/components/intrinsic/rag.py | 39 +++++++++++++++++++ .../granite/test_intrinsics_formatters.py | 13 +++++++ .../input_json/context_relevance.json | 8 ++++ .../test_canned_input/context_relevance.json | 35 +++++++++++++++++ .../expected_result/context_relevance.json | 11 ++++++ .../model_output/context_relevance.json | 11 ++++++ .../test_run_ollama/context_relevance.json | 11 ++++++ .../context_relevance.json | 11 ++++++ .../context_relevance_alora.json | 11 ++++++ test/stdlib/components/intrinsic/test_rag.py | 28 +++++++++++++ 15 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 docs/examples/intrinsics/context_relevance.py create mode 100644 test/formatters/granite/testdata/input_json/context_relevance.json create mode 100644 test/formatters/granite/testdata/test_canned_input/context_relevance.json create mode 100644 test/formatters/granite/testdata/test_canned_output/expected_result/context_relevance.json create mode 100644 test/formatters/granite/testdata/test_canned_output/model_output/context_relevance.json create mode 100644 test/formatters/granite/testdata/test_run_ollama/context_relevance.json create mode 100644 test/formatters/granite/testdata/test_run_transformers/context_relevance.json create mode 100644 test/formatters/granite/testdata/test_run_transformers/context_relevance_alora.json diff --git a/AGENTS.md b/AGENTS.md index 42bde8be4..cb07d5b31 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -178,6 +178,7 @@ Intrinsics are 
specialized LoRA adapters that add task-specific capabilities (RA | `rag` | `rewrite_question(question, context, backend)` | Rewrite question into a retrieval query | | `rag` | `clarify_query(question, documents, context, backend)` | Generate clarification or return "CLEAR" | | `rag` | `find_citations(response, documents, context, backend)` | Document sentences supporting the response | +| `rag` | `check_context_relevance(question, document, context, backend)` | Whether a document is relevant: "relevant", "irrelevant", or "partially relevant"; only supported for granite-4.0, not granite-4.1 | | `rag` | `flag_hallucinated_content(response, documents, context, backend)` | Flag potentially hallucinated sentences | ```python @@ -211,7 +212,7 @@ When adding support for a new intrinsic (not just using an existing one), fetch | Repo | Purpose | Intrinsics | |------|---------|------------| -| [`ibm-granite/granitelib-rag-r1.0`](https://huggingface.co/ibm-granite/granitelib-rag-r1.0) | RAG pipeline | answerability, citations, hallucination_detection, query_rewrite, query_clarification | +| [`ibm-granite/granitelib-rag-r1.0`](https://huggingface.co/ibm-granite/granitelib-rag-r1.0) | RAG pipeline | answerability, citations, context_relevance, hallucination_detection, query_rewrite, query_clarification | | [`ibm-granite/granitelib-core-r1.0`](https://huggingface.co/ibm-granite/granitelib-core-r1.0) | Core capabilities | context-attribution, requirement-check, uncertainty | | [`ibm-granite/granitelib-guardian-r1.0`](https://huggingface.co/ibm-granite/granitelib-guardian-r1.0) | Safety & compliance | guardian-core, policy-guardrails, factuality-detection, factuality-correction | diff --git a/docs/docs/advanced/intrinsics.md b/docs/docs/advanced/intrinsics.md index d83529771..8fec1b8cd 100644 --- a/docs/docs/advanced/intrinsics.md +++ b/docs/docs/advanced/intrinsics.md @@ -72,6 +72,30 @@ print(rag.check_answerability(question, docs_answerable, context, backend)) # print(rag.check_answerability(question, docs_not_answerable,
context, backend)) # False ``` +## Context relevance + +Assess whether a document is relevant to a question: + +```python +# Requires: mellea[hf] +# Returns: str +from mellea.backends.huggingface import LocalHFBackend +from mellea.stdlib.components import Document +from mellea.stdlib.components.intrinsic import rag +from mellea.stdlib.context import ChatContext + +backend = LocalHFBackend(model_id="ibm-granite/granite-4.0-micro") +context = ChatContext() +question = "Who is the CEO of Microsoft?" +document = Document( + "Microsoft Corporation is an American multinational corporation " + "headquartered in Redmond, Washington." +) + +result = rag.check_context_relevance(question, document, context, backend) +print(result) # "irrelevant" — the document does not mention the CEO +``` + ## Hallucination detection Flag sentences in an assistant response that are not grounded in the source documents: diff --git a/docs/examples/intrinsics/README.md b/docs/examples/intrinsics/README.md index 11558c5b2..eff7bb738 100644 --- a/docs/examples/intrinsics/README.md +++ b/docs/examples/intrinsics/README.md @@ -20,6 +20,9 @@ Checks if a question can be answered given the context. ### citations.py Validates and extracts citations from generated text. +### context_relevance.py +Assesses if retrieved context is relevant to a query. + ### hallucination_detection.py Detects when model outputs contain hallucinated information. @@ -114,6 +117,7 @@ see [`../granite-switch/`](../granite-switch/).
- **answerability**: Determine if question is answerable - **citations**: Extract and validate citations - **context-attribution**: Identify context sentences that most influenced response +- **context_relevance**: Assess context-query relevance - **factuality_correction**: Correct factually incorrect responses - **factuality_detection**: Detect factually incorrect responses - **guardian-core**: Safety risk detection (harm, bias, groundedness, custom criteria) diff --git a/docs/examples/intrinsics/context_relevance.py b/docs/examples/intrinsics/context_relevance.py new file mode 100644 index 000000000..2080f8141 --- /dev/null +++ b/docs/examples/intrinsics/context_relevance.py @@ -0,0 +1,32 @@ +# pytest: huggingface, e2e + +"""Example usage of the context relevance intrinsic for RAG applications. + +To run this script from the root of the Mellea source tree, use the command: +``` +uv run python docs/examples/intrinsics/context_relevance.py +``` +""" + +from mellea import model_ids, start_backend +from mellea.stdlib.components.intrinsic import rag + +ctx, backend = start_backend( + "hf", model_id=model_ids.IBM_GRANITE_4_MICRO_3B, context_type="chat" +) +# NOTE: this example uses Granite 4.0 micro because there is no context_relevance intrinsic for Graniet 4.1 + +question = "Who is the CEO of Microsoft?" +document = ( + # Document text does not say who is the CEO. + "Microsoft Corporation is an American multinational corporation and technology " + "conglomerate headquartered in Redmond, Washington.[2] Founded in 1975, the " + "company became influential in the rise of personal computers through software " + "like Windows, and the company has since expanded to Internet services, cloud " + "computing, video gaming and other fields. Microsoft is the largest software " + "maker, one of the most valuable public U.S. companies,[a] and one of the most " + "valuable brands globally." 
+) + +result = rag.check_context_relevance(question, document, ctx, backend) +print(f"Result of context relevance check with irrelevant document: {result}") diff --git a/mellea/backends/adapters/catalog.py b/mellea/backends/adapters/catalog.py index 807376dc5..9edeea2e4 100644 --- a/mellea/backends/adapters/catalog.py +++ b/mellea/backends/adapters/catalog.py @@ -76,6 +76,7 @@ class IntriniscsCatalogEntry(pydantic.BaseModel): ############################################ IntriniscsCatalogEntry(name="answerability", repo_id=_RAG_REPO), IntriniscsCatalogEntry(name="citations", repo_id=_RAG_REPO), + IntriniscsCatalogEntry(name="context_relevance", repo_id=_RAG_REPO), IntriniscsCatalogEntry(name="hallucination_detection", repo_id=_RAG_REPO), IntriniscsCatalogEntry(name="query_clarification", repo_id=_RAG_REPO), IntriniscsCatalogEntry(name="query_rewrite", repo_id=_RAG_REPO), diff --git a/mellea/stdlib/components/intrinsic/rag.py b/mellea/stdlib/components/intrinsic/rag.py index 6219cc85d..4dd20a1c8 100644 --- a/mellea/stdlib/components/intrinsic/rag.py +++ b/mellea/stdlib/components/intrinsic/rag.py @@ -154,6 +154,45 @@ def find_citations( return result_json +def check_context_relevance( + question: str | None, + document: str | Document, + context: ChatContext, + backend: AdapterMixin, +) -> str: + """Test whether a document is relevant to a user's question. + + Intrinsic function that checks whether a single document contains part or all of + the answer to a user's question. Does not consider the context in which the + question was asked. + + Args: + question: Question that the user has posed. When ``None``, the question + is extracted from the last user message in ``context``. + document: A retrieved document snippet. May be a ``Document`` or a plain + string (automatically wrapped in ``Document``). + context: The chat up to the point where the user asked a question. + backend: Backend instance that supports the adapters that implement this + intrinsic. 
+ + Returns: + Context relevance judgement as one of the following strings: + - "relevant" + - "irrelevant" + - "partially relevant" + """ + question, context = _resolve_question(question, context, backend) + document = _coerce_to_document(document) + result_json = call_intrinsic( + "context_relevance", + context.add(Message("user", question)), + backend, + # Target document is passed as an argument + kwargs={"document_content": document.text}, + ) + return result_json["context_relevance"] + + def flag_hallucinated_content( response: str | None, documents: collections.abc.Iterable[str | Document], diff --git a/test/formatters/granite/test_intrinsics_formatters.py b/test/formatters/granite/test_intrinsics_formatters.py index 53e4582b1..996179f2e 100644 --- a/test/formatters/granite/test_intrinsics_formatters.py +++ b/test/formatters/granite/test_intrinsics_formatters.py @@ -193,6 +193,19 @@ def _resolve_yaml(self): inputs_file=_INPUT_JSON_DIR / "query_rewrite.json", task="query_rewrite", ), + YamlJsonCombo( + short_name="context_relevance", + inputs_file=_INPUT_JSON_DIR / "context_relevance.json", + arguments_file=_INPUT_ARGS_DIR / "context_relevance.json", + task="context_relevance", + ), + YamlJsonCombo( + short_name="context_relevance_alora", + inputs_file=_INPUT_JSON_DIR / "context_relevance.json", + arguments_file=_INPUT_ARGS_DIR / "context_relevance.json", + task="context_relevance", + is_alora=True, + ), YamlJsonCombo( short_name="citations", inputs_file=_INPUT_JSON_DIR / "citations.json", diff --git a/test/formatters/granite/testdata/input_json/context_relevance.json b/test/formatters/granite/testdata/input_json/context_relevance.json new file mode 100644 index 000000000..02e3bc5ad --- /dev/null +++ b/test/formatters/granite/testdata/input_json/context_relevance.json @@ -0,0 +1,8 @@ +{ + "messages": [ + { + "content": "Who is the CEO of Microsoft?", + "role": "user" + } + ] +} diff --git 
a/test/formatters/granite/testdata/test_canned_input/context_relevance.json b/test/formatters/granite/testdata/test_canned_input/context_relevance.json new file mode 100644 index 000000000..9ce036c51 --- /dev/null +++ b/test/formatters/granite/testdata/test_canned_input/context_relevance.json @@ -0,0 +1,35 @@ +{ + "messages": [ + { + "content": "Who is the CEO of Microsoft?", + "role": "user" + }, + { + "content": "DOCUMENT: Microsoft Corporation is an American multinational corporation and technology conglomerate headquartered in Redmond, Washington.[2] Founded in 1975, the company became influential in the rise of personal computers through software like Windows, and the company has since expanded to Internet services, cloud computing, video gaming and other fields. Microsoft is the largest software maker, one of the most valuable public U.S. companies,[a] and one of the most valuable brands globally.\n", + "role": "user" + } + ], + "extra_body": { + "structured_outputs": { + "json": { + "title": "ContextRelevanceOutput", + "type": "object", + "properties": { + "context_relevance": { + "type": "string", + "description": "Context relevancy judgment.", + "enum": [ + "relevant", + "irrelevant", + "partially relevant" + ] + } + }, + "required": [ + "context_relevance" + ] + } + } + }, + "temperature": 0.0 +} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_output/expected_result/context_relevance.json b/test/formatters/granite/testdata/test_canned_output/expected_result/context_relevance.json new file mode 100644 index 000000000..645d51516 --- /dev/null +++ b/test/formatters/granite/testdata/test_canned_output/expected_result/context_relevance.json @@ -0,0 +1,11 @@ +{ + "choices": [ + { + "index": 0, + "message": { + "content": "{\"context_relevance\": \"irrelevant\"}", + "role": "assistant" + } + } + ] +} \ No newline at end of file diff --git 
a/test/formatters/granite/testdata/test_canned_output/model_output/context_relevance.json b/test/formatters/granite/testdata/test_canned_output/model_output/context_relevance.json new file mode 100644 index 000000000..ce3e97dfe --- /dev/null +++ b/test/formatters/granite/testdata/test_canned_output/model_output/context_relevance.json @@ -0,0 +1,11 @@ +{ + "choices": [ + { + "index": 0, + "message": { + "content": "{\n \"context_relevance\": \"irrelevant\"\n}", + "role": "assistant" + } + } + ] +} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_run_ollama/context_relevance.json b/test/formatters/granite/testdata/test_run_ollama/context_relevance.json new file mode 100644 index 000000000..645d51516 --- /dev/null +++ b/test/formatters/granite/testdata/test_run_ollama/context_relevance.json @@ -0,0 +1,11 @@ +{ + "choices": [ + { + "index": 0, + "message": { + "content": "{\"context_relevance\": \"irrelevant\"}", + "role": "assistant" + } + } + ] +} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_run_transformers/context_relevance.json b/test/formatters/granite/testdata/test_run_transformers/context_relevance.json new file mode 100644 index 000000000..645d51516 --- /dev/null +++ b/test/formatters/granite/testdata/test_run_transformers/context_relevance.json @@ -0,0 +1,11 @@ +{ + "choices": [ + { + "index": 0, + "message": { + "content": "{\"context_relevance\": \"irrelevant\"}", + "role": "assistant" + } + } + ] +} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_run_transformers/context_relevance_alora.json b/test/formatters/granite/testdata/test_run_transformers/context_relevance_alora.json new file mode 100644 index 000000000..645d51516 --- /dev/null +++ b/test/formatters/granite/testdata/test_run_transformers/context_relevance_alora.json @@ -0,0 +1,11 @@ +{ + "choices": [ + { + "index": 0, + "message": { + "content": "{\"context_relevance\": \"irrelevant\"}", + "role": 
"assistant" + } + } + ] +} \ No newline at end of file diff --git a/test/stdlib/components/intrinsic/test_rag.py b/test/stdlib/components/intrinsic/test_rag.py index 56f96219c..205f2c30b 100644 --- a/test/stdlib/components/intrinsic/test_rag.py +++ b/test/stdlib/components/intrinsic/test_rag.py @@ -155,6 +155,23 @@ def test_citations(backend): assert result == expected +@pytest.mark.qualitative +def test_context_relevance(backend): + """Verify that the context relevance intrinsic functions properly.""" + context, question, docs = _read_input_json("context_relevance.json") + + # Context relevance can only check against a single document at a time. + document = docs[0] + + # First call triggers adapter loading + result = rag.check_context_relevance(question, document, context, backend) + assert result == "irrelevant" + + # Second call hits a different code path from the first one + result = rag.check_context_relevance(question, document, context, backend) + assert result == "irrelevant" + + @pytest.mark.qualitative def test_hallucination_detection(backend): """Verify that the hallucination detection intrinsic functions properly.""" @@ -252,6 +269,17 @@ def test_citations_resolve(backend): pytest.xfail(f"Known differences across platforms. 
Diff was: {ae}") +@pytest.mark.qualitative +def test_context_relevance_resolve(backend): + """Verify context relevance when question is resolved from context.""" + context, question, docs = _read_input_json("context_relevance.json") + context = context.add(Message("user", question)) + document = docs[0] + + result = rag.check_context_relevance(None, document, context, backend) + assert result == "irrelevant" + + @pytest.mark.qualitative def test_hallucination_detection_resolve(backend): """Verify hallucination detection when response is resolved from context.""" From 27dba8dd05282453293d0df461fd9503eb50bfbf Mon Sep 17 00:00:00 2001 From: Fred Reiss Date: Tue, 5 May 2026 10:55:48 -0700 Subject: [PATCH 07/13] Revert more changes --- .../tutorials/04-making-agents-reliable.md | 4 +- docs/examples/intrinsics/README.md | 51 ------------------- docs/examples/intrinsics/context_relevance.py | 2 +- .../stdlib/components/intrinsic/guardian.py | 8 +++ .../input_json/context_relevance.json | 2 +- 5 files changed, 12 insertions(+), 55 deletions(-) diff --git a/docs/docs/tutorials/04-making-agents-reliable.md b/docs/docs/tutorials/04-making-agents-reliable.md index 73dd2a756..81a21d1a4 100644 --- a/docs/docs/tutorials/04-making-agents-reliable.md +++ b/docs/docs/tutorials/04-making-agents-reliable.md @@ -397,7 +397,7 @@ and dynamic applications with ease. The word "Mellea" consists of Scores are floats between 0.0 (safe) and 1.0 (risk detected); 0.5 is the threshold. The available criteria are: `"harm"`, `"jailbreak"`, `"social_bias"`, `"profanity"`, `"violence"`, `"sexual_content"`, `"unethical_behavior"`, `"groundedness"`, -`"answer_relevance"`, and `"function_call"`. +`"answer_relevance"`, `"context_relevance"`, and `"function_call"`. 
--- @@ -474,7 +474,7 @@ for criterion in criteria: The available criteria are: `"harm"`, `"jailbreak"`, `"social_bias"`, `"profanity"`, `"violence"`, `"sexual_content"`, `"unethical_behavior"`, `"groundedness"`, -`"answer_relevance"`, and `"function_call"`. +`"answer_relevance"`, `"context_relevance"`, and `"function_call"`. --- diff --git a/docs/examples/intrinsics/README.md b/docs/examples/intrinsics/README.md index eff7bb738..df3e57e2f 100644 --- a/docs/examples/intrinsics/README.md +++ b/docs/examples/intrinsics/README.md @@ -2,57 +2,6 @@ This directory contains examples for using Mellea's intrinsic functions - specialized model capabilities accessed through adapters. -<<<<<<< HEAD -## Files - -### intrinsics.py -Core example showing how to directly use intrinsics with adapters. - -**Key Features:** -- Creating and adding adapters to backends -- Using `Intrinsic` component for specialized tasks -- Working with Granite Common adapters (aLoRA-based) -- Understanding adapter output formats - -### answerability.py -Checks if a question can be answered given the context. - -### citations.py -Validates and extracts citations from generated text. - -### context_relevance.py -Assesses if retrieved context is relevant to a query. - -### hallucination_detection.py -Detects when model outputs contain hallucinated information. - -### query_rewrite.py -Rewrites queries for better retrieval or understanding. - -### uncertainty.py -Estimates the model's certainty about answering a question. - -### requirement_check.py -Detect if text adheres to provided requirements. - -### policy_guardrails.py -Checks if a scenario is compliant/non-compliant/ambiguous with respect to a given policy, - -### guardian_core.py -Uses the guardian-core LoRA adapter for safety risk detection, including prompt-level harm, response-level social bias, RAG groundedness, and custom criteria. - -### factuality_detection.py -Detects if the the model's output is factually incorrect relative to context. 
- -### factuality_correction.py -Corrects a factually incorrect response relative to context. - -### context_attribution.py -Identifies sentences in conversation history and documents that most influenced the response. - - -======= ->>>>>>> main ## Concepts Demonstrated - **Intrinsic Functions**: Specialized model capabilities beyond text generation diff --git a/docs/examples/intrinsics/context_relevance.py b/docs/examples/intrinsics/context_relevance.py index 2080f8141..bd6bfd7d2 100644 --- a/docs/examples/intrinsics/context_relevance.py +++ b/docs/examples/intrinsics/context_relevance.py @@ -14,7 +14,7 @@ ctx, backend = start_backend( "hf", model_id=model_ids.IBM_GRANITE_4_MICRO_3B, context_type="chat" ) -# NOTE: this example uses Granite 4.0 micro because there is no context_relevance intrinsic for Graniet 4.1 +# NOTE: this example uses Granite 4.0 micro because there is no context_relevance intrinsic for Granite 4.1 question = "Who is the CEO of Microsoft?" document = ( diff --git a/mellea/stdlib/components/intrinsic/guardian.py b/mellea/stdlib/components/intrinsic/guardian.py index 5da8ca8ab..3dcc843a9 100644 --- a/mellea/stdlib/components/intrinsic/guardian.py +++ b/mellea/stdlib/components/intrinsic/guardian.py @@ -107,6 +107,14 @@ def policy_guardrails( "irrelevant if it fails to directly answer or meet the specific " "intent of the query." ), + "context_relevance": ( + "A document is deemed irrelevant when it doesn't contain information " + "pertinent to the query's specific needs. This means the retrieved or " + "provided content fails to adequately address the question at hand. " + "Irrelevant information could be on a different topic, originate from " + "an unrelated field, or simply not offer any valuable insights for " + "crafting a suitable response." 
+ ), "function_call": ( "Function call hallucination occurs when a text includes function " "calls that either don't adhere to the correct format defined by the " diff --git a/test/formatters/granite/testdata/input_json/context_relevance.json b/test/formatters/granite/testdata/input_json/context_relevance.json index 02e3bc5ad..f55807a33 100644 --- a/test/formatters/granite/testdata/input_json/context_relevance.json +++ b/test/formatters/granite/testdata/input_json/context_relevance.json @@ -5,4 +5,4 @@ "role": "user" } ] -} +} \ No newline at end of file From 4fa1e8990d9b5987d40e65eeabdf8259705538ac Mon Sep 17 00:00:00 2001 From: Fred Reiss Date: Tue, 5 May 2026 11:05:27 -0700 Subject: [PATCH 08/13] Update docstrings --- .../granite/test_intrinsics_formatters.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/test/formatters/granite/test_intrinsics_formatters.py b/test/formatters/granite/test_intrinsics_formatters.py index 996179f2e..f66a1e1ea 100644 --- a/test/formatters/granite/test_intrinsics_formatters.py +++ b/test/formatters/granite/test_intrinsics_formatters.py @@ -59,12 +59,14 @@ def _substitute_root( Handles common corner cases such as when a given path has multiple equivalent string representations. - :param child_path: A path that is a descedent of a known root - :param old_root: Root directory that is an ancestor of ``child_path`` - :param new_root: Root directory to substitute for ``old_root`` - - :returns: A version of ``child_path`` in which the prefix corresponding to - ``old_root`` has been replaced with ``new_root`` + Args: + child_path: A path that is a descendant of a known root. + old_root: Root directory that is an ancestor of ``child_path``. + new_root: Root directory to substitute for ``old_root``. + + Returns: + A version of ``child_path`` in which the prefix corresponding to + ``old_root`` has been replaced with ``new_root``. 
""" # Resolve paths to handle symlinks, relative components, and other corner cases child_path = child_path.resolve() @@ -88,8 +90,9 @@ def _dump_output(expected_file: pathlib.Path, actual_string: str): compare against a canned output and the location of said canned output, write the string to a controlled place on the filesystem to aid debugging. - :param expected_file: Location of the file we're going to compare against - :param actual_string: String that the current test case produced that + Args: + expected_file: Location of the file we're going to compare against. + actual_string: String that the current test case produced. """ actual_file = _substitute_root(expected_file, _TEST_DATA_DIR, _TEST_OUTPUT_DIR) From 3a0bae030ee2e0966358803e99ced5ae48c49e61 Mon Sep 17 00:00:00 2001 From: Fred Reiss Date: Tue, 5 May 2026 18:42:35 +0000 Subject: [PATCH 09/13] Switch mypy back to 3.11 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e1533d922..5cf8f122c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -302,7 +302,7 @@ split-on-trailing-comma = false install_types = true non_interactive = true disable_error_code = ["empty-body", "import-untyped"] -python_version = "3.12" +python_version = "3.11" exclude = [ "^tooling/", "^scratchpad/", From b4b05ad6e6d58cca09b6424d3349fe51369e8f91 Mon Sep 17 00:00:00 2001 From: Fred Reiss Date: Tue, 5 May 2026 18:55:35 +0000 Subject: [PATCH 10/13] Move rag intrinsics tests to 4.1 except for context relevance Signed-off-by: Fred Reiss --- .../granite/test_intrinsics_formatters.py | 10 ++++++- .../test_run_transformers/citations.json | 2 +- .../hallucination_detection.json | 2 +- .../query_clarification.json | 2 +- test/stdlib/components/intrinsic/test_rag.py | 29 ++++++++++++++----- .../testdata/output_json/citations.json | 13 +++++++-- .../output_json/hallucination_detection.json | 4 +-- 7 files changed, 47 insertions(+), 15 deletions(-) diff --git 
a/test/formatters/granite/test_intrinsics_formatters.py b/test/formatters/granite/test_intrinsics_formatters.py index f66a1e1ea..774eb64c7 100644 --- a/test/formatters/granite/test_intrinsics_formatters.py +++ b/test/formatters/granite/test_intrinsics_formatters.py @@ -45,6 +45,8 @@ def _read_file(name): _RAG_INTRINSICS_REPO_NAME = "ibm-granite/granitelib-rag-r1.0" _CORE_R1_REPO_NAME = "ibm-granite/granitelib-core-r1.0" +_DEFAULT_BASE_MODEL = "ibm-granite/granite-4.1-3b" + _INPUT_JSON_DIR = _TEST_DATA_DIR / "input_json" _INPUT_YAML_DIR = _TEST_DATA_DIR / "input_yaml" @@ -126,7 +128,7 @@ class YamlJsonCombo(pydantic.BaseModel): loaded.""" revision: str = "main" """Revision or branch of the Hugging Face `repo_id`.""" - base_model_id: str = "ibm-granite/granite-4.0-micro" + base_model_id: str = _DEFAULT_BASE_MODEL """Base model on which the target adapter was trained. Should be small enough to run on the CI server.""" @@ -201,6 +203,8 @@ def _resolve_yaml(self): inputs_file=_INPUT_JSON_DIR / "context_relevance.json", arguments_file=_INPUT_ARGS_DIR / "context_relevance.json", task="context_relevance", + # No Granite 4.1 version of this adapter + base_model_id="ibm-granite/granite-4.0-micro", ), YamlJsonCombo( short_name="context_relevance_alora", @@ -208,6 +212,8 @@ def _resolve_yaml(self): arguments_file=_INPUT_ARGS_DIR / "context_relevance.json", task="context_relevance", is_alora=True, + # No Granite 4.1 version of this adapter + base_model_id="ibm-granite/granite-4.0-micro", ), YamlJsonCombo( short_name="citations", @@ -220,6 +226,8 @@ def _resolve_yaml(self): task="context-attribution", repo_id="ibm-granite/granitelib-core-r1.0", revision="c9c189f5ad0b2890660397070613fda46d6ceb80", + # No Granite 4.1 version of this adapter at the selected Git commit + base_model_id="ibm-granite/granite-4.0-micro", ), # gpt-oss-20b intrinsics (canned output tests only, no inference) YamlJsonCombo( diff --git a/test/formatters/granite/testdata/test_run_transformers/citations.json 
b/test/formatters/granite/testdata/test_run_transformers/citations.json index 67dc2bb51..6cabdcfcd 100644 --- a/test/formatters/granite/testdata/test_run_transformers/citations.json +++ b/test/formatters/granite/testdata/test_run_transformers/citations.json @@ -3,7 +3,7 @@ { "index": 0, "message": { - "content": "[{\"response_begin\": 0, \"response_end\": 91, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 692, \"citation_end\": 1030, \"citation_text\": \"He began to direct his attention to acquisition and expansion, buying the troubled Sunday Times in Perth, Western Australia (1956) and over the next few years acquiring suburban and provincial newspapers in New South Wales, Queensland, Victoria and the Northern Territory, including the Sydney afternoon tabloid, The Daily Mirror (1960). \"}, {\"response_begin\": 0, \"response_end\": 91, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 1219, \"citation_end\": 1346, \"citation_text\": \"Murdoch's first foray outside Australia involved the purchase of a controlling interest in the New Zealand daily The Dominion. \"}]", + "content": "[{\"response_begin\": 0, \"response_end\": 91, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 692, \"citation_end\": 1030, \"citation_text\": \"He began to direct his attention to acquisition and expansion, buying the troubled Sunday Times in Perth, Western Australia (1956) and over the next few years acquiring suburban and provincial newspapers in New South Wales, Queensland, Victoria and the Northern Territory, including the Sydney afternoon tabloid, The Daily Mirror (1960). 
\"}, {\"response_begin\": 0, \"response_end\": 91, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 1219, \"citation_end\": 1346, \"citation_text\": \"Murdoch's first foray outside Australia involved the purchase of a controlling interest in the New Zealand daily The Dominion. \"}, {\"response_begin\": 0, \"response_end\": 91, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 1739, \"citation_end\": 2044, \"citation_text\": \"Later in 1964, Murdoch launched The Australian, Australia's first national daily newspaper, which was based first in Canberra and later in Sydney. In 1972, Murdoch acquired the Sydney morning tabloid The Daily Telegraph from Australian media mogul Sir Frank Packer, who later regretted selling it to him. \"}]", "role": "assistant" } } diff --git a/test/formatters/granite/testdata/test_run_transformers/hallucination_detection.json b/test/formatters/granite/testdata/test_run_transformers/hallucination_detection.json index 4460bdc98..d5a5ce051 100644 --- a/test/formatters/granite/testdata/test_run_transformers/hallucination_detection.json +++ b/test/formatters/granite/testdata/test_run_transformers/hallucination_detection.json @@ -3,7 +3,7 @@ { "index": 0, "message": { - "content": "[{\"response_begin\": 0, \"response_end\": 31, \"response_text\": \"Purple bumble fish are yellow. \", \"faithfulness\": \"faithful\", \"explanation\": \"This sentence makes a factual claim about the color of purple bumble fish. The provided context states: 'The only type of fish that is yellow is the purple bumble fish.' 
This directly supports the claim in the sentence.\"}, {\"response_begin\": 31, \"response_end\": 65, \"response_text\": \"Green bumble fish are also yellow.\", \"faithfulness\": \"unfaithful\", \"explanation\": \"This sentence makes a factual claim about the color of green bumble fish. However, the provided context does not mention green bumble fish or their color. Therefore, the claim cannot be verified based on the context.\"}]", + "content": "[{\"response_begin\": 0, \"response_end\": 31, \"response_text\": \"Purple bumble fish are yellow. \", \"faithfulness\": \"faithful\", \"explanation\": \"This sentence makes a factual claim about the color of purple bumble fish. The document states 'The only type of fish that is yellow is the purple bumble fish.' This matches exactly with the claim in the sentence.\"}, {\"response_begin\": 31, \"response_end\": 65, \"response_text\": \"Green bumble fish are also yellow.\", \"faithfulness\": \"unfaithful\", \"explanation\": \"This sentence makes a factual claim about the color of green bumble fish. However, the document does not mention anything about the color of green bumble fish, making this claim 'unfaithful'.\"}]", "role": "assistant" } } diff --git a/test/formatters/granite/testdata/test_run_transformers/query_clarification.json b/test/formatters/granite/testdata/test_run_transformers/query_clarification.json index 188a826b8..8fa9501f9 100644 --- a/test/formatters/granite/testdata/test_run_transformers/query_clarification.json +++ b/test/formatters/granite/testdata/test_run_transformers/query_clarification.json @@ -3,7 +3,7 @@ { "index": 0, "message": { - "content": "{\"clarification\": \"There are several languages that descended from Common Brittonic, such as Welsh, Breton, Cornish, and Cumbric. 
Which one are you referring to?\"}", + "content": "{\"clarification\": \"Several languages descended from Common Brittonic are still spoken or have been revived, like Welsh (a living language in Wales), Breton (spoken in Brittany, France), or Cornish (once extinct but now undergoing revitalization). Which one are you referring to?\"}", "role": "assistant" } } diff --git a/test/stdlib/components/intrinsic/test_rag.py b/test/stdlib/components/intrinsic/test_rag.py index 56f96219c..b570ddd11 100644 --- a/test/stdlib/components/intrinsic/test_rag.py +++ b/test/stdlib/components/intrinsic/test_rag.py @@ -10,7 +10,7 @@ torch = pytest.importorskip("torch", reason="torch not installed — install mellea[hf]") from mellea.backends.huggingface import LocalHFBackend -from mellea.backends.model_ids import IBM_GRANITE_4_MICRO_3B +from mellea.backends.model_ids import IBM_GRANITE_4_1_3B, IBM_GRANITE_4_MICRO_3B from mellea.core import ModelOutputThunk from mellea.stdlib.components import Document, Message from mellea.stdlib.components.intrinsic import rag @@ -42,7 +42,22 @@ def _backend(): torch.set_num_threads(4) # No adapters for hybrid version. - backend_ = LocalHFBackend(model_id=IBM_GRANITE_4_MICRO_3B.hf_model_name) # type: ignore + backend_ = LocalHFBackend(model_id=IBM_GRANITE_4_1_3B.hf_model_name) + yield backend_ + + from test.conftest import cleanup_gpu_backend + + cleanup_gpu_backend(backend_, "rag") + + +@pytest.fixture(name="backend_4_0", scope="module") +def _backend_4_0(): + """Granite 4.0 backend used only by tests that don't have Granite 4.1 models.""" + # Prevent thrashing if the default device is CPU + torch.set_num_threads(4) + + # No adapters for hybrid version.
+ backend_ = LocalHFBackend(model_id=IBM_GRANITE_4_MICRO_3B.hf_model_name) yield backend_ from test.conftest import cleanup_gpu_backend @@ -156,7 +171,7 @@ def test_citations(backend): @pytest.mark.qualitative -def test_context_relevance(backend): +def test_context_relevance(backend_4_0): """Verify that the context relevance intrinsic functions properly.""" context, question, docs = _read_input_json("context_relevance.json") @@ -164,11 +179,11 @@ def test_context_relevance(backend): document = docs[0] # First call triggers adapter loading - result = rag.check_context_relevance(question, document, context, backend) + result = rag.check_context_relevance(question, document, context, backend_4_0) assert result == "irrelevant" # Second call hits a different code path from the first one - result = rag.check_context_relevance(question, document, context, backend) + result = rag.check_context_relevance(question, document, context, backend_4_0) assert result == "irrelevant" @@ -270,13 +285,13 @@ def test_citations_resolve(backend): @pytest.mark.qualitative -def test_context_relevance_resolve(backend): +def test_context_relevance_resolve(backend_4_0): """Verify context relevance when question is resolved from context.""" context, question, docs = _read_input_json("context_relevance.json") context = context.add(Message("user", question)) document = docs[0] - result = rag.check_context_relevance(None, document, context, backend) + result = rag.check_context_relevance(None, document, context, backend_4_0) assert result == "irrelevant" diff --git a/test/stdlib/components/intrinsic/testdata/output_json/citations.json b/test/stdlib/components/intrinsic/testdata/output_json/citations.json index d62175369..001745828 100644 --- a/test/stdlib/components/intrinsic/testdata/output_json/citations.json +++ b/test/stdlib/components/intrinsic/testdata/output_json/citations.json @@ -4,9 +4,9 @@ "response_end": 91, "response_text": "Murdoch expanded in Australia and New Zealand by acquiring 
and expanding local newspapers. ", "citation_doc_id": "0", - "citation_begin": 598, + "citation_begin": 692, "citation_end": 1030, - "citation_text": "Rupert Murdoch turned its Adelaide newspaper, The News, its main asset, into a major success. He began to direct his attention to acquisition and expansion, buying the troubled Sunday Times in Perth, Western Australia (1956) and over the next few years acquiring suburban and provincial newspapers in New South Wales, Queensland, Victoria and the Northern Territory, including the Sydney afternoon tabloid, The Daily Mirror (1960). " + "citation_text": "He began to direct his attention to acquisition and expansion, buying the troubled Sunday Times in Perth, Western Australia (1956) and over the next few years acquiring suburban and provincial newspapers in New South Wales, Queensland, Victoria and the Northern Territory, including the Sydney afternoon tabloid, The Daily Mirror (1960). " }, { "response_begin": 0, @@ -16,5 +16,14 @@ "citation_begin": 1219, "citation_end": 1346, "citation_text": "Murdoch's first foray outside Australia involved the purchase of a controlling interest in the New Zealand daily The Dominion. " + }, + { + "response_begin": 0, + "response_end": 91, + "response_text": "Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. ", + "citation_doc_id": "0", + "citation_begin": 1739, + "citation_end": 2044, + "citation_text": "Later in 1964, Murdoch launched The Australian, Australia's first national daily newspaper, which was based first in Canberra and later in Sydney. In 1972, Murdoch acquired the Sydney morning tabloid The Daily Telegraph from Australian media mogul Sir Frank Packer, who later regretted selling it to him. 
" } ] \ No newline at end of file diff --git a/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json b/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json index 21924c116..072598fed 100644 --- a/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json +++ b/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json @@ -4,13 +4,13 @@ "response_end": 31, "response_text": "Purple bumble fish are yellow. ", "faithfulness": "faithful", - "explanation": "This sentence makes a factual claim about the color of purple bumble fish. The provided context states: 'The only type of fish that is yellow is the purple bumble fish.' This directly supports the claim in the sentence." + "explanation": "This sentence makes a factual claim about the color of purple bumble fish. The document states 'The only type of fish that is yellow is the purple bumble fish.' This matches exactly with the claim in the sentence." }, { "response_begin": 31, "response_end": 65, "response_text": "Green bumble fish are also yellow.", "faithfulness": "unfaithful", - "explanation": "This sentence makes a factual claim about the color of green bumble fish. However, the provided context does not mention green bumble fish, so the claim cannot be verified based on the context." + "explanation": "This sentence makes a factual claim about the color of green bumble fish. However, the document does not mention anything about the color of green bumble fish, making this claim 'unfaithful'." 
} ] \ No newline at end of file From dc91cc7c659e2c623be5adfd1fa0360bfde29463 Mon Sep 17 00:00:00 2001 From: Fred Reiss Date: Tue, 5 May 2026 13:38:38 -0700 Subject: [PATCH 11/13] Update test/stdlib/components/intrinsic/test_rag.py Co-authored-by: Alex Bozarth --- test/stdlib/components/intrinsic/test_rag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/stdlib/components/intrinsic/test_rag.py b/test/stdlib/components/intrinsic/test_rag.py index b570ddd11..f66059128 100644 --- a/test/stdlib/components/intrinsic/test_rag.py +++ b/test/stdlib/components/intrinsic/test_rag.py @@ -52,7 +52,7 @@ def _backend(): @pytest.fixture(name="backend_4_0", scope="module") def _backend_4_0(): - """Granite 4.0 backend used only by tests that don't have Granite 4.1 models..""" + """Granite 4.0 backend used only by tests that don't have Granite 4.1 models.""" # Prevent thrashing if the default device is CPU torch.set_num_threads(4) From 82eea754c291f77decaca130bad442d75b337e20 Mon Sep 17 00:00:00 2001 From: Fred Reiss Date: Tue, 5 May 2026 13:39:29 -0700 Subject: [PATCH 12/13] Update test/stdlib/components/intrinsic/test_rag.py Co-authored-by: Alex Bozarth --- test/stdlib/components/intrinsic/test_rag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/stdlib/components/intrinsic/test_rag.py b/test/stdlib/components/intrinsic/test_rag.py index f66059128..bc76fa650 100644 --- a/test/stdlib/components/intrinsic/test_rag.py +++ b/test/stdlib/components/intrinsic/test_rag.py @@ -104,7 +104,7 @@ def _read_output_json(file_name: str): def _dump_output_json(file_name: str, to_write): """Shared code for dumping a test's generated JSON data. - Dump the Python data structures that that will be compared against canned + Dump the Python data structures that will be compared against canned JSON output files. Outputs go to the local directory ``test_output``. 
If you are sure the current output is correct, you can use this output to update From f49b497798f9beaa240a83b2db86148ad6cf5705 Mon Sep 17 00:00:00 2001 From: Fred Reiss Date: Tue, 5 May 2026 20:41:52 +0000 Subject: [PATCH 13/13] Update docstring Signed-off-by: Fred Reiss --- test/formatters/granite/test_intrinsics_formatters.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/formatters/granite/test_intrinsics_formatters.py b/test/formatters/granite/test_intrinsics_formatters.py index 774eb64c7..a9e12b3be 100644 --- a/test/formatters/granite/test_intrinsics_formatters.py +++ b/test/formatters/granite/test_intrinsics_formatters.py @@ -37,9 +37,8 @@ def _read_file(name): _TEST_DATA_DIR = pathlib.Path(os.path.dirname(__file__)) / "testdata" - _TEST_OUTPUT_DIR = pathlib.Path(os.path.dirname(__file__)) / "test_output" -"""Directory string we compare against something from _TEST_DATA_DIR is written""" +"""Directory string we substitute for _TEST_DATA_DIR when writing debug outputs.""" # Location from which our tests download adapters and YAML files _RAG_INTRINSICS_REPO_NAME = "ibm-granite/granitelib-rag-r1.0"