diff --git a/mellea/formatters/template_formatter.py b/mellea/formatters/template_formatter.py index d6c17796a..e21066cc7 100644 --- a/mellea/formatters/template_formatter.py +++ b/mellea/formatters/template_formatter.py @@ -145,7 +145,7 @@ def _render_representation( if expected_vars: unused_keys = set(stringified_template_args.keys()) - expected_vars if unused_keys: - MelleaLogger.get_logger().warn( + MelleaLogger.get_logger().warning( f"TemplateRepresentation for {representation.obj.__class__.__name__} " f"provides keys not referenced by template '{template.name}': {sorted(unused_keys)}" ) diff --git a/test/formatters/granite/test_intrinsics_formatters.py b/test/formatters/granite/test_intrinsics_formatters.py index 0a1c2a2f9..a9e12b3be 100644 --- a/test/formatters/granite/test_intrinsics_formatters.py +++ b/test/formatters/granite/test_intrinsics_formatters.py @@ -37,17 +37,73 @@ def _read_file(name): _TEST_DATA_DIR = pathlib.Path(os.path.dirname(__file__)) / "testdata" +_TEST_OUTPUT_DIR = pathlib.Path(os.path.dirname(__file__)) / "test_output" +"""Directory string we substitute for _TEST_DATA_DIR when writing debug outputs.""" # Location from which our tests download adapters and YAML files _RAG_INTRINSICS_REPO_NAME = "ibm-granite/granitelib-rag-r1.0" _CORE_R1_REPO_NAME = "ibm-granite/granitelib-core-r1.0" +_DEFAULT_BASE_MODEL = "ibm-granite/granite-4.1-3b" + _INPUT_JSON_DIR = _TEST_DATA_DIR / "input_json" _INPUT_YAML_DIR = _TEST_DATA_DIR / "input_yaml" _INPUT_ARGS_DIR = _TEST_DATA_DIR / "input_args" +def _substitute_root( + child_path: pathlib.Path, old_root: pathlib.Path, new_root: pathlib.Path +): + """Change a path rooted in one root directory to the same path rooted in another. + + Handles common corner cases such as when a given path has multiple equivalent + string representations. + + Args: + child_path: A path that is a descendant of a known root. + old_root: Root directory that is an ancestor of ``child_path``. 
+ new_root: Root directory to substitute for ``old_root``. + + Returns: + A version of ``child_path`` in which the prefix corresponding to + ``old_root`` has been replaced with ``new_root``. + """ + # Resolve paths to handle symlinks, relative components, and other corner cases + child_path = child_path.resolve() + old_root = old_root.resolve() + new_root = new_root.resolve() + + # Get the relative path from old_root to child_path + try: + relative_path = child_path.relative_to(old_root) + except ValueError: + raise ValueError(f"{child_path} is not a descendant of {old_root}") + + # Construct new path with the new root + return new_root / relative_path + + +def _dump_output(expected_file: pathlib.Path, actual_string: str): + """Dump outputs to disk to aid debugging. + + Given the string representation of something that the current test is about to + compare against a canned output and the location of said canned output, write + the string to a controlled place on the filesystem to aid debugging. + + Args: + expected_file: Location of the file we're going to compare against. + actual_string: String that the current test case produced. + """ + actual_file = _substitute_root(expected_file, _TEST_DATA_DIR, _TEST_OUTPUT_DIR) + + if not os.path.exists(actual_file.parent): + os.makedirs(actual_file.parent) + + with open(actual_file, "w", encoding="utf-8") as f: + f.write(actual_string) + + class YamlJsonCombo(pydantic.BaseModel): """Dataclass that drives configuration for most tests in this file.""" @@ -71,7 +127,7 @@ class YamlJsonCombo(pydantic.BaseModel): loaded.""" revision: str = "main" """Revision or branch of the Hugging Face `repo_id`.""" - base_model_id: str = "ibm-granite/granite-4.0-micro" + base_model_id: str = _DEFAULT_BASE_MODEL """Base model on which the target adapter was trained. 
Should be small enough to run on the CI server.""" @@ -131,13 +187,6 @@ def _resolve_yaml(self): inputs_file=_INPUT_JSON_DIR / "hallucination_detection.json", task="hallucination_detection", ), - # aLoRA adapter for this intrinsic not currently available - # YamlJsonCombo( - # short_name="hallucination_detection_alora", - # inputs_file=_INPUT_JSON_DIR / "hallucination_detection.json", - # task="hallucination_detection", - # is_alora=True - # ), YamlJsonCombo( short_name="query_clarification", inputs_file=_INPUT_JSON_DIR / "query_clarification.json", @@ -148,50 +197,13 @@ def _resolve_yaml(self): inputs_file=_INPUT_JSON_DIR / "query_rewrite.json", task="query_rewrite", ), - # NOTE for the following two entries: - # The "requirement_check" intrinsic has not yet been ported to the latest format - # or to Granite 4.0. - YamlJsonCombo( - short_name="requirement_check", - inputs_file=_INPUT_JSON_DIR / "requirement_check.json", - arguments_file=_INPUT_ARGS_DIR / "requirement_check.json", - task="requirement_check", - # Granite 4.0 adapters not currently available - repo_id="ibm-granite/rag-intrinsics-lib", - base_model_id="ibm-granite/granite-3.3-2b-instruct", - ), - YamlJsonCombo( - short_name="requirement_check_alora", - inputs_file=_INPUT_JSON_DIR / "requirement_check.json", - arguments_file=_INPUT_ARGS_DIR / "requirement_check.json", - task="requirement_check", - is_alora=True, - # Granite 4.0 adapters not currently available - repo_id="ibm-granite/rag-intrinsics-lib", - base_model_id="ibm-granite/granite-3.3-2b-instruct", - ), - YamlJsonCombo( - short_name="uncertainty", - inputs_file=_INPUT_JSON_DIR / "uncertainty.json", - task="uncertainty", - # Granite 4.0 adapters not currently available - repo_id="ibm-granite/granitelib-core-r1.0", - revision="c9c189f5ad0b2890660397070613fda46d6ceb80", - ), - # aLoRA adapter for this intrinsic not currently available - # YamlJsonCombo( - # short_name="uncertainty_alora", - # inputs_file=_INPUT_JSON_DIR / "uncertainty.json", - # 
task="uncertainty", - # is_alora=True, - # # Granite 4.0 adapters not currently available - # repo_id="ibm-granite/granitelib-core-r1.0", - # ), YamlJsonCombo( short_name="context_relevance", inputs_file=_INPUT_JSON_DIR / "context_relevance.json", arguments_file=_INPUT_ARGS_DIR / "context_relevance.json", task="context_relevance", + # No Granite 4.1 version of this adapter + base_model_id="ibm-granite/granite-4.0-micro", ), YamlJsonCombo( short_name="context_relevance_alora", @@ -199,25 +211,22 @@ def _resolve_yaml(self): arguments_file=_INPUT_ARGS_DIR / "context_relevance.json", task="context_relevance", is_alora=True, + # No Granite 4.1 version of this adapter + base_model_id="ibm-granite/granite-4.0-micro", ), YamlJsonCombo( short_name="citations", inputs_file=_INPUT_JSON_DIR / "citations.json", task="citations", ), - # aLoRA adapter for this intrinsic not currently available - # YamlJsonCombo( - # short_name="citations_alora", - # inputs_file=_INPUT_JSON_DIR / "citations.json", - # task="citations", - # is_alora=True, - # ), YamlJsonCombo( short_name="context-attribution", inputs_file=_INPUT_JSON_DIR / "context-attribution.json", task="context-attribution", repo_id="ibm-granite/granitelib-core-r1.0", revision="c9c189f5ad0b2890660397070613fda46d6ceb80", + # No Granite 4.1 version of this adapter at the selected Git commit + base_model_id="ibm-granite/granite-4.0-micro", ), # gpt-oss-20b intrinsics (canned output tests only, no inference) YamlJsonCombo( @@ -489,6 +498,7 @@ def test_canned_input(yaml_json_combo_no_alora): after_json = after.model_dump_json(indent=2) expected_file = _CANNED_INPUT_EXPECTED_DIR / f"{cfg.short_name}.json" + _dump_output(expected_file, after_json) with open(expected_file, encoding="utf-8") as f: expected_json = f.read() @@ -714,15 +724,13 @@ def test_run_transformers(yaml_json_combo_with_model, gh_run): # Output processing transformed_responses = result_processor.transform(responses, transformed_input) - - # Pull this string out of the 
debugger to create a fresh expected file. transformed_str = transformed_responses.model_dump_json(indent=4) - print(transformed_str) - with open( - _TEST_DATA_DIR / f"test_run_transformers/{cfg.short_name}.json", - encoding="utf-8", - ) as f: + # If you are certain that the output is correct, you can use the file written here + # to create a fresh expected file. + expected_file = _TEST_DATA_DIR / f"test_run_transformers/{cfg.short_name}.json" + _dump_output(expected_file, transformed_str) + with open(expected_file, encoding="utf-8") as f: expected = ChatCompletionResponse.model_validate_json(f.read()) # expected_str = expected.model_dump_json(indent=4) @@ -749,7 +757,7 @@ def test_run_transformers(yaml_json_combo_with_model, gh_run): assert t_json == pytest.approx(e_json, abs=0.1) except AssertionError as e: - # Known intermittent failure under Transformers 5.0 + # Known intermittent failure under Transformers 5.0. if cfg.short_name == "hallucination_detection": pytest.xfail("Known failure due to Transformers 5.0") raise e diff --git a/test/formatters/granite/test_output/.gitignore b/test/formatters/granite/test_output/.gitignore new file mode 100644 index 000000000..07c5555a0 --- /dev/null +++ b/test/formatters/granite/test_output/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything... +* +# ...except this file +!.gitignore \ No newline at end of file diff --git a/test/formatters/granite/testdata/input_json/requirement_check.json b/test/formatters/granite/testdata/input_json/requirement_check.json deleted file mode 100644 index 37fcef83b..000000000 --- a/test/formatters/granite/testdata/input_json/requirement_check.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "messages": [ - { - "role": "assistant", - "content": "Hello there, welcome to math questions!" - }, - { - "content": "What is the square root of 4?", - "role": "user" - }, - { - "role": "assistant", - "content": "The square root of 4 is 2." 
- }, - { - "content": "What is six times seven?", - "role": "user" - } - ], - "extra_body": { - "documents": [ - { - "doc_id": "1", - "text": "\nHere's a list of math problems that are on the homework assignment:\n\n1) 12+5=17\n2) 20-8=12\n3) 6*7=42\n4) 45/9=5\n5) 13*2=26\n6) 99+11=110\n7) 100-35=65\n\n" - } - ] - } -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/input_json/uncertainty.json b/test/formatters/granite/testdata/input_json/uncertainty.json deleted file mode 100644 index 69f817ba0..000000000 --- a/test/formatters/granite/testdata/input_json/uncertainty.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "messages": [ - { - "role": "assistant", - "content": "Welcome to pet questions!" - }, - { - "role": "user", - "content": "Which of my pets have fleas?" - } - ], - "max_completion_tokens": 1024, - "extra_body": { - "documents": [ - { - "doc_id": "1", - "text": "My dog has fleas." - }, - { - "doc_id": "2", - "text": "My cat does not have fleas." - } - ] - } -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_input/requirement_check.json b/test/formatters/granite/testdata/test_canned_input/requirement_check.json deleted file mode 100644 index f110d3901..000000000 --- a/test/formatters/granite/testdata/test_canned_input/requirement_check.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "messages": [ - { - "content": "Hello there, welcome to math questions!", - "role": "assistant" - }, - { - "content": "What is the square root of 4?", - "role": "user" - }, - { - "content": "The square root of 4 is 2.", - "role": "assistant" - }, - { - "content": "What is six times seven?", - "role": "user" - }, - { - "content": "Check if this requirement was satisfied:\n The user's question is not one of the homework questions given in the provided documents.\n", - "role": "user" - } - ], - "extra_body": { - "documents": [ - { - "text": "\nHere's a list of math problems that are on the homework assignment:\n\n1) 12+5=17\n2) 
20-8=12\n3) 6*7=42\n4) 45/9=5\n5) 13*2=26\n6) 99+11=110\n7) 100-35=65\n\n", - "doc_id": "1" - } - ], - "structured_outputs": { - "json": { - "type": "boolean" - } - } - }, - "max_completion_tokens": 5, - "logprobs": true, - "top_logprobs": 10 -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_input/uncertainty.json b/test/formatters/granite/testdata/test_canned_input/uncertainty.json deleted file mode 100644 index 70d584a6e..000000000 --- a/test/formatters/granite/testdata/test_canned_input/uncertainty.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "messages": [ - { - "content": "Welcome to pet questions!", - "role": "assistant" - }, - { - "content": "Which of my pets have fleas?", - "role": "user" - } - ], - "extra_body": { - "documents": [ - { - "text": "My dog has fleas.", - "doc_id": "1" - }, - { - "text": "My cat does not have fleas.", - "doc_id": "2" - } - ], - "structured_outputs": { - "json": { - "type": "object", - "properties": { - "score": { - "type": "string", - "enum": [ - "0", - "1", - "2", - "3", - "4", - "5", - "6", - "7", - "8", - "9" - ] - } - }, - "required": [ - "score" - ], - "additionalProperties": false - } - } - }, - "max_completion_tokens": 15, - "temperature": 0.0, - "logprobs": true, - "top_logprobs": 10 -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_output/expected_result/requirement_check.json b/test/formatters/granite/testdata/test_canned_output/expected_result/requirement_check.json deleted file mode 100644 index 6ea04fe57..000000000 --- a/test/formatters/granite/testdata/test_canned_output/expected_result/requirement_check.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "{\"requirement_likelihood\": 1.0}", - "role": "assistant", - "tool_calls": [], - "reasoning_content": null - }, - "finish_reason": "stop" - } - ], - "prompt_logprobs": null, - "id": "chatcmpl-161ba1db8d6b4d05bbd5658b6d27c798", - "created": 
1758304397, - "model": "requirement_check", - "object": "chat.completion", - "service_tier": null, - "system_fingerprint": null, - "usage": { - "completion_tokens": 2, - "prompt_tokens": 293, - "total_tokens": 295, - "completion_tokens_details": null, - "prompt_tokens_details": null - }, - "kv_transfer_params": null -} diff --git a/test/formatters/granite/testdata/test_canned_output/expected_result/uncertainty.json b/test/formatters/granite/testdata/test_canned_output/expected_result/uncertainty.json deleted file mode 100644 index 71426909b..000000000 --- a/test/formatters/granite/testdata/test_canned_output/expected_result/uncertainty.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "{\"certainty\": 0.4597152663768006}", - "role": "assistant" - } - } - ] -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_output/model_output/requirement_check.json b/test/formatters/granite/testdata/test_canned_output/model_output/requirement_check.json deleted file mode 100644 index 613baa490..000000000 --- a/test/formatters/granite/testdata/test_canned_output/model_output/requirement_check.json +++ /dev/null @@ -1,373 +0,0 @@ -{ - "id": "chatcmpl-161ba1db8d6b4d05bbd5658b6d27c798", - "choices": [ - { - "finish_reason": "stop", - "index": 0, - "logprobs": { - "content": [ - { - "token": "true", - "bytes": [ - 116, - 114, - 117, - 101 - ], - "logprob": -0.0052339909598231316, - "top_logprobs": [ - { - "token": "true", - "bytes": [ - 116, - 114, - 117, - 101 - ], - "logprob": -0.0052339909598231316 - }, - { - "token": "false", - "bytes": [ - 102, - 97, - 108, - 115, - 101 - ], - "logprob": -5.2552337646484375 - }, - { - "token": "tr", - "bytes": [ - 116, - 114 - ], - "logprob": -16.192733764648438 - }, - { - "token": "f", - "bytes": [ - 102 - ], - "logprob": -16.692733764648438 - }, - { - "token": "t", - "bytes": [ - 116 - ], - "logprob": -16.880233764648438 - }, - { - "token": "fa", - "bytes": [ 
- 102, - 97 - ], - "logprob": -18.505233764648438 - }, - { - "token": "fal", - "bytes": [ - 102, - 97, - 108 - ], - "logprob": -19.786483764648438 - }, - { - "token": "", - "bytes": [ - 60, - 102, - 105, - 109, - 95, - 109, - 105, - 100, - 100, - 108, - 101, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "", - "bytes": [ - 60, - 102, - 105, - 109, - 95, - 112, - 114, - 101, - 102, - 105, - 120, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "<|end_of_text|>", - "bytes": [ - 60, - 124, - 101, - 110, - 100, - 95, - 111, - 102, - 95, - 116, - 101, - 120, - 116, - 124, - 62 - ], - "logprob": -9999.0 - } - ] - }, - { - "token": "<|end_of_text|>", - "bytes": [ - 60, - 124, - 101, - 110, - 100, - 95, - 111, - 102, - 95, - 116, - 101, - 120, - 116, - 124, - 62 - ], - "logprob": 0.0, - "top_logprobs": [ - { - "token": "<|end_of_text|>", - "bytes": [ - 60, - 124, - 101, - 110, - 100, - 95, - 111, - 102, - 95, - 116, - 101, - 120, - 116, - 124, - 62 - ], - "logprob": 0.0 - }, - { - "token": "", - "bytes": [ - 60, - 102, - 105, - 109, - 95, - 109, - 105, - 100, - 100, - 108, - 101, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "", - "bytes": [ - 60, - 102, - 105, - 109, - 95, - 112, - 97, - 100, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "", - "bytes": [ - 60, - 105, - 115, - 115, - 117, - 101, - 95, - 115, - 116, - 97, - 114, - 116, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "", - "bytes": [ - 60, - 105, - 115, - 115, - 117, - 101, - 95, - 99, - 111, - 109, - 109, - 101, - 110, - 116, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "", - "bytes": [ - 60, - 102, - 105, - 109, - 95, - 112, - 114, - 101, - 102, - 105, - 120, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "", - "bytes": [ - 60, - 102, - 105, - 109, - 95, - 115, - 117, - 102, - 102, - 105, - 120, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "", - "bytes": [ - 60, - 103, - 104, - 95, - 115, - 116, - 97, - 114, - 115, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "", - 
"bytes": [ - 60, - 105, - 115, - 115, - 117, - 101, - 95, - 99, - 108, - 111, - 115, - 101, - 100, - 62 - ], - "logprob": -9999.0 - }, - { - "token": "", - "bytes": [ - 60, - 102, - 105, - 108, - 101, - 110, - 97, - 109, - 101, - 62 - ], - "logprob": -9999.0 - } - ] - } - ], - "refusal": null - }, - "message": { - "content": "true", - "refusal": null, - "role": "assistant", - "annotations": null, - "audio": null, - "function_call": null, - "tool_calls": [], - "reasoning_content": null - }, - "stop_reason": null - } - ], - "created": 1758304397, - "model": "requirement_check", - "object": "chat.completion", - "service_tier": null, - "system_fingerprint": null, - "usage": { - "completion_tokens": 2, - "prompt_tokens": 293, - "total_tokens": 295, - "completion_tokens_details": null, - "prompt_tokens_details": null - }, - "prompt_logprobs": null, - "kv_transfer_params": null -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_canned_output/model_output/uncertainty.json b/test/formatters/granite/testdata/test_canned_output/model_output/uncertainty.json deleted file mode 100644 index cd88187a1..000000000 --- a/test/formatters/granite/testdata/test_canned_output/model_output/uncertainty.json +++ /dev/null @@ -1,560 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "{\"score\": \"4\"}", - "role": "assistant" - }, - "logprobs": { - "content": [ - { - "token": "{\"", - "logprob": -4.0531076592742465e-06, - "bytes": [ - 123, - 34 - ], - "top_logprobs": [ - { - "token": "{\"", - "logprob": -4.0531076592742465e-06, - "bytes": [ - 123, - 34 - ] - }, - { - "token": "{\n", - "logprob": -12.511640548706055, - "bytes": [ - 123, - 10 - ] - }, - { - "token": "{", - "logprob": -14.91580581665039, - "bytes": [ - 123 - ] - }, - { - "token": "{\n\n", - "logprob": -18.89350128173828, - "bytes": [ - 123, - 10, - 10 - ] - }, - { - "token": "{\n\n\n", - "logprob": -25.736167907714844, - "bytes": [ - 123, - 10, - 10, - 10 - ] - }, - { - 
"token": "(", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 40 - ] - }, - { - "token": "$", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 36 - ] - }, - { - "token": ")", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 41 - ] - }, - { - "token": "\"", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 34 - ] - }, - { - "token": "%", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 37 - ] - } - ] - }, - { - "token": "score", - "logprob": 0.0, - "bytes": [ - 115, - 99, - 111, - 114, - 101 - ], - "top_logprobs": [ - { - "token": "score", - "logprob": 0.0, - "bytes": [ - 115, - 99, - 111, - 114, - 101 - ] - }, - { - "token": "s", - "logprob": -22.850893020629883, - "bytes": [ - 115 - ] - }, - { - "token": "sc", - "logprob": -24.523799896240234, - "bytes": [ - 115, - 99 - ] - }, - { - "token": ")", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 41 - ] - }, - { - "token": "(", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 40 - ] - }, - { - "token": "$", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 36 - ] - }, - { - "token": "\"", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 34 - ] - }, - { - "token": "%", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 37 - ] - }, - { - "token": "*", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 42 - ] - }, - { - "token": "&", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 38 - ] - } - ] - }, - { - "token": "\":", - "logprob": 0.0, - "bytes": [ - 34, - 58 - ], - "top_logprobs": [ - { - "token": "\":", - "logprob": 0.0, - "bytes": [ - 34, - 58 - ] - }, - { - "token": "\":\"", - "logprob": -23.256229400634766, - "bytes": [ - 34, - 58, - 34 - ] - }, - { - "token": "\":\n", - "logprob": -26.574241638183594, - "bytes": [ - 34, - 58, - 10 - ] - }, - { - "token": "\"", - "logprob": -27.456998825073242, - "bytes": [ - 34 - ] - }, - { - "token": "\":\n\n", - "logprob": -30.049991607666016, - "bytes": [ - 34, - 58, - 10, - 10 - ] - }, - { - "token": "\"\n\n", - "logprob": 
-31.085031509399414, - "bytes": [ - 34, - 10, - 10 - ] - }, - { - "token": "\"\n", - "logprob": -31.665332794189453, - "bytes": [ - 34, - 10 - ] - }, - { - "token": "\"\n\n\n\n", - "logprob": -37.62274932861328, - "bytes": [ - 34, - 10, - 10, - 10, - 10 - ] - }, - { - "token": "\"\n\n\n", - "logprob": -37.79510498046875, - "bytes": [ - 34, - 10, - 10, - 10 - ] - }, - { - "token": "(", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 40 - ] - } - ] - }, - { - "token": " \"", - "logprob": 0.0, - "bytes": [ - 32, - 34 - ], - "top_logprobs": [ - { - "token": " \"", - "logprob": 0.0, - "bytes": [ - 32, - 34 - ] - }, - { - "token": "\"", - "logprob": -21.91130828857422, - "bytes": [ - 34 - ] - }, - { - "token": " ", - "logprob": -22.00535011291504, - "bytes": [ - 32 - ] - }, - { - "token": "\t", - "logprob": -25.931697845458984, - "bytes": [ - 9 - ] - }, - { - "token": " ", - "logprob": -26.31612205505371, - "bytes": [ - 32, - 32 - ] - }, - { - "token": " ", - "logprob": -27.29484748840332, - "bytes": [ - 32, - 32, - 32 - ] - }, - { - "token": " \n", - "logprob": -27.30437660217285, - "bytes": [ - 32, - 10 - ] - }, - { - "token": " ", - "logprob": -27.461917877197266, - "bytes": [ - 32, - 32, - 32, - 32 - ] - }, - { - "token": " ", - "logprob": -28.473848342895508, - "bytes": [ - 32, - 32, - 32, - 32, - 32 - ] - }, - { - "token": " ", - "logprob": -29.2908992767334, - "bytes": [ - 32, - 32, - 32, - 32, - 32, - 32, - 32, - 32, - 32, - 32, - 32 - ] - } - ] - }, - { - "token": "4", - "logprob": -1.2333626747131348, - "bytes": [ - 52 - ], - "top_logprobs": [ - { - "token": "4", - "logprob": -1.2333626747131348, - "bytes": [ - 52 - ] - }, - { - "token": "5", - "logprob": -1.3502278327941895, - "bytes": [ - 53 - ] - }, - { - "token": "3", - "logprob": -1.4603333473205566, - "bytes": [ - 51 - ] - }, - { - "token": "6", - "logprob": -2.1628003120422363, - "bytes": [ - 54 - ] - }, - { - "token": "2", - "logprob": -2.558149814605713, - "bytes": [ - 50 - ] - }, - { - "token": 
"7", - "logprob": -4.393080234527588, - "bytes": [ - 55 - ] - }, - { - "token": "0", - "logprob": -4.83385705947876, - "bytes": [ - 48 - ] - }, - { - "token": "1", - "logprob": -5.500879764556885, - "bytes": [ - 49 - ] - }, - { - "token": "8", - "logprob": -7.648081302642822, - "bytes": [ - 56 - ] - }, - { - "token": "9", - "logprob": -10.840843200683594, - "bytes": [ - 57 - ] - } - ] - }, - { - "token": "\"}", - "logprob": 0.0, - "bytes": [ - 34, - 125 - ], - "top_logprobs": [ - { - "token": "\"}", - "logprob": 0.0, - "bytes": [ - 34, - 125 - ] - }, - { - "token": "\"", - "logprob": -27.13329315185547, - "bytes": [ - 34 - ] - }, - { - "token": "\"\n\n", - "logprob": -35.55782699584961, - "bytes": [ - 34, - 10, - 10 - ] - }, - { - "token": "\"\n", - "logprob": -36.93128204345703, - "bytes": [ - 34, - 10 - ] - }, - { - "token": "\"\n\n\n\n", - "logprob": -41.27574157714844, - "bytes": [ - 34, - 10, - 10, - 10, - 10 - ] - }, - { - "token": "\"\n\n\n", - "logprob": -43.764984130859375, - "bytes": [ - 34, - 10, - 10, - 10 - ] - }, - { - "token": "(", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 40 - ] - }, - { - "token": "$", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 36 - ] - }, - { - "token": ")", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 41 - ] - }, - { - "token": "%", - "logprob": -3.4028234663852886e+38, - "bytes": [ - 37 - ] - } - ] - } - ] - } - } - ] -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_run_transformers/citations.json b/test/formatters/granite/testdata/test_run_transformers/citations.json index 67dc2bb51..6cabdcfcd 100644 --- a/test/formatters/granite/testdata/test_run_transformers/citations.json +++ b/test/formatters/granite/testdata/test_run_transformers/citations.json @@ -3,7 +3,7 @@ { "index": 0, "message": { - "content": "[{\"response_begin\": 0, \"response_end\": 91, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. 
\", \"citation_doc_id\": \"0\", \"citation_begin\": 692, \"citation_end\": 1030, \"citation_text\": \"He began to direct his attention to acquisition and expansion, buying the troubled Sunday Times in Perth, Western Australia (1956) and over the next few years acquiring suburban and provincial newspapers in New South Wales, Queensland, Victoria and the Northern Territory, including the Sydney afternoon tabloid, The Daily Mirror (1960). \"}, {\"response_begin\": 0, \"response_end\": 91, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 1219, \"citation_end\": 1346, \"citation_text\": \"Murdoch's first foray outside Australia involved the purchase of a controlling interest in the New Zealand daily The Dominion. \"}]", + "content": "[{\"response_begin\": 0, \"response_end\": 91, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 692, \"citation_end\": 1030, \"citation_text\": \"He began to direct his attention to acquisition and expansion, buying the troubled Sunday Times in Perth, Western Australia (1956) and over the next few years acquiring suburban and provincial newspapers in New South Wales, Queensland, Victoria and the Northern Territory, including the Sydney afternoon tabloid, The Daily Mirror (1960). \"}, {\"response_begin\": 0, \"response_end\": 91, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 1219, \"citation_end\": 1346, \"citation_text\": \"Murdoch's first foray outside Australia involved the purchase of a controlling interest in the New Zealand daily The Dominion. 
\"}, {\"response_begin\": 0, \"response_end\": 91, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 1739, \"citation_end\": 2044, \"citation_text\": \"Later in 1964, Murdoch launched The Australian, Australia's first national daily newspaper, which was based first in Canberra and later in Sydney. In 1972, Murdoch acquired the Sydney morning tabloid The Daily Telegraph from Australian media mogul Sir Frank Packer, who later regretted selling it to him. \"}]", "role": "assistant" } } diff --git a/test/formatters/granite/testdata/test_run_transformers/hallucination_detection.json b/test/formatters/granite/testdata/test_run_transformers/hallucination_detection.json index 6448ad68a..d5a5ce051 100644 --- a/test/formatters/granite/testdata/test_run_transformers/hallucination_detection.json +++ b/test/formatters/granite/testdata/test_run_transformers/hallucination_detection.json @@ -3,7 +3,7 @@ { "index": 0, "message": { - "content": "[{\"response_begin\": 0, \"response_end\": 31, \"response_text\": \"Purple bumble fish are yellow. \", \"faithfulness_likelihood\": 0.5825082205614557, \"explanation\": \"This sentence makes a factual claim about the color of purple bumble fish. The document states 'The only type of fish that is yellow is the purple bumble fish.' This directly supports the claim in the sentence.\"}, {\"response_begin\": 31, \"response_end\": 65, \"response_text\": \"Green bumble fish are also yellow.\", \"faithfulness_likelihood\": 0.09613224257737445, \"explanation\": \"This sentence makes a factual claim about the color of green bumble fish. However, the document does not mention green bumble fish at all. Therefore, this claim cannot be verified from the provided context.\"}]", + "content": "[{\"response_begin\": 0, \"response_end\": 31, \"response_text\": \"Purple bumble fish are yellow. 
\", \"faithfulness\": \"faithful\", \"explanation\": \"This sentence makes a factual claim about the color of purple bumble fish. The document states 'The only type of fish that is yellow is the purple bumble fish.' This matches exactly with the claim in the sentence.\"}, {\"response_begin\": 31, \"response_end\": 65, \"response_text\": \"Green bumble fish are also yellow.\", \"faithfulness\": \"unfaithful\", \"explanation\": \"This sentence makes a factual claim about the color of green bumble fish. However, the document does not mention anything about the color of green bumble fish, making this claim 'unfaithful'.\"}]", "role": "assistant" } } diff --git a/test/formatters/granite/testdata/test_run_transformers/query_clarification.json b/test/formatters/granite/testdata/test_run_transformers/query_clarification.json index 188a826b8..8fa9501f9 100644 --- a/test/formatters/granite/testdata/test_run_transformers/query_clarification.json +++ b/test/formatters/granite/testdata/test_run_transformers/query_clarification.json @@ -3,7 +3,7 @@ { "index": 0, "message": { - "content": "{\"clarification\": \"There are several languages that descended from Common Brittonic, such as Welsh, Breton, Cornish, and Cumbric. Which one are you referring to?\"}", + "content": "{\"clarification\": \"Several languages descended from Common Brittonic are still spoken or have been revived, like Welsh (a living language in Wales), Breton (spoken in Brittany, France), or Cornish (once extinct but now undergoing revitalization). 
Which one are you referring to?\"}", "role": "assistant" } } diff --git a/test/formatters/granite/testdata/test_run_transformers/requirement_check.json b/test/formatters/granite/testdata/test_run_transformers/requirement_check.json deleted file mode 100644 index 1012eaf73..000000000 --- a/test/formatters/granite/testdata/test_run_transformers/requirement_check.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "{\"requirement_likelihood\": 0.0}", - "role": "assistant" - } - } - ] -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_run_transformers/requirement_check_alora.json b/test/formatters/granite/testdata/test_run_transformers/requirement_check_alora.json deleted file mode 100644 index 1012eaf73..000000000 --- a/test/formatters/granite/testdata/test_run_transformers/requirement_check_alora.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "{\"requirement_likelihood\": 0.0}", - "role": "assistant" - } - } - ] -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_run_transformers/uncertainty.json b/test/formatters/granite/testdata/test_run_transformers/uncertainty.json deleted file mode 100644 index 6b423fcc4..000000000 --- a/test/formatters/granite/testdata/test_run_transformers/uncertainty.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "{\"certainty\": 0.4597152663768006}", - "role": "assistant" - } - } - ] -} \ No newline at end of file diff --git a/test/formatters/granite/testdata/test_run_transformers/uncertainty_alora.json b/test/formatters/granite/testdata/test_run_transformers/uncertainty_alora.json deleted file mode 100644 index 9a7742fd8..000000000 --- a/test/formatters/granite/testdata/test_run_transformers/uncertainty_alora.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "{\"certainty\": 
0.5784631010929226}", - "role": "assistant" - } - } - ] -} \ No newline at end of file diff --git a/test/stdlib/components/intrinsic/test_output/.gitignore b/test/stdlib/components/intrinsic/test_output/.gitignore new file mode 100644 index 000000000..07c5555a0 --- /dev/null +++ b/test/stdlib/components/intrinsic/test_output/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything... +* +# ...except this file +!.gitignore \ No newline at end of file diff --git a/test/stdlib/components/intrinsic/test_rag.py b/test/stdlib/components/intrinsic/test_rag.py index 14d7df06b..bc76fa650 100644 --- a/test/stdlib/components/intrinsic/test_rag.py +++ b/test/stdlib/components/intrinsic/test_rag.py @@ -10,7 +10,7 @@ torch = pytest.importorskip("torch", reason="torch not installed — install mellea[hf]") from mellea.backends.huggingface import LocalHFBackend -from mellea.backends.model_ids import IBM_GRANITE_4_MICRO_3B +from mellea.backends.model_ids import IBM_GRANITE_4_1_3B, IBM_GRANITE_4_MICRO_3B from mellea.core import ModelOutputThunk from mellea.stdlib.components import Document, Message from mellea.stdlib.components.intrinsic import rag @@ -31,6 +31,9 @@ DATA_ROOT = pathlib.Path(os.path.dirname(__file__)) / "testdata" """Location of data files for the tests in this file.""" +TEST_OUTPUT_ROOT = pathlib.Path(os.path.dirname(__file__)) / "test_output" +"""Location where the tests in this file dump internal outputs for debugging.""" + @pytest.fixture(name="backend", scope="module") def _backend(): @@ -39,7 +42,22 @@ def _backend(): torch.set_num_threads(4) # No adapters for hybrid version. 
- backend_ = LocalHFBackend(model_id=IBM_GRANITE_4_MICRO_3B.hf_model_name) # type: ignore + backend_ = LocalHFBackend(model_id=IBM_GRANITE_4_1_3B.hf_model_name) + yield backend_ + + from test.conftest import cleanup_gpu_backend + + cleanup_gpu_backend(backend_, "rag") + + +@pytest.fixture(name="backend_4_0", scope="module") +def _backend_4_0(): + """Granite 4.0 backend used only by tests that don't have Granite 4.1 models.""" + # Prevent thrashing if the default device is CPU + torch.set_num_threads(4) + + # No adapters for hybrid version. + backend_ = LocalHFBackend(model_id=IBM_GRANITE_4_MICRO_3B.hf_model_name) yield backend_ from test.conftest import cleanup_gpu_backend @@ -73,16 +91,30 @@ def _read_input_json(file_name: str): def _read_output_json(file_name: str): """Shared code for reading canned outputs stored in JSON files and converting to Mellea types. + + By convention, canned outputs hold the contents of + ``<completion>["choices"][0]["message"]["content"]``, + where ``<completion>`` is a JSON chat completion after post-processing. """ with open(DATA_ROOT / "output_json" / file_name, encoding="utf-8") as f: json_data = json.load(f) + return json_data + - # Output is in OpenAI chat completion response format. Assume only one choice. - result_str = json_data["choices"][0]["message"]["content"] +def _dump_output_json(file_name: str, to_write): + """Shared code for dumping a test's generated JSON data. - # Intrinsic outputs are always JSON, serialized to a string for OpenAI - # compatibility. - return json.loads(result_str) + Dump the Python data structures that will be compared against canned + JSON output files. Outputs go to the local directory ``test_output``. + + If you are sure the current output is correct, you can use this output to update + the contents of the ``testdata`` directory. 
+ """ + target_path = TEST_OUTPUT_ROOT / "output_json" / file_name + if not os.path.exists(target_path.parent): + os.makedirs(target_path.parent) + with open(target_path, "w", encoding="utf-8") as f: + json.dump(to_write, f, indent=2) @pytest.mark.qualitative @@ -116,7 +148,6 @@ def test_query_rewrite(backend): assert result == expected -@pytest.mark.xfail(reason="Non-deterministic citation boundaries across environments") @pytest.mark.qualitative def test_citations(backend): """Verify that the citations intrinsic functions properly.""" @@ -125,7 +156,14 @@ def test_citations(backend): # First call triggers adapter loading result = rag.find_citations(assistant_response, docs, context, backend) - assert result == expected + _dump_output_json("citations.json", result) + # There are some known differences between GPU and CPU output due to different + # matrix multiply implementations. Ignore those differences but attempt to complete + # the test when they are not present. + try: + assert result == expected + except AssertionError as ae: + pytest.xfail(f"Known differences across platforms. 
Diff was: {ae}") # Second call hits a different code path from the first one result = rag.find_citations(assistant_response, docs, context, backend) @@ -133,7 +171,7 @@ def test_citations(backend): @pytest.mark.qualitative -def test_context_relevance(backend): +def test_context_relevance(backend_4_0): """Verify that the context relevance intrinsic functions properly.""" context, question, docs = _read_input_json("context_relevance.json") @@ -141,11 +179,11 @@ def test_context_relevance(backend): document = docs[0] # First call triggers adapter loading - result = rag.check_context_relevance(question, document, context, backend) + result = rag.check_context_relevance(question, document, context, backend_4_0) assert result == "irrelevant" # Second call hits a different code path from the first one - result = rag.check_context_relevance(question, document, context, backend) + result = rag.check_context_relevance(question, document, context, backend_4_0) assert result == "irrelevant" @@ -157,14 +195,12 @@ def test_hallucination_detection(backend): # First call triggers adapter loading result = rag.flag_hallucinated_content(assistant_response, docs, context, backend) - # pytest.approx() chokes on lists of records, so we do this complicated dance. 
- for r, e in zip(result, expected, strict=True): # type: ignore - assert r == e + _dump_output_json("hallucination_detection.json", result) + assert result == expected # Second call hits a different code path from the first one result = rag.flag_hallucinated_content(assistant_response, docs, context, backend) - for r, e in zip(result, expected, strict=True): # type: ignore - assert r == e + assert result == expected @pytest.mark.qualitative @@ -231,7 +267,6 @@ def test_query_rewrite_resolve(backend): assert result == expected -@pytest.mark.xfail(reason="Non-deterministic citation boundaries across environments") @pytest.mark.qualitative def test_citations_resolve(backend): """Verify citations when response is resolved from context.""" @@ -240,17 +275,23 @@ def test_citations_resolve(backend): expected = _read_output_json("citations.json") result = rag.find_citations(None, docs, context, backend) - assert result == expected + # There are some known differences between GPU and CPU output due to different + # matrix multiply implementations. Ignore those differences but attempt to complete + # the test when they are not present. + try: + assert result == expected + except AssertionError as ae: + pytest.xfail(f"Known differences across platforms. 
Diff was: {ae}") @pytest.mark.qualitative -def test_context_relevance_resolve(backend): +def test_context_relevance_resolve(backend_4_0): """Verify context relevance when question is resolved from context.""" context, question, docs = _read_input_json("context_relevance.json") context = context.add(Message("user", question)) document = docs[0] - result = rag.check_context_relevance(None, document, context, backend) + result = rag.check_context_relevance(None, document, context, backend_4_0) assert result == "irrelevant" @@ -262,8 +303,7 @@ def test_hallucination_detection_resolve(backend): expected = _read_output_json("hallucination_detection.json") result = rag.flag_hallucinated_content(None, docs, context, backend) - for r, e in zip(result, expected, strict=True): # type: ignore - assert r == e + assert result == expected @pytest.mark.qualitative diff --git a/test/stdlib/components/intrinsic/testdata/output_json/citations.json b/test/stdlib/components/intrinsic/testdata/output_json/citations.json index 67dc2bb51..001745828 100644 --- a/test/stdlib/components/intrinsic/testdata/output_json/citations.json +++ b/test/stdlib/components/intrinsic/testdata/output_json/citations.json @@ -1,11 +1,29 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "[{\"response_begin\": 0, \"response_end\": 91, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 692, \"citation_end\": 1030, \"citation_text\": \"He began to direct his attention to acquisition and expansion, buying the troubled Sunday Times in Perth, Western Australia (1956) and over the next few years acquiring suburban and provincial newspapers in New South Wales, Queensland, Victoria and the Northern Territory, including the Sydney afternoon tabloid, The Daily Mirror (1960). 
\"}, {\"response_begin\": 0, \"response_end\": 91, \"response_text\": \"Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. \", \"citation_doc_id\": \"0\", \"citation_begin\": 1219, \"citation_end\": 1346, \"citation_text\": \"Murdoch's first foray outside Australia involved the purchase of a controlling interest in the New Zealand daily The Dominion. \"}]", - "role": "assistant" - } - } - ] -} \ No newline at end of file +[ + { + "response_begin": 0, + "response_end": 91, + "response_text": "Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. ", + "citation_doc_id": "0", + "citation_begin": 692, + "citation_end": 1030, + "citation_text": "He began to direct his attention to acquisition and expansion, buying the troubled Sunday Times in Perth, Western Australia (1956) and over the next few years acquiring suburban and provincial newspapers in New South Wales, Queensland, Victoria and the Northern Territory, including the Sydney afternoon tabloid, The Daily Mirror (1960). " + }, + { + "response_begin": 0, + "response_end": 91, + "response_text": "Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. ", + "citation_doc_id": "0", + "citation_begin": 1219, + "citation_end": 1346, + "citation_text": "Murdoch's first foray outside Australia involved the purchase of a controlling interest in the New Zealand daily The Dominion. " + }, + { + "response_begin": 0, + "response_end": 91, + "response_text": "Murdoch expanded in Australia and New Zealand by acquiring and expanding local newspapers. ", + "citation_doc_id": "0", + "citation_begin": 1739, + "citation_end": 2044, + "citation_text": "Later in 1964, Murdoch launched The Australian, Australia's first national daily newspaper, which was based first in Canberra and later in Sydney. 
In 1972, Murdoch acquired the Sydney morning tabloid The Daily Telegraph from Australian media mogul Sir Frank Packer, who later regretted selling it to him. " + } +] \ No newline at end of file diff --git a/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json b/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json index c8763bbec..072598fed 100644 --- a/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json +++ b/test/stdlib/components/intrinsic/testdata/output_json/hallucination_detection.json @@ -1,11 +1,16 @@ -{ - "choices": [ - { - "index": 0, - "message": { - "content": "[{\"response_begin\": 0, \"response_end\": 31, \"response_text\": \"Purple bumble fish are yellow. \", \"faithfulness\": \"faithful\", \"explanation\": \"This sentence makes a factual claim about the color of purple bumble fish. The document states 'The only type of fish that is yellow is the purple bumble fish.' This directly supports the claim in the sentence.\"}, {\"response_begin\": 31, \"response_end\": 65, \"response_text\": \"Green bumble fish are also yellow.\", \"faithfulness\": \"unfaithful\", \"explanation\": \"This sentence makes a factual claim about the color of green bumble fish. However, the document does not mention green bumble fish at all. Therefore, this claim cannot be verified from the provided context.\"}]", - "role": "assistant" - } - } - ] -} +[ + { + "response_begin": 0, + "response_end": 31, + "response_text": "Purple bumble fish are yellow. ", + "faithfulness": "faithful", + "explanation": "This sentence makes a factual claim about the color of purple bumble fish. The document states 'The only type of fish that is yellow is the purple bumble fish.' This matches exactly with the claim in the sentence." 
+ }, + { + "response_begin": 31, + "response_end": 65, + "response_text": "Green bumble fish are also yellow.", + "faithfulness": "unfaithful", + "explanation": "This sentence makes a factual claim about the color of green bumble fish. However, the document does not mention anything about the color of green bumble fish, making this claim 'unfaithful'." + } +] \ No newline at end of file