diff --git a/examples/evaluation_and_profiling/email_phishing_analyzer/uv.lock b/examples/evaluation_and_profiling/email_phishing_analyzer/uv.lock index 6c3ad2a99e..bf83a89b5f 100644 --- a/examples/evaluation_and_profiling/email_phishing_analyzer/uv.lock +++ b/examples/evaluation_and_profiling/email_phishing_analyzer/uv.lock @@ -196,6 +196,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/38/0e/27be9fdef66e72d64c0cdc3cc2823101b80585f8119b5c112c2e8f5f7dab/anyio-4.12.1-py3-none-any.whl", hash = "sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c", size = 113592, upload-time = "2026-01-06T11:45:19.497Z" }, ] +[[package]] +name = "appdirs" +version = "1.4.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d7/d8/05696357e0311f5b5c316d7b95f46c669dd9c15aaeecbb48c7d0aeb88c40/appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41", size = 13470, upload-time = "2020-05-11T07:59:51.037Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128", size = 9566, upload-time = "2020-05-11T07:59:49.499Z" }, +] + [[package]] name = "arize-phoenix-otel" version = "0.14.0" @@ -689,6 +698,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/50/3d/9373ad9c56321fdab5b41197068e1d8c25883b3fea29dd361f9b55116869/dill-0.4.0-py3-none-any.whl", hash = "sha256:44f54bf6412c2c8464c14e8243eb163690a9800dbe2c367330883b19c7561049", size = 119668, upload-time = "2025-04-16T00:41:47.671Z" }, ] +[[package]] +name = "diskcache" +version = "5.6.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/3f/21/1c1ffc1a039ddcc459db43cc108658f32c57d271d7289a2794e401d0fdb6/diskcache-5.6.3.tar.gz", hash = 
"sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc", size = 67916, upload-time = "2023-08-31T06:12:00.316Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3f/27/4570e78fc0bf5ea0ca45eb1de3818a23787af9b390c0b0a0033a1b8236f9/diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19", size = 45550, upload-time = "2023-08-31T06:11:58.822Z" }, +] + [[package]] name = "distro" version = "1.9.0" @@ -1954,7 +1972,7 @@ source = { editable = "." } dependencies = [ { name = "beautifulsoup4" }, { name = "networkx" }, - { name = "nvidia-nat", extra = ["eval", "langchain", "phoenix", "profiler", "test"] }, + { name = "nvidia-nat", extra = ["eval", "langchain", "phoenix", "profiler", "ragas", "test"] }, { name = "openinference-instrumentation-langchain" }, ] @@ -1966,6 +1984,15 @@ requires-dist = [ { name = "openinference-instrumentation-langchain", specifier = "==0.1.29" }, ] +[[package]] +name = "nest-asyncio" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/83/f8/51569ac65d696c8ecbee95938f89d4abf00f47d58d48f6fbabfe8f0baefe/nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe", size = 7418, upload-time = "2024-01-21T14:25:19.227Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/c4/c2971a3ba4c6103a3d10c4b0f24f461ddc027f0f09763220cf35ca1401b3/nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c", size = 5195, upload-time = "2024-01-21T14:25:17.223Z" }, +] + [[package]] name = "nest-asyncio2" version = "1.7.2" @@ -2062,6 +2089,9 @@ phoenix = [ profiler = [ { name = "nvidia-nat-profiler" }, ] +ragas = [ + { name = "nvidia-nat-ragas" }, +] test = [ { name = "nvidia-nat-test" }, ] @@ -2086,6 +2116,7 @@ requires-dist = [ { name = "nat-math-assistant-a2a-protected", marker = 
"extra == 'examples'", editable = "../../A2A/math_assistant_a2a_protected" }, { name = "nat-multi-frameworks", marker = "extra == 'examples'", editable = "../../frameworks/multi_frameworks" }, { name = "nat-notebooks", marker = "extra == 'examples'", editable = "../../notebooks" }, + { name = "nat-parallel-executor", marker = "extra == 'examples'", editable = "../../control_flow/parallel_executor" }, { name = "nat-per-user-workflow", marker = "extra == 'examples'", editable = "../../front_ends/per_user_workflow" }, { name = "nat-plot-charts", marker = "extra == 'examples'", editable = "../../custom_functions/plot_charts" }, { name = "nat-por-to-jiratickets", marker = "extra == 'examples'", editable = "../../HITL/por_to_jiratickets" }, @@ -2160,6 +2191,7 @@ requires-dist = [ { name = "nvidia-nat-rag", marker = "extra == 'rag'", editable = "../../../packages/nvidia_nat_rag" }, { name = "nvidia-nat-ragaai", marker = "extra == 'ragaai'", editable = "../../../packages/nvidia_nat_ragaai" }, { name = "nvidia-nat-ragas", marker = "extra == 'most'", editable = "../../../packages/nvidia_nat_ragas" }, + { name = "nvidia-nat-ragas", marker = "extra == 'ragas'", editable = "../../../packages/nvidia_nat_ragas" }, { name = "nvidia-nat-redis", marker = "extra == 'most'", editable = "../../../packages/nvidia_nat_redis" }, { name = "nvidia-nat-redis", marker = "extra == 'redis'", editable = "../../../packages/nvidia_nat_redis" }, { name = "nvidia-nat-s3", marker = "extra == 'most'", editable = "../../../packages/nvidia_nat_s3" }, @@ -2180,7 +2212,7 @@ requires-dist = [ { name = "nvidia-nat-zep-cloud", marker = "extra == 'zep-cloud'", editable = "../../../packages/nvidia_nat_zep_cloud" }, { name = "text-file-ingest", marker = "extra == 'examples'", editable = "../../documentation_guides/workflows/text_file_ingest" }, ] -provides-extras = ["a2a", "adk", "agno", "app", "autogen", "core", "crewai", "eval", "data-flywheel", "fastmcp", "langchain", "llama-index", "mcp", "mem0ai", 
"nemo-customizer", "openpipe-art", "opentelemetry", "phoenix", "profiler", "rag", "ragaai", "mysql", "redis", "s3", "security", "semantic-kernel", "strands", "test", "vanna", "weave", "zep-cloud", "async-endpoints", "gunicorn", "pii-defense", "most", "examples"] +provides-extras = ["a2a", "adk", "agno", "app", "autogen", "core", "crewai", "eval", "data-flywheel", "fastmcp", "langchain", "llama-index", "mcp", "mem0ai", "nemo-customizer", "openpipe-art", "opentelemetry", "phoenix", "profiler", "rag", "ragas", "ragaai", "mysql", "redis", "s3", "security", "semantic-kernel", "strands", "test", "vanna", "weave", "zep-cloud", "async-endpoints", "gunicorn", "pii-defense", "most", "examples"] [package.metadata.requires-dev] dev = [ @@ -2426,6 +2458,24 @@ requires-dist = [ ] provides-extras = ["test"] +[[package]] +name = "nvidia-nat-ragas" +source = { editable = "../../../packages/nvidia_nat_ragas" } +dependencies = [ + { name = "nvidia-nat-core" }, + { name = "nvidia-nat-eval" }, + { name = "ragas" }, +] + +[package.metadata] +requires-dist = [ + { name = "nvidia-nat-core", editable = "../../../packages/nvidia_nat_core" }, + { name = "nvidia-nat-eval", editable = "../../../packages/nvidia_nat_eval" }, + { name = "nvidia-nat-test", marker = "extra == 'test'", editable = "../../../packages/nvidia_nat_test" }, + { name = "ragas", specifier = "~=0.2.14" }, +] +provides-extras = ["test"] + [[package]] name = "nvidia-nat-test" source = { editable = "../../../packages/nvidia_nat_test" } @@ -3376,6 +3426,29 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" }, ] +[[package]] +name = "ragas" +version = "0.2.15" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "appdirs" }, + { name = "datasets" }, 
+ { name = "diskcache" }, + { name = "langchain" }, + { name = "langchain-community" }, + { name = "langchain-core" }, + { name = "langchain-openai" }, + { name = "nest-asyncio" }, + { name = "numpy" }, + { name = "openai" }, + { name = "pydantic" }, + { name = "tiktoken" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6c/0f/04fddfa94744b1c3d8901aed8832a6b4193cc8e4886881f1bb88ff055350/ragas-0.2.15.tar.gz", hash = "sha256:2d0cd77b315a9c9c02ceb0a19ca8a48e82e1d02416587a2944ea51e6e327cd7b", size = 40867766, upload-time = "2025-04-24T16:39:28.734Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f2/9b/a5641da8aab06e069885a9ffa1b4897878f14c5b9807a9e3c5f1f532a6a9/ragas-0.2.15-py3-none-any.whl", hash = "sha256:298cd3d1fe3bd21ca4d31023a55079740d7bdd27a8c915bb371cec3c50cde608", size = 190947, upload-time = "2025-04-24T16:39:25.841Z" }, +] + [[package]] name = "referencing" version = "0.37.0" diff --git a/examples/evaluation_and_profiling/simple_calculator_eval/src/nat_simple_calculator_eval/configs/config-tunable-rag-eval-atif.yml b/examples/evaluation_and_profiling/simple_calculator_eval/src/nat_simple_calculator_eval/configs/config-tunable-rag-eval-atif.yml new file mode 100644 index 0000000000..8114786ea3 --- /dev/null +++ b/examples/evaluation_and_profiling/simple_calculator_eval/src/nat_simple_calculator_eval/configs/config-tunable-rag-eval-atif.yml @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +function_groups: + calculator: + _type: calculator + +functions: + current_datetime: + _type: current_datetime + +llms: + nim_llm: + _type: nim + model_name: nvidia/nemotron-3-nano-30b-a3b + temperature: 0.0 + max_tokens: 1024 + eval_llm: + _type: nim + model_name: mistralai/mixtral-8x22b-instruct-v0.1 + temperature: 0.0 + max_tokens: 1024 + openai_llm: + _type: openai + model_name: gpt-3.5-turbo + max_tokens: 2000 + +workflow: + _type: react_agent + tool_names: [calculator, current_datetime] + llm_name: nim_llm + verbose: true + parse_agent_response_max_retries: 3 + + +eval: + general: + max_concurrency: 1 + output: + dir: .tmp/nat/examples/getting_started/simple_calculator/atif + write_atif_workflow_output: true + dataset: + _type: json + file_path: examples/getting_started/simple_calculator/data/simple_calculator.json + evaluators: + tuneable_eval: + _type: tunable_rag_evaluator + enable_atif_evaluator: true + llm_name: eval_llm + default_scoring: true + default_score_weights: + coverage: 0.5 + correctness: 0.3 + relevance: 0.2 + judge_llm_prompt: > + You are an intelligent evaluator that scores the generated answer based on the description of the expected answer. + The score is a measure of how well the generated answer matches the description of the expected answer based on the question. + Take into account the question, the relevance of the answer to the question and the quality compared to the description of the expected answer. + + Rules: + - The score must be a float of any value between 0.0 and 1.0 on a sliding scale. 
+ - The reasoning string must be concise and to the point. It should be 1 sentence and 2 only if extra description is needed. It must explain why the score was given and what is different between the generated answer and the expected answer. + - The tags and are real images and charts. diff --git a/examples/evaluation_and_profiling/simple_calculator_eval/uv.lock b/examples/evaluation_and_profiling/simple_calculator_eval/uv.lock index cade30036f..cb42530ebc 100644 --- a/examples/evaluation_and_profiling/simple_calculator_eval/uv.lock +++ b/examples/evaluation_and_profiling/simple_calculator_eval/uv.lock @@ -2070,6 +2070,7 @@ requires-dist = [ { name = "nat-math-assistant-a2a-protected", marker = "extra == 'examples'", editable = "../../A2A/math_assistant_a2a_protected" }, { name = "nat-multi-frameworks", marker = "extra == 'examples'", editable = "../../frameworks/multi_frameworks" }, { name = "nat-notebooks", marker = "extra == 'examples'", editable = "../../notebooks" }, + { name = "nat-parallel-executor", marker = "extra == 'examples'", editable = "../../control_flow/parallel_executor" }, { name = "nat-per-user-workflow", marker = "extra == 'examples'", editable = "../../front_ends/per_user_workflow" }, { name = "nat-plot-charts", marker = "extra == 'examples'", editable = "../../custom_functions/plot_charts" }, { name = "nat-por-to-jiratickets", marker = "extra == 'examples'", editable = "../../HITL/por_to_jiratickets" }, @@ -2144,6 +2145,7 @@ requires-dist = [ { name = "nvidia-nat-rag", marker = "extra == 'rag'", editable = "../../../packages/nvidia_nat_rag" }, { name = "nvidia-nat-ragaai", marker = "extra == 'ragaai'", editable = "../../../packages/nvidia_nat_ragaai" }, { name = "nvidia-nat-ragas", marker = "extra == 'most'", editable = "../../../packages/nvidia_nat_ragas" }, + { name = "nvidia-nat-ragas", marker = "extra == 'ragas'", editable = "../../../packages/nvidia_nat_ragas" }, { name = "nvidia-nat-redis", marker = "extra == 'most'", editable = 
"../../../packages/nvidia_nat_redis" }, { name = "nvidia-nat-redis", marker = "extra == 'redis'", editable = "../../../packages/nvidia_nat_redis" }, { name = "nvidia-nat-s3", marker = "extra == 'most'", editable = "../../../packages/nvidia_nat_s3" }, @@ -2164,7 +2166,7 @@ requires-dist = [ { name = "nvidia-nat-zep-cloud", marker = "extra == 'zep-cloud'", editable = "../../../packages/nvidia_nat_zep_cloud" }, { name = "text-file-ingest", marker = "extra == 'examples'", editable = "../../documentation_guides/workflows/text_file_ingest" }, ] -provides-extras = ["a2a", "adk", "agno", "app", "autogen", "core", "crewai", "eval", "data-flywheel", "fastmcp", "langchain", "llama-index", "mcp", "mem0ai", "nemo-customizer", "openpipe-art", "opentelemetry", "phoenix", "profiler", "rag", "ragaai", "mysql", "redis", "s3", "security", "semantic-kernel", "strands", "test", "vanna", "weave", "zep-cloud", "async-endpoints", "gunicorn", "pii-defense", "most", "examples"] +provides-extras = ["a2a", "adk", "agno", "app", "autogen", "core", "crewai", "eval", "data-flywheel", "fastmcp", "langchain", "llama-index", "mcp", "mem0ai", "nemo-customizer", "openpipe-art", "opentelemetry", "phoenix", "profiler", "rag", "ragas", "ragaai", "mysql", "redis", "s3", "security", "semantic-kernel", "strands", "test", "vanna", "weave", "zep-cloud", "async-endpoints", "gunicorn", "pii-defense", "most", "examples"] [package.metadata.requires-dev] dev = [ diff --git a/examples/evaluation_and_profiling/simple_web_query_eval/src/nat_simple_web_query_eval/configs/eval_config_atif.yml b/examples/evaluation_and_profiling/simple_web_query_eval/src/nat_simple_web_query_eval/configs/eval_config_atif.yml new file mode 100644 index 0000000000..8a0aeff0d0 --- /dev/null +++ b/examples/evaluation_and_profiling/simple_web_query_eval/src/nat_simple_web_query_eval/configs/eval_config_atif.yml @@ -0,0 +1,82 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +functions: + webpage_query: + _type: webpage_query + webpage_url: https://docs.smith.langchain.com + description: "Search for information about LangSmith. For any questions about LangSmith, you must use this tool!" + embedder_name: nv-embedqa-e5-v5 + current_datetime: + _type: current_datetime + +llms: + nim_llm: + _type: nim + model_name: nvidia/nemotron-3-nano-30b-a3b + temperature: 0.0 + nim_rag_eval_llm: + _type: nim + model_name: nvidia/nemotron-3-nano-30b-a3b + max_tokens: 8 + nim_trajectory_eval_llm: + _type: nim + model_name: nvidia/nemotron-3-nano-30b-a3b + temperature: 0.0 + max_tokens: 1024 + +embedders: + nv-embedqa-e5-v5: + _type: nim + model_name: nvidia/nv-embedqa-e5-v5 + +workflow: + _type: react_agent + tool_names: [webpage_query, current_datetime] + llm_name: nim_llm + verbose: true + parse_agent_response_max_retries: 3 + +eval: + general: + max_concurrency: 1 + output: + dir: ./.tmp/nat/examples/evaluation_and_profiling/simple_web_query_eval/atif/ + cleanup: true + write_atif_workflow_output: true + dataset: + _type: json + file_path: examples/evaluation_and_profiling/simple_web_query_eval/data/langsmith.json + evaluators: + # RAGAS evaluators now run through the ATIF-native evaluator lane. 
+ accuracy: + _type: ragas + enable_atif_evaluator: true + metric: AnswerAccuracy + llm_name: nim_rag_eval_llm + groundedness: + _type: ragas + enable_atif_evaluator: true + metric: ResponseGroundedness + llm_name: nim_rag_eval_llm + relevance: + _type: ragas + enable_atif_evaluator: true + metric: ContextRelevance + llm_name: nim_rag_eval_llm + trajectory_accuracy: + _type: trajectory + enable_atif_evaluator: true + llm_name: nim_trajectory_eval_llm \ No newline at end of file diff --git a/examples/evaluation_and_profiling/simple_web_query_eval/src/nat_simple_web_query_eval/configs/eval_config_llama31_atif.yml b/examples/evaluation_and_profiling/simple_web_query_eval/src/nat_simple_web_query_eval/configs/eval_config_llama31_atif.yml new file mode 100644 index 0000000000..d74abffef0 --- /dev/null +++ b/examples/evaluation_and_profiling/simple_web_query_eval/src/nat_simple_web_query_eval/configs/eval_config_llama31_atif.yml @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +functions: + webpage_query: + _type: webpage_query + webpage_url: https://docs.smith.langchain.com + description: "Search for information about LangSmith. For any questions about LangSmith, you must use this tool!" 
+ embedder_name: nv-embedqa-e5-v5 + current_datetime: + _type: current_datetime + +llms: + nim_llm: + _type: nim + model_name: meta/llama-3.1-8b-instruct + temperature: 0.0 + nim_rag_eval_llm: + _type: nim + model_name: meta/llama-3.1-70b-instruct + max_tokens: 8 + nim_trajectory_eval_llm: + _type: nim + model_name: meta/llama-3.1-70b-instruct + temperature: 0.0 + max_tokens: 1024 + +embedders: + nv-embedqa-e5-v5: + _type: nim + model_name: nvidia/nv-embedqa-e5-v5 + +workflow: + _type: react_agent + tool_names: [webpage_query, current_datetime] + llm_name: nim_llm + verbose: true + parse_agent_response_max_retries: 3 + +eval: + general: + max_concurrency: 1 + workflow_alias: nat-simple-llama-31-8b + output: + dir: ./.tmp/nat/examples/evaluation_and_profiling/simple_web_query_eval/atif/llama-31-8b + cleanup: true + write_atif_workflow_output: true + dataset: + _type: json + file_path: examples/evaluation_and_profiling/simple_web_query_eval/data/langsmith.json + profiler: + base_metrics: true + + evaluators: + accuracy: + _type: ragas + enable_atif_evaluator: true + metric: AnswerAccuracy + llm_name: nim_rag_eval_llm + groundedness: + _type: ragas + enable_atif_evaluator: true + metric: ResponseGroundedness + llm_name: nim_rag_eval_llm + relevance: + _type: ragas + enable_atif_evaluator: true + metric: ContextRelevance + llm_name: nim_rag_eval_llm + trajectory_accuracy: + _type: trajectory + enable_atif_evaluator: true + llm_name: nim_trajectory_eval_llm diff --git a/examples/evaluation_and_profiling/simple_web_query_eval/src/nat_simple_web_query_eval/configs/eval_config_llama33.yml b/examples/evaluation_and_profiling/simple_web_query_eval/src/nat_simple_web_query_eval/configs/eval_config_llama33.yml index 531b954853..2a40694d75 100644 --- a/examples/evaluation_and_profiling/simple_web_query_eval/src/nat_simple_web_query_eval/configs/eval_config_llama33.yml +++ 
b/examples/evaluation_and_profiling/simple_web_query_eval/src/nat_simple_web_query_eval/configs/eval_config_llama33.yml @@ -14,13 +14,6 @@ # limitations under the License. -general: - telemetry: - tracing: - weave: - _type: weave - project: "nat-simple" - functions: webpage_query: _type: webpage_query @@ -37,11 +30,11 @@ llms: temperature: 0.0 nim_rag_eval_llm: _type: nim - model_name: meta/llama-3.1-70b-instruct + model_name: meta/llama-3.3-70b-instruct max_tokens: 8 nim_trajectory_eval_llm: _type: nim - model_name: meta/llama-3.1-70b-instruct + model_name: meta/llama-3.3-70b-instruct temperature: 0.0 max_tokens: 1024 diff --git a/examples/evaluation_and_profiling/simple_web_query_eval/src/nat_simple_web_query_eval/configs/eval_config_llama33_atif.yml b/examples/evaluation_and_profiling/simple_web_query_eval/src/nat_simple_web_query_eval/configs/eval_config_llama33_atif.yml new file mode 100644 index 0000000000..09ef63ccee --- /dev/null +++ b/examples/evaluation_and_profiling/simple_web_query_eval/src/nat_simple_web_query_eval/configs/eval_config_llama33_atif.yml @@ -0,0 +1,82 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +functions: + webpage_query: + _type: webpage_query + webpage_url: https://docs.smith.langchain.com + description: "Search for information about LangSmith. 
For any questions about LangSmith, you must use this tool!" + embedder_name: nv-embedqa-e5-v5 + current_datetime: + _type: current_datetime + +llms: + nim_llm: + _type: nim + model_name: meta/llama-3.3-70b-instruct + temperature: 0.0 + nim_rag_eval_llm: + _type: nim + model_name: meta/llama-3.3-70b-instruct + max_tokens: 8 + nim_trajectory_eval_llm: + _type: nim + model_name: meta/llama-3.3-70b-instruct + temperature: 0.0 + max_tokens: 1024 + +embedders: + nv-embedqa-e5-v5: + _type: nim + model_name: nvidia/nv-embedqa-e5-v5 + +workflow: + _type: react_agent + tool_names: [webpage_query, current_datetime] + llm_name: nim_llm + verbose: true + parse_agent_response_max_retries: 3 + +eval: + general: + max_concurrency: 1 + output: + dir: ./.tmp/nat/examples/evaluation_and_profiling/simple_web_query_eval/atif/llama-33-70b + cleanup: true + write_atif_workflow_output: true + dataset: + _type: json + file_path: examples/evaluation_and_profiling/simple_web_query_eval/data/langsmith.json + evaluators: + # RAGAS evaluators now run through the ATIF-native evaluator lane. 
+ accuracy: + _type: ragas + enable_atif_evaluator: true + metric: AnswerAccuracy + llm_name: nim_rag_eval_llm + groundedness: + _type: ragas + enable_atif_evaluator: true + metric: ResponseGroundedness + llm_name: nim_rag_eval_llm + relevance: + _type: ragas + enable_atif_evaluator: true + metric: ContextRelevance + llm_name: nim_rag_eval_llm + trajectory_accuracy: + _type: trajectory + enable_atif_evaluator: true + llm_name: nim_trajectory_eval_llm diff --git a/examples/evaluation_and_profiling/simple_web_query_eval/uv.lock b/examples/evaluation_and_profiling/simple_web_query_eval/uv.lock index d80266bb21..e83c43c508 100644 --- a/examples/evaluation_and_profiling/simple_web_query_eval/uv.lock +++ b/examples/evaluation_and_profiling/simple_web_query_eval/uv.lock @@ -220,6 +220,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/38/0e/27be9fdef66e72d64c0cdc3cc2823101b80585f8119b5c112c2e8f5f7dab/anyio-4.12.1-py3-none-any.whl", hash = "sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c", size = 113592, upload-time = "2026-01-06T11:45:19.497Z" }, ] +[[package]] +name = "appdirs" +version = "1.4.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d7/d8/05696357e0311f5b5c316d7b95f46c669dd9c15aaeecbb48c7d0aeb88c40/appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41", size = 13470, upload-time = "2020-05-11T07:59:51.037Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128", size = 9566, upload-time = "2020-05-11T07:59:49.499Z" }, +] + [[package]] name = "appnope" version = "0.1.4" @@ -2830,7 +2839,7 @@ name = "nat-simple-web-query-eval" source = { editable = "." 
} dependencies = [ { name = "nat-simple-web-query" }, - { name = "nvidia-nat", extra = ["eval", "langchain", "profiler", "test"] }, + { name = "nvidia-nat", extra = ["eval", "langchain", "profiler", "ragas", "test"] }, ] [package.metadata] @@ -2965,6 +2974,9 @@ profiler = [ ragaai = [ { name = "nvidia-nat-ragaai" }, ] +ragas = [ + { name = "nvidia-nat-ragas" }, +] test = [ { name = "nvidia-nat-test" }, ] @@ -2992,6 +3004,7 @@ requires-dist = [ { name = "nat-math-assistant-a2a-protected", marker = "extra == 'examples'", editable = "../../A2A/math_assistant_a2a_protected" }, { name = "nat-multi-frameworks", marker = "extra == 'examples'", editable = "../../frameworks/multi_frameworks" }, { name = "nat-notebooks", marker = "extra == 'examples'", editable = "../../notebooks" }, + { name = "nat-parallel-executor", marker = "extra == 'examples'", editable = "../../control_flow/parallel_executor" }, { name = "nat-per-user-workflow", marker = "extra == 'examples'", editable = "../../front_ends/per_user_workflow" }, { name = "nat-plot-charts", marker = "extra == 'examples'", editable = "../../custom_functions/plot_charts" }, { name = "nat-por-to-jiratickets", marker = "extra == 'examples'", editable = "../../HITL/por_to_jiratickets" }, @@ -3066,6 +3079,7 @@ requires-dist = [ { name = "nvidia-nat-rag", marker = "extra == 'rag'", editable = "../../../packages/nvidia_nat_rag" }, { name = "nvidia-nat-ragaai", marker = "extra == 'ragaai'", editable = "../../../packages/nvidia_nat_ragaai" }, { name = "nvidia-nat-ragas", marker = "extra == 'most'", editable = "../../../packages/nvidia_nat_ragas" }, + { name = "nvidia-nat-ragas", marker = "extra == 'ragas'", editable = "../../../packages/nvidia_nat_ragas" }, { name = "nvidia-nat-redis", marker = "extra == 'most'", editable = "../../../packages/nvidia_nat_redis" }, { name = "nvidia-nat-redis", marker = "extra == 'redis'", editable = "../../../packages/nvidia_nat_redis" }, { name = "nvidia-nat-s3", marker = "extra == 'most'", 
editable = "../../../packages/nvidia_nat_s3" }, @@ -3086,7 +3100,7 @@ requires-dist = [ { name = "nvidia-nat-zep-cloud", marker = "extra == 'zep-cloud'", editable = "../../../packages/nvidia_nat_zep_cloud" }, { name = "text-file-ingest", marker = "extra == 'examples'", editable = "../../documentation_guides/workflows/text_file_ingest" }, ] -provides-extras = ["a2a", "adk", "agno", "app", "autogen", "core", "crewai", "eval", "data-flywheel", "fastmcp", "langchain", "llama-index", "mcp", "mem0ai", "nemo-customizer", "openpipe-art", "opentelemetry", "phoenix", "profiler", "rag", "ragaai", "mysql", "redis", "s3", "security", "semantic-kernel", "strands", "test", "vanna", "weave", "zep-cloud", "async-endpoints", "gunicorn", "pii-defense", "most", "examples"] +provides-extras = ["a2a", "adk", "agno", "app", "autogen", "core", "crewai", "eval", "data-flywheel", "fastmcp", "langchain", "llama-index", "mcp", "mem0ai", "nemo-customizer", "openpipe-art", "opentelemetry", "phoenix", "profiler", "rag", "ragas", "ragaai", "mysql", "redis", "s3", "security", "semantic-kernel", "strands", "test", "vanna", "weave", "zep-cloud", "async-endpoints", "gunicorn", "pii-defense", "most", "examples"] [package.metadata.requires-dev] dev = [ @@ -3352,6 +3366,24 @@ requires-dist = [ ] provides-extras = ["test"] +[[package]] +name = "nvidia-nat-ragas" +source = { editable = "../../../packages/nvidia_nat_ragas" } +dependencies = [ + { name = "nvidia-nat-core" }, + { name = "nvidia-nat-eval" }, + { name = "ragas" }, +] + +[package.metadata] +requires-dist = [ + { name = "nvidia-nat-core", editable = "../../../packages/nvidia_nat_core" }, + { name = "nvidia-nat-eval", editable = "../../../packages/nvidia_nat_eval" }, + { name = "nvidia-nat-test", marker = "extra == 'test'", editable = "../../../packages/nvidia_nat_test" }, + { name = "ragas", specifier = "~=0.2.14" }, +] +provides-extras = ["test"] + [[package]] name = "nvidia-nat-test" source = { editable = "../../../packages/nvidia_nat_test" } 
@@ -4896,6 +4928,29 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b6/2f/ecce53efba5696591b615476904ab0bd847d154a0a878c061eb8e461b0a9/ragaai_catalyst-2.2.7-py3-none-any.whl", hash = "sha256:bc9b42504ea2b9d88a48c07a0164fc367b52a6e6c828712137a6682013d12a74", size = 436462, upload-time = "2025-11-26T09:37:36.372Z" }, ] +[[package]] +name = "ragas" +version = "0.2.15" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "appdirs" }, + { name = "datasets" }, + { name = "diskcache" }, + { name = "langchain" }, + { name = "langchain-community" }, + { name = "langchain-core" }, + { name = "langchain-openai" }, + { name = "nest-asyncio" }, + { name = "numpy" }, + { name = "openai" }, + { name = "pydantic" }, + { name = "tiktoken" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6c/0f/04fddfa94744b1c3d8901aed8832a6b4193cc8e4886881f1bb88ff055350/ragas-0.2.15.tar.gz", hash = "sha256:2d0cd77b315a9c9c02ceb0a19ca8a48e82e1d02416587a2944ea51e6e327cd7b", size = 40867766, upload-time = "2025-04-24T16:39:28.734Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f2/9b/a5641da8aab06e069885a9ffa1b4897878f14c5b9807a9e3c5f1f532a6a9/ragas-0.2.15-py3-none-any.whl", hash = "sha256:298cd3d1fe3bd21ca4d31023a55079740d7bdd27a8c915bb371cec3c50cde608", size = 190947, upload-time = "2025-04-24T16:39:25.841Z" }, +] + [[package]] name = "referencing" version = "0.37.0" diff --git a/packages/nvidia_nat_core/src/nat/builder/evaluator.py b/packages/nvidia_nat_core/src/nat/builder/evaluator.py index 07c3ca4698..3019a0d31f 100644 --- a/packages/nvidia_nat_core/src/nat/builder/evaluator.py +++ b/packages/nvidia_nat_core/src/nat/builder/evaluator.py @@ -22,7 +22,10 @@ class EvaluatorInfo: - def __init__(self, *, config: EvaluatorBaseConfig, evaluate_fn: Callable[[EvalInput], EvalOutput], + def __init__(self, + *, + config: EvaluatorBaseConfig, + evaluate_fn: Callable[[EvalInput], EvalOutput] | None = None, description: str): 
self.config = config self.evaluate_fn = evaluate_fn diff --git a/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py b/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py index d2785a7134..f7b6a62c8a 100644 --- a/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py +++ b/packages/nvidia_nat_core/src/nat/data_models/evaluate_config.py @@ -92,6 +92,11 @@ class EvalOutputConfig(BaseModel): workflow_output_step_filter: list[IntermediateStepType] | None = Field( default=None, description="Filter for the workflow output steps.") + write_atif_workflow_output: bool = Field( + default=False, + description="When enabled, also writes ATIF-converted workflow output to `workflow_output_atif.json` " + "for troubleshooting and debugging.") + class EvalGeneralConfig(BaseModel): """ diff --git a/packages/nvidia_nat_core/src/nat/utils/atif_message_utils.py b/packages/nvidia_nat_core/src/nat/utils/atif_message_utils.py new file mode 100644 index 0000000000..6033a5749c --- /dev/null +++ b/packages/nvidia_nat_core/src/nat/utils/atif_message_utils.py @@ -0,0 +1,50 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Shared helpers for extracting text from ATIF messages and trajectories.""" + +from __future__ import annotations + +from collections.abc import Sequence + +from nat.data_models.atif import ATIFContentPart +from nat.data_models.atif import ATIFTrajectory + + +def content_part_to_text(part: ATIFContentPart) -> str: + """Convert a single ATIF content part to text.""" + if part.type == "text": + return part.text or "" + if part.type == "image": + return part.source.path if part.source else "" + return "" + + +def message_to_text(message: str | Sequence[ATIFContentPart] | None) -> str: + """Convert ATIF message content to plain text.""" + if message is None: + return "" + if isinstance(message, str): + return message + return "\n".join([content_part_to_text(part) for part in message if content_part_to_text(part)]) + + +def trajectory_to_user_input(trajectory: ATIFTrajectory) -> str: + """Return the first non-empty user message from an ATIF trajectory.""" + for step in trajectory.steps: + if step.source == "user": + text = message_to_text(step.message) + if text: + return text + return "" diff --git a/packages/nvidia_nat_core/tests/eval/test_evaluate_callbacks.py b/packages/nvidia_nat_core/tests/eval/test_evaluate_callbacks.py index 772dd058c7..cd68d2a11a 100644 --- a/packages/nvidia_nat_core/tests/eval/test_evaluate_callbacks.py +++ b/packages/nvidia_nat_core/tests/eval/test_evaluate_callbacks.py @@ -31,7 +31,7 @@ def test_callback_manager_accepted_by_init(self): def test_callback_manager_defaults_to_empty(self): """EvaluationRun defaults callback_manager to an empty EvalCallbackManager.""" - config = EvaluationRunConfig(config_file=Path("dummy.yml")) + config = EvaluationRunConfig(config_file=Path("dummy.yml"), write_output=False) runner = EvaluationRun(config=config) assert isinstance(runner.callback_manager, EvalCallbackManager) assert not runner.callback_manager.has_callbacks diff --git a/packages/nvidia_nat_core/tests/nat/builder/test_evaluator.py 
b/packages/nvidia_nat_core/tests/nat/builder/test_evaluator.py new file mode 100644 index 0000000000..a67ad9eede --- /dev/null +++ b/packages/nvidia_nat_core/tests/nat/builder/test_evaluator.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest.mock import MagicMock + +from nat.builder.evaluator import EvaluatorInfo + + +def test_evaluator_info_allows_missing_evaluate_fn(): + """`EvaluatorInfo` should support ATIF-only evaluators.""" + info = EvaluatorInfo(config=MagicMock(), description="ATIF-only evaluator") + assert info.evaluate_fn is None diff --git a/packages/nvidia_nat_eval/scripts/compare_eval_runs.py b/packages/nvidia_nat_eval/scripts/compare_eval_runs.py new file mode 100644 index 0000000000..8decb8d267 --- /dev/null +++ b/packages/nvidia_nat_eval/scripts/compare_eval_runs.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Compare two eval run output directories. + +This script compares evaluator outputs from two run directories. +By default it prioritizes common files (RAGAS, trajectory, and tunable RAG), +and it also auto-discovers any additional ``*_output.json`` evaluator files. + +It prints: +- average score delta per evaluator +- per-item score change count +- optional per-item score diffs (with --show-item-diffs) + +Example: + python3 packages/nvidia_nat_eval/scripts/compare_eval_runs.py \ + .tmp/nat/examples/evaluation_and_profiling/simple_web_query_eval/atif/llama-33-70b \ + .tmp/nat/examples/evaluation_and_profiling/simple_web_query_eval/llama-33-70b \ + --show-item-diffs +""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path + +EVALUATOR_FILES = ( + "accuracy_output.json", + "groundedness_output.json", + "relevance_output.json", + "trajectory_accuracy_output.json", + "tuneable_eval_output.json", + "tunable_eval_output.json", +) + + +def _read_json(path: Path) -> dict: + with path.open(encoding="utf-8") as f: + return json.load(f) + + +def _score_delta(a: object, b: object) -> float | None: + if isinstance(a, (int, float)) and isinstance(b, (int, float)): + return float(a) - float(b) + return None + + +def _fmt_score(v: object) -> str: + if isinstance(v, float): + return f"{v:.6f}" + return str(v) + + +def _discover_evaluator_files(run_a: Path, run_b: Path) -> list[str]: + """Discover evaluator output files from both run directories. + + Includes all ``*_output.json`` files except workflow outputs. 
+ Preferred known evaluator files are listed first for stable output. + """ + excluded = {"workflow_output.json", "workflow_output_atif.json"} + discovered = set() + for run_dir in (run_a, run_b): + if not run_dir.exists(): + continue + for path in run_dir.glob("*_output.json"): + if path.name not in excluded: + discovered.add(path.name) + + ordered: list[str] = [] + for name in EVALUATOR_FILES: + if name in discovered: + ordered.append(name) + + for name in sorted(discovered): + if name not in ordered: + ordered.append(name) + + return ordered + + +def compare_evaluator(run_a: Path, run_b: Path, file_name: str, show_item_diffs: bool) -> None: + """Compare a single evaluator output file across two runs. + + Args: + run_a: Path to the first run output directory. + run_b: Path to the second run output directory. + file_name: Evaluator output JSON file name to compare. + show_item_diffs: Whether to print per-item score differences. + + Returns: + None. + """ + path_a = run_a / file_name + path_b = run_b / file_name + + if not path_a.exists() or not path_b.exists(): + print(f"- {file_name}: missing in one/both runs") + return + + try: + data_a = _read_json(path_a) + except (OSError, json.JSONDecodeError, ValueError) as e: + print(f"- {file_name}: unreadable in run_a ({path_a}): {e}") + return + + try: + data_b = _read_json(path_b) + except (OSError, json.JSONDecodeError, ValueError) as e: + print(f"- {file_name}: unreadable in run_b ({path_b}): {e}") + return + + avg_a = data_a.get("average_score") + avg_b = data_b.get("average_score") + delta = _score_delta(avg_a, avg_b) + + items_a = {} + skipped_a = 0 + for item in data_a.get("eval_output_items", []): + if not isinstance(item, dict): + skipped_a += 1 + continue + item_id = item.get("id") + if item_id is None: + skipped_a += 1 + continue + items_a[str(item_id)] = item + + items_b = {} + skipped_b = 0 + for item in data_b.get("eval_output_items", []): + if not isinstance(item, dict): + skipped_b += 1 + continue + 
item_id = item.get("id") + if item_id is None: + skipped_b += 1 + continue + items_b[str(item_id)] = item + + all_ids = sorted(set(items_a) | set(items_b), key=lambda x: (len(x), x)) + + changed_ids: list[str] = [] + for item_id in all_ids: + score_a = items_a.get(item_id, {}).get("score") + score_b = items_b.get(item_id, {}).get("score") + if score_a != score_b: + changed_ids.append(item_id) + + print(f"\n{file_name}") + print(f" avg_score run_a={_fmt_score(avg_a)} run_b={_fmt_score(avg_b)}", end="") + if delta is not None: + print(f" delta={delta:+.6f}") + else: + print(" delta=N/A") + print(f" item_count run_a={len(items_a)} run_b={len(items_b)} changed_items={len(changed_ids)}") + if skipped_a or skipped_b: + print(f" skipped_items run_a={skipped_a} run_b={skipped_b}") + + if show_item_diffs and changed_ids: + for item_id in changed_ids: + score_a = items_a.get(item_id, {}).get("score") + score_b = items_b.get(item_id, {}).get("score") + print(f" id={item_id} run_a={_fmt_score(score_a)} run_b={_fmt_score(score_b)}") + + +def main() -> int: + """Run the CLI to compare evaluator outputs from two run directories. + + Parses positional run directory arguments and an optional per-item diff flag, + then compares all discovered evaluator output files. + + Returns: + Process exit code. Returns 0 for normal CLI completion. 
+ """ + parser = argparse.ArgumentParser(description="Compare evaluator outputs between two eval runs.") + parser.add_argument("run_a", type=Path, help="Path to first run output directory") + parser.add_argument("run_b", type=Path, help="Path to second run output directory") + parser.add_argument("--show-item-diffs", action="store_true", help="Print per-item score deltas for changed items") + args = parser.parse_args() + + print(f"Run A: {args.run_a}") + print(f"Run B: {args.run_b}") + + evaluator_files = _discover_evaluator_files(args.run_a, args.run_b) + if not evaluator_files: + print("\nNo evaluator output files found in either run directory.") + return 0 + + for file_name in evaluator_files: + compare_evaluator(args.run_a, args.run_b, file_name, args.show_item_diffs) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/packages/nvidia_nat_eval/src/nat/plugins/eval/cli/evaluate.py b/packages/nvidia_nat_eval/src/nat/plugins/eval/cli/evaluate.py index 1b3f6645a8..918c6cf5b7 100644 --- a/packages/nvidia_nat_eval/src/nat/plugins/eval/cli/evaluate.py +++ b/packages/nvidia_nat_eval/src/nat/plugins/eval/cli/evaluate.py @@ -113,10 +113,15 @@ def write_tabular_output(eval_run_output: EvaluationRunOutput): # Print header with workflow status and runtime workflow_status = "INTERRUPTED" if eval_run_output.workflow_interrupted else "COMPLETED" total_runtime = eval_run_output.usage_stats.total_runtime if eval_run_output.usage_stats else 0.0 + workflow_output_files = ["workflow_output.json"] + if eval_run_output.workflow_output_file: + atif_workflow_output = eval_run_output.workflow_output_file.parent / "workflow_output_atif.json" + if atif_workflow_output.exists(): + workflow_output_files.append("workflow_output_atif.json") click.echo("") click.echo(click.style("=== EVALUATION SUMMARY ===", fg="bright_blue", bold=True)) - click.echo(f"Workflow Status: {workflow_status} (workflow_output.json)") + click.echo(f"Workflow Status: {workflow_status} 
({', '.join(workflow_output_files)})") click.echo(f"Total Runtime: {total_runtime:.2f}s") # Include profiler stats if available @@ -213,7 +218,7 @@ def _build_eval_callback_manager(config: EvaluationRunConfig): async def run_and_evaluate(config: EvaluationRunConfig): - from nat.eval.eval_callbacks import EvalCallbackManager + from nat.plugins.eval.eval_callbacks import EvalCallbackManager from nat.plugins.eval.exporters.file_eval_callback import FileEvalCallback callback_manager = _build_eval_callback_manager(config) or EvalCallbackManager() diff --git a/packages/nvidia_nat_eval/src/nat/plugins/eval/eval_callbacks.py b/packages/nvidia_nat_eval/src/nat/plugins/eval/eval_callbacks.py index 81fd5cdb13..b5ef0f9e4b 100644 --- a/packages/nvidia_nat_eval/src/nat/plugins/eval/eval_callbacks.py +++ b/packages/nvidia_nat_eval/src/nat/plugins/eval/eval_callbacks.py @@ -61,6 +61,7 @@ class EvalResult: evaluation_outputs: list[tuple[str, Any]] = field(default_factory=list) workflow_output_json: str | None = None + atif_workflow_output_json: str | None = None run_config: Any | None = None effective_config: Any | None = None output_dir: Path | None = None @@ -74,6 +75,7 @@ def build_eval_result( usage_stats: Any | None = None, item_span_ids: dict[str, int] | None = None, workflow_output_json: str | None = None, + atif_workflow_output_json: str | None = None, run_config: Any | None = None, effective_config: Any | None = None, output_dir: Path | None = None, @@ -118,6 +120,7 @@ def build_eval_result( items=cb_items, evaluation_outputs=evaluation_results, workflow_output_json=workflow_output_json, + atif_workflow_output_json=atif_workflow_output_json, run_config=run_config, effective_config=effective_config, output_dir=output_dir, @@ -186,8 +189,11 @@ def needs_root_span_ids(self) -> bool: def on_dataset_loaded(self, *, dataset_name: str, items: list[EvalInputItem]) -> None: for cb in self._callbacks: + fn = getattr(cb, "on_dataset_loaded", None) + if not fn: + continue try: - 
cb.on_dataset_loaded(dataset_name=dataset_name, items=items) + fn(dataset_name=dataset_name, items=items) except Exception: logger.exception("EvalCallback %s.on_dataset_loaded failed", type(cb).__name__) @@ -266,8 +272,11 @@ def evaluation_context(self): def on_eval_complete(self, result: EvalResult) -> None: for cb in self._callbacks: + fn = getattr(cb, "on_eval_complete", None) + if not fn: + continue try: - cb.on_eval_complete(result) + fn(result) except Exception: logger.exception("EvalCallback %s.on_eval_complete failed", type(cb).__name__) diff --git a/packages/nvidia_nat_eval/src/nat/plugins/eval/evaluator/atif_evaluator.py b/packages/nvidia_nat_eval/src/nat/plugins/eval/evaluator/atif_evaluator.py new file mode 100644 index 0000000000..ea7c780d86 --- /dev/null +++ b/packages/nvidia_nat_eval/src/nat/plugins/eval/evaluator/atif_evaluator.py @@ -0,0 +1,59 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""ATIF-native evaluator protocol definitions.""" + +from __future__ import annotations + +from collections.abc import Sequence +from typing import Any +from typing import Protocol +from typing import runtime_checkable + +from pydantic import BaseModel +from pydantic import Field + +from nat.data_models.atif import ATIFTrajectory +from nat.data_models.evaluator import EvalOutput + + +class AtifEvalSample(BaseModel): + """ATIF-native evaluation sample used by ATIF-backed evaluators.""" + + item_id: Any = Field(description="Identifier matching the source EvalInputItem.") + trajectory: ATIFTrajectory = Field(description="Canonical ATIF trajectory.") + expected_output_obj: Any = Field(default=None, description="Optional expected output reference.") + output_obj: Any = Field(default=None, description="Optional workflow output reference.") + metadata: dict[str, Any] = Field(default_factory=dict, description="Optional evaluator metadata.") + + +AtifEvalSampleList = Sequence[AtifEvalSample] + + +@runtime_checkable +class AtifEvaluator(Protocol): + """Protocol for evaluators that consume ATIF-native samples.""" + + async def evaluate_atif_fn(self, atif_samples: AtifEvalSampleList) -> EvalOutput: + """Evaluate using ATIF-native sample payloads.""" + ... + + +@runtime_checkable +class LegacyEvaluator(Protocol): + """Protocol for evaluators that consume legacy `EvalInput` payloads.""" + + async def evaluate_fn(self, eval_input) -> EvalOutput: + """Evaluate using legacy eval input payloads.""" + ... 
diff --git a/packages/nvidia_nat_eval/src/nat/plugins/eval/exporters/file_eval_callback.py b/packages/nvidia_nat_eval/src/nat/plugins/eval/exporters/file_eval_callback.py index a83398eda7..b405a16445 100644 --- a/packages/nvidia_nat_eval/src/nat/plugins/eval/exporters/file_eval_callback.py +++ b/packages/nvidia_nat_eval/src/nat/plugins/eval/exporters/file_eval_callback.py @@ -44,6 +44,7 @@ class FileEvalCallback: def __init__(self) -> None: self.workflow_output_file: Path | None = None + self.atif_workflow_output_file: Path | None = None self.evaluator_output_files: list[Path] = [] self.config_original_file: Path | None = None self.config_effective_file: Path | None = None @@ -149,14 +150,21 @@ def _build_run_metadata(run_config: Any) -> dict[str, Any]: def _write_workflow_output(self, result: EvalResult, output_dir: Path) -> None: """Write the serialized workflow output JSON.""" - if result.workflow_output_json is None: + if result.workflow_output_json is not None: + workflow_output_file = output_dir / "workflow_output.json" + with open(workflow_output_file, "w", encoding="utf-8") as f: + f.write(result.workflow_output_json) + self.workflow_output_file = workflow_output_file + logger.info("Workflow output written to %s", workflow_output_file) + + if result.atif_workflow_output_json is None: return - workflow_output_file = output_dir / "workflow_output.json" - with open(workflow_output_file, "w", encoding="utf-8") as f: - f.write(result.workflow_output_json) - self.workflow_output_file = workflow_output_file - logger.info("Workflow output written to %s", workflow_output_file) + atif_workflow_output_file = output_dir / "workflow_output_atif.json" + with open(atif_workflow_output_file, "w", encoding="utf-8") as f: + f.write(result.atif_workflow_output_json) + self.atif_workflow_output_file = atif_workflow_output_file + logger.info("ATIF workflow output written to %s", atif_workflow_output_file) def _write_evaluator_outputs(self, result: EvalResult, output_dir: Path) 
-> None: """Write per-evaluator result files.""" diff --git a/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/__init__.py b/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/__init__.py index e69de29bb2..a0a2fd31fe 100644 --- a/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/__init__.py +++ b/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/__init__.py @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .eval_harness import EvaluationHarness + +__all__ = ["EvaluationHarness"] diff --git a/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/atif_adapter.py b/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/atif_adapter.py new file mode 100644 index 0000000000..c0a8500daf --- /dev/null +++ b/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/atif_adapter.py @@ -0,0 +1,100 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ATIF adapter utilities for eval runtime ingress. + +This module provides a single-conversion adapter layer from ``EvalInputItem`` +trajectory data to ``ATIFTrajectory`` objects. Runtime code uses this to avoid +per-evaluator conversion and to keep ATIF as the canonical internal trace shape. +""" + +from __future__ import annotations + +from collections.abc import Mapping +from typing import Any + +from nat.data_models.atif import ATIFTrajectory +from nat.data_models.evaluator import EvalInput +from nat.data_models.evaluator import EvalInputItem +from nat.plugins.eval.evaluator.atif_evaluator import AtifEvalSample +from nat.plugins.eval.evaluator.atif_evaluator import AtifEvalSampleList +from nat.utils.atif_converter import IntermediateStepToATIFConverter + + +class EvalAtifAdapter: + """Build and cache ATIF trajectories for eval items.""" + + def __init__(self, converter: IntermediateStepToATIFConverter | None = None) -> None: + self._converter = converter or IntermediateStepToATIFConverter() + self._cache: dict[str, ATIFTrajectory] = {} + + @staticmethod + def _cache_key(item_id: Any) -> str: + item_type = type(item_id) + return f"{item_type.__module__}.{item_type.__qualname__}:{item_id!r}" + + def _coerce_trajectory(self, value: Any) -> ATIFTrajectory: + if isinstance(value, ATIFTrajectory): + return value + if isinstance(value, Mapping): + return ATIFTrajectory.model_validate(value) + raise TypeError(f"Unsupported ATIF trajectory payload type: {type(value)}") + + def get_trajectory(self, + item: EvalInputItem, + prebuilt: ATIFTrajectory | 
Mapping[str, Any] | None = None) -> ATIFTrajectory: + """Return cached ATIF trajectory for an eval item, converting at most once.""" + key = self._cache_key(item.id) + if key in self._cache: + return self._cache[key] + + if prebuilt is not None: + trajectory = self._coerce_trajectory(prebuilt) + else: + trajectory = self._converter.convert(steps=item.trajectory, session_id=key) + self._cache[key] = trajectory + return trajectory + + def _ensure_cache(self, + eval_input: EvalInput, + prebuilt_trajectories: Mapping[str, ATIFTrajectory | Mapping[str, Any]] | None = None) -> None: + """Populate cache for all eval items.""" + for item in eval_input.eval_input_items: + prebuilt = None + if prebuilt_trajectories is not None: + # Prefer type-aware cache keys but allow legacy string keys. + prebuilt = prebuilt_trajectories.get(self._cache_key(item.id)) + if prebuilt is None: + prebuilt = prebuilt_trajectories.get(str(item.id)) + self.get_trajectory(item=item, prebuilt=prebuilt) + + def build_samples( + self, + eval_input: EvalInput, + prebuilt_trajectories: Mapping[str, ATIFTrajectory | Mapping[str, Any]] | None = None + ) -> AtifEvalSampleList: + """Build ATIF-native samples for all eval input items.""" + self._ensure_cache(eval_input=eval_input, prebuilt_trajectories=prebuilt_trajectories) + samples: AtifEvalSampleList = [] + for item in eval_input.eval_input_items: + trajectory = self._cache[self._cache_key(item.id)] + samples.append( + AtifEvalSample( + item_id=item.id, + trajectory=trajectory, + expected_output_obj=item.expected_output_obj, + output_obj=item.output_obj, + metadata={}, + )) + return samples diff --git a/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/eval_harness.py b/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/eval_harness.py new file mode 100644 index 0000000000..431b705e84 --- /dev/null +++ b/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/eval_harness.py @@ -0,0 +1,88 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Lightweight ATIF-only evaluator harness. + +This harness is intentionally narrow in scope: +- it evaluates ATIF-native evaluators only (`evaluate_atif_fn`) +- it runs evaluators concurrently +- it returns per-evaluator `EvalOutput` objects + +Example: + ```python + harness = EvaluationHarness() + results = await harness.evaluate( + evaluators={"trajectory": trajectory_evaluator}, + atif_samples=atif_samples, + ) + ``` +""" + +from __future__ import annotations + +import asyncio +import logging + +from nat.data_models.evaluator import EvalOutput +from nat.plugins.eval.evaluator.atif_evaluator import AtifEvalSampleList +from nat.plugins.eval.evaluator.atif_evaluator import AtifEvaluator + +logger = logging.getLogger(__name__) + + +class EvaluationHarness: + """Run ATIF-native evaluators against a shared sample list.""" + + def __init__(self, logger_instance: logging.Logger | None = None): + self._logger = logger_instance or logger + + async def _evaluate_single(self, evaluator_name: str, evaluator: AtifEvaluator, + atif_samples: AtifEvalSampleList) -> tuple[str, EvalOutput] | None: + """Evaluate one evaluator using the ATIF lane. + + Returns: + A tuple of evaluator name and result on success, otherwise ``None``. 
+ """ + if not callable(evaluator.evaluate_atif_fn): + self._logger.warning("Skipping evaluator %s: missing callable evaluate_atif_fn", evaluator_name) + return None + + try: + eval_output = await evaluator.evaluate_atif_fn(atif_samples) + return evaluator_name, eval_output + except Exception: + # Best-effort policy: log per-evaluator failure and continue. + self._logger.exception("An error occurred while running evaluator %s", evaluator_name) + return None + + async def evaluate(self, evaluators: dict[str, AtifEvaluator], + atif_samples: AtifEvalSampleList) -> dict[str, EvalOutput]: + """Evaluate ATIF-native evaluators concurrently. + + Args: + evaluators: Evaluators keyed by evaluator name. + atif_samples: Pre-built ATIF samples shared by all evaluators. + + Returns: + A mapping of evaluator name to `EvalOutput` for successful evaluators. + """ + tasks = [ + self._evaluate_single(evaluator_name=name, evaluator=evaluator, atif_samples=atif_samples) + for name, evaluator in evaluators.items() if evaluator + ] + if not tasks: + return {} + + results = await asyncio.gather(*tasks) + return {name: output for result in results if result is not None for name, output in [result]} diff --git a/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/evaluate.py b/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/evaluate.py index 8256c8b8f5..46e2cf1390 100644 --- a/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/evaluate.py +++ b/packages/nvidia_nat_eval/src/nat/plugins/eval/runtime/evaluate.py @@ -14,9 +14,11 @@ # limitations under the License. 
import asyncio +import inspect import json import logging import shutil +from collections.abc import Awaitable from contextlib import nullcontext from datetime import UTC from datetime import datetime @@ -45,11 +47,16 @@ from nat.data_models.intermediate_step import IntermediateStepType from nat.plugins.eval.dataset_handler.dataset_handler import DatasetHandler from nat.plugins.eval.eval_callbacks import EvalCallbackManager +from nat.plugins.eval.evaluator.atif_evaluator import AtifEvaluator +from nat.plugins.eval.evaluator.atif_evaluator import LegacyEvaluator +from nat.plugins.eval.runtime.eval_harness import EvaluationHarness from nat.plugins.eval.runtime.llm_validator import validate_llm_endpoints from nat.plugins.eval.utils.output_uploader import OutputUploader from nat.runtime.session import SessionManager if TYPE_CHECKING: + from nat.plugins.eval.eval_callbacks import EvalCallbackManager + from nat.plugins.eval.evaluator.atif_evaluator import AtifEvalSampleList from nat.plugins.eval.exporters.file_eval_callback import FileEvalCallback logger = logging.getLogger(__name__) @@ -73,13 +80,22 @@ def __init__(self, config: EvaluationRunConfig, callback_manager: "EvalCallbackM # Run-specific configuration self.config: EvaluationRunConfig = config self.callback_manager: EvalCallbackManager = callback_manager or EvalCallbackManager() + if self.config.write_output: + from nat.plugins.eval.exporters.file_eval_callback import FileEvalCallback + if not any(isinstance(cb, FileEvalCallback) for cb in self.callback_manager._callbacks): + # Keep direct `EvaluationRun(...)` behavior consistent with CLI usage. 
+ self.callback_manager.register(FileEvalCallback()) self.eval_config: EvalConfig | None = None self.effective_config: Config | None = None # Stores the complete config after applying overrides # Helpers self.intermediate_step_adapter: IntermediateStepAdapter = IntermediateStepAdapter() + from nat.plugins.eval.runtime.atif_adapter import EvalAtifAdapter + self.atif_adapter = EvalAtifAdapter() + self.evaluation_harness = EvaluationHarness() # Metadata self.eval_input: EvalInput | None = None + self.atif_eval_samples: AtifEvalSampleList = [] self.workflow_interrupted: bool = False # evaluation_results is list of tuples (evaluator_name, EvalOutput) @@ -477,6 +493,15 @@ def write_output(self, dataset_handler: DatasetHandler, profiler_results: Profil self.workflow_output_file = workflow_output_file logger.info("Workflow output written to %s", workflow_output_file) + output_config = self.eval_config.general.output + if output_config and output_config.write_atif_workflow_output: + atif_workflow_output_file = self.eval_config.general.output_dir / "workflow_output_atif.json" + atif_workflow_output = json.dumps([sample.model_dump(mode="json") for sample in self.atif_eval_samples], + indent=2) + with open(atif_workflow_output_file, "w", encoding="utf-8") as f: + f.write(atif_workflow_output) + logger.info("ATIF workflow output written to %s", atif_workflow_output_file) + # Write the output of each evaluator to a separate json file for evaluator_name, eval_output in self.evaluation_results: output_file = self.eval_config.general.output_dir / f"{evaluator_name}_output.json" @@ -506,8 +531,31 @@ def publish_output(self, dataset_handler: DatasetHandler, profiler_results: Prof async def run_single_evaluator(self, evaluator_name: str, evaluator: Any): """Run a single evaluator and store its results.""" + if isinstance(evaluator, AtifEvaluator): + if not self.atif_eval_samples and self.eval_input is not None: + # Lazy-populate when run_single_evaluator is called outside 
run_and_evaluate. + self.atif_eval_samples = self.atif_adapter.build_samples(self.eval_input) + harness_results = await self.evaluation_harness.evaluate({evaluator_name: evaluator}, + self.atif_eval_samples) + eval_output = harness_results.get(evaluator_name) + if eval_output is None: + return + self.evaluation_results.append((evaluator_name, eval_output)) + if self.callback_manager: + await self.callback_manager.a_on_evaluator_score(eval_output=eval_output, evaluator_name=evaluator_name) + return + await self._run_single_legacy_evaluator(evaluator_name, evaluator) + + async def _run_single_legacy_evaluator(self, evaluator_name: str, evaluator: Any): + """Run one evaluator through the legacy `evaluate_fn` lane.""" try: - eval_output = await evaluator.evaluate_fn(self.eval_input) + evaluate_fn = getattr(evaluator, "evaluate_fn", None) + if not isinstance(evaluator, LegacyEvaluator): + raise TypeError(f"Evaluator '{evaluator_name}' is missing callable evaluate_fn and evaluate_atif_fn") + eval_result = evaluate_fn(self.eval_input) + if not inspect.isawaitable(eval_result): + raise TypeError(f"Evaluator '{evaluator_name}' evaluate_fn must return an awaitable") + eval_output = await eval_result self.evaluation_results.append((evaluator_name, eval_output)) if self.callback_manager: await self.callback_manager.a_on_evaluator_score(eval_output=eval_output, evaluator_name=evaluator_name) @@ -516,14 +564,40 @@ async def run_single_evaluator(self, evaluator_name: str, evaluator: Any): async def run_evaluators(self, evaluators: dict[str, Any]): """Run all configured evaluators asynchronously.""" - tasks = [self.run_single_evaluator(name, evaluator) for name, evaluator in evaluators.items() if evaluator] + atif_evaluators: dict[str, AtifEvaluator] = {} + legacy_evaluators: dict[str, LegacyEvaluator] = {} + for name, evaluator in evaluators.items(): + if not evaluator: + continue + if isinstance(evaluator, AtifEvaluator): + atif_evaluators[name] = evaluator + elif 
isinstance(evaluator, LegacyEvaluator): + legacy_evaluators[name] = evaluator + else: + logger.warning("Skipping evaluator %s: missing ATIF and legacy evaluator interfaces", name) - if not tasks: + if not atif_evaluators and not legacy_evaluators: logger.warning("All evaluators were empty or invalid.") return try: - await asyncio.gather(*tasks) + if atif_evaluators: + if not self.atif_eval_samples and self.eval_input is not None: + # Lazy-populate for direct callers of run_evaluators. + self.atif_eval_samples = self.atif_adapter.build_samples(self.eval_input) + harness_results = await self.evaluation_harness.evaluate(atif_evaluators, self.atif_eval_samples) + for evaluator_name, eval_output in harness_results.items(): + self.evaluation_results.append((evaluator_name, eval_output)) + if self.callback_manager: + await self.callback_manager.a_on_evaluator_score(eval_output=eval_output, + evaluator_name=evaluator_name) + + if legacy_evaluators: + tasks: list[Awaitable[None]] = [ + self._run_single_legacy_evaluator(evaluator_name=name, evaluator=evaluator) + for name, evaluator in legacy_evaluators.items() + ] + await asyncio.gather(*tasks) except Exception as e: logger.error("An error occurred while running evaluators: %s", e) raise @@ -590,10 +664,14 @@ def _on_eval_complete(self, dataset_handler: DatasetHandler | None = None) -> No from nat.plugins.eval.eval_callbacks import build_eval_result workflow_output_json: str | None = None + atif_workflow_output_json: str | None = None if dataset_handler is not None and self.eval_input is not None: step_filter = (self.eval_config.general.output.workflow_output_step_filter if self.eval_config and self.eval_config.general.output else None) workflow_output_json = dataset_handler.publish_eval_input(self.eval_input, step_filter) + if self.eval_config and self.eval_config.general.output and self.eval_config.general.output.write_atif_workflow_output: + atif_workflow_output_json = json.dumps( + [sample.model_dump(mode="json") for sample in
self.atif_eval_samples], indent=2) scores = {name: output.average_score for name, output in self.evaluation_results} result = build_eval_result( @@ -603,6 +681,7 @@ def _on_eval_complete(self, dataset_handler: DatasetHandler | None = None) -> No usage_stats=self.usage_stats, item_span_ids=self._item_span_ids, workflow_output_json=workflow_output_json, + atif_workflow_output_json=atif_workflow_output_json, run_config=self.config, effective_config=self.effective_config, output_dir=(self.eval_config.general.output_dir if self.eval_config else None), @@ -746,9 +825,20 @@ async def run_and_evaluate(self, # Pre-evaluation process the workflow output self.eval_input = dataset_handler.pre_eval_process_eval_input(self.eval_input) + evaluators = {name: eval_workflow.get_evaluator(name) for name in self.eval_config.evaluators} + needs_atif_samples = any( + callable(getattr(evaluator, "evaluate_atif_fn", None)) for evaluator in evaluators.values() + if evaluator is not None) + write_atif_workflow_output = bool(self.eval_config.general.output + and self.eval_config.general.output.write_atif_workflow_output) + if needs_atif_samples or write_atif_workflow_output: + # Build and cache ATIF trajectories when ATIF evaluators are present or ATIF workflow export is + # explicitly requested. + self.atif_eval_samples = self.atif_adapter.build_samples(self.eval_input) + else: + self.atif_eval_samples = [] # Evaluate - evaluators = {name: eval_workflow.get_evaluator(name) for name in self.eval_config.evaluators} await self.run_evaluators(evaluators) # Wait for all trace export tasks to complete (local workflows only) diff --git a/packages/nvidia_nat_eval/tests/eval/test_atif_adapter.py b/packages/nvidia_nat_eval/tests/eval/test_atif_adapter.py new file mode 100644 index 0000000000..b0488022c7 --- /dev/null +++ b/packages/nvidia_nat_eval/tests/eval/test_atif_adapter.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nat.data_models.atif import ATIFAgentConfig +from nat.data_models.atif import ATIFTrajectory +from nat.data_models.evaluator import EvalInput +from nat.data_models.evaluator import EvalInputItem +from nat.data_models.intermediate_step import IntermediateStep +from nat.data_models.intermediate_step import IntermediateStepPayload +from nat.data_models.intermediate_step import IntermediateStepType +from nat.data_models.intermediate_step import InvocationNode +from nat.data_models.intermediate_step import StreamEventData +from nat.plugins.eval.runtime.atif_adapter import EvalAtifAdapter + + +def _make_eval_input_item(item_id: str = "item-1") -> EvalInputItem: + step = IntermediateStep(parent_id="root", + function_ancestry=InvocationNode(function_name="llm_test", function_id="llm-test"), + payload=IntermediateStepPayload(event_type=IntermediateStepType.LLM_END, + data=StreamEventData(input="input", output="output"))) + return EvalInputItem(id=item_id, + input_obj="input", + expected_output_obj="expected", + output_obj="actual", + trajectory=[step], + full_dataset_entry={"id": item_id}) + + +class _CountingConverter: + + def __init__(self) -> None: + self.calls = 0 + + def convert(self, steps: list[IntermediateStep], *, session_id: str | None = None, agent_name: str | None = None): + self.calls += 1 + return ATIFTrajectory(session_id=session_id or "sid", + 
agent=ATIFAgentConfig(name=agent_name or "nat-agent", version="0.0.0")) + + +def test_private_ensure_cache_converts_once_per_item(): + converter = _CountingConverter() + adapter = EvalAtifAdapter(converter=converter) + eval_input = EvalInput(eval_input_items=[_make_eval_input_item("1")]) + + adapter._ensure_cache(eval_input) + adapter._ensure_cache(eval_input) + + assert converter.calls == 1 + + +def test_build_samples_uses_prebuilt_trajectory_without_conversion(): + converter = _CountingConverter() + adapter = EvalAtifAdapter(converter=converter) + item = _make_eval_input_item("sample-a") + eval_input = EvalInput(eval_input_items=[item]) + prebuilt = ATIFTrajectory(session_id="sample-a", agent=ATIFAgentConfig(name="prebuilt-agent", version="0.0.0")) + + samples = adapter.build_samples(eval_input, prebuilt_trajectories={"sample-a": prebuilt}) + + assert converter.calls == 0 + assert len(samples) == 1 + assert samples[0].trajectory.agent.name == "prebuilt-agent" + assert samples[0].item_id == "sample-a" diff --git a/packages/nvidia_nat_eval/tests/eval/test_eval_harness.py b/packages/nvidia_nat_eval/tests/eval/test_eval_harness.py new file mode 100644 index 0000000000..ed9be89738 --- /dev/null +++ b/packages/nvidia_nat_eval/tests/eval/test_eval_harness.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from unittest.mock import AsyncMock +from unittest.mock import patch + +from nat.data_models.evaluator import EvalOutput +from nat.data_models.evaluator import EvalOutputItem +from nat.plugins.eval.runtime.eval_harness import EvaluationHarness + + +async def test_evaluate_returns_per_evaluator_outputs(): + """Harness returns per-evaluator outputs for successful evaluators.""" + harness = EvaluationHarness() + samples = [object()] + + output_a = EvalOutput(average_score=1.0, eval_output_items=[EvalOutputItem(id=1, score=1.0, reasoning={})]) + output_b = EvalOutput(average_score=0.5, eval_output_items=[EvalOutputItem(id=1, score=0.5, reasoning={})]) + + evaluator_a = AsyncMock() + evaluator_a.evaluate_atif_fn = AsyncMock(return_value=output_a) + evaluator_b = AsyncMock() + evaluator_b.evaluate_atif_fn = AsyncMock(return_value=output_b) + + results = await harness.evaluate({"A": evaluator_a, "B": evaluator_b}, samples) + + assert list(results.keys()) == ["A", "B"] + assert results["A"] == output_a + assert results["B"] == output_b + evaluator_a.evaluate_atif_fn.assert_awaited_once_with(samples) + evaluator_b.evaluate_atif_fn.assert_awaited_once_with(samples) + + +async def test_evaluate_best_effort_when_one_evaluator_fails(): + """Harness continues and returns successful outputs when one evaluator fails.""" + harness = EvaluationHarness() + samples = [object()] + + output = EvalOutput(average_score=0.7, eval_output_items=[EvalOutputItem(id=1, score=0.7, reasoning={})]) + good_evaluator = AsyncMock() + good_evaluator.evaluate_atif_fn = AsyncMock(return_value=output) + bad_evaluator = AsyncMock() + bad_evaluator.evaluate_atif_fn = AsyncMock(side_effect=RuntimeError("boom")) + + with patch("nat.plugins.eval.runtime.eval_harness.logger.exception") as mock_log_exception: + results = await harness.evaluate({"good": good_evaluator, "bad": bad_evaluator}, samples) + + assert results == {"good": output} + mock_log_exception.assert_called_once() + 
good_evaluator.evaluate_atif_fn.assert_awaited_once_with(samples) + bad_evaluator.evaluate_atif_fn.assert_awaited_once_with(samples) + + +async def test_evaluate_skips_none_evaluator_entry(): + """Harness skips falsy evaluator entries.""" + harness = EvaluationHarness() + samples = [object()] + + with patch("nat.plugins.eval.runtime.eval_harness.logger.warning") as mock_log_warning: + results = await harness.evaluate({"missing": None}, samples) + + assert results == {} + mock_log_warning.assert_not_called() diff --git a/packages/nvidia_nat_eval/tests/eval/test_evaluate.py b/packages/nvidia_nat_eval/tests/eval/test_evaluate.py index cec19718bd..8777f3d12f 100644 --- a/packages/nvidia_nat_eval/tests/eval/test_evaluate.py +++ b/packages/nvidia_nat_eval/tests/eval/test_evaluate.py @@ -15,12 +15,15 @@ import asyncio import inspect +import json import os import time from contextlib import asynccontextmanager from pathlib import Path +from types import SimpleNamespace from unittest.mock import AsyncMock from unittest.mock import MagicMock +from unittest.mock import mock_open from unittest.mock import patch from uuid import UUID from uuid import uuid4 @@ -44,6 +47,7 @@ from nat.data_models.intermediate_step import IntermediateStepType from nat.data_models.intermediate_step import StreamEventData from nat.data_models.invocation_node import InvocationNode +from nat.plugins.eval.exporters.file_eval_callback import FileEvalCallback from nat.plugins.eval.runtime.evaluate import EvaluationRun from nat.runtime.session import SessionManager @@ -93,6 +97,12 @@ def generated_answer(): return "Generated answer" +def test_evaluation_run_registers_file_callback_by_default(default_eval_run_config): + """`EvaluationRun` should register file output callback when write_output is enabled.""" + eval_run = EvaluationRun(default_eval_run_config) + assert any(isinstance(cb, FileEvalCallback) for cb in eval_run.callback_manager._callbacks) + + @pytest.fixture def tool_end_intermediate_step(): 
"""Fixture to create a valid TOOL_END IntermediateStep.""" @@ -129,14 +139,18 @@ def eval_output(average_score): def mock_evaluator(eval_output): """Fixture to create a mock evaluator.""" - async def mock_evaluate_fn(_eval_input): - return eval_output + class LegacyEvaluatorDouble: - # Create a mock evaluator - mock_evaluator = AsyncMock() - mock_evaluator.evaluate_fn = AsyncMock(side_effect=mock_evaluate_fn) + def __init__(self, output): - return mock_evaluator + async def mock_evaluate_fn(_eval_input): + return output + + self.evaluate_fn = AsyncMock(side_effect=mock_evaluate_fn) + # Explicitly disable ATIF lane for legacy evaluator fixture. + self.evaluate_atif_fn = None + + return LegacyEvaluatorDouble(eval_output) @pytest.fixture @@ -504,6 +518,37 @@ async def test_run_single_evaluator_success(evaluation_run, mock_evaluator, eval assert result.average_score == average_score, f"Expected average score to be {average_score}" +async def test_run_single_evaluator_atif_lane(evaluation_run, eval_output): + """ATIF evaluators should run via evaluate_atif_fn and skip legacy evaluate_fn.""" + atif_evaluator = AsyncMock() + atif_evaluator.evaluate_atif_fn = AsyncMock(return_value=eval_output) + atif_evaluator.evaluate_fn = AsyncMock(side_effect=AssertionError("legacy path should not be called")) + + with patch.object(evaluation_run.evaluation_harness, "evaluate", + wraps=evaluation_run.evaluation_harness.evaluate) as mock_harness_evaluate: + await evaluation_run.run_single_evaluator("AtifEvaluator", atif_evaluator) + + atif_evaluator.evaluate_atif_fn.assert_awaited_once() + atif_evaluator.evaluate_fn.assert_not_called() + mock_harness_evaluate.assert_awaited_once() + assert evaluation_run.evaluation_results[-1][0] == "AtifEvaluator" + assert evaluation_run.evaluation_results[-1][1] == eval_output + + +async def test_run_single_evaluator_atif_lane_lazy_builds_samples(evaluation_run, eval_output): + """ATIF lane should lazily build samples when run outside 
run_and_evaluate.""" + atif_evaluator = AsyncMock() + atif_evaluator.evaluate_atif_fn = AsyncMock(return_value=eval_output) + atif_evaluator.evaluate_fn = AsyncMock(side_effect=AssertionError("legacy path should not be called")) + + with patch.object(evaluation_run.atif_adapter, "build_samples", + wraps=evaluation_run.atif_adapter.build_samples) as mock_build: + await evaluation_run.run_single_evaluator("AtifEvaluator", atif_evaluator) + + mock_build.assert_called_once() + atif_evaluator.evaluate_atif_fn.assert_awaited_once() + + async def test_run_evaluators_success(evaluation_run, mock_evaluator, eval_output, average_score): """Test for running multiple evaluators successfully.""" @@ -525,6 +570,26 @@ async def test_run_evaluators_success(evaluation_run, mock_evaluator, eval_outpu assert result.average_score == average_score, f"Expected average score to be {average_score}" +async def test_run_evaluators_uses_harness_for_atif_evaluators(evaluation_run, eval_output): + """`run_evaluators` delegates ATIF evaluator execution to `EvaluationHarness`.""" + atif_evaluator_1 = AsyncMock() + atif_evaluator_1.evaluate_atif_fn = AsyncMock(return_value=eval_output) + atif_evaluator_1.evaluate_fn = AsyncMock(side_effect=AssertionError("legacy path should not be called")) + + atif_evaluator_2 = AsyncMock() + atif_evaluator_2.evaluate_atif_fn = AsyncMock(return_value=eval_output) + atif_evaluator_2.evaluate_fn = AsyncMock(side_effect=AssertionError("legacy path should not be called")) + + with patch.object(evaluation_run.evaluation_harness, "evaluate", + wraps=evaluation_run.evaluation_harness.evaluate) as mock_harness_evaluate: + await evaluation_run.run_evaluators({"Atif1": atif_evaluator_1, "Atif2": atif_evaluator_2}) + + mock_harness_evaluate.assert_awaited_once() + atif_evaluator_1.evaluate_fn.assert_not_called() + atif_evaluator_2.evaluate_fn.assert_not_called() + assert len(evaluation_run.evaluation_results) == 2 + + async def 
test_run_evaluators_partial_failure(evaluation_run, mock_evaluator, eval_output, average_score): """ Test run_evaluators where one evaluator fails but others succeed. @@ -536,8 +601,13 @@ async def test_run_evaluators_partial_failure(evaluation_run, mock_evaluator, ev bad_evaluator_name = "BadEvaluator" # Create a failing evaluator - mock_failing_evaluator = AsyncMock() - mock_failing_evaluator.evaluate_fn.side_effect = RuntimeError("Evaluator failed") + class LegacyFailingEvaluatorDouble: + + def __init__(self): + self.evaluate_fn = AsyncMock(side_effect=RuntimeError("Evaluator failed")) + self.evaluate_atif_fn = None + + mock_failing_evaluator = LegacyFailingEvaluatorDouble() evaluators = {good_evaluator_name: mock_evaluator, bad_evaluator_name: mock_failing_evaluator} @@ -562,6 +632,229 @@ async def test_run_evaluators_partial_failure(evaluation_run, mock_evaluator, ev "Error message should indicate evaluator failure" +# Batch-3: Tests for running eval and writing results +def test_write_output(evaluation_run, default_eval_config, eval_input, eval_output, generated_answer): + """Test writing the workflow and evaluation results.""" + # Mock dataset handler to get the formatted workflow results + for eval_input_item in eval_input.eval_input_items: + eval_input_item.output_obj = generated_answer + + mock_dataset_handler = MagicMock() + workflow_output = json.dumps([item.model_dump() for item in eval_input.eval_input_items]) + mock_dataset_handler.publish_eval_input.return_value = workflow_output + + # Mock evaluation results + evaluator_name = "MockEvaluator" + evaluation_run.evaluation_results = [(evaluator_name, eval_output)] + + # Mock eval_config output directory + evaluation_run.eval_config = default_eval_config + output_dir = default_eval_config.general.output_dir + + # Workflow output must be written to workflow_output.json + workflow_output_path = output_dir / "workflow_output.json" + + # Evaluator results must be written to {evaluator_name}_output.json + 
evaluator_output_path = output_dir / f"{evaluator_name}_output.json" + + # Create a mock ProfilerResults object + mock_profiler_results = ProfilerResults() + + # Patch file operations and logging. It is important to keep logs frozen to match user expectations. + with patch("builtins.open", mock_open()) as mock_file, \ + patch("pathlib.Path.mkdir") as mock_mkdir, \ + patch("nat.plugins.eval.runtime.evaluate.logger.info") as mock_logger: + + # Run the actual function + evaluation_run.write_output(mock_dataset_handler, mock_profiler_results) + + # Ensure directories are created + mock_mkdir.assert_called() + + # Ensure the workflow output is written + mock_file.assert_any_call(workflow_output_path, "w", encoding="utf-8") + mock_file().write.assert_any_call(workflow_output) + + # Ensure the evaluator output is written + mock_file.assert_any_call(evaluator_output_path, "w", encoding="utf-8") + eval_output_dict = eval_output.model_dump_json(indent=2) + mock_file().write.assert_any_call(eval_output_dict) + + # Ensure log format has not changed + mock_logger.assert_any_call("Workflow output written to %s", workflow_output_path) + mock_logger.assert_any_call("Evaluation results written to %s", evaluator_output_path) + + +def test_write_output_writes_atif_workflow_output_when_enabled(evaluation_run, + default_eval_config, + eval_input, + eval_output): + """Test optional ATIF workflow output export for troubleshooting.""" + mock_dataset_handler = MagicMock() + mock_dataset_handler.publish_eval_input.return_value = json.dumps( + [item.model_dump() for item in eval_input.eval_input_items]) + + evaluator_name = "MockEvaluator" + evaluation_run.evaluation_results = [(evaluator_name, eval_output)] + evaluation_run.eval_config = default_eval_config + evaluation_run.eval_config.general.output.write_atif_workflow_output = True + evaluation_run.atif_eval_samples = [ + MagicMock(model_dump=MagicMock(return_value={ + "item_id": 1, "trajectory": { + "steps": [] + } + })) + ] + + 
output_dir = default_eval_config.general.output_dir + atif_workflow_output_path = output_dir / "workflow_output_atif.json" + expected_atif_output = json.dumps([{"item_id": 1, "trajectory": {"steps": []}}], indent=2) + + mock_profiler_results = ProfilerResults() + with patch("builtins.open", mock_open()) as mock_file, \ + patch("pathlib.Path.mkdir"), \ + patch("nat.plugins.eval.runtime.evaluate.logger.info") as mock_logger: + evaluation_run.write_output(mock_dataset_handler, mock_profiler_results) + + mock_file.assert_any_call(atif_workflow_output_path, "w", encoding="utf-8") + mock_file().write.assert_any_call(expected_atif_output) + mock_logger.assert_any_call("ATIF workflow output written to %s", atif_workflow_output_path) + + +def test_write_output_handles_none_output(evaluation_run, eval_input): + """This test ensures that write_output does not access .output without a None check.""" + # Setup minimal eval_config with output = None + evaluation_run.eval_config = SimpleNamespace( + general=SimpleNamespace(output=None, output_dir=Path(".tmp/nat/examples/mock/"))) + evaluation_run.eval_input = eval_input + # Mock dataset handler + mock_dataset_handler = MagicMock() + mock_dataset_handler.publish_eval_input.return_value = "[]" + # Create a mock ProfilerResults object + mock_profiler_results = ProfilerResults() + # Patch file operations and logging + with patch("builtins.open", mock_open()), \ + patch("pathlib.Path.mkdir"), \ + patch("nat.plugins.eval.runtime.evaluate.logger.info"): + # Should not raise AttributeError + try: + evaluation_run.write_output(mock_dataset_handler, mock_profiler_results) + except AttributeError: + pytest.fail("write_output should not access .output without a None check") + + +@pytest.mark.filterwarnings("ignore:.*Pydantic serializer warnings.*:UserWarning") +def test_write_configuration_with_path_config(evaluation_run, default_eval_config, tmp_path): + """Test that write_configuration correctly saves config files when config_file is a 
Path.""" + # Create a temporary config file + config_file = tmp_path / "test_config.yml" + config_file.write_text("""workflow: + type: test +eval: + general: + max_concurrency: 1 +""") + # Setup evaluation run + evaluation_run.config.config_file = config_file + evaluation_run.config.override = (("eval.general.max_concurrency", "5"), ) + evaluation_run.eval_config = default_eval_config + evaluation_run.eval_config.evaluators = {} + evaluation_run.eval_config.general.output_dir = tmp_path / "output" + + # Create a mock effective config + mock_effective_config = Config() + mock_effective_config.eval = default_eval_config + evaluation_run.effective_config = mock_effective_config + + # Run the function + with patch("nat.plugins.eval.runtime.evaluate.logger.info") as mock_logger: + evaluation_run.write_configuration() + + # Verify that all three files were created + config_original_file = evaluation_run.eval_config.general.output_dir / "config_original.yml" + config_effective_file = evaluation_run.eval_config.general.output_dir / "config_effective.yml" + config_metadata_file = evaluation_run.eval_config.general.output_dir / "config_metadata.json" + + assert config_original_file.exists(), "config_original.yml should be created" + assert config_effective_file.exists(), "config_effective.yml should be created" + assert config_metadata_file.exists(), "config_metadata.json should be created" + + # Verify metadata content + with open(config_metadata_file, encoding="utf-8") as f: + metadata = json.load(f) + assert metadata["config_file"] == str(config_file) + assert metadata["config_file_type"] == "Path" + assert len(metadata["overrides"]) == 1 + assert metadata["overrides"][0]["path"] == "eval.general.max_concurrency" + assert metadata["overrides"][0]["value"] == "5" + + # Verify logging + assert mock_logger.call_count >= 3, "Should log for all three config files" + + +@pytest.mark.filterwarnings("ignore:.*Pydantic serializer warnings.*:UserWarning") +def 
test_write_configuration_with_basemodel_config(evaluation_run, default_eval_config, tmp_path): + """Test that write_configuration correctly saves config files when config_file is a BaseModel.""" + # Setup evaluation run with BaseModel config + mock_config = Config() + default_eval_config.evaluators = {} + mock_config.eval = default_eval_config + evaluation_run.config.config_file = mock_config + evaluation_run.config.override = () # No overrides + evaluation_run.eval_config = default_eval_config + evaluation_run.eval_config.general.output_dir = tmp_path / "output" + evaluation_run.effective_config = mock_config + + # Run the function + with patch("nat.plugins.eval.runtime.evaluate.logger.info"): + evaluation_run.write_configuration() + + # Verify that all three files were created + config_original_file = evaluation_run.eval_config.general.output_dir / "config_original.yml" + config_effective_file = evaluation_run.eval_config.general.output_dir / "config_effective.yml" + config_metadata_file = evaluation_run.eval_config.general.output_dir / "config_metadata.json" + + assert config_original_file.exists(), "config_original.yml should be created" + assert config_effective_file.exists(), "config_effective.yml should be created" + assert config_metadata_file.exists(), "config_metadata.json should be created" + + # Verify metadata content + with open(config_metadata_file, encoding="utf-8") as f: + metadata = json.load(f) + assert metadata["config_file_type"] == "BaseModel" + assert len(metadata["overrides"]) == 0, "Should have no overrides" + + +def test_write_configuration_handles_missing_effective_config(evaluation_run, default_eval_config, tmp_path): + """Test that write_configuration handles gracefully when effective_config is None.""" + # Create a temporary config file + config_file = tmp_path / "test_config.yml" + config_file.write_text("workflow:\n type: test\n") + + # Setup evaluation run with None effective_config + evaluation_run.config.config_file = config_file 
+ evaluation_run.eval_config = default_eval_config + evaluation_run.eval_config.general.output_dir = tmp_path / "output" + evaluation_run.effective_config = None # This is the key test condition + + # Run the function - it should not crash + with patch("nat.plugins.eval.runtime.evaluate.logger.info"), \ + patch("nat.plugins.eval.runtime.evaluate.logger.warning") as mock_warning: + evaluation_run.write_configuration() + + # Verify warning was logged + mock_warning.assert_any_call("Effective config not available, skipping config_effective.yml") + + # Verify that original and metadata files were created but not effective + config_original_file = evaluation_run.eval_config.general.output_dir / "config_original.yml" + config_effective_file = evaluation_run.eval_config.general.output_dir / "config_effective.yml" + config_metadata_file = evaluation_run.eval_config.general.output_dir / "config_metadata.json" + + assert config_original_file.exists(), "config_original.yml should be created" + assert not config_effective_file.exists(), "config_effective.yml should NOT be created without an effective config" + assert config_metadata_file.exists(), "config_metadata.json should be created" + + # Batch-3: Tests for running eval via run_and_evaluate @pytest.mark.parametrize("skip_workflow", [True, False]) async def test_run_and_evaluate(evaluation_run, default_eval_config, session_manager, mock_evaluator, skip_workflow): diff --git a/packages/nvidia_nat_eval/tests/eval/test_file_eval_callback.py b/packages/nvidia_nat_eval/tests/eval/test_file_eval_callback.py index d3b491e0ad..0111c50511 100644 --- a/packages/nvidia_nat_eval/tests/eval/test_file_eval_callback.py +++ b/packages/nvidia_nat_eval/tests/eval/test_file_eval_callback.py @@ -88,6 +88,19 @@ def test_file_eval_callback_writes_workflow_output(eval_result, tmp_path): assert callback.workflow_output_file == output_file +def test_file_eval_callback_writes_atif_workflow_output(eval_result, tmp_path): + """Test that
FileEvalCallback writes workflow_output_atif.json when provided.""" + eval_result.atif_workflow_output_json = '[{"item_id": 1, "trajectory": {"steps": []}}]' + + callback = FileEvalCallback() + callback.on_eval_complete(eval_result) + + output_file = tmp_path / "output" / "workflow_output_atif.json" + assert output_file.exists() + assert output_file.read_text() == eval_result.atif_workflow_output_json + assert callback.atif_workflow_output_file == output_file + + def test_file_eval_callback_writes_evaluator_outputs(eval_result, tmp_path): """Test that FileEvalCallback writes per-evaluator output files.""" callback = FileEvalCallback() diff --git a/packages/nvidia_nat_langchain/src/nat/plugins/langchain/eval/trajectory_evaluator.py b/packages/nvidia_nat_langchain/src/nat/plugins/langchain/eval/trajectory_evaluator.py index 42c63ff9a1..54308544be 100644 --- a/packages/nvidia_nat_langchain/src/nat/plugins/langchain/eval/trajectory_evaluator.py +++ b/packages/nvidia_nat_langchain/src/nat/plugins/langchain/eval/trajectory_evaluator.py @@ -13,21 +13,27 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import asyncio import logging +from collections.abc import Mapping from langchain_classic.evaluation import TrajectoryEvalChain from langchain_core.agents import AgentAction from langchain_core.language_models import BaseChatModel from langchain_core.tools import BaseTool +from pydantic import Field from nat.builder.builder import EvalBuilder from nat.builder.evaluator import EvaluatorInfo from nat.cli.register_workflow import register_evaluator from nat.data_models.evaluator import EvalInputItem +from nat.data_models.evaluator import EvalOutput from nat.data_models.evaluator import EvalOutputItem from nat.data_models.evaluator import EvaluatorLLMConfig from nat.data_models.intermediate_step import IntermediateStep from nat.data_models.intermediate_step import IntermediateStepType +from nat.plugins.eval.evaluator.atif_evaluator import AtifEvalSample +from nat.plugins.eval.evaluator.atif_evaluator import AtifEvalSampleList from nat.plugins.eval.evaluator.base_evaluator import BaseEvaluator from nat.utils.exception_handlers.automatic_retries import patch_with_retry @@ -36,10 +42,20 @@ _DEFAULT_EVENT_FILTER = [IntermediateStepType.LLM_END, IntermediateStepType.TOOL_END] +def _coerce_text(value) -> str: + """Best-effort coercion to text for judge-chain inputs.""" + if value is None: + return "" + return value if isinstance(value, str) else str(value) + + class TrajectoryEvaluatorConfig(EvaluatorLLMConfig, name="trajectory"): """Agent trajectory evaluator configuration.""" - pass + enable_atif_evaluator: bool = Field( + default=False, + description="Enable ATIF-native trajectory evaluator lane. 
Disabled by default during migration.", + ) def _to_agent_actions(intermediate_steps: list[IntermediateStep]) -> list[tuple[AgentAction, str]]: @@ -63,6 +79,89 @@ def _to_agent_actions(intermediate_steps: list[IntermediateStep]) -> list[tuple[ return agent_actions +def _message_to_text(message) -> str: + """Convert ATIF message payloads into text for LangChain trajectory scoring.""" + if message is None: + return "" + if isinstance(message, str): + return message + + if isinstance(message, dict): + parts_iterable = message.get("parts") + if parts_iterable is None: + parts_iterable = [message] + else: + parts_iterable = message + + text_parts: list[str] = [] + for part in parts_iterable: + part_type = getattr(part, "type", None) + part_text = getattr(part, "text", None) + part_source = getattr(part, "source", None) + + if isinstance(part, dict): + part_type = part.get("type", part_type) + part_text = part.get("text", part_text) + part_source = part.get("source", part_source) + + if part_type == "text" and isinstance(part_text, str) and part_text: + text_parts.append(part_text) + continue + + if part_type == "image": + source_path = getattr(part_source, "path", None) + if isinstance(part_source, dict): + source_path = part_source.get("path", source_path) + if isinstance(source_path, str) and source_path: + text_parts.append(source_path) + return "\n".join(text_parts) + + +def _atif_to_agent_actions(trajectory) -> list[tuple[AgentAction, str]]: + """Convert an ATIF trajectory into LangChain `agent_trajectory` tuples.""" + agent_actions: list[tuple[AgentAction, str]] = [] + for step in trajectory.steps: + if step.source != "agent": + continue + + agent_message = _message_to_text(step.message) + if step.model_name or agent_message: + llm_action = AgentAction(tool=step.model_name or "", tool_input="", log="") + agent_actions.append((llm_action, agent_message)) + + if not step.tool_calls: + continue + + observation_by_call_id: dict[str, str] = {} + if step.observation: + 
for result in step.observation.results: + if result.source_call_id: + observation_by_call_id[result.source_call_id] = _message_to_text(result.content) + + for tool_call in step.tool_calls: + if isinstance(tool_call.arguments, dict): + tool_input = tool_call.arguments + elif isinstance(tool_call.arguments, Mapping): + tool_input = dict(tool_call.arguments) + else: + tool_input = str(tool_call.arguments) + action = AgentAction(tool=tool_call.function_name, tool_input=tool_input, log=agent_message) + tool_output = observation_by_call_id.get(tool_call.tool_call_id, "") + agent_actions.append((action, tool_output)) + + return agent_actions + + +def _atif_to_user_input(trajectory) -> str: + """Extract first user message from ATIF trajectory.""" + for step in trajectory.steps: + if step.source == "user": + text = _message_to_text(step.message) + if text: + return text + return "" + + class TrajectoryEvaluator(BaseEvaluator): def __init__(self, llm: BaseChatModel, tools: list[BaseTool] | None = None, max_concurrency: int = 8): @@ -72,24 +171,62 @@ def __init__(self, llm: BaseChatModel, tools: list[BaseTool] | None = None, max_ return_reasoning=True, requires_reference=True) - async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem: - question = item.input_obj - generated_answer = item.output_obj - agent_trajectory = _to_agent_actions(item.trajectory) - + async def _evaluate_with_trajectory(self, + item_id, + lane: str, + question: str, + generated_answer: str, + agent_trajectory: list[tuple[AgentAction, str]]) -> EvalOutputItem: + """Run trajectory scoring for one item regardless of input lane.""" + question_text = _coerce_text(question) + generated_answer_text = _coerce_text(generated_answer) try: - eval_result = await self.traj_eval_chain.aevaluate_agent_trajectory(input=question, + eval_result = await self.traj_eval_chain.aevaluate_agent_trajectory(input=question_text, agent_trajectory=agent_trajectory, - prediction=generated_answer) + 
prediction=generated_answer_text) except Exception as e: - logger.exception("Error evaluating trajectory for question: %s, Error: %s", question, e) - return EvalOutputItem(id=item.id, score=0.0, reasoning={}, error=str(e)) + # Some judge models occasionally miss the strict "Score: " suffix + # expected by LangChain's legacy trajectory parser. + if isinstance(e, ValueError) and "not enough values to unpack" in str(e): + logger.warning("Trajectory judge output parsing failed [lane=%s item_id=%s]: %s", lane, item_id, e) + else: + logger.exception("Error evaluating trajectory [lane=%s item_id=%s]", lane, item_id) + return EvalOutputItem(id=item_id, score=0.0, reasoning={}, error=str(e)) reasoning = { "reasoning": eval_result["reasoning"], "trajectory": [(action.model_dump(), output) for (action, output) in agent_trajectory], } - return EvalOutputItem(id=item.id, score=eval_result["score"], reasoning=reasoning) + return EvalOutputItem(id=item_id, score=eval_result["score"], reasoning=reasoning) + + async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem: + question = item.input_obj + generated_answer = item.output_obj + agent_trajectory = _to_agent_actions(item.trajectory) + return await self._evaluate_with_trajectory(item.id, "legacy", question, generated_answer, agent_trajectory) + + async def evaluate_atif_item(self, sample: AtifEvalSample) -> EvalOutputItem: + """Evaluate a single ATIF-native sample.""" + question = _atif_to_user_input(sample.trajectory) + generated_answer = sample.output_obj if sample.output_obj is not None else "" + agent_trajectory = _atif_to_agent_actions(sample.trajectory) + return await self._evaluate_with_trajectory(sample.item_id, + "atif", + question, + generated_answer, + agent_trajectory) + + async def evaluate_atif_fn(self, atif_samples: AtifEvalSampleList) -> EvalOutput: + """ATIF-native evaluation lane for trajectory scoring.""" + + async def wrapped(sample: AtifEvalSample) -> EvalOutputItem: + async with self.semaphore: + 
return await self.evaluate_atif_item(sample) + + output_items = await asyncio.gather(*[wrapped(sample) for sample in atif_samples]) + numeric_scores = [item.score for item in output_items if isinstance(item.score, int | float)] + avg_score = round(sum(numeric_scores) / len(numeric_scores), 2) if numeric_scores else None + return EvalOutput(average_score=avg_score, eval_output_items=output_items) @register_evaluator(config_type=TrajectoryEvaluatorConfig) @@ -107,4 +244,7 @@ async def register_trajectory_evaluator(config: TrajectoryEvaluatorConfig, build tools = await builder.get_all_tools(wrapper_type=LLMFrameworkEnum.LANGCHAIN) evaluator = TrajectoryEvaluator(llm=llm, tools=tools, max_concurrency=builder.get_max_concurrency()) - yield EvaluatorInfo(config=config, evaluate_fn=evaluator.evaluate, description="Trajectory Evaluator") + evaluator_info = EvaluatorInfo(config=config, evaluate_fn=evaluator.evaluate, description="Trajectory Evaluator") + if config.enable_atif_evaluator: + evaluator_info.evaluate_atif_fn = evaluator.evaluate_atif_fn + yield evaluator_info diff --git a/packages/nvidia_nat_langchain/src/nat/plugins/langchain/eval/tunable_rag_evaluator.py b/packages/nvidia_nat_langchain/src/nat/plugins/langchain/eval/tunable_rag_evaluator.py index 8427c9d111..323fe3762b 100644 --- a/packages/nvidia_nat_langchain/src/nat/plugins/langchain/eval/tunable_rag_evaluator.py +++ b/packages/nvidia_nat_langchain/src/nat/plugins/langchain/eval/tunable_rag_evaluator.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import asyncio import logging from collections.abc import Callable @@ -28,11 +29,19 @@ from nat.builder.evaluator import EvaluatorInfo from nat.builder.framework_enum import LLMFrameworkEnum from nat.cli.register_workflow import register_evaluator +from nat.data_models.atif import ATIFContentPart +from nat.data_models.atif import ATIFTrajectory from nat.data_models.component_ref import LLMRef from nat.data_models.evaluator import EvalInputItem +from nat.data_models.evaluator import EvalOutput from nat.data_models.evaluator import EvalOutputItem from nat.data_models.evaluator import EvaluatorBaseConfig +from nat.plugins.eval.evaluator.atif_evaluator import AtifEvalSample +from nat.plugins.eval.evaluator.atif_evaluator import AtifEvalSampleList from nat.plugins.eval.evaluator.base_evaluator import BaseEvaluator +from nat.utils.atif_message_utils import content_part_to_text +from nat.utils.atif_message_utils import message_to_text +from nat.utils.atif_message_utils import trajectory_to_user_input logger = logging.getLogger(__name__) @@ -52,6 +61,10 @@ class TunableRagEvaluatorConfig(EvaluatorBaseConfig, name="tunable_rag_evaluator }, description="Weights for different scoring components when using default scoring", ) + enable_atif_evaluator: bool = Field( + default=False, + description="Enable ATIF-native tunable RAG evaluator lane. 
Disabled by default during migration.", + ) def evaluation_prompt(judge_llm_prompt: str, @@ -139,10 +152,8 @@ def __init__(self, "relevance": 1 / 3, } - async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem: - question = item.input_obj - answer_description = item.expected_output_obj - generated_answer = item.output_obj + async def _evaluate_item_core(self, item_id, question: str, answer_description: str, + generated_answer: str) -> EvalOutputItem: score = 0.0 default_evaluation_schema = [ @@ -246,7 +257,42 @@ async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem: "reasoning": reasoning, } - return EvalOutputItem(id=item.id, score=score, reasoning=reasoning_obj) + return EvalOutputItem(id=item_id, score=score, reasoning=reasoning_obj) + + async def evaluate_item(self, item: EvalInputItem) -> EvalOutputItem: + question = str(item.input_obj) if item.input_obj is not None else "" + answer_description = str(item.expected_output_obj) if item.expected_output_obj is not None else "" + generated_answer = str(item.output_obj) if item.output_obj is not None else "" + return await self._evaluate_item_core(item.id, question, answer_description, generated_answer) + + @staticmethod + def _content_part_to_text(part: ATIFContentPart) -> str: + return content_part_to_text(part) + + @classmethod + def _message_to_text(cls, message: str | list[ATIFContentPart] | None) -> str: + return message_to_text(message) + + @classmethod + def _trajectory_to_user_input(cls, trajectory: ATIFTrajectory) -> str: + return trajectory_to_user_input(trajectory) + + async def evaluate_atif_item(self, sample: AtifEvalSample) -> EvalOutputItem: + question = self._trajectory_to_user_input(sample.trajectory) + answer_description = str(sample.expected_output_obj) if sample.expected_output_obj is not None else "" + generated_answer = str(sample.output_obj) if sample.output_obj is not None else "" + return await self._evaluate_item_core(sample.item_id, question, answer_description, 
generated_answer) + + async def evaluate_atif_fn(self, atif_samples: AtifEvalSampleList) -> EvalOutput: + + async def wrapped(sample: AtifEvalSample) -> EvalOutputItem: + async with self.semaphore: + return await self.evaluate_atif_item(sample) + + output_items = await asyncio.gather(*[wrapped(sample) for sample in atif_samples]) + numeric_scores = [item.score for item in output_items if isinstance(item.score, int | float)] + avg_score = round(sum(numeric_scores) / len(numeric_scores), 2) if numeric_scores else None + return EvalOutput(average_score=avg_score, eval_output_items=output_items) @register_evaluator(config_type=TunableRagEvaluatorConfig) @@ -259,4 +305,7 @@ async def register_tunable_rag_evaluator(config: TunableRagEvaluatorConfig, buil max_concurrency=builder.get_max_concurrency(), default_scoring=config.default_scoring, default_score_weights=config.default_score_weights) - yield EvaluatorInfo(config=config, evaluate_fn=evaluator.evaluate, description="Tunable RAG Evaluator") + evaluator_info = EvaluatorInfo(config=config, evaluate_fn=evaluator.evaluate, description="Tunable RAG Evaluator") + if config.enable_atif_evaluator: + evaluator_info.evaluate_atif_fn = evaluator.evaluate_atif_fn + yield evaluator_info diff --git a/packages/nvidia_nat_langchain/tests/eval/test_trajectory_evaluate.py b/packages/nvidia_nat_langchain/tests/eval/test_trajectory_evaluate.py index 4bd9fcec68..b6e910a2a8 100644 --- a/packages/nvidia_nat_langchain/tests/eval/test_trajectory_evaluate.py +++ b/packages/nvidia_nat_langchain/tests/eval/test_trajectory_evaluate.py @@ -21,10 +21,24 @@ from langchain_core.language_models import BaseChatModel from langchain_core.tools import BaseTool +from nat.data_models.atif import ATIFAgentConfig +from nat.data_models.atif import ATIFObservation +from nat.data_models.atif import ATIFObservationResult +from nat.data_models.atif import ATIFStep +from nat.data_models.atif import ATIFToolCall +from nat.data_models.atif import ATIFTrajectory from 
nat.data_models.evaluator import EvalInput from nat.data_models.evaluator import EvalInputItem from nat.data_models.evaluator import EvalOutput +from nat.data_models.intermediate_step import IntermediateStep +from nat.data_models.intermediate_step import IntermediateStepPayload +from nat.data_models.intermediate_step import IntermediateStepType +from nat.data_models.intermediate_step import StreamEventData +from nat.data_models.invocation_node import InvocationNode +from nat.plugins.eval.evaluator.atif_evaluator import AtifEvalSample from nat.plugins.langchain.eval.trajectory_evaluator import TrajectoryEvaluator +from nat.plugins.langchain.eval.trajectory_evaluator import TrajectoryEvaluatorConfig +from nat.plugins.langchain.eval.trajectory_evaluator import register_trajectory_evaluator @pytest.fixture(name="mock_llm") @@ -118,3 +132,167 @@ async def test_trajectory_evaluate_failure(trajectory_evaluator, rag_eval_input) assert error_message in failed_item.error assert successful_item.score == pytest.approx(0.8) assert successful_item.reasoning["reasoning"] == "LGTM" + + +@pytest.fixture(name="atif_samples") +def fixture_atif_samples(): + return [ + AtifEvalSample( + item_id="1", + trajectory=ATIFTrajectory( + session_id="atif-1", + agent=ATIFAgentConfig(name="test-agent", version="0.0.0"), + steps=[ + ATIFStep(step_id=1, source="user", message="What is AI?"), + ATIFStep( + step_id=2, + source="agent", + model_name="mock-llm", + message="AI is artificial intelligence.", + tool_calls=[ + ATIFToolCall( + tool_call_id="call-1", + function_name="web_search", + arguments={"query": "artificial intelligence"}, + ) + ], + observation=ATIFObservation( + results=[ATIFObservationResult(source_call_id="call-1", content="Search results context")]), + ), + ], + ), + expected_output_obj="Artificial intelligence.", + output_obj="AI is artificial intelligence.", + metadata={}, + ), + AtifEvalSample( + item_id="2", + trajectory=ATIFTrajectory( + session_id="atif-2", + 
agent=ATIFAgentConfig(name="test-agent", version="0.0.0"), + steps=[ + ATIFStep(step_id=1, source="user", message="What is ML?"), + ATIFStep(step_id=2, source="agent", model_name="mock-llm", message="ML is a subset of AI."), + ], + ), + expected_output_obj="Machine learning.", + output_obj="ML is a subset of AI.", + metadata={}, + ), + ] + + +async def test_trajectory_evaluate_atif_success(trajectory_evaluator, atif_samples): + scores = [ + { + "score": 0.9, "reasoning": "atif-1" + }, + { + "score": 0.8, "reasoning": "atif-2" + }, + ] + expected_average = (0.9 + 0.8) / 2 + + with patch.object(trajectory_evaluator, "traj_eval_chain") as mock_traj_eval_chain: + mock_traj_eval_chain.aevaluate_agent_trajectory = AsyncMock(side_effect=scores) + eval_output = await trajectory_evaluator.evaluate_atif_fn(atif_samples) + + assert isinstance(eval_output, EvalOutput) + assert len(eval_output.eval_output_items) == 2 + assert eval_output.average_score == pytest.approx(expected_average) + assert eval_output.eval_output_items[0].score == pytest.approx(0.9) + assert eval_output.eval_output_items[1].score == pytest.approx(0.8) + assert eval_output.eval_output_items[0].reasoning["reasoning"] == "atif-1" + assert eval_output.eval_output_items[1].reasoning["reasoning"] == "atif-2" + assert mock_traj_eval_chain.aevaluate_agent_trajectory.call_count == 2 + + +async def test_trajectory_legacy_and_atif_lane_parity_with_tolerance(trajectory_evaluator): + llm_end_step = IntermediateStep(parent_id="root", + function_ancestry=InvocationNode(function_name="llm_test", + function_id="test-llm-end"), + payload=IntermediateStepPayload(event_type=IntermediateStepType.LLM_END, + name="mock-llm", + data=StreamEventData(input="What is AI?", + output="AI answer"))) + tool_end_step = IntermediateStep(parent_id="root", + function_ancestry=InvocationNode(function_name="tool_test", + function_id="test-tool-end"), + payload=IntermediateStepPayload(event_type=IntermediateStepType.TOOL_END, + 
name="web_search", + data=StreamEventData( + input={"query": "What is AI?"}, + output="Search results context"))) + legacy_eval_input = EvalInput(eval_input_items=[ + EvalInputItem(id="1", + input_obj="What is AI?", + expected_output_obj="Artificial intelligence.", + output_obj="AI answer", + expected_trajectory=[], + trajectory=[llm_end_step, tool_end_step], + full_dataset_entry={}) + ]) + + atif_samples = [ + AtifEvalSample( + item_id="1", + trajectory=ATIFTrajectory( + session_id="atif-parity-1", + agent=ATIFAgentConfig(name="test-agent", version="0.0.0"), + steps=[ + ATIFStep(step_id=1, source="user", message="What is AI?"), + ATIFStep( + step_id=2, + source="agent", + model_name="mock-llm", + message="AI answer", + tool_calls=[ + ATIFToolCall(tool_call_id="call-1", + function_name="web_search", + arguments={"query": "What is AI?"}) + ], + observation=ATIFObservation( + results=[ATIFObservationResult(source_call_id="call-1", content="Search results context")]), + ), + ], + ), + expected_output_obj="Artificial intelligence.", + output_obj="AI answer", + metadata={}, + ) + ] + + async def score_from_trajectory(*, input, agent_trajectory, prediction): # noqa: ARG001 + return {"score": float(len(agent_trajectory)), "reasoning": "trajectory-size"} + + with patch.object(trajectory_evaluator, "traj_eval_chain") as mock_traj_eval_chain: + mock_traj_eval_chain.aevaluate_agent_trajectory = AsyncMock(side_effect=score_from_trajectory) + legacy_output = await trajectory_evaluator.evaluate(legacy_eval_input) + atif_output = await trajectory_evaluator.evaluate_atif_fn(atif_samples) + + assert legacy_output.average_score == pytest.approx(atif_output.average_score, abs=0.01) + assert legacy_output.eval_output_items[0].score == pytest.approx(atif_output.eval_output_items[0].score, abs=0.01) + + +async def test_register_trajectory_evaluator_exposes_legacy_lane_by_default(mock_llm, mock_tools): + config = TrajectoryEvaluatorConfig(llm_name="judge_llm") + builder = 
MagicMock(spec=["get_llm", "get_max_concurrency", "get_all_tools"]) + builder.get_llm = AsyncMock(return_value=mock_llm) + builder.get_all_tools = AsyncMock(return_value=mock_tools) + builder.get_max_concurrency.return_value = 2 + + async with register_trajectory_evaluator(config, builder) as info: + assert callable(info.evaluate_fn) + assert not callable(getattr(info, "evaluate_atif_fn", None)) + + +async def test_register_trajectory_evaluator_exposes_atif_lane_when_enabled(mock_llm, mock_tools): + config = TrajectoryEvaluatorConfig(llm_name="judge_llm", enable_atif_evaluator=True) + builder = MagicMock(spec=["get_llm", "get_max_concurrency", "get_all_tools"]) + builder.get_llm = AsyncMock(return_value=mock_llm) + builder.get_all_tools = AsyncMock(return_value=mock_tools) + builder.get_max_concurrency.return_value = 2 + + async with register_trajectory_evaluator(config, builder) as info: + assert callable(info.evaluate_fn) + assert callable(getattr(info, "evaluate_atif_fn", None)) diff --git a/packages/nvidia_nat_langchain/tests/eval/test_tunable_rag_evaluate.py b/packages/nvidia_nat_langchain/tests/eval/test_tunable_rag_evaluate.py index 9252882f9b..09110c3e7a 100644 --- a/packages/nvidia_nat_langchain/tests/eval/test_tunable_rag_evaluate.py +++ b/packages/nvidia_nat_langchain/tests/eval/test_tunable_rag_evaluate.py @@ -19,10 +19,16 @@ import pytest from langchain_core.language_models import BaseChatModel +from nat.data_models.atif import ATIFAgentConfig +from nat.data_models.atif import ATIFStep +from nat.data_models.atif import ATIFTrajectory from nat.data_models.evaluator import EvalInput from nat.data_models.evaluator import EvalInputItem from nat.data_models.evaluator import EvalOutput +from nat.plugins.eval.evaluator.atif_evaluator import AtifEvalSample from nat.plugins.langchain.eval.tunable_rag_evaluator import TunableRagEvaluator +from nat.plugins.langchain.eval.tunable_rag_evaluator import TunableRagEvaluatorConfig +from 
nat.plugins.langchain.eval.tunable_rag_evaluator import register_tunable_rag_evaluator @pytest.fixture @@ -157,3 +163,98 @@ async def test_evaluate_custom_scoring(): assert len(output.eval_output_items) == 1 assert output.eval_output_items[0].score == 0.75 assert output.eval_output_items[0].reasoning["reasoning"] == "Fair explanation." + + +@pytest.fixture(name="atif_samples") +def fixture_atif_samples(): + return [ + AtifEvalSample( + item_id="1", + trajectory=ATIFTrajectory( + session_id="atif-1", + agent=ATIFAgentConfig(name="test-agent", version="0.0.0"), + steps=[ + ATIFStep(step_id=1, source="user", message="What is AI?"), + ATIFStep(step_id=2, source="agent", message="AI is the simulation of human intelligence."), + ], + ), + expected_output_obj="AI is artificial intelligence.", + output_obj="AI is the simulation of human intelligence.", + metadata={}, + ), + AtifEvalSample( + item_id="2", + trajectory=ATIFTrajectory( + session_id="atif-2", + agent=ATIFAgentConfig(name="test-agent", version="0.0.0"), + steps=[ + ATIFStep(step_id=1, source="user", message="Define ML"), + ATIFStep(step_id=2, source="agent", message="ML helps machines learn."), + ], + ), + expected_output_obj="Machine Learning is a subset of AI.", + output_obj="ML helps machines learn.", + metadata={}, + ), + ] + + +async def test_evaluate_atif_success(evaluator, atif_samples): + evaluator.llm.ainvoke = AsyncMock(side_effect=[ + MagicMock(content='{"coverage_score": 0.9, "correctness_score": 0.8,' + '"relevance_score": 0.7, "reasoning": "ATIF sample 1"}'), + MagicMock(content='{"coverage_score": 0.6, "correctness_score": 0.7,' + '"relevance_score": 0.8, "reasoning": "ATIF sample 2"}') + ]) + + eval_output: EvalOutput = await evaluator.evaluate_atif_fn(atif_samples) + assert isinstance(eval_output, EvalOutput) + assert len(eval_output.eval_output_items) == 2 + assert eval_output.eval_output_items[0].score > 0 + assert eval_output.eval_output_items[1].score > 0 + assert eval_output.average_score 
> 0 + + +async def test_legacy_and_atif_lane_parity_with_tolerance(evaluator, rag_eval_input, atif_samples): + # Two legacy evaluations then two ATIF evaluations with identical per-item judge outputs. + evaluator.llm.ainvoke = AsyncMock(side_effect=[ + MagicMock(content='{"coverage_score": 0.9, "correctness_score": 0.8,' + '"relevance_score": 0.7, "reasoning": "shared-1"}'), + MagicMock(content='{"coverage_score": 0.6, "correctness_score": 0.7,' + '"relevance_score": 0.8, "reasoning": "shared-2"}'), + MagicMock(content='{"coverage_score": 0.9, "correctness_score": 0.8,' + '"relevance_score": 0.7, "reasoning": "shared-1"}'), + MagicMock(content='{"coverage_score": 0.6, "correctness_score": 0.7,' + '"relevance_score": 0.8, "reasoning": "shared-2"}'), + ]) + + legacy_output = await evaluator.evaluate(rag_eval_input) + atif_output = await evaluator.evaluate_atif_fn(atif_samples) + + assert legacy_output.average_score == pytest.approx(atif_output.average_score, abs=0.01) + assert legacy_output.eval_output_items[0].score == pytest.approx(atif_output.eval_output_items[0].score, abs=0.01) + assert legacy_output.eval_output_items[1].score == pytest.approx(atif_output.eval_output_items[1].score, abs=0.01) + + +async def test_register_tunable_rag_evaluator_exposes_legacy_lane_by_default(mock_llm): + config = TunableRagEvaluatorConfig(llm_name="judge_llm", judge_llm_prompt="Score this answer.") + builder = MagicMock(spec=["get_llm", "get_max_concurrency"]) + builder.get_llm = AsyncMock(return_value=mock_llm) + builder.get_max_concurrency.return_value = 2 + + async with register_tunable_rag_evaluator(config, builder) as info: + assert callable(info.evaluate_fn) + assert not callable(getattr(info, "evaluate_atif_fn", None)) + + +async def test_register_tunable_rag_evaluator_exposes_atif_lane_when_enabled(mock_llm): + config = TunableRagEvaluatorConfig(llm_name="judge_llm", + judge_llm_prompt="Score this answer.", + enable_atif_evaluator=True) + builder = 
MagicMock(spec=["get_llm", "get_max_concurrency"]) + builder.get_llm = AsyncMock(return_value=mock_llm) + builder.get_max_concurrency.return_value = 2 + + async with register_tunable_rag_evaluator(config, builder) as info: + assert callable(info.evaluate_fn) + assert callable(getattr(info, "evaluate_atif_fn", None)) diff --git a/packages/nvidia_nat_ragas/src/nat/plugins/ragas/rag_evaluator/atif_evaluate.py b/packages/nvidia_nat_ragas/src/nat/plugins/ragas/rag_evaluator/atif_evaluate.py new file mode 100644 index 0000000000..44a727061e --- /dev/null +++ b/packages/nvidia_nat_ragas/src/nat/plugins/ragas/rag_evaluator/atif_evaluate.py @@ -0,0 +1,113 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import typing +from collections.abc import Sequence + +from tqdm import tqdm + +from nat.data_models.atif import ATIFObservationResult +from nat.data_models.atif import ATIFTrajectory +from nat.data_models.evaluator import EvalOutput +from nat.plugins.eval.evaluator.atif_evaluator import AtifEvalSampleList +from nat.plugins.eval.utils.tqdm_position_registry import TqdmPositionRegistry +from nat.utils.atif_message_utils import message_to_text +from nat.utils.atif_message_utils import trajectory_to_user_input + +from .evaluate import _ragas_results_to_eval_output + +if typing.TYPE_CHECKING: + from ragas import EvaluationDataset + from ragas.llms import LangchainLLMWrapper + from ragas.metrics import Metric + +logger = logging.getLogger(__name__) + + +def _observation_result_to_text(result: ATIFObservationResult) -> str: + return message_to_text(result.content) + + +def _trajectory_to_retrieved_contexts(trajectory: ATIFTrajectory) -> list[str]: + contexts: list[str] = [] + for step in trajectory.steps: + if not step.observation: + continue + for result in step.observation.results: + text = _observation_result_to_text(result) + if text: + contexts.append(text) + return contexts + + +class RAGAtifEvaluator: + + def __init__(self, evaluator_llm: "LangchainLLMWrapper", metrics: Sequence["Metric"], max_concurrency=8): + self.evaluator_llm = evaluator_llm + self.metrics = metrics + self.max_concurrency = max_concurrency + + def atif_samples_to_ragas(self, atif_samples: AtifEvalSampleList) -> "EvaluationDataset": + """Converts ATIF-native samples into a Ragas-compatible EvaluationDataset.""" + from ragas import EvaluationDataset + from ragas import SingleTurnSample + + samples = [] + for sample in atif_samples: + user_input = trajectory_to_user_input(sample.trajectory) + reference = sample.expected_output_obj + response = sample.output_obj + reference_contexts = [""] + retrieved_contexts = _trajectory_to_retrieved_contexts(sample.trajectory) + ragas_sample 
= SingleTurnSample( + user_input=user_input, + reference=reference, + response=response, + reference_contexts=reference_contexts, + retrieved_contexts=retrieved_contexts, + ) + samples.append(ragas_sample) + return EvaluationDataset(samples=samples) + + async def evaluate(self, atif_samples: AtifEvalSampleList) -> EvalOutput: + """Run Ragas metrics evaluation on ATIF-native samples.""" + from ragas import evaluate as ragas_evaluate + from ragas.run_config import RunConfig + + ragas_dataset = self.atif_samples_to_ragas(atif_samples) + tqdm_position = TqdmPositionRegistry.claim() + first_metric_name = self.metrics[0].name if self.metrics else "no-metrics" + pbar = tqdm(total=len(ragas_dataset), desc=f"Evaluating Ragas {first_metric_name}", position=tqdm_position) + try: + if not self.metrics: + logger.warning("No RAGAS metrics configured for ATIF evaluator; returning empty metric results.") + results_dataset = None + else: + results_dataset = ragas_evaluate(dataset=ragas_dataset, + metrics=self.metrics, + show_progress=True, + llm=self.evaluator_llm, + run_config=RunConfig(max_workers=self.max_concurrency), + _pbar=pbar) + except Exception: + logger.exception("Error evaluating ATIF ragas metric") + results_dataset = None + finally: + pbar.close() + TqdmPositionRegistry.release(tqdm_position) + + ids = [sample.item_id for sample in atif_samples] + return _ragas_results_to_eval_output(results_dataset=results_dataset, ids=ids) diff --git a/packages/nvidia_nat_ragas/src/nat/plugins/ragas/rag_evaluator/evaluate.py b/packages/nvidia_nat_ragas/src/nat/plugins/ragas/rag_evaluator/evaluate.py index 52535cb281..edf94104a7 100644 --- a/packages/nvidia_nat_ragas/src/nat/plugins/ragas/rag_evaluator/evaluate.py +++ b/packages/nvidia_nat_ragas/src/nat/plugins/ragas/rag_evaluator/evaluate.py @@ -17,6 +17,7 @@ import math import typing from collections.abc import Sequence +from typing import Any from pydantic import BaseModel from tqdm import tqdm @@ -39,6 +40,49 @@ logger = 
logging.getLogger(__name__)
 
 
+def _nan_to_zero(v: float | None) -> float:
+    """Convert NaN or None to 0.0 for safe arithmetic/serialization.
+
+    Any other value is returned unchanged.
+    """
+    return 0.0 if v is None or (isinstance(v, float) and math.isnan(v)) else v
+
+
+def _ragas_results_to_eval_output(results_dataset: "EvaluationResult | None",
+                                  ids: list[Any] | None = None) -> EvalOutput:
+    """Convert a ragas EvaluationResult to NAT EvalOutput.
+
+    Only the first metric found in the score dictionaries is reported: its average
+    (with NaN/None counted as 0.0) becomes ``average_score`` and its raw per-row
+    value (NaN/None preserved) becomes each item's ``score``.
+
+    Args:
+        results_dataset: The Ragas result, or a falsy value when evaluation produced nothing.
+        ids: Item ids matched to result rows by position. They are used only when
+            there are at least as many ids as rows; otherwise each row's
+            ``user_input`` value serves as its id.
+
+    Returns:
+        The converted output; an empty ``EvalOutput`` with ``average_score=0.0``
+        when there is no result or no scores.
+    """
+    if not results_dataset:
+        # No exc_info here: this function is also reached outside any exception handler,
+        # where exc_info=True would only append "NoneType: None" to the log record.
+        logger.error("Ragas evaluation failed with no results")
+        return EvalOutput(average_score=0.0, eval_output_items=[])
+
+    scores: list[dict[str, float]] = results_dataset.scores
+    if not scores:
+        logger.warning("Ragas returned empty score list")
+        return EvalOutput(average_score=0.0, eval_output_items=[])
+
+    # Raw per-metric columns (NaN/None preserved) for the per-item scores.
+    original_scores_dict = {metric: [score.get(metric) for score in scores] for metric in scores[0]}
+    # Cleaned per-metric columns (NaN/None -> 0.0) used only for averaging.
+    scores_dict = {metric: [_nan_to_zero(score.get(metric)) for score in scores] for metric in scores[0]}
+    first_metric_name = next(iter(scores_dict.keys()), None)
+
+    average_scores = {metric: (sum(values) / len(values) if values else 0.0) for metric, values in scores_dict.items()}
+    first_avg_score = average_scores.get(first_metric_name, 0.0)
+    if isinstance(first_avg_score, float) and math.isnan(first_avg_score):
+        first_avg_score = 0.0
+
+    df = results_dataset.to_pandas()
+    if ids and len(ids) >= len(df):
+        output_ids = ids
+    else:
+        # Read the "user_input" column only when it is actually needed, so a result
+        # frame without that column does not fail while usable ids were supplied.
+        output_ids = df["user_input"].tolist()
+
+    eval_output_items = [
+        EvalOutputItem(
+            id=output_ids[i],
+            score=original_scores_dict[first_metric_name][i] if first_metric_name else None,
+            reasoning={
+                # getattr with a default tolerates result frames that lack one of these columns.
+                key: getattr(row, key, None)
+                for key in ["user_input", "reference", "response", "retrieved_contexts"]
+            },
+        ) for i, row in enumerate(df.itertuples(index=False))
+    ]
+    return EvalOutput(average_score=first_avg_score, eval_output_items=eval_output_items)
+
+
 class RAGEvaluator:
 
     def __init__(self,
@@ -106,59 +150,8 @@ def eval_input_to_ragas(self, eval_input: EvalInput) ->
"EvaluationDataset": def ragas_to_eval_output(self, eval_input: EvalInput, results_dataset: "EvaluationResult | None") -> EvalOutput: """Converts the ragas EvaluationResult to nat EvalOutput""" - - if not results_dataset: - logger.error("Ragas evaluation failed with no results", exc_info=True) - return EvalOutput(average_score=0.0, eval_output_items=[]) - - scores: list[dict[str, float]] = results_dataset.scores - - # If Ragas returned no scores, return empty output to avoid downstream errors - if not scores: - logger.warning("Ragas returned empty score list") - return EvalOutput(average_score=0.0, eval_output_items=[]) - - def _nan_to_zero(v: float | None) -> float: - """Convert NaN or None to 0.0 for safe arithmetic/serialization.""" - return 0.0 if v is None or (isinstance(v, float) and math.isnan(v)) else v - - # Keep original scores (preserving NaN/None) for output - original_scores_dict = {metric: [score.get(metric) for score in scores] for metric in scores[0]} - - # Convert from list of dicts to dict of lists, coercing NaN/None to 0.0 for average calculation - scores_dict = {metric: [_nan_to_zero(score.get(metric)) for score in scores] for metric in scores[0]} - first_metric_name = list(scores_dict.keys())[0] if scores_dict else None - - # Compute the average of each metric using cleaned scores (NaN/None -> 0.0) - average_scores = { - metric: (sum(values) / len(values) if values else 0.0) - for metric, values in scores_dict.items() - } - - first_avg_score = average_scores.get(list(scores_dict.keys())[0], 0.0) - if isinstance(first_avg_score, float) and math.isnan(first_avg_score): - first_avg_score = 0.0 - - df = results_dataset.to_pandas() - # Get id from eval_input if df size matches number of eval_input_items - if len(eval_input.eval_input_items) >= len(df): - ids = [item.id for item in eval_input.eval_input_items] # Extract IDs - else: - ids = df["user_input"].tolist() # Use "user_input" as ID fallback - - # Construct EvalOutputItem list using original 
scores (preserving NaN/None) - eval_output_items = [ - EvalOutputItem( - id=ids[i], - score=original_scores_dict[first_metric_name][i] if first_metric_name else None, - reasoning={ - key: - getattr(row, key, None) # Use getattr to safely access attributes - for key in ["user_input", "reference", "response", "retrieved_contexts"] - }) for i, row in enumerate(df.itertuples(index=False)) - ] - # Return EvalOutput - return EvalOutput(average_score=first_avg_score, eval_output_items=eval_output_items) + ids = [item.id for item in eval_input.eval_input_items] + return _ragas_results_to_eval_output(results_dataset=results_dataset, ids=ids) async def evaluate(self, eval_input: EvalInput) -> EvalOutput: """Run Ragas metrics evaluation on the provided EvalInput""" diff --git a/packages/nvidia_nat_ragas/src/nat/plugins/ragas/rag_evaluator/register.py b/packages/nvidia_nat_ragas/src/nat/plugins/ragas/rag_evaluator/register.py index eea82966db..9f4b75ab41 100644 --- a/packages/nvidia_nat_ragas/src/nat/plugins/ragas/rag_evaluator/register.py +++ b/packages/nvidia_nat_ragas/src/nat/plugins/ragas/rag_evaluator/register.py @@ -26,6 +26,7 @@ from nat.data_models.evaluator import EvalInput from nat.data_models.evaluator import EvalOutput from nat.data_models.evaluator import EvaluatorLLMConfig +from nat.plugins.eval.evaluator.atif_evaluator import AtifEvalSampleList from nat.utils.exception_handlers.automatic_retries import patch_with_retry logger = logging.getLogger(__name__) @@ -49,6 +50,10 @@ class RagasEvaluatorConfig(EvaluatorLLMConfig, name="ragas"): default=None, description="The field in the input object that contains the content to evaluate.", ) + enable_atif_evaluator: bool = Field( + default=False, + description="Enable ATIF-native RAGAS evaluator lane. 
Disabled by default until rollout stabilization.", + ) @model_validator(mode="before") @classmethod @@ -112,6 +117,14 @@ async def evaluate_fn(eval_input: EvalInput) -> EvalOutput: return EvalOutput(average_score=0.0, eval_output_items=[]) return await evaluator.evaluate(eval_input) + async def evaluate_atif_fn(atif_samples: AtifEvalSampleList) -> EvalOutput: + """Run ATIF-native RAGAS evaluation and return NAT eval output.""" + if not atif_evaluator: + logger.warning("No ATIF evaluator found for RAGAS metrics.") + return EvalOutput(average_score=0.0, eval_output_items=[]) + return await atif_evaluator.evaluate(atif_samples) + + from .atif_evaluate import RAGAtifEvaluator from .evaluate import RAGEvaluator llm = await builder.get_llm(config.llm_name, wrapper_type=LLMFrameworkEnum.LANGCHAIN) @@ -136,4 +149,11 @@ async def evaluate_fn(eval_input: EvalInput) -> EvalOutput: metrics=metrics, max_concurrency=builder.get_max_concurrency(), input_obj_field=config.input_obj_field) if metrics else None - yield EvaluatorInfo(config=config, evaluate_fn=evaluate_fn, description="Evaluator for RAGAS metrics") + atif_evaluator = RAGAtifEvaluator( + evaluator_llm=llm, metrics=metrics, + max_concurrency=builder.get_max_concurrency()) if (metrics and config.enable_atif_evaluator) else None + + evaluator_info = EvaluatorInfo(config=config, evaluate_fn=evaluate_fn, description="Evaluator for RAGAS metrics") + if config.enable_atif_evaluator: + evaluator_info.evaluate_atif_fn = evaluate_atif_fn + yield evaluator_info diff --git a/packages/nvidia_nat_ragas/tests/test_rag_evaluate.py b/packages/nvidia_nat_ragas/tests/test_rag_evaluate.py index dadda3418f..1bc7579b16 100644 --- a/packages/nvidia_nat_ragas/tests/test_rag_evaluate.py +++ b/packages/nvidia_nat_ragas/tests/test_rag_evaluate.py @@ -46,6 +46,41 @@ class ExampleModel(BaseModel): other: str +@pytest.fixture(name="atif_samples") +def fixture_atif_samples(rag_user_inputs, rag_expected_outputs, rag_generated_outputs): + 
"""ATIF-native samples for testing RAG ATIF evaluator path.""" + from nat.data_models.atif import ATIFAgentConfig + from nat.data_models.atif import ATIFObservation + from nat.data_models.atif import ATIFObservationResult + from nat.data_models.atif import ATIFStep + from nat.data_models.atif import ATIFTrajectory + from nat.plugins.eval.evaluator.atif_evaluator import AtifEvalSample + + samples = [] + for index, (user_input, expected_output, + generated_output) in enumerate(zip(rag_user_inputs, rag_expected_outputs, rag_generated_outputs)): + trajectory = ATIFTrajectory( + session_id=str(index + 1), + agent=ATIFAgentConfig(name="nat-agent", version="0.0.0"), + steps=[ + ATIFStep(step_id=1, source="user", message=user_input), + ATIFStep(step_id=2, + source="agent", + message=str(generated_output), + observation=ATIFObservation(results=[ATIFObservationResult(content="retrieved context")])), + ], + ) + samples.append( + AtifEvalSample( + item_id=index + 1, + trajectory=trajectory, + expected_output_obj=expected_output, + output_obj=generated_output, + metadata={}, + )) + return samples + + @pytest.fixture def ragas_judge_llm() -> "LangchainLLMWrapper": """Fixture providing a mocked LangchainLLMWrapper.""" @@ -310,6 +345,204 @@ async def test_rag_evaluate_failure(rag_evaluator, rag_eval_input, ragas_judge_l assert output.eval_output_items == [] # No results due to failure +def test_atif_samples_to_ragas(ragas_judge_llm, ragas_metrics, atif_samples): + """Test ATIF sample mapping to ragas dataset.""" + from ragas.evaluation import EvaluationDataset + from ragas.evaluation import SingleTurnSample + + from nat.plugins.ragas.rag_evaluator.atif_evaluate import RAGAtifEvaluator + + atif_evaluator = RAGAtifEvaluator(evaluator_llm=ragas_judge_llm, metrics=ragas_metrics) + dataset = atif_evaluator.atif_samples_to_ragas(atif_samples) + + assert isinstance(dataset, EvaluationDataset) + assert len(dataset.samples) == len(atif_samples) + for sample in dataset.samples: + assert 
isinstance(sample, SingleTurnSample) + assert sample.retrieved_contexts == ["retrieved context"] + + +async def test_rag_atif_evaluate_success(ragas_judge_llm, ragas_metrics, atif_samples): + """Test ATIF-native evaluate path for RAGAS evaluator.""" + from nat.plugins.ragas.rag_evaluator.atif_evaluate import RAGAtifEvaluator + + mock_results_dataset = MagicMock() + dataset = "mock_dataset" + mock_output = "mock_output" + atif_evaluator = RAGAtifEvaluator(evaluator_llm=ragas_judge_llm, metrics=ragas_metrics) + + with patch.object(atif_evaluator, "atif_samples_to_ragas", return_value=dataset) as mock_to_ragas, \ + patch("ragas.evaluate", new_callable=MagicMock) as mock_ragas_evaluate, \ + patch("nat.plugins.ragas.rag_evaluator.atif_evaluate._ragas_results_to_eval_output", + return_value=mock_output) as mock_to_output: + mock_ragas_evaluate.return_value = mock_results_dataset + output = await atif_evaluator.evaluate(atif_samples) + + mock_to_ragas.assert_called_once_with(atif_samples) + mock_ragas_evaluate.assert_called_once() + called_kwargs = mock_ragas_evaluate.call_args.kwargs + assert called_kwargs["dataset"] == dataset + assert called_kwargs["metrics"] == ragas_metrics + assert called_kwargs["show_progress"] is True + assert called_kwargs["llm"] == ragas_judge_llm + mock_to_output.assert_called_once() + assert output == mock_output + + +def test_rag_legacy_and_atif_dataset_parity(rag_evaluator, + ragas_judge_llm, + ragas_metrics, + rag_eval_input, + intermediate_step_adapter): + """Ensure legacy and ATIF lanes produce equivalent ragas input samples.""" + from nat.data_models.atif import ATIFAgentConfig + from nat.data_models.atif import ATIFObservation + from nat.data_models.atif import ATIFObservationResult + from nat.data_models.atif import ATIFStep + from nat.data_models.atif import ATIFTrajectory + from nat.plugins.eval.evaluator.atif_evaluator import AtifEvalSample + from nat.plugins.ragas.rag_evaluator.atif_evaluate import RAGAtifEvaluator + + atif_samples 
= [] + for item in rag_eval_input.eval_input_items: + contexts = intermediate_step_adapter.get_context(item.trajectory, + intermediate_step_adapter.DEFAULT_EVENT_FILTER) + trajectory = ATIFTrajectory( + session_id=str(item.id), + agent=ATIFAgentConfig(name="nat-agent", version="0.0.0"), + steps=[ + ATIFStep(step_id=1, source="user", message=str(item.input_obj)), + ATIFStep(step_id=2, + source="agent", + message=str(item.output_obj), + observation=ATIFObservation( + results=[ATIFObservationResult(content=context) for context in contexts])), + ], + ) + atif_samples.append( + AtifEvalSample(item_id=item.id, + trajectory=trajectory, + expected_output_obj=item.expected_output_obj, + output_obj=item.output_obj, + metadata={})) + + atif_evaluator = RAGAtifEvaluator(evaluator_llm=ragas_judge_llm, metrics=ragas_metrics) + legacy_dataset = rag_evaluator.eval_input_to_ragas(rag_eval_input) + atif_dataset = atif_evaluator.atif_samples_to_ragas(atif_samples) + + assert len(legacy_dataset.samples) == len(atif_dataset.samples) + for legacy_sample, atif_sample in zip(legacy_dataset.samples, atif_dataset.samples): + assert legacy_sample.user_input == atif_sample.user_input + assert legacy_sample.reference == atif_sample.reference + assert legacy_sample.response == atif_sample.response + assert legacy_sample.retrieved_contexts == atif_sample.retrieved_contexts + + +@pytest.mark.parametrize( + "atif_trajectory_steps, expected_user_input, expected_contexts", + [ + ([], "", []), + ([{ + "step_id": 1, "source": "user", "message": "question only" + }], "question only", []), + ], +) +def test_atif_samples_to_ragas_edge_cases(ragas_judge_llm, + ragas_metrics, + atif_trajectory_steps, + expected_user_input, + expected_contexts): + """Ensure ATIF lane handles missing/partial trajectory content gracefully.""" + from nat.data_models.atif import ATIFAgentConfig + from nat.data_models.atif import ATIFTrajectory + from nat.plugins.eval.evaluator.atif_evaluator import AtifEvalSample + from 
nat.plugins.ragas.rag_evaluator.atif_evaluate import RAGAtifEvaluator + + trajectory = ATIFTrajectory(session_id="edge-case-1", + agent=ATIFAgentConfig(name="nat-agent", version="0.0.0"), + steps=atif_trajectory_steps) + atif_samples = [ + AtifEvalSample(item_id=1, trajectory=trajectory, expected_output_obj="ref", output_obj="resp", metadata={}) + ] + + atif_evaluator = RAGAtifEvaluator(evaluator_llm=ragas_judge_llm, metrics=ragas_metrics) + dataset = atif_evaluator.atif_samples_to_ragas(atif_samples) + + assert len(dataset.samples) == 1 + assert dataset.samples[0].user_input == expected_user_input + assert dataset.samples[0].retrieved_contexts == expected_contexts + + +async def test_rag_legacy_and_atif_score_parity(rag_evaluator, + ragas_judge_llm, + ragas_metrics, + rag_eval_input, + intermediate_step_adapter): + """Ensure legacy and ATIF evaluator lanes produce parity scores on the same dataset.""" + from nat.data_models.atif import ATIFAgentConfig + from nat.data_models.atif import ATIFObservation + from nat.data_models.atif import ATIFObservationResult + from nat.data_models.atif import ATIFStep + from nat.data_models.atif import ATIFTrajectory + from nat.plugins.eval.evaluator.atif_evaluator import AtifEvalSample + from nat.plugins.ragas.rag_evaluator.atif_evaluate import RAGAtifEvaluator + + metric_name = "AnswerAccuracy" + + def _mock_ragas_evaluate(*_args, **kwargs): + dataset = kwargs["dataset"] + rows = [] + scores = [] + for sample in dataset.samples: + score = 0.5 + (0.5 if sample.retrieved_contexts else 0.0) + scores.append({metric_name: score}) + rows.append({ + "user_input": sample.user_input, + "reference": sample.reference, + "response": sample.response, + "retrieved_contexts": sample.retrieved_contexts, + metric_name: score, + }) + result = MagicMock() + result.scores = scores + result.to_pandas.return_value = pd.DataFrame(rows) + return result + + atif_samples = [] + for item in rag_eval_input.eval_input_items: + contexts = 
intermediate_step_adapter.get_context(item.trajectory, + intermediate_step_adapter.DEFAULT_EVENT_FILTER) + trajectory = ATIFTrajectory( + session_id=str(item.id), + agent=ATIFAgentConfig(name="nat-agent", version="0.0.0"), + steps=[ + ATIFStep(step_id=1, source="user", message=str(item.input_obj)), + ATIFStep(step_id=2, + source="agent", + message=str(item.output_obj), + observation=ATIFObservation( + results=[ATIFObservationResult(content=context) for context in contexts])), + ], + ) + atif_samples.append( + AtifEvalSample(item_id=item.id, + trajectory=trajectory, + expected_output_obj=item.expected_output_obj, + output_obj=item.output_obj, + metadata={})) + + atif_evaluator = RAGAtifEvaluator(evaluator_llm=ragas_judge_llm, metrics=ragas_metrics) + with patch("ragas.evaluate", side_effect=_mock_ragas_evaluate): + legacy_output = await rag_evaluator.evaluate(rag_eval_input) + atif_output = await atif_evaluator.evaluate(atif_samples) + + assert legacy_output.average_score == pytest.approx(atif_output.average_score, abs=1e-9) + assert len(legacy_output.eval_output_items) == len(atif_output.eval_output_items) + for legacy_item, atif_item in zip(legacy_output.eval_output_items, atif_output.eval_output_items): + assert legacy_item.id == atif_item.id + assert legacy_item.score == pytest.approx(atif_item.score, abs=1e-9) + + def test_extract_input_obj_base_model_with_field(rag_evaluator_content): """Ensure extract_input_obj returns the specified field from a Pydantic BaseModel.""" model_obj = ExampleModel(content="hello world", other="ignore me") @@ -343,3 +576,41 @@ def test_extract_input_obj_base_model_without_field(rag_evaluator, rag_evaluator assert extracted_with_field == "json hello" assert extracted_default != extracted_with_field assert '"content":"json hello"' in extracted_default # basic sanity check on JSON output + + +async def test_register_ragas_evaluator_atif_lane_disabled_by_default(): + """Ensure RAGAS ATIF lane is opt-in while stabilizing.""" + from 
nat.plugins.ragas.rag_evaluator.register import RagasEvaluatorConfig + from nat.plugins.ragas.rag_evaluator.register import register_ragas_evaluator + + builder = MagicMock() + builder.get_llm = AsyncMock(return_value=MagicMock()) + builder.get_max_concurrency = MagicMock(return_value=1) + + config = RagasEvaluatorConfig(llm_name="judge", metric={"AnswerAccuracy": {"skip": True}}) + async with register_ragas_evaluator(config=config, builder=builder) as evaluator_info: + assert hasattr(evaluator_info, "evaluate_fn") + assert not hasattr(evaluator_info, "evaluate_atif_fn") + + builder.get_llm.assert_awaited_once() + + +async def test_register_ragas_evaluator_atif_lane_enabled(): + """Ensure RAGAS ATIF lane can be explicitly enabled by config.""" + from nat.plugins.ragas.rag_evaluator.register import RagasEvaluatorConfig + from nat.plugins.ragas.rag_evaluator.register import register_ragas_evaluator + + builder = MagicMock() + builder.get_llm = AsyncMock(return_value=MagicMock()) + builder.get_max_concurrency = MagicMock(return_value=1) + + config = RagasEvaluatorConfig(llm_name="judge", + metric={"AnswerAccuracy": { + "skip": True + }}, + enable_atif_evaluator=True) + async with register_ragas_evaluator(config=config, builder=builder) as evaluator_info: + assert hasattr(evaluator_info, "evaluate_fn") + assert callable(getattr(evaluator_info, "evaluate_atif_fn", None)) + + builder.get_llm.assert_awaited_once() diff --git a/pyproject.toml b/pyproject.toml index 52de63d06a..b43e9dc10b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,6 +68,7 @@ opentelemetry = ["nvidia-nat-opentelemetry == {version}"] phoenix = ["nvidia-nat-phoenix == {version}"] profiler = ["nvidia-nat-profiler == {version}"] rag = ["nvidia-nat-rag == {version}"] +ragas = ["nvidia-nat-ragas == {version}"] ragaai = ["nvidia-nat-ragaai == {version}"] mysql = ["nvidia-nat-mysql == {version}"] redis = ["nvidia-nat-redis == {version}"] diff --git a/uv.lock b/uv.lock index 0fb0c78fbb..34ee2e4d22 100644 
--- a/uv.lock +++ b/uv.lock @@ -1002,10 +1002,16 @@ sdist = { url = "https://files.pythonhosted.org/packages/92/88/b8527e1b00c1811db wheels = [ { url = "https://files.pythonhosted.org/packages/ec/90/543f556fcfcfa270713eef906b6352ab048e1e557afec12925c991dc93c2/caio-0.9.25-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d6956d9e4a27021c8bd6c9677f3a59eb1d820cc32d0343cea7961a03b1371965", size = 36839, upload-time = "2025-12-26T15:21:40.267Z" }, { url = "https://files.pythonhosted.org/packages/51/3b/36f3e8ec38dafe8de4831decd2e44c69303d2a3892d16ceda42afed44e1b/caio-0.9.25-cp311-cp311-manylinux2010_x86_64.manylinux2014_x86_64.manylinux_2_12_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bf84bfa039f25ad91f4f52944452a5f6f405e8afab4d445450978cd6241d1478", size = 80255, upload-time = "2025-12-26T15:22:20.271Z" }, + { url = "https://files.pythonhosted.org/packages/df/ce/65e64867d928e6aff1b4f0e12dba0ef6d5bf412c240dc1df9d421ac10573/caio-0.9.25-cp311-cp311-manylinux_2_34_aarch64.whl", hash = "sha256:ae3d62587332bce600f861a8de6256b1014d6485cfd25d68c15caf1611dd1f7c", size = 80052, upload-time = "2026-03-04T22:08:20.402Z" }, + { url = "https://files.pythonhosted.org/packages/46/90/e278863c47e14ec58309aa2e38a45882fbe67b4cc29ec9bc8f65852d3e45/caio-0.9.25-cp311-cp311-manylinux_2_34_x86_64.whl", hash = "sha256:fc220b8533dcf0f238a6b1a4a937f92024c71e7b10b5a2dfc1c73604a25709bc", size = 78273, upload-time = "2026-03-04T22:08:21.368Z" }, { url = "https://files.pythonhosted.org/packages/d3/25/79c98ebe12df31548ba4eaf44db11b7cad6b3e7b4203718335620939083c/caio-0.9.25-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:fb7ff95af4c31ad3f03179149aab61097a71fd85e05f89b4786de0359dffd044", size = 36983, upload-time = "2025-12-26T15:21:36.075Z" }, { url = "https://files.pythonhosted.org/packages/a3/2b/21288691f16d479945968a0a4f2856818c1c5be56881d51d4dac9b255d26/caio-0.9.25-cp312-cp312-manylinux2010_x86_64.manylinux2014_x86_64.manylinux_2_12_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:97084e4e30dfa598449d874c4d8e0c8d5ea17d2f752ef5e48e150ff9d240cd64", size = 82012, upload-time = "2025-12-26T15:22:20.983Z" }, + { url = "https://files.pythonhosted.org/packages/03/c4/8a1b580875303500a9c12b9e0af58cb82e47f5bcf888c2457742a138273c/caio-0.9.25-cp312-cp312-manylinux_2_34_aarch64.whl", hash = "sha256:4fa69eba47e0f041b9d4f336e2ad40740681c43e686b18b191b6c5f4c5544bfb", size = 81502, upload-time = "2026-03-04T22:08:22.381Z" }, + { url = "https://files.pythonhosted.org/packages/d1/1c/0fe770b8ffc8362c48134d1592d653a81a3d8748d764bec33864db36319d/caio-0.9.25-cp312-cp312-manylinux_2_34_x86_64.whl", hash = "sha256:6bebf6f079f1341d19f7386db9b8b1f07e8cc15ae13bfdaff573371ba0575d69", size = 80200, upload-time = "2026-03-04T22:08:23.382Z" }, { url = "https://files.pythonhosted.org/packages/31/57/5e6ff127e6f62c9f15d989560435c642144aa4210882f9494204bc892305/caio-0.9.25-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:d6c2a3411af97762a2b03840c3cec2f7f728921ff8adda53d7ea2315a8563451", size = 36979, upload-time = "2025-12-26T15:21:35.484Z" }, { url = "https://files.pythonhosted.org/packages/a3/9f/f21af50e72117eb528c422d4276cbac11fb941b1b812b182e0a9c70d19c5/caio-0.9.25-cp313-cp313-manylinux2010_x86_64.manylinux2014_x86_64.manylinux_2_12_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0998210a4d5cd5cb565b32ccfe4e53d67303f868a76f212e002a8554692870e6", size = 81900, upload-time = "2025-12-26T15:22:21.919Z" }, + { url = "https://files.pythonhosted.org/packages/9c/12/c39ae2a4037cb10ad5eb3578eb4d5f8c1a2575c62bba675f3406b7ef0824/caio-0.9.25-cp313-cp313-manylinux_2_34_aarch64.whl", hash = "sha256:1a177d4777141b96f175fe2c37a3d96dec7911ed9ad5f02bac38aaa1c936611f", size = 81523, upload-time = "2026-03-04T22:08:25.187Z" }, + { url = "https://files.pythonhosted.org/packages/22/59/f8f2e950eb4f1a5a3883e198dca514b9d475415cb6cd7b78b9213a0dd45a/caio-0.9.25-cp313-cp313-manylinux_2_34_x86_64.whl", hash = "sha256:9ed3cfb28c0e99fec5e208c934e5c157d0866aa9c32aa4dc5e9b6034af6286b7", 
size = 80243, upload-time = "2026-03-04T22:08:26.449Z" }, { url = "https://files.pythonhosted.org/packages/86/93/1f76c8d1bafe3b0614e06b2195784a3765bbf7b0a067661af9e2dd47fc33/caio-0.9.25-py3-none-any.whl", hash = "sha256:06c0bb02d6b929119b1cfbe1ca403c768b2013a369e2db46bfa2a5761cf82e40", size = 19087, upload-time = "2025-12-26T15:22:00.221Z" }, ] @@ -5675,7 +5681,7 @@ source = { editable = "examples/evaluation_and_profiling/email_phishing_analyzer dependencies = [ { name = "beautifulsoup4" }, { name = "networkx" }, - { name = "nvidia-nat", extra = ["eval", "langchain", "phoenix", "profiler", "test"] }, + { name = "nvidia-nat", extra = ["eval", "langchain", "phoenix", "profiler", "ragas", "test"] }, { name = "openinference-instrumentation-langchain" }, ] @@ -5808,7 +5814,7 @@ dependencies = [ { name = "nat-alert-triage-agent" }, { name = "nat-simple-calculator" }, { name = "nbclient" }, - { name = "nvidia-nat", extra = ["langchain", "llama-index", "mcp", "profiler", "test"] }, + { name = "nvidia-nat", extra = ["langchain", "llama-index", "mcp", "profiler", "ragas", "test"] }, { name = "python-dotenv", extra = ["cli"] }, ] @@ -6163,7 +6169,7 @@ name = "nat-simple-web-query-eval" source = { editable = "examples/evaluation_and_profiling/simple_web_query_eval" } dependencies = [ { name = "nat-simple-web-query" }, - { name = "nvidia-nat", extra = ["eval", "langchain", "profiler", "test"] }, + { name = "nvidia-nat", extra = ["eval", "langchain", "profiler", "ragas", "test"] }, ] [package.metadata] @@ -6592,6 +6598,9 @@ rag = [ ragaai = [ { name = "nvidia-nat-ragaai" }, ] +ragas = [ + { name = "nvidia-nat-ragas" }, +] redis = [ { name = "nvidia-nat-redis" }, ] @@ -6744,6 +6753,7 @@ requires-dist = [ { name = "nvidia-nat-rag", marker = "extra == 'rag'", editable = "packages/nvidia_nat_rag" }, { name = "nvidia-nat-ragaai", marker = "extra == 'ragaai'", editable = "packages/nvidia_nat_ragaai" }, { name = "nvidia-nat-ragas", marker = "extra == 'most'", editable = 
"packages/nvidia_nat_ragas" }, + { name = "nvidia-nat-ragas", marker = "extra == 'ragas'", editable = "packages/nvidia_nat_ragas" }, { name = "nvidia-nat-redis", marker = "extra == 'most'", editable = "packages/nvidia_nat_redis" }, { name = "nvidia-nat-redis", marker = "extra == 'redis'", editable = "packages/nvidia_nat_redis" }, { name = "nvidia-nat-s3", marker = "extra == 'most'", editable = "packages/nvidia_nat_s3" }, @@ -6764,7 +6774,7 @@ requires-dist = [ { name = "nvidia-nat-zep-cloud", marker = "extra == 'zep-cloud'", editable = "packages/nvidia_nat_zep_cloud" }, { name = "text-file-ingest", marker = "extra == 'examples'", editable = "examples/documentation_guides/workflows/text_file_ingest" }, ] -provides-extras = ["a2a", "adk", "agno", "app", "autogen", "core", "crewai", "eval", "data-flywheel", "fastmcp", "langchain", "llama-index", "mcp", "mem0ai", "nemo-customizer", "openpipe-art", "opentelemetry", "phoenix", "profiler", "rag", "ragaai", "mysql", "redis", "s3", "security", "semantic-kernel", "strands", "test", "vanna", "weave", "zep-cloud", "async-endpoints", "gunicorn", "pii-defense", "most", "examples"] +provides-extras = ["a2a", "adk", "agno", "app", "autogen", "core", "crewai", "eval", "data-flywheel", "fastmcp", "langchain", "llama-index", "mcp", "mem0ai", "nemo-customizer", "openpipe-art", "opentelemetry", "phoenix", "profiler", "rag", "ragas", "ragaai", "mysql", "redis", "s3", "security", "semantic-kernel", "strands", "test", "vanna", "weave", "zep-cloud", "async-endpoints", "gunicorn", "pii-defense", "most", "examples"] [package.metadata.requires-dev] dev = [