Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
6f9d204
Add ATIF-native evaluator interface and runtime dual-lane dispatch
AnuradhaKaruppiah Mar 4, 2026
edfb385
Add ATIF ragas evaluator
AnuradhaKaruppiah Mar 4, 2026
e082588
Config to enable or disable ATIF eval lane
AnuradhaKaruppiah Mar 4, 2026
f1dee62
Add ragas as an extra to get rid of example install warnings
AnuradhaKaruppiah Mar 5, 2026
7b8ba34
Style fixes
AnuradhaKaruppiah Mar 5, 2026
11076de
Merge remote-tracking branch 'upstream/develop' into ak-eval-atif-2
AnuradhaKaruppiah Mar 5, 2026
744cac6
Fix warning seen in eval callback manager
AnuradhaKaruppiah Mar 5, 2026
7ebf1ac
Add an option to write the workflow output with ATIF traj
AnuradhaKaruppiah Mar 5, 2026
3eca62e
Style fixes
AnuradhaKaruppiah Mar 5, 2026
bc76a57
Add a simple script to get a quick view of drifts between runs
AnuradhaKaruppiah Mar 5, 2026
6e8c687
Style fixes
AnuradhaKaruppiah Mar 5, 2026
4a06115
Fix test failures
AnuradhaKaruppiah Mar 5, 2026
3f0aa55
Add an atif evaluation to langchain trajectory
AnuradhaKaruppiah Mar 5, 2026
8c7bb67
Update examples with traj evaluator
AnuradhaKaruppiah Mar 6, 2026
9ceef5b
Traj evaluator fixes
AnuradhaKaruppiah Mar 6, 2026
d1cd99f
Added atif config control knob
AnuradhaKaruppiah Mar 6, 2026
d14caae
Add atif version of all config example for easy A/B comparison
AnuradhaKaruppiah Mar 6, 2026
be81689
Update output directories
AnuradhaKaruppiah Mar 6, 2026
584c22b
style fixes
AnuradhaKaruppiah Mar 6, 2026
2625664
Add ATIF lane for the tunable rag evaluator
AnuradhaKaruppiah Mar 6, 2026
e7a895c
Style checks
AnuradhaKaruppiah Mar 6, 2026
285a24e
Add an atif version for A/B comp testing
AnuradhaKaruppiah Mar 6, 2026
c909f03
Have workflow output written in atif format
AnuradhaKaruppiah Mar 6, 2026
5754da4
Update uv.lock of examples that need ragas
AnuradhaKaruppiah Mar 6, 2026
2bdc5d9
Minor enhancements to comparison script
AnuradhaKaruppiah Mar 6, 2026
7fb0f9f
Merge remote-tracking branch 'upstream/develop' into ak-eval-atif-2
AnuradhaKaruppiah Mar 6, 2026
9b6db15
Pull the evaluation component into a lightweight eval harness
AnuradhaKaruppiah Mar 6, 2026
ab34fc6
Cleanup boiler plate
AnuradhaKaruppiah Mar 6, 2026
fefd003
Style fixes
AnuradhaKaruppiah Mar 6, 2026
1190af0
Auto-register FileEvalCallback for direct EvaluationRun class usage
AnuradhaKaruppiah Mar 6, 2026
a453ba0
Add review comments
AnuradhaKaruppiah Mar 6, 2026
9361459
Address code review feedback
AnuradhaKaruppiah Mar 6, 2026
64b5dd1
Re-add writing in atif format for file callback
AnuradhaKaruppiah Mar 6, 2026
4a5c399
Update CLI to show atif output file
AnuradhaKaruppiah Mar 6, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Function groups declared for this config.
function_groups:
  # Referenced by name in `workflow.tool_names`.
  calculator:
    _type: calculator

# Standalone functions declared for this config.
functions:
  # Referenced by name in `workflow.tool_names`.
  current_datetime:
    _type: current_datetime

# Named LLM definitions; other sections refer to them through `llm_name`.
llms:
  # Referenced by `workflow.llm_name`.
  nim_llm:
    _type: nim
    model_name: nvidia/nemotron-3-nano-30b-a3b
    temperature: 0.0
    max_tokens: 1024
  # Referenced by the `tuneable_eval` evaluator's `llm_name`.
  eval_llm:
    _type: nim
    model_name: mistralai/mixtral-8x22b-instruct-v0.1
    temperature: 0.0
    max_tokens: 1024
  # Not referenced by any other section of this file.
  openai_llm:
    _type: openai
    model_name: gpt-3.5-turbo
    max_tokens: 2000

# Workflow definition: a `react_agent` that uses the tools and LLM named below.
workflow:
  _type: react_agent
  # Names must match entries declared under `function_groups` / `functions`.
  tool_names: [calculator, current_datetime]
  # Must match an entry declared under `llms`.
  llm_name: nim_llm
  verbose: true
  parse_agent_response_max_retries: 3


# Evaluation settings for the ATIF variant of the tunable RAG evaluator example.
# NOTE(review): nesting was reconstructed from a flattened copy. `output` and
# `dataset` are assumed to live under `general`, and
# `write_atif_workflow_output` under `output` — confirm against the eval
# config schema.
eval:
  general:
    max_concurrency: 1
    output:
      dir: .tmp/nat/examples/getting_started/simple_calculator/atif
      write_atif_workflow_output: true
    dataset:
      _type: json
      file_path: examples/getting_started/simple_calculator/data/simple_calculator.json
  evaluators:
    tuneable_eval:
      _type: tunable_rag_evaluator
      enable_atif_evaluator: true
      # Must match an entry declared under `llms`.
      llm_name: eval_llm
      default_scoring: true
      # The three weights below sum to 1.0.
      default_score_weights:
        coverage: 0.5
        correctness: 0.3
        relevance: 0.2
      # Folded scalar (`>`): single line breaks between the lines below are
      # folded into spaces when the value is loaded.
      judge_llm_prompt: >
        You are an intelligent evaluator that scores the generated answer based on the description of the expected answer.
        The score is a measure of how well the generated answer matches the description of the expected answer based on the question.
        Take into account the question, the relevance of the answer to the question and the quality compared to the description of the expected answer.
        Rules:
        - The score must be a float of any value between 0.0 and 1.0 on a sliding scale.
        - The reasoning string must be concise and to the point. It should be 1 sentence and 2 only if extra description is needed. It must explain why the score was given and what is different between the generated answer and the expected answer.
        - The tags <image> and <chart> are real images and charts.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Functions declared for this config; both are listed in `workflow.tool_names`.
functions:
  webpage_query:
    _type: webpage_query
    webpage_url: https://docs.smith.langchain.com
    description: "Search for information about LangSmith. For any questions about LangSmith, you must use this tool!"
    # Must match an entry declared under `embedders`.
    embedder_name: nv-embedqa-e5-v5
  current_datetime:
    _type: current_datetime

# Named LLM definitions; other sections refer to them through `llm_name`.
llms:
  # Referenced by `workflow.llm_name`.
  nim_llm:
    _type: nim
    model_name: nvidia/nemotron-3-nano-30b-a3b
    temperature: 0.0
  # Referenced by the three `ragas` evaluators.
  nim_rag_eval_llm:
    _type: nim
    model_name: nvidia/nemotron-3-nano-30b-a3b
    max_tokens: 8
  # Referenced by the `trajectory_accuracy` evaluator.
  nim_trajectory_eval_llm:
    _type: nim
    model_name: nvidia/nemotron-3-nano-30b-a3b
    temperature: 0.0
    max_tokens: 1024

# Named embedder definitions.
embedders:
  # Referenced by `functions.webpage_query.embedder_name`.
  nv-embedqa-e5-v5:
    _type: nim
    model_name: nvidia/nv-embedqa-e5-v5

# Workflow definition: a `react_agent` that uses the tools and LLM named below.
workflow:
  _type: react_agent
  # Names must match entries declared under `functions`.
  tool_names: [webpage_query, current_datetime]
  # Must match an entry declared under `llms`.
  llm_name: nim_llm
  verbose: true
  parse_agent_response_max_retries: 3

# Evaluation settings for the ATIF variant of the simple web query example.
# NOTE(review): nesting was reconstructed from a flattened copy. `output` and
# `dataset` are assumed to live under `general`, and `cleanup` /
# `write_atif_workflow_output` under `output` — confirm against the eval
# config schema.
eval:
  general:
    max_concurrency: 1
    output:
      dir: ./.tmp/nat/examples/evaluation_and_profiling/simple_web_query_eval/atif/
      cleanup: true
      write_atif_workflow_output: true
    dataset:
      _type: json
      file_path: examples/evaluation_and_profiling/simple_web_query_eval/data/langsmith.json
  evaluators:
    # RAGAS evaluators now run through the ATIF-native evaluator lane.
    # Each one sets `enable_atif_evaluator: true` and uses `nim_rag_eval_llm`.
    accuracy:
      _type: ragas
      enable_atif_evaluator: true
      metric: AnswerAccuracy
      llm_name: nim_rag_eval_llm
    groundedness:
      _type: ragas
      enable_atif_evaluator: true
      metric: ResponseGroundedness
      llm_name: nim_rag_eval_llm
    relevance:
      _type: ragas
      enable_atif_evaluator: true
      metric: ContextRelevance
      llm_name: nim_rag_eval_llm
    # Trajectory evaluator, also with the ATIF lane enabled; it uses its own
    # LLM entry rather than the one shared by the RAGAS evaluators.
    trajectory_accuracy:
      _type: trajectory
      enable_atif_evaluator: true
      llm_name: nim_trajectory_eval_llm
Loading