From 58c0e48ea4cf2ec3eb5eabbbdcd4dffc62715461 Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Sun, 21 Dec 2025 17:33:42 +0000 Subject: [PATCH 01/16] implement monitor rubric --- verifiers/rubrics/monitor_rubric.py | 31 +++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 verifiers/rubrics/monitor_rubric.py diff --git a/verifiers/rubrics/monitor_rubric.py b/verifiers/rubrics/monitor_rubric.py new file mode 100644 index 000000000..ef927589f --- /dev/null +++ b/verifiers/rubrics/monitor_rubric.py @@ -0,0 +1,31 @@ +from typing import Callable + +from verifiers.rubrics.rubric import Rubric +from verifiers.types import State + + +class MonitorRubric(Rubric): + """Simple rubric that reads values from the state for logging.""" + + def __init__(self, state_keys: list[str] | None = None): + self.state_keys = state_keys or [] + + # build funcs for each state key + reward_funcs = [] + for key in self.state_keys: + reward_funcs.append(self.get_read_from_state(key)) + + reward_weights = [0.0] * len(self.state_keys) # only for logging + + # pass them to parent class + super().__init__(funcs=reward_funcs, weights=reward_weights) + + async def get_read_from_state(self, key: str) -> Callable[[State], float]: + """Create a reward function that reads from the state.""" + + def read_from_state(state: State) -> float: + return float(state.get(key, 0.0)) + + read_from_state.__name__ = key + + return read_from_state From 84f0bbdfbd654bfbfe1d316bf5a2bd2c1de81421 Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Sun, 21 Dec 2025 17:34:07 +0000 Subject: [PATCH 02/16] add to __init__ --- verifiers/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/verifiers/__init__.py b/verifiers/__init__.py index f148395b2..8be5e4b65 100644 --- a/verifiers/__init__.py +++ b/verifiers/__init__.py @@ -28,6 +28,7 @@ from .parsers.think_parser import ThinkParser from .parsers.xml_parser import XMLParser from .rubrics.judge_rubric import JudgeRubric +from .rubrics.monitor_rubric import MonitorRubric from .rubrics.rubric_group import RubricGroup from .rubrics.tool_rubric import ToolRubric from .utils.data_utils import ( @@ -85,6 +86,7 @@ def setup_logging( "JudgeRubric", "RubricGroup", "ToolRubric", + "MonitorRubric", "MathRubric", "TextArenaEnv", "ReasoningGymEnv", From 34324dceec302907dbdd3bc5cd5078c843e19151 Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Sun, 21 Dec 2025 17:41:24 +0000 Subject: [PATCH 03/16] allow arbitrary transforms --- verifiers/rubrics/monitor_rubric.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/verifiers/rubrics/monitor_rubric.py b/verifiers/rubrics/monitor_rubric.py index ef927589f..441d856d9 100644 --- a/verifiers/rubrics/monitor_rubric.py +++ b/verifiers/rubrics/monitor_rubric.py @@ -7,24 +7,32 @@ class MonitorRubric(Rubric): """Simple rubric that reads values from the state for logging.""" - def __init__(self, state_keys: list[str] | None = None): + def __init__( + self, + state_keys: list[str] | None = None, + transforms: list[Callable[..., float] | None] | None = None, + ): self.state_keys = state_keys or [] + self.transforms = transforms or [] + assert len(self.transforms) == len(self.state_keys), ( + "Number of transforms must match number of state keys" + ) - # build funcs for each state key reward_funcs = [] - for key in self.state_keys: - reward_funcs.append(self.get_read_from_state(key)) - + for key, transform in zip(self.state_keys, self.transforms): + reward_funcs.append(self.get_read_from_state(key, transform or float)) reward_weights = [0.0] * len(self.state_keys) # only for logging # pass them to parent class super().__init__(funcs=reward_funcs, weights=reward_weights) - async def get_read_from_state(self, key: str) -> Callable[[State], float]: + async def get_read_from_state( + self, key: str, transform: Callable[..., float] + ) -> Callable: """Create a reward function that reads from the state.""" - def read_from_state(state: State) -> float: - return float(state.get(key, 0.0)) + async def read_from_state(state: State) -> float: + return transform(state.get(key, 0.0)) read_from_state.__name__ = key From dbf7a236bffae37aafe9a7cf6af4b1a807d3f6bc Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Sun, 21 Dec 2025 18:59:27 +0000 Subject: [PATCH 04/16] allow adding rubric --- verifiers/envs/environment.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/verifiers/envs/environment.py b/verifiers/envs/environment.py index 662fa155a..3346c22af 100644 --- a/verifiers/envs/environment.py +++ b/verifiers/envs/environment.py @@ -1058,6 +1058,12 @@ def set_kwargs(self, **kwargs) -> None: else: setattr(self, key, value) + def add_rubric(self, rubric: Rubric) -> None: + if self.rubric is None: + self.rubric = rubric + else: + self.rubric = vf.RubricGroup(rubrics=[self.rubric, rubric]) + def set_max_seq_len(self, max_seq_len: int | None) -> None: """Set the maximum sequence length for this environment.""" self.max_seq_len = max_seq_len From 1fd31f47ab319ed8eda512e226ac3587a457ae8e Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Sun, 21 Dec 2025 18:59:44 +0000 Subject: [PATCH 05/16] make monitor rubric with 3 diff args --- verifiers/rubrics/monitor_rubric.py | 39 +++++++++++++++++++---------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/verifiers/rubrics/monitor_rubric.py b/verifiers/rubrics/monitor_rubric.py index 441d856d9..5d80c6a58 100644 --- a/verifiers/rubrics/monitor_rubric.py +++ b/verifiers/rubrics/monitor_rubric.py @@ -3,37 +3,50 @@ from verifiers.rubrics.rubric import Rubric from verifiers.types import State +StateKey = str +RenamedStateKey = tuple[StateKey, str] +RenamedTransformedStateKey = tuple[StateKey, str, Callable[..., float]] + class MonitorRubric(Rubric): """Simple rubric that reads values from the state for logging.""" def __init__( self, - state_keys: list[str] | None = None, - transforms: list[Callable[..., float] | None] | None = None, + state_keys: list[StateKey | RenamedStateKey | RenamedTransformedStateKey] + | None = None, ): - self.state_keys = state_keys or [] - self.transforms = transforms or [] - assert len(self.transforms) == len(self.state_keys), ( - "Number of transforms must match number of state keys" - ) + self.state_keys: list[ + StateKey | RenamedStateKey | RenamedTransformedStateKey + ] = state_keys or [] reward_funcs = [] - for key, transform in zip(self.state_keys, self.transforms): - reward_funcs.append(self.get_read_from_state(key, transform or float)) + for state_key in self.state_keys: + if isinstance(state_key, str): + reward_func = self.get_read_from_state(state_key) + else: + reward_func = self.get_read_from_state(*state_key) # type: ignore + reward_funcs.append(reward_func) reward_weights = [0.0] * len(self.state_keys) # only for logging # pass them to parent class super().__init__(funcs=reward_funcs, weights=reward_weights) - async def get_read_from_state( - self, key: str, transform: Callable[..., float] + def get_read_from_state( + self, + key: str, + name: str | None = None, + transform: Callable[..., float] = float, ) -> Callable: """Create a reward function that reads from the state.""" async def read_from_state(state: State) -> float: - return transform(state.get(key, 0.0)) + key_parts = key.split(".") + for key_part in key_parts[:-1]: + state = state.get(key_part, {}) + value = state.get(key_parts[-1], 0.0) + return transform(value) - read_from_state.__name__ = key + read_from_state.__name__ = name if name is not None else key return read_from_state From 8510a1a0a5c8e41d95ca0b516e668d5e2c8a2e68 Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Sun, 21 Dec 2025 18:59:57 +0000 Subject: [PATCH 06/16] add monitor metrics in common classes --- verifiers/envs/multiturn_env.py | 7 ++++++ verifiers/envs/python_env.py | 26 +++++++++++++++++----- verifiers/envs/sandbox_env.py | 38 ++++++++++++++++++++++++++++----- verifiers/envs/tool_env.py | 2 ++ 4 files changed, 63 insertions(+), 10 deletions(-) diff --git a/verifiers/envs/multiturn_env.py b/verifiers/envs/multiturn_env.py index 1aec4dd7a..873e1ee26 100644 --- a/verifiers/envs/multiturn_env.py +++ b/verifiers/envs/multiturn_env.py @@ -5,6 +5,7 @@ from openai import AsyncOpenAI import verifiers as vf +from verifiers.rubrics.monitor_rubric import MonitorRubric from verifiers.types import ( Messages, ModelResponse, @@ -23,10 +24,16 @@ logger = logging.getLogger(__name__) +class MultiTurnMonitorRubric(MonitorRubric): + def __init__(self): + super().__init__(state_keys=[("trajectory", "num_turns", len)]) + + class MultiTurnEnv(vf.Environment): def __init__(self, max_turns: int = -1, **kwargs): super().__init__(**kwargs) self.max_turns = max_turns + self.add_rubric(MultiTurnMonitorRubric()) @abstractmethod async def env_response( diff --git a/verifiers/envs/python_env.py b/verifiers/envs/python_env.py index 8c695e1b2..570034596 100644 --- a/verifiers/envs/python_env.py +++ b/verifiers/envs/python_env.py @@ -17,6 +17,16 @@ class PythonWorkerState(TypedDict): ready: bool execution_count: int + ready_wait_time: float + + +class PythonMonitorRubric(vf.MonitorRubric): + def __init__(self): + super().__init__( + state_keys=[ + ("python_state.ready_wait_time", "python_ready_wait_time"), + ] + ) class PythonWorkerNotReadyError(vf.SandboxError): ... @@ -189,6 +199,7 @@ def __init__( start_command=start_command, **kwargs, ) + self.add_rubric(PythonMonitorRubric()) self.add_tool( self.python, args_to_skip=["sandbox_id", "sandbox_state", "python_state"] ) @@ -229,7 +240,7 @@ async def python( ) -> str: """Execute `code` inside persistent Python REPL.""" if not python_state["ready"]: - await self._wait_for_worker_ready(sandbox_state, sandbox_id) + await self._wait_for_worker_ready(sandbox_id, sandbox_state, python_state) python_state["ready"] = True self.logger.debug(f"Executing code\n{code}") sandbox_response = await self._send_worker_request( @@ -242,7 +253,10 @@ async def cleanup_python_state(self, state: vf.State): state.pop("python_state", None) async def _wait_for_worker_ready( - self, sandbox_state: SandboxState, sandbox_id: str + self, + sandbox_id: str, + sandbox_state: SandboxState, + python_state: PythonWorkerState, ) -> None: s = time.time() try: @@ -260,11 +274,13 @@ async def _wait_for_worker_ready( ) if result.exit_code != 0: raise RuntimeError(result.stderr) - self.logger.debug( - f"Waited {time.time() - s:.1f}s for Python worker to be ready" - ) except Exception as e: raise PythonWorkerNotReadyError from e + ready_wait_time = time.time() - s + python_state["ready_wait_time"] = ready_wait_time + self.logger.debug( + f"Waited {ready_wait_time:.1f}s for Python worker to be ready" + ) async def _send_worker_request( self, diff --git a/verifiers/envs/sandbox_env.py b/verifiers/envs/sandbox_env.py index ff7fc52b8..9da66a901 100644 --- a/verifiers/envs/sandbox_env.py +++ b/verifiers/envs/sandbox_env.py @@ -89,6 +89,22 @@ def teardown(self, wait: bool = True) -> None: class SandboxState(TypedDict): ready: bool + ready_wait_time: float + command_execution_times: list[float] + + +class SandboxMonitorRubric(vf.MonitorRubric): + def __init__(self): + super().__init__( + state_keys=[ + ("sandbox_state.ready_wait_time", "sandbox_ready_wait_time"), + ( + "sandbox_state.command_execution_times", + "sandbox_command_execution_time", + lambda x: sum(x) / len(x) if len(x) > 0 else 0.0, + ), + ] + ) class SandboxCreationError(vf.SandboxError): ... @@ -127,6 +143,7 @@ def __init__( stop_errors=stop_errors if stop_errors is not None else [vf.SandboxError], **kwargs, ) + self.add_rubric(SandboxMonitorRubric()) self.timeout_per_command_seconds = timeout_per_command_seconds self.sandbox_client = ThreadedAsyncSandboxClient( max_workers=sandbox_client_max_workers, @@ -173,7 +190,9 @@ async def _wait_for_sandbox_ready( sandbox_state["ready"] = True except Exception as e: raise SandboxNotReadyError(e) - self.logger.debug(f"Waited {time.time() - s:.1f}s for sandbox to be ready") + ready_wait_time = time.time() - s + sandbox_state["ready_wait_time"] = ready_wait_time + self.logger.debug(f"Waited {ready_wait_time:.1f}s for sandbox to be ready") async def bash( self, @@ -197,13 +216,16 @@ async def bash( timeout=self.timeout_per_command_seconds, ) except CommandTimeoutError: - e = time.time() timeout_msg = f"Command timed out after {self.timeout_per_command_seconds}s" self.logger.warning(f"{timeout_msg} in sandbox {sandbox_id}") + sandbox_state["command_execution_times"].append( + self.timeout_per_command_seconds + ) return f"Error: {timeout_msg}" except Exception as e: raise vf.SandboxError from e - e = time.time() + command_execution_time = time.time() - s + sandbox_state["command_execution_times"].append(command_execution_time) stdout = results.stdout.strip() stderr = (results.stderr or "").strip() combined = stdout @@ -213,7 +235,9 @@ async def bash( else: combined = f"stderr:\n{stderr}" output = combined or "(no output)" - self.logger.debug(f"Executed command in {e - s:.1f}s. Got output: {output}") + self.logger.debug( + f"Executed command in {command_execution_time:.1f}s. Got output: {output}" + ) return output async def post_rollout(self, state: vf.State): @@ -252,7 +276,11 @@ async def setup_state(self, state: vf.State, **kwargs) -> vf.State: self.active_sandboxes.add(sandbox.id) self.logger.debug(f"Created sandbox {sandbox.id}") state["sandbox_id"] = sandbox.id - state["sandbox_state"] = {"ready": False} + state["sandbox_state"] = { + "ready": False, + "ready_wait_time": None, + "command_execution_times": [], + } return await super().setup_state(state, **kwargs) def update_tool_args( diff --git a/verifiers/envs/tool_env.py b/verifiers/envs/tool_env.py index 6acd4cca6..c708db09d 100644 --- a/verifiers/envs/tool_env.py +++ b/verifiers/envs/tool_env.py @@ -4,6 +4,7 @@ from openai.types.chat import ChatCompletionAssistantMessageParam import verifiers as vf +from verifiers.rubrics.tool_rubric import ToolRubric from verifiers.utils.async_utils import maybe_await from verifiers.utils.tool_utils import convert_func_to_oai_tool @@ -27,6 +28,7 @@ def __init__( for tool in self.tools } super().__init__(oai_tools=self.oai_tools, max_turns=max_turns, **kwargs) + self.add_rubric(ToolRubric(tools=self.tools)) def _should_stop_for_error(self, err: Exception) -> bool: """Check if error is in stop_errors.""" From 21f75a0535f0d0cf636a68a9ac7e9ca341d504c7 Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Sun, 21 Dec 2025 19:00:04 +0000 Subject: [PATCH 07/16] integrate with math python --- environments/math_python/math_python.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/environments/math_python/math_python.py b/environments/math_python/math_python.py index 87cc69335..4f9a9d106 100644 --- a/environments/math_python/math_python.py +++ b/environments/math_python/math_python.py @@ -31,7 +31,7 @@ def load_environment( parser = vf.Parser(extract_fn=extract_boxed_answer) math_rubric = vf.MathRubric(parser=parser) - vf_env = vf.PythonEnv( + return vf.PythonEnv( dataset=dataset, system_prompt=system_prompt, parser=parser, @@ -50,7 +50,3 @@ def load_environment( sandbox_client_max_workers=sandbox_client_max_workers, **kwargs, ) - assert vf_env.tools is not None - tool_rubric = vf.ToolRubric(tools=vf_env.tools) - vf_env.rubric = vf.RubricGroup(rubrics=[tool_rubric, vf_env.rubric]) - return vf_env From 4d26051da9403c380295e3a1551d013e0397e1e2 Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Sun, 21 Dec 2025 19:19:37 +0000 Subject: [PATCH 08/16] fix tests --- tests/test_env_group.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_env_group.py b/tests/test_env_group.py index 72289fcee..0d834bac7 100644 --- a/tests/test_env_group.py +++ b/tests/test_env_group.py @@ -47,7 +47,7 @@ def func3(completion, **kwargs): assert rubric.env_map == env_map # Should have all unique reward function names - assert set(rubric.all_reward_names) == {"func1", "func2", "func3"} + assert set(rubric.all_reward_names) == {"num_turns", "func1", "func2", "func3"} @pytest.mark.asyncio async def test_env_group_rubric_score_rollout(self, mock_openai_client): From edddce438226212e18c0e770a364cbfdefcf9afb Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Mon, 5 Jan 2026 15:20:52 +0000 Subject: [PATCH 09/16] append rubric if rubric is alr a rubric group --- verifiers/envs/environment.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/verifiers/envs/environment.py b/verifiers/envs/environment.py index 3346c22af..2eb176caa 100644 --- a/verifiers/envs/environment.py +++ b/verifiers/envs/environment.py @@ -1061,6 +1061,8 @@ def set_kwargs(self, **kwargs) -> None: def add_rubric(self, rubric: Rubric) -> None: if self.rubric is None: self.rubric = rubric + elif isinstance(self.rubric, vf.RubricGroup): + self.rubric.rubrics.append(rubric) else: self.rubric = vf.RubricGroup(rubrics=[self.rubric, rubric]) From 099e0253248d765b42eb9fcfb9266cbfa306abec Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Mon, 5 Jan 2026 15:41:53 +0000 Subject: [PATCH 10/16] refactor monitor rubric to support arbitrary metrics + adapt tool rubric to new pattern --- verifiers/__init__.py | 2 - verifiers/envs/multiturn_env.py | 14 +++++-- verifiers/envs/python_env.py | 18 ++++----- verifiers/envs/sandbox_env.py | 24 +++++++----- verifiers/envs/tool_env.py | 54 ++++++++++++++++++++++++- verifiers/rubrics/monitor_rubric.py | 50 ++++++----------------- verifiers/rubrics/tool_rubric.py | 61 ----------------------------- 7 files changed, 97 insertions(+), 126 deletions(-) delete mode 100644 verifiers/rubrics/tool_rubric.py diff --git a/verifiers/__init__.py b/verifiers/__init__.py index 8be5e4b65..47919d24f 100644 --- a/verifiers/__init__.py +++ b/verifiers/__init__.py @@ -30,7 +30,6 @@ from .rubrics.judge_rubric import JudgeRubric from .rubrics.monitor_rubric import MonitorRubric from .rubrics.rubric_group import RubricGroup -from .rubrics.tool_rubric import ToolRubric from .utils.data_utils import ( extract_boxed_answer, extract_hash_answer, @@ -85,7 +84,6 @@ def setup_logging( "Rubric", "JudgeRubric", "RubricGroup", - "ToolRubric", "MonitorRubric", "MathRubric", "TextArenaEnv", diff --git a/verifiers/envs/multiturn_env.py b/verifiers/envs/multiturn_env.py index 873e1ee26..29cb74c6c 100644 --- a/verifiers/envs/multiturn_env.py +++ b/verifiers/envs/multiturn_env.py @@ -25,15 +25,23 @@ class MultiTurnMonitorRubric(MonitorRubric): - def __init__(self): - super().__init__(state_keys=[("trajectory", "num_turns", len)]) + """Monitor rubric that counts the number of turns in multi-turn environments.""" + + def __init__(self, max_turns: int, **kwargs): + super().__init__(**kwargs) + if max_turns > 1: + self.add_metric(self.num_turns) + + async def num_turns(self, state: State) -> int: + return len(state["trajectory"]) class MultiTurnEnv(vf.Environment): def __init__(self, max_turns: int = -1, **kwargs): super().__init__(**kwargs) self.max_turns = max_turns - self.add_rubric(MultiTurnMonitorRubric()) + + self.add_rubric(MultiTurnMonitorRubric(max_turns=max_turns)) @abstractmethod async def env_response( diff --git a/verifiers/envs/python_env.py b/verifiers/envs/python_env.py index 570034596..c93300f30 100644 --- a/verifiers/envs/python_env.py +++ b/verifiers/envs/python_env.py @@ -20,15 +20,6 @@ class PythonWorkerState(TypedDict): ready_wait_time: float -class PythonMonitorRubric(vf.MonitorRubric): - def __init__(self): - super().__init__( - state_keys=[ - ("python_state.ready_wait_time", "python_ready_wait_time"), - ] - ) - - class PythonWorkerNotReadyError(vf.SandboxError): ... @@ -38,6 +29,15 @@ class PythonWorkerRequestError(vf.SandboxError): ... class PythonWorkerDeadError(vf.SandboxError): ... +class PythonMonitorRubric(vf.MonitorRubric): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.add_metric(self.python_ready_wait_time) + + async def python_ready_wait_time(self, state: vf.State) -> float: + return state["python_state"]["ready_wait_time"] + + class PythonEnv(SandboxEnv): """Sandbox-backed environment exposing a persistent Python REPL.""" diff --git a/verifiers/envs/sandbox_env.py b/verifiers/envs/sandbox_env.py index 9da66a901..d6f91073b 100644 --- a/verifiers/envs/sandbox_env.py +++ b/verifiers/envs/sandbox_env.py @@ -94,16 +94,20 @@ class SandboxState(TypedDict): class SandboxMonitorRubric(vf.MonitorRubric): - def __init__(self): - super().__init__( - state_keys=[ - ("sandbox_state.ready_wait_time", "sandbox_ready_wait_time"), - ( - "sandbox_state.command_execution_times", - "sandbox_command_execution_time", - lambda x: sum(x) / len(x) if len(x) > 0 else 0.0, - ), - ] + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.add_metric(self.sandbox_ready_wait_time) + self.add_metric(self.sandbox_command_execution_time) + + async def sandbox_ready_wait_time(self, state: vf.State) -> float: + return state["sandbox_state"]["ready_wait_time"] + + async def sandbox_command_execution_time(self, state: vf.State) -> float: + command_execution_times = state["sandbox_state"]["command_execution_times"] + return ( + sum(command_execution_times) / len(command_execution_times) + if len(command_execution_times) > 0 + else 0.0 ) diff --git a/verifiers/envs/tool_env.py b/verifiers/envs/tool_env.py index c708db09d..62b20b8eb 100644 --- a/verifiers/envs/tool_env.py +++ b/verifiers/envs/tool_env.py @@ -4,11 +4,60 @@ from openai.types.chat import ChatCompletionAssistantMessageParam import verifiers as vf -from verifiers.rubrics.tool_rubric import ToolRubric +from verifiers.rubrics.monitor_rubric import MonitorRubric +from verifiers.types import Messages from verifiers.utils.async_utils import maybe_await from verifiers.utils.tool_utils import convert_func_to_oai_tool +class ToolMonitorRubric(MonitorRubric): + """Monitor rubric that counts the number of tool calls in tool environments.""" + + def __init__(self, tools: list[Callable] | None = None, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.tools = tools or [] + self.tool_names = [tool.__name__ for tool in self.tools] # type: ignore[union-attr] + + # add tool metrics + self.add_metric(self.total_tool_calls) + for tool_name in self.tool_names: + self.add_metric(self.get_tool_call_count_func(tool_name)) + + async def total_tool_calls(self, completion: Messages) -> float: + """Count the total number of tool calls.""" + total = 0 + assert isinstance(completion, list) + for msg in completion: + if msg["role"] == "assistant" and "tool_calls" in msg: + assistant_msg = cast(ChatCompletionAssistantMessageParam, msg) # type: ignore[redundant-cast] + tool_calls = assistant_msg.get("tool_calls", []) + if isinstance(tool_calls, list): + total += len(tool_calls) + return float(total) + + def get_tool_call_count_func(self, tool_name: str) -> Callable: + """Create a metric that counts calls to a specific tool.""" + + async def tool_call_count_func(completion: Messages) -> int: + """Count calls to {tool_name} tool.""" + count = 0 + # Find tool calls in assistant messages + assert isinstance(completion, list) + for msg in completion: + if msg["role"] == "assistant" and "tool_calls" in msg: + assistant_msg = cast(ChatCompletionAssistantMessageParam, msg) # type: ignore[redundant-cast] + tool_calls = assistant_msg.get("tool_calls", []) + for tool_call in tool_calls: + if tool_call.get("function", {}).get("name") == tool_name: + count += 1 + + return count + + tool_call_count_func.__name__ = f"{tool_name}_calls" + return tool_call_count_func + + class ToolEnv(vf.MultiTurnEnv): def __init__( self, @@ -28,7 +77,8 @@ def __init__( for tool in self.tools } super().__init__(oai_tools=self.oai_tools, max_turns=max_turns, **kwargs) - self.add_rubric(ToolRubric(tools=self.tools)) + + self.add_rubric(ToolMonitorRubric(tools=self.tools)) def _should_stop_for_error(self, err: Exception) -> bool: """Check if error is in stop_errors.""" diff --git a/verifiers/rubrics/monitor_rubric.py b/verifiers/rubrics/monitor_rubric.py index 5d80c6a58..39d25d71f 100644 --- a/verifiers/rubrics/monitor_rubric.py +++ b/verifiers/rubrics/monitor_rubric.py @@ -1,7 +1,7 @@ from typing import Callable from verifiers.rubrics.rubric import Rubric -from verifiers.types import State +from verifiers.types import RewardFunc StateKey = str RenamedStateKey = tuple[StateKey, str] @@ -9,44 +9,16 @@ class MonitorRubric(Rubric): - """Simple rubric that reads values from the state for logging.""" + """Simple rubric that only contains metrics for logging.""" - def __init__( - self, - state_keys: list[StateKey | RenamedStateKey | RenamedTransformedStateKey] - | None = None, - ): - self.state_keys: list[ - StateKey | RenamedStateKey | RenamedTransformedStateKey - ] = state_keys or [] + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert all(weight == 0.0 for weight in self.weights) - reward_funcs = [] - for state_key in self.state_keys: - if isinstance(state_key, str): - reward_func = self.get_read_from_state(state_key) - else: - reward_func = self.get_read_from_state(*state_key) # type: ignore - reward_funcs.append(reward_func) - reward_weights = [0.0] * len(self.state_keys) # only for logging + def add_reward_func(self, *args, **kwargs): + """Cannot add reward func to monitor rubric.""" + self.logger.warning("Cannot add reward func to monitor rubric. Ignoring.") - # pass them to parent class - super().__init__(funcs=reward_funcs, weights=reward_weights) - - def get_read_from_state( - self, - key: str, - name: str | None = None, - transform: Callable[..., float] = float, - ) -> Callable: - """Create a reward function that reads from the state.""" - - async def read_from_state(state: State) -> float: - key_parts = key.split(".") - for key_part in key_parts[:-1]: - state = state.get(key_part, {}) - value = state.get(key_parts[-1], 0.0) - return transform(value) - - read_from_state.__name__ = name if name is not None else key - - return read_from_state + def add_metric(self, func: RewardFunc, *args, **kwargs): + """Ensure that the metric has weight 0.0""" + super().add_metric(func, weight=0.0) diff --git a/verifiers/rubrics/tool_rubric.py b/verifiers/rubrics/tool_rubric.py deleted file mode 100644 index 210020926..000000000 --- a/verifiers/rubrics/tool_rubric.py +++ /dev/null @@ -1,61 +0,0 @@ -from typing import Callable, cast - -from openai.types.chat import ChatCompletionAssistantMessageParam - -from verifiers.rubrics.rubric import Rubric -from verifiers.types import Messages -from verifiers.utils.tool_utils import convert_func_to_oai_tool - - -class ToolRubric(Rubric): - """Simple rubric that counts tool calls in completion messages.""" - - def __init__(self, tools: list[Callable] | None = None): - self.tools = tools or [] - self.oai_tools = [convert_func_to_oai_tool(tool) for tool in self.tools] - self.tool_names = [tool.__name__ for tool in self.tools] # type: ignore[union-attr] - - # Build initial reward functions and weights - reward_funcs = [] - reward_funcs.append(self.total_tool_calls) - reward_weights = [0.0] - - for tool_name in self.tool_names: - reward_funcs.append(self.get_tool_call_count_func(tool_name)) - reward_weights.append(0.0) - - # Pass them to parent class - super().__init__(funcs=reward_funcs, weights=reward_weights) - - async def total_tool_calls(self, completion: Messages) -> float: - """Count the total number of tool calls across all assistant messages.""" - total = 0 - assert isinstance(completion, list) - for msg in completion: - if msg["role"] == "assistant" and "tool_calls" in msg: - assistant_msg = cast(ChatCompletionAssistantMessageParam, msg) # type: ignore[redundant-cast] - tool_calls = assistant_msg.get("tool_calls", []) - if isinstance(tool_calls, list): - total += len(tool_calls) - return float(total) - - def get_tool_call_count_func(self, tool_name: str) -> Callable: - """Create a reward function that counts calls to a specific tool.""" - - async def tool_call_count_func(completion: Messages) -> float: - """Count calls to {tool_name} tool.""" - count = 0 - # Find tool calls in assistant messages - assert isinstance(completion, list) - for msg in completion: - if msg["role"] == "assistant" and "tool_calls" in msg: - assistant_msg = cast(ChatCompletionAssistantMessageParam, msg) # type: ignore[redundant-cast] - tool_calls = assistant_msg.get("tool_calls", []) - for tool_call in tool_calls: - if tool_call.get("function", {}).get("name") == tool_name: - count += 1 - - return float(count) - - tool_call_count_func.__name__ = f"{tool_name}_calls" - return tool_call_count_func From 14b566f6d5a9439a18e2784390602bdffbf0059a Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Mon, 5 Jan 2026 15:42:10 +0000 Subject: [PATCH 11/16] move rubric --- verifiers/envs/sandbox_env.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/verifiers/envs/sandbox_env.py b/verifiers/envs/sandbox_env.py index d6f91073b..d6b12efcf 100644 --- a/verifiers/envs/sandbox_env.py +++ b/verifiers/envs/sandbox_env.py @@ -93,6 +93,12 @@ class SandboxState(TypedDict): command_execution_times: list[float] +class SandboxCreationError(vf.SandboxError): ... + + +class SandboxNotReadyError(vf.SandboxError): ... + + class SandboxMonitorRubric(vf.MonitorRubric): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -111,12 +117,6 @@ async def sandbox_command_execution_time(self, state: vf.State) -> float: ) -class SandboxCreationError(vf.SandboxError): ... - - -class SandboxNotReadyError(vf.SandboxError): ... - - class SandboxEnv(vf.StatefulToolEnv): def __init__( self, From f85bdaa71f5f9ea729daada11ed7d0261d46636f Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Mon, 5 Jan 2026 16:10:24 +0000 Subject: [PATCH 12/16] fix tests --- verifiers/envs/multiturn_env.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/verifiers/envs/multiturn_env.py b/verifiers/envs/multiturn_env.py index 29cb74c6c..491679cae 100644 --- a/verifiers/envs/multiturn_env.py +++ b/verifiers/envs/multiturn_env.py @@ -27,10 +27,9 @@ class MultiTurnMonitorRubric(MonitorRubric): """Monitor rubric that counts the number of turns in multi-turn environments.""" - def __init__(self, max_turns: int, **kwargs): + def __init__(self, **kwargs): super().__init__(**kwargs) - if max_turns > 1: - self.add_metric(self.num_turns) + self.add_metric(self.num_turns) async def num_turns(self, state: State) -> int: return len(state["trajectory"]) @@ -41,7 +40,7 @@ def __init__(self, max_turns: int = -1, **kwargs): super().__init__(**kwargs) self.max_turns = max_turns - self.add_rubric(MultiTurnMonitorRubric(max_turns=max_turns)) + self.add_rubric(MultiTurnMonitorRubric()) @abstractmethod async def env_response( From e07e029bd948682613cbe868f91128a925273b5f Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Mon, 5 Jan 2026 16:13:58 +0000 Subject: [PATCH 13/16] remove simplistic docstrings --- verifiers/envs/multiturn_env.py | 2 -- verifiers/envs/tool_env.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/verifiers/envs/multiturn_env.py b/verifiers/envs/multiturn_env.py index 491679cae..69a445e56 100644 --- a/verifiers/envs/multiturn_env.py +++ b/verifiers/envs/multiturn_env.py @@ -25,8 +25,6 @@ class MultiTurnMonitorRubric(MonitorRubric): - """Monitor rubric that counts the number of turns in multi-turn environments.""" - def __init__(self, **kwargs): super().__init__(**kwargs) self.add_metric(self.num_turns) diff --git a/verifiers/envs/tool_env.py b/verifiers/envs/tool_env.py index 62b20b8eb..4fe541e72 100644 --- a/verifiers/envs/tool_env.py +++ b/verifiers/envs/tool_env.py @@ -11,8 +11,6 @@ class ToolMonitorRubric(MonitorRubric): - """Monitor rubric that counts the number of tool calls in tool environments.""" - def __init__(self, tools: list[Callable] | None = None, *args, **kwargs): super().__init__(*args, **kwargs) From 4be2f68e72248967a3e3f33f157a11bd3a2eb14f Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Mon, 5 Jan 2026 16:16:46 +0000 Subject: [PATCH 14/16] remove explicit monitor metric --- verifiers/envs/multiturn_env.py | 3 +-- verifiers/envs/python_env.py | 6 +++--- verifiers/envs/sandbox_env.py | 6 +++--- verifiers/envs/tool_env.py | 7 +++---- verifiers/rubrics/monitor_rubric.py | 24 ------------------------ 5 files changed, 10 insertions(+), 36 deletions(-) delete mode 100644 verifiers/rubrics/monitor_rubric.py diff --git a/verifiers/envs/multiturn_env.py b/verifiers/envs/multiturn_env.py index 69a445e56..9841cf6d8 100644 --- a/verifiers/envs/multiturn_env.py +++ b/verifiers/envs/multiturn_env.py @@ -5,7 +5,6 @@ from openai import AsyncOpenAI import verifiers as vf -from verifiers.rubrics.monitor_rubric import MonitorRubric from verifiers.types import ( Messages, ModelResponse, @@ -24,7 +23,7 @@ logger = logging.getLogger(__name__) -class MultiTurnMonitorRubric(MonitorRubric): +class MultiTurnMonitorRubric(vf.Rubric): def __init__(self, **kwargs): super().__init__(**kwargs) self.add_metric(self.num_turns) diff --git a/verifiers/envs/python_env.py b/verifiers/envs/python_env.py index c93300f30..4049b2660 100644 --- a/verifiers/envs/python_env.py +++ b/verifiers/envs/python_env.py @@ -29,9 +29,9 @@ class PythonWorkerRequestError(vf.SandboxError): ... class PythonWorkerDeadError(vf.SandboxError): ... -class PythonMonitorRubric(vf.MonitorRubric): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) +class PythonMonitorRubric(vf.Rubric): + def __init__(self, **kwargs): + super().__init__(**kwargs) self.add_metric(self.python_ready_wait_time) async def python_ready_wait_time(self, state: vf.State) -> float: diff --git a/verifiers/envs/sandbox_env.py b/verifiers/envs/sandbox_env.py index d6b12efcf..af88a816d 100644 --- a/verifiers/envs/sandbox_env.py +++ b/verifiers/envs/sandbox_env.py @@ -99,9 +99,9 @@ class SandboxCreationError(vf.SandboxError): ... class SandboxNotReadyError(vf.SandboxError): ... -class SandboxMonitorRubric(vf.MonitorRubric): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) +class SandboxMonitorRubric(vf.Rubric): + def __init__(self, **kwargs): + super().__init__(**kwargs) self.add_metric(self.sandbox_ready_wait_time) self.add_metric(self.sandbox_command_execution_time) diff --git a/verifiers/envs/tool_env.py b/verifiers/envs/tool_env.py index 4fe541e72..fa622c5a7 100644 --- a/verifiers/envs/tool_env.py +++ b/verifiers/envs/tool_env.py @@ -4,15 +4,14 @@ from openai.types.chat import ChatCompletionAssistantMessageParam import verifiers as vf -from verifiers.rubrics.monitor_rubric import MonitorRubric from verifiers.types import Messages from verifiers.utils.async_utils import maybe_await from verifiers.utils.tool_utils import convert_func_to_oai_tool -class ToolMonitorRubric(MonitorRubric): - def __init__(self, tools: list[Callable] | None = None, *args, **kwargs): - super().__init__(*args, **kwargs) +class ToolMonitorRubric(vf.Rubric): + def __init__(self, tools: list[Callable] | None = None, **kwargs): + super().__init__(**kwargs) self.tools = tools or [] self.tool_names = [tool.__name__ for tool in self.tools] # type: ignore[union-attr] diff --git a/verifiers/rubrics/monitor_rubric.py b/verifiers/rubrics/monitor_rubric.py deleted file mode 100644 index 39d25d71f..000000000 --- a/verifiers/rubrics/monitor_rubric.py +++ /dev/null @@ -1,24 +0,0 @@ -from typing import Callable - -from verifiers.rubrics.rubric import Rubric -from verifiers.types import RewardFunc - -StateKey = str -RenamedStateKey = tuple[StateKey, str] -RenamedTransformedStateKey = tuple[StateKey, str, Callable[..., float]] - - -class MonitorRubric(Rubric): - """Simple rubric that only contains metrics for logging.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - assert all(weight == 0.0 for weight in self.weights) - - def add_reward_func(self, *args, **kwargs): - """Cannot add reward func to monitor rubric.""" - self.logger.warning("Cannot add reward func to monitor rubric. Ignoring.") - - def add_metric(self, func: RewardFunc, *args, **kwargs): - """Ensure that the metric has weight 0.0""" - super().add_metric(func, weight=0.0) From 6a54dfebbf9c0cd0fa93611a40e06636f68dd6b6 Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Mon, 5 Jan 2026 16:17:40 +0000 Subject: [PATCH 15/16] also remove from __init__ --- verifiers/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/verifiers/__init__.py b/verifiers/__init__.py index 47919d24f..528c94fe4 100644 --- a/verifiers/__init__.py +++ b/verifiers/__init__.py @@ -28,7 +28,6 @@ from .parsers.think_parser import ThinkParser from .parsers.xml_parser import XMLParser from .rubrics.judge_rubric import JudgeRubric -from .rubrics.monitor_rubric import MonitorRubric from .rubrics.rubric_group import RubricGroup from .utils.data_utils import ( extract_boxed_answer, @@ -84,7 +83,6 @@ def setup_logging( "Rubric", "JudgeRubric", "RubricGroup", - "MonitorRubric", "MathRubric", "TextArenaEnv", "ReasoningGymEnv", From 573d68fd85b33c666578af7c9bf004f5239e7126 Mon Sep 17 00:00:00 2001 From: Mika Senghaas Date: Mon, 5 Jan 2026 18:08:10 +0000 Subject: [PATCH 16/16] fix state init --- verifiers/envs/python_env.py | 1 + verifiers/envs/sandbox_env.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/verifiers/envs/python_env.py b/verifiers/envs/python_env.py index 4049b2660..b5651ec82 100644 --- a/verifiers/envs/python_env.py +++ b/verifiers/envs/python_env.py @@ -210,6 +210,7 @@ async def setup_state(self, state: vf.State, **kwargs: Any) -> vf.State: state["python_state"] = { "ready": False, "execution_count": 0, + "ready_wait_time": -1.0, } return state diff --git a/verifiers/envs/sandbox_env.py b/verifiers/envs/sandbox_env.py index af88a816d..794b88c16 100644 --- a/verifiers/envs/sandbox_env.py +++ b/verifiers/envs/sandbox_env.py @@ -282,7 +282,7 @@ async def setup_state(self, state: vf.State, **kwargs) -> vf.State: state["sandbox_id"] = sandbox.id state["sandbox_state"] = { "ready": False, - "ready_wait_time": None, + "ready_wait_time": -1.0, "command_execution_times": [], } return await super().setup_state(state, **kwargs)