diff --git a/packages/sdk/server-ai/src/ldai/client.py b/packages/sdk/server-ai/src/ldai/client.py
index d4a10bb..6057107 100644
--- a/packages/sdk/server-ai/src/ldai/client.py
+++ b/packages/sdk/server-ai/src/ldai/client.py
@@ -215,9 +215,27 @@ def _judge_config(
         default: AIJudgeConfigDefault,
         variables: Optional[Dict[str, Any]] = None,
     ) -> AIJudgeConfig:
+        if variables is not None:
+            if variables.get('message_history') is not None:
+                log.warning(
+                    "The variable 'message_history' is reserved by the judge and will be ignored."
+                )
+            if variables.get('response_to_evaluate') is not None:
+                log.warning(
+                    "The variable 'response_to_evaluate' is reserved by the judge and will be ignored."
+                )
+
+        # Re-inject the reserved variables as their literal placeholders so they
+        # survive Mustache interpolation in ``__evaluate``. Without this, legacy
+        # templates like ``{{message_history}}`` get rendered to empty strings and
+        # ``_strip_legacy_judge_messages`` below cannot detect them.
+        extended_variables = dict(variables) if variables else {}
+        extended_variables['message_history'] = '{{message_history}}'
+        extended_variables['response_to_evaluate'] = '{{response_to_evaluate}}'
+
         (model, provider, messages, instructions, tracker_factory, enabled,
          judge_configuration, variation) = self.__evaluate(
-            key, context, default.to_dict(), variables
+            key, context, default.to_dict(), extended_variables
         )
 
         def _extract_evaluation_metric_key(variation: Dict[str, Any]) -> Optional[str]:
@@ -336,12 +354,8 @@ def _create_judge_instance(
         when materializing judges referenced by an AI config's judge configuration.
         """
         try:
-            extended_variables = dict(variables) if variables else {}
-            extended_variables['message_history'] = '{{message_history}}'
-            extended_variables['response_to_evaluate'] = '{{response_to_evaluate}}'
-
             judge_config = self._judge_config(
-                key, context, default or _DISABLED_JUDGE_DEFAULT, extended_variables
+                key, context, default or _DISABLED_JUDGE_DEFAULT, variables
             )
 
             if not judge_config.enabled:
diff --git a/packages/sdk/server-ai/tests/test_judge.py b/packages/sdk/server-ai/tests/test_judge.py
index e1c6b90..f9c5bf4 100644
--- a/packages/sdk/server-ai/tests/test_judge.py
+++ b/packages/sdk/server-ai/tests/test_judge.py
@@ -1,11 +1,12 @@
 """Tests for Judge functionality."""
 
-from unittest.mock import AsyncMock, MagicMock, call
+from unittest.mock import AsyncMock, MagicMock, call, patch
 
 import pytest
 from ldclient import Config, Context, LDClient
 from ldclient.integrations.test_data import TestData
 
+from ldai import LDAIClient
 from ldai.judge import Judge, _strip_legacy_judge_messages
 from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder
 from ldai.models import (
@@ -543,6 +544,137 @@ async def test_evaluate_messages_calls_evaluate(
     assert tracker.track_metrics_of_async.called
 
 
+class TestJudgeConfigStripsLegacyMessages:
+    """Tests for ``LDAIClient.judge_config()`` legacy-message stripping.
+
+    Both ``LDAIClient.judge_config()`` (the direct public API) and
+    ``LDAIClient.create_judge()`` (the wrapper API) reach the same internal
+    ``_judge_config()`` method. ``_judge_config()`` is responsible for
+    injecting ``message_history``/``response_to_evaluate`` markers so they
+    survive Mustache rendering, then stripping any legacy template messages
+    before returning the config.
+    """
+
+    @pytest.fixture
+    def context(self) -> Context:
+        return Context.create('user-key')
+
+    def _make_client(self, td: TestData) -> LDAIClient:
+        config = Config('sdk-key', update_processor_class=td, send_events=False)
+        return LDAIClient(LDClient(config=config))
+
+    def test_judge_config_strips_legacy_messages_from_returned_config(self, context):
+        """Calling ``judge_config()`` directly (no variables) still strips legacy messages.
+
+        This is a regression test for the bug where legacy messages leaked
+        through the public ``judge_config()`` entry point because the
+        reserved-variable markers were only injected in ``_create_judge_instance``.
+        """
+        td = TestData.data_source()
+        td.update(
+            td.flag('legacy-judge')
+            .variations({
+                'model': {'name': 'gpt-4'},
+                'provider': {'name': 'openai'},
+                'messages': [
+                    {'role': 'system', 'content': 'You are a judge.'},
+                    {'role': 'assistant', 'content': '{{message_history}}'},
+                    {'role': 'user', 'content': 'Evaluate: {{response_to_evaluate}}'},
+                ],
+                'evaluationMetricKey': '$ld:ai:judge:relevance',
+                '_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1},
+            })
+            .variation_for_all(0)
+        )
+        client = self._make_client(td)
+
+        result = client.judge_config('legacy-judge', context)
+
+        assert result.enabled is True
+        assert result.messages is not None
+        assert len(result.messages) == 1
+        assert result.messages[0].role == 'system'
+        assert result.messages[0].content == 'You are a judge.'
+
+    def test_judge_config_passes_user_variables_to_template(self, context):
+        """User variables are still interpolated into the system message."""
+        td = TestData.data_source()
+        td.update(
+            td.flag('parametric-judge')
+            .variations({
+                'model': {'name': 'gpt-4'},
+                'provider': {'name': 'openai'},
+                'messages': [
+                    {'role': 'system', 'content': 'You are a {{tone}} judge.'},
+                ],
+                'evaluationMetricKey': '$ld:ai:judge:relevance',
+                '_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1},
+            })
+            .variation_for_all(0)
+        )
+        client = self._make_client(td)
+
+        result = client.judge_config(
+            'parametric-judge', context, variables={'tone': 'strict'}
+        )
+
+        assert result.messages is not None
+        assert result.messages[0].content == 'You are a strict judge.'
+
+    def test_judge_config_warns_on_reserved_variables(self, context):
+        """``_judge_config`` warns when callers pass reserved variable names."""
+        td = TestData.data_source()
+        td.update(
+            td.flag('judge-config')
+            .variations({
+                'model': {'name': 'gpt-4'},
+                'provider': {'name': 'openai'},
+                'messages': [{'role': 'system', 'content': 'You are a judge.'}],
+                'evaluationMetricKey': '$ld:ai:judge:relevance',
+                '_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1},
+            })
+            .variation_for_all(0)
+        )
+        client = self._make_client(td)
+
+        with patch('ldai.client.log') as mock_log:
+            client.judge_config(
+                'judge-config',
+                context,
+                variables={
+                    'message_history': 'should be ignored',
+                    'response_to_evaluate': 'should be ignored',
+                },
+            )
+
+        warning_messages = [c.args[0] for c in mock_log.warning.call_args_list]
+        assert any("'message_history' is reserved" in m for m in warning_messages)
+        assert any("'response_to_evaluate' is reserved" in m for m in warning_messages)
+
+    def test_judge_config_does_not_warn_without_reserved_variables(self, context):
+        """No warnings should be emitted when callers pass non-reserved variables."""
+        td = TestData.data_source()
+        td.update(
+            td.flag('judge-config')
+            .variations({
+                'model': {'name': 'gpt-4'},
+                'provider': {'name': 'openai'},
+                'messages': [{'role': 'system', 'content': 'You are a judge.'}],
+                'evaluationMetricKey': '$ld:ai:judge:relevance',
+                '_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1},
+            })
+            .variation_for_all(0)
+        )
+        client = self._make_client(td)
+
+        with patch('ldai.client.log') as mock_log:
+            client.judge_config('judge-config', context, variables={'tone': 'strict'})
+
+        warning_messages = [c.args[0] for c in mock_log.warning.call_args_list]
+        assert not any("'message_history' is reserved" in m for m in warning_messages)
+        assert not any("'response_to_evaluate' is reserved" in m for m in warning_messages)
+
+
 class TestEvaluationSchemaBuilder:
     """Tests for EvaluationSchemaBuilder."""
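
Note on the re-injection comment in `_judge_config` above: mapping each reserved variable to its own literal placeholder makes Mustache rendering a no-op for those names, which is what lets the legacy markers survive long enough for the stripping pass to find them. A minimal, self-contained sketch of that behavior, assuming a spec-compliant Mustache renderer such as the `chevron` package (the SDK's actual template engine may differ):

```python
import chevron  # assumption: any spec-compliant Mustache renderer behaves the same

template = 'Evaluate: {{response_to_evaluate}} given {{message_history}}'

# Without re-injection, unknown variables render to empty strings, so the
# legacy markers are destroyed before the stripping pass can detect them:
assert chevron.render(template, {'tone': 'strict'}) == 'Evaluate:  given '

# Re-injecting each reserved name as its own literal placeholder makes the
# render a fixed point for those variables; the markers survive intact:
extended = {
    'tone': 'strict',
    'message_history': '{{message_history}}',
    'response_to_evaluate': '{{response_to_evaluate}}',
}
assert chevron.render(template, extended) == template
```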
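For readers who do not want to trace into `ldai.judge`, the first test above pins down the stripping contract end to end. The sketch below illustrates only that observable contract; `strip_legacy_sketch` is a hypothetical stand-in, not the SDK's `_strip_legacy_judge_messages` implementation, which may differ internally:

```python
from typing import Dict, List

RESERVED_MARKERS = ('{{message_history}}', '{{response_to_evaluate}}')

def strip_legacy_sketch(messages: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Drop any message whose content still carries a reserved legacy marker."""
    return [
        message for message in messages
        if not any(marker in message['content'] for marker in RESERVED_MARKERS)
    ]

# Mirrors the 'legacy-judge' variation in the first test: only the plain
# system message survives; both marker-bearing messages are stripped.
messages = [
    {'role': 'system', 'content': 'You are a judge.'},
    {'role': 'assistant', 'content': '{{message_history}}'},
    {'role': 'user', 'content': 'Evaluate: {{response_to_evaluate}}'},
]
assert strip_legacy_sketch(messages) == [
    {'role': 'system', 'content': 'You are a judge.'}
]
```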