diff --git a/murphy/core/judge.py b/murphy/core/judge.py index e559504d..fef8e41e 100644 --- a/murphy/core/judge.py +++ b/murphy/core/judge.py @@ -12,6 +12,7 @@ from browser_use.agent.views import AgentHistoryList from browser_use.llm import ChatOpenAI, SystemMessage, UserMessage from browser_use.llm.messages import ContentPartImageParam, ContentPartTextParam, ImageURL +from browser_use.utils import sanitize_surrogates from murphy.models import ( PERSONA_REGISTRY, JudgeVerdict, @@ -133,9 +134,8 @@ def build_judge_trait_context(persona: str, traits: TraitVector, test_type: Test - **Flexible mechanism matching**: If the criteria say "the site prevents empty form submission" and the site uses a disabled submit button instead of an error toast, that IS a pass — the behavior (prevention) was achieved through a different mechanism. - **Quoted text is illustrative, not literal**: Any quoted UI text in criteria (e.g., "'Please fill out this field'") is ONE example of acceptable behavior, not the only acceptable response. A site showing "Required" instead of "Please fill out this field" achieves the same outcome. -- **Alternative outcomes**: When criteria list alternatives separated by OR, ANY one of them is sufficient for a pass. If ANY single OR condition is satisfied, set verdict=true immediately — do NOT require all OR branches to be satisfied. -- **Do NOT fail for missing ephemeral signals**: If a persistent signal already confirms the outcome (entity visible in list/detail with its name, URL changed to the expected destination, persistent banner present), do NOT set verdict=false just because an ephemeral toast or a specific status badge was not captured. Record those as missing_signals instead. -- **Non-happy-path default**: For security-oriented personas (adversarial, edge_case, angry_user), any mechanism that prevents the bad outcome (crash, data leak, unhandled exception, corrupted state) is a PASS. Only FAIL on demonstrable mishandling. 
+- **Alternative outcomes**: When criteria list alternatives separated by OR, ANY one of them is sufficient for a pass. +- **Non-happy-path default**: For security-oriented personas (adversarial, edge_case, angry_user), any mechanism that prevents the bad outcome (crash, data leak, unhandled exception, corrupted state) is a PASS. Only FAIL on demonstrable mishandling. - **Silent handling is valid for security-oriented personas (adversarial, edge_case, angry_user)**: If the site silently sanitizes input, ignores invalid data, or gracefully degrades without any visible feedback, that IS correct behavior for security personas — not a failure. For UX-oriented personas (happy_path, confused_novice, impatient_user, explorer), the site MUST provide visible feedback — a disabled button with no explanation, a silently ignored input, or a form that does nothing on submit is a FAIL. - **Disabled controls ARE prevention for security-oriented personas**: If a submit/publish/next button is disabled when fields are empty or invalid, that IS the site preventing submission for security personas. For UX-oriented personas, a disabled control MUST be accompanied by visible explanation (tooltip, inline text, grayed-out label explaining why) to count as a PASS. - **Focus on harm, not form** (security personas): Ask "did the site handle this situation without harm?" not "did the site handle it exactly as described?" @@ -150,6 +149,7 @@ def build_judge_trait_context(persona: str, traits: TraitVector, test_type: Test If verdict=true and all expected signals were observed, leave `missing_signals` as an empty list. 
+ ## Failure classification If verdict is FALSE, you MUST also classify the failure: @@ -200,14 +200,13 @@ def build_judge_trait_context(persona: str, traits: TraitVector, test_type: Test --- -## Outcome check (for verdict) - -Base verdict on whether the OUTCOME happened — not on which specific signals confirmed it: -- **Create flows**: if the new entity appears anywhere in the app with a recognizable identifier (name, ID, or other label visible in a list row, detail page, or URL), verdict=true. -- **Delete flows**: if the entity is absent from list/search results, verdict=true. -- **Edit flows**: if updated values are visible in any list, detail, or status view, verdict=true. -- **General**: if ANY single OR condition from the success criteria is satisfied by any observable evidence (persistent banner, URL change, entity in list, redirect to detail page), verdict=true. -- Only set verdict=false if there is ZERO evidence of any kind that the outcome occurred. +## Validation rules +- Validate outcome state before returning a verdict (no inference from partial signals). +- Use visible UI signals only: toasts, badges, list rows, detail cards, confirmation messages. +- For create flows: confirm new entity appears with a recognizable identifier. +- For delete flows: confirm entity is absent from list/search. +- For edit flows: reopen and confirm updates persist. +- If evidence is ambiguous, return verdict=false. 
## Signal gaps (for `missing_signals` — never affects verdict) @@ -373,6 +372,7 @@ async def murphy_judge( errors=errors_text, final_result=final_result, ) + user_prompt = sanitize_surrogates(user_prompt) # Build multimodal user message with screenshots for visual verification user_content: list[ContentPartTextParam | ContentPartImageParam] = [ diff --git a/murphy/core/summary.py b/murphy/core/summary.py index 2f60d1c2..1177a2ce 100644 --- a/murphy/core/summary.py +++ b/murphy/core/summary.py @@ -5,6 +5,7 @@ from typing import Literal from browser_use.llm import ChatOpenAI, SystemMessage, UserMessage +from browser_use.utils import sanitize_surrogates from murphy.io.report import write_full_report from murphy.models import ( ExecutiveSummary, @@ -116,6 +117,7 @@ async def generate_executive_summary( 3. recommended_actions: Top 3 concrete actions the site team should take to improve UX Be specific and actionable. Reference actual test names and outcomes. Do NOT use generic statements.""" + prompt = sanitize_surrogates(prompt) response = await llm.ainvoke( messages=[ diff --git a/murphy/prompts.py b/murphy/prompts.py index 82882240..894e7d4c 100644 --- a/murphy/prompts.py +++ b/murphy/prompts.py @@ -380,22 +380,18 @@ def build_execution_prompt( f'- For edge_case or adversarial tests: ATTEMPT the action even if controls appear disabled. Click the submit/publish button, try form submission — observe what happens.\n' f'- Do NOT just search for error messages or describe what you see. Actually interact with the form: leave fields empty, then click submit. 
Report the observed behavior (disabled button, inline validation, error toast, silent rejection, etc.).\n' f"- The goal is to test the site's handling mechanism, not to find error messages.\n\n" - f'OBSERVATION PASS:\n' - f'- Immediately after every key action (form submit, confirm dialog, delete button, save button), PAUSE on the current page before any navigation.\n' - f'- On that same page, note what is visible: toast messages, banners, inline confirmations, loading indicators, error messages, or any status change.\n' - f'- These transient signals disappear once the page navigates — capture them now, in this step, before moving on.\n' - f'- Do NOT navigate elsewhere just to find these signals. Observe what is present on the current page and record it mentally for your done() call.\n\n' f'VALIDATION RULES:\n' - f'- The success_criteria above is your checklist. Evaluate it against what is already visible — do NOT navigate away to search for more evidence.\n' - f'- EARLY EXIT: As soon as ANY single OR condition from the success_criteria is satisfied by evidence already visible (current page, URL, DOM), call done(success=true) IMMEDIATELY. Do not take any further actions.\n' - f'- For create flows: a detail page URL containing an entity ID, the entity name visible on the current page, or the entity appearing in a list IS sufficient. 
Call done immediately.\n' - f'- For delete flows: if the entity is absent from the current list/search view, call done immediately.\n' - f'- For edit flows: if updated values are visible in the current view, call done immediately.\n' - f'- Do NOT perform additional navigation or searches to gather more evidence after a success condition is already met on the current page.\n' - f'- Do NOT return success=false just because an ephemeral toast or a specific status badge was not seen, if a persistent signal (entity in list, detail page, URL) already satisfies a success_criteria condition.\n' - f'- Only return success=false if there is NO evidence of any kind that the outcome occurred after checking what is currently visible.\n' - f'- If the primary completion signal is blocked or inconclusive, perform AT MOST ONE alternate in-app check (navigate to list or detail view), then call done regardless of what you find.\n' - f'- In validation_evidence inside done(), report: which success_criteria conditions were met, which transient signals were observed during the OBSERVATION PASS, and which were absent — all from memory, without taking more browser actions.\n\n' + f'- Validate outcome state before returning success (no inference from partial signals).\n' + f'- Use visible UI signals only: toasts, badges, list rows, detail cards, confirmation messages.\n' + f'- For create flows: confirm new entity appears with a recognizable identifier.\n' + f'- For delete flows: confirm entity is absent from list/search.\n' + f'- For edit flows: reopen and confirm updates persist.\n' + f'- If evidence is ambiguous, return success=false.\n' + f'- If the primary completion signal/action is blocked, disabled, or inconclusive, perform one alternate in-app verification route before deciding verdict.\n' + f'- Alternate verification must be within the app (e.g., list/detail/search/status views) and should check for objective outcome evidence.\n' + f'- During alternate verification, do not re-run 
the full primary workflow; verify existing outcome state only.\n' + f'- If evidence is ambiguous, contradictory, or missing, return success=false and explain what could not be verified.\n' + f'- Verify scenario success_criteria explicitly and cite which UI signal satisfied each required condition.\n\n' f'DOM STATE RULES:\n' f'- If UI appears empty, call refresh_dom_state before any reload. Do not repeatedly reload the same URL.\n' f'- If navigation to a destination fails or page state is non-interactive/ambiguous afterward, call refresh_dom_state before any second navigation attempt.\n'