From 11ed087967fe1febeb8e901c14923456c5e0a442 Mon Sep 17 00:00:00 2001 From: kevinngo1304 Date: Fri, 6 Mar 2026 19:33:12 +0100 Subject: [PATCH 1/5] Revert fix: occasional infinite verify loop --- murphy/prompts.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/murphy/prompts.py b/murphy/prompts.py index 82882240..9f31f939 100644 --- a/murphy/prompts.py +++ b/murphy/prompts.py @@ -380,22 +380,18 @@ def build_execution_prompt( f'- For edge_case or adversarial tests: ATTEMPT the action even if controls appear disabled. Click the submit/publish button, try form submission — observe what happens.\n' f'- Do NOT just search for error messages or describe what you see. Actually interact with the form: leave fields empty, then click submit. Report the observed behavior (disabled button, inline validation, error toast, silent rejection, etc.).\n' f"- The goal is to test the site's handling mechanism, not to find error messages.\n\n" - f'OBSERVATION PASS:\n' - f'- Immediately after every key action (form submit, confirm dialog, delete button, save button), PAUSE on the current page before any navigation.\n' - f'- On that same page, note what is visible: toast messages, banners, inline confirmations, loading indicators, error messages, or any status change.\n' - f'- These transient signals disappear once the page navigates — capture them now, in this step, before moving on.\n' - f'- Do NOT navigate elsewhere just to find these signals. Observe what is present on the current page and record it mentally for your done() call.\n\n' f'VALIDATION RULES:\n' - f'- The success_criteria above is your checklist. Evaluate it against what is already visible — do NOT navigate away to search for more evidence.\n' - f'- EARLY EXIT: As soon as ANY single OR condition from the success_criteria is satisfied by evidence already visible (current page, URL, DOM), call done(success=true) IMMEDIATELY. 
Do not take any further actions.\n' - f'- For create flows: a detail page URL containing an entity ID, the entity name visible on the current page, or the entity appearing in a list IS sufficient. Call done immediately.\n' - f'- For delete flows: if the entity is absent from the current list/search view, call done immediately.\n' - f'- For edit flows: if updated values are visible in the current view, call done immediately.\n' - f'- Do NOT perform additional navigation or searches to gather more evidence after a success condition is already met on the current page.\n' - f'- Do NOT return success=false just because an ephemeral toast or a specific status badge was not seen, if a persistent signal (entity in list, detail page, URL) already satisfies a success_criteria condition.\n' - f'- Only return success=false if there is NO evidence of any kind that the outcome occurred after checking what is currently visible.\n' - f'- If the primary completion signal is blocked or inconclusive, perform AT MOST ONE alternate in-app check (navigate to list or detail view), then call done regardless of what you find.\n' - f'- In validation_evidence inside done(), report: which success_criteria conditions were met, which transient signals were observed during the OBSERVATION PASS, and which were absent — all from memory, without taking more browser actions.\n\n' + f'- Base success on whether the OUTCOME happened, not on which specific signal confirmed it.\n' + f'- If ANY single OR condition from the success_criteria is satisfied by visible evidence, return success=true.\n' + f'- For create flows: if the new entity appears anywhere in the app with a recognizable identifier (name or ID visible in a list row, detail page, or URL), that IS confirmation — return success=true.\n' + f'- For delete flows: if the entity is absent from list/search, return success=true.\n' + f'- For edit flows: if updated values are visible in any list, detail, or status view, return success=true.\n' + f'- Do 
NOT return success=false just because an ephemeral toast or a specific status badge was not seen, if a persistent signal (entity in list, banner, URL change) already confirms the outcome.\n' + f'- Only return success=false if there is NO evidence of any kind that the outcome occurred.\n' + f'- In validation_evidence, note every signal checked: which were present and which were absent (e.g. "Entity visible in Agents list — success. Ephemeral toast not observed. Active badge not seen.").\n' + f'- If the primary completion signal/action is blocked, disabled, or inconclusive, perform one alternate in-app verification route before deciding verdict.\n' + f'- Alternate verification must be within the app (e.g., list/detail/search/status views) and should check for objective outcome evidence.\n' + f'- During alternate verification, do not re-run the full primary workflow; verify existing outcome state only.\n\n' f'DOM STATE RULES:\n' f'- If UI appears empty, call refresh_dom_state before any reload. 
Do not repeatedly reload the same URL.\n' f'- If navigation to a destination fails or page state is non-interactive/ambiguous afterward, call refresh_dom_state before any second navigation attempt.\n' From 50391b3150549e3b08d7b71f6fdfa075ba9d3bdd Mon Sep 17 00:00:00 2001 From: kevinngo1304 Date: Mon, 9 Mar 2026 11:25:24 +0100 Subject: [PATCH 2/5] Revert manually: fix agent does not report missing validation indicators --- murphy/core/execution.py | 3 +-- murphy/core/judge.py | 36 ++++++++---------------------------- murphy/io/report_markdown.py | 7 ------- murphy/models.py | 10 ---------- murphy/prompts.py | 18 +++++++++--------- 5 files changed, 18 insertions(+), 56 deletions(-) diff --git a/murphy/core/execution.py b/murphy/core/execution.py index 12f2cb48..a9cdedc0 100644 --- a/murphy/core/execution.py +++ b/murphy/core/execution.py @@ -233,8 +233,7 @@ async def _execute_single_test( reason=reason, validation_evidence=validation_evidence, feedback_quality=judgement.feedback_quality, - trait_evaluations=judgement.trait_evaluations, - missing_signals=judgement.missing_signals, + trait_evaluations=judgement.trait_evaluations ) test_result.failure_category = classify_failure(test_result) except Exception as exc: diff --git a/murphy/core/judge.py b/murphy/core/judge.py index e559504d..fbc032af 100644 --- a/murphy/core/judge.py +++ b/murphy/core/judge.py @@ -133,22 +133,12 @@ def build_judge_trait_context(persona: str, traits: TraitVector, test_type: Test - **Flexible mechanism matching**: If the criteria say "the site prevents empty form submission" and the site uses a disabled submit button instead of an error toast, that IS a pass — the behavior (prevention) was achieved through a different mechanism. - **Quoted text is illustrative, not literal**: Any quoted UI text in criteria (e.g., "'Please fill out this field'") is ONE example of acceptable behavior, not the only acceptable response. 
A site showing "Required" instead of "Please fill out this field" achieves the same outcome. -- **Alternative outcomes**: When criteria list alternatives separated by OR, ANY one of them is sufficient for a pass. If ANY single OR condition is satisfied, set verdict=true immediately — do NOT require all OR branches to be satisfied. -- **Do NOT fail for missing ephemeral signals**: If a persistent signal already confirms the outcome (entity visible in list/detail with its name, URL changed to the expected destination, persistent banner present), do NOT set verdict=false just because an ephemeral toast or a specific status badge was not captured. Record those as missing_signals instead. -- **Non-happy-path default**: For security-oriented personas (adversarial, edge_case, angry_user), any mechanism that prevents the bad outcome (crash, data leak, unhandled exception, corrupted state) is a PASS. Only FAIL on demonstrable mishandling. +- **Alternative outcomes**: When criteria list alternatives separated by OR, ANY one of them is sufficient for a pass. +- **Non-happy-path default**: For security-oriented personas (adversarial, edge_case, angry_user), any mechanism that prevents the bad outcome (crash, data leak, unhandled exception, corrupted state) is a PASS. Only FAIL on demonstrable mishandling. - **Silent handling is valid for security-oriented personas (adversarial, edge_case, angry_user)**: If the site silently sanitizes input, ignores invalid data, or gracefully degrades without any visible feedback, that IS correct behavior for security personas — not a failure. For UX-oriented personas (happy_path, confused_novice, impatient_user, explorer), the site MUST provide visible feedback — a disabled button with no explanation, a silently ignored input, or a form that does nothing on submit is a FAIL. 
- **Disabled controls ARE prevention for security-oriented personas**: If a submit/publish/next button is disabled when fields are empty or invalid, that IS the site preventing submission for security personas. For UX-oriented personas, a disabled control MUST be accompanied by visible explanation (tooltip, inline text, grayed-out label explaining why) to count as a PASS. - **Focus on harm, not form** (security personas): Ask "did the site handle this situation without harm?" not "did the site handle it exactly as described?" - **Focus on clarity, not just harm** (UX personas): Ask "did the site help the user understand what happened?" not just "did it avoid crashing?" A site that silently swallows user input with no feedback is harmful to UX even if nothing technically broke. -## Missing signals (always report, never fail on) - -Even when verdict=true, populate `missing_signals` with any expected confirmation signals that were NOT observed. These are UX observations that do not affect the verdict: -- Ephemeral signals not captured: e.g. "success toast not observed" or "error flash message not seen" -- Status indicators absent: e.g. "'Active' badge not visible on the list entry" -- Secondary confirmations missing: e.g. "confirmation dialog not shown before delete" - -If verdict=true and all expected signals were observed, leave `missing_signals` as an empty list. ## Failure classification @@ -200,23 +190,13 @@ def build_judge_trait_context(persona: str, traits: TraitVector, test_type: Test --- -## Outcome check (for verdict) - -Base verdict on whether the OUTCOME happened — not on which specific signals confirmed it: -- **Create flows**: if the new entity appears anywhere in the app with a recognizable identifier (name, ID, or other label visible in a list row, detail page, or URL), verdict=true. -- **Delete flows**: if the entity is absent from list/search results, verdict=true. 
-- **Edit flows**: if updated values are visible in any list, detail, or status view, verdict=true. -- **General**: if ANY single OR condition from the success criteria is satisfied by any observable evidence (persistent banner, URL change, entity in list, redirect to detail page), verdict=true. -- Only set verdict=false if there is ZERO evidence of any kind that the outcome occurred. - -## Signal gaps (for `missing_signals` — never affects verdict) - -After determining the verdict, check which expected confirmation signals were NOT observed and list each one in `missing_signals`. Examples: -- "Ephemeral success toast not captured in screenshots or step trace" -- "'Active' status badge not visible on the agent list entry" -- "Confirmation dialog not shown before the destructive action" - -These are UX observations only. A non-empty `missing_signals` on a passing test means the site's feedback could be improved — it does NOT change the verdict. +## Validation rules +- Validate outcome state before returning a verdict (no inference from partial signals). +- Use visible UI signals only: toasts, badges, list rows, detail cards, confirmation messages. +- For create flows: confirm new entity appears with a recognizable identifier. +- For delete flows: confirm entity is absent from list/search. +- For edit flows: reopen and confirm updates persist. +- If evidence is ambiguous, return verdict=false. Based on the Navigation Evidence and Pages Reached, did the agent successfully complete this test? Evaluate each trait dimension independently and report per-trait assessments in trait_evaluations. 
diff --git a/murphy/io/report_markdown.py b/murphy/io/report_markdown.py index eb5e96d3..c55f91b3 100644 --- a/murphy/io/report_markdown.py +++ b/murphy/io/report_markdown.py @@ -48,13 +48,6 @@ def _render_test_detail(r: TestResult, index: int, lines: list[str]) -> None: else: lines += ['**Validation Performed:**', 'No explicit validation evidence recorded.', ''] - # ── Missing signals (shown on all tests — UX gaps even on passes) ── - missing_signals = getattr(r, 'missing_signals', []) or [] - if missing_signals: - lines += ['**Confirmation signals not observed (UX gaps):**'] - for s in missing_signals: - lines.append(f'- {s}') - lines.append('') # ── Evaluation dimensions ── if r.process_evaluation: diff --git a/murphy/models.py b/murphy/models.py index a27f0127..a3890972 100644 --- a/murphy/models.py +++ b/murphy/models.py @@ -294,15 +294,6 @@ class JudgeVerdict(BaseModel): usability_evaluation: str = '' feedback_quality: FeedbackQualityScore | None = None trait_evaluations: dict[str, str] | None = None - missing_signals: list[str] = Field( - default_factory=list, - description=( - 'Confirmation signals that were expected but not observed ' - '(e.g. "ephemeral toast not captured", "Active status badge not visible in list"). ' - 'The outcome still passed via another signal. ' - 'Report for UX improvement only — never used to fail the test.' - ), - ) # ─── Phase 3: Results ────────────────────────────────────────────────────────── @@ -326,7 +317,6 @@ class TestResult(BaseModel): validation_evidence: str = '' feedback_quality: FeedbackQualityScore | None = None trait_evaluations: dict[str, str] | None = None - missing_signals: list[str] = Field(default_factory=list) class ReportSummary(BaseModel): diff --git a/murphy/prompts.py b/murphy/prompts.py index 9f31f939..894e7d4c 100644 --- a/murphy/prompts.py +++ b/murphy/prompts.py @@ -381,17 +381,17 @@ def build_execution_prompt( f'- Do NOT just search for error messages or describe what you see. 
Actually interact with the form: leave fields empty, then click submit. Report the observed behavior (disabled button, inline validation, error toast, silent rejection, etc.).\n' f"- The goal is to test the site's handling mechanism, not to find error messages.\n\n" f'VALIDATION RULES:\n' - f'- Base success on whether the OUTCOME happened, not on which specific signal confirmed it.\n' - f'- If ANY single OR condition from the success_criteria is satisfied by visible evidence, return success=true.\n' - f'- For create flows: if the new entity appears anywhere in the app with a recognizable identifier (name or ID visible in a list row, detail page, or URL), that IS confirmation — return success=true.\n' - f'- For delete flows: if the entity is absent from list/search, return success=true.\n' - f'- For edit flows: if updated values are visible in any list, detail, or status view, return success=true.\n' - f'- Do NOT return success=false just because an ephemeral toast or a specific status badge was not seen, if a persistent signal (entity in list, banner, URL change) already confirms the outcome.\n' - f'- Only return success=false if there is NO evidence of any kind that the outcome occurred.\n' - f'- In validation_evidence, note every signal checked: which were present and which were absent (e.g. "Entity visible in Agents list — success. Ephemeral toast not observed. 
Active badge not seen.").\n' + f'- Validate outcome state before returning success (no inference from partial signals).\n' + f'- Use visible UI signals only: toasts, badges, list rows, detail cards, confirmation messages.\n' + f'- For create flows: confirm new entity appears with a recognizable identifier.\n' + f'- For delete flows: confirm entity is absent from list/search.\n' + f'- For edit flows: reopen and confirm updates persist.\n' + f'- If evidence is ambiguous, return success=false.\n' f'- If the primary completion signal/action is blocked, disabled, or inconclusive, perform one alternate in-app verification route before deciding verdict.\n' f'- Alternate verification must be within the app (e.g., list/detail/search/status views) and should check for objective outcome evidence.\n' - f'- During alternate verification, do not re-run the full primary workflow; verify existing outcome state only.\n\n' + f'- During alternate verification, do not re-run the full primary workflow; verify existing outcome state only.\n' + f'- If evidence is ambiguous, contradictory, or missing, return success=false and explain what could not be verified.\n' + f'- Verify scenario success_criteria explicitly and cite which UI signal satisfied each required condition.\n\n' f'DOM STATE RULES:\n' f'- If UI appears empty, call refresh_dom_state before any reload. 
Do not repeatedly reload the same URL.\n' f'- If navigation to a destination fails or page state is non-interactive/ambiguous afterward, call refresh_dom_state before any second navigation attempt.\n' From d44721f0fe455407837cd992409fe7f497fa6ab8 Mon Sep 17 00:00:00 2001 From: kevinngo1304 Date: Mon, 9 Mar 2026 11:33:45 +0100 Subject: [PATCH 3/5] add missing signals reporting --- murphy/core/execution.py | 3 ++- murphy/core/judge.py | 18 ++++++++++++++++++ murphy/io/report_markdown.py | 7 +++++++ murphy/models.py | 10 ++++++++++ 4 files changed, 37 insertions(+), 1 deletion(-) diff --git a/murphy/core/execution.py b/murphy/core/execution.py index a9cdedc0..12f2cb48 100644 --- a/murphy/core/execution.py +++ b/murphy/core/execution.py @@ -233,7 +233,8 @@ async def _execute_single_test( reason=reason, validation_evidence=validation_evidence, feedback_quality=judgement.feedback_quality, - trait_evaluations=judgement.trait_evaluations + trait_evaluations=judgement.trait_evaluations, + missing_signals=judgement.missing_signals, ) test_result.failure_category = classify_failure(test_result) except Exception as exc: diff --git a/murphy/core/judge.py b/murphy/core/judge.py index fbc032af..f0001d97 100644 --- a/murphy/core/judge.py +++ b/murphy/core/judge.py @@ -139,6 +139,15 @@ def build_judge_trait_context(persona: str, traits: TraitVector, test_type: Test - **Focus on harm, not form** (security personas): Ask "did the site handle this situation without harm?" not "did the site handle it exactly as described?" - **Focus on clarity, not just harm** (UX personas): Ask "did the site help the user understand what happened?" not just "did it avoid crashing?" A site that silently swallows user input with no feedback is harmful to UX even if nothing technically broke. +## Missing signals (always report, never fail on) + +Even when verdict=true, populate `missing_signals` with any expected confirmation signals that were NOT observed. 
These are UX observations that do not affect the verdict: +- Ephemeral signals not captured: e.g. "success toast not observed" or "error flash message not seen" +- Status indicators absent: e.g. "'Active' badge not visible on the list entry" +- Secondary confirmations missing: e.g. "confirmation dialog not shown before delete" + +If verdict=true and all expected signals were observed, leave `missing_signals` as an empty list. + ## Failure classification @@ -198,6 +207,15 @@ def build_judge_trait_context(persona: str, traits: TraitVector, test_type: Test - For edit flows: reopen and confirm updates persist. - If evidence is ambiguous, return verdict=false. +## Signal gaps (for `missing_signals` — never affects verdict) + +After determining the verdict, check which expected confirmation signals were NOT observed and list each one in `missing_signals`. Examples: +- "Ephemeral success toast not captured in screenshots or step trace" +- "'Active' status badge not visible on the agent list entry" +- "Confirmation dialog not shown before the destructive action" + +These are UX observations only. A non-empty `missing_signals` on a passing test means the site's feedback could be improved — it does NOT change the verdict. + Based on the Navigation Evidence and Pages Reached, did the agent successfully complete this test? Evaluate each trait dimension independently and report per-trait assessments in trait_evaluations. Also assess feedback quality (response_present, response_timely, response_clear, response_actionable, feedback_type). 
diff --git a/murphy/io/report_markdown.py b/murphy/io/report_markdown.py index c55f91b3..eb5e96d3 100644 --- a/murphy/io/report_markdown.py +++ b/murphy/io/report_markdown.py @@ -48,6 +48,13 @@ def _render_test_detail(r: TestResult, index: int, lines: list[str]) -> None: else: lines += ['**Validation Performed:**', 'No explicit validation evidence recorded.', ''] + # ── Missing signals (shown on all tests — UX gaps even on passes) ── + missing_signals = getattr(r, 'missing_signals', []) or [] + if missing_signals: + lines += ['**Confirmation signals not observed (UX gaps):**'] + for s in missing_signals: + lines.append(f'- {s}') + lines.append('') # ── Evaluation dimensions ── if r.process_evaluation: diff --git a/murphy/models.py b/murphy/models.py index a3890972..a27f0127 100644 --- a/murphy/models.py +++ b/murphy/models.py @@ -294,6 +294,15 @@ class JudgeVerdict(BaseModel): usability_evaluation: str = '' feedback_quality: FeedbackQualityScore | None = None trait_evaluations: dict[str, str] | None = None + missing_signals: list[str] = Field( + default_factory=list, + description=( + 'Confirmation signals that were expected but not observed ' + '(e.g. "ephemeral toast not captured", "Active status badge not visible in list"). ' + 'The outcome still passed via another signal. ' + 'Report for UX improvement only — never used to fail the test.' 
+ ), + ) # ─── Phase 3: Results ────────────────────────────────────────────────────────── @@ -317,6 +326,7 @@ class TestResult(BaseModel): validation_evidence: str = '' feedback_quality: FeedbackQualityScore | None = None trait_evaluations: dict[str, str] | None = None + missing_signals: list[str] = Field(default_factory=list) class ReportSummary(BaseModel): From a1d4572f5c52f51e022f5c0be848de9def96692d Mon Sep 17 00:00:00 2001 From: kevinngo1304 Date: Sat, 7 Mar 2026 13:57:07 +0100 Subject: [PATCH 4/5] fix: ModelProviderError: 'utf-8' codec can't encode character, surrogates not allowed --- murphy/core/judge.py | 2 ++ murphy/core/summary.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/murphy/core/judge.py b/murphy/core/judge.py index f0001d97..fef8e41e 100644 --- a/murphy/core/judge.py +++ b/murphy/core/judge.py @@ -12,6 +12,7 @@ from browser_use.agent.views import AgentHistoryList from browser_use.llm import ChatOpenAI, SystemMessage, UserMessage from browser_use.llm.messages import ContentPartImageParam, ContentPartTextParam, ImageURL +from browser_use.utils import sanitize_surrogates from murphy.models import ( PERSONA_REGISTRY, JudgeVerdict, @@ -371,6 +372,7 @@ async def murphy_judge( errors=errors_text, final_result=final_result, ) + user_prompt = sanitize_surrogates(user_prompt) # Build multimodal user message with screenshots for visual verification user_content: list[ContentPartTextParam | ContentPartImageParam] = [ diff --git a/murphy/core/summary.py b/murphy/core/summary.py index 2f60d1c2..93065f65 100644 --- a/murphy/core/summary.py +++ b/murphy/core/summary.py @@ -6,6 +6,7 @@ from browser_use.llm import ChatOpenAI, SystemMessage, UserMessage from murphy.io.report import write_full_report +from browser_use.utils import sanitize_surrogates from murphy.models import ( ExecutiveSummary, ReportSummary, @@ -116,6 +117,7 @@ async def generate_executive_summary( 3. 
recommended_actions: Top 3 concrete actions the site team should take to improve UX Be specific and actionable. Reference actual test names and outcomes. Do NOT use generic statements.""" + prompt = sanitize_surrogates(prompt) response = await llm.ainvoke( messages=[ From 07f12771e8af0729162018e090aa9c27edc3ba30 Mon Sep 17 00:00:00 2001 From: kevinngo1304 Date: Mon, 9 Mar 2026 11:58:41 +0100 Subject: [PATCH 5/5] code style fix --- murphy/core/summary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/murphy/core/summary.py b/murphy/core/summary.py index 93065f65..1177a2ce 100644 --- a/murphy/core/summary.py +++ b/murphy/core/summary.py @@ -5,8 +5,8 @@ from typing import Literal from browser_use.llm import ChatOpenAI, SystemMessage, UserMessage -from murphy.io.report import write_full_report from browser_use.utils import sanitize_surrogates +from murphy.io.report import write_full_report from murphy.models import ( ExecutiveSummary, ReportSummary,