From 11ed087967fe1febeb8e901c14923456c5e0a442 Mon Sep 17 00:00:00 2001
From: kevinngo1304 <kevin.ngo@prosus.com>
Date: Fri, 6 Mar 2026 19:33:12 +0100
Subject: [PATCH 1/5] Revert "fix: occasional infinite verify loop"

---
 murphy/prompts.py | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/murphy/prompts.py b/murphy/prompts.py
index 82882240..9f31f939 100644
--- a/murphy/prompts.py
+++ b/murphy/prompts.py
@@ -380,22 +380,18 @@ def build_execution_prompt(
 		f'- For edge_case or adversarial tests: ATTEMPT the action even if controls appear disabled. Click the submit/publish button, try form submission — observe what happens.\n'
 		f'- Do NOT just search for error messages or describe what you see. Actually interact with the form: leave fields empty, then click submit. Report the observed behavior (disabled button, inline validation, error toast, silent rejection, etc.).\n'
 		f"- The goal is to test the site's handling mechanism, not to find error messages.\n\n"
-		f'OBSERVATION PASS:\n'
-		f'- Immediately after every key action (form submit, confirm dialog, delete button, save button), PAUSE on the current page before any navigation.\n'
-		f'- On that same page, note what is visible: toast messages, banners, inline confirmations, loading indicators, error messages, or any status change.\n'
-		f'- These transient signals disappear once the page navigates — capture them now, in this step, before moving on.\n'
-		f'- Do NOT navigate elsewhere just to find these signals. Observe what is present on the current page and record it mentally for your done() call.\n\n'
 		f'VALIDATION RULES:\n'
-		f'- The success_criteria above is your checklist. Evaluate it against what is already visible — do NOT navigate away to search for more evidence.\n'
-		f'- EARLY EXIT: As soon as ANY single OR condition from the success_criteria is satisfied by evidence already visible (current page, URL, DOM), call done(success=true) IMMEDIATELY. Do not take any further actions.\n'
-		f'- For create flows: a detail page URL containing an entity ID, the entity name visible on the current page, or the entity appearing in a list IS sufficient. Call done immediately.\n'
-		f'- For delete flows: if the entity is absent from the current list/search view, call done immediately.\n'
-		f'- For edit flows: if updated values are visible in the current view, call done immediately.\n'
-		f'- Do NOT perform additional navigation or searches to gather more evidence after a success condition is already met on the current page.\n'
-		f'- Do NOT return success=false just because an ephemeral toast or a specific status badge was not seen, if a persistent signal (entity in list, detail page, URL) already satisfies a success_criteria condition.\n'
-		f'- Only return success=false if there is NO evidence of any kind that the outcome occurred after checking what is currently visible.\n'
-		f'- If the primary completion signal is blocked or inconclusive, perform AT MOST ONE alternate in-app check (navigate to list or detail view), then call done regardless of what you find.\n'
-		f'- In validation_evidence inside done(), report: which success_criteria conditions were met, which transient signals were observed during the OBSERVATION PASS, and which were absent — all from memory, without taking more browser actions.\n\n'
+		f'- Base success on whether the OUTCOME happened, not on which specific signal confirmed it.\n'
+		f'- If ANY single OR condition from the success_criteria is satisfied by visible evidence, return success=true.\n'
+		f'- For create flows: if the new entity appears anywhere in the app with a recognizable identifier (name or ID visible in a list row, detail page, or URL), that IS confirmation — return success=true.\n'
+		f'- For delete flows: if the entity is absent from list/search, return success=true.\n'
+		f'- For edit flows: if updated values are visible in any list, detail, or status view, return success=true.\n'
+		f'- Do NOT return success=false just because an ephemeral toast or a specific status badge was not seen, if a persistent signal (entity in list, banner, URL change) already confirms the outcome.\n'
+		f'- Only return success=false if there is NO evidence of any kind that the outcome occurred.\n'
+		f'- In validation_evidence, note every signal checked: which were present and which were absent (e.g. "Entity visible in Agents list — success. Ephemeral toast not observed. Active badge not seen.").\n'
+		f'- If the primary completion signal/action is blocked, disabled, or inconclusive, perform one alternate in-app verification route before deciding verdict.\n'
+		f'- Alternate verification must be within the app (e.g., list/detail/search/status views) and should check for objective outcome evidence.\n'
+		f'- During alternate verification, do not re-run the full primary workflow; verify existing outcome state only.\n\n'
 		f'DOM STATE RULES:\n'
 		f'- If UI appears empty, call refresh_dom_state before any reload. Do not repeatedly reload the same URL.\n'
 		f'- If navigation to a destination fails or page state is non-interactive/ambiguous afterward, call refresh_dom_state before any second navigation attempt.\n'

From 50391b3150549e3b08d7b71f6fdfa075ba9d3bdd Mon Sep 17 00:00:00 2001
From: kevinngo1304 <kevin.ngo@prosus.com>
Date: Mon, 9 Mar 2026 11:25:24 +0100
Subject: [PATCH 2/5] Revert "fix: agent does not report missing validation
 indicators" (manual revert)

---
 murphy/core/execution.py     |  3 +--
 murphy/core/judge.py         | 36 ++++++++----------------------------
 murphy/io/report_markdown.py |  7 -------
 murphy/models.py             | 10 ----------
 murphy/prompts.py            | 18 +++++++++---------
 5 files changed, 18 insertions(+), 56 deletions(-)

diff --git a/murphy/core/execution.py b/murphy/core/execution.py
index 12f2cb48..a9cdedc0 100644
--- a/murphy/core/execution.py
+++ b/murphy/core/execution.py
@@ -233,8 +233,7 @@ async def _execute_single_test(
 			reason=reason,
 			validation_evidence=validation_evidence,
 			feedback_quality=judgement.feedback_quality,
-			trait_evaluations=judgement.trait_evaluations,
-			missing_signals=judgement.missing_signals,
+			trait_evaluations=judgement.trait_evaluations
 		)
 		test_result.failure_category = classify_failure(test_result)
 	except Exception as exc:
diff --git a/murphy/core/judge.py b/murphy/core/judge.py
index e559504d..fbc032af 100644
--- a/murphy/core/judge.py
+++ b/murphy/core/judge.py
@@ -133,22 +133,13 @@ def build_judge_trait_context(persona: str, traits: TraitVector, test_type: Test
 
 - **Flexible mechanism matching**: If the criteria say "the site prevents empty form submission" and the site uses a disabled submit button instead of an error toast, that IS a pass — the behavior (prevention) was achieved through a different mechanism.
 - **Quoted text is illustrative, not literal**: Any quoted UI text in criteria (e.g., "'Please fill out this field'") is ONE example of acceptable behavior, not the only acceptable response. A site showing "Required" instead of "Please fill out this field" achieves the same outcome.
-- **Alternative outcomes**: When criteria list alternatives separated by OR, ANY one of them is sufficient for a pass. If ANY single OR condition is satisfied, set verdict=true immediately — do NOT require all OR branches to be satisfied.
-- **Do NOT fail for missing ephemeral signals**: If a persistent signal already confirms the outcome (entity visible in list/detail with its name, URL changed to the expected destination, persistent banner present), do NOT set verdict=false just because an ephemeral toast or a specific status badge was not captured. Record those as missing_signals instead.
+- **Alternative outcomes**: When criteria list alternatives separated by OR, ANY one of them is sufficient for a pass.
 - **Non-happy-path default**: For security-oriented personas (adversarial, edge_case, angry_user), any mechanism that prevents the bad outcome (crash, data leak, unhandled exception, corrupted state) is a PASS. Only FAIL on demonstrable mishandling.
 - **Silent handling is valid for security-oriented personas (adversarial, edge_case, angry_user)**: If the site silently sanitizes input, ignores invalid data, or gracefully degrades without any visible feedback, that IS correct behavior for security personas — not a failure. For UX-oriented personas (happy_path, confused_novice, impatient_user, explorer), the site MUST provide visible feedback — a disabled button with no explanation, a silently ignored input, or a form that does nothing on submit is a FAIL.
 - **Disabled controls ARE prevention for security-oriented personas**: If a submit/publish/next button is disabled when fields are empty or invalid, that IS the site preventing submission for security personas. For UX-oriented personas, a disabled control MUST be accompanied by visible explanation (tooltip, inline text, grayed-out label explaining why) to count as a PASS.
 - **Focus on harm, not form** (security personas): Ask "did the site handle this situation without harm?" not "did the site handle it exactly as described?"
 - **Focus on clarity, not just harm** (UX personas): Ask "did the site help the user understand what happened?" not just "did it avoid crashing?" A site that silently swallows user input with no feedback is harmful to UX even if nothing technically broke.
 
-## Missing signals (always report, never fail on)
-
-Even when verdict=true, populate `missing_signals` with any expected confirmation signals that were NOT observed. These are UX observations that do not affect the verdict:
-- Ephemeral signals not captured: e.g. "success toast not observed" or "error flash message not seen"
-- Status indicators absent: e.g. "'Active' badge not visible on the list entry"
-- Secondary confirmations missing: e.g. "confirmation dialog not shown before delete"
-
-If verdict=true and all expected signals were observed, leave `missing_signals` as an empty list.
 
 ## Failure classification
 
@@ -200,23 +190,13 @@ def build_judge_trait_context(persona: str, traits: TraitVector, test_type: Test
 
 ---
 
-## Outcome check (for verdict)
-
-Base verdict on whether the OUTCOME happened — not on which specific signals confirmed it:
-- **Create flows**: if the new entity appears anywhere in the app with a recognizable identifier (name, ID, or other label visible in a list row, detail page, or URL), verdict=true.
-- **Delete flows**: if the entity is absent from list/search results, verdict=true.
-- **Edit flows**: if updated values are visible in any list, detail, or status view, verdict=true.
-- **General**: if ANY single OR condition from the success criteria is satisfied by any observable evidence (persistent banner, URL change, entity in list, redirect to detail page), verdict=true.
-- Only set verdict=false if there is ZERO evidence of any kind that the outcome occurred.
-
-## Signal gaps (for `missing_signals` — never affects verdict)
-
-After determining the verdict, check which expected confirmation signals were NOT observed and list each one in `missing_signals`. Examples:
-- "Ephemeral success toast not captured in screenshots or step trace"
-- "'Active' status badge not visible on the agent list entry"
-- "Confirmation dialog not shown before the destructive action"
-
-These are UX observations only. A non-empty `missing_signals` on a passing test means the site's feedback could be improved — it does NOT change the verdict.
+## Validation rules
+- Validate outcome state before returning a verdict (no inference from partial signals).
+- Use visible UI signals only: toasts, badges, list rows, detail cards, confirmation messages.
+- For create flows: confirm new entity appears with a recognizable identifier.
+- For delete flows: confirm entity is absent from list/search.
+- For edit flows: reopen and confirm updates persist.
+- If evidence is ambiguous, return verdict=false.
 
 Based on the Navigation Evidence and Pages Reached, did the agent successfully complete this test?
 Evaluate each trait dimension independently and report per-trait assessments in trait_evaluations.
diff --git a/murphy/io/report_markdown.py b/murphy/io/report_markdown.py
index eb5e96d3..c55f91b3 100644
--- a/murphy/io/report_markdown.py
+++ b/murphy/io/report_markdown.py
@@ -48,13 +48,6 @@ def _render_test_detail(r: TestResult, index: int, lines: list[str]) -> None:
 	else:
 		lines += ['**Validation Performed:**', 'No explicit validation evidence recorded.', '']
 
-	# ── Missing signals (shown on all tests — UX gaps even on passes) ──
-	missing_signals = getattr(r, 'missing_signals', []) or []
-	if missing_signals:
-		lines += ['**Confirmation signals not observed (UX gaps):**']
-		for s in missing_signals:
-			lines.append(f'- {s}')
-		lines.append('')
 
 	# ── Evaluation dimensions ──
 	if r.process_evaluation:
diff --git a/murphy/models.py b/murphy/models.py
index a27f0127..a3890972 100644
--- a/murphy/models.py
+++ b/murphy/models.py
@@ -294,15 +294,6 @@ class JudgeVerdict(BaseModel):
 	usability_evaluation: str = ''
 	feedback_quality: FeedbackQualityScore | None = None
 	trait_evaluations: dict[str, str] | None = None
-	missing_signals: list[str] = Field(
-		default_factory=list,
-		description=(
-			'Confirmation signals that were expected but not observed '
-			'(e.g. "ephemeral toast not captured", "Active status badge not visible in list"). '
-			'The outcome still passed via another signal. '
-			'Report for UX improvement only — never used to fail the test.'
-		),
-	)
 
 
 # ─── Phase 3: Results ──────────────────────────────────────────────────────────
@@ -326,7 +317,6 @@ class TestResult(BaseModel):
 	validation_evidence: str = ''
 	feedback_quality: FeedbackQualityScore | None = None
 	trait_evaluations: dict[str, str] | None = None
-	missing_signals: list[str] = Field(default_factory=list)
 
 
 class ReportSummary(BaseModel):
diff --git a/murphy/prompts.py b/murphy/prompts.py
index 9f31f939..894e7d4c 100644
--- a/murphy/prompts.py
+++ b/murphy/prompts.py
@@ -381,17 +381,17 @@ def build_execution_prompt(
 		f'- Do NOT just search for error messages or describe what you see. Actually interact with the form: leave fields empty, then click submit. Report the observed behavior (disabled button, inline validation, error toast, silent rejection, etc.).\n'
 		f"- The goal is to test the site's handling mechanism, not to find error messages.\n\n"
 		f'VALIDATION RULES:\n'
-		f'- Base success on whether the OUTCOME happened, not on which specific signal confirmed it.\n'
-		f'- If ANY single OR condition from the success_criteria is satisfied by visible evidence, return success=true.\n'
-		f'- For create flows: if the new entity appears anywhere in the app with a recognizable identifier (name or ID visible in a list row, detail page, or URL), that IS confirmation — return success=true.\n'
-		f'- For delete flows: if the entity is absent from list/search, return success=true.\n'
-		f'- For edit flows: if updated values are visible in any list, detail, or status view, return success=true.\n'
-		f'- Do NOT return success=false just because an ephemeral toast or a specific status badge was not seen, if a persistent signal (entity in list, banner, URL change) already confirms the outcome.\n'
-		f'- Only return success=false if there is NO evidence of any kind that the outcome occurred.\n'
-		f'- In validation_evidence, note every signal checked: which were present and which were absent (e.g. "Entity visible in Agents list — success. Ephemeral toast not observed. Active badge not seen.").\n'
+		f'- Validate outcome state before returning success (no inference from partial signals).\n'
+		f'- Use visible UI signals only: toasts, badges, list rows, detail cards, confirmation messages.\n'
+		f'- For create flows: confirm new entity appears with a recognizable identifier.\n'
+		f'- For delete flows: confirm entity is absent from list/search.\n'
+		f'- For edit flows: reopen and confirm updates persist.\n'
+		f'- If evidence is ambiguous, return success=false.\n'
 		f'- If the primary completion signal/action is blocked, disabled, or inconclusive, perform one alternate in-app verification route before deciding verdict.\n'
 		f'- Alternate verification must be within the app (e.g., list/detail/search/status views) and should check for objective outcome evidence.\n'
-		f'- During alternate verification, do not re-run the full primary workflow; verify existing outcome state only.\n\n'
+		f'- During alternate verification, do not re-run the full primary workflow; verify existing outcome state only.\n'
+		f'- If evidence is ambiguous, contradictory, or missing, return success=false and explain what could not be verified.\n'
+		f'- Verify scenario success_criteria explicitly and cite which UI signal satisfied each required condition.\n\n'
 		f'DOM STATE RULES:\n'
 		f'- If UI appears empty, call refresh_dom_state before any reload. Do not repeatedly reload the same URL.\n'
 		f'- If navigation to a destination fails or page state is non-interactive/ambiguous afterward, call refresh_dom_state before any second navigation attempt.\n'

From d44721f0fe455407837cd992409fe7f497fa6ab8 Mon Sep 17 00:00:00 2001
From: kevinngo1304 <kevin.ngo@prosus.com>
Date: Mon, 9 Mar 2026 11:33:45 +0100
Subject: [PATCH 3/5] Re-add missing_signals reporting (judge, models, report)

---
 murphy/core/execution.py     |  3 ++-
 murphy/core/judge.py         | 18 ++++++++++++++++++
 murphy/io/report_markdown.py |  7 +++++++
 murphy/models.py             | 10 ++++++++++
 4 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/murphy/core/execution.py b/murphy/core/execution.py
index a9cdedc0..12f2cb48 100644
--- a/murphy/core/execution.py
+++ b/murphy/core/execution.py
@@ -233,7 +233,8 @@ async def _execute_single_test(
 			reason=reason,
 			validation_evidence=validation_evidence,
 			feedback_quality=judgement.feedback_quality,
-			trait_evaluations=judgement.trait_evaluations
+			trait_evaluations=judgement.trait_evaluations,
+			missing_signals=judgement.missing_signals,
 		)
 		test_result.failure_category = classify_failure(test_result)
 	except Exception as exc:
diff --git a/murphy/core/judge.py b/murphy/core/judge.py
index fbc032af..f0001d97 100644
--- a/murphy/core/judge.py
+++ b/murphy/core/judge.py
@@ -139,6 +139,15 @@ def build_judge_trait_context(persona: str, traits: TraitVector, test_type: Test
 - **Focus on harm, not form** (security personas): Ask "did the site handle this situation without harm?" not "did the site handle it exactly as described?"
 - **Focus on clarity, not just harm** (UX personas): Ask "did the site help the user understand what happened?" not just "did it avoid crashing?" A site that silently swallows user input with no feedback is harmful to UX even if nothing technically broke.
 
+## Missing signals (always report, never fail on)
+
+Even when verdict=true, populate `missing_signals` with any expected confirmation signals that were NOT observed. These are UX observations that do not affect the verdict:
+- Ephemeral signals not captured: e.g. "success toast not observed" or "error flash message not seen"
+- Status indicators absent: e.g. "'Active' badge not visible on the list entry"
+- Secondary confirmations missing: e.g. "confirmation dialog not shown before delete"
+
+If verdict=true and all expected signals were observed, leave `missing_signals` as an empty list.
+
 
 ## Failure classification
 
@@ -198,6 +207,15 @@ def build_judge_trait_context(persona: str, traits: TraitVector, test_type: Test
 - For edit flows: reopen and confirm updates persist.
 - If evidence is ambiguous, return verdict=false.
 
+## Signal gaps (for `missing_signals` — never affects verdict)
+
+After determining the verdict, check which expected confirmation signals were NOT observed and list each one in `missing_signals`. Examples:
+- "Ephemeral success toast not captured in screenshots or step trace"
+- "'Active' status badge not visible on the agent list entry"
+- "Confirmation dialog not shown before the destructive action"
+
+These are UX observations only. A non-empty `missing_signals` on a passing test means the site's feedback could be improved — it does NOT change the verdict.
+
 Based on the Navigation Evidence and Pages Reached, did the agent successfully complete this test?
 Evaluate each trait dimension independently and report per-trait assessments in trait_evaluations.
 Also assess feedback quality (response_present, response_timely, response_clear, response_actionable, feedback_type).
diff --git a/murphy/io/report_markdown.py b/murphy/io/report_markdown.py
index c55f91b3..eb5e96d3 100644
--- a/murphy/io/report_markdown.py
+++ b/murphy/io/report_markdown.py
@@ -48,6 +48,13 @@ def _render_test_detail(r: TestResult, index: int, lines: list[str]) -> None:
 	else:
 		lines += ['**Validation Performed:**', 'No explicit validation evidence recorded.', '']
 
+	# ── Missing signals (shown on all tests — UX gaps even on passes) ──
+	missing_signals = getattr(r, 'missing_signals', []) or []
+	if missing_signals:
+		lines += ['**Confirmation signals not observed (UX gaps):**']
+		for s in missing_signals:
+			lines.append(f'- {s}')
+		lines.append('')
 
 	# ── Evaluation dimensions ──
 	if r.process_evaluation:
diff --git a/murphy/models.py b/murphy/models.py
index a3890972..a27f0127 100644
--- a/murphy/models.py
+++ b/murphy/models.py
@@ -294,6 +294,15 @@ class JudgeVerdict(BaseModel):
 	usability_evaluation: str = ''
 	feedback_quality: FeedbackQualityScore | None = None
 	trait_evaluations: dict[str, str] | None = None
+	missing_signals: list[str] = Field(
+		default_factory=list,
+		description=(
+			'Confirmation signals that were expected but not observed '
+			'(e.g. "ephemeral toast not captured", "Active status badge not visible in list"). '
+			'The outcome still passed via another signal. '
+			'Report for UX improvement only — never used to fail the test.'
+		),
+	)
 
 
 # ─── Phase 3: Results ──────────────────────────────────────────────────────────
@@ -317,6 +326,7 @@ class TestResult(BaseModel):
 	validation_evidence: str = ''
 	feedback_quality: FeedbackQualityScore | None = None
 	trait_evaluations: dict[str, str] | None = None
+	missing_signals: list[str] = Field(default_factory=list)
 
 
 class ReportSummary(BaseModel):

From a1d4572f5c52f51e022f5c0be848de9def96692d Mon Sep 17 00:00:00 2001
From: kevinngo1304 <kevin.ngo@prosus.com>
Date: Sat, 7 Mar 2026 13:57:07 +0100
Subject: [PATCH 4/5] fix: ModelProviderError: 'utf-8' codec can't encode
 character, surrogates not allowed

---
 murphy/core/judge.py   | 2 ++
 murphy/core/summary.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/murphy/core/judge.py b/murphy/core/judge.py
index f0001d97..fef8e41e 100644
--- a/murphy/core/judge.py
+++ b/murphy/core/judge.py
@@ -12,6 +12,7 @@
 from browser_use.agent.views import AgentHistoryList
 from browser_use.llm import ChatOpenAI, SystemMessage, UserMessage
 from browser_use.llm.messages import ContentPartImageParam, ContentPartTextParam, ImageURL
+from browser_use.utils import sanitize_surrogates
 from murphy.models import (
 	PERSONA_REGISTRY,
 	JudgeVerdict,
@@ -371,6 +372,7 @@ async def murphy_judge(
 		errors=errors_text,
 		final_result=final_result,
 	)
+	user_prompt = sanitize_surrogates(user_prompt)
 
 	# Build multimodal user message with screenshots for visual verification
 	user_content: list[ContentPartTextParam | ContentPartImageParam] = [
diff --git a/murphy/core/summary.py b/murphy/core/summary.py
index 2f60d1c2..93065f65 100644
--- a/murphy/core/summary.py
+++ b/murphy/core/summary.py
@@ -6,6 +6,7 @@
 
 from browser_use.llm import ChatOpenAI, SystemMessage, UserMessage
 from murphy.io.report import write_full_report
+from browser_use.utils import sanitize_surrogates
 from murphy.models import (
 	ExecutiveSummary,
 	ReportSummary,
@@ -116,6 +117,7 @@ async def generate_executive_summary(
 3. recommended_actions: Top 3 concrete actions the site team should take to improve UX
 
 Be specific and actionable. Reference actual test names and outcomes. Do NOT use generic statements."""
+	prompt = sanitize_surrogates(prompt)
 
 	response = await llm.ainvoke(
 		messages=[

From 07f12771e8af0729162018e090aa9c27edc3ba30 Mon Sep 17 00:00:00 2001
From: kevinngo1304 <kevin.ngo@prosus.com>
Date: Mon, 9 Mar 2026 11:58:41 +0100
Subject: [PATCH 5/5] style: sort imports in murphy/core/summary.py

---
 murphy/core/summary.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/murphy/core/summary.py b/murphy/core/summary.py
index 93065f65..1177a2ce 100644
--- a/murphy/core/summary.py
+++ b/murphy/core/summary.py
@@ -5,8 +5,8 @@
 from typing import Literal
 
 from browser_use.llm import ChatOpenAI, SystemMessage, UserMessage
-from murphy.io.report import write_full_report
 from browser_use.utils import sanitize_surrogates
+from murphy.io.report import write_full_report
 from murphy.models import (
 	ExecutiveSummary,
 	ReportSummary,