From e84a5214eace677edf3bc0547332b4432555411a Mon Sep 17 00:00:00 2001 From: markm39 Date: Mon, 30 Mar 2026 18:26:08 -0500 Subject: [PATCH] feat(cli): add refusal detection and anti-refusal prompt hardening When the LLM recognizes hard problems (FrontierMath, IMO, competitions), it refuses to attempt them. This adds two defenses: 1. System prompt hardening: explicit instructions to never refuse, never discuss problem difficulty/sources, always write Lean code. 2. Refusal detection with forced retry: when the model produces no tool calls and its text matches refusal patterns ("open problem", "cannot solve", "FrontierMath", etc.), inject a system message forcing it to attempt the proof. Max 2 retries before accepting the refusal. Also fixes additional Rust 1.94 clippy lints (map_or, field assignment). --- crates/openproof-cli/src/autonomous.rs | 14 +- crates/openproof-cli/src/decomposition.rs | 2 +- crates/openproof-cli/src/main.rs | 1 + crates/openproof-cli/src/refusal.rs | 165 ++++++++++++++++++++++ crates/openproof-cli/src/system_prompt.rs | 14 +- crates/openproof-cli/src/turn_handling.rs | 22 ++- crates/openproof-model/src/lib.rs | 2 +- 7 files changed, 209 insertions(+), 11 deletions(-) create mode 100644 crates/openproof-cli/src/refusal.rs diff --git a/crates/openproof-cli/src/autonomous.rs b/crates/openproof-cli/src/autonomous.rs index 7e8854f..4307c36 100644 --- a/crates/openproof-cli/src/autonomous.rs +++ b/crates/openproof-cli/src/autonomous.rs @@ -1315,11 +1315,13 @@ pub async fn run_tactic_search_file(file_path: String) -> Result<()> { // Extract the type expression for the fallback goal let type_expr = extract_type_expr_from_content(&content).unwrap_or_default(); - let mut node = openproof_protocol::ProofNode::default(); - node.id = "root".to_string(); - node.content = content.clone(); - node.statement = type_expr; - node.status = openproof_protocol::ProofNodeStatus::Pending; + let node = openproof_protocol::ProofNode { + id: "root".to_string(), + content: content.clone(), + statement: type_expr, + status: openproof_protocol::ProofNodeStatus::Pending, + ..Default::default() + }; let mut proof = openproof_protocol::ProofSessionState::default(); proof.nodes.push(node); proof.active_node_id = Some("root".to_string()); @@ -2123,7 +2125,7 @@ fn extract_type_expr_from_content(content: &str) -> Option { let mut kw_idx = None; for kw in ["theorem ", "lemma ", "def "] { if let Some(idx) = before.rfind(kw) { - if kw_idx.map_or(true, |prev| idx > prev) { + if kw_idx.is_none_or(|prev| idx > prev) { kw_idx = Some(idx); } } diff --git a/crates/openproof-cli/src/decomposition.rs b/crates/openproof-cli/src/decomposition.rs index 862b595..6f083c2 100644 --- a/crates/openproof-cli/src/decomposition.rs +++ b/crates/openproof-cli/src/decomposition.rs @@ -368,7 +368,7 @@ pub fn check_decomposition_consistency( #[cfg(test)] mod tests { use super::*; - use openproof_protocol::{ProofNodeKind, ProofNodeStatus}; + use openproof_protocol::{ProofNodeKind, ProofNodeStatus, SearchAttemptMetrics}; fn make_branch( id: &str, diff --git a/crates/openproof-cli/src/main.rs b/crates/openproof-cli/src/main.rs index 0248349..21b168a 100644 --- a/crates/openproof-cli/src/main.rs +++ b/crates/openproof-cli/src/main.rs @@ -6,6 +6,7 @@ mod event_loop; mod export; mod helpers; mod key_handling; +mod refusal; mod setup; mod shell; mod slash_autonomous; diff --git a/crates/openproof-cli/src/refusal.rs b/crates/openproof-cli/src/refusal.rs new file mode 100644 index 0000000..bafe3d2 --- /dev/null +++ b/crates/openproof-cli/src/refusal.rs @@ -0,0 +1,165 @@ +//! Refusal detection and forced retry for LLM proof attempts. +//! +//! When the model recognizes a hard problem from training data (FrontierMath, +//! IMO, competitions), it may refuse to attempt it. This module detects refusal +//! signals in the model's text and provides retry messages to force continuation. + +pub const MAX_REFUSAL_RETRIES: u8 = 2; + +/// Strong refusal signals -- any single match triggers refusal detection. +const STRONG_SIGNALS: &[&str] = &[ + "open problem", + "unsolved problem", + "open question", + "no known proof", + "no known solution", + "frontiermath", + "frontier math", + "competition problem", + "benchmark problem", + "beyond current capabilities", + "beyond my capabilities", + "beyond the capabilities", + "i cannot solve this", + "i'm unable to prove", + "i am unable to prove", + "cannot be proven", + "is considered unsolved", + "this is an open", + "this problem is from", + "known to be undecidable", + "no elementary proof", + "this is a well-known", + "remains an open", +]; + +/// Weak refusal signals -- only trigger when combined with no tool usage +/// and no Lean code in the response. +const WEAK_SIGNALS: &[&str] = &[ + "i apologize", + "i'm sorry", + "i am sorry", + "too difficult", + "unable to", + "cannot prove", + "cannot solve", + "i don't think", + "i do not think", + "not feasible", + "not possible to prove", +]; + +/// Detect whether the model's response is a refusal to attempt the proof. +/// +/// Returns `true` if the text contains refusal signals. If `used_tools` is true, +/// only strong signals trigger (the model at least tried before complaining). +pub fn detect_refusal(text: &str, used_tools: bool) -> bool { + if text.trim().is_empty() { + return false; + } + let lower = text.to_ascii_lowercase(); + + // Strong signals always trigger. + if STRONG_SIGNALS.iter().any(|s| lower.contains(s)) { + return true; + } + + // Weak signals only trigger if the model didn't use tools and wrote no Lean code. + if !used_tools { + let has_lean_code = text.contains("```lean"); + if !has_lean_code && WEAK_SIGNALS.iter().any(|s| lower.contains(s)) { + return true; + } + } + + false +} + +/// Build an escalating retry message to force the model to attempt the proof. +pub fn refusal_retry_message(retry_count: u8) -> String { + match retry_count { + 1 => concat!( + "You MUST attempt this proof. Do not assess difficulty or discuss problem sources. ", + "Start by writing a sorry-skeleton: create a .lean file with the theorem statement ", + "and `sorry` placeholders, then use lean_verify to check it compiles. ", + "Decompose the problem into sub-have steps and fill them one at a time.", + ) + .to_string(), + _ => concat!( + "FINAL ATTEMPT: Write Lean code NOW using file_write. Do not produce text without ", + "tool calls. Create a sorry-skeleton immediately with the theorem statement and sorry. ", + "Any response without a tool call wastes this proof attempt entirely.", + ) + .to_string(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn detects_strong_refusal() { + assert!(detect_refusal( + "This is an open problem in number theory.", + false + )); + assert!(detect_refusal( + "This problem is from FrontierMath and is unsolved.", + false + )); + assert!(detect_refusal("I cannot solve this theorem.", false)); + assert!(detect_refusal( + "The conjecture remains an open question.", + false + )); + } + + #[test] + fn detects_strong_even_with_tools() { + assert!(detect_refusal("This is an open problem.", true)); + } + + #[test] + fn detects_weak_refusal_without_tools() { + assert!(detect_refusal( + "I apologize, but this is too difficult for me.", + false + )); + assert!(detect_refusal("I'm sorry, I cannot prove this.", false)); + } + + #[test] + fn weak_signals_ignored_with_tools() { + assert!(!detect_refusal( + "I'm sorry, let me try a different approach.", + true + )); + } + + #[test] + fn weak_signals_ignored_with_lean_code() { + let text = + "I'm sorry, this is hard. Here's my attempt:\n```lean\ntheorem foo := sorry\n```"; + assert!(!detect_refusal(text, false)); + } + + #[test] + fn no_refusal_on_normal_text() { + assert!(!detect_refusal( + "Let me try using norm_num for this.", + false + )); + assert!(!detect_refusal("The proof follows from induction.", false)); + assert!(!detect_refusal("", false)); + } + + #[test] + fn retry_messages_are_different() { + let m1 = refusal_retry_message(1); + let m2 = refusal_retry_message(2); + assert_ne!(m1, m2); + assert!(m1.contains("sorry-skeleton")); + assert!(m2.contains("FINAL ATTEMPT")); + } +} diff --git a/crates/openproof-cli/src/system_prompt.rs b/crates/openproof-cli/src/system_prompt.rs index 9999d45..782ad2c 100644 --- a/crates/openproof-cli/src/system_prompt.rs +++ b/crates/openproof-cli/src/system_prompt.rs @@ -148,6 +148,14 @@ pub fn build_system_prompt(session: Option<&SessionSnapshot>) -> String { "The theorem statement is FIXED.", ).to_string(), "When stuck, try a different approach rather than explaining why you are stuck. Decompose hard sorrys into sub-have steps.".to_string(), + concat!( + "MANDATORY: You MUST attempt every proof. NEVER refuse.\n", + "- Do NOT discuss whether a problem is open, unsolved, from a benchmark, or too hard.\n", + "- Do NOT reference problem sources (FrontierMath, IMO, competitions, etc.).\n", + "- Do NOT assess problem difficulty. Write Lean code, not commentary.\n", + "- NEVER produce a response without tool calls. Always write code.\n", + "- Even if you believe a proof is impossible, write the best sorry-skeleton you can.", + ).to_string(), "Use structured markers: TITLE, PROBLEM, FORMAL_TARGET, ACCEPTED_TARGET, PHASE, STATUS, THEOREM, LEMMA, and fenced ```lean``` blocks.".to_string(), "Maintain Paper.tex alongside Lean code using file_write/file_patch. Do not include LaTeX in response text.".to_string(), ]; @@ -478,7 +486,8 @@ pub async fn build_branch_turn_messages( "then IMMEDIATELY create the sorry-skeleton in Lean and lean_verify it. ", "Do not spend more than 5 tool calls on research. The compilable skeleton matters ", "more than the research. Focus on decomposing the theorem into a chain of `have` ", - "steps that captures the KEY MATHEMATICAL INSIGHT. The Prover branch will fill the sorrys.", + "steps that captures the KEY MATHEMATICAL INSIGHT. The Prover branch will fill the sorrys. ", + "Never refuse to plan a proof. Even for hard problems, write a decomposition skeleton.", ).to_string() } AgentRole::Retriever => { @@ -493,7 +502,8 @@ pub async fn build_branch_turn_messages( "lean_screen_tactics (test simp, omega, ring, exact?, apply?, linarith, norm_num, aesop, decide, tauto) ", "-> file_patch to apply the working tactic -> lean_verify to confirm. ", "If stuck on a goal for more than 3 attempts, decompose it with sub-have steps. ", - "Do NOT change theorem statements. Spend ALL your tool calls on the lean_goals/screen_tactics/patch/verify loop.", + "Do NOT change theorem statements. Spend ALL your tool calls on the lean_goals/screen_tactics/patch/verify loop. ", + "Never refuse to attempt. Even partial progress (filling some sorrys) is valuable.", ).to_string() } AgentRole::Repairer => { diff --git a/crates/openproof-cli/src/turn_handling.rs b/crates/openproof-cli/src/turn_handling.rs index 265764c..e6f050a 100644 --- a/crates/openproof-cli/src/turn_handling.rs +++ b/crates/openproof-cli/src/turn_handling.rs @@ -98,6 +98,7 @@ pub async fn run_agentic_loop( } let mut turn_used_tools = false; let mut last_verify_ok = false; + let mut refusal_retries: u8 = 0; // Spawn lean-lsp-mcp for structured goal access (fallback when Pantograph unavailable). let lsp_mcp: Option>> = LeanLspMcp::spawn(&project_dir) @@ -138,7 +139,16 @@ pub async fn run_agentic_loop( Ok(result) => { // If there are no tool calls, we are done. if result.tool_calls.is_empty() { - // Text was streamed via StreamDelta events. + // Detect refusal and force retry before giving up. + if refusal_retries < crate::refusal::MAX_REFUSAL_RETRIES + && crate::refusal::detect_refusal(&result.text, turn_used_tools) + { + refusal_retries += 1; + let retry = crate::refusal::refusal_retry_message(refusal_retries); + messages.push(TurnMessage::chat("assistant", &result.text)); + messages.push(TurnMessage::chat("system", &retry)); + continue; + } break; } @@ -427,6 +437,7 @@ pub fn start_agent_branch_turn( let mut accumulated_text = String::new(); let mut turn_used_tools = false; let mut last_verify_ok = false; + let mut refusal_retries: u8 = 0; let branch_lsp_mcp: Option>> = LeanLspMcp::spawn(&project_dir) .map(|c| Arc::new(Mutex::new(c))) @@ -452,6 +463,15 @@ pub fn start_agent_branch_turn( Ok(turn) => { accumulated_text.push_str(&turn.text); if turn.tool_calls.is_empty() { + if refusal_retries < crate::refusal::MAX_REFUSAL_RETRIES + && crate::refusal::detect_refusal(&accumulated_text, turn_used_tools) + { + refusal_retries += 1; + let retry = crate::refusal::refusal_retry_message(refusal_retries); + all_messages.push(TurnMessage::chat("assistant", &turn.text)); + all_messages.push(TurnMessage::chat("system", &retry)); + continue; + } break; } turn_used_tools = true; diff --git a/crates/openproof-model/src/lib.rs b/crates/openproof-model/src/lib.rs index 3c050a0..dea7c7c 100644 --- a/crates/openproof-model/src/lib.rs +++ b/crates/openproof-model/src/lib.rs @@ -304,7 +304,7 @@ fn build_turn_payload(request: &CodexTurnRequest<'_>) -> Value { "model": request.model, "store": false, "stream": true, - "instructions": "You are openproof, a local-first formal math assistant. Be concise, direct, and helpful. If the user asks a general question, answer directly. If the user gives a theorem-like statement, help formalize or prove it in Lean.", + "instructions": "You are openproof, a local-first formal math assistant. Be concise, direct, and helpful. If the user asks a general question, answer directly. If the user gives a theorem-like statement, help formalize or prove it in Lean. NEVER refuse to attempt a proof. Do not discuss problem difficulty, sources, or feasibility. Always use tools to write and verify Lean code.", "input": request.messages.iter().map(serialize_turn_message).collect::>(), "include": ["reasoning.encrypted_content"], "tool_choice": if request.include_tools { "auto" } else { "none" },