Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions crates/openproof-cli/src/autonomous.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1315,11 +1315,13 @@ pub async fn run_tactic_search_file(file_path: String) -> Result<()> {
// Extract the type expression for the fallback goal
let type_expr = extract_type_expr_from_content(&content).unwrap_or_default();

let mut node = openproof_protocol::ProofNode::default();
node.id = "root".to_string();
node.content = content.clone();
node.statement = type_expr;
node.status = openproof_protocol::ProofNodeStatus::Pending;
let node = openproof_protocol::ProofNode {
id: "root".to_string(),
content: content.clone(),
statement: type_expr,
status: openproof_protocol::ProofNodeStatus::Pending,
..Default::default()
};
let mut proof = openproof_protocol::ProofSessionState::default();
proof.nodes.push(node);
proof.active_node_id = Some("root".to_string());
Expand Down Expand Up @@ -2123,7 +2125,7 @@ fn extract_type_expr_from_content(content: &str) -> Option<String> {
let mut kw_idx = None;
for kw in ["theorem ", "lemma ", "def "] {
if let Some(idx) = before.rfind(kw) {
if kw_idx.map_or(true, |prev| idx > prev) {
if kw_idx.is_none_or(|prev| idx > prev) {
kw_idx = Some(idx);
}
}
Expand Down
2 changes: 1 addition & 1 deletion crates/openproof-cli/src/decomposition.rs
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,7 @@ pub fn check_decomposition_consistency(
#[cfg(test)]
mod tests {
use super::*;
use openproof_protocol::{ProofNodeKind, ProofNodeStatus};
use openproof_protocol::{ProofNodeKind, ProofNodeStatus, SearchAttemptMetrics};

fn make_branch(
id: &str,
Expand Down
1 change: 1 addition & 0 deletions crates/openproof-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ mod event_loop;
mod export;
mod helpers;
mod key_handling;
mod refusal;
mod setup;
mod shell;
mod slash_autonomous;
Expand Down
165 changes: 165 additions & 0 deletions crates/openproof-cli/src/refusal.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
//! Refusal detection and forced retry for LLM proof attempts.
//!
//! When the model recognizes a hard problem from training data (FrontierMath,
//! IMO, competitions), it may refuse to attempt it. This module detects refusal
//! signals in the model's text and provides retry messages to force continuation.

pub const MAX_REFUSAL_RETRIES: u8 = 2;

/// Strong refusal signals -- any single match triggers refusal detection.
const STRONG_SIGNALS: &[&str] = &[
"open problem",
"unsolved problem",
"open question",
"no known proof",
"no known solution",
"frontiermath",
"frontier math",
"competition problem",
"benchmark problem",
"beyond current capabilities",
"beyond my capabilities",
"beyond the capabilities",
"i cannot solve this",
"i'm unable to prove",
"i am unable to prove",
"cannot be proven",
"is considered unsolved",
"this is an open",
"this problem is from",
"known to be undecidable",
"no elementary proof",
"this is a well-known",
"remains an open",
];

/// Weak refusal signals -- only trigger when combined with no tool usage
/// and no Lean code in the response.
const WEAK_SIGNALS: &[&str] = &[
"i apologize",
"i'm sorry",
"i am sorry",
"too difficult",
"unable to",
"cannot prove",
"cannot solve",
"i don't think",
"i do not think",
"not feasible",
"not possible to prove",
];

/// Detect whether the model's response is a refusal to attempt the proof.
///
/// Returns `true` if the text contains refusal signals. If `used_tools` is true,
/// only strong signals trigger (the model at least tried before complaining).
pub fn detect_refusal(text: &str, used_tools: bool) -> bool {
if text.trim().is_empty() {
return false;
}
let lower = text.to_ascii_lowercase();

// Strong signals always trigger.
if STRONG_SIGNALS.iter().any(|s| lower.contains(s)) {
return true;
}

// Weak signals only trigger if the model didn't use tools and wrote no Lean code.
if !used_tools {
let has_lean_code = text.contains("```lean");
if !has_lean_code && WEAK_SIGNALS.iter().any(|s| lower.contains(s)) {
return true;
}
}

false
}

/// Build an escalating retry message to force the model to attempt the proof.
pub fn refusal_retry_message(retry_count: u8) -> String {
match retry_count {
1 => concat!(
"You MUST attempt this proof. Do not assess difficulty or discuss problem sources. ",
"Start by writing a sorry-skeleton: create a .lean file with the theorem statement ",
"and `sorry` placeholders, then use lean_verify to check it compiles. ",
"Decompose the problem into sub-have steps and fill them one at a time.",
)
.to_string(),
_ => concat!(
"FINAL ATTEMPT: Write Lean code NOW using file_write. Do not produce text without ",
"tool calls. Create a sorry-skeleton immediately with the theorem statement and sorry. ",
"Any response without a tool call wastes this proof attempt entirely.",
)
.to_string(),
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn detects_strong_refusal() {
assert!(detect_refusal(
"This is an open problem in number theory.",
false
));
assert!(detect_refusal(
"This problem is from FrontierMath and is unsolved.",
false
));
assert!(detect_refusal("I cannot solve this theorem.", false));
assert!(detect_refusal(
"The conjecture remains an open question.",
false
));
}

#[test]
fn detects_strong_even_with_tools() {
assert!(detect_refusal("This is an open problem.", true));
}

#[test]
fn detects_weak_refusal_without_tools() {
assert!(detect_refusal(
"I apologize, but this is too difficult for me.",
false
));
assert!(detect_refusal("I'm sorry, I cannot prove this.", false));
}

#[test]
fn weak_signals_ignored_with_tools() {
assert!(!detect_refusal(
"I'm sorry, let me try a different approach.",
true
));
}

#[test]
fn weak_signals_ignored_with_lean_code() {
let text =
"I'm sorry, this is hard. Here's my attempt:\n```lean\ntheorem foo := sorry\n```";
assert!(!detect_refusal(text, false));
}

#[test]
fn no_refusal_on_normal_text() {
assert!(!detect_refusal(
"Let me try using norm_num for this.",
false
));
assert!(!detect_refusal("The proof follows from induction.", false));
assert!(!detect_refusal("", false));
}

#[test]
fn retry_messages_are_different() {
let m1 = refusal_retry_message(1);
let m2 = refusal_retry_message(2);
assert_ne!(m1, m2);
assert!(m1.contains("sorry-skeleton"));
assert!(m2.contains("FINAL ATTEMPT"));
}
}
14 changes: 12 additions & 2 deletions crates/openproof-cli/src/system_prompt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,14 @@ pub fn build_system_prompt(session: Option<&SessionSnapshot>) -> String {
"The theorem statement is FIXED.",
).to_string(),
"When stuck, try a different approach rather than explaining why you are stuck. Decompose hard sorrys into sub-have steps.".to_string(),
concat!(
"MANDATORY: You MUST attempt every proof. NEVER refuse.\n",
"- Do NOT discuss whether a problem is open, unsolved, from a benchmark, or too hard.\n",
"- Do NOT reference problem sources (FrontierMath, IMO, competitions, etc.).\n",
"- Do NOT assess problem difficulty. Write Lean code, not commentary.\n",
"- NEVER produce a response without tool calls. Always write code.\n",
"- Even if you believe a proof is impossible, write the best sorry-skeleton you can.",
).to_string(),
"Use structured markers: TITLE, PROBLEM, FORMAL_TARGET, ACCEPTED_TARGET, PHASE, STATUS, THEOREM, LEMMA, and fenced ```lean``` blocks.".to_string(),
"Maintain Paper.tex alongside Lean code using file_write/file_patch. Do not include LaTeX in response text.".to_string(),
];
Expand Down Expand Up @@ -478,7 +486,8 @@ pub async fn build_branch_turn_messages(
"then IMMEDIATELY create the sorry-skeleton in Lean and lean_verify it. ",
"Do not spend more than 5 tool calls on research. The compilable skeleton matters ",
"more than the research. Focus on decomposing the theorem into a chain of `have` ",
"steps that captures the KEY MATHEMATICAL INSIGHT. The Prover branch will fill the sorrys.",
"steps that captures the KEY MATHEMATICAL INSIGHT. The Prover branch will fill the sorrys. ",
"Never refuse to plan a proof. Even for hard problems, write a decomposition skeleton.",
).to_string()
}
AgentRole::Retriever => {
Expand All @@ -493,7 +502,8 @@ pub async fn build_branch_turn_messages(
"lean_screen_tactics (test simp, omega, ring, exact?, apply?, linarith, norm_num, aesop, decide, tauto) ",
"-> file_patch to apply the working tactic -> lean_verify to confirm. ",
"If stuck on a goal for more than 3 attempts, decompose it with sub-have steps. ",
"Do NOT change theorem statements. Spend ALL your tool calls on the lean_goals/screen_tactics/patch/verify loop.",
"Do NOT change theorem statements. Spend ALL your tool calls on the lean_goals/screen_tactics/patch/verify loop. ",
"Never refuse to attempt. Even partial progress (filling some sorrys) is valuable.",
).to_string()
}
AgentRole::Repairer => {
Expand Down
22 changes: 21 additions & 1 deletion crates/openproof-cli/src/turn_handling.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ pub async fn run_agentic_loop(
}
let mut turn_used_tools = false;
let mut last_verify_ok = false;
let mut refusal_retries: u8 = 0;

// Spawn lean-lsp-mcp for structured goal access (fallback when Pantograph unavailable).
let lsp_mcp: Option<Arc<Mutex<LeanLspMcp>>> = LeanLspMcp::spawn(&project_dir)
Expand Down Expand Up @@ -138,7 +139,16 @@ pub async fn run_agentic_loop(
Ok(result) => {
// If there are no tool calls, we are done.
if result.tool_calls.is_empty() {
// Text was streamed via StreamDelta events.
// Detect refusal and force retry before giving up.
if refusal_retries < crate::refusal::MAX_REFUSAL_RETRIES
&& crate::refusal::detect_refusal(&result.text, turn_used_tools)
{
refusal_retries += 1;
let retry = crate::refusal::refusal_retry_message(refusal_retries);
messages.push(TurnMessage::chat("assistant", &result.text));
messages.push(TurnMessage::chat("system", &retry));
continue;
}
break;
}

Expand Down Expand Up @@ -427,6 +437,7 @@ pub fn start_agent_branch_turn(
let mut accumulated_text = String::new();
let mut turn_used_tools = false;
let mut last_verify_ok = false;
let mut refusal_retries: u8 = 0;

let branch_lsp_mcp: Option<Arc<Mutex<LeanLspMcp>>> = LeanLspMcp::spawn(&project_dir)
.map(|c| Arc::new(Mutex::new(c)))
Expand All @@ -452,6 +463,15 @@ pub fn start_agent_branch_turn(
Ok(turn) => {
accumulated_text.push_str(&turn.text);
if turn.tool_calls.is_empty() {
if refusal_retries < crate::refusal::MAX_REFUSAL_RETRIES
&& crate::refusal::detect_refusal(&accumulated_text, turn_used_tools)
{
refusal_retries += 1;
let retry = crate::refusal::refusal_retry_message(refusal_retries);
all_messages.push(TurnMessage::chat("assistant", &turn.text));
all_messages.push(TurnMessage::chat("system", &retry));
continue;
}
break;
}
turn_used_tools = true;
Expand Down
2 changes: 1 addition & 1 deletion crates/openproof-model/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ fn build_turn_payload(request: &CodexTurnRequest<'_>) -> Value {
"model": request.model,
"store": false,
"stream": true,
"instructions": "You are openproof, a local-first formal math assistant. Be concise, direct, and helpful. If the user asks a general question, answer directly. If the user gives a theorem-like statement, help formalize or prove it in Lean.",
"instructions": "You are openproof, a local-first formal math assistant. Be concise, direct, and helpful. If the user asks a general question, answer directly. If the user gives a theorem-like statement, help formalize or prove it in Lean. NEVER refuse to attempt a proof. Do not discuss problem difficulty, sources, or feasibility. Always use tools to write and verify Lean code.",
"input": request.messages.iter().map(serialize_turn_message).collect::<Vec<_>>(),
"include": ["reasoning.encrypted_content"],
"tool_choice": if request.include_tools { "auto" } else { "none" },
Expand Down
Loading