diff --git a/README.md b/README.md index 2c56654c..017303cc 100644 --- a/README.md +++ b/README.md @@ -541,6 +541,7 @@ Both patterns share the same configuration interface and are transparent to user **Built-in Agents:** - `AgentFactCheck`: LangChain-based fact-checking with autonomous search control - `AgentHallucination`: Custom workflow hallucination detection with adaptive context gathering +- `ArticleFactChecker`: Two-phase article fact-checking — extracts verifiable claims then verifies each in parallel using web search and Arxiv, with configurable concurrency control **Quick Example:** @@ -597,6 +598,7 @@ For detailed guidance on choosing and implementing agent patterns, see [Agent De - [Agent Development Guide](docs/agent_development_guide.md) - Comprehensive guide for creating custom agents and tools - [AgentHallucination Example](examples/agent/agent_hallucination_example.py) - Production agent example - [AgentFactCheck Example](examples/agent/agent_executor_example.py) - LangChain agent example +- [ArticleFactChecker Example](examples/agent/agent_article_fact_checking_example.py) - Article-scale two-phase fact verification ## ⚙️ Execution Modes diff --git a/README_zh-CN.md b/README_zh-CN.md index 8171632f..d69fd072 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -534,6 +534,7 @@ Dingo 支持基于智能体的评估器,可以使用外部工具进行多步 **内置智能体:** - `AgentFactCheck`: 基于 LangChain 的事实核查,自主搜索控制 - `AgentHallucination`: 自定义工作流的幻觉检测,自适应上下文收集 +- `ArticleFactChecker`: 两阶段文章事实核查 —— 先提取可验证声明,再并发调用网络搜索与 Arxiv 逐条验证,支持可配置的并发控制 **快速示例:** @@ -590,6 +591,7 @@ class MyAgent(BaseAgent): - [智能体开发指南](docs/agent_development_guide.md) - [AgentHallucination 示例](examples/agent/agent_hallucination_example.py) - [AgentFactCheck LangChain示例](examples/agent/agent_executor_example.py) +- [ArticleFactChecker 示例](examples/agent/agent_article_fact_checking_example.py) - 文章级两阶段事实核查 ## 执行引擎 diff --git a/dingo/model/llm/agent/__init__.py b/dingo/model/llm/agent/__init__.py index 5ffcf30e..d81b392c 100644 --- 
a/dingo/model/llm/agent/__init__.py +++ b/dingo/model/llm/agent/__init__.py @@ -1,22 +1,24 @@ -""" -Agent Framework for Dingo - -This package provides agent-based evaluation capabilities that extend LLMs with -tool usage, multi-step reasoning, and adaptive context gathering. - -Key Components: -- BaseAgent: Abstract base class for agent evaluators -- Tool system: Registry and base classes for agent tools -""" - -from dingo.model.llm.agent.base_agent import BaseAgent -from dingo.model.llm.agent.tools import BaseTool, ToolConfig, ToolRegistry, get_tool, tool_register - -__all__ = [ - 'BaseAgent', - 'BaseTool', - 'ToolConfig', - 'ToolRegistry', - 'get_tool', - 'tool_register', -] +""" +Agent Framework for Dingo + +This package provides agent-based evaluation capabilities that extend LLMs with +tool usage, multi-step reasoning, and adaptive context gathering. + +Key Components: +- BaseAgent: Abstract base class for agent evaluators +- Tool system: Registry and base classes for agent tools +""" + +from dingo.model.llm.agent.agent_article_fact_checker import ArticleFactChecker +from dingo.model.llm.agent.base_agent import BaseAgent +from dingo.model.llm.agent.tools import BaseTool, ToolConfig, ToolRegistry, get_tool, tool_register + +__all__ = [ + 'ArticleFactChecker', + 'BaseAgent', + 'BaseTool', + 'ToolConfig', + 'ToolRegistry', + 'get_tool', + 'tool_register', +] diff --git a/dingo/model/llm/agent/agent_article_fact_checker.py b/dingo/model/llm/agent/agent_article_fact_checker.py new file mode 100644 index 00000000..244489ec --- /dev/null +++ b/dingo/model/llm/agent/agent_article_fact_checker.py @@ -0,0 +1,1802 @@ +""" +ArticleFactChecker: Agent-based article fact-checking with claims extraction. + +Uses Agent-First architecture (LangChain ReAct / ``use_agent_executor=True``), +giving the agent full autonomy over tool selection, execution order, and +multi-step reasoning to verify factual claims in long-form articles. 
+ +See Also: + AgentFactCheck: Single-claim hallucination detection + docs/agent_development_guide.md: Agent development patterns +""" + +import asyncio +import json +import os +import re +import threading +import time +import uuid +from collections import Counter +from datetime import datetime +from typing import Any, Dict, List, Optional + +from dingo.io import Data +from dingo.io.input.required_field import RequiredField +from dingo.io.output.eval_detail import EvalDetail, QualityLabel +from dingo.model import Model +from dingo.model.llm.agent.base_agent import BaseAgent +from dingo.utils import log + + +class PromptTemplates: + """ + Modular prompt templates for ArticleFactChecker. + + This class provides reusable prompt components that can be assembled + based on article type and verification needs. This approach: + - Reduces context window usage for long articles + - Allows dynamic prompt customization + - Makes prompts easier to maintain and test + """ + + CORE_ROLE = """You are an expert article fact-checker with autonomous tool selection capabilities. + +Your Task: Systematically verify ALL factual claims in the provided article.""" + + TOOLS_DESCRIPTION = """ +Available Tools: +================ +1. claims_extractor: Extract verifiable claims from long-form text + - Use this FIRST to identify all checkable statements + - Supports 8 claim types: factual, statistical, attribution, institutional, + temporal, comparative, monetary, technical + - Returns list of structured claims with types + +2. 
arxiv_search: Search academic papers and verify metadata + - Use for claims about research papers, academic publications + - Provides paper metadata: title, authors, abstract, publication date + - Authors in papers often indicate institutional affiliations in abstracts + - NOTE: Affiliations are in unstructured text, not dedicated fields + - Best for: paper titles, author names, publication dates, and + institutional claims when a related paper exists + - For institutional claims: use arxiv_search FIRST to find the paper, + then tavily_search to cross-verify affiliations + +3. tavily_search: General web search for fact verification + - Use for general factual claims, current events, companies, products + - Use for cross-verifying institutional/organizational affiliations + - Use for news, product specs, financial figures, comparative claims + - Supports multilingual queries: search BOTH English AND Chinese terms for + Chinese content (e.g., both "清华大学 OmniDocBench" and + "Tsinghua University OmniDocBench") + - Use search_depth='advanced' for authoritative fact-checking results + - Provides current web information with sources and URLs""" + + WORKFLOW_STEPS = """ +Workflow (Autonomous Decision-Making): +====================================== +STEP 0: Analyze Article Type + First, identify the article type to guide your verification strategy. + +STEP 1: Extract Claims (REQUIRED - Do NOT skip this step) + - You MUST call the claims_extractor tool with the full article text + - This is a mandatory first step before any verification + - Do NOT extract claims manually in your reasoning - use the tool + - Review the tool output and use the extracted claims for verification + - Claims are categorized by type for targeted verification + +STEP 2: Verify Each Claim (Autonomous Tool Selection) + For each claim, analyze its type and context, then SELECT THE BEST TOOL: + + Tool Selection Principles: + 1. 
arxiv_search - For academic paper verification (paper title, author, arXiv ID) + 2. tavily_search - For general web verification (current events, companies, products) + + Claim-Type Specific Rules: + - INSTITUTIONAL/ATTRIBUTION claims (e.g., "released by X University and Y Lab"): + You MUST use arxiv_search FIRST to find the actual paper and check author + affiliations, THEN use tavily_search to cross-verify. Do NOT rely on + tavily_search alone for institutional claims — web sources often give + vague or incomplete attribution. The paper's author list is the + authoritative source for institutional affiliations. + For CHINESE institution names: translate to English before arxiv_search + (e.g., "清华大学" → "Tsinghua University", "达摩院" → "Alibaba DAMO Academy", + "上海人工智能实验室" → "Shanghai AI Laboratory") + Search with BOTH Chinese and English terms in tavily_search for maximum coverage. + - STATISTICAL/TECHNICAL claims: Use tavily_search for official benchmarks + - FACTUAL claims: Use tavily_search for general verification + + Adaptive Strategies: + - COMBINE tools for comprehensive verification + - FALLBACK: If arxiv_search finds no paper → immediately use tavily_search alone + - FALLBACK: If tavily_search returns no relevant results → mark as UNVERIFIABLE + (do NOT retry with same query; try a different angle or accept UNVERIFIABLE) + - MULTI-SOURCE: Cross-verify important claims with multiple sources + +STEP 3: Synthesize Results + After verifying ALL claims, generate a comprehensive report.""" + + OUTPUT_FORMAT = """ +Output Format: +============== +You MUST return JSON in this exact format: + +```json +{ + "article_verification_summary": { + "article_type": "academic|news|product|blog|policy|opinion", + "total_claims": , + "verified_claims": , + "false_claims": , + "unverifiable_claims": , + "accuracy_score": <0.0-1.0> + }, + "detailed_findings": [ + { + "claim_id": "claim_001", + "original_claim": "...", + "claim_type": 
"institutional|factual|temporal|comparative|etc", + "verification_result": "FALSE|TRUE|UNVERIFIABLE", + "evidence": "...", + "sources": ["url1", "url2"], + "verification_method": "arxiv_search|tavily_search|combined", + "search_queries_used": ["query1", "query2"], + "reasoning": "Step-by-step reasoning for the verification conclusion" + } + ], + "false_claims_comparison": [ + { + "article_claimed": "Example: OpenAI released o1 in November 2024", + "actual_truth": "OpenAI released o1 on December 5, 2024", + "evidence": "Verified via official OpenAI announcement" + } + ] +} +```""" + + VERDICT_CRITERIA = """ +Verdict Decision Criteria: +========================== +Before assigning a verification_result to any claim, apply these evidence-based criteria: + +TRUE - Claim is CONFIRMED by evidence: + - You found specific, credible evidence that DIRECTLY supports the claim + - The evidence explicitly confirms the key facts (names, numbers, dates, relationships) + - You can cite a specific source URL that contains the confirming information + +FALSE - Claim is CONTRADICTED by evidence: + - You found specific, credible evidence that DIRECTLY contradicts the claim + - The evidence reveals a clear factual error (wrong date, wrong number, wrong attribution) + - You can point to the specific discrepancy between claim and evidence + +UNVERIFIABLE - Insufficient or ambiguous evidence: + - You could NOT find evidence that clearly confirms OR contradicts the claim + - Evidence partially matches but key details cannot be confirmed + - Sources mention the topic but do not address the specific claim being checked + - The claim involves details not found in any source + +CRITICAL RULE: Absence of contradictory evidence does NOT equal confirmation. +If your search did not find explicit confirming evidence, the verdict is UNVERIFIABLE, not TRUE. 
+If your reasoning includes phrases like "not explicitly listed", "could not confirm", +"no direct evidence", or "not mentioned in results", the verdict MUST be UNVERIFIABLE.""" + + SELF_VERIFICATION_STEP = """ +STEP 3.5: Self-Verify Verdict-Reasoning Consistency (MANDATORY) + Before generating your final JSON report, review EVERY claim's verdict: + + For each claim in your detailed_findings: + a) Re-read the evidence and reasoning you wrote for this claim + b) Ask yourself: "Does my evidence DIRECTLY and EXPLICITLY support this verdict?" + c) Apply these consistency checks: + - Reasoning says "not found", "not listed", "not mentioned", "no evidence" + -> Verdict MUST be UNVERIFIABLE (not TRUE) + - Reasoning says "confirmed by [specific source]" with a URL + -> Verdict can be TRUE + - Reasoning says "contradicts", "actually [different fact]", "incorrect" + -> Verdict MUST be FALSE + - Reasoning is uncertain or hedging ("may", "possibly", "unclear") + -> Verdict MUST be UNVERIFIABLE + d) If you find ANY inconsistency, correct the verdict NOW + + This step is critical for report quality. Do NOT skip it.""" + + CRITICAL_GUIDELINES = """ +Critical Guidelines: +==================== +- ALWAYS extract claims first before verification +- AUTONOMOUS tool selection based on claim type and article context +- VERIFY each claim independently +- USE multiple sources when possible (especially for critical claims) +- CITE specific evidence and URLs +- BE THOROUGH: Don't skip claims +- ADAPTIVE: If a tool fails, try alternatives intelligently +- CONTEXT-AWARE: Consider article type when selecting verification approach + +Remember: You are an autonomous agent with full decision-making power. 
+Analyze the article type, choose tools intelligently based on claim context, +adapt to intermediate results, and ensure comprehensive verification.""" + + # Article type specific guidance + ARTICLE_TYPE_GUIDANCE = { + "academic": """ +Article Type Guidance (Academic): +- Focus on arxiv_search for paper verification AND institutional claims +- For institutional affiliations: COMBINE arxiv_search (paper authors/abstracts) + tavily_search (cross-verify) +- Verify: paper titles, authors, publication dates, citations, institutional attributions +- Example: "OmniDocBench by Tsinghua" → arxiv_search for paper metadata THEN tavily_search to cross-verify""", + + "news": """ +Article Type Guidance (News): +- Focus on tavily_search for current events +- Verify dates, quotes, and attributions carefully +- Cross-reference multiple news sources +- Example: "released on December 5" → tavily_search with date context""", + + "product": """ +Article Type Guidance (Product Review): +- Use tavily_search for official specifications +- Verify technical specs against manufacturer data +- Check benchmark claims against third-party reviews +- Example: "A17 Pro chip" → tavily_search for official Apple specs""", + + "blog": """ +Article Type Guidance (Technical Blog): +- Use tavily_search for documentation verification +- Verify version numbers and feature claims +- Check performance claims against benchmarks +- Example: "React 18 features" → tavily_search for React docs""", + + "policy": """ +Article Type Guidance (Policy Document): +- Use tavily_search for government sources +- Verify dates, regulations, and official statements +- Cross-reference with official government websites""", + + "opinion": """ +Article Type Guidance (Opinion Piece): +- Focus only on attributed factual claims +- Verify quotes and statistics cited +- Distinguish opinions from verifiable facts""" + } + + PER_CLAIM_VERIFICATION_PROMPT = """You are a fact-checking expert. Verify ONE specific factual claim. 
+ +Use available search tools to find evidence, then respond ONLY with valid JSON: + +{ + "verification_result": "TRUE|FALSE|UNVERIFIABLE", + "evidence": "Key evidence found (1-3 sentences)", + "sources": ["url1", "url2"], + "verification_method": "tavily_search|arxiv_search|combined|no_search", + "search_queries_used": ["query text"], + "reasoning": "Step-by-step reasoning for your verdict" +} + +Verdict Rules: +- TRUE: Found specific, direct evidence CONFIRMING the claim with a cited URL +- FALSE: Found specific evidence CONTRADICTING the claim +- UNVERIFIABLE: Could not find clear confirming OR contradicting evidence + +CRITICAL: Start with search, then produce JSON only. No text outside the JSON.""" + + @classmethod + def build(cls, article_type: Optional[str] = None) -> str: + """ + Build complete system prompt from modular components. + + Args: + article_type: Optional article type for targeted guidance + ("academic", "news", "product", "blog", "policy", "opinion") + + Returns: + Complete system prompt string + """ + parts = [ + cls.CORE_ROLE, + cls.TOOLS_DESCRIPTION, + cls.WORKFLOW_STEPS, + ] + + if article_type and article_type.lower() in cls.ARTICLE_TYPE_GUIDANCE: + parts.append(cls.ARTICLE_TYPE_GUIDANCE[article_type.lower()]) + + parts.extend([ + cls.VERDICT_CRITERIA, + cls.OUTPUT_FORMAT, + cls.SELF_VERIFICATION_STEP, + cls.CRITICAL_GUIDELINES + ]) + + return "\n".join(parts) + + @classmethod + def get_article_types(cls) -> List[str]: + """Return list of supported article types.""" + return list(cls.ARTICLE_TYPE_GUIDANCE.keys()) + + +@Model.llm_register("ArticleFactChecker") +class ArticleFactChecker(BaseAgent): + """ + Article-level fact-checking agent using LangChain ReAct (Agent-First pattern). + + The agent autonomously: + 1. Extracts claims via claims_extractor + 2. Selects the best verification tool per claim type (arxiv_search / tavily_search) + 3. 
Builds evidence chains and generates a structured verification report + + Configuration Example:: + + { + "name": "ArticleFactChecker", + "config": { + "key": "your-openai-api-key", + "model": "gpt-4o-mini", + "parameters": { + "agent_config": { + "max_iterations": 10, + "tools": { + "claims_extractor": { + "api_key": "your-openai-api-key", + "max_claims": 50, + "claim_types": ["factual", "institutional", "statistical", "attribution"] + }, + "tavily_search": { + "api_key": "your-tavily-api-key", + "max_results": 5 + }, + "arxiv_search": {"max_results": 5} + } + } + } + } + } + """ + + use_agent_executor = True # Enable Agent-First mode + available_tools = [ + "claims_extractor", # Extract verifiable claims from article + "arxiv_search", # Verify academic papers and institutions + "tavily_search" # General web search verification + ] + max_iterations = 10 # Allow more iterations for comprehensive checking + max_concurrent_claims = 5 # Default parallel claim verification slots + + _required_fields = [RequiredField.CONTENT] # Article text + + _metric_info = { + "metric_name": "ArticleFactChecker", + "description": "Article-level fact checking with autonomous claims extraction and verification" + } + + # Lock to serialise ClaimsExtractor class-level config mutation across threads. + # Required because LocalExecutor may call eval() from multiple threads concurrently. + _claims_extractor_lock = threading.Lock() + + # --- Output Path and File Saving Methods --- + + @classmethod + def _get_output_dir(cls) -> Optional[str]: + """ + Get output directory for artifact files. + + Returns: + Output directory path (created if needed), or None if saving is disabled. 
+ """ + params = cls.dynamic_config.parameters or {} + agent_cfg = params.get('agent_config') or {} + + explicit_path = agent_cfg.get('output_path') + if explicit_path: + os.makedirs(explicit_path, exist_ok=True) + return explicit_path + + if agent_cfg.get('save_artifacts') is False: + return None + + base_output = agent_cfg.get('base_output_path') or 'outputs' + create_time = time.strftime("%Y%m%d_%H%M%S", time.localtime()) + auto_path = os.path.join(base_output, f"article_factcheck_{create_time}_{uuid.uuid4().hex[:6]}") + os.makedirs(auto_path, exist_ok=True) + log.debug(f"ArticleFactChecker: artifact path auto-derived: {auto_path}") + return auto_path + + @classmethod + def _save_article_content(cls, output_dir: str, content: str) -> Optional[str]: + """ + Save original article content to output directory. + + Args: + output_dir: Output directory path + content: Article markdown content + + Returns: + Path to saved file, or None on failure + """ + file_path = os.path.join(output_dir, "article_content.md") + try: + with open(file_path, 'w', encoding='utf-8') as f: + f.write(content) + log.info(f"Saved article content to {file_path}") + return file_path + except (IOError, OSError) as e: + log.error(f"Failed to save article content: {e}") + return None + + @classmethod + def _write_jsonl_file(cls, file_path: str, records: List[Dict]) -> Optional[str]: + """Write records as JSONL. 
Returns file_path on success, None on failure.""" + try: + with open(file_path, 'w', encoding='utf-8') as f: + for record in records: + f.write(json.dumps(record, ensure_ascii=False) + '\n') + return file_path + except (IOError, OSError) as e: + log.error(f"Failed to write {file_path}: {e}") + return None + + @classmethod + def _save_claims(cls, output_dir: str, claims: List[Dict]) -> Optional[str]: + """Save extracted claims to JSONL file.""" + file_path = os.path.join(output_dir, "claims_extracted.jsonl") + saved = cls._write_jsonl_file(file_path, claims) + if saved: + log.info(f"Saved {len(claims)} claims to {file_path}") + return saved + + @classmethod + def _save_verification_details(cls, output_dir: str, enriched_claims: List[Dict]) -> Optional[str]: + """Save per-claim verification details to JSONL file.""" + file_path = os.path.join(output_dir, "claims_verification.jsonl") + saved = cls._write_jsonl_file(file_path, enriched_claims) + if saved: + log.info(f"Saved {len(enriched_claims)} verification details to {file_path}") + return saved + + @classmethod + def _save_full_report(cls, output_dir: str, report_data: Dict) -> Optional[str]: + """ + Save full structured verification report to JSON file. + + Args: + output_dir: Output directory path + report_data: Complete report dictionary + + Returns: + Path to saved file, or None on failure + """ + file_path = os.path.join(output_dir, "verification_report.json") + try: + with open(file_path, 'w', encoding='utf-8') as f: + json.dump(report_data, f, ensure_ascii=False, indent=2) + log.info(f"Saved verification report to {file_path}") + return file_path + except (IOError, OSError) as e: + log.error(f"Failed to save verification report: {e}") + return None + + # --- Data Processing Methods --- + + @classmethod + def _extract_claims_from_tool_calls(cls, tool_calls: List[Dict]) -> List[Dict]: + """ + Extract claims list from tool_calls observation data. 
+ + The claims_extractor tool returns its results in the observation field + of the tool_calls list (via langchain_adapter). + + Args: + tool_calls: List of tool call dicts from AgentWrapper + + Returns: + List of claim dictionaries extracted from claims_extractor output + """ + for tc in tool_calls: + if tc.get('tool') == 'claims_extractor': + observation = tc.get('observation', '') + if not observation: + continue + try: + obs_data = json.loads(observation) + if obs_data.get('success'): + # Claims may be in data.claims (langchain_adapter wrapping) + # or directly in obs_data.claims + data_section = obs_data.get('data', obs_data) + claims = data_section.get('claims', []) + if claims: + return claims + except (json.JSONDecodeError, TypeError) as e: + log.warning(f"Failed to parse claims_extractor observation: {e}") + return [] + + @classmethod + def _extract_claims_from_detailed_findings(cls, verification_data: Dict[str, Any]) -> List[Dict]: + """ + Fallback: extract claims from agent's detailed_findings when + claims_extractor tool was not called. + + Args: + verification_data: Agent's parsed JSON output + + Returns: + List of claim dicts with source="agent_reasoning" + """ + return [ + { + "claim_id": finding.get("claim_id", ""), + "claim": finding.get("original_claim", ""), + "claim_type": finding.get("claim_type", "unknown"), + "confidence": None, + "verifiable": True, + "source": "agent_reasoning" + } + for finding in verification_data.get("detailed_findings", []) + ] + + _VERDICT_MAP = { + "TRUE": "TRUE", "FALSE": "FALSE", "UNVERIFIABLE": "UNVERIFIABLE", + "CONFIRMED": "TRUE", "ACCURATE": "TRUE", "CORRECT": "TRUE", "VERIFIED": "TRUE", + "INACCURATE": "FALSE", "INCORRECT": "FALSE", "WRONG": "FALSE", + "DISPROVEN": "FALSE", "REFUTED": "FALSE", + } + + @classmethod + def _normalize_verdict(cls, verdict: Any) -> str: + """Normalize verdict to standard values (TRUE/FALSE/UNVERIFIABLE). 
Unknown values default to UNVERIFIABLE.""" + if not verdict or not isinstance(verdict, str): + return "UNVERIFIABLE" + return cls._VERDICT_MAP.get(verdict.strip().upper(), "UNVERIFIABLE") + + # Pre-compiled regexes for Tier 3 per-field extraction in _parse_claim_json_robust. + _RE_VERDICT = re.compile(r'"verification_result"\s*:\s*"(TRUE|FALSE|UNVERIFIABLE)"', re.IGNORECASE) + _RE_EVIDENCE = re.compile(r'"evidence"\s*:\s*"((?:[^"\\]|\\.)*)"', re.DOTALL) + _RE_EVIDENCE_TRUNC = re.compile(r'"evidence"\s*:\s*"((?:[^"\\]|\\.)+)', re.DOTALL) + _RE_SOURCES = re.compile(r'"sources"\s*:\s*\[(.*?)\]', re.DOTALL) + _RE_SOURCES_TRUNC = re.compile(r'"sources"\s*:\s*\[(.*)', re.DOTALL) + _RE_REASONING = re.compile(r'"reasoning"\s*:\s*"((?:[^"\\]|\\.)*)"', re.DOTALL) + _RE_REASONING_TRUNC = re.compile(r'"reasoning"\s*:\s*"((?:[^"\\]|\\.)+)', re.DOTALL) + + # Hedging language patterns that indicate reasoning contradicts a TRUE verdict. + _HEDGING_PATTERNS = re.compile( + r"(?:" + r"not explicitly (?:stated|listed|mentioned|confirmed|found)" + r"|(?:cannot|could not|couldn't) (?:be verified|confirm|find|verify)" + r"|unable to (?:verify|confirm|find)" + r"|is(?:n't| not) explicitly" + r"|no (?:direct|explicit) evidence" + r"|insufficient evidence" + r"|not directly (?:confirmed|stated|verified)" + r"|cannot be fully verified" + r"|exact .{0,30} isn't .{0,30} stated" + r"|while .{0,40} isn't .{0,30} stated" + r"|not .{0,20} explicitly .{0,20} in (?:the )?(?:available |found )?(?:sources?|documentation|results?)" + r")", + re.IGNORECASE + ) + + @classmethod + def _check_reasoning_verdict_consistency(cls, enriched_claims: List[Dict]) -> int: + """ + Downgrade TRUE verdicts to UNVERIFIABLE when reasoning contains hedging language. + + Only affects TRUE verdicts; FALSE verdicts are never changed. 
+ + Args: + enriched_claims: List of enriched claim dicts (modified in place) + + Returns: + Number of verdicts downgraded + """ + downgraded = 0 + for claim in enriched_claims: + if claim.get("verification_result") != "TRUE": + continue + + reasoning = claim.get("reasoning", "") + if not reasoning: + continue + + match = cls._HEDGING_PATTERNS.search(reasoning) + if match: + claim["verification_result"] = "UNVERIFIABLE" + claim_id = claim.get("claim_id", "unknown") + matched_text = match.group(0) + log.info( + f"Verdict downgraded TRUE→UNVERIFIABLE for {claim_id}: " + f"hedging detected in reasoning: '{matched_text}'" + ) + downgraded += 1 + + return downgraded + + @classmethod + def _recalculate_summary(cls, enriched_claims: List[Dict]) -> Dict[str, Any]: + """ + Recalculate verification summary from actual enriched claim data. + + This ensures the summary matches the actual verdict distribution, + overriding any inconsistent self-reported summary from the agent. + + Args: + enriched_claims: List of enriched claim dicts with normalized verdicts + + Returns: + Summary dict with total_claims, verified_claims, false_claims, + unverifiable_claims, and accuracy_score + """ + total = len(enriched_claims) + true_count = sum(1 for c in enriched_claims if c.get("verification_result") == "TRUE") + false_count = sum(1 for c in enriched_claims if c.get("verification_result") == "FALSE") + unverifiable_count = sum(1 for c in enriched_claims if c.get("verification_result") == "UNVERIFIABLE") + accuracy = true_count / total if total > 0 else 0.0 + return { + "total_claims": total, + "verified_claims": true_count, + "false_claims": false_count, + "unverifiable_claims": unverifiable_count, + "accuracy_score": round(accuracy, 4) + } + + @classmethod + def _build_per_claim_verification( + cls, + verification_data: Dict[str, Any], + extracted_claims: List[Dict], + tool_calls: List[Dict] + ) -> List[Dict]: + """ + Merge verification_data, extracted_claims, and tool_calls into + 
per-claim verification records. + + Data sources: + - detailed_findings: verification result, evidence, sources, reasoning + - extracted_claims: claim_type, confidence, verifiable, context + - tool_calls: search queries and tool usage details + + Args: + verification_data: Agent's parsed JSON output + extracted_claims: Claims from claims_extractor tool + tool_calls: Complete tool call list from agent + + Returns: + List of enriched per-claim verification records + """ + detailed_findings = verification_data.get("detailed_findings", []) + + # Build lookup from extracted claims by claim_id + claims_by_id: Dict[str, Dict] = {} + for claim in extracted_claims: + cid = claim.get('claim_id', '') + if cid: + claims_by_id[cid] = claim + + enriched_claims: List[Dict] = [] + for finding in detailed_findings: + claim_id = finding.get('claim_id', '') + extracted = claims_by_id.get(claim_id, {}) + + enriched = { + "claim_id": claim_id, + "original_claim": finding.get('original_claim', extracted.get('claim', '')), + "claim_type": finding.get('claim_type', extracted.get('claim_type', 'unknown')), + "confidence": extracted.get('confidence'), + "verification_result": finding.get('verification_result', 'UNVERIFIABLE'), + "evidence": finding.get('evidence', ''), + "sources": finding.get('sources', []), + "verification_method": finding.get('verification_method', ''), + "search_queries_used": finding.get('search_queries_used', []), + "reasoning": finding.get('reasoning', ''), + } + + enriched_claims.append(enriched) + + # If no detailed_findings but we have extracted claims, create placeholder records + if not enriched_claims and extracted_claims: + for claim in extracted_claims: + enriched_claims.append({ + "claim_id": claim.get('claim_id', ''), + "original_claim": claim.get('claim', ''), + "claim_type": claim.get('claim_type', 'unknown'), + "confidence": claim.get('confidence'), + "verification_result": "UNVERIFIABLE", + "evidence": "", + "sources": [], + "verification_method": "", + 
"search_queries_used": [], + "reasoning": "No verification data available from agent", + }) + + return enriched_claims + + @classmethod + def _build_structured_report( + cls, + verification_data: Dict[str, Any], + extracted_claims: List[Dict], + enriched_claims: List[Dict], + tool_calls: List[Dict], + reasoning_steps: int, + content_length: int, + execution_time: float, + claims_source: str = "claims_extractor_tool" + ) -> Dict[str, Any]: + """ + Build a complete structured verification report. + + Args: + verification_data: Agent's parsed JSON output + extracted_claims: Claims from claims_extractor or fallback + enriched_claims: Merged per-claim verification records + tool_calls: Complete tool call list + reasoning_steps: Number of reasoning steps + content_length: Length of original article content + execution_time: Total execution time in seconds + claims_source: Where claims came from ("claims_extractor_tool" or "agent_reasoning") + + Returns: + Complete structured report dictionary + """ + summary = verification_data.get("article_verification_summary", {}) + + # Claims extraction stats + claim_types_dist: Dict[str, int] = {} + verifiable_count = 0 + for claim in extracted_claims: + ct = claim.get('claim_type', 'unknown') + claim_types_dist[ct] = claim_types_dist.get(ct, 0) + 1 + if claim.get('verifiable', True): + verifiable_count += 1 + + report = { + "report_version": "2.0", + "generated_at": datetime.now().isoformat(timespec='seconds'), + "article_info": { + "content_source": "markdown", + "content_length": content_length + }, + "claims_extraction": { + "total_extracted": len(extracted_claims), + "claims_source": claims_source, + "verifiable": verifiable_count, + "claim_types_distribution": claim_types_dist + }, + "verification_summary": { + "total_verified": summary.get("verified_claims", 0) + summary.get("false_claims", 0), + "verified_true": summary.get("verified_claims", 0), + "verified_false": summary.get("false_claims", 0), + "unverifiable": 
summary.get("unverifiable_claims", 0), + "accuracy_score": summary.get("accuracy_score", 0.0) + }, + "detailed_findings": enriched_claims, + "false_claims_comparison": verification_data.get("false_claims_comparison", []), + "agent_metadata": { + "model": getattr(cls.dynamic_config, 'model', 'unknown'), + "tool_calls_count": len(tool_calls), + "reasoning_steps": reasoning_steps, + "execution_time_seconds": round(execution_time, 2) + } + } + + return report + + # --- Overridden Core Methods --- + + @classmethod + def eval(cls, input_data: Data) -> EvalDetail: + """ + Two-phase async fact-checking with parallel claim verification. + + Phase 1: Extract claims via ClaimsExtractor (direct call, ~30s) + Phase 2: Verify each claim with a focused mini-agent using asyncio.gather + with Semaphore(max_concurrent_claims) to limit concurrency (~80-120s) + + This replaces the old single-agent sequential approach (~669s for 15 claims). + + Temperature defaults to 0 for deterministic tool selection and + consistent verification results. Users can override via config. + + Args: + input_data: Data object with article content + + Returns: + EvalDetail with comprehensive verification report + """ + start_time = time.time() + output_dir = cls._get_output_dir() + + if cls.dynamic_config: + if cls.dynamic_config.parameters is None: + cls.dynamic_config.parameters = {} + cls.dynamic_config.parameters.setdefault("temperature", 0) + + if output_dir and input_data.content: + cls._save_article_content(output_dir, input_data.content) + + try: + return asyncio.run(cls._async_eval(input_data, start_time, output_dir)) + except RuntimeError as e: + # Fallback when called inside an already-running event loop (e.g. 
@classmethod
async def _async_eval(
    cls, input_data: Data, start_time: float, output_dir: Optional[str]
) -> EvalDetail:
    """
    Async two-phase orchestrator for parallel claim verification.

    Phase 1: Extract claims directly via ClaimsExtractor tool (~30s).
    Phase 2: Verify claims concurrently with asyncio.gather and Semaphore.
    """
    # --- Phase 1: direct claim extraction (no agent overhead) ---
    print("[ArticleFactChecker] Phase 1: Extracting claims from article...", flush=True)
    claims = await cls._async_extract_claims(input_data)
    if not claims:
        return cls._create_error_result("No claims extracted from article")

    print(f"[ArticleFactChecker] Phase 1 done: {len(claims)} claims extracted", flush=True)
    if output_dir:
        cls._save_claims(output_dir, claims)

    # --- Phase 2: semaphore-bounded parallel verification ---
    max_concurrent = cls._get_max_concurrent_claims()
    semaphore = asyncio.Semaphore(max_concurrent)
    total = len(claims)
    print(
        f"[ArticleFactChecker] Phase 2: Verifying {total} claims "
        f"(max {max_concurrent} concurrent)...",
        flush=True
    )
    log.info(f"ArticleFactChecker: verifying {total} claims with max_concurrent={max_concurrent}")

    # Create the LLM and tool set once, up front, so concurrent tasks never
    # mutate shared configuration.
    llm = cls.get_langchain_llm()
    lc_tools = cls.get_langchain_tools()
    search_tools = [t for t in lc_tools if t.name in ('tavily_search', 'arxiv_search')]

    # Mutable progress cell — safe because asyncio runs callbacks on one thread.
    done_count = [0]

    async def _verify_with_progress(claim):
        claim_id = claim.get('claim_id', '')
        try:
            result = await cls._async_verify_single_claim(claim, semaphore, llm, search_tools)
        except Exception as exc:
            done_count[0] += 1
            print(f"[ArticleFactChecker] [{done_count[0]}/{total}] {claim_id} → ERROR", flush=True)
            return exc
        done_count[0] += 1
        # Derive a short verdict string for the progress line.
        if not isinstance(result, dict) or not result.get('success'):
            verdict = 'ERROR'
        else:
            out = (result.get('agent_result') or {}).get('output') or ''
            m = cls._RE_VERDICT.search(out)
            verdict = m.group(1) if m else '?'
        print(f"[ArticleFactChecker] [{done_count[0]}/{total}] {claim_id} → {verdict}", flush=True)
        return result

    verification_results = await asyncio.gather(
        *(_verify_with_progress(claim) for claim in claims),
        return_exceptions=True
    )

    elapsed = time.time() - start_time
    print(
        f"[ArticleFactChecker] Phase 2 done: {total}/{total} claims verified "
        f"({elapsed:.1f}s elapsed)",
        flush=True
    )
    return cls._aggregate_parallel_results(
        input_data, claims, verification_results, start_time, output_dir
    )
@classmethod
async def _async_extract_claims(cls, input_data: Data) -> List[Dict]:
    """
    Phase 1: Extract claims by calling ClaimsExtractor directly.

    The synchronous ClaimsExtractor.execute() runs in a thread executor so
    the event loop is not blocked while the extractor works.

    Returns:
        List of claim dicts with claim_id, claim, claim_type, etc.; empty
        list when extraction fails.
    """
    from dingo.model.llm.agent.tools.claims_extractor import ClaimsExtractor, ClaimsExtractorConfig

    params = cls.dynamic_config.parameters or {}
    tool_cfg = (params.get('agent_config') or {}).get('tools', {}).get('claims_extractor', {})

    config_kwargs: Dict[str, Any] = {
        'model': cls.dynamic_config.model or "gpt-4o-mini",
        'api_key': tool_cfg.get('api_key') or cls.dynamic_config.key,
        'max_claims': tool_cfg.get('max_claims', 50),
    }
    endpoint = tool_cfg.get('base_url') or getattr(cls.dynamic_config, 'api_url', None)
    if endpoint:
        config_kwargs['base_url'] = endpoint
    wanted_types = tool_cfg.get('claim_types')
    if wanted_types:
        config_kwargs['claim_types'] = wanted_types

    article = input_data.content or ''
    loop = asyncio.get_running_loop()
    # NOTE(review): the lock is held across the await — this serializes
    # extractions because ClaimsExtractor.config is class-global state.
    with cls._claims_extractor_lock:
        ClaimsExtractor.config = ClaimsExtractorConfig(**config_kwargs)
        result = await loop.run_in_executor(None, ClaimsExtractor.execute, article)

    if result.get('success'):
        payload = result.get('data', result)
        return payload.get('claims', [])

    log.warning(f"ClaimsExtractor failed: {result.get('error', 'unknown')}")
    return []
@classmethod
async def _async_verify_single_claim(
    cls,
    claim: Dict,
    semaphore: asyncio.Semaphore,
    llm: Any,
    search_tools: List,
) -> Dict:
    """
    Phase 2: Verify one claim with a focused mini-agent.

    The semaphore caps concurrent API calls to avoid rate limiting. Each
    mini-agent handles exactly one claim with a simplified prompt and is
    expected to return structured JSON verification output.

    Args:
        claim: Claim dict from ClaimsExtractor (claim_id, claim, claim_type)
        semaphore: Asyncio semaphore for concurrency control
        llm: Pre-created LangChain LLM instance (shared)
        search_tools: Pre-configured search tools (tavily_search / arxiv_search)

    Returns:
        Dict with claim, agent_result, success keys
    """
    from dingo.model.llm.agent.agent_wrapper import AgentWrapper

    async with semaphore:
        claim_id = claim.get('claim_id', 'unknown')
        claim_text = claim.get('claim', '')
        claim_type = claim.get('claim_type', 'factual')
        preview = (claim_text or '')[:60]
        print(f"[ArticleFactChecker] → {claim_id} ({claim_type}): {preview}", flush=True)

        try:
            mini_agent = AgentWrapper.create_agent(
                llm=llm,
                tools=search_tools,
                system_prompt=PromptTemplates.PER_CLAIM_VERIFICATION_PROMPT
            )
            prompt = (
                f"Claim ID: {claim_id}\n"
                f"Claim Type: {claim_type}\n"
                f"Claim to verify: {claim_text}"
            )
            # Give each mini-agent at least 5 iterations regardless of config.
            iterations = max(cls.get_max_iterations(), 5)
            agent_result = await AgentWrapper.async_invoke_and_format(
                mini_agent,
                input_text=prompt,
                max_iterations=iterations
            )
            log.debug(f"Verified {claim_id}: success={agent_result.get('success')}")
            return {"claim": claim, "agent_result": agent_result, "success": True}
        except Exception as e:
            log.error(f"Failed to verify {claim_id}: {e}")
            return {
                "claim": claim,
                "agent_result": {"output": "", "success": False, "error": str(e)},
                "success": False
            }

@classmethod
def _get_max_concurrent_claims(cls) -> int:
    """Read max_concurrent_claims from agent_config or fall back to the class default."""
    params = cls.dynamic_config.parameters or {}
    agent_cfg = params.get('agent_config') or {}
    return agent_cfg.get('max_concurrent_claims', cls.max_concurrent_claims)
@classmethod
def _parse_claim_json_robust(cls, output: Optional[str]) -> Dict[str, Any]:
    """
    Robustly parse claim verification JSON from LLM output.

    Three-tier parsing strategy:
    1. Regex match for a complete *flat* JSON object containing
       ``"verification_result"`` (cannot match nested ``{}``).
    2. Truncated-JSON repair: strip markdown fences, append missing
       closing characters, then ``json.loads``.
    3. Per-field regex extraction as last resort (includes fallback
       patterns for truncated string values).

    Args:
        output: Raw string returned by the per-claim mini-agent, or None.

    Returns:
        Dict with as many fields as could be recovered; empty dict on
        total failure.
    """
    if not output or not isinstance(output, str):
        return {}

    # Tier 1: a self-contained flat JSON object with the verdict key.
    try:
        flat = re.search(r'\{[^{}]*"verification_result"[^{}]*\}', output, re.DOTALL)
        if flat:
            return json.loads(flat.group(0))
    except (json.JSONDecodeError, AttributeError):
        pass

    # Tier 2: repair truncated JSON (common when the model hits a token cap).
    try:
        text = output.strip()
        text = re.sub(r'^```(?:json)?\s*', '', text)
        text = re.sub(r'\s*```\s*$', '', text)
        text = text.strip()

        first_brace = text.find('{')
        if first_brace != -1:
            fragment = text[first_brace:]
            # Try closing an unterminated string/array in several ways,
            # then balance any remaining brackets and braces.
            for suffix in ('', '"', '"}', '"]', '"]}', '"}]'):
                patched = fragment + suffix
                missing_braces = patched.count('{') - patched.count('}')
                missing_brackets = patched.count('[') - patched.count(']')
                closing = ']' * max(0, missing_brackets) + '}' * max(0, missing_braces)
                try:
                    candidate = json.loads(patched + closing)
                except (json.JSONDecodeError, ValueError):
                    continue
                if isinstance(candidate, dict) and 'verification_result' in candidate:
                    return candidate
    except Exception:
        pass

    # Tier 3: scrape individual fields with the class-level regexes.
    recovered: Dict[str, Any] = {}
    try:
        verdict_m = cls._RE_VERDICT.search(output)
        if verdict_m:
            recovered['verification_result'] = verdict_m.group(1).upper()

        evidence_m = cls._RE_EVIDENCE.search(output) or cls._RE_EVIDENCE_TRUNC.search(output)
        if evidence_m:
            recovered['evidence'] = evidence_m.group(1).replace('\\"', '"').replace('\\n', '\n')

        sources_m = cls._RE_SOURCES.search(output) or cls._RE_SOURCES_TRUNC.search(output)
        if sources_m:
            raw_sources = sources_m.group(1)
            recovered['sources'] = [
                s.strip().strip('"') for s in raw_sources.split(',')
                if s.strip().strip('"')
            ]

        reasoning_m = cls._RE_REASONING.search(output) or cls._RE_REASONING_TRUNC.search(output)
        if reasoning_m:
            recovered['reasoning'] = reasoning_m.group(1).replace('\\"', '"').replace('\\n', '\n')
    except Exception:
        pass

    return recovered
@classmethod
def _parse_single_claim_result(cls, claim: Dict, agent_result: Dict) -> Dict:
    """
    Parse mini-agent JSON output into an enriched claim verification record.

    Extracts the JSON block from agent output when possible; falls back to
    metadata derived from tool_calls when parsing fails.

    Args:
        claim: Original claim dict from ClaimsExtractor
        agent_result: Result dict from AgentWrapper.async_invoke_and_format

    Returns:
        Enriched claim dict compatible with the existing report structure
    """
    output = agent_result.get('output', '')
    tool_calls = agent_result.get('tool_calls', [])

    parsed = cls._parse_claim_json_robust(output)

    queries = [
        tc.get('args', {}).get('query', '')
        for tc in tool_calls
        if tc.get('args', {}).get('query')
    ]
    tools_seen = list({tc.get('tool', '') for tc in tool_calls if tc.get('tool')})

    # Prefer the agent's self-reported method; otherwise infer from tools used.
    if parsed.get('verification_method'):
        method = parsed['verification_method']
    elif len(tools_seen) > 1:
        method = 'combined'
    elif tools_seen:
        method = tools_seen[0]
    else:
        method = 'no_search'

    return {
        "claim_id": claim.get('claim_id', ''),
        "original_claim": claim.get('claim', ''),
        "claim_type": claim.get('claim_type', 'unknown'),
        "confidence": claim.get('confidence'),
        "verification_result": cls._normalize_verdict(
            parsed.get('verification_result', 'UNVERIFIABLE')
        ),
        "evidence": parsed.get('evidence', ''),
        "sources": parsed.get('sources', []),
        "verification_method": method,
        "search_queries_used": parsed.get('search_queries_used', queries),
        # Keep a raw-output excerpt as reasoning when the agent gave none.
        "reasoning": parsed.get('reasoning', output[:500] if output else ''),
    }

@classmethod
def _build_unverifiable_claim_record(cls, claim: Dict, error_msg: str) -> Dict:
    """Build a fallback UNVERIFIABLE record when claim verification fails."""
    return {
        "claim_id": claim.get('claim_id', ''),
        "original_claim": claim.get('claim', ''),
        "claim_type": claim.get('claim_type', 'unknown'),
        "confidence": None,
        "verification_result": "UNVERIFIABLE",
        "evidence": "",
        "sources": [],
        "verification_method": "error",
        "search_queries_used": [],
        "reasoning": f"Verification failed: {error_msg}",
    }
@classmethod
def _aggregate_parallel_results(
    cls,
    input_data: Data,
    claims: List[Dict],
    verification_results: List[Any],
    start_time: float,
    output_dir: Optional[str],
) -> EvalDetail:
    """
    Aggregate parallel verification results into a final EvalDetail.

    Merges per-claim mini-agent outputs, applies reasoning-verdict
    consistency checks, recalculates the summary, and produces the same
    structured report format as the sequential path.

    Args:
        input_data: Original article Data object
        claims: Extracted claims from Phase 1
        verification_results: Results from asyncio.gather (may contain
            Exception objects because return_exceptions=True)
        start_time: Wall-clock start time for execution_time calculation
        output_dir: Optional path to save artifacts

    Returns:
        EvalDetail with full verification report
    """
    execution_time = time.time() - start_time
    enriched_claims: List[Dict] = []
    all_tool_calls: List[Dict] = []
    total_reasoning_steps = 0

    for claim, outcome in zip(claims, verification_results):
        if isinstance(outcome, Exception):
            record = cls._build_unverifiable_claim_record(claim, str(outcome))
        elif not outcome.get('success', False):
            err = outcome.get('agent_result', {}).get('error', 'unknown error')
            record = cls._build_unverifiable_claim_record(claim, err)
        else:
            agent_result = outcome.get('agent_result', {})
            record = cls._parse_single_claim_result(claim, agent_result)
            all_tool_calls.extend(agent_result.get('tool_calls', []))
            total_reasoning_steps += agent_result.get('reasoning_steps', 0)
        enriched_claims.append(record)

    # Downgrade TRUE verdicts whose reasoning contains hedging language.
    downgraded = cls._check_reasoning_verdict_consistency(enriched_claims)
    if downgraded:
        log.info(f"Consistency check: downgraded {downgraded} TRUE→UNVERIFIABLE")

    summary = cls._recalculate_summary(enriched_claims)

    # Shape the data the way _build_structured_report() expects.
    verification_data: Dict[str, Any] = {
        "article_verification_summary": {
            "article_type": "unknown",
            **summary
        },
        "detailed_findings": enriched_claims,
        "false_claims_comparison": [
            {
                "article_claimed": c["original_claim"],
                "evidence": c.get("evidence", ""),
            }
            for c in enriched_claims
            if c.get("verification_result") == "FALSE"
        ],
    }

    report = cls._build_structured_report(
        verification_data=verification_data,
        extracted_claims=claims,
        enriched_claims=enriched_claims,
        tool_calls=all_tool_calls,
        reasoning_steps=total_reasoning_steps,
        content_length=len(input_data.content or ''),
        execution_time=execution_time,
        claims_source="claims_extractor_direct_async",
    )

    if output_dir:
        cls._save_verification_details(output_dir, enriched_claims)
        cls._save_full_report(output_dir, report)

    # Same EvalDetail structure as the legacy aggregate_results() path.
    return cls._build_eval_detail_from_verification(
        verification_data,
        all_tool_calls,
        total_reasoning_steps,
        report=report,
    )
@classmethod
def _format_agent_input(cls, input_data: Data) -> str:
    """
    Format article content for the agent.

    Args:
        input_data: Data object with content (article text)

    Returns:
        Formatted input string with task instructions
    """
    body = input_data.content

    return f"""Please fact-check the following article comprehensively:

===== ARTICLE START =====
{body}
===== ARTICLE END =====

Your Task:
0. First, analyze the article type (academic/news/product/blog/policy) to guide your verification strategy
1. Extract ALL verifiable claims from this article using claims_extractor tool
2. Verify each claim using autonomous tool selection based on claim type and article context
3. Generate a comprehensive verification report

Begin your systematic fact-checking process now.
"""

@classmethod
def _get_system_prompt(cls, input_data: Data) -> str:
    """Build the system prompt, optionally tailored to the article type."""
    article_type = getattr(input_data, 'article_type', None)
    return PromptTemplates.build(article_type=article_type)
@classmethod
def aggregate_results(cls, input_data: Data, results: List[Any]) -> EvalDetail:
    """
    Parse agent output into a structured EvalDetail report.

    Steps:
    1. Parse the agent's JSON output
    2. Extract claims from tool_calls
    3. Build per-claim verification records
    4. Generate the structured report
    5. Save artifacts to the output directory (when one is configured)
    6. Return EvalDetail with a dual-layer reason (text + structured data)

    Args:
        input_data: Original article data
        results: List containing the agent execution result dictionary

    Returns:
        EvalDetail with comprehensive verification report
    """
    if not results:
        return cls._create_error_result("No results from agent")

    agent_result = results[0]

    # --- Execution-error handling ---
    if not agent_result.get('success', True):
        error_msg = agent_result.get('error', 'Unknown error')
        lowered = error_msg.lower()

        if "recursion limit" in lowered:
            # Custom EvalDetail for recursion-limit failures.
            limit_match = re.search(r'recursion limit of (\d+)', lowered)
            limit = int(limit_match.group(1)) if limit_match else 25

            result = EvalDetail(metric=cls.__name__)
            result.status = True  # True indicates an issue/error
            result.label = [f"{QualityLabel.QUALITY_BAD_PREFIX}AGENT_RECURSION_LIMIT"]
            result.reason = [
                "Article Fact-Checking Failed: Recursion Limit Exceeded",
                "=" * 70,
                f"Agent reached maximum iteration limit ({limit} iterations).",
                "",
                "The article may be too long or contain too many claims to verify.",
                "",
                "Recommendations:",
                f" 1. Increase max_iterations to {limit + 20} in agent_config",
                " 2. Reduce max_claims from 50 to 20-30 in claims_extractor",
                " 3. Use a shorter article or split into sections",
                "",
                "See detailed execution trace in ERROR logs above."
            ]
            return result

        if "timed out" in lowered or "timeout" in lowered:
            # Custom EvalDetail for timeouts.
            result = EvalDetail(metric=cls.__name__)
            result.status = True
            result.label = [f"{QualityLabel.QUALITY_BAD_PREFIX}AGENT_TIMEOUT"]
            result.reason = [
                "Article Fact-Checking Failed: Request Timeout",
                "=" * 70,
                "Request timed out during fact-checking.",
                "",
                "Possible causes:",
                " - LLM API is responding slowly",
                " - Article is too long to process",
                " - Network connectivity issues",
                "",
                "Recommendations:",
                " 1. Switch to faster model (e.g., gpt-4o-mini instead of deepseek-chat)",
                " 2. Reduce article length (try shorter articles first)",
                " 3. Reduce max_claims in claims_extractor (from 50 to 20-30)",
                " 4. Check API response time and network connection",
                "",
                "See detailed execution trace in ERROR logs above (if available)."
            ]
            return result

        # Any other failure uses the default error template.
        return cls._create_error_result(error_msg)

    # --- Extract and validate agent output ---
    output = agent_result.get('output', '')
    tool_calls = agent_result.get('tool_calls', [])
    reasoning_steps = agent_result.get('reasoning_steps', 0)

    if not output or not output.strip():
        return cls._create_error_result(
            "Agent returned empty output. "
            "This may indicate the agent reached max_iterations without completing."
        )

    try:
        verification_data = cls._parse_verification_output(output)
    except Exception as e:
        return cls._create_error_result(
            f"Failed to parse agent output: {str(e)}\nOutput: {output[:300]}..."
        )

    # --- Claims extraction with fallback to agent reasoning ---
    extracted_claims = cls._extract_claims_from_tool_calls(tool_calls)
    claims_source = "claims_extractor_tool"
    if not extracted_claims:
        extracted_claims = cls._extract_claims_from_detailed_findings(verification_data)
        claims_source = "agent_reasoning"
        if extracted_claims:
            log.info(f"Claims from agent reasoning (fallback): {len(extracted_claims)}")

    enriched_claims = cls._build_per_claim_verification(
        verification_data, extracted_claims, tool_calls
    )

    # Normalize verdicts to the standard TRUE/FALSE/UNVERIFIABLE set.
    for claim in enriched_claims:
        claim["verification_result"] = cls._normalize_verdict(claim.get("verification_result", ""))

    # Code-level consistency check: hedged reasoning contradicting TRUE verdicts.
    downgraded = cls._check_reasoning_verdict_consistency(enriched_claims)
    if downgraded:
        log.info(f"Reasoning-verdict consistency check: {downgraded} verdict(s) downgraded")

    # Recalculate the summary from real data, overriding the agent's own numbers.
    if enriched_claims:
        recalculated = cls._recalculate_summary(enriched_claims)
        original_summary = verification_data.get("article_verification_summary", {})
        verification_data["article_verification_summary"] = {
            "article_type": original_summary.get("article_type", "unknown"),
            **recalculated
        }

    # Note: this legacy path is only reached if someone calls aggregate_results()
    # directly (bypassing the overridden eval()). Timing metadata is unavailable
    # here; use the async eval() path for accurate execution_time and artifact saving.
    execution_time = 0.0
    content_length = len(getattr(input_data, 'content', '') or '')
    output_dir = None

    report = cls._build_structured_report(
        verification_data=verification_data,
        extracted_claims=extracted_claims,
        enriched_claims=enriched_claims,
        tool_calls=tool_calls,
        reasoning_steps=reasoning_steps,
        content_length=content_length,
        execution_time=execution_time,
        claims_source=claims_source
    )

    if output_dir:
        try:
            if extracted_claims:
                cls._save_claims(output_dir, extracted_claims)
            if enriched_claims:
                cls._save_verification_details(output_dir, enriched_claims)
            cls._save_full_report(output_dir, report)
        except Exception as e:
            log.warning(f"Failed to save some output artifacts: {e}")

    return cls._build_eval_detail_from_verification(
        verification_data,
        tool_calls,
        reasoning_steps,
        report=report
    )
@classmethod
def _parse_verification_output(cls, output: str) -> Dict[str, Any]:
    """
    Parse agent output to extract verification data.

    Supports multiple formats with enhanced fallback parsing:
    1. JSON in code block (```json ... ```)
    2. JSON in generic code block (``` ... ```)
    3. Raw JSON object
    4. Partial JSON extraction
    5. Text analysis fallback with pattern matching

    Args:
        output: Agent's text output

    Returns:
        Parsed verification data dictionary

    Note:
        Never raises - always returns a valid structure with raw_output for debugging
    """
    # Strategy 1: Extract JSON from ```json code block
    json_match = re.search(
        r'```json\s*(\{.*?\})\s*```',
        output,
        re.DOTALL | re.IGNORECASE
    )
    if json_match:
        try:
            return json.loads(json_match.group(1))
        except json.JSONDecodeError as e:
            log.debug(f"Failed to parse ```json block: {e}")

    # Strategy 2: Extract JSON from generic ``` code block
    generic_block_match = re.search(r'```\s*(\{.*?\})\s*```', output, re.DOTALL)
    if generic_block_match:
        try:
            return json.loads(generic_block_match.group(1))
        except json.JSONDecodeError as e:
            log.debug(f"Failed to parse generic code block: {e}")

    # Strategy 3: Try direct JSON parsing (entire output is JSON)
    try:
        return json.loads(output.strip())
    except json.JSONDecodeError:
        pass

    # Strategy 4: Find a JSON object with the expected summary key anywhere in text
    json_object_match = re.search(
        r'(\{[^{}]*"article_verification_summary"[^{}]*\{[^{}]*\}[^{}]*\})',
        output,
        re.DOTALL
    )
    if json_object_match:
        try:
            return json.loads(json_object_match.group(1))
        except json.JSONDecodeError:
            pass

    # Strategy 5: Try every balanced { } block, largest first
    brace_positions = []
    depth = 0
    start_pos = None
    for i, char in enumerate(output):
        if char == '{':
            if depth == 0:
                start_pos = i
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0 and start_pos is not None:
                brace_positions.append((start_pos, i + 1))
                start_pos = None

    for start, end in sorted(brace_positions, key=lambda x: x[1] - x[0], reverse=True):
        try:
            parsed = json.loads(output[start:end])
            if isinstance(parsed, dict) and ("article_verification_summary" in parsed or "total_claims" in parsed):
                return parsed
        except json.JSONDecodeError:
            continue

    # Strategy 6: Enhanced text analysis fallback
    log.warning("Failed to parse as JSON, creating fallback structure from text analysis")

    patterns = {
        'total': [
            r'total[_\s]*claims?[:\s]*(\d+)',
            r'"total_claims"[:\s]*(\d+)',
            r'(\d+)\s*(?:total\s+)?claims?\s+(?:analyzed|extracted|found)',
        ],
        'false': [
            r'false[_\s]*claims?[:\s]*(\d+)',
            r'"false_claims"[:\s]*(\d+)',
            r'(\d+)\s*(?:false|incorrect|inaccurate)\s+claims?',
        ],
        'verified': [
            r'verified[_\s]*claims?[:\s]*(\d+)',
            r'"verified_claims"[:\s]*(\d+)',
            r'(\d+)\s*(?:verified|true|accurate)\s+claims?',
        ],
        'unverifiable': [
            r'unverifiable[_\s]*claims?[:\s]*(\d+)',
            r'"unverifiable_claims"[:\s]*(\d+)',
            r'(\d+)\s*(?:unverifiable|unknown|unclear)\s+claims?',
        ],
        'accuracy': [
            r'accuracy[_\s]*(?:score)?[:\s]*([\d.]+)',
            r'"accuracy_score"[:\s]*([\d.]+)',
            r'overall\s+accuracy[:\s]*([\d.]+)',
        ],
        'article_type': [
            r'"article_type"[:\s]*"(\w+)"',
            r'article\s+type[:\s]*(\w+)',
        ]
    }

    def extract_first_match(pattern_list: List[str], default=None):
        for pattern in pattern_list:
            match = re.search(pattern, output, re.IGNORECASE)
            if match:
                return match.group(1)
        return default

    total = int(extract_first_match(patterns['total'], '0'))
    false = int(extract_first_match(patterns['false'], '0'))
    # BUGFIX: the original used `extract_first_match(..., '0') or (total - false)`
    # — the default '0' is a truthy string, so the intended `total - false`
    # fallback was unreachable and verified silently defaulted to 0.
    verified_raw = extract_first_match(patterns['verified'])
    verified = int(verified_raw) if verified_raw is not None else total - false
    unverifiable = int(extract_first_match(patterns['unverifiable'], '0'))
    # BUGFIX: likewise, defaulting accuracy_str to '0' meant float('0') always
    # succeeded and the verified/total ratio fallback below never fired.
    accuracy_str = extract_first_match(patterns['accuracy'])
    article_type = extract_first_match(patterns['article_type'], 'unknown')

    # Parse accuracy (handle both 0.95 and 95% formats); on a missing or
    # malformed value, fall back to the verified/total ratio.
    try:
        accuracy = float(accuracy_str)
        if accuracy > 1.0:  # Likely percentage format
            accuracy = accuracy / 100.0
    except (ValueError, TypeError):
        accuracy = verified / total if total > 0 else 0.0

    # Extract false claims details if present
    false_claims_comparison = []
    claim_pattern = r'(?:claim|error|false)[:\s]*["\']?([^"\']+)["\']?\s*(?:→|->|:)\s*["\']?([^"\']+)["\']?'
    claim_matches = re.findall(claim_pattern, output, re.IGNORECASE)
    for claimed, truth in claim_matches[:5]:  # Limit to 5 claims
        false_claims_comparison.append({
            "article_claimed": claimed.strip(),
            "actual_truth": truth.strip(),
        })

    return {
        "article_verification_summary": {
            "article_type": article_type,
            "total_claims": total,
            "verified_claims": verified,
            "false_claims": false,
            "unverifiable_claims": unverifiable,
            "accuracy_score": accuracy
        },
        "false_claims_comparison": false_claims_comparison,
        "raw_output": output,  # Include raw output for debugging
        "parse_method": "text_analysis_fallback"
    }
@classmethod
def _build_eval_detail_from_verification(
    cls,
    verification_data: Dict[str, Any],
    tool_calls: List,
    reasoning_steps: int,
    report: Optional[Dict[str, Any]] = None
) -> EvalDetail:
    """
    Build EvalDetail from parsed verification data with a dual-layer reason.

    reason[0] is a human-readable text summary string.
    reason[1] is the full structured report dict (JSON-serializable).

    Args:
        verification_data: Parsed verification results
        tool_calls: List of tool calls made by agent
        reasoning_steps: Number of reasoning steps taken
        report: Optional structured report dict from _build_structured_report

    Returns:
        EvalDetail with comprehensive report
    """
    summary = verification_data.get("article_verification_summary", {})
    total = summary.get("total_claims", 0)
    false_count = summary.get("false_claims", 0)
    unverifiable_count = summary.get("unverifiable_claims", 0)
    verified = summary.get("verified_claims", 0)
    accuracy = summary.get("accuracy_score", 0.0)

    # Binary status aligned with Dingo's evaluation model:
    # TRUE claims → good; FALSE / UNVERIFIABLE claims → bad (issue detected).
    # Unverifiable claims indicate sourcing deficiencies, which is a data
    # quality problem (consistent with journalism standards).
    result = EvalDetail(metric=cls.__name__)
    result.status = (false_count + unverifiable_count) > 0
    result.score = accuracy
    if false_count > 0:
        result.label = [f"{QualityLabel.QUALITY_BAD_PREFIX}ARTICLE_FACTUAL_ERROR"]
    elif unverifiable_count > 0:
        result.label = [f"{QualityLabel.QUALITY_BAD_PREFIX}ARTICLE_UNVERIFIED_CLAIMS"]
    else:
        result.label = [QualityLabel.QUALITY_GOOD]

    # --- Human-readable text summary ---
    lines = [
        "Article Fact-Checking Report",
        "=" * 70,
        f"Total Claims Analyzed: {total}",
        f"Verified Claims: {verified}",
        f"False Claims: {false_count}",
        f"Unverifiable Claims: {unverifiable_count}",
        f"Overall Accuracy: {accuracy:.1%}",
        "",
        "Agent Performance:",
        f" Tool Calls: {len(tool_calls)}",
        f" Reasoning Steps: {reasoning_steps}",
        ""
    ]

    # False-claims comparison table.
    false_claims = verification_data.get("false_claims_comparison", [])
    if false_claims:
        lines.append("FALSE CLAIMS DETAILED COMPARISON:")
        lines.append("=" * 70)
        for i, fc in enumerate(false_claims, 1):
            lines.extend([
                f"\n#{i} FALSE CLAIM",
                " Article Claimed:",
                f" {fc.get('article_claimed', 'N/A')}",
                " Actual Truth:",
                f" {fc.get('actual_truth', 'N/A')}",
                " Evidence:",
                f" {fc.get('evidence', 'N/A')}",
            ])

    # Per-verdict tally over detailed findings.
    detailed = verification_data.get("detailed_findings", [])
    if detailed:
        lines.append("\n\nALL CLAIMS VERIFICATION SUMMARY:")
        lines.append("=" * 70)
        result_counts = Counter(f.get("verification_result", "UNKNOWN") for f in detailed)
        for result_type, count in result_counts.items():
            lines.append(f" {result_type}: {count} claims")

        # Show sample false claims (only when there are 5 or fewer).
        false_findings = [f for f in detailed if f.get("verification_result") == "FALSE"]
        if false_findings and len(false_findings) <= 5:
            lines.append("\n False Claims Details:")
            for finding in false_findings[:5]:
                lines.append(
                    f" - {finding.get('claim_id')}: {finding.get('original_claim', '')[:80]}..."
                )

    # Raw output excerpt for debugging, when the fallback parser kept it.
    if "raw_output" in verification_data:
        lines.extend([
            "",
            "DEBUG: Raw Agent Output (first 500 chars):",
            verification_data["raw_output"][:500] + "..."
        ])

    # Dual-layer reason: [text_summary, structured_report]
    result.reason = ["\n".join(lines)]
    if report:
        result.reason.append(report)

    return result
@classmethod
def _create_error_result(cls, error_message: str) -> EvalDetail:
    """
    Create an error result for agent failures.

    Args:
        error_message: Description of the error

    Returns:
        EvalDetail with error status
    """
    result = EvalDetail(metric=cls.__name__)
    result.status = True  # True indicates an issue/error
    result.label = [f"{QualityLabel.QUALITY_BAD_PREFIX}AGENT_ERROR"]
    result.reason = [
        "Article Fact-Checking Failed",
        "=" * 70,
        f"Error: {error_message}",
        "",
        "Possible causes:",
        "- Agent exceeded max_iterations without completing",
        "- LLM failed to follow output format instructions",
        "- Tool execution errors (API failures, rate limits)",
        "- Invalid or empty article content",
        "",
        "Troubleshooting:",
        "1. Check agent configuration (API keys, max_iterations)",
        "2. Verify article content is valid and non-empty",
        "3. Check tool configurations (claims_extractor, arxiv_search, tavily_search)",
        "4. Review agent logs for detailed error messages"
    ]
    return result

@classmethod
def plan_execution(cls, input_data: Data) -> List[Dict[str, Any]]:
    """
    Not used when use_agent_executor=True.

    The LangChain agent autonomously plans its execution using the ReAct
    pattern; this method is only called for the legacy agent path
    (use_agent_executor=False).

    Args:
        input_data: Input data (unused)

    Returns:
        Empty list (no manual planning needed)
    """
    return []
- ) - """ - try: - from langchain.agents import create_agent - except ImportError as e: - error_msg = ( - "LangChain is not installed but required for agent creation.\n\n" - "Install with:\n" - " pip install -r requirements/agent.txt\n" - "Or:\n" - " pip install 'dingo-python[agent]'" - ) - log.error(error_msg) - raise ImportError(error_msg) from e - - try: - # Create agent using LangChain 1.0 API - agent = create_agent( - model=llm, - tools=tools, - system_prompt=system_prompt or "You are a helpful assistant with access to tools.", - debug=config.get("debug", False) - ) - - log.debug( - f"Created agent with {len(tools)} tools using langchain.agents.create_agent" - ) - return agent - - except Exception as e: - log.error(f"Failed to create agent: {e}") - raise - - @staticmethod - def invoke_and_format( - agent, - input_text: str, - input_data: Optional[Any] = None, - max_iterations: Optional[int] = None - ) -> Dict[str, Any]: - """ - Invoke agent and format output for Dingo. - - Args: - agent: Compiled agent (from create_agent) - input_text: Text to pass to agent - input_data: Optional Data object for context - max_iterations: Maximum reasoning iterations (default: 25) - In LangChain 1.0, this is passed as 'recursion_limit' to the agent - - Returns: - Dict with: - - output: str (agent's final response) - - messages: List[Message] (full conversation) - - tool_calls: List[Dict] (parsed tool invocations) - - success: bool - - Example: - result = AgentWrapper.invoke_and_format( - agent, - input_text="Is Paris the capital of France?", - input_data=data_obj, - max_iterations=10 - ) - - Note: - In LangChain 1.0, iteration limits are controlled by recursion_limit, - which is passed at invocation time rather than during agent creation. 
- """ - try: - # Build config dict for agent invocation - config = {} - if max_iterations is not None: - # LangChain 1.0 uses 'recursion_limit' instead of 'max_iterations' - config["recursion_limit"] = max_iterations - log.debug(f"Setting recursion_limit={max_iterations}") - - # Invoke agent with message-based input and config - if config: - result = agent.invoke( - {"messages": [("user", input_text)]}, - config - ) - else: - # No config needed, use default recursion_limit (25) - result = agent.invoke({ - "messages": [("user", input_text)] - }) - - # Extract messages from result - messages = result.get('messages', []) - - # Get final output (last AI message) - output = "" - if messages: - last_message = messages[-1] - output = getattr(last_message, 'content', str(last_message)) - - # Parse tool calls from messages - tool_calls = AgentWrapper._extract_tool_calls(messages) - - # Count reasoning steps (messages between user input and final response) - reasoning_steps = len([m for m in messages if hasattr(m, 'type') and m.type == 'ai']) - - formatted_result = { - 'output': output, - 'messages': messages, - 'tool_calls': tool_calls, - 'reasoning_steps': reasoning_steps, - 'success': True - } - - log.debug( - f"Agent execution completed: {len(tool_calls)} tool calls, " - f"{reasoning_steps} reasoning steps" - ) - - return formatted_result - - except Exception as e: - log.error(f"Agent invocation failed: {e}") - return { - 'output': '', - 'messages': [], - 'tool_calls': [], - 'reasoning_steps': 0, - 'success': False, - 'error': str(e) - } - - @staticmethod - def _extract_tool_calls(messages: List) -> List[Dict[str, Any]]: - """ - Extract tool calls from message sequence. - - Parses AIMessage objects with tool_calls and their corresponding - ToolMessage responses. 
- - Args: - messages: List of message objects - - Returns: - List of dicts with tool, args, observation - """ - tool_calls = [] - - try: - from langchain_core.messages import AIMessage, ToolMessage - - for i, message in enumerate(messages): - # Check if AI message has tool calls - if isinstance(message, AIMessage) and hasattr(message, 'tool_calls'): - for tool_call in message.tool_calls: - # Find corresponding tool response - observation = "" - if i + 1 < len(messages) and isinstance(messages[i + 1], ToolMessage): - observation = messages[i + 1].content - - tool_calls.append({ - 'tool': tool_call.get('name', 'unknown'), - 'args': tool_call.get('args', {}), - 'observation': observation - }) - - except ImportError: - # Fallback if langchain_core not available - log.warning("Could not import langchain_core for tool call extraction") - - except Exception as e: - log.warning(f"Error extracting tool calls: {e}") - - return tool_calls - - @staticmethod - def get_openai_llm_from_dingo_config(dynamic_config): - """ - Create LangChain ChatOpenAI LLM from Dingo's dynamic_config. - - Args: - dynamic_config: BaseOpenAI.dynamic_config (EvaluatorLLMArgs) - - Returns: - LangChain ChatOpenAI instance - - Note: - This wraps Dingo's existing client creation pattern - for use with LangChain's agent framework. 
- - Example: - llm = AgentWrapper.get_openai_llm_from_dingo_config( - agent.dynamic_config - ) - """ - try: - from langchain_openai import ChatOpenAI - except ImportError as e: - error_msg = ( - "langchain-openai is not installed but required for LLM integration.\n\n" - "Install with:\n" - " pip install -r requirements/agent.txt\n" - "Or:\n" - " pip install 'dingo-python[agent]'" - ) - log.error(error_msg) - raise ImportError(error_msg) from e - - if not hasattr(dynamic_config, 'key') or not dynamic_config.key: - raise ValueError( - "dynamic_config must have 'key' (API key) for LLM" - ) - - if not hasattr(dynamic_config, 'api_url') or not dynamic_config.api_url: - raise ValueError( - "dynamic_config must have 'api_url' (base URL) for LLM" - ) - - # Extract parameters - params = dynamic_config.parameters or {} - - # Create ChatOpenAI instance - llm = ChatOpenAI( - api_key=dynamic_config.key, - base_url=dynamic_config.api_url, - model=dynamic_config.model or "gpt-4.1-mini", - temperature=params.get("temperature", 0.3), - max_tokens=params.get("max_tokens", 1000), # Lower default to avoid context length issues - top_p=params.get("top_p", 1.0), - timeout=params.get("timeout", 30) - ) - - log.debug( - f"Created ChatOpenAI: model={dynamic_config.model}, " - f"temp={params.get('temperature', 0.3)}" - ) - - return llm +""" +Agent Wrapper for Dingo Agents (LangChain 1.0) + +Wraps LangChain's create_agent to work with Dingo's agent patterns. +Uses the modern LangChain 1.0 API (released November 2025). + +Key Changes from AgentExecutor: +- Uses langchain.agents.create_agent (built on LangGraph) +- Returns CompiledStateGraph instead of AgentExecutor +- Message-based invocation interface +- Built-in persistence and checkpointing support +""" + +from typing import Any, Dict, List, Optional + +from dingo.utils import log + + +class AgentWrapper: + """ + Wrapper that integrates LangChain 1.0 create_agent with Dingo agents. 
+ + Handles: + - Tool conversion from Dingo to LangChain format + - Agent creation using create_agent + - Result parsing from message-based output to Dingo structures + - Configuration and logging + """ + + @staticmethod + def create_agent( + llm, + tools: List, + system_prompt: Optional[str] = None, + **config + ): + """ + Create a LangChain agent using langchain.agents.create_agent. + + Args: + llm: LangChain LLM instance (ChatOpenAI) + tools: List of LangChain StructuredTools + system_prompt: Optional system message + **config: Additional configuration (debug, middleware, etc.) + + Returns: + CompiledStateGraph (LangGraph agent) + + Example: + llm = AgentWrapper.get_openai_llm_from_dingo_config(config) + tools = convert_dingo_tools(["tavily_search"], agent) + agent = AgentWrapper.create_agent( + llm=llm, + tools=tools, + system_prompt="You are a fact-checking agent..." + ) + """ + try: + from langchain.agents import create_agent + except ImportError as e: + error_msg = ( + "LangChain is not installed but required for agent creation.\n\n" + "Install with:\n" + " pip install -r requirements/agent.txt\n" + "Or:\n" + " pip install 'dingo-python[agent]'" + ) + log.error(error_msg) + raise ImportError(error_msg) from e + + try: + # Create agent using LangChain 1.0 API + agent = create_agent( + model=llm, + tools=tools, + system_prompt=system_prompt or "You are a helpful assistant with access to tools.", + debug=config.get("debug", False) + ) + + log.debug( + f"Created agent with {len(tools)} tools using langchain.agents.create_agent" + ) + return agent + + except Exception as e: + log.error(f"Failed to create agent: {e}") + raise + + @staticmethod + def invoke_and_format( + agent, + input_text: str, + input_data: Optional[Any] = None, + max_iterations: Optional[int] = None + ) -> Dict[str, Any]: + """ + Invoke agent and format output for Dingo. 
+ + Args: + agent: Compiled agent (from create_agent) + input_text: Text to pass to agent + input_data: Optional Data object for context + max_iterations: Maximum reasoning iterations (default: 25) + In LangChain 1.0, this is passed as 'recursion_limit' to the agent + + Returns: + Dict with: + - output: str (agent's final response) + - messages: List[Message] (full conversation) + - tool_calls: List[Dict] (parsed tool invocations) + - success: bool + + Example: + result = AgentWrapper.invoke_and_format( + agent, + input_text="Is Paris the capital of France?", + input_data=data_obj, + max_iterations=10 + ) + + Note: + In LangChain 1.0, iteration limits are controlled by recursion_limit, + which is passed at invocation time rather than during agent creation. + """ + try: + # Build config dict for agent invocation + config = {} + if max_iterations is not None: + # LangChain 1.0 uses 'recursion_limit' instead of 'max_iterations' + config["recursion_limit"] = max_iterations + log.debug(f"Setting recursion_limit={max_iterations}") + + # Invoke agent with message-based input and config + if config: + result = agent.invoke( + {"messages": [("user", input_text)]}, + config + ) + else: + # No config needed, use default recursion_limit (25) + result = agent.invoke({ + "messages": [("user", input_text)] + }) + + formatted_result = AgentWrapper._format_agent_result(result) + log.debug( + f"Agent execution completed: {len(formatted_result['tool_calls'])} tool calls, " + f"{formatted_result['reasoning_steps']} reasoning steps" + ) + return formatted_result + + except Exception as e: + log.error(f"Agent invocation failed: {e}") + return AgentWrapper._make_error_result(str(e)) + + @staticmethod + async def async_invoke_and_format( + agent, + input_text: str, + input_data: Optional[Any] = None, + max_iterations: Optional[int] = None + ) -> Dict[str, Any]: + """ + Async version of invoke_and_format using agent.ainvoke(). 
+ + Used for concurrent claim verification in ArticleFactChecker's + two-phase parallel architecture. + + Args: + agent: Compiled agent (from create_agent) + input_text: Text to pass to agent + input_data: Optional Data object for context (unused, kept for API parity) + max_iterations: Maximum reasoning iterations (recursion_limit) + + Returns: + Dict with output, messages, tool_calls, reasoning_steps, success + """ + try: + config = {} + if max_iterations is not None: + config["recursion_limit"] = max_iterations + + if config: + result = await agent.ainvoke( + {"messages": [("user", input_text)]}, + config + ) + else: + result = await agent.ainvoke({"messages": [("user", input_text)]}) + + formatted_result = AgentWrapper._format_agent_result(result) + log.debug( + f"Async agent execution completed: {len(formatted_result['tool_calls'])} tool calls, " + f"{formatted_result['reasoning_steps']} reasoning steps" + ) + return formatted_result + + except Exception as e: + log.error(f"Async agent invocation failed: {e}") + return AgentWrapper._make_error_result(str(e)) + + @staticmethod + def _format_agent_result(result: Dict) -> Dict[str, Any]: + """ + Convert raw agent invocation result into Dingo's standard output format. + + Shared by both invoke_and_format (sync) and async_invoke_and_format (async) + to avoid duplication of message-parsing logic. 
+ + Args: + result: Raw dict returned by agent.invoke() / agent.ainvoke() + + Returns: + Dict with output, messages, tool_calls, reasoning_steps, success=True + """ + messages = result.get('messages', []) + output = "" + if messages: + last_message = messages[-1] + output = getattr(last_message, 'content', str(last_message)) + tool_calls = AgentWrapper._extract_tool_calls(messages) + reasoning_steps = len([m for m in messages if hasattr(m, 'type') and m.type == 'ai']) + return { + 'output': output, + 'messages': messages, + 'tool_calls': tool_calls, + 'reasoning_steps': reasoning_steps, + 'success': True, + } + + @staticmethod + def _make_error_result(error: str) -> Dict[str, Any]: + """Build a standard error result dict for failed agent invocations.""" + return { + 'output': '', + 'messages': [], + 'tool_calls': [], + 'reasoning_steps': 0, + 'success': False, + 'error': error, + } + + @staticmethod + def _extract_tool_calls(messages: List) -> List[Dict[str, Any]]: + """ + Extract tool calls from message sequence. + + Parses AIMessage objects with tool_calls and their corresponding + ToolMessage responses. 
+ + Args: + messages: List of message objects + + Returns: + List of dicts with tool, args, observation + """ + tool_calls = [] + + try: + from langchain_core.messages import AIMessage, ToolMessage + + for i, message in enumerate(messages): + # Check if AI message has tool calls + if isinstance(message, AIMessage) and hasattr(message, 'tool_calls'): + for tool_call in message.tool_calls: + # Find corresponding tool response + observation = "" + if i + 1 < len(messages) and isinstance(messages[i + 1], ToolMessage): + observation = messages[i + 1].content + + tool_calls.append({ + 'tool': tool_call.get('name', 'unknown'), + 'args': tool_call.get('args', {}), + 'observation': observation + }) + + except ImportError: + # Fallback if langchain_core not available + log.warning("Could not import langchain_core for tool call extraction") + + except Exception as e: + log.warning(f"Error extracting tool calls: {e}") + + return tool_calls + + @staticmethod + def get_openai_llm_from_dingo_config(dynamic_config): + """ + Create LangChain ChatOpenAI LLM from Dingo's dynamic_config. + + Args: + dynamic_config: BaseOpenAI.dynamic_config (EvaluatorLLMArgs) + + Returns: + LangChain ChatOpenAI instance + + Note: + This wraps Dingo's existing client creation pattern + for use with LangChain's agent framework. 
+ + Example: + llm = AgentWrapper.get_openai_llm_from_dingo_config( + agent.dynamic_config + ) + """ + try: + from langchain_openai import ChatOpenAI + except ImportError as e: + error_msg = ( + "langchain-openai is not installed but required for LLM integration.\n\n" + "Install with:\n" + " pip install -r requirements/agent.txt\n" + "Or:\n" + " pip install 'dingo-python[agent]'" + ) + log.error(error_msg) + raise ImportError(error_msg) from e + + if not hasattr(dynamic_config, 'key') or not dynamic_config.key: + raise ValueError( + "dynamic_config must have 'key' (API key) for LLM" + ) + + if not hasattr(dynamic_config, 'api_url') or not dynamic_config.api_url: + raise ValueError( + "dynamic_config must have 'api_url' (base URL) for LLM" + ) + + # Extract parameters + params = dynamic_config.parameters or {} + + # Create ChatOpenAI instance + llm = ChatOpenAI( + api_key=dynamic_config.key, + base_url=dynamic_config.api_url, + model=dynamic_config.model or "gpt-4.1-mini", + temperature=params.get("temperature", 0.3), + max_tokens=params.get("max_tokens", 4096), + top_p=params.get("top_p", 1.0), + timeout=params.get("timeout", 30) + ) + + log.debug( + f"Created ChatOpenAI: model={dynamic_config.model}, " + f"temp={params.get('temperature', 0.3)}" + ) + + return llm diff --git a/dingo/model/llm/agent/tools/arxiv_search.py b/dingo/model/llm/agent/tools/arxiv_search.py new file mode 100644 index 00000000..5d946602 --- /dev/null +++ b/dingo/model/llm/agent/tools/arxiv_search.py @@ -0,0 +1,472 @@ +""" +arXiv Search Tool + +This module provides integration with arXiv API for academic paper search and verification. +arXiv is a free distribution service and open-access archive for scholarly articles in +the fields of physics, mathematics, computer science, and more. 
+ +Dependencies: + arxiv>=2.4.0 + +Configuration: + max_results: Maximum number of search results (default: 5, range: 1-50) + sort_by: Sort order - "relevance", "lastUpdatedDate", or "submittedDate" (default: "relevance") + sort_order: "ascending" or "descending" (default: "descending") + rate_limit_delay: Delay between requests in seconds (default: 3.0) + timeout: Request timeout in seconds (default: 30) + api_key: Not required for arXiv (public API) +""" + +import re +import threading +import time +from typing import Any, Dict, List, Optional + +from pydantic import Field + +from dingo.io.input import RequiredField +from dingo.model.llm.agent.tools.base_tool import BaseTool, ToolConfig +from dingo.model.llm.agent.tools.tool_registry import tool_register +from dingo.utils import log + + +class ArxivConfig(ToolConfig): + """Configuration for arXiv search tool""" + api_key: Optional[str] = None # Override parent - not needed for arXiv + max_results: int = Field(default=5, ge=1, le=50) + sort_by: str = Field(default="relevance", pattern="^(relevance|lastUpdatedDate|submittedDate)$") + sort_order: str = Field(default="descending", pattern="^(ascending|descending)$") + rate_limit_delay: float = Field(default=3.0, ge=0.0) + timeout: int = Field(default=30, ge=1) + + +@tool_register +class ArxivSearch(BaseTool): + """ + arXiv search tool for academic paper verification. + + Provides search capabilities for academic papers in arXiv's open-access archive. + Supports searching by arXiv ID, DOI, title, author, and keywords with automatic + detection of query type. 
+ + Features: + - Auto-detection of arXiv IDs and DOIs + - No API key required (public API) + - Rate limiting to respect arXiv guidelines + - Support for multiple search modes + - Comprehensive paper metadata + + arXiv ID Patterns: + - New format: 2301.12345 or 2301.12345v1 (with version) + - Old format: hep-ph/0123456 or hep-ph/0123456v1 + + DOI Pattern: + - Standard DOI: 10.1234/example.doi + + Usage: + # Auto-detect search type + result = ArxivSearch.execute(query="1706.03762") + + # Explicit search by title + result = ArxivSearch.execute( + query="Attention is All You Need", + search_type="title" + ) + + # Result structure: + { + 'success': True, + 'query': '1706.03762', + 'search_type': 'arxiv_id', + 'results': [ + { + 'arxiv_id': '1706.03762', + 'title': 'Attention is All You Need', + 'authors': ['Vaswani, Ashish', ...], + 'summary': 'We propose a new...', + 'published': '2017-06-12', + 'updated': '2017-12-06', + 'pdf_url': 'http://arxiv.org/pdf/1706.03762v5', + 'doi': '10.48550/arXiv.1706.03762', + 'categories': ['cs.CL', 'cs.LG'], + 'journal_ref': 'NIPS 2017' + }, + ... + ] + } + """ + + name = "arxiv_search" + description = ( + "Search arXiv for academic papers by ID, DOI, title, or author. " + "Returns comprehensive paper metadata including title, authors, abstract, " + "publication date, PDF URL, and citations. Useful for verifying academic " + "claims, finding research papers, and checking paper details." + ) + config: ArxivConfig = ArxivConfig() + + _required_fields = [RequiredField.CONTENT] + _last_request_time: float = 0.0 + _rate_limit_lock: threading.Lock = threading.Lock() + + @classmethod + def execute(cls, query: str, search_type: str = "auto", **kwargs) -> Dict[str, Any]: + """ + Execute arXiv search. 
+ + Args: + query: Search query string (arXiv ID, DOI, title, author, or keywords) + search_type: Search mode - "auto", "id", "doi", "title", "author" (default: "auto") + **kwargs: Optional overrides for configuration + - max_results: Override max_results config + - sort_by: Override sort_by config + - sort_order: Override sort_order config + + Returns: + Dict with search results: + { + 'success': bool, + 'query': str, + 'search_type': str, + 'results': List[Dict], + 'count': int + } + + Raises: + ImportError: If arxiv library is not installed + ValueError: If query is empty or search_type is invalid + Exception: For API errors + """ + # Validate inputs + if not query or not query.strip(): + log.error("arXiv search query cannot be empty") + return { + 'success': False, + 'error': 'Search query cannot be empty', + 'query': query + } + + valid_search_types = ["auto", "id", "doi", "title", "author"] + if search_type not in valid_search_types: + log.error(f"Invalid search_type: {search_type}") + return { + 'success': False, + 'error': f'Invalid search_type. Must be one of: {", ".join(valid_search_types)}', + 'query': query + } + + # Import arxiv library (lazy import) + try: + import arxiv + except ImportError: + error_msg = ( + "arxiv library is not installed but required for arXiv search.\n\n" + "Install with:\n" + " pip install -r requirements/agent.txt\n" + "Or:\n" + " pip install arxiv\n" + "Or:\n" + " pip install 'dingo-python[agent]'" + ) + log.error(error_msg) + return { + 'success': False, + 'error': error_msg, + 'query': query, + 'error_type': 'DependencyError' + } + + # Apply rate limiting + cls._apply_rate_limiting() + + # Execute search + try: + log.info(f"Executing arXiv search: {query[:100]}... 
(type: {search_type})") + + # Build search query based on type + detected_type, arxiv_query = cls._build_arxiv_query(query, search_type) + + # Get configuration + max_results = kwargs.get('max_results', cls.config.max_results) + sort_by_str = kwargs.get('sort_by', cls.config.sort_by) + sort_order_str = kwargs.get('sort_order', cls.config.sort_order) + + # Map sort_by string to arxiv.SortCriterion + sort_by_map = { + 'relevance': arxiv.SortCriterion.Relevance, + 'lastUpdatedDate': arxiv.SortCriterion.LastUpdatedDate, + 'submittedDate': arxiv.SortCriterion.SubmittedDate + } + sort_by = sort_by_map.get(sort_by_str, arxiv.SortCriterion.Relevance) + + # Map sort_order string to arxiv.SortOrder + sort_order_map = { + 'ascending': arxiv.SortOrder.Ascending, + 'descending': arxiv.SortOrder.Descending + } + sort_order = sort_order_map.get(sort_order_str, arxiv.SortOrder.Descending) + + # Create search + search = arxiv.Search( + query=arxiv_query, + max_results=max_results, + sort_by=sort_by, + sort_order=sort_order + ) + + # Execute search and collect results + results = [] + client = arxiv.Client() + for paper in client.results(search): + results.append(cls._format_paper(paper)) + + # Format response + result = { + 'success': True, + 'query': query, + 'search_type': detected_type, + 'results': results, + 'count': len(results) + } + + log.info(f"arXiv search successful: {len(results)} results") + return result + + except Exception as e: + log.error(f"arXiv search failed: {e}") + + # Sanitize error message to prevent information disclosure + error_str = str(e).lower() + if "timeout" in error_str: + error_msg = "Search request timed out" + elif "network" in error_str or "connection" in error_str: + error_msg = "Network connection error" + elif "rate limit" in error_str: + error_msg = "Rate limit exceeded" + else: + error_msg = f"Search failed: {type(e).__name__}" + + return { + 'success': False, + 'error': error_msg, + 'query': query, + 'error_type': type(e).__name__ + } + + 
@classmethod + def _build_arxiv_query(cls, query: str, search_type: str) -> tuple: + """ + Build arXiv API query based on search type. + + Auto-detection priority: + 1. arXiv ID (e.g., "2301.12345" or "hep-ph/0123456") + 2. DOI (e.g., "10.1234/example") + 3. Title/keyword search + + Args: + query: User query + search_type: "auto", "id", "doi", "title", or "author" + + Returns: + Tuple of (detected_type: str, arxiv_query: str) + """ + query = query.strip() + + # Auto-detect or explicit type + if search_type == "auto": + # Check for arXiv ID + if cls._is_arxiv_id(query): + detected_type = "arxiv_id" + # Clean up arXiv ID (remove "arXiv:" prefix if present) + clean_id = query.replace("arXiv:", "").replace("arxiv:", "").strip() + arxiv_query = f"id:{clean_id}" + + # Check for DOI + elif cls._is_doi(query): + detected_type = "doi" + arxiv_query = f"doi:{query}" + + # Default to title search + else: + detected_type = "title" + arxiv_query = f"ti:{query}" + + elif search_type == "id": + detected_type = "arxiv_id" + clean_id = query.replace("arXiv:", "").replace("arxiv:", "").strip() + arxiv_query = f"id:{clean_id}" + + elif search_type == "doi": + detected_type = "doi" + arxiv_query = f"doi:{query}" + + elif search_type == "title": + detected_type = "title" + arxiv_query = f"ti:{query}" + + elif search_type == "author": + detected_type = "author" + arxiv_query = f"au:{query}" + + else: + # Fallback + detected_type = "title" + arxiv_query = f"ti:{query}" + + return detected_type, arxiv_query + + @classmethod + def _is_arxiv_id(cls, text: str) -> bool: + """ + Check if text matches arXiv ID pattern. + + Patterns: + - New format: YYMM.NNNNN or YYMM.NNNNNvN (e.g., 2301.12345, 2301.12345v1) + - Old format: archive/NNNNNNN or archive/NNNNNNNvN (e.g., hep-ph/0123456) + + Args: + text: Text to check + + Returns: + True if text matches arXiv ID pattern + """ + text = text.strip().replace("arXiv:", "").replace("arxiv:", "") + + # New format: YYMM.NNNNN(vN)? 
+ new_pattern = r'^\d{4}\.\d{4,5}(v\d+)?$' + if re.match(new_pattern, text): + return True + + # Old format: archive/NNNNNNN(vN)? + old_pattern = r'^[a-z\-]+/\d{7}(v\d+)?$' + if re.match(old_pattern, text): + return True + + return False + + @classmethod + def _is_doi(cls, text: str) -> bool: + """ + Check if text matches DOI pattern. + + Pattern: 10.NNNN/... (standard DOI format) + + Args: + text: Text to check + + Returns: + True if text matches DOI pattern + """ + text = text.strip() + doi_pattern = r'^10\.\d{4,9}/[-._;()/:A-Z0-9]+$' + return bool(re.match(doi_pattern, text, re.IGNORECASE)) + + @classmethod + def _format_paper(cls, paper) -> Dict[str, Any]: + """ + Format arxiv.Result to standard dictionary. + + Args: + paper: arxiv.Result object + + Returns: + Formatted paper dictionary + """ + return { + 'arxiv_id': paper.entry_id.split('/')[-1], # Extract ID from full URL + 'title': paper.title, + 'authors': [author.name for author in paper.authors], + 'summary': paper.summary, + 'published': paper.published.strftime('%Y-%m-%d') if paper.published else None, + 'updated': paper.updated.strftime('%Y-%m-%d') if paper.updated else None, + 'pdf_url': paper.pdf_url, + 'doi': paper.doi, + 'categories': paper.categories, + 'primary_category': paper.primary_category, + 'journal_ref': paper.journal_ref, + 'comment': paper.comment + } + + @classmethod + def _apply_rate_limiting(cls): + """ + Apply rate limiting to respect arXiv guidelines. + + arXiv recommends at least 3 seconds between requests. + This method enforces the configured rate_limit_delay. + Thread-safe: uses _rate_limit_lock to prevent concurrent requests + from bypassing the rate limit. 
+ """ + with cls._rate_limit_lock: + current_time = time.time() + time_since_last_request = current_time - cls._last_request_time + + if time_since_last_request < cls.config.rate_limit_delay: + sleep_time = cls.config.rate_limit_delay - time_since_last_request + log.debug(f"Rate limiting: sleeping for {sleep_time:.2f} seconds") + time.sleep(sleep_time) + + cls._last_request_time = time.time() + + @classmethod + def detect_paper_references(cls, text: str) -> Dict[str, List[str]]: + """ + Utility: Detect paper references in text. + + Searches for arXiv IDs and DOIs in text and returns them. + Useful for preprocessing text to find papers to look up. + + Args: + text: Text to search for paper references + + Returns: + Dict with 'arxiv_ids' and 'dois' keys containing found references + + Example: + text = "See arXiv:1706.03762 and DOI 10.1234/example" + refs = ArxivSearch.detect_paper_references(text) + # refs = { + # 'arxiv_ids': ['1706.03762'], + # 'dois': ['10.1234/example'] + # } + """ + # Find arXiv IDs + arxiv_ids = [] + + # New format: YYMM.NNNNN(vN)? - use non-capturing group to avoid tuple returns + new_pattern = r'\b\d{4}\.\d{4,5}(?:v\d+)?\b' + arxiv_ids.extend(re.findall(new_pattern, text)) + + # Old format: archive/NNNNNNN(vN)? - use non-capturing group + old_pattern = r'\b[a-z\-]+/\d{7}(?:v\d+)?\b' + arxiv_ids.extend(re.findall(old_pattern, text)) + + # Also look for explicit "arXiv:..." mentions + arxiv_prefix_pattern = r'arXiv:\s*(\d{4}\.\d{4,5}(?:v\d+)?|[a-z\-]+/\d{7}(?:v\d+)?)' + arxiv_ids.extend(re.findall(arxiv_prefix_pattern, text, re.IGNORECASE)) + + # Find DOIs + doi_pattern = r'\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b' + dois = re.findall(doi_pattern, text, re.IGNORECASE) + + # Deduplicate + arxiv_ids = list(set(arxiv_ids)) + dois = list(set(dois)) + + return { + 'arxiv_ids': arxiv_ids, + 'dois': dois + } + + @classmethod + def validate_config(cls): + """ + Validate tool configuration. 
+ + arXiv doesn't require an API key, so we override the parent's + api_key validation. + """ + # arXiv is a public API - no API key required + # Just validate that config exists + if not hasattr(cls, 'config'): + raise ValueError(f"{cls.name}: Missing configuration") diff --git a/dingo/model/llm/agent/tools/claims_extractor.py b/dingo/model/llm/agent/tools/claims_extractor.py new file mode 100644 index 00000000..f3204b96 --- /dev/null +++ b/dingo/model/llm/agent/tools/claims_extractor.py @@ -0,0 +1,606 @@ +""" +Claims Extraction Tool + +This module provides LLM-based extraction of verifiable claims from long-form text. +Based on Claimify methodology and ACL 2025 best practices for atomic fact extraction. + +Dependencies: + openai>=1.0.0 (for LLM-based extraction) + +Configuration: + model: LLM model for extraction (default: "gpt-4o-mini") + api_key: OpenAI API key + base_url: Custom API base URL (optional, e.g., "https://api.deepseek.com/v1" for DeepSeek) + max_claims: Maximum number of claims to extract (default: 50, range: 1-200) + claim_types: Types of claims to extract (default: all types) + chunk_size: Text chunk size for processing (default: 2000) + include_context: Include surrounding context (default: True) +""" + +import json +import re +from typing import Any, Dict, List, Optional + +from pydantic import Field + +from dingo.model.llm.agent.tools.base_tool import BaseTool, ToolConfig +from dingo.model.llm.agent.tools.tool_registry import tool_register +from dingo.utils import log + + +class ClaimsExtractorConfig(ToolConfig): + """Configuration for claims extraction tool""" + model: str = Field(default="gpt-4o-mini", description="LLM model for extraction") + api_key: Optional[str] = Field(default=None, description="OpenAI API key") + base_url: Optional[str] = Field(default=None, description="Custom API base URL (e.g., for DeepSeek)") + max_claims: int = Field(default=50, ge=1, le=200) + claim_types: List[str] = Field( + default=[ + # Original claim types + 
"factual", # General facts + "statistical", # Numbers, percentages, metrics + "attribution", # Who said/did/published what + "institutional", # Organizations, affiliations, collaborations + # New claim types for multi-type article support + "temporal", # Time-related claims (dates, durations, "recently") + "comparative", # Comparisons between entities/products + "monetary", # Financial figures, costs, prices + "technical" # Technical specifications, capabilities + ], + description="Types of claims to extract (8 types)" + ) + chunk_size: int = Field(default=2000, ge=500, le=10000, description="Text chunk size") + include_context: bool = Field(default=True, description="Include surrounding context") + temperature: float = Field(default=0.1, ge=0.0, le=1.0, description="LLM temperature") + + +@tool_register +class ClaimsExtractor(BaseTool): + """ + Extract verifiable claims from long-form text (articles, blog posts). + + This tool uses LLM-based extraction to identify atomic, decontextualized claims + that can be independently fact-checked. Based on Claimify (ACL 2025) methodology. + + Features: + - Atomic claim extraction (one fact per claim) + - Decontextualization (claims stand alone) + - Claim type classification + - Context preservation (optional) + - Deduplication and merging + + Claim Types (8 types): + - factual: General facts (e.g., "The tower is 330 meters tall") + - statistical: Numbers, percentages (e.g., "Model has 0.9B parameters") + - attribution: Who said/did what (e.g., "Vaswani et al. 
proposed Transformer") + - institutional: Organizations, affiliations (e.g., "Released by MIT and Stanford") + - temporal: Time-related (e.g., "Released on December 5, 2024") + - comparative: Comparisons (e.g., "GPU improved 20% vs previous gen") + - monetary: Financial figures (e.g., "Priced at $999") + - technical: Technical specs (e.g., "A17 Pro chip with 3nm process") + + Usage: + # Extract all types of claims (using default OpenAI API) + result = ClaimsExtractor.execute(text=article_text) + + # Extract only institutional claims + result = ClaimsExtractor.execute( + text=article_text, + claim_types=["institutional"] + ) + + # Use custom API (e.g., DeepSeek) + ClaimsExtractor.config.model = "deepseek-chat" + ClaimsExtractor.config.base_url = "https://api.deepseek.com/v1" + result = ClaimsExtractor.execute(text=article_text) + + # Result structure: + { + 'success': True, + 'claims': [ + { + 'claim_id': 'claim_001', + 'claim': 'OmniDocBench was released by Tsinghua University', + 'claim_type': 'institutional', + 'context': 'PaddleOCR-VL登顶的OmniDocBench V1.5...', + 'position': {'start': 120, 'end': 180}, + 'verifiable': True, + 'confidence': 0.95 + }, + ... + ], + 'metadata': { + 'total_claims': 25, + 'verifiable_claims': 20, + 'claim_types_distribution': {...} + } + } + """ + + name = "claims_extractor" + description = ( + "Extract verifiable claims from long-form text (articles, blog posts). " + "Returns atomic, decontextualized claims with context and metadata. " + "Useful for fact-checking articles, identifying checkable statements. " + "Supports 8 claim types: factual, statistical, attribution, institutional, " + "temporal, comparative, monetary, technical." + ) + config: ClaimsExtractorConfig = ClaimsExtractorConfig() + + # System prompt for LLM-based extraction + EXTRACTION_SYSTEM_PROMPT = """You are an expert fact-checker specialized in extracting verifiable claims from text. 
+ +Your task is to extract ATOMIC, VERIFIABLE claims that can be independently fact-checked. + +Guidelines: +1. Atomicity: Each claim describes ONE fact, statistic, or attribution +2. Verifiability: Can be checked against authoritative sources +3. Decontextualization: Include necessary context to stand alone +4. Faithfulness: Preserve original meaning +5. Specificity: Extract specific, checkable claims (not opinions or vague statements) + +Claim Types (EXPANDED from 4 to 8 for multi-type article support): +- factual: General facts (e.g., "The tower is 330 meters tall") +- statistical: Numbers, percentages, metrics (e.g., "Model has 0.9B parameters") +- attribution: Who said/did/published what (e.g., "Vaswani et al. proposed Transformer") +- institutional: Organizations, affiliations, collaborations (e.g., "Released by MIT and Stanford") +- temporal: Time-related claims - dates, durations, "recently" (e.g., "Released on Dec 5, 2024") +- comparative: Comparisons between entities/products (e.g., "GPU improved 20% vs A16") +- monetary: Financial figures, costs, prices (e.g., "128GB model priced at $999") +- technical: Technical specifications, capabilities (e.g., "A17 Pro chip with 3nm process") + +Output Format (JSON): +{ + "claims": [ + { + "claim": "具体的声明文本", + "claim_type": "institutional", + "context": "周围的上下文(帮助理解)", + "verifiable": true, + "confidence": 0.95 + } + ] +} + +Examples: + +Example 1 - Academic Article: +Input: "百度刚刚发布的PaddleOCR-VL模型登顶了由清华大学、阿里达摩院等联合发布的OmniDocBench榜单。" + +Output: +{ + "claims": [ + { + "claim": "PaddleOCR-VL model was just released by Baidu", + "claim_type": "attribution", + "context": "百度刚刚发布的PaddleOCR-VL模型...", + "verifiable": true, + "confidence": 0.90 + }, + { + "claim": "PaddleOCR-VL topped the OmniDocBench leaderboard", + "claim_type": "factual", + "context": "模型登顶了...OmniDocBench榜单", + "verifiable": true, + "confidence": 0.95 + }, + { + "claim": "OmniDocBench was jointly released by Tsinghua University and Alibaba DAMO 
Academy", + "claim_type": "institutional", + "context": "由清华大学、阿里达摩院等联合发布的OmniDocBench榜单", + "verifiable": true, + "confidence": 0.95 + } + ] +} + +Example 2 - News Article: +Input: "OpenAI于2024年12月5日正式发布o1推理模型。CEO Sam Altman表示这是AGI道路上的里程碑。ChatGPT Plus月费保持20美元。" + +Output: +{ + "claims": [ + { + "claim": "OpenAI released o1 reasoning model on December 5, 2024", + "claim_type": "temporal", + "context": "OpenAI于2024年12月5日正式发布o1推理模型", + "verifiable": true, + "confidence": 0.98 + }, + { + "claim": "Sam Altman stated o1 is a milestone on the path to AGI", + "claim_type": "attribution", + "context": "CEO Sam Altman表示这是AGI道路上的里程碑", + "verifiable": true, + "confidence": 0.90 + }, + { + "claim": "ChatGPT Plus monthly fee remains $20", + "claim_type": "monetary", + "context": "ChatGPT Plus月费保持20美元", + "verifiable": true, + "confidence": 0.95 + } + ] +} + +Example 3 - Product Review: +Input: "iPhone 15 Pro搭载A17 Pro芯片,采用3纳米工艺。GPU性能相比A16提升20%。国行128GB版售价7999元。" + +Output: +{ + "claims": [ + { + "claim": "iPhone 15 Pro features A17 Pro chip with 3nm process", + "claim_type": "technical", + "context": "iPhone 15 Pro搭载A17 Pro芯片,采用3纳米工艺", + "verifiable": true, + "confidence": 0.98 + }, + { + "claim": "GPU performance improved 20% compared to A16", + "claim_type": "comparative", + "context": "GPU性能相比A16提升20%", + "verifiable": true, + "confidence": 0.90 + }, + { + "claim": "China 128GB model priced at 7999 yuan", + "claim_type": "monetary", + "context": "国行128GB版售价7999元", + "verifiable": true, + "confidence": 0.95 + } + ] +} + +Critical: Extract SPECIFIC claims with verifiable details. Ignore opinions, marketing language, or vague statements. +""" + + @classmethod + def execute( + cls, + text: str, + claim_types: Optional[List[str]] = None, + **kwargs + ) -> Dict[str, Any]: + """ + Extract verifiable claims from text. 
+ + Args: + text: Input text (supports Markdown) + claim_types: Types of claims to extract (default: all types from config) + **kwargs: Optional configuration overrides + - max_claims: Override max_claims config + - include_context: Override include_context config + - chunk_size: Override chunk_size config + + Returns: + Dict with extracted claims: + { + 'success': bool, + 'claims': List[Dict], + 'metadata': Dict + } + + Raises: + ImportError: If openai library is not installed + ValueError: If text is empty or API key is missing + Exception: For API errors + """ + # Validate inputs + if not text or not text.strip(): + log.error("Claims extraction: text cannot be empty") + return { + 'success': False, + 'error': 'Input text cannot be empty', + 'claims': [] + } + + if not cls.config.api_key: + error_msg = ( + "OpenAI API key is required for claims extraction.\n\n" + "Set api_key in tool configuration or environment variable OPENAI_API_KEY" + ) + log.error(error_msg) + return { + 'success': False, + 'error': error_msg, + 'error_type': 'ConfigurationError', + 'claims': [] + } + + # Import OpenAI library (lazy import) + try: + from openai import OpenAI + except ImportError: + error_msg = ( + "openai library is not installed but required for claims extraction.\n\n" + "Install with:\n" + " pip install -r requirements/agent.txt\n" + "Or:\n" + " pip install openai>=1.0.0" + ) + log.error(error_msg) + return { + 'success': False, + 'error': error_msg, + 'error_type': 'DependencyError', + 'claims': [] + } + + # Get configuration + claim_types_filter = claim_types or cls.config.claim_types + max_claims = kwargs.get('max_claims', cls.config.max_claims) + include_context = kwargs.get('include_context', cls.config.include_context) + chunk_size = kwargs.get('chunk_size', cls.config.chunk_size) + + log.info(f"Extracting claims from text ({len(text)} chars, chunk_size={chunk_size})") + + try: + # Create OpenAI client (with optional custom base_url) + client_kwargs = {"api_key": 
cls.config.api_key} + if cls.config.base_url: + client_kwargs["base_url"] = cls.config.base_url + log.info(f"Using custom API base URL: {cls.config.base_url}") + client = OpenAI(**client_kwargs) + + # Chunk text if needed + chunks = cls._chunk_text(text, chunk_size) + log.debug(f"Split text into {len(chunks)} chunks") + + # Extract claims from each chunk + all_claims = [] + for i, chunk_data in enumerate(chunks): + log.debug(f"Processing chunk {i+1}/{len(chunks)}") + + chunk_claims = cls._extract_claims_from_chunk( + client, + chunk_data['text'], + chunk_data['start_pos'], + claim_types_filter, + include_context + ) + all_claims.extend(chunk_claims) + + # Deduplicate and merge similar claims + unique_claims = cls._deduplicate_claims(all_claims) + + # Limit to max_claims + if len(unique_claims) > max_claims: + log.warning(f"Limiting claims from {len(unique_claims)} to {max_claims}") + unique_claims = unique_claims[:max_claims] + + # Add claim IDs + for i, claim in enumerate(unique_claims, 1): + claim['claim_id'] = f"claim_{i:03d}" + + # Build metadata + metadata = cls._build_metadata(unique_claims) + + result = { + 'success': True, + 'claims': unique_claims, + 'metadata': metadata + } + + log.info(f"Claims extraction successful: {len(unique_claims)} claims extracted") + return result + + except Exception as e: + log.error(f"Claims extraction failed: {e}") + + # Sanitize error message + error_str = str(e).lower() + if "api key" in error_str or "authentication" in error_str: + error_msg = "Invalid or missing API key" + elif "rate limit" in error_str: + error_msg = "Rate limit exceeded" + elif "timeout" in error_str: + error_msg = "Request timed out" + else: + error_msg = f"Extraction failed: {type(e).__name__}" + + return { + 'success': False, + 'error': error_msg, + 'error_type': type(e).__name__, + 'claims': [] + } + + @classmethod + def _chunk_text(cls, text: str, chunk_size: int) -> List[Dict[str, Any]]: + """ + Split long text into chunks for processing. 
+ + Args: + text: Input text + chunk_size: Maximum chunk size in characters + + Returns: + List of chunk dictionaries with text and position info + """ + if len(text) <= chunk_size: + return [{'text': text, 'start_pos': 0, 'end_pos': len(text)}] + + chunks = [] + start = 0 + + while start < len(text): + end = start + chunk_size + + # Try to break at sentence boundary + if end < len(text): + # Look for sentence ending within last 20% of chunk + search_start = start + int((end - start) * 0.8) + sentence_end = max( + text.rfind('。', search_start, end), + text.rfind('.', search_start, end), + text.rfind('\n\n', search_start, end) + ) + if sentence_end > start: + end = sentence_end + 1 + + chunk_text = text[start:end] + chunks.append({ + 'text': chunk_text, + 'start_pos': start, + 'end_pos': end + }) + + start = end + + return chunks + + @classmethod + def _extract_claims_from_chunk( + cls, + client, + chunk_text: str, + start_pos: int, + claim_types: List[str], + include_context: bool + ) -> List[Dict]: + """ + Extract claims from a single text chunk using LLM. + + Args: + client: OpenAI client + chunk_text: Text chunk to process + start_pos: Start position of chunk in original text + claim_types: Types of claims to extract + include_context: Whether to include context + + Returns: + List of extracted claims + """ + # Build user prompt + user_prompt = f"""Extract verifiable claims from the following text. + +Focus on these claim types: {', '.join(claim_types)} + +Text: +{chunk_text} + +Return JSON with claims array as specified in the system prompt. 
+""" + + # Call LLM + try: + response = client.chat.completions.create( + model=cls.config.model, + messages=[ + {"role": "system", "content": cls.EXTRACTION_SYSTEM_PROMPT}, + {"role": "user", "content": user_prompt} + ], + temperature=cls.config.temperature, + response_format={"type": "json_object"} # Force JSON output + ) + + output_text = response.choices[0].message.content + + # Parse JSON + result_json = json.loads(output_text) + claims = result_json.get('claims', []) + + # Add position info and filter by type + filtered_claims = [] + for claim in claims: + claim_type = claim.get('claim_type', 'unknown') + if claim_type in claim_types or 'all' in claim_types: + # Add position (approximate - based on chunk) + claim['position'] = { + 'start': start_pos, + 'end': start_pos + len(chunk_text) + } + + # Remove context if not requested + if not include_context: + claim.pop('context', None) + + filtered_claims.append(claim) + + return filtered_claims + + except json.JSONDecodeError as e: + log.warning(f"Failed to parse LLM output as JSON: {e}") + return [] + except Exception as e: + log.error(f"LLM call failed: {e}") + return [] + + @classmethod + def _deduplicate_claims(cls, claims: List[Dict]) -> List[Dict]: + """ + Remove duplicate or highly similar claims. 
+ + Args: + claims: List of claims + + Returns: + Deduplicated claims + """ + if len(claims) <= 1: + return claims + + unique_claims = [] + seen_texts = set() + + for claim in claims: + claim_text = claim.get('claim', '').strip().lower() + + # Skip if empty + if not claim_text: + continue + + # Skip if exact duplicate + if claim_text in seen_texts: + continue + + # Check for very similar claims (simple substring check) + is_duplicate = False + for seen_text in seen_texts: + # If one is substring of other and length difference < 20% + if claim_text in seen_text or seen_text in claim_text: + len_diff = abs(len(claim_text) - len(seen_text)) + if len_diff < 0.2 * max(len(claim_text), len(seen_text)): + is_duplicate = True + break + + if not is_duplicate: + unique_claims.append(claim) + seen_texts.add(claim_text) + + return unique_claims + + @classmethod + def _build_metadata(cls, claims: List[Dict]) -> Dict[str, Any]: + """ + Build metadata summary for extracted claims. + + Args: + claims: List of claims + + Returns: + Metadata dictionary + """ + total_claims = len(claims) + verifiable_claims = sum(1 for c in claims if c.get('verifiable', True)) + + # Count by type + type_distribution = {} + for claim in claims: + claim_type = claim.get('claim_type', 'unknown') + type_distribution[claim_type] = type_distribution.get(claim_type, 0) + 1 + + return { + 'total_claims': total_claims, + 'verifiable_claims': verifiable_claims, + 'claim_types_distribution': type_distribution + } + + @classmethod + def validate_config(cls): + """Validate tool configuration before execution.""" + if not cls.config.api_key: + raise ValueError(f"{cls.name}: OpenAI API key is required") diff --git a/docs/agent_architecture.md b/docs/agent_architecture.md new file mode 100644 index 00000000..c55d34c6 --- /dev/null +++ b/docs/agent_architecture.md @@ -0,0 +1,605 @@ +# Dingo Agent Architecture & Implementation Guide + +## Overview + +Dingo's Agent system extends traditional rule and LLM-based 
evaluation with **multi-step reasoning**, **tool usage**, and **adaptive context gathering** capabilities. This document provides a comprehensive overview of the Agent architecture, file structure, and implementation patterns. + +## Table of Contents + +1. [Architecture Overview](#architecture-overview) +2. [File Structure](#file-structure) +3. [Core Components](#core-components) +4. [Implementation Patterns](#implementation-patterns) +5. [Data Flow](#data-flow) +6. [Configuration](#configuration) +7. [Examples](#examples) + +--- + +## Architecture Overview + +### High-Level Architecture + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Dingo Evaluation System │ +├─────────────────────────────────────────────────────────────┤ +│ Data Input → Executor → [Rules | LLMs | Agents] → Results │ +└─────────────────────────────────────────────────────────────┘ + ▼ + ┌─────────────────────┐ + │ Agent Framework │ + └─────────────────────┘ + │ + ┌─────────────────────┼─────────────────────┐ + ▼ ▼ ▼ + ┌─────────┐ ┌──────────┐ ┌──────────┐ + │ Base │ │ Tools │ │ LangChain│ + │ Agent │◄────────│ Registry │ │ Adapter │ + └─────────┘ └──────────┘ └──────────┘ + │ │ + ▼ ▼ +┌────────────────┐ ┌──────────────────┐ +│ AgentFactCheck │ │ tavily_search │ +│AgentHallucin..│ │ arxiv_search │ +│ArticleFactChk │ │ claims_extractor│ +│ (Custom) │ │ render_tool │ +└────────────────┘ │ mineru_ocr_tool │ + └──────────────────┘ +``` + +### Evaluation Flow Comparison + +``` +Traditional Evaluation: +┌──────┐ ┌─────────┐ ┌────────────┐ +│ Data │─────▶│ Rule/LLM│─────▶│ EvalDetail │ +└──────┘ └─────────┘ └────────────┘ + +Agent-Based Evaluation: +┌──────┐ ┌───────┐ ┌──────────┐ ┌─────┐ ┌────────────┐ +│ Data │─────▶│ Agent │─────▶│Tool Calls│─────▶│ LLM │─────▶│ EvalDetail │ +└──────┘ └───────┘ └──────────┘ └─────┘ └────────────┘ + │ │ + Web Search Reasoning & + OCR Tools Synthesis +``` + +--- + +## File Structure + +### Current Implementation (Latest) + +``` +dingo/ +├── 
model/ +│ ├── llm/ # LLM-based evaluators +│ │ ├── agent/ # ✨ Agent Framework +│ │ │ ├── __init__.py # Package exports (BaseAgent, tools) +│ │ │ ├── base_agent.py # BaseAgent abstract class +│ │ │ ├── agent_fact_check.py # LangChain-based agent (framework-driven) +│ │ │ ├── agent_hallucination.py # Custom workflow agent (imperative) +│ │ │ ├── agent_article_fact_checker.py # Agent-First article fact-checker +│ │ │ ├── agent_wrapper.py # LangChain 1.0 integration wrapper +│ │ │ ├── langchain_adapter.py # Dingo ↔ LangChain tool adapter +│ │ │ └── tools/ # Agent tools +│ │ │ ├── __init__.py # Tool registry exports +│ │ │ ├── base_tool.py # BaseTool abstract class +│ │ │ ├── tool_registry.py # Tool registration & discovery +│ │ │ ├── claims_extractor.py # Claims extraction tool (LLM-based) +│ │ │ ├── arxiv_search.py # Academic paper search tool +│ │ │ ├── tavily_search.py # Web search tool (Tavily API) +│ │ │ ├── render_tool.py # HTML rendering tool +│ │ │ └── mineru_ocr_tool.py # OCR tool (MinerU integration) +│ │ ├── base_openai.py # Base class for OpenAI-compatible LLMs +│ │ └── ... 
# Other LLM evaluators +│ ├── model.py # ✏️ Central registry (@Model decorator) +│ └── rule/ # Rule-based evaluators +│ +├── config/ +│ └── input_args.py # ✏️ Configuration models (Pydantic) +│ # - InputArgs +│ # - EvaluatorArgs (includes agent_config) +│ +├── exec/ +│ ├── local.py # ✏️ Local executor with thread/process pools +│ │ # - Agents run in ThreadPoolExecutor (I/O-bound) +│ └── spark.py # Distributed executor (Spark) +│ +├── io/ +│ ├── input/ +│ │ └── data.py # Data class (standardized input) +│ └── output/ +│ └── eval_detail.py # EvalDetail (evaluation result) +│ +└── utils/ + └── log_util/ # Logging utilities + └── logger.py + +examples/ +└── agent/ # ✨ Agent usage examples + ├── agent_executor_example.py # Basic agent execution + ├── agent_hallucination_example.py # Hallucination detection example + └── agent_article_fact_checking_example.py # Article fact-checking example + +test/ +└── scripts/ + └── model/ + └── llm/ + └── agent/ # ✨ Agent tests + ├── test_agent_fact_check.py + ├── test_agent_hallucination.py + ├── test_article_fact_checker.py # ArticleFactChecker tests (88 tests) + ├── test_async_article_fact_checker.py # Async/parsing tests (30 tests) + ├── test_tool_registry.py + └── tools/ + ├── test_claims_extractor.py + ├── test_arxiv_search.py + ├── test_tavily_search.py + ├── test_render_tool.py + └── test_mineru_ocr_tool.py + +docs/ +├── agent_development_guide.md # Comprehensive development guide +├── agent_architecture.md # This file +├── article_fact_checking_guide.md # ArticleFactChecker guide +└── quick_start_article_fact_checking.md # Quick start for article fact-checking + +requirements/ +└── agent.txt # Agent dependencies + # - langchain>=1.0.0 + # - langchain-openai + # - tavily-python + # - etc. 
+ +.github/ +└── env/ + └── agent_hallucination.json # Example agent configuration +``` + +### Key File Changes from "Old Version" + +| Old Path | New Path | Notes | +|----------|----------|-------| +| `dingo/model/agent/` | `dingo/model/llm/agent/` | Moved under LLM module hierarchy | +| N/A | `agent_wrapper.py` | Added LangChain 1.0 integration | +| N/A | `langchain_adapter.py` | Added Dingo ↔ LangChain adapters | +| `agent_fact_check_web.py` | `agent_fact_check.py` | Simplified naming | +| N/A | `agent_hallucination.py` | Added custom workflow example | +| `tools/web_search.py` | `tools/tavily_search.py` | Specific implementation naming | +| N/A | `tools/render_tool.py` | Added HTML rendering | +| N/A | `tools/mineru_ocr_tool.py` | Added OCR capabilities | + +--- + +## Core Components + +### 1. BaseAgent (base_agent.py) + +**Purpose**: Abstract base class for all agent-based evaluators + +**Key Features**: +- Extends `BaseOpenAI` to inherit LLM functionality +- Supports dual execution paths: Legacy (manual) and LangChain (framework-driven) +- Manages tool execution and configuration injection +- Provides agent orchestration methods + +**Core Methods**: +```python +class BaseAgent(BaseOpenAI): + # Configuration + available_tools: List[str] = [] # Tools this agent can use + max_iterations: int = 5 # Safety limit + use_agent_executor: bool = False # Enable LangChain path + + # Abstract methods (must implement) + @abstractmethod + def plan_execution(cls, input_data: Data) -> List[Dict[str, Any]] + @abstractmethod + def aggregate_results(cls, input_data: Data, results: List[Any]) -> EvalDetail + + # Main evaluation entry point + def eval(cls, input_data: Data) -> EvalDetail + + # Tool execution + def execute_tool(cls, tool_name: str, **kwargs) -> Dict[str, Any] + def configure_tool(cls, tool_name: str, tool_class) + + # LangChain integration + def _eval_with_langchain_agent(cls, input_data: Data) -> EvalDetail + def get_langchain_tools(cls) + def 
_format_agent_input(cls, input_data: Data) -> str + def _get_system_prompt(cls, input_data: Data) -> str +``` + +**Execution Flow**: +``` +eval() +├─ use_agent_executor == True? (standard path) +│ ├─ Yes → _eval_with_langchain_agent() +│ │ ├─ get_langchain_tools() +│ │ ├─ get_langchain_llm() +│ │ ├─ AgentWrapper.create_agent() +│ │ ├─ AgentWrapper.invoke_and_format() +│ │ └─ aggregate_results() +│ │ +│ └─ No → Legacy path +│ ├─ plan_execution() +│ ├─ Loop through plan steps +│ │ ├─ execute_tool() for tool steps +│ │ └─ send_messages() for LLM steps +│ └─ aggregate_results() + +Note: ArticleFactChecker overrides eval() entirely and uses a two-phase +async parallel architecture (asyncio.run → _async_eval) instead of +the above base-class dispatch. See ArticleFactChecker section below. +``` + +### 2. Tool System + +#### BaseTool (tools/base_tool.py) + +**Purpose**: Abstract interface for all agent tools + +```python +class BaseTool(ABC): + name: str # Unique identifier + description: str # For LLM understanding + config: ToolConfig # Tool-specific config + + @abstractmethod + def execute(cls, **kwargs) -> Dict[str, Any] + def validate_config(cls) + def update_config(cls, config_dict: Dict[str, Any]) +``` + +#### ToolRegistry (tools/tool_registry.py) + +**Purpose**: Central registry for tool discovery and management + +**Key Features**: +- Auto-discovery via `@tool_register()` decorator +- Lazy loading (tools loaded on first use) +- Configuration injection from agent config + +```python +@tool_register("tavily_search") +class TavilySearch(BaseTool): + name = "tavily_search" + description = "Search the web using Tavily API" + + @classmethod + def execute(cls, query: str, **kwargs) -> Dict[str, Any]: + # Implementation + return { + 'success': True, + 'results': [...], + 'answer': "..." 
+ } +``` + +**Built-in Tools**: + +| Tool | File | Purpose | Dependencies | +|------|------|---------|--------------| +| `claims_extractor` | `claims_extractor.py` | LLM-based claims extraction | `openai` | +| `arxiv_search` | `arxiv_search.py` | Academic paper search | `arxiv` | +| `tavily_search` | `tavily_search.py` | Web search via Tavily API | `tavily-python` | +| `render_tool` | `render_tool.py` | HTML rendering with Playwright | `playwright` | +| `mineru_ocr_tool` | `mineru_ocr_tool.py` | OCR with MinerU | `magic-pdf` | + +### 3. LangChain Integration + +#### AgentWrapper (agent_wrapper.py) + +**Purpose**: Wrapper for LangChain 1.0 create_agent API + +**Key Methods**: +```python +class AgentWrapper: + @staticmethod + def create_agent(llm, tools, system_prompt, **config) + # Uses langchain.agents.create_agent (LangGraph-based) + + @staticmethod + def invoke_and_format(agent, input_text, input_data, max_iterations) + # Invokes agent and formats results for Dingo + + @staticmethod + def get_openai_llm_from_dingo_config(dynamic_config) + # Creates ChatOpenAI from Dingo config +``` + +**LangChain 1.0 Changes** (Nov 2025): +- Uses `create_agent()` instead of deprecated `AgentExecutor` +- Built on LangGraph for better state management +- `recursion_limit` instead of `max_iterations` +- Message-based invocation interface + +#### LangChain Adapter (langchain_adapter.py) + +**Purpose**: Converts Dingo tools to LangChain StructuredTool format + +```python +def convert_dingo_tools(tool_names: List[str], agent_class) -> List[StructuredTool]: + # Wraps Dingo tools for LangChain compatibility + # Preserves Dingo's configuration injection mechanism +``` + +### 4. 
Agent Implementations + +#### AgentFactCheck (agent_fact_check.py) + +**Pattern**: LangChain-Based (Framework-Driven) + +**Key Characteristics**: +- Sets `use_agent_executor = True` +- Overrides `_format_agent_input()` for custom input formatting +- Overrides `_get_system_prompt()` for task-specific instructions +- LangChain handles autonomous tool calling and reasoning +- Parses structured output in `aggregate_results()` + +**Workflow**: +``` +Input: Question + Response + Context (optional) + ↓ +LangChain Agent decides: + - With context: MAY search for additional verification + - Without context: MUST search to verify facts + ↓ +Agent autonomously: + - Calls tavily_search tool as needed + - Reasons about results + - Returns structured output (HALLUCINATION_DETECTED: YES/NO) + ↓ +aggregate_results() parses output → EvalDetail +``` + +**When to Use**: +- ✅ Complex multi-step reasoning +- ✅ Benefit from LangChain's orchestration +- ✅ Prefer declarative style +- ✅ Rapid prototyping + +#### AgentHallucination (agent_hallucination.py) + +**Pattern**: Custom Workflow (Imperative) + +**Key Characteristics**: +- Implements custom `eval()` with explicit workflow +- Manually calls `execute_tool()` for searches +- Manually calls `send_messages()` for LLM interactions +- Delegates to existing evaluator (LLMHallucination) +- Full control over execution flow + +**Workflow**: +``` +Input: Content + Context (optional) + ↓ +Check context availability + ↓ +├─ Has context? → Delegate to LLMHallucination +│ +└─ No context? → Agent workflow: + 1. Extract factual claims (LLM call) + 2. Search web for each claim (Tavily tool) + 3. Synthesize context (combine results) + 4. 
Evaluate with synthesized context (LLMHallucination) + ↓ +Return EvalDetail with provenance +``` + +**When to Use**: +- Fine-grained control over steps +- Compose with existing evaluators +- Prefer explicit behavior +- Domain-specific workflows +- Conditional logic between steps + +#### ArticleFactChecker (agent_article_fact_checker.py) + +**Pattern**: Agent-First with Context Tracking (LangChain ReAct + Artifact Saving) + +**Key Characteristics**: +- Sets `use_agent_executor = True` (same as AgentFactCheck) +- Overrides `eval()` to add context tracking and file saving +- Uses thread-local storage (`threading.local()`) for concurrent safety +- Extracts claims from tool_calls observation data +- Builds enriched per-claim verification records +- Saves intermediate artifacts (article, claims, verification, report) +- Produces dual-layer `EvalDetail.reason`: `[text_summary, structured_report_dict]` + +**Workflow** (two-phase parallel architecture): +``` +Input: Article text (Markdown) + | +eval() override: + |- Save article content to output_path + |- asyncio.run(_async_eval()) + | +Phase 1 — Claims Extraction: + |- ClaimsExtractor.execute(content) # Direct tool call, not via agent + |- Returns list of factual claims + | +Phase 2 — Parallel Claim Verification: + |- asyncio.gather() with Semaphore(max_concurrent_claims) + |- Each claim → independent LangChain mini-agent + │ |- _async_verify_single_claim() + │ |- AgentWrapper.async_invoke_and_format() + │ |- _parse_claim_json_robust() # 3-tier robust JSON parsing + │ └─ Returns per-claim verdict + | +Aggregation: + |- _aggregate_parallel_results() + |- _recalculate_summary() + |- Save artifacts (claims_extracted.jsonl, claims_verification.jsonl, report.json) + |- Return EvalDetail with dual-layer reason +``` + +**When to Use**: +- Article-level comprehensive fact-checking +- Need intermediate artifacts (claims list, per-claim details, full report) +- Benefit from transparent evidence chains +- Want structured report 
alongside text summary
+
+---
+
+## Data Flow
+
+### Complete Evaluation Pipeline
+
+```
+┌───────────────────────────────────────────────────────────────┐
+│ 1. Configuration Loading                                      │
+└───────────────────────────────────────────────────────────────┘
+  JSON Config → InputArgs (Pydantic) → EvaluatorArgs
+    ├─ name: "AgentFactCheck"
+    ├─ config.key: API key
+    ├─ config.model: "gpt-4"
+    └─ config.parameters.agent_config:
+        ├─ max_iterations: 10
+        └─ tools:
+            └─ tavily_search:
+                └─ api_key: "..."
+
+┌───────────────────────────────────────────────────────────────┐
+│ 2. Data Loading & Conversion                                  │
+└───────────────────────────────────────────────────────────────┘
+  DataSource.load() → Generator[raw_data]
+    ↓
+  Converter.convert() → Data objects
+    ├─ content: str
+    ├─ prompt: Optional[str]
+    ├─ context: Optional[List[str]]
+    └─ raw_data: Dict
+
+┌───────────────────────────────────────────────────────────────┐
+│ 3. Agent Execution (ThreadPoolExecutor)                       │
+└───────────────────────────────────────────────────────────────┘
+  BaseAgent.eval(Data) → EvalDetail
+    │
+    ├─ use_agent_executor?
+    │
+    ├─ YES (LangChain Path):
+    │   ├─ _format_agent_input(Data) → input_text
+    │   ├─ _get_system_prompt(Data) → system_prompt
+    │   ├─ get_langchain_tools() → StructuredTool[]
+    │   ├─ get_langchain_llm() → ChatOpenAI
+    │   ├─ AgentWrapper.create_agent() → CompiledStateGraph
+    │   ├─ AgentWrapper.invoke_and_format()
+    │   │   ├─ Agent reasoning loop (ReAct)
+    │   │   ├─ Tool calls (autonomous)
+    │   │   └─ Final output
+    │   └─ aggregate_results() → EvalDetail
+    │
+    └─ NO (Legacy Path):
+        ├─ plan_execution(Data) → plan: List[step]
+        ├─ Loop through steps:
+        │   ├─ Tool step: execute_tool(name, **args)
+        │   │   ├─ ToolRegistry.get(name)
+        │   │   ├─ configure_tool()
+        │   │   └─ tool.execute()
+        │   └─ LLM step: send_messages(messages)
+        └─ aggregate_results(results) → EvalDetail
+
+┌───────────────────────────────────────────────────────────────┐
+│ 4. 
Result Aggregation │ +└───────────────────────────────────────────────────────────────┘ + EvalDetail + ├─ metric: str # "AgentFactCheck" + ├─ status: bool # True = issue detected + ├─ score: Optional[float] # Numeric score + ├─ label: List[str] # ["QUALITY_BAD.HALLUCINATION"] + └─ reason: List[Any] # Dual-layer reason: + # reason[0]: str (human-readable text) + # reason[1]: Dict (structured report, optional) + # ArticleFactChecker uses this for + # text summary + full report dict + +┌───────────────────────────────────────────────────────────────┐ +│ 5. Summary Generation │ +└───────────────────────────────────────────────────────────────┘ + ResultInfo → SummaryModel + ├─ total_count: int + ├─ good_count: int + ├─ bad_count: int + ├─ type_ratio: Dict[field, Dict[label, count]] + └─ metrics_score_stats: Dict[metric, stats] +``` + +### Tool Execution Flow + +``` +BaseAgent.execute_tool(tool_name, **kwargs) + ↓ +Check if tool in available_tools + ↓ +ToolRegistry.get(tool_name) → tool_class + ↓ +configure_tool(tool_name, tool_class) + ├─ Extract config from dynamic_config.parameters.agent_config.tools.{tool_name} + └─ tool_class.update_config(config_dict) + ↓ +tool_class.execute(**kwargs) + ├─ Tool-specific logic (API calls, processing, etc.) + └─ Return Dict[str, Any] with 'success' key + ↓ +Return to agent for processing +``` + +--- + +## Summary + +### Key Takeaways + +1. **Architecture**: Agents extend `BaseOpenAI` and are registered via `@Model.llm_register()` +2. **Location**: All agent code lives under `dingo/model/llm/agent/` +3. **Three Patterns**: LangChain-based (declarative), Custom Workflow (imperative), Agent-First + Context (hybrid) +4. **Tool System**: Centralized registry with configuration injection +5. **Execution**: Runs in ThreadPoolExecutor alongside other LLMs +6. **Configuration**: Nested under `parameters.agent_config` in evaluator config +7. 
**Artifact Saving**: ArticleFactChecker auto-saves intermediate artifacts to a timestamped directory by default; override via `agent_config.output_path`, or disable with `agent_config.save_artifacts=false` + +### Implementation Checklist + +Creating a new agent: +- [ ] Choose pattern (LangChain vs Custom) +- [ ] Create agent file under `dingo/model/llm/agent/` +- [ ] Extend `BaseAgent` +- [ ] Register with `@Model.llm_register("YourAgent")` +- [ ] Define `available_tools` list +- [ ] Implement required methods based on pattern +- [ ] Add tests under `test/scripts/model/llm/agent/` +- [ ] Update documentation +- [ ] Add example usage under `examples/agent/` + +Creating a new tool: +- [ ] Create tool file under `dingo/model/llm/agent/tools/` +- [ ] Extend `BaseTool` +- [ ] Register with `@tool_register("your_tool")` +- [ ] Implement `execute()` method +- [ ] Define custom `ToolConfig` if needed +- [ ] Add tests under `test/scripts/model/llm/agent/tools/` +- [ ] Update requirements/agent.txt if dependencies needed + +### Next Steps + +- Read `docs/agent_development_guide.md` for detailed implementation guide +- Study `agent_fact_check.py` for LangChain pattern example +- Study `agent_hallucination.py` for custom workflow example +- Study `agent_article_fact_checker.py` for Agent-First + artifact saving pattern +- Review `examples/agent/` for usage examples +- Check `test/scripts/model/llm/agent/` for testing patterns + +--- + +## Reference Links + +- [Agent Development Guide](./agent_development_guide.md) - Comprehensive development guide +- [Article Fact-Checking Guide](./article_fact_checking_guide.md) - ArticleFactChecker usage guide +- [CLAUDE.md](../CLAUDE.md) - Project overview and common commands +- [LangChain Documentation](https://python.langchain.com/docs/concepts/agents/) - Agent concepts +- [Tavily API](https://tavily.com/) - Web search tool documentation diff --git a/docs/agent_development_guide.md b/docs/agent_development_guide.md index 
3a5dc3d0..da071b7c 100644 --- a/docs/agent_development_guide.md +++ b/docs/agent_development_guide.md @@ -48,18 +48,18 @@ Data → Agent → [Tool 1, Tool 2, ...] → LLM Reasoning → EvalDetail ## Agent Implementation Patterns -Dingo supports two complementary patterns for implementing agent-based evaluators. Both patterns share the same configuration interface and are transparent to users, allowing you to choose the approach that best fits your needs. +Dingo supports three complementary patterns for implementing agent-based evaluators. All patterns share the same configuration interface and are transparent to users, allowing you to choose the approach that best fits your needs. ### Pattern Comparison -| Aspect | LangChain-Based | Custom Workflow | -|--------|-----------------|-----------------| -| **Control** | Framework-driven | Developer-driven | -| **Complexity** | Simple (declarative) | Moderate (imperative) | -| **Flexibility** | Limited to LangChain patterns | Unlimited | -| **Code Volume** | Low (~100 lines) | Medium (~200 lines) | -| **Best For** | Multi-step reasoning | Workflow composition | -| **Example** | AgentFactCheck | AgentHallucination | +| Aspect | LangChain-Based | Custom Workflow | Agent-First + Context | +|--------|-----------------|-----------------|----------------------| +| **Control** | Framework-driven | Developer-driven | Framework + override | +| **Complexity** | Simple (declarative) | Moderate (imperative) | Moderate (hybrid) | +| **Flexibility** | Limited to LangChain | Unlimited | LangChain + artifacts | +| **Code Volume** | Low (~100 lines) | Medium (~200 lines) | High (~500+ lines) | +| **Best For** | Multi-step reasoning | Workflow composition | Article-level verification | +| **Example** | AgentFactCheck | AgentHallucination | ArticleFactChecker | ### Pattern 1: LangChain-Based Agents (Framework-Driven) @@ -367,25 +367,140 @@ Provide a concise summary of the key facts.""" --- +### Pattern 3: Agent-First with Context Tracking 
(ArticleFactChecker) + +**Philosophy**: Use LangChain's ReAct pattern for autonomous reasoning, override `eval()` and `aggregate_results()` for context tracking and artifact saving. + +#### When to Use + +- Article-level comprehensive verification (many claims) +- Need intermediate artifacts (claims list, per-claim details, structured report) +- Want dual-layer output: human-readable text + structured data +- Benefit from thread-safe concurrent evaluation + +#### Key Implementation Steps + +1. Set `use_agent_executor = True` (same as Pattern 1) +2. **Override `eval()`** with a two-phase async architecture: + - Save article content to output directory + - Call `asyncio.run(cls._async_eval(input_data, ...))` (bypasses `_eval_with_langchain_agent`) + - Phase 1: Direct `ClaimsExtractor.execute()` call (no agent overhead) + - Phase 2: Per-claim verification via `asyncio.gather()` + `Semaphore(max_concurrent_claims)` +3. **Each claim** gets its own independent LangChain mini-agent: + - `_async_verify_single_claim()` invokes `AgentWrapper.async_invoke_and_format()` + - Results parsed by `_parse_claim_json_robust()` (3-tier robust parser) +4. 
**Aggregation** via `_aggregate_parallel_results()` and `_recalculate_summary()` + - Save artifacts (claims_extracted.jsonl, claims_verification.jsonl, report.json) + - Return EvalDetail with dual-layer reason: `[text_summary, report_dict]` + +#### Async Parallel Execution Pattern + +```python +import asyncio +import threading + +class ArticleFactChecker(BaseAgent): + _thread_local = threading.local() + _claims_extractor_lock = threading.Lock() # Thread-safe config mutation + + @classmethod + def eval(cls, input_data: Data) -> EvalDetail: + start_time = time.time() + output_dir = cls._get_output_dir() + if output_dir and input_data.content: + cls._save_article_content(output_dir, input_data.content) + try: + return asyncio.run(cls._async_eval(input_data, start_time, output_dir)) + except RuntimeError: + # Fallback for already-running event loop (e.g., Jupyter) + loop = asyncio.new_event_loop() + return loop.run_until_complete(cls._async_eval(input_data, start_time, output_dir)) + + @classmethod + async def _async_eval(cls, input_data, start_time, output_dir) -> EvalDetail: + claims = await cls._async_extract_claims(input_data) + semaphore = asyncio.Semaphore(cls._get_max_concurrent_claims()) + tasks = [cls._async_verify_single_claim(c, semaphore, ...) for c in claims] + results = await asyncio.gather(*tasks, return_exceptions=True) + return cls._build_eval_detail(results, start_time, output_dir, input_data) +``` + +#### Output Path Access Pattern + +`_get_output_dir()` uses a three-priority chain (highest to lowest): + +1. **Explicit path** – `agent_config.output_path` is set → use it (backward-compatible) +2. **Opt-out** – `agent_config.save_artifacts=false` → return `None`, skip saving +3. 
**Auto-generate** – default behaviour: `outputs/article_factcheck_<timestamp>_<uuid>/` + - Override the base directory with `agent_config.base_output_path` + +```python +@classmethod +def _get_output_dir(cls) -> Optional[str]: + """ + Get output directory for artifact files (three-priority chain). + Returns output dir path (created if needed), or None if saving disabled. + """ + params = cls.dynamic_config.parameters or {} + agent_cfg = params.get('agent_config') or {} + + explicit_path = agent_cfg.get('output_path') + if explicit_path: + os.makedirs(explicit_path, exist_ok=True) + return explicit_path + + if agent_cfg.get('save_artifacts') is False: + return None # Opted out of artifact saving + + base_output = agent_cfg.get('base_output_path') or 'outputs' + create_time = time.strftime("%Y%m%d_%H%M%S", time.localtime()) + auto_path = os.path.join(base_output, f"article_factcheck_{create_time}_{uuid.uuid4().hex[:6]}") + os.makedirs(auto_path, exist_ok=True) + return auto_path +``` + +#### Dual-Layer EvalDetail.reason + +```python +# reason[0]: Human-readable text summary (str) +# reason[1]: Structured report dict (JSON-serializable, optional) +result.reason = [text_summary] +if report: + result.reason.append(report) # Dict, not str +``` + +This ensures the Dingo standard output contains both readable summaries and full structured data. + +**Full implementation**: `dingo/model/llm/agent/agent_article_fact_checker.py` +**Tests**: `test/scripts/model/llm/agent/test_article_fact_checker.py` (88 tests), +`test/scripts/model/llm/agent/test_async_article_fact_checker.py` (30 tests) +**Guide**: `docs/article_fact_checking_guide.md` + +--- + ### Decision Tree: Which Pattern Should I Use? ``` Start - │ - ├─ Do you need to compose with existing Dingo evaluators? - │ ├─ Yes → Use Custom Pattern (AgentHallucination style) - │ └─ No → Continue - │ - ├─ Is your workflow highly domain-specific? 
- │ ├─ Yes → Use Custom Pattern - │ └─ No → Continue - │ - ├─ Do you prefer explicit control over every step? - │ ├─ Yes → Use Custom Pattern - │ └─ No → Continue - │ - └─ Default → Use LangChain Pattern (AgentFactCheck style) - ✅ Simpler, less code, battle-tested + | + +- Do you need intermediate artifact saving (claims, reports)? + | +- Yes -> Use Agent-First + Context (ArticleFactChecker style) + | +- No -> Continue + | + +- Do you need to compose with existing Dingo evaluators? + | +- Yes -> Use Custom Pattern (AgentHallucination style) + | +- No -> Continue + | + +- Is your workflow highly domain-specific? + | +- Yes -> Use Custom Pattern + | +- No -> Continue + | + +- Do you prefer explicit control over every step? + | +- Yes -> Use Custom Pattern + | +- No -> Continue + | + +- Default -> Use LangChain Pattern (AgentFactCheck style) + Simpler, less code, battle-tested ``` ### Can I Mix Both Patterns? @@ -395,6 +510,7 @@ Start ```json { "evaluator": [{ + "fields": {"content": "content"}, "evals": [ {"name": "AgentFactCheck"}, // LangChain-based {"name": "AgentHallucination"} // Custom workflow @@ -1408,7 +1524,11 @@ class TestMyAgent: - **AgentHallucination**: `dingo/model/llm/agent/agent_hallucination.py` - Production agent with web search - **AgentFactCheck**: `examples/agent/agent_executor_example.py` - LangChain 1.0 agent example +- **ArticleFactChecker**: `dingo/model/llm/agent/agent_article_fact_checker.py` - Agent-First with context tracking and artifact saving +- **ArticleFactChecker Example**: `examples/agent/agent_article_fact_checking_example.py` - Full article fact-checking example - **TavilySearch Tool**: `dingo/model/llm/agent/tools/tavily_search.py` - Web search tool implementation +- **ClaimsExtractor Tool**: `dingo/model/llm/agent/tools/claims_extractor.py` - LLM-based claims extraction tool +- **ArxivSearch Tool**: `dingo/model/llm/agent/tools/arxiv_search.py` - Academic paper search tool **Note**: For complete implementation examples, refer 
to the files above. They demonstrate real-world patterns for agent and tool development. @@ -1525,10 +1645,15 @@ summary = executor.execute() ## Additional Resources - [AgentHallucination Implementation](../dingo/model/llm/agent/agent_hallucination.py) +- [ArticleFactChecker Implementation](../dingo/model/llm/agent/agent_article_fact_checker.py) - [BaseAgent Source](../dingo/model/llm/agent/base_agent.py) - [Tool Registry Source](../dingo/model/llm/agent/tools/tool_registry.py) - [Tavily Search Example](../dingo/model/llm/agent/tools/tavily_search.py) +- [Claims Extractor](../dingo/model/llm/agent/tools/claims_extractor.py) +- [ArxivSearch](../dingo/model/llm/agent/tools/arxiv_search.py) - [Example Usage](../examples/agent/agent_hallucination_example.py) +- [Article Fact-Checking Example](../examples/agent/agent_article_fact_checking_example.py) +- [Article Fact-Checking Guide](./article_fact_checking_guide.md) --- diff --git a/docs/article_fact_checking_guide.md b/docs/article_fact_checking_guide.md new file mode 100644 index 00000000..74410a31 --- /dev/null +++ b/docs/article_fact_checking_guide.md @@ -0,0 +1,860 @@ +# Article Fact-Checking Guide + +This guide explains how to use the `ArticleFactChecker` agent for comprehensive article fact-checking. + +## Overview + +The `ArticleFactChecker` is an Agent-First architecture implementation that autonomously: +1. Extracts verifiable claims from long-form articles +2. Selects appropriate verification tools based on claim types +3. Verifies institutional attributions and factual statements +4. 
Generates structured verification reports with evidence + +**Implementation Pattern:** Agent-First (LangChain 1.0 ReAct) + +## Quick Start + +### Basic Usage (Direct Evaluation) + +```python +import os +from dingo.io.input import Data +from dingo.model.llm.agent import ArticleFactChecker + +# Set API keys (use environment variables) +os.environ["OPENAI_API_KEY"] = "your-openai-api-key" +os.environ["TAVILY_API_KEY"] = "your-tavily-api-key" # Optional + +# Fact-check article +article_text = """ +Your article content here... +""" + +data = Data(content=article_text) +result = ArticleFactChecker.eval(data) + +# View results +print(f"Accuracy: {result.score:.1%}") +print(f"Issues Found: {result.status}") + +# reason[0]: Human-readable text summary (always present) +if result.reason: + print(result.reason[0] if isinstance(result.reason[0], str) else str(result.reason[0])) + + # reason[1]: Structured report dict (always present after evaluation) + if len(result.reason) > 1 and isinstance(result.reason[1], dict): + report = result.reason[1] + print(f"Report Version: {report.get('report_version', 'N/A')}") +``` + +### Advanced Usage (Full Configuration) + +> **Note**: Executor requires `input_path` pointing to a file. The `plaintext` format reads +> line-by-line, splitting the article into separate Data objects per line. Use `jsonl` format +> instead: `json.dumps` encodes newlines as `\n`, keeping the entire article as one Data object. 
+ +```python +import json +import os +import tempfile + +from dingo.config import InputArgs +from dingo.exec import Executor + +# Read article and convert to JSONL (entire article as one Data object) +with open("article.md", "r") as f: + article_text = f.read() + +temp_jsonl = tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False, encoding='utf-8') +temp_jsonl.write(json.dumps({"content": article_text}, ensure_ascii=False) + '\n') +temp_jsonl.close() + +# Configure ArticleFactChecker with full options +config = { + "input_path": temp_jsonl.name, + "dataset": {"source": "local", "format": "jsonl"}, + "executor": {"max_workers": 1}, + "evaluator": [{ + "fields": {"content": "content"}, + "evals": [{ + "name": "ArticleFactChecker", + "config": { + "key": os.getenv("OPENAI_API_KEY"), + "model": "deepseek-chat", # or "gpt-4o-mini" for OpenAI + "parameters": { + "agent_config": { + "max_iterations": 15, + "output_path": "outputs/article_factcheck/", # Optional: save intermediate artifacts + "tools": { + "claims_extractor": { + "api_key": os.getenv("OPENAI_API_KEY"), + "max_claims": 50, + "claim_types": [ + "factual", "statistical", "attribution", "institutional", + "temporal", "comparative", "monetary", "technical" + ] + }, + "tavily_search": { + "api_key": os.getenv("TAVILY_API_KEY") + }, + "arxiv_search": {"max_results": 5} + } + } + } + } + }] + }] +} + +# Execute +input_args = InputArgs(**config) +result = Executor.exec_map["local"](input_args).execute() + +print(f"Total: {result.total_count}, Good: {result.good_count}, Bad: {result.bad_count}") + +# Cleanup +os.unlink(temp_jsonl.name) +``` + +### CLI Usage + +```bash +# 1. Convert article to JSONL format (entire article as one line) +python -c " +import json +with open('path/to/article.md', 'r') as f: + text = f.read() +with open('article_input.jsonl', 'w') as f: + f.write(json.dumps({'content': text}, ensure_ascii=False) + '\n') +" + +# 2. 
Create configuration file +cat > article_check_config.json << EOF +{ + "input_path": "article_input.jsonl", + "dataset": { + "source": "local", + "format": "jsonl" + }, + "evaluator": [{ + "fields": {"content": "content"}, + "evals": [{ + "name": "ArticleFactChecker", + "config": { + "key": "${OPENAI_API_KEY}", + "model": "deepseek-chat", + "parameters": { + "agent_config": { + "max_iterations": 15, + "tools": { + "claims_extractor": { + "api_key": "${OPENAI_API_KEY}", + "max_claims": 50 + }, + "tavily_search": { + "api_key": "${TAVILY_API_KEY}" + }, + "arxiv_search": {} + } + } + } + } + }] + }] +} +EOF + +# 3. Run fact-checking +python -m dingo.run.cli --input article_check_config.json +``` + +## Supported Article Types + +`ArticleFactChecker` is designed to handle various article types with adaptive verification strategies: + +### 1. Academic Articles + +**Characteristics:** Research paper announcements, academic news, conference proceedings + +**Claim Types:** institutional, attribution, statistical, factual + +**Verification Strategy:** +- Use `arxiv_search` for paper metadata (title, authors, abstract) +- Use `tavily_search` for institutional affiliations verification +- Combine both tools for comprehensive verification + +**Example:** +```python +academic_article = """ +百度刚刚发布的PaddleOCR-VL模型登顶了由清华大学、阿里达摩院等联合发布的OmniDocBench榜单。 +""" + +data = Data(content=academic_article) +result = ArticleFactChecker.eval(data) +``` + +**Expected Claims:** +- Attribution: "PaddleOCR-VL released by Baidu" +- Institutional: "OmniDocBench jointly released by Tsinghua and Alibaba DAMO" +- Factual: "PaddleOCR-VL topped OmniDocBench leaderboard" + +--- + +### 2. 
News Articles + +**Characteristics:** Tech news, product launches, current events, announcements + +**Claim Types:** temporal, attribution, factual, statistical, monetary + +**Verification Strategy:** +- Use `tavily_search` with date filters for temporal claims +- Verify attributions through official announcements +- Cross-check statistics with authoritative sources + +**Example:** +```python +news_article = """ +OpenAI于2024年12月5日正式发布o1推理模型。CEO Sam Altman表示这是AGI道路上的里程碑。 +根据技术报告,o1在数学推理任务上的准确率达到89.3%。ChatGPT Plus月费保持20美元。 +""" + +data = Data(content=news_article) +result = ArticleFactChecker.eval(data) +``` + +**Expected Claims:** +- Temporal: "Released on December 5, 2024" +- Attribution: "Sam Altman stated o1 is a milestone" +- Statistical: "89.3% accuracy on math reasoning" +- Monetary: "ChatGPT Plus remains $20/month" + +--- + +### 3. Product Reviews + +**Characteristics:** Gadget reviews, product comparisons, specifications + +**Claim Types:** technical, comparative, monetary, statistical, factual + +**Verification Strategy:** +- Use `tavily_search` for official specifications +- Verify comparative claims with benchmark databases +- Check pricing against official sources + +**Example:** +```python +product_review = """ +iPhone 15 Pro搭载A17 Pro芯片,采用3纳米工艺。 +GPU性能相比A16提升20%。国行128GB版售价7999元。 +在Geekbench 6测试中,单核跑分达到2920。 +""" + +data = Data(content=product_review) +result = ArticleFactChecker.eval(data) +``` + +**Expected Claims:** +- Technical: "A17 Pro chip with 3nm process" +- Comparative: "GPU improved 20% vs A16" +- Monetary: "128GB priced at 7999 yuan" +- Statistical: "Geekbench single-core: 2920" + +--- + +### 4. 
Technical Blogs + +**Characteristics:** Engineering blogs, tutorials, technical analysis + +**Claim Types:** factual, attribution, technical, comparative + +**Verification Strategy:** +- Use `tavily_search` for technical documentation +- Verify code examples and API usage +- Cross-check with official docs and benchmarks + +**Example:** +```python +tech_blog = """ +React 18引入了并发渲染特性,性能提升了3倍。 +根据Dan Abramov的博客,新的Suspense API简化了异步数据加载。 +""" + +data = Data(content=tech_blog) +result = ArticleFactChecker.eval(data) +``` + +**Expected Claims:** +- Factual: "React 18 introduced concurrent rendering" +- Comparative: "Performance improved 3x" +- Attribution: "Dan Abramov stated Suspense simplifies async loading" + +--- + +### Claim Types Reference + +The agent supports **8 claim types** (expanded from original 4): + +| Claim Type | Description | Example | +|------------|-------------|---------| +| **factual** | General facts | "The tower is 330 meters tall" | +| **statistical** | Numbers, percentages, metrics | "Model has 0.9B parameters" | +| **attribution** | Who said/did/published what | "Vaswani et al. 
proposed Transformer" | +| **institutional** | Organizations, affiliations | "Released by MIT and Stanford" | +| **temporal** | Time-related claims | "Released on Dec 5, 2024" | +| **comparative** | Comparisons between entities | "GPU improved 20% vs A16" | +| **monetary** | Financial figures, prices | "Priced at $999" | +| **technical** | Technical specifications | "A17 Pro chip with 3nm process" | + +Note: temporal, comparative, monetary, technical types were added in v0.3.0 for multi-type article support + +--- + +## How It Works + +### Agent-First Architecture + +The `ArticleFactChecker` uses **Agent-First** design with `use_agent_executor = True`: + +``` +┌─────────────────────────────────────────────────┐ +│ ArticleFactChecker (LangChain Agent) │ +│ [Autonomous Decision-Making] │ +└─────────────────────────────────────────────────┘ + ↓ Autonomous Decision + ┌──────────────────────────────┐ + │ Available Tools │ + └──────────────────────────────┘ + ↓ ↓ ↓ +┌──────────┐ ┌─────────┐ ┌──────────┐ +│claims_ │ │arxiv_ │ │tavily_ │ +│extractor │ │search │ │search │ +└──────────┘ └─────────┘ └──────────┘ +``` + +**Key Advantages:** +- **Intelligent Tool Selection**: Agent chooses tools based on claim semantics +- **Multi-Step Reasoning**: Builds evidence chains across multiple verifications +- **Adaptive Strategies**: Adjusts approach based on intermediate results +- **Fallback Mechanisms**: Tries alternative tools if initial verification fails + +### Workflow + +**Step 0: Article Type Analysis** + - Agent first identifies the article type: academic, news, product, blog, policy, opinion + - This classification guides claim extraction and verification strategy + - Different article types emphasize different claim types: + - Academic → institutional, attribution, statistical + - News → temporal, attribution, factual + - Product → technical, comparative, monetary + - Blog → factual, technical, attribution + +**Step 1: Claims Extraction** + - Agent calls 
`claims_extractor` tool on full article + - Extracts atomic, verifiable claims with 8 types: factual, statistical, attribution, + institutional, temporal, comparative, monetary, technical + - Claims are decontextualized (stand-alone) for independent verification + +**Step 2: Autonomous Tool Selection** + - Agent analyzes each claim type and article context + - Selects best verification tool based on principles (not rigid IF-THEN rules): + - **Academic papers** → `arxiv_search` (metadata) + `tavily_search` (institutions) + - **Institutional/organizational claims** → `tavily_search` (primary) + - **Current events/news** → `tavily_search` with date filters + - **Product specs/pricing** → `tavily_search` for official sources + - **Technical documentation** → `tavily_search` for docs + - **Adaptive Strategy:** Combines tools, uses fallbacks, cross-verifies with multiple sources + +**Step 3: Verification** + - Agent calls selected tools to verify each claim + - Collects evidence and sources + - Adapts if initial verification fails + +**Step 4: Report Generation** + - Synthesizes verification results + - Generates structured report with: + - Summary statistics + - False claims comparison table + - Evidence and sources + - Severity ratings + +## Claim Types + +### Institutional Claims + +Claims about organizational affiliations: + +``` +Example: "OmniDocBench was released by Tsinghua University" + +Agent Decision: +1. Recognizes institutional claim +2. Checks if paper mentioned → Yes (OmniDocBench) +3. Selects arxiv_search tool +4. Searches for paper metadata and author affiliations +5. Compares claimed vs actual institutions via LLM reasoning +``` + +### Statistical Claims + +Claims with numbers or percentages: + +``` +Example: "The model has 0.9B parameters" + +Agent Decision: +1. Recognizes statistical claim +2. Selects tavily_search for general verification +3. Searches for official sources +4. 
Verifies number accuracy +``` + +### Factual Claims + +General factual statements: + +``` +Example: "PaddleOCR-VL topped the OmniDocBench leaderboard" + +Agent Decision: +1. Recognizes factual claim +2. Selects tavily_search +3. Searches for leaderboard information +4. Verifies ranking claim +``` + +## Configuration + +### Agent Configuration + +```python +{ + "agent_config": { + "max_iterations": 15, # Maximum reasoning steps + + # Artifact output path (three options, evaluated in priority order): + # 1. "output_path": "path/to/dir" → use explicit path (backward-compatible) + # 2. "save_artifacts": false → disable artifact saving entirely + # 3. (default) → auto-generate outputs/article_factcheck_<timestamp>_<uuid>/ + # Override base dir with "base_output_path": "custom/base/" + + "tools": { + "claims_extractor": { + "api_key": "...", + "max_claims": 50, # Max claims to extract + "claim_types": [ # Types to extract + "factual", + "statistical", + "attribution", + "institutional" + ], + "chunk_size": 2000, # Text chunk size + "include_context": true, # Include surrounding context + "temperature": 0.1 # LLM temperature + }, + "arxiv_search": { + "max_results": 5, # Max search results + "sort_by": "relevance", + "rate_limit_delay": 3.0 # Delay between requests + }, + "tavily_search": { + "api_key": "...", + "max_results": 5, + "search_depth": "advanced" # or "basic" + } + }, + "max_concurrent_claims": 5 # Max parallel claim verifications (asyncio Semaphore) + } +} +``` + +### Output Format + +The `EvalDetail` returned by `ArticleFactChecker` uses a **dual-layer reason** structure: + +- `reason[0]`: Human-readable text summary (always present, `str`) +- `reason[1]`: Structured report dictionary (always present after evaluation, `dict`) + +```python +{ + "metric": "ArticleFactChecker", + "status": true, # true = issues found, false = all good + "score": 0.75, # Overall accuracy (0.0-1.0) + "label": ["QUALITY_BAD_ARTICLE_FACTUAL_ERROR"], # or QUALITY_BAD_ARTICLE_UNVERIFIED_CLAIMS / 
QUALITY_GOOD + "reason": [ + # reason[0]: Human-readable text summary (str) + "Article Fact-Checking Report\n" + "======================================================================\n" + "Total Claims Analyzed: 20\n" + "Verified Claims: 15\n" + "False Claims: 5\n" + "Unverifiable Claims: 0\n" + "Overall Accuracy: 75.0%\n" + "\n" + "Agent Performance:\n" + " Tool Calls: 8\n" + " Reasoning Steps: 10\n" + "\n" + "FALSE CLAIMS DETAILED COMPARISON:\n" + "======================================================================\n" + "\n" + "#1 FALSE CLAIM\n" + " Article Claimed:\n" + " OmniDocBench was released by Tsinghua University...\n" + " Actual Truth:\n" + " OmniDocBench was released by Shanghai AI Lab, Abaka AI, 2077AI\n" + " Evidence:\n" + " Verified via arXiv paper 2412.07626 author list", + + # reason[1]: Structured report dict (always present) + { + "report_version": "2.0", + "generated_at": "2026-02-06T15:30:00", + "article_info": {"content_source": "markdown", "content_length": 5432}, + "claims_extraction": { + "total_extracted": 20, + "verifiable": 18, + "claim_types_distribution": {"factual": 5, "institutional": 3, "...": "..."} + }, + "verification_summary": { + "total_verified": 20, + "verified_true": 15, + "verified_false": 5, + "unverifiable": 0, + "accuracy_score": 0.75 + }, + "detailed_findings": ["..."], + "false_claims_comparison": ["..."], + "agent_metadata": { + "model": "deepseek-chat", + "tool_calls_count": 8, + "reasoning_steps": 10, + "execution_time_seconds": 45.2 + } + } + ] +} +``` + +### Output Files + +ArticleFactChecker auto-saves intermediate artifacts to a timestamped directory by default. + +**Dingo standard output** (saved to executor output_path): + +Default mode (`merge=false`, the default): +- `summary.json` - Aggregated statistics +- `content/