diff --git a/README.md b/README.md index 2c56654c..017303cc 100644 --- a/README.md +++ b/README.md @@ -541,6 +541,7 @@ Both patterns share the same configuration interface and are transparent to user **Built-in Agents:** - `AgentFactCheck`: LangChain-based fact-checking with autonomous search control - `AgentHallucination`: Custom workflow hallucination detection with adaptive context gathering +- `ArticleFactChecker`: Two-phase article fact-checking — extracts verifiable claims then verifies each in parallel using web search and Arxiv, with configurable concurrency control **Quick Example:** @@ -597,6 +598,7 @@ For detailed guidance on choosing and implementing agent patterns, see [Agent De - [Agent Development Guide](docs/agent_development_guide.md) - Comprehensive guide for creating custom agents and tools - [AgentHallucination Example](examples/agent/agent_hallucination_example.py) - Production agent example - [AgentFactCheck Example](examples/agent/agent_executor_example.py) - LangChain agent example +- [ArticleFactChecker Example](examples/agent/agent_article_fact_checking_example.py) - Article-scale two-phase fact verification ## ⚙️ Execution Modes diff --git a/README_zh-CN.md b/README_zh-CN.md index 8171632f..d69fd072 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -534,6 +534,7 @@ Dingo 支持基于智能体的评估器,可以使用外部工具进行多步 **内置智能体:** - `AgentFactCheck`: 基于 LangChain 的事实核查,自主搜索控制 - `AgentHallucination`: 自定义工作流的幻觉检测,自适应上下文收集 +- `ArticleFactChecker`: 两阶段文章事实核查 —— 先提取可验证声明,再并发调用网络搜索与 Arxiv 逐条验证,支持可配置的并发控制 **快速示例:** @@ -590,6 +591,7 @@ class MyAgent(BaseAgent): - [智能体开发指南](docs/agent_development_guide.md) - [AgentHallucination 示例](examples/agent/agent_hallucination_example.py) - [AgentFactCheck LangChain示例](examples/agent/agent_executor_example.py) +- [ArticleFactChecker 示例](examples/agent/agent_article_fact_checking_example.py) - 文章级两阶段事实核查 ## 执行引擎 diff --git a/dingo/model/llm/agent/__init__.py b/dingo/model/llm/agent/__init__.py index 5ffcf30e..d81b392c 100644 --- 
a/dingo/model/llm/agent/__init__.py +++ b/dingo/model/llm/agent/__init__.py @@ -1,22 +1,24 @@ -""" -Agent Framework for Dingo - -This package provides agent-based evaluation capabilities that extend LLMs with -tool usage, multi-step reasoning, and adaptive context gathering. - -Key Components: -- BaseAgent: Abstract base class for agent evaluators -- Tool system: Registry and base classes for agent tools -""" - -from dingo.model.llm.agent.base_agent import BaseAgent -from dingo.model.llm.agent.tools import BaseTool, ToolConfig, ToolRegistry, get_tool, tool_register - -__all__ = [ - 'BaseAgent', - 'BaseTool', - 'ToolConfig', - 'ToolRegistry', - 'get_tool', - 'tool_register', -] +""" +Agent Framework for Dingo + +This package provides agent-based evaluation capabilities that extend LLMs with +tool usage, multi-step reasoning, and adaptive context gathering. + +Key Components: +- BaseAgent: Abstract base class for agent evaluators +- Tool system: Registry and base classes for agent tools +""" + +from dingo.model.llm.agent.agent_article_fact_checker import ArticleFactChecker +from dingo.model.llm.agent.base_agent import BaseAgent +from dingo.model.llm.agent.tools import BaseTool, ToolConfig, ToolRegistry, get_tool, tool_register + +__all__ = [ + 'ArticleFactChecker', + 'BaseAgent', + 'BaseTool', + 'ToolConfig', + 'ToolRegistry', + 'get_tool', + 'tool_register', +] diff --git a/dingo/model/llm/agent/agent_article_fact_checker.py b/dingo/model/llm/agent/agent_article_fact_checker.py new file mode 100644 index 00000000..244489ec --- /dev/null +++ b/dingo/model/llm/agent/agent_article_fact_checker.py @@ -0,0 +1,1802 @@ +""" +ArticleFactChecker: Agent-based article fact-checking with claims extraction. + +Uses Agent-First architecture (LangChain ReAct / ``use_agent_executor=True``), +giving the agent full autonomy over tool selection, execution order, and +multi-step reasoning to verify factual claims in long-form articles. 
+ +See Also: + AgentFactCheck: Single-claim hallucination detection + docs/agent_development_guide.md: Agent development patterns +""" + +import asyncio +import json +import os +import re +import threading +import time +import uuid +from collections import Counter +from datetime import datetime +from typing import Any, Dict, List, Optional + +from dingo.io import Data +from dingo.io.input.required_field import RequiredField +from dingo.io.output.eval_detail import EvalDetail, QualityLabel +from dingo.model import Model +from dingo.model.llm.agent.base_agent import BaseAgent +from dingo.utils import log + + +class PromptTemplates: + """ + Modular prompt templates for ArticleFactChecker. + + This class provides reusable prompt components that can be assembled + based on article type and verification needs. This approach: + - Reduces context window usage for long articles + - Allows dynamic prompt customization + - Makes prompts easier to maintain and test + """ + + CORE_ROLE = """You are an expert article fact-checker with autonomous tool selection capabilities. + +Your Task: Systematically verify ALL factual claims in the provided article.""" + + TOOLS_DESCRIPTION = """ +Available Tools: +================ +1. claims_extractor: Extract verifiable claims from long-form text + - Use this FIRST to identify all checkable statements + - Supports 8 claim types: factual, statistical, attribution, institutional, + temporal, comparative, monetary, technical + - Returns list of structured claims with types + +2. 
arxiv_search: Search academic papers and verify metadata + - Use for claims about research papers, academic publications + - Provides paper metadata: title, authors, abstract, publication date + - Authors in papers often indicate institutional affiliations in abstracts + - NOTE: Affiliations are in unstructured text, not dedicated fields + - Best for: paper titles, author names, publication dates, and + institutional claims when a related paper exists + - For institutional claims: use arxiv_search FIRST to find the paper, + then tavily_search to cross-verify affiliations + +3. tavily_search: General web search for fact verification + - Use for general factual claims, current events, companies, products + - Use for cross-verifying institutional/organizational affiliations + - Use for news, product specs, financial figures, comparative claims + - Supports multilingual queries: search BOTH English AND Chinese terms for + Chinese content (e.g., both "清华大学 OmniDocBench" and + "Tsinghua University OmniDocBench") + - Use search_depth='advanced' for authoritative fact-checking results + - Provides current web information with sources and URLs""" + + WORKFLOW_STEPS = """ +Workflow (Autonomous Decision-Making): +====================================== +STEP 0: Analyze Article Type + First, identify the article type to guide your verification strategy. + +STEP 1: Extract Claims (REQUIRED - Do NOT skip this step) + - You MUST call the claims_extractor tool with the full article text + - This is a mandatory first step before any verification + - Do NOT extract claims manually in your reasoning - use the tool + - Review the tool output and use the extracted claims for verification + - Claims are categorized by type for targeted verification + +STEP 2: Verify Each Claim (Autonomous Tool Selection) + For each claim, analyze its type and context, then SELECT THE BEST TOOL: + + Tool Selection Principles: + 1. 
arxiv_search - For academic paper verification (paper title, author, arXiv ID) + 2. tavily_search - For general web verification (current events, companies, products) + + Claim-Type Specific Rules: + - INSTITUTIONAL/ATTRIBUTION claims (e.g., "released by X University and Y Lab"): + You MUST use arxiv_search FIRST to find the actual paper and check author + affiliations, THEN use tavily_search to cross-verify. Do NOT rely on + tavily_search alone for institutional claims — web sources often give + vague or incomplete attribution. The paper's author list is the + authoritative source for institutional affiliations. + For CHINESE institution names: translate to English before arxiv_search + (e.g., "清华大学" → "Tsinghua University", "达摩院" → "Alibaba DAMO Academy", + "上海人工智能实验室" → "Shanghai AI Laboratory") + Search with BOTH Chinese and English terms in tavily_search for maximum coverage. + - STATISTICAL/TECHNICAL claims: Use tavily_search for official benchmarks + - FACTUAL claims: Use tavily_search for general verification + + Adaptive Strategies: + - COMBINE tools for comprehensive verification + - FALLBACK: If arxiv_search finds no paper → immediately use tavily_search alone + - FALLBACK: If tavily_search returns no relevant results → mark as UNVERIFIABLE + (do NOT retry with same query; try a different angle or accept UNVERIFIABLE) + - MULTI-SOURCE: Cross-verify important claims with multiple sources + +STEP 3: Synthesize Results + After verifying ALL claims, generate a comprehensive report.""" + + OUTPUT_FORMAT = """ +Output Format: +============== +You MUST return JSON in this exact format: + +```json +{ + "article_verification_summary": { + "article_type": "academic|news|product|blog|policy|opinion", + "total_claims": , + "verified_claims": , + "false_claims": , + "unverifiable_claims": , + "accuracy_score": <0.0-1.0> + }, + "detailed_findings": [ + { + "claim_id": "claim_001", + "original_claim": "...", + "claim_type": 
"institutional|factual|temporal|comparative|etc", + "verification_result": "FALSE|TRUE|UNVERIFIABLE", + "evidence": "...", + "sources": ["url1", "url2"], + "verification_method": "arxiv_search|tavily_search|combined", + "search_queries_used": ["query1", "query2"], + "reasoning": "Step-by-step reasoning for the verification conclusion" + } + ], + "false_claims_comparison": [ + { + "article_claimed": "Example: OpenAI released o1 in November 2024", + "actual_truth": "OpenAI released o1 on December 5, 2024", + "evidence": "Verified via official OpenAI announcement" + } + ] +} +```""" + + VERDICT_CRITERIA = """ +Verdict Decision Criteria: +========================== +Before assigning a verification_result to any claim, apply these evidence-based criteria: + +TRUE - Claim is CONFIRMED by evidence: + - You found specific, credible evidence that DIRECTLY supports the claim + - The evidence explicitly confirms the key facts (names, numbers, dates, relationships) + - You can cite a specific source URL that contains the confirming information + +FALSE - Claim is CONTRADICTED by evidence: + - You found specific, credible evidence that DIRECTLY contradicts the claim + - The evidence reveals a clear factual error (wrong date, wrong number, wrong attribution) + - You can point to the specific discrepancy between claim and evidence + +UNVERIFIABLE - Insufficient or ambiguous evidence: + - You could NOT find evidence that clearly confirms OR contradicts the claim + - Evidence partially matches but key details cannot be confirmed + - Sources mention the topic but do not address the specific claim being checked + - The claim involves details not found in any source + +CRITICAL RULE: Absence of contradictory evidence does NOT equal confirmation. +If your search did not find explicit confirming evidence, the verdict is UNVERIFIABLE, not TRUE. 
+If your reasoning includes phrases like "not explicitly listed", "could not confirm", +"no direct evidence", or "not mentioned in results", the verdict MUST be UNVERIFIABLE.""" + + SELF_VERIFICATION_STEP = """ +STEP 3.5: Self-Verify Verdict-Reasoning Consistency (MANDATORY) + Before generating your final JSON report, review EVERY claim's verdict: + + For each claim in your detailed_findings: + a) Re-read the evidence and reasoning you wrote for this claim + b) Ask yourself: "Does my evidence DIRECTLY and EXPLICITLY support this verdict?" + c) Apply these consistency checks: + - Reasoning says "not found", "not listed", "not mentioned", "no evidence" + -> Verdict MUST be UNVERIFIABLE (not TRUE) + - Reasoning says "confirmed by [specific source]" with a URL + -> Verdict can be TRUE + - Reasoning says "contradicts", "actually [different fact]", "incorrect" + -> Verdict MUST be FALSE + - Reasoning is uncertain or hedging ("may", "possibly", "unclear") + -> Verdict MUST be UNVERIFIABLE + d) If you find ANY inconsistency, correct the verdict NOW + + This step is critical for report quality. Do NOT skip it.""" + + CRITICAL_GUIDELINES = """ +Critical Guidelines: +==================== +- ALWAYS extract claims first before verification +- AUTONOMOUS tool selection based on claim type and article context +- VERIFY each claim independently +- USE multiple sources when possible (especially for critical claims) +- CITE specific evidence and URLs +- BE THOROUGH: Don't skip claims +- ADAPTIVE: If a tool fails, try alternatives intelligently +- CONTEXT-AWARE: Consider article type when selecting verification approach + +Remember: You are an autonomous agent with full decision-making power. 
+Analyze the article type, choose tools intelligently based on claim context, +adapt to intermediate results, and ensure comprehensive verification.""" + + # Article type specific guidance + ARTICLE_TYPE_GUIDANCE = { + "academic": """ +Article Type Guidance (Academic): +- Focus on arxiv_search for paper verification AND institutional claims +- For institutional affiliations: COMBINE arxiv_search (paper authors/abstracts) + tavily_search (cross-verify) +- Verify: paper titles, authors, publication dates, citations, institutional attributions +- Example: "OmniDocBench by Tsinghua" → arxiv_search for paper metadata THEN tavily_search to cross-verify""", + + "news": """ +Article Type Guidance (News): +- Focus on tavily_search for current events +- Verify dates, quotes, and attributions carefully +- Cross-reference multiple news sources +- Example: "released on December 5" → tavily_search with date context""", + + "product": """ +Article Type Guidance (Product Review): +- Use tavily_search for official specifications +- Verify technical specs against manufacturer data +- Check benchmark claims against third-party reviews +- Example: "A17 Pro chip" → tavily_search for official Apple specs""", + + "blog": """ +Article Type Guidance (Technical Blog): +- Use tavily_search for documentation verification +- Verify version numbers and feature claims +- Check performance claims against benchmarks +- Example: "React 18 features" → tavily_search for React docs""", + + "policy": """ +Article Type Guidance (Policy Document): +- Use tavily_search for government sources +- Verify dates, regulations, and official statements +- Cross-reference with official government websites""", + + "opinion": """ +Article Type Guidance (Opinion Piece): +- Focus only on attributed factual claims +- Verify quotes and statistics cited +- Distinguish opinions from verifiable facts""" + } + + PER_CLAIM_VERIFICATION_PROMPT = """You are a fact-checking expert. Verify ONE specific factual claim. 
+ +Use available search tools to find evidence, then respond ONLY with valid JSON: + +{ + "verification_result": "TRUE|FALSE|UNVERIFIABLE", + "evidence": "Key evidence found (1-3 sentences)", + "sources": ["url1", "url2"], + "verification_method": "tavily_search|arxiv_search|combined|no_search", + "search_queries_used": ["query text"], + "reasoning": "Step-by-step reasoning for your verdict" +} + +Verdict Rules: +- TRUE: Found specific, direct evidence CONFIRMING the claim with a cited URL +- FALSE: Found specific evidence CONTRADICTING the claim +- UNVERIFIABLE: Could not find clear confirming OR contradicting evidence + +CRITICAL: Start with search, then produce JSON only. No text outside the JSON.""" + + @classmethod + def build(cls, article_type: Optional[str] = None) -> str: + """ + Build complete system prompt from modular components. + + Args: + article_type: Optional article type for targeted guidance + ("academic", "news", "product", "blog", "policy", "opinion") + + Returns: + Complete system prompt string + """ + parts = [ + cls.CORE_ROLE, + cls.TOOLS_DESCRIPTION, + cls.WORKFLOW_STEPS, + ] + + if article_type and article_type.lower() in cls.ARTICLE_TYPE_GUIDANCE: + parts.append(cls.ARTICLE_TYPE_GUIDANCE[article_type.lower()]) + + parts.extend([ + cls.VERDICT_CRITERIA, + cls.OUTPUT_FORMAT, + cls.SELF_VERIFICATION_STEP, + cls.CRITICAL_GUIDELINES + ]) + + return "\n".join(parts) + + @classmethod + def get_article_types(cls) -> List[str]: + """Return list of supported article types.""" + return list(cls.ARTICLE_TYPE_GUIDANCE.keys()) + + +@Model.llm_register("ArticleFactChecker") +class ArticleFactChecker(BaseAgent): + """ + Article-level fact-checking agent using LangChain ReAct (Agent-First pattern). + + The agent autonomously: + 1. Extracts claims via claims_extractor + 2. Selects the best verification tool per claim type (arxiv_search / tavily_search) + 3. 
Builds evidence chains and generates a structured verification report + + Configuration Example:: + + { + "name": "ArticleFactChecker", + "config": { + "key": "your-openai-api-key", + "model": "gpt-4o-mini", + "parameters": { + "agent_config": { + "max_iterations": 10, + "tools": { + "claims_extractor": { + "api_key": "your-openai-api-key", + "max_claims": 50, + "claim_types": ["factual", "institutional", "statistical", "attribution"] + }, + "tavily_search": { + "api_key": "your-tavily-api-key", + "max_results": 5 + }, + "arxiv_search": {"max_results": 5} + } + } + } + } + } + """ + + use_agent_executor = True # Enable Agent-First mode + available_tools = [ + "claims_extractor", # Extract verifiable claims from article + "arxiv_search", # Verify academic papers and institutions + "tavily_search" # General web search verification + ] + max_iterations = 10 # Allow more iterations for comprehensive checking + max_concurrent_claims = 5 # Default parallel claim verification slots + + _required_fields = [RequiredField.CONTENT] # Article text + + _metric_info = { + "metric_name": "ArticleFactChecker", + "description": "Article-level fact checking with autonomous claims extraction and verification" + } + + # Lock to serialise ClaimsExtractor class-level config mutation across threads. + # Required because LocalExecutor may call eval() from multiple threads concurrently. + _claims_extractor_lock = threading.Lock() + + # --- Output Path and File Saving Methods --- + + @classmethod + def _get_output_dir(cls) -> Optional[str]: + """ + Get output directory for artifact files. + + Returns: + Output directory path (created if needed), or None if saving is disabled. 
+ """ + params = cls.dynamic_config.parameters or {} + agent_cfg = params.get('agent_config') or {} + + explicit_path = agent_cfg.get('output_path') + if explicit_path: + os.makedirs(explicit_path, exist_ok=True) + return explicit_path + + if agent_cfg.get('save_artifacts') is False: + return None + + base_output = agent_cfg.get('base_output_path') or 'outputs' + create_time = time.strftime("%Y%m%d_%H%M%S", time.localtime()) + auto_path = os.path.join(base_output, f"article_factcheck_{create_time}_{uuid.uuid4().hex[:6]}") + os.makedirs(auto_path, exist_ok=True) + log.debug(f"ArticleFactChecker: artifact path auto-derived: {auto_path}") + return auto_path + + @classmethod + def _save_article_content(cls, output_dir: str, content: str) -> Optional[str]: + """ + Save original article content to output directory. + + Args: + output_dir: Output directory path + content: Article markdown content + + Returns: + Path to saved file, or None on failure + """ + file_path = os.path.join(output_dir, "article_content.md") + try: + with open(file_path, 'w', encoding='utf-8') as f: + f.write(content) + log.info(f"Saved article content to {file_path}") + return file_path + except (IOError, OSError) as e: + log.error(f"Failed to save article content: {e}") + return None + + @classmethod + def _write_jsonl_file(cls, file_path: str, records: List[Dict]) -> Optional[str]: + """Write records as JSONL. 
Returns file_path on success, None on failure.""" + try: + with open(file_path, 'w', encoding='utf-8') as f: + for record in records: + f.write(json.dumps(record, ensure_ascii=False) + '\n') + return file_path + except (IOError, OSError) as e: + log.error(f"Failed to write {file_path}: {e}") + return None + + @classmethod + def _save_claims(cls, output_dir: str, claims: List[Dict]) -> Optional[str]: + """Save extracted claims to JSONL file.""" + file_path = os.path.join(output_dir, "claims_extracted.jsonl") + saved = cls._write_jsonl_file(file_path, claims) + if saved: + log.info(f"Saved {len(claims)} claims to {file_path}") + return saved + + @classmethod + def _save_verification_details(cls, output_dir: str, enriched_claims: List[Dict]) -> Optional[str]: + """Save per-claim verification details to JSONL file.""" + file_path = os.path.join(output_dir, "claims_verification.jsonl") + saved = cls._write_jsonl_file(file_path, enriched_claims) + if saved: + log.info(f"Saved {len(enriched_claims)} verification details to {file_path}") + return saved + + @classmethod + def _save_full_report(cls, output_dir: str, report_data: Dict) -> Optional[str]: + """ + Save full structured verification report to JSON file. + + Args: + output_dir: Output directory path + report_data: Complete report dictionary + + Returns: + Path to saved file, or None on failure + """ + file_path = os.path.join(output_dir, "verification_report.json") + try: + with open(file_path, 'w', encoding='utf-8') as f: + json.dump(report_data, f, ensure_ascii=False, indent=2) + log.info(f"Saved verification report to {file_path}") + return file_path + except (IOError, OSError) as e: + log.error(f"Failed to save verification report: {e}") + return None + + # --- Data Processing Methods --- + + @classmethod + def _extract_claims_from_tool_calls(cls, tool_calls: List[Dict]) -> List[Dict]: + """ + Extract claims list from tool_calls observation data. 
+ + The claims_extractor tool returns its results in the observation field + of the tool_calls list (via langchain_adapter). + + Args: + tool_calls: List of tool call dicts from AgentWrapper + + Returns: + List of claim dictionaries extracted from claims_extractor output + """ + for tc in tool_calls: + if tc.get('tool') == 'claims_extractor': + observation = tc.get('observation', '') + if not observation: + continue + try: + obs_data = json.loads(observation) + if obs_data.get('success'): + # Claims may be in data.claims (langchain_adapter wrapping) + # or directly in obs_data.claims + data_section = obs_data.get('data', obs_data) + claims = data_section.get('claims', []) + if claims: + return claims + except (json.JSONDecodeError, TypeError) as e: + log.warning(f"Failed to parse claims_extractor observation: {e}") + return [] + + @classmethod + def _extract_claims_from_detailed_findings(cls, verification_data: Dict[str, Any]) -> List[Dict]: + """ + Fallback: extract claims from agent's detailed_findings when + claims_extractor tool was not called. + + Args: + verification_data: Agent's parsed JSON output + + Returns: + List of claim dicts with source="agent_reasoning" + """ + return [ + { + "claim_id": finding.get("claim_id", ""), + "claim": finding.get("original_claim", ""), + "claim_type": finding.get("claim_type", "unknown"), + "confidence": None, + "verifiable": True, + "source": "agent_reasoning" + } + for finding in verification_data.get("detailed_findings", []) + ] + + _VERDICT_MAP = { + "TRUE": "TRUE", "FALSE": "FALSE", "UNVERIFIABLE": "UNVERIFIABLE", + "CONFIRMED": "TRUE", "ACCURATE": "TRUE", "CORRECT": "TRUE", "VERIFIED": "TRUE", + "INACCURATE": "FALSE", "INCORRECT": "FALSE", "WRONG": "FALSE", + "DISPROVEN": "FALSE", "REFUTED": "FALSE", + } + + @classmethod + def _normalize_verdict(cls, verdict: Any) -> str: + """Normalize verdict to standard values (TRUE/FALSE/UNVERIFIABLE). 
Unknown values default to UNVERIFIABLE.""" + if not verdict or not isinstance(verdict, str): + return "UNVERIFIABLE" + return cls._VERDICT_MAP.get(verdict.strip().upper(), "UNVERIFIABLE") + + # Pre-compiled regexes for Tier 3 per-field extraction in _parse_claim_json_robust. + _RE_VERDICT = re.compile(r'"verification_result"\s*:\s*"(TRUE|FALSE|UNVERIFIABLE)"', re.IGNORECASE) + _RE_EVIDENCE = re.compile(r'"evidence"\s*:\s*"((?:[^"\\]|\\.)*)"', re.DOTALL) + _RE_EVIDENCE_TRUNC = re.compile(r'"evidence"\s*:\s*"((?:[^"\\]|\\.)+)', re.DOTALL) + _RE_SOURCES = re.compile(r'"sources"\s*:\s*\[(.*?)\]', re.DOTALL) + _RE_SOURCES_TRUNC = re.compile(r'"sources"\s*:\s*\[(.*)', re.DOTALL) + _RE_REASONING = re.compile(r'"reasoning"\s*:\s*"((?:[^"\\]|\\.)*)"', re.DOTALL) + _RE_REASONING_TRUNC = re.compile(r'"reasoning"\s*:\s*"((?:[^"\\]|\\.)+)', re.DOTALL) + + # Hedging language patterns that indicate reasoning contradicts a TRUE verdict. + _HEDGING_PATTERNS = re.compile( + r"(?:" + r"not explicitly (?:stated|listed|mentioned|confirmed|found)" + r"|(?:cannot|could not|couldn't) (?:be verified|confirm|find|verify)" + r"|unable to (?:verify|confirm|find)" + r"|is(?:n't| not) explicitly" + r"|no (?:direct|explicit) evidence" + r"|insufficient evidence" + r"|not directly (?:confirmed|stated|verified)" + r"|cannot be fully verified" + r"|exact .{0,30} isn't .{0,30} stated" + r"|while .{0,40} isn't .{0,30} stated" + r"|not .{0,20} explicitly .{0,20} in (?:the )?(?:available |found )?(?:sources?|documentation|results?)" + r")", + re.IGNORECASE + ) + + @classmethod + def _check_reasoning_verdict_consistency(cls, enriched_claims: List[Dict]) -> int: + """ + Downgrade TRUE verdicts to UNVERIFIABLE when reasoning contains hedging language. + + Only affects TRUE verdicts; FALSE verdicts are never changed. 
+ + Args: + enriched_claims: List of enriched claim dicts (modified in place) + + Returns: + Number of verdicts downgraded + """ + downgraded = 0 + for claim in enriched_claims: + if claim.get("verification_result") != "TRUE": + continue + + reasoning = claim.get("reasoning", "") + if not reasoning: + continue + + match = cls._HEDGING_PATTERNS.search(reasoning) + if match: + claim["verification_result"] = "UNVERIFIABLE" + claim_id = claim.get("claim_id", "unknown") + matched_text = match.group(0) + log.info( + f"Verdict downgraded TRUE→UNVERIFIABLE for {claim_id}: " + f"hedging detected in reasoning: '{matched_text}'" + ) + downgraded += 1 + + return downgraded + + @classmethod + def _recalculate_summary(cls, enriched_claims: List[Dict]) -> Dict[str, Any]: + """ + Recalculate verification summary from actual enriched claim data. + + This ensures the summary matches the actual verdict distribution, + overriding any inconsistent self-reported summary from the agent. + + Args: + enriched_claims: List of enriched claim dicts with normalized verdicts + + Returns: + Summary dict with total_claims, verified_claims, false_claims, + unverifiable_claims, and accuracy_score + """ + total = len(enriched_claims) + true_count = sum(1 for c in enriched_claims if c.get("verification_result") == "TRUE") + false_count = sum(1 for c in enriched_claims if c.get("verification_result") == "FALSE") + unverifiable_count = sum(1 for c in enriched_claims if c.get("verification_result") == "UNVERIFIABLE") + accuracy = true_count / total if total > 0 else 0.0 + return { + "total_claims": total, + "verified_claims": true_count, + "false_claims": false_count, + "unverifiable_claims": unverifiable_count, + "accuracy_score": round(accuracy, 4) + } + + @classmethod + def _build_per_claim_verification( + cls, + verification_data: Dict[str, Any], + extracted_claims: List[Dict], + tool_calls: List[Dict] + ) -> List[Dict]: + """ + Merge verification_data, extracted_claims, and tool_calls into + 
per-claim verification records. + + Data sources: + - detailed_findings: verification result, evidence, sources, reasoning + - extracted_claims: claim_type, confidence, verifiable, context + - tool_calls: search queries and tool usage details + + Args: + verification_data: Agent's parsed JSON output + extracted_claims: Claims from claims_extractor tool + tool_calls: Complete tool call list from agent + + Returns: + List of enriched per-claim verification records + """ + detailed_findings = verification_data.get("detailed_findings", []) + + # Build lookup from extracted claims by claim_id + claims_by_id: Dict[str, Dict] = {} + for claim in extracted_claims: + cid = claim.get('claim_id', '') + if cid: + claims_by_id[cid] = claim + + enriched_claims: List[Dict] = [] + for finding in detailed_findings: + claim_id = finding.get('claim_id', '') + extracted = claims_by_id.get(claim_id, {}) + + enriched = { + "claim_id": claim_id, + "original_claim": finding.get('original_claim', extracted.get('claim', '')), + "claim_type": finding.get('claim_type', extracted.get('claim_type', 'unknown')), + "confidence": extracted.get('confidence'), + "verification_result": finding.get('verification_result', 'UNVERIFIABLE'), + "evidence": finding.get('evidence', ''), + "sources": finding.get('sources', []), + "verification_method": finding.get('verification_method', ''), + "search_queries_used": finding.get('search_queries_used', []), + "reasoning": finding.get('reasoning', ''), + } + + enriched_claims.append(enriched) + + # If no detailed_findings but we have extracted claims, create placeholder records + if not enriched_claims and extracted_claims: + for claim in extracted_claims: + enriched_claims.append({ + "claim_id": claim.get('claim_id', ''), + "original_claim": claim.get('claim', ''), + "claim_type": claim.get('claim_type', 'unknown'), + "confidence": claim.get('confidence'), + "verification_result": "UNVERIFIABLE", + "evidence": "", + "sources": [], + "verification_method": "", + 
"search_queries_used": [], + "reasoning": "No verification data available from agent", + }) + + return enriched_claims + + @classmethod + def _build_structured_report( + cls, + verification_data: Dict[str, Any], + extracted_claims: List[Dict], + enriched_claims: List[Dict], + tool_calls: List[Dict], + reasoning_steps: int, + content_length: int, + execution_time: float, + claims_source: str = "claims_extractor_tool" + ) -> Dict[str, Any]: + """ + Build a complete structured verification report. + + Args: + verification_data: Agent's parsed JSON output + extracted_claims: Claims from claims_extractor or fallback + enriched_claims: Merged per-claim verification records + tool_calls: Complete tool call list + reasoning_steps: Number of reasoning steps + content_length: Length of original article content + execution_time: Total execution time in seconds + claims_source: Where claims came from ("claims_extractor_tool" or "agent_reasoning") + + Returns: + Complete structured report dictionary + """ + summary = verification_data.get("article_verification_summary", {}) + + # Claims extraction stats + claim_types_dist: Dict[str, int] = {} + verifiable_count = 0 + for claim in extracted_claims: + ct = claim.get('claim_type', 'unknown') + claim_types_dist[ct] = claim_types_dist.get(ct, 0) + 1 + if claim.get('verifiable', True): + verifiable_count += 1 + + report = { + "report_version": "2.0", + "generated_at": datetime.now().isoformat(timespec='seconds'), + "article_info": { + "content_source": "markdown", + "content_length": content_length + }, + "claims_extraction": { + "total_extracted": len(extracted_claims), + "claims_source": claims_source, + "verifiable": verifiable_count, + "claim_types_distribution": claim_types_dist + }, + "verification_summary": { + "total_verified": summary.get("verified_claims", 0) + summary.get("false_claims", 0), + "verified_true": summary.get("verified_claims", 0), + "verified_false": summary.get("false_claims", 0), + "unverifiable": 
summary.get("unverifiable_claims", 0), + "accuracy_score": summary.get("accuracy_score", 0.0) + }, + "detailed_findings": enriched_claims, + "false_claims_comparison": verification_data.get("false_claims_comparison", []), + "agent_metadata": { + "model": getattr(cls.dynamic_config, 'model', 'unknown'), + "tool_calls_count": len(tool_calls), + "reasoning_steps": reasoning_steps, + "execution_time_seconds": round(execution_time, 2) + } + } + + return report + + # --- Overridden Core Methods --- + + @classmethod + def eval(cls, input_data: Data) -> EvalDetail: + """ + Two-phase async fact-checking with parallel claim verification. + + Phase 1: Extract claims via ClaimsExtractor (direct call, ~30s) + Phase 2: Verify each claim with a focused mini-agent using asyncio.gather + with Semaphore(max_concurrent_claims) to limit concurrency (~80-120s) + + This replaces the old single-agent sequential approach (~669s for 15 claims). + + Temperature defaults to 0 for deterministic tool selection and + consistent verification results. Users can override via config. + + Args: + input_data: Data object with article content + + Returns: + EvalDetail with comprehensive verification report + """ + start_time = time.time() + output_dir = cls._get_output_dir() + + if cls.dynamic_config: + if cls.dynamic_config.parameters is None: + cls.dynamic_config.parameters = {} + cls.dynamic_config.parameters.setdefault("temperature", 0) + + if output_dir and input_data.content: + cls._save_article_content(output_dir, input_data.content) + + try: + return asyncio.run(cls._async_eval(input_data, start_time, output_dir)) + except RuntimeError as e: + # Fallback when called inside an already-running event loop (e.g. 
@classmethod
async def _async_eval(
    cls, input_data: Data, start_time: float, output_dir: Optional[str]
) -> EvalDetail:
    """
    Async two-phase orchestrator for parallel claim verification.

    Phase 1: Extract claims directly via ClaimsExtractor tool (~30s).
    Phase 2: Verify claims concurrently with asyncio.gather and Semaphore.
    """
    # --- Phase 1: direct claim extraction (no agent overhead) ---
    print("[ArticleFactChecker] Phase 1: Extracting claims from article...", flush=True)
    claims = await cls._async_extract_claims(input_data)
    if not claims:
        return cls._create_error_result("No claims extracted from article")

    print(f"[ArticleFactChecker] Phase 1 done: {len(claims)} claims extracted", flush=True)
    if output_dir:
        cls._save_claims(output_dir, claims)

    # --- Phase 2: semaphore-bounded parallel verification ---
    max_concurrent = cls._get_max_concurrent_claims()
    semaphore = asyncio.Semaphore(max_concurrent)
    total = len(claims)
    print(
        f"[ArticleFactChecker] Phase 2: Verifying {total} claims "
        f"(max {max_concurrent} concurrent)...",
        flush=True
    )
    log.info(f"ArticleFactChecker: verifying {total} claims with max_concurrent={max_concurrent}")

    # Create the LLM and tool set once, up front, so concurrent tasks never
    # mutate shared configuration.
    llm = cls.get_langchain_llm()
    lc_tools = cls.get_langchain_tools()
    search_tools = [t for t in lc_tools if t.name in ('tavily_search', 'arxiv_search')]

    # Mutable progress cell — safe because asyncio runs callbacks on one thread.
    done_count = [0]

    async def _verify_with_progress(claim):
        claim_id = claim.get('claim_id', '')
        try:
            result = await cls._async_verify_single_claim(claim, semaphore, llm, search_tools)
        except Exception as exc:
            done_count[0] += 1
            print(f"[ArticleFactChecker] [{done_count[0]}/{total}] {claim_id} → ERROR", flush=True)
            return exc
        done_count[0] += 1
        # Derive a short verdict string for the progress line.
        if not isinstance(result, dict) or not result.get('success'):
            verdict = 'ERROR'
        else:
            out = (result.get('agent_result') or {}).get('output') or ''
            m = cls._RE_VERDICT.search(out)
            verdict = m.group(1) if m else '?'
        print(f"[ArticleFactChecker] [{done_count[0]}/{total}] {claim_id} → {verdict}", flush=True)
        return result

    verification_results = await asyncio.gather(
        *(_verify_with_progress(claim) for claim in claims),
        return_exceptions=True
    )

    elapsed = time.time() - start_time
    print(
        f"[ArticleFactChecker] Phase 2 done: {total}/{total} claims verified "
        f"({elapsed:.1f}s elapsed)",
        flush=True
    )
    return cls._aggregate_parallel_results(
        input_data, claims, verification_results, start_time, output_dir
    )
@classmethod
async def _async_extract_claims(cls, input_data: Data) -> List[Dict]:
    """
    Phase 1: Extract claims by calling ClaimsExtractor directly.

    The synchronous ClaimsExtractor.execute() runs in a thread executor so
    the event loop is not blocked while the extractor works.

    Returns:
        List of claim dicts with claim_id, claim, claim_type, etc.; empty
        list when extraction fails.
    """
    from dingo.model.llm.agent.tools.claims_extractor import ClaimsExtractor, ClaimsExtractorConfig

    params = cls.dynamic_config.parameters or {}
    tool_cfg = (params.get('agent_config') or {}).get('tools', {}).get('claims_extractor', {})

    config_kwargs: Dict[str, Any] = {
        'model': cls.dynamic_config.model or "gpt-4o-mini",
        'api_key': tool_cfg.get('api_key') or cls.dynamic_config.key,
        'max_claims': tool_cfg.get('max_claims', 50),
    }
    endpoint = tool_cfg.get('base_url') or getattr(cls.dynamic_config, 'api_url', None)
    if endpoint:
        config_kwargs['base_url'] = endpoint
    wanted_types = tool_cfg.get('claim_types')
    if wanted_types:
        config_kwargs['claim_types'] = wanted_types

    article = input_data.content or ''
    loop = asyncio.get_running_loop()
    # NOTE(review): the lock is held across the await — this serializes
    # extractions because ClaimsExtractor.config is class-global state.
    with cls._claims_extractor_lock:
        ClaimsExtractor.config = ClaimsExtractorConfig(**config_kwargs)
        result = await loop.run_in_executor(None, ClaimsExtractor.execute, article)

    if result.get('success'):
        payload = result.get('data', result)
        return payload.get('claims', [])

    log.warning(f"ClaimsExtractor failed: {result.get('error', 'unknown')}")
    return []
@classmethod
async def _async_verify_single_claim(
    cls,
    claim: Dict,
    semaphore: asyncio.Semaphore,
    llm: Any,
    search_tools: List,
) -> Dict:
    """
    Phase 2: Verify one claim with a focused mini-agent.

    The semaphore caps concurrent API calls to avoid rate limiting. Each
    mini-agent handles exactly one claim with a simplified prompt and is
    expected to return structured JSON verification output.

    Args:
        claim: Claim dict from ClaimsExtractor (claim_id, claim, claim_type)
        semaphore: Asyncio semaphore for concurrency control
        llm: Pre-created LangChain LLM instance (shared)
        search_tools: Pre-configured search tools (tavily_search / arxiv_search)

    Returns:
        Dict with claim, agent_result, success keys
    """
    from dingo.model.llm.agent.agent_wrapper import AgentWrapper

    async with semaphore:
        claim_id = claim.get('claim_id', 'unknown')
        claim_text = claim.get('claim', '')
        claim_type = claim.get('claim_type', 'factual')
        preview = (claim_text or '')[:60]
        print(f"[ArticleFactChecker] → {claim_id} ({claim_type}): {preview}", flush=True)

        try:
            mini_agent = AgentWrapper.create_agent(
                llm=llm,
                tools=search_tools,
                system_prompt=PromptTemplates.PER_CLAIM_VERIFICATION_PROMPT
            )
            prompt = (
                f"Claim ID: {claim_id}\n"
                f"Claim Type: {claim_type}\n"
                f"Claim to verify: {claim_text}"
            )
            # Give each mini-agent at least 5 iterations regardless of config.
            iterations = max(cls.get_max_iterations(), 5)
            agent_result = await AgentWrapper.async_invoke_and_format(
                mini_agent,
                input_text=prompt,
                max_iterations=iterations
            )
            log.debug(f"Verified {claim_id}: success={agent_result.get('success')}")
            return {"claim": claim, "agent_result": agent_result, "success": True}
        except Exception as e:
            log.error(f"Failed to verify {claim_id}: {e}")
            return {
                "claim": claim,
                "agent_result": {"output": "", "success": False, "error": str(e)},
                "success": False
            }

@classmethod
def _get_max_concurrent_claims(cls) -> int:
    """Read max_concurrent_claims from agent_config or fall back to the class default."""
    params = cls.dynamic_config.parameters or {}
    agent_cfg = params.get('agent_config') or {}
    return agent_cfg.get('max_concurrent_claims', cls.max_concurrent_claims)
@classmethod
def _parse_claim_json_robust(cls, output: Optional[str]) -> Dict[str, Any]:
    """
    Robustly parse claim verification JSON from LLM output.

    Three-tier parsing strategy:
    1. Regex match for a complete *flat* JSON object containing
       ``"verification_result"`` (cannot match nested ``{}``).
    2. Truncated-JSON repair: strip markdown fences, append missing
       closing characters, then ``json.loads``.
    3. Per-field regex extraction as last resort (includes fallback
       patterns for truncated string values).

    Args:
        output: Raw string returned by the per-claim mini-agent, or None.

    Returns:
        Dict with as many fields as could be recovered; empty dict on
        total failure.
    """
    if not output or not isinstance(output, str):
        return {}

    # Tier 1: a self-contained flat JSON object with the verdict key.
    try:
        flat = re.search(r'\{[^{}]*"verification_result"[^{}]*\}', output, re.DOTALL)
        if flat:
            return json.loads(flat.group(0))
    except (json.JSONDecodeError, AttributeError):
        pass

    # Tier 2: repair truncated JSON (common when the model hits a token cap).
    try:
        text = output.strip()
        text = re.sub(r'^```(?:json)?\s*', '', text)
        text = re.sub(r'\s*```\s*$', '', text)
        text = text.strip()

        first_brace = text.find('{')
        if first_brace != -1:
            fragment = text[first_brace:]
            # Try closing an unterminated string/array in several ways,
            # then balance any remaining brackets and braces.
            for suffix in ('', '"', '"}', '"]', '"]}', '"}]'):
                patched = fragment + suffix
                missing_braces = patched.count('{') - patched.count('}')
                missing_brackets = patched.count('[') - patched.count(']')
                closing = ']' * max(0, missing_brackets) + '}' * max(0, missing_braces)
                try:
                    candidate = json.loads(patched + closing)
                except (json.JSONDecodeError, ValueError):
                    continue
                if isinstance(candidate, dict) and 'verification_result' in candidate:
                    return candidate
    except Exception:
        pass

    # Tier 3: scrape individual fields with the class-level regexes.
    recovered: Dict[str, Any] = {}
    try:
        verdict_m = cls._RE_VERDICT.search(output)
        if verdict_m:
            recovered['verification_result'] = verdict_m.group(1).upper()

        evidence_m = cls._RE_EVIDENCE.search(output) or cls._RE_EVIDENCE_TRUNC.search(output)
        if evidence_m:
            recovered['evidence'] = evidence_m.group(1).replace('\\"', '"').replace('\\n', '\n')

        sources_m = cls._RE_SOURCES.search(output) or cls._RE_SOURCES_TRUNC.search(output)
        if sources_m:
            raw_sources = sources_m.group(1)
            recovered['sources'] = [
                s.strip().strip('"') for s in raw_sources.split(',')
                if s.strip().strip('"')
            ]

        reasoning_m = cls._RE_REASONING.search(output) or cls._RE_REASONING_TRUNC.search(output)
        if reasoning_m:
            recovered['reasoning'] = reasoning_m.group(1).replace('\\"', '"').replace('\\n', '\n')
    except Exception:
        pass

    return recovered
@classmethod
def _parse_single_claim_result(cls, claim: Dict, agent_result: Dict) -> Dict:
    """
    Parse mini-agent JSON output into an enriched claim verification record.

    Extracts the JSON block from agent output when possible; falls back to
    metadata derived from tool_calls when parsing fails.

    Args:
        claim: Original claim dict from ClaimsExtractor
        agent_result: Result dict from AgentWrapper.async_invoke_and_format

    Returns:
        Enriched claim dict compatible with the existing report structure
    """
    output = agent_result.get('output', '')
    tool_calls = agent_result.get('tool_calls', [])

    parsed = cls._parse_claim_json_robust(output)

    queries = [
        tc.get('args', {}).get('query', '')
        for tc in tool_calls
        if tc.get('args', {}).get('query')
    ]
    tools_seen = list({tc.get('tool', '') for tc in tool_calls if tc.get('tool')})

    # Prefer the agent's self-reported method; otherwise infer from tools used.
    if parsed.get('verification_method'):
        method = parsed['verification_method']
    elif len(tools_seen) > 1:
        method = 'combined'
    elif tools_seen:
        method = tools_seen[0]
    else:
        method = 'no_search'

    return {
        "claim_id": claim.get('claim_id', ''),
        "original_claim": claim.get('claim', ''),
        "claim_type": claim.get('claim_type', 'unknown'),
        "confidence": claim.get('confidence'),
        "verification_result": cls._normalize_verdict(
            parsed.get('verification_result', 'UNVERIFIABLE')
        ),
        "evidence": parsed.get('evidence', ''),
        "sources": parsed.get('sources', []),
        "verification_method": method,
        "search_queries_used": parsed.get('search_queries_used', queries),
        # Keep a raw-output excerpt as reasoning when the agent gave none.
        "reasoning": parsed.get('reasoning', output[:500] if output else ''),
    }

@classmethod
def _build_unverifiable_claim_record(cls, claim: Dict, error_msg: str) -> Dict:
    """Build a fallback UNVERIFIABLE record when claim verification fails."""
    return {
        "claim_id": claim.get('claim_id', ''),
        "original_claim": claim.get('claim', ''),
        "claim_type": claim.get('claim_type', 'unknown'),
        "confidence": None,
        "verification_result": "UNVERIFIABLE",
        "evidence": "",
        "sources": [],
        "verification_method": "error",
        "search_queries_used": [],
        "reasoning": f"Verification failed: {error_msg}",
    }
@classmethod
def _aggregate_parallel_results(
    cls,
    input_data: Data,
    claims: List[Dict],
    verification_results: List[Any],
    start_time: float,
    output_dir: Optional[str],
) -> EvalDetail:
    """
    Aggregate parallel verification results into a final EvalDetail.

    Merges per-claim mini-agent outputs, applies reasoning-verdict
    consistency checks, recalculates the summary, and produces the same
    structured report format as the sequential path.

    Args:
        input_data: Original article Data object
        claims: Extracted claims from Phase 1
        verification_results: Results from asyncio.gather (may contain
            Exception objects because return_exceptions=True)
        start_time: Wall-clock start time for execution_time calculation
        output_dir: Optional path to save artifacts

    Returns:
        EvalDetail with full verification report
    """
    execution_time = time.time() - start_time
    enriched_claims: List[Dict] = []
    all_tool_calls: List[Dict] = []
    total_reasoning_steps = 0

    for claim, outcome in zip(claims, verification_results):
        if isinstance(outcome, Exception):
            record = cls._build_unverifiable_claim_record(claim, str(outcome))
        elif not outcome.get('success', False):
            err = outcome.get('agent_result', {}).get('error', 'unknown error')
            record = cls._build_unverifiable_claim_record(claim, err)
        else:
            agent_result = outcome.get('agent_result', {})
            record = cls._parse_single_claim_result(claim, agent_result)
            all_tool_calls.extend(agent_result.get('tool_calls', []))
            total_reasoning_steps += agent_result.get('reasoning_steps', 0)
        enriched_claims.append(record)

    # Downgrade TRUE verdicts whose reasoning contains hedging language.
    downgraded = cls._check_reasoning_verdict_consistency(enriched_claims)
    if downgraded:
        log.info(f"Consistency check: downgraded {downgraded} TRUE→UNVERIFIABLE")

    summary = cls._recalculate_summary(enriched_claims)

    # Shape the data the way _build_structured_report() expects.
    verification_data: Dict[str, Any] = {
        "article_verification_summary": {
            "article_type": "unknown",
            **summary
        },
        "detailed_findings": enriched_claims,
        "false_claims_comparison": [
            {
                "article_claimed": c["original_claim"],
                "evidence": c.get("evidence", ""),
            }
            for c in enriched_claims
            if c.get("verification_result") == "FALSE"
        ],
    }

    report = cls._build_structured_report(
        verification_data=verification_data,
        extracted_claims=claims,
        enriched_claims=enriched_claims,
        tool_calls=all_tool_calls,
        reasoning_steps=total_reasoning_steps,
        content_length=len(input_data.content or ''),
        execution_time=execution_time,
        claims_source="claims_extractor_direct_async",
    )

    if output_dir:
        cls._save_verification_details(output_dir, enriched_claims)
        cls._save_full_report(output_dir, report)

    # Same EvalDetail structure as the legacy aggregate_results() path.
    return cls._build_eval_detail_from_verification(
        verification_data,
        all_tool_calls,
        total_reasoning_steps,
        report=report,
    )
@classmethod
def _format_agent_input(cls, input_data: Data) -> str:
    """
    Format article content for the agent.

    Args:
        input_data: Data object with content (article text)

    Returns:
        Formatted input string with task instructions
    """
    body = input_data.content

    return f"""Please fact-check the following article comprehensively:

===== ARTICLE START =====
{body}
===== ARTICLE END =====

Your Task:
0. First, analyze the article type (academic/news/product/blog/policy) to guide your verification strategy
1. Extract ALL verifiable claims from this article using claims_extractor tool
2. Verify each claim using autonomous tool selection based on claim type and article context
3. Generate a comprehensive verification report

Begin your systematic fact-checking process now.
"""

@classmethod
def _get_system_prompt(cls, input_data: Data) -> str:
    """Build the system prompt, optionally tailored to the article type."""
    article_type = getattr(input_data, 'article_type', None)
    return PromptTemplates.build(article_type=article_type)
@classmethod
def aggregate_results(cls, input_data: Data, results: List[Any]) -> EvalDetail:
    """
    Parse agent output into a structured EvalDetail report.

    Steps:
    1. Parse the agent's JSON output
    2. Extract claims from tool_calls
    3. Build per-claim verification records
    4. Generate the structured report
    5. Save artifacts to the output directory (when one is configured)
    6. Return EvalDetail with a dual-layer reason (text + structured data)

    Args:
        input_data: Original article data
        results: List containing the agent execution result dictionary

    Returns:
        EvalDetail with comprehensive verification report
    """
    if not results:
        return cls._create_error_result("No results from agent")

    agent_result = results[0]

    # --- Execution-error handling ---
    if not agent_result.get('success', True):
        error_msg = agent_result.get('error', 'Unknown error')
        lowered = error_msg.lower()

        if "recursion limit" in lowered:
            # Custom EvalDetail for recursion-limit failures.
            limit_match = re.search(r'recursion limit of (\d+)', lowered)
            limit = int(limit_match.group(1)) if limit_match else 25

            result = EvalDetail(metric=cls.__name__)
            result.status = True  # True indicates an issue/error
            result.label = [f"{QualityLabel.QUALITY_BAD_PREFIX}AGENT_RECURSION_LIMIT"]
            result.reason = [
                "Article Fact-Checking Failed: Recursion Limit Exceeded",
                "=" * 70,
                f"Agent reached maximum iteration limit ({limit} iterations).",
                "",
                "The article may be too long or contain too many claims to verify.",
                "",
                "Recommendations:",
                f" 1. Increase max_iterations to {limit + 20} in agent_config",
                " 2. Reduce max_claims from 50 to 20-30 in claims_extractor",
                " 3. Use a shorter article or split into sections",
                "",
                "See detailed execution trace in ERROR logs above."
            ]
            return result

        if "timed out" in lowered or "timeout" in lowered:
            # Custom EvalDetail for timeouts.
            result = EvalDetail(metric=cls.__name__)
            result.status = True
            result.label = [f"{QualityLabel.QUALITY_BAD_PREFIX}AGENT_TIMEOUT"]
            result.reason = [
                "Article Fact-Checking Failed: Request Timeout",
                "=" * 70,
                "Request timed out during fact-checking.",
                "",
                "Possible causes:",
                " - LLM API is responding slowly",
                " - Article is too long to process",
                " - Network connectivity issues",
                "",
                "Recommendations:",
                " 1. Switch to faster model (e.g., gpt-4o-mini instead of deepseek-chat)",
                " 2. Reduce article length (try shorter articles first)",
                " 3. Reduce max_claims in claims_extractor (from 50 to 20-30)",
                " 4. Check API response time and network connection",
                "",
                "See detailed execution trace in ERROR logs above (if available)."
            ]
            return result

        # Any other failure uses the default error template.
        return cls._create_error_result(error_msg)

    # --- Extract and validate agent output ---
    output = agent_result.get('output', '')
    tool_calls = agent_result.get('tool_calls', [])
    reasoning_steps = agent_result.get('reasoning_steps', 0)

    if not output or not output.strip():
        return cls._create_error_result(
            "Agent returned empty output. "
            "This may indicate the agent reached max_iterations without completing."
        )

    try:
        verification_data = cls._parse_verification_output(output)
    except Exception as e:
        return cls._create_error_result(
            f"Failed to parse agent output: {str(e)}\nOutput: {output[:300]}..."
        )

    # --- Claims extraction with fallback to agent reasoning ---
    extracted_claims = cls._extract_claims_from_tool_calls(tool_calls)
    claims_source = "claims_extractor_tool"
    if not extracted_claims:
        extracted_claims = cls._extract_claims_from_detailed_findings(verification_data)
        claims_source = "agent_reasoning"
        if extracted_claims:
            log.info(f"Claims from agent reasoning (fallback): {len(extracted_claims)}")

    enriched_claims = cls._build_per_claim_verification(
        verification_data, extracted_claims, tool_calls
    )

    # Normalize verdicts to the standard TRUE/FALSE/UNVERIFIABLE set.
    for claim in enriched_claims:
        claim["verification_result"] = cls._normalize_verdict(claim.get("verification_result", ""))

    # Code-level consistency check: hedged reasoning contradicting TRUE verdicts.
    downgraded = cls._check_reasoning_verdict_consistency(enriched_claims)
    if downgraded:
        log.info(f"Reasoning-verdict consistency check: {downgraded} verdict(s) downgraded")

    # Recalculate the summary from real data, overriding the agent's own numbers.
    if enriched_claims:
        recalculated = cls._recalculate_summary(enriched_claims)
        original_summary = verification_data.get("article_verification_summary", {})
        verification_data["article_verification_summary"] = {
            "article_type": original_summary.get("article_type", "unknown"),
            **recalculated
        }

    # Note: this legacy path is only reached if someone calls aggregate_results()
    # directly (bypassing the overridden eval()). Timing metadata is unavailable
    # here; use the async eval() path for accurate execution_time and artifact saving.
    execution_time = 0.0
    content_length = len(getattr(input_data, 'content', '') or '')
    output_dir = None

    report = cls._build_structured_report(
        verification_data=verification_data,
        extracted_claims=extracted_claims,
        enriched_claims=enriched_claims,
        tool_calls=tool_calls,
        reasoning_steps=reasoning_steps,
        content_length=content_length,
        execution_time=execution_time,
        claims_source=claims_source
    )

    if output_dir:
        try:
            if extracted_claims:
                cls._save_claims(output_dir, extracted_claims)
            if enriched_claims:
                cls._save_verification_details(output_dir, enriched_claims)
            cls._save_full_report(output_dir, report)
        except Exception as e:
            log.warning(f"Failed to save some output artifacts: {e}")

    return cls._build_eval_detail_from_verification(
        verification_data,
        tool_calls,
        reasoning_steps,
        report=report
    )
@classmethod
def _parse_verification_output(cls, output: str) -> Dict[str, Any]:
    """
    Parse agent output to extract verification data.

    Supports multiple formats with enhanced fallback parsing:
    1. JSON in code block (```json ... ```)
    2. JSON in generic code block (``` ... ```)
    3. Raw JSON object
    4. Partial JSON extraction
    5. Text analysis fallback with pattern matching

    Args:
        output: Agent's text output

    Returns:
        Parsed verification data dictionary

    Note:
        Never raises - always returns a valid structure with raw_output for debugging
    """
    # Strategy 1: Extract JSON from ```json code block
    json_match = re.search(
        r'```json\s*(\{.*?\})\s*```',
        output,
        re.DOTALL | re.IGNORECASE
    )
    if json_match:
        try:
            return json.loads(json_match.group(1))
        except json.JSONDecodeError as e:
            log.debug(f"Failed to parse ```json block: {e}")

    # Strategy 2: Extract JSON from generic ``` code block
    generic_block_match = re.search(r'```\s*(\{.*?\})\s*```', output, re.DOTALL)
    if generic_block_match:
        try:
            return json.loads(generic_block_match.group(1))
        except json.JSONDecodeError as e:
            log.debug(f"Failed to parse generic code block: {e}")

    # Strategy 3: Try direct JSON parsing (entire output is JSON)
    try:
        return json.loads(output.strip())
    except json.JSONDecodeError:
        pass

    # Strategy 4: Find a JSON object with the expected summary key anywhere in text
    json_object_match = re.search(
        r'(\{[^{}]*"article_verification_summary"[^{}]*\{[^{}]*\}[^{}]*\})',
        output,
        re.DOTALL
    )
    if json_object_match:
        try:
            return json.loads(json_object_match.group(1))
        except json.JSONDecodeError:
            pass

    # Strategy 5: Try every balanced { } block, largest first
    brace_positions = []
    depth = 0
    start_pos = None
    for i, char in enumerate(output):
        if char == '{':
            if depth == 0:
                start_pos = i
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0 and start_pos is not None:
                brace_positions.append((start_pos, i + 1))
                start_pos = None

    for start, end in sorted(brace_positions, key=lambda x: x[1] - x[0], reverse=True):
        try:
            parsed = json.loads(output[start:end])
            if isinstance(parsed, dict) and ("article_verification_summary" in parsed or "total_claims" in parsed):
                return parsed
        except json.JSONDecodeError:
            continue

    # Strategy 6: Enhanced text analysis fallback
    log.warning("Failed to parse as JSON, creating fallback structure from text analysis")

    patterns = {
        'total': [
            r'total[_\s]*claims?[:\s]*(\d+)',
            r'"total_claims"[:\s]*(\d+)',
            r'(\d+)\s*(?:total\s+)?claims?\s+(?:analyzed|extracted|found)',
        ],
        'false': [
            r'false[_\s]*claims?[:\s]*(\d+)',
            r'"false_claims"[:\s]*(\d+)',
            r'(\d+)\s*(?:false|incorrect|inaccurate)\s+claims?',
        ],
        'verified': [
            r'verified[_\s]*claims?[:\s]*(\d+)',
            r'"verified_claims"[:\s]*(\d+)',
            r'(\d+)\s*(?:verified|true|accurate)\s+claims?',
        ],
        'unverifiable': [
            r'unverifiable[_\s]*claims?[:\s]*(\d+)',
            r'"unverifiable_claims"[:\s]*(\d+)',
            r'(\d+)\s*(?:unverifiable|unknown|unclear)\s+claims?',
        ],
        'accuracy': [
            r'accuracy[_\s]*(?:score)?[:\s]*([\d.]+)',
            r'"accuracy_score"[:\s]*([\d.]+)',
            r'overall\s+accuracy[:\s]*([\d.]+)',
        ],
        'article_type': [
            r'"article_type"[:\s]*"(\w+)"',
            r'article\s+type[:\s]*(\w+)',
        ]
    }

    def extract_first_match(pattern_list: List[str], default=None):
        for pattern in pattern_list:
            match = re.search(pattern, output, re.IGNORECASE)
            if match:
                return match.group(1)
        return default

    total = int(extract_first_match(patterns['total'], '0'))
    false = int(extract_first_match(patterns['false'], '0'))
    # BUGFIX: the original used `extract_first_match(..., '0') or (total - false)`
    # — the default '0' is a truthy string, so the intended `total - false`
    # fallback was unreachable and verified silently defaulted to 0.
    verified_raw = extract_first_match(patterns['verified'])
    verified = int(verified_raw) if verified_raw is not None else total - false
    unverifiable = int(extract_first_match(patterns['unverifiable'], '0'))
    # BUGFIX: likewise, defaulting accuracy_str to '0' meant float('0') always
    # succeeded and the verified/total ratio fallback below never fired.
    accuracy_str = extract_first_match(patterns['accuracy'])
    article_type = extract_first_match(patterns['article_type'], 'unknown')

    # Parse accuracy (handle both 0.95 and 95% formats); on a missing or
    # malformed value, fall back to the verified/total ratio.
    try:
        accuracy = float(accuracy_str)
        if accuracy > 1.0:  # Likely percentage format
            accuracy = accuracy / 100.0
    except (ValueError, TypeError):
        accuracy = verified / total if total > 0 else 0.0

    # Extract false claims details if present
    false_claims_comparison = []
    claim_pattern = r'(?:claim|error|false)[:\s]*["\']?([^"\']+)["\']?\s*(?:→|->|:)\s*["\']?([^"\']+)["\']?'
    claim_matches = re.findall(claim_pattern, output, re.IGNORECASE)
    for claimed, truth in claim_matches[:5]:  # Limit to 5 claims
        false_claims_comparison.append({
            "article_claimed": claimed.strip(),
            "actual_truth": truth.strip(),
        })

    return {
        "article_verification_summary": {
            "article_type": article_type,
            "total_claims": total,
            "verified_claims": verified,
            "false_claims": false,
            "unverifiable_claims": unverifiable,
            "accuracy_score": accuracy
        },
        "false_claims_comparison": false_claims_comparison,
        "raw_output": output,  # Include raw output for debugging
        "parse_method": "text_analysis_fallback"
    }
@classmethod
def _build_eval_detail_from_verification(
    cls,
    verification_data: Dict[str, Any],
    tool_calls: List,
    reasoning_steps: int,
    report: Optional[Dict[str, Any]] = None
) -> EvalDetail:
    """
    Build EvalDetail from parsed verification data with a dual-layer reason.

    reason[0] is a human-readable text summary string.
    reason[1] is the full structured report dict (JSON-serializable).

    Args:
        verification_data: Parsed verification results
        tool_calls: List of tool calls made by agent
        reasoning_steps: Number of reasoning steps taken
        report: Optional structured report dict from _build_structured_report

    Returns:
        EvalDetail with comprehensive report
    """
    summary = verification_data.get("article_verification_summary", {})
    total = summary.get("total_claims", 0)
    false_count = summary.get("false_claims", 0)
    unverifiable_count = summary.get("unverifiable_claims", 0)
    verified = summary.get("verified_claims", 0)
    accuracy = summary.get("accuracy_score", 0.0)

    # Binary status aligned with Dingo's evaluation model:
    # TRUE claims → good; FALSE / UNVERIFIABLE claims → bad (issue detected).
    # Unverifiable claims indicate sourcing deficiencies, which is a data
    # quality problem (consistent with journalism standards).
    result = EvalDetail(metric=cls.__name__)
    result.status = (false_count + unverifiable_count) > 0
    result.score = accuracy
    if false_count > 0:
        result.label = [f"{QualityLabel.QUALITY_BAD_PREFIX}ARTICLE_FACTUAL_ERROR"]
    elif unverifiable_count > 0:
        result.label = [f"{QualityLabel.QUALITY_BAD_PREFIX}ARTICLE_UNVERIFIED_CLAIMS"]
    else:
        result.label = [QualityLabel.QUALITY_GOOD]

    # --- Human-readable text summary ---
    lines = [
        "Article Fact-Checking Report",
        "=" * 70,
        f"Total Claims Analyzed: {total}",
        f"Verified Claims: {verified}",
        f"False Claims: {false_count}",
        f"Unverifiable Claims: {unverifiable_count}",
        f"Overall Accuracy: {accuracy:.1%}",
        "",
        "Agent Performance:",
        f" Tool Calls: {len(tool_calls)}",
        f" Reasoning Steps: {reasoning_steps}",
        ""
    ]

    # False-claims comparison table.
    false_claims = verification_data.get("false_claims_comparison", [])
    if false_claims:
        lines.append("FALSE CLAIMS DETAILED COMPARISON:")
        lines.append("=" * 70)
        for i, fc in enumerate(false_claims, 1):
            lines.extend([
                f"\n#{i} FALSE CLAIM",
                " Article Claimed:",
                f" {fc.get('article_claimed', 'N/A')}",
                " Actual Truth:",
                f" {fc.get('actual_truth', 'N/A')}",
                " Evidence:",
                f" {fc.get('evidence', 'N/A')}",
            ])

    # Per-verdict tally over detailed findings.
    detailed = verification_data.get("detailed_findings", [])
    if detailed:
        lines.append("\n\nALL CLAIMS VERIFICATION SUMMARY:")
        lines.append("=" * 70)
        result_counts = Counter(f.get("verification_result", "UNKNOWN") for f in detailed)
        for result_type, count in result_counts.items():
            lines.append(f" {result_type}: {count} claims")

        # Show sample false claims (only when there are 5 or fewer).
        false_findings = [f for f in detailed if f.get("verification_result") == "FALSE"]
        if false_findings and len(false_findings) <= 5:
            lines.append("\n False Claims Details:")
            for finding in false_findings[:5]:
                lines.append(
                    f" - {finding.get('claim_id')}: {finding.get('original_claim', '')[:80]}..."
                )

    # Raw output excerpt for debugging, when the fallback parser kept it.
    if "raw_output" in verification_data:
        lines.extend([
            "",
            "DEBUG: Raw Agent Output (first 500 chars):",
            verification_data["raw_output"][:500] + "..."
        ])

    # Dual-layer reason: [text_summary, structured_report]
    result.reason = ["\n".join(lines)]
    if report:
        result.reason.append(report)

    return result
@classmethod
def _create_error_result(cls, error_message: str) -> EvalDetail:
    """
    Create an error result for agent failures.

    Args:
        error_message: Description of the error

    Returns:
        EvalDetail with error status
    """
    result = EvalDetail(metric=cls.__name__)
    result.status = True  # True indicates an issue/error
    result.label = [f"{QualityLabel.QUALITY_BAD_PREFIX}AGENT_ERROR"]
    result.reason = [
        "Article Fact-Checking Failed",
        "=" * 70,
        f"Error: {error_message}",
        "",
        "Possible causes:",
        "- Agent exceeded max_iterations without completing",
        "- LLM failed to follow output format instructions",
        "- Tool execution errors (API failures, rate limits)",
        "- Invalid or empty article content",
        "",
        "Troubleshooting:",
        "1. Check agent configuration (API keys, max_iterations)",
        "2. Verify article content is valid and non-empty",
        "3. Check tool configurations (claims_extractor, arxiv_search, tavily_search)",
        "4. Review agent logs for detailed error messages"
    ]
    return result

@classmethod
def plan_execution(cls, input_data: Data) -> List[Dict[str, Any]]:
    """
    Not used when use_agent_executor=True.

    The LangChain agent autonomously plans its execution using the ReAct
    pattern; this method is only called for the legacy agent path
    (use_agent_executor=False).

    Args:
        input_data: Input data (unused)

    Returns:
        Empty list (no manual planning needed)
    """
    return []
- ) - """ - try: - from langchain.agents import create_agent - except ImportError as e: - error_msg = ( - "LangChain is not installed but required for agent creation.\n\n" - "Install with:\n" - " pip install -r requirements/agent.txt\n" - "Or:\n" - " pip install 'dingo-python[agent]'" - ) - log.error(error_msg) - raise ImportError(error_msg) from e - - try: - # Create agent using LangChain 1.0 API - agent = create_agent( - model=llm, - tools=tools, - system_prompt=system_prompt or "You are a helpful assistant with access to tools.", - debug=config.get("debug", False) - ) - - log.debug( - f"Created agent with {len(tools)} tools using langchain.agents.create_agent" - ) - return agent - - except Exception as e: - log.error(f"Failed to create agent: {e}") - raise - - @staticmethod - def invoke_and_format( - agent, - input_text: str, - input_data: Optional[Any] = None, - max_iterations: Optional[int] = None - ) -> Dict[str, Any]: - """ - Invoke agent and format output for Dingo. - - Args: - agent: Compiled agent (from create_agent) - input_text: Text to pass to agent - input_data: Optional Data object for context - max_iterations: Maximum reasoning iterations (default: 25) - In LangChain 1.0, this is passed as 'recursion_limit' to the agent - - Returns: - Dict with: - - output: str (agent's final response) - - messages: List[Message] (full conversation) - - tool_calls: List[Dict] (parsed tool invocations) - - success: bool - - Example: - result = AgentWrapper.invoke_and_format( - agent, - input_text="Is Paris the capital of France?", - input_data=data_obj, - max_iterations=10 - ) - - Note: - In LangChain 1.0, iteration limits are controlled by recursion_limit, - which is passed at invocation time rather than during agent creation. 
- """ - try: - # Build config dict for agent invocation - config = {} - if max_iterations is not None: - # LangChain 1.0 uses 'recursion_limit' instead of 'max_iterations' - config["recursion_limit"] = max_iterations - log.debug(f"Setting recursion_limit={max_iterations}") - - # Invoke agent with message-based input and config - if config: - result = agent.invoke( - {"messages": [("user", input_text)]}, - config - ) - else: - # No config needed, use default recursion_limit (25) - result = agent.invoke({ - "messages": [("user", input_text)] - }) - - # Extract messages from result - messages = result.get('messages', []) - - # Get final output (last AI message) - output = "" - if messages: - last_message = messages[-1] - output = getattr(last_message, 'content', str(last_message)) - - # Parse tool calls from messages - tool_calls = AgentWrapper._extract_tool_calls(messages) - - # Count reasoning steps (messages between user input and final response) - reasoning_steps = len([m for m in messages if hasattr(m, 'type') and m.type == 'ai']) - - formatted_result = { - 'output': output, - 'messages': messages, - 'tool_calls': tool_calls, - 'reasoning_steps': reasoning_steps, - 'success': True - } - - log.debug( - f"Agent execution completed: {len(tool_calls)} tool calls, " - f"{reasoning_steps} reasoning steps" - ) - - return formatted_result - - except Exception as e: - log.error(f"Agent invocation failed: {e}") - return { - 'output': '', - 'messages': [], - 'tool_calls': [], - 'reasoning_steps': 0, - 'success': False, - 'error': str(e) - } - - @staticmethod - def _extract_tool_calls(messages: List) -> List[Dict[str, Any]]: - """ - Extract tool calls from message sequence. - - Parses AIMessage objects with tool_calls and their corresponding - ToolMessage responses. 
- - Args: - messages: List of message objects - - Returns: - List of dicts with tool, args, observation - """ - tool_calls = [] - - try: - from langchain_core.messages import AIMessage, ToolMessage - - for i, message in enumerate(messages): - # Check if AI message has tool calls - if isinstance(message, AIMessage) and hasattr(message, 'tool_calls'): - for tool_call in message.tool_calls: - # Find corresponding tool response - observation = "" - if i + 1 < len(messages) and isinstance(messages[i + 1], ToolMessage): - observation = messages[i + 1].content - - tool_calls.append({ - 'tool': tool_call.get('name', 'unknown'), - 'args': tool_call.get('args', {}), - 'observation': observation - }) - - except ImportError: - # Fallback if langchain_core not available - log.warning("Could not import langchain_core for tool call extraction") - - except Exception as e: - log.warning(f"Error extracting tool calls: {e}") - - return tool_calls - - @staticmethod - def get_openai_llm_from_dingo_config(dynamic_config): - """ - Create LangChain ChatOpenAI LLM from Dingo's dynamic_config. - - Args: - dynamic_config: BaseOpenAI.dynamic_config (EvaluatorLLMArgs) - - Returns: - LangChain ChatOpenAI instance - - Note: - This wraps Dingo's existing client creation pattern - for use with LangChain's agent framework. 
- - Example: - llm = AgentWrapper.get_openai_llm_from_dingo_config( - agent.dynamic_config - ) - """ - try: - from langchain_openai import ChatOpenAI - except ImportError as e: - error_msg = ( - "langchain-openai is not installed but required for LLM integration.\n\n" - "Install with:\n" - " pip install -r requirements/agent.txt\n" - "Or:\n" - " pip install 'dingo-python[agent]'" - ) - log.error(error_msg) - raise ImportError(error_msg) from e - - if not hasattr(dynamic_config, 'key') or not dynamic_config.key: - raise ValueError( - "dynamic_config must have 'key' (API key) for LLM" - ) - - if not hasattr(dynamic_config, 'api_url') or not dynamic_config.api_url: - raise ValueError( - "dynamic_config must have 'api_url' (base URL) for LLM" - ) - - # Extract parameters - params = dynamic_config.parameters or {} - - # Create ChatOpenAI instance - llm = ChatOpenAI( - api_key=dynamic_config.key, - base_url=dynamic_config.api_url, - model=dynamic_config.model or "gpt-4.1-mini", - temperature=params.get("temperature", 0.3), - max_tokens=params.get("max_tokens", 1000), # Lower default to avoid context length issues - top_p=params.get("top_p", 1.0), - timeout=params.get("timeout", 30) - ) - - log.debug( - f"Created ChatOpenAI: model={dynamic_config.model}, " - f"temp={params.get('temperature', 0.3)}" - ) - - return llm +""" +Agent Wrapper for Dingo Agents (LangChain 1.0) + +Wraps LangChain's create_agent to work with Dingo's agent patterns. +Uses the modern LangChain 1.0 API (released November 2025). + +Key Changes from AgentExecutor: +- Uses langchain.agents.create_agent (built on LangGraph) +- Returns CompiledStateGraph instead of AgentExecutor +- Message-based invocation interface +- Built-in persistence and checkpointing support +""" + +from typing import Any, Dict, List, Optional + +from dingo.utils import log + + +class AgentWrapper: + """ + Wrapper that integrates LangChain 1.0 create_agent with Dingo agents. 
+ + Handles: + - Tool conversion from Dingo to LangChain format + - Agent creation using create_agent + - Result parsing from message-based output to Dingo structures + - Configuration and logging + """ + + @staticmethod + def create_agent( + llm, + tools: List, + system_prompt: Optional[str] = None, + **config + ): + """ + Create a LangChain agent using langchain.agents.create_agent. + + Args: + llm: LangChain LLM instance (ChatOpenAI) + tools: List of LangChain StructuredTools + system_prompt: Optional system message + **config: Additional configuration (debug, middleware, etc.) + + Returns: + CompiledStateGraph (LangGraph agent) + + Example: + llm = AgentWrapper.get_openai_llm_from_dingo_config(config) + tools = convert_dingo_tools(["tavily_search"], agent) + agent = AgentWrapper.create_agent( + llm=llm, + tools=tools, + system_prompt="You are a fact-checking agent..." + ) + """ + try: + from langchain.agents import create_agent + except ImportError as e: + error_msg = ( + "LangChain is not installed but required for agent creation.\n\n" + "Install with:\n" + " pip install -r requirements/agent.txt\n" + "Or:\n" + " pip install 'dingo-python[agent]'" + ) + log.error(error_msg) + raise ImportError(error_msg) from e + + try: + # Create agent using LangChain 1.0 API + agent = create_agent( + model=llm, + tools=tools, + system_prompt=system_prompt or "You are a helpful assistant with access to tools.", + debug=config.get("debug", False) + ) + + log.debug( + f"Created agent with {len(tools)} tools using langchain.agents.create_agent" + ) + return agent + + except Exception as e: + log.error(f"Failed to create agent: {e}") + raise + + @staticmethod + def invoke_and_format( + agent, + input_text: str, + input_data: Optional[Any] = None, + max_iterations: Optional[int] = None + ) -> Dict[str, Any]: + """ + Invoke agent and format output for Dingo. 
+ + Args: + agent: Compiled agent (from create_agent) + input_text: Text to pass to agent + input_data: Optional Data object for context + max_iterations: Maximum reasoning iterations (default: 25) + In LangChain 1.0, this is passed as 'recursion_limit' to the agent + + Returns: + Dict with: + - output: str (agent's final response) + - messages: List[Message] (full conversation) + - tool_calls: List[Dict] (parsed tool invocations) + - success: bool + + Example: + result = AgentWrapper.invoke_and_format( + agent, + input_text="Is Paris the capital of France?", + input_data=data_obj, + max_iterations=10 + ) + + Note: + In LangChain 1.0, iteration limits are controlled by recursion_limit, + which is passed at invocation time rather than during agent creation. + """ + try: + # Build config dict for agent invocation + config = {} + if max_iterations is not None: + # LangChain 1.0 uses 'recursion_limit' instead of 'max_iterations' + config["recursion_limit"] = max_iterations + log.debug(f"Setting recursion_limit={max_iterations}") + + # Invoke agent with message-based input and config + if config: + result = agent.invoke( + {"messages": [("user", input_text)]}, + config + ) + else: + # No config needed, use default recursion_limit (25) + result = agent.invoke({ + "messages": [("user", input_text)] + }) + + formatted_result = AgentWrapper._format_agent_result(result) + log.debug( + f"Agent execution completed: {len(formatted_result['tool_calls'])} tool calls, " + f"{formatted_result['reasoning_steps']} reasoning steps" + ) + return formatted_result + + except Exception as e: + log.error(f"Agent invocation failed: {e}") + return AgentWrapper._make_error_result(str(e)) + + @staticmethod + async def async_invoke_and_format( + agent, + input_text: str, + input_data: Optional[Any] = None, + max_iterations: Optional[int] = None + ) -> Dict[str, Any]: + """ + Async version of invoke_and_format using agent.ainvoke(). 
+ + Used for concurrent claim verification in ArticleFactChecker's + two-phase parallel architecture. + + Args: + agent: Compiled agent (from create_agent) + input_text: Text to pass to agent + input_data: Optional Data object for context (unused, kept for API parity) + max_iterations: Maximum reasoning iterations (recursion_limit) + + Returns: + Dict with output, messages, tool_calls, reasoning_steps, success + """ + try: + config = {} + if max_iterations is not None: + config["recursion_limit"] = max_iterations + + if config: + result = await agent.ainvoke( + {"messages": [("user", input_text)]}, + config + ) + else: + result = await agent.ainvoke({"messages": [("user", input_text)]}) + + formatted_result = AgentWrapper._format_agent_result(result) + log.debug( + f"Async agent execution completed: {len(formatted_result['tool_calls'])} tool calls, " + f"{formatted_result['reasoning_steps']} reasoning steps" + ) + return formatted_result + + except Exception as e: + log.error(f"Async agent invocation failed: {e}") + return AgentWrapper._make_error_result(str(e)) + + @staticmethod + def _format_agent_result(result: Dict) -> Dict[str, Any]: + """ + Convert raw agent invocation result into Dingo's standard output format. + + Shared by both invoke_and_format (sync) and async_invoke_and_format (async) + to avoid duplication of message-parsing logic. 
+ + Args: + result: Raw dict returned by agent.invoke() / agent.ainvoke() + + Returns: + Dict with output, messages, tool_calls, reasoning_steps, success=True + """ + messages = result.get('messages', []) + output = "" + if messages: + last_message = messages[-1] + output = getattr(last_message, 'content', str(last_message)) + tool_calls = AgentWrapper._extract_tool_calls(messages) + reasoning_steps = len([m for m in messages if hasattr(m, 'type') and m.type == 'ai']) + return { + 'output': output, + 'messages': messages, + 'tool_calls': tool_calls, + 'reasoning_steps': reasoning_steps, + 'success': True, + } + + @staticmethod + def _make_error_result(error: str) -> Dict[str, Any]: + """Build a standard error result dict for failed agent invocations.""" + return { + 'output': '', + 'messages': [], + 'tool_calls': [], + 'reasoning_steps': 0, + 'success': False, + 'error': error, + } + + @staticmethod + def _extract_tool_calls(messages: List) -> List[Dict[str, Any]]: + """ + Extract tool calls from message sequence. + + Parses AIMessage objects with tool_calls and their corresponding + ToolMessage responses. 
+ + Args: + messages: List of message objects + + Returns: + List of dicts with tool, args, observation + """ + tool_calls = [] + + try: + from langchain_core.messages import AIMessage, ToolMessage + + for i, message in enumerate(messages): + # Check if AI message has tool calls + if isinstance(message, AIMessage) and hasattr(message, 'tool_calls'): + for tool_call in message.tool_calls: + # Find corresponding tool response + observation = "" + if i + 1 < len(messages) and isinstance(messages[i + 1], ToolMessage): + observation = messages[i + 1].content + + tool_calls.append({ + 'tool': tool_call.get('name', 'unknown'), + 'args': tool_call.get('args', {}), + 'observation': observation + }) + + except ImportError: + # Fallback if langchain_core not available + log.warning("Could not import langchain_core for tool call extraction") + + except Exception as e: + log.warning(f"Error extracting tool calls: {e}") + + return tool_calls + + @staticmethod + def get_openai_llm_from_dingo_config(dynamic_config): + """ + Create LangChain ChatOpenAI LLM from Dingo's dynamic_config. + + Args: + dynamic_config: BaseOpenAI.dynamic_config (EvaluatorLLMArgs) + + Returns: + LangChain ChatOpenAI instance + + Note: + This wraps Dingo's existing client creation pattern + for use with LangChain's agent framework. 
+ + Example: + llm = AgentWrapper.get_openai_llm_from_dingo_config( + agent.dynamic_config + ) + """ + try: + from langchain_openai import ChatOpenAI + except ImportError as e: + error_msg = ( + "langchain-openai is not installed but required for LLM integration.\n\n" + "Install with:\n" + " pip install -r requirements/agent.txt\n" + "Or:\n" + " pip install 'dingo-python[agent]'" + ) + log.error(error_msg) + raise ImportError(error_msg) from e + + if not hasattr(dynamic_config, 'key') or not dynamic_config.key: + raise ValueError( + "dynamic_config must have 'key' (API key) for LLM" + ) + + if not hasattr(dynamic_config, 'api_url') or not dynamic_config.api_url: + raise ValueError( + "dynamic_config must have 'api_url' (base URL) for LLM" + ) + + # Extract parameters + params = dynamic_config.parameters or {} + + # Create ChatOpenAI instance + llm = ChatOpenAI( + api_key=dynamic_config.key, + base_url=dynamic_config.api_url, + model=dynamic_config.model or "gpt-4.1-mini", + temperature=params.get("temperature", 0.3), + max_tokens=params.get("max_tokens", 4096), + top_p=params.get("top_p", 1.0), + timeout=params.get("timeout", 30) + ) + + log.debug( + f"Created ChatOpenAI: model={dynamic_config.model}, " + f"temp={params.get('temperature', 0.3)}" + ) + + return llm diff --git a/dingo/model/llm/agent/tools/arxiv_search.py b/dingo/model/llm/agent/tools/arxiv_search.py new file mode 100644 index 00000000..5d946602 --- /dev/null +++ b/dingo/model/llm/agent/tools/arxiv_search.py @@ -0,0 +1,472 @@ +""" +arXiv Search Tool + +This module provides integration with arXiv API for academic paper search and verification. +arXiv is a free distribution service and open-access archive for scholarly articles in +the fields of physics, mathematics, computer science, and more. 
+ +Dependencies: + arxiv>=2.4.0 + +Configuration: + max_results: Maximum number of search results (default: 5, range: 1-50) + sort_by: Sort order - "relevance", "lastUpdatedDate", or "submittedDate" (default: "relevance") + sort_order: "ascending" or "descending" (default: "descending") + rate_limit_delay: Delay between requests in seconds (default: 3.0) + timeout: Request timeout in seconds (default: 30) + api_key: Not required for arXiv (public API) +""" + +import re +import threading +import time +from typing import Any, Dict, List, Optional + +from pydantic import Field + +from dingo.io.input import RequiredField +from dingo.model.llm.agent.tools.base_tool import BaseTool, ToolConfig +from dingo.model.llm.agent.tools.tool_registry import tool_register +from dingo.utils import log + + +class ArxivConfig(ToolConfig): + """Configuration for arXiv search tool""" + api_key: Optional[str] = None # Override parent - not needed for arXiv + max_results: int = Field(default=5, ge=1, le=50) + sort_by: str = Field(default="relevance", pattern="^(relevance|lastUpdatedDate|submittedDate)$") + sort_order: str = Field(default="descending", pattern="^(ascending|descending)$") + rate_limit_delay: float = Field(default=3.0, ge=0.0) + timeout: int = Field(default=30, ge=1) + + +@tool_register +class ArxivSearch(BaseTool): + """ + arXiv search tool for academic paper verification. + + Provides search capabilities for academic papers in arXiv's open-access archive. + Supports searching by arXiv ID, DOI, title, author, and keywords with automatic + detection of query type. 
+ + Features: + - Auto-detection of arXiv IDs and DOIs + - No API key required (public API) + - Rate limiting to respect arXiv guidelines + - Support for multiple search modes + - Comprehensive paper metadata + + arXiv ID Patterns: + - New format: 2301.12345 or 2301.12345v1 (with version) + - Old format: hep-ph/0123456 or hep-ph/0123456v1 + + DOI Pattern: + - Standard DOI: 10.1234/example.doi + + Usage: + # Auto-detect search type + result = ArxivSearch.execute(query="1706.03762") + + # Explicit search by title + result = ArxivSearch.execute( + query="Attention is All You Need", + search_type="title" + ) + + # Result structure: + { + 'success': True, + 'query': '1706.03762', + 'search_type': 'arxiv_id', + 'results': [ + { + 'arxiv_id': '1706.03762', + 'title': 'Attention is All You Need', + 'authors': ['Vaswani, Ashish', ...], + 'summary': 'We propose a new...', + 'published': '2017-06-12', + 'updated': '2017-12-06', + 'pdf_url': 'http://arxiv.org/pdf/1706.03762v5', + 'doi': '10.48550/arXiv.1706.03762', + 'categories': ['cs.CL', 'cs.LG'], + 'journal_ref': 'NIPS 2017' + }, + ... + ] + } + """ + + name = "arxiv_search" + description = ( + "Search arXiv for academic papers by ID, DOI, title, or author. " + "Returns comprehensive paper metadata including title, authors, abstract, " + "publication date, PDF URL, and citations. Useful for verifying academic " + "claims, finding research papers, and checking paper details." + ) + config: ArxivConfig = ArxivConfig() + + _required_fields = [RequiredField.CONTENT] + _last_request_time: float = 0.0 + _rate_limit_lock: threading.Lock = threading.Lock() + + @classmethod + def execute(cls, query: str, search_type: str = "auto", **kwargs) -> Dict[str, Any]: + """ + Execute arXiv search. 
+ + Args: + query: Search query string (arXiv ID, DOI, title, author, or keywords) + search_type: Search mode - "auto", "id", "doi", "title", "author" (default: "auto") + **kwargs: Optional overrides for configuration + - max_results: Override max_results config + - sort_by: Override sort_by config + - sort_order: Override sort_order config + + Returns: + Dict with search results: + { + 'success': bool, + 'query': str, + 'search_type': str, + 'results': List[Dict], + 'count': int + } + + Raises: + ImportError: If arxiv library is not installed + ValueError: If query is empty or search_type is invalid + Exception: For API errors + """ + # Validate inputs + if not query or not query.strip(): + log.error("arXiv search query cannot be empty") + return { + 'success': False, + 'error': 'Search query cannot be empty', + 'query': query + } + + valid_search_types = ["auto", "id", "doi", "title", "author"] + if search_type not in valid_search_types: + log.error(f"Invalid search_type: {search_type}") + return { + 'success': False, + 'error': f'Invalid search_type. Must be one of: {", ".join(valid_search_types)}', + 'query': query + } + + # Import arxiv library (lazy import) + try: + import arxiv + except ImportError: + error_msg = ( + "arxiv library is not installed but required for arXiv search.\n\n" + "Install with:\n" + " pip install -r requirements/agent.txt\n" + "Or:\n" + " pip install arxiv\n" + "Or:\n" + " pip install 'dingo-python[agent]'" + ) + log.error(error_msg) + return { + 'success': False, + 'error': error_msg, + 'query': query, + 'error_type': 'DependencyError' + } + + # Apply rate limiting + cls._apply_rate_limiting() + + # Execute search + try: + log.info(f"Executing arXiv search: {query[:100]}... 
(type: {search_type})") + + # Build search query based on type + detected_type, arxiv_query = cls._build_arxiv_query(query, search_type) + + # Get configuration + max_results = kwargs.get('max_results', cls.config.max_results) + sort_by_str = kwargs.get('sort_by', cls.config.sort_by) + sort_order_str = kwargs.get('sort_order', cls.config.sort_order) + + # Map sort_by string to arxiv.SortCriterion + sort_by_map = { + 'relevance': arxiv.SortCriterion.Relevance, + 'lastUpdatedDate': arxiv.SortCriterion.LastUpdatedDate, + 'submittedDate': arxiv.SortCriterion.SubmittedDate + } + sort_by = sort_by_map.get(sort_by_str, arxiv.SortCriterion.Relevance) + + # Map sort_order string to arxiv.SortOrder + sort_order_map = { + 'ascending': arxiv.SortOrder.Ascending, + 'descending': arxiv.SortOrder.Descending + } + sort_order = sort_order_map.get(sort_order_str, arxiv.SortOrder.Descending) + + # Create search + search = arxiv.Search( + query=arxiv_query, + max_results=max_results, + sort_by=sort_by, + sort_order=sort_order + ) + + # Execute search and collect results + results = [] + client = arxiv.Client() + for paper in client.results(search): + results.append(cls._format_paper(paper)) + + # Format response + result = { + 'success': True, + 'query': query, + 'search_type': detected_type, + 'results': results, + 'count': len(results) + } + + log.info(f"arXiv search successful: {len(results)} results") + return result + + except Exception as e: + log.error(f"arXiv search failed: {e}") + + # Sanitize error message to prevent information disclosure + error_str = str(e).lower() + if "timeout" in error_str: + error_msg = "Search request timed out" + elif "network" in error_str or "connection" in error_str: + error_msg = "Network connection error" + elif "rate limit" in error_str: + error_msg = "Rate limit exceeded" + else: + error_msg = f"Search failed: {type(e).__name__}" + + return { + 'success': False, + 'error': error_msg, + 'query': query, + 'error_type': type(e).__name__ + } + + 
@classmethod + def _build_arxiv_query(cls, query: str, search_type: str) -> tuple: + """ + Build arXiv API query based on search type. + + Auto-detection priority: + 1. arXiv ID (e.g., "2301.12345" or "hep-ph/0123456") + 2. DOI (e.g., "10.1234/example") + 3. Title/keyword search + + Args: + query: User query + search_type: "auto", "id", "doi", "title", or "author" + + Returns: + Tuple of (detected_type: str, arxiv_query: str) + """ + query = query.strip() + + # Auto-detect or explicit type + if search_type == "auto": + # Check for arXiv ID + if cls._is_arxiv_id(query): + detected_type = "arxiv_id" + # Clean up arXiv ID (remove "arXiv:" prefix if present) + clean_id = query.replace("arXiv:", "").replace("arxiv:", "").strip() + arxiv_query = f"id:{clean_id}" + + # Check for DOI + elif cls._is_doi(query): + detected_type = "doi" + arxiv_query = f"doi:{query}" + + # Default to title search + else: + detected_type = "title" + arxiv_query = f"ti:{query}" + + elif search_type == "id": + detected_type = "arxiv_id" + clean_id = query.replace("arXiv:", "").replace("arxiv:", "").strip() + arxiv_query = f"id:{clean_id}" + + elif search_type == "doi": + detected_type = "doi" + arxiv_query = f"doi:{query}" + + elif search_type == "title": + detected_type = "title" + arxiv_query = f"ti:{query}" + + elif search_type == "author": + detected_type = "author" + arxiv_query = f"au:{query}" + + else: + # Fallback + detected_type = "title" + arxiv_query = f"ti:{query}" + + return detected_type, arxiv_query + + @classmethod + def _is_arxiv_id(cls, text: str) -> bool: + """ + Check if text matches arXiv ID pattern. + + Patterns: + - New format: YYMM.NNNNN or YYMM.NNNNNvN (e.g., 2301.12345, 2301.12345v1) + - Old format: archive/NNNNNNN or archive/NNNNNNNvN (e.g., hep-ph/0123456) + + Args: + text: Text to check + + Returns: + True if text matches arXiv ID pattern + """ + text = text.strip().replace("arXiv:", "").replace("arxiv:", "") + + # New format: YYMM.NNNNN(vN)? 
+ new_pattern = r'^\d{4}\.\d{4,5}(v\d+)?$' + if re.match(new_pattern, text): + return True + + # Old format: archive/NNNNNNN(vN)? + old_pattern = r'^[a-z\-]+/\d{7}(v\d+)?$' + if re.match(old_pattern, text): + return True + + return False + + @classmethod + def _is_doi(cls, text: str) -> bool: + """ + Check if text matches DOI pattern. + + Pattern: 10.NNNN/... (standard DOI format) + + Args: + text: Text to check + + Returns: + True if text matches DOI pattern + """ + text = text.strip() + doi_pattern = r'^10\.\d{4,9}/[-._;()/:A-Z0-9]+$' + return bool(re.match(doi_pattern, text, re.IGNORECASE)) + + @classmethod + def _format_paper(cls, paper) -> Dict[str, Any]: + """ + Format arxiv.Result to standard dictionary. + + Args: + paper: arxiv.Result object + + Returns: + Formatted paper dictionary + """ + return { + 'arxiv_id': paper.entry_id.split('/')[-1], # Extract ID from full URL + 'title': paper.title, + 'authors': [author.name for author in paper.authors], + 'summary': paper.summary, + 'published': paper.published.strftime('%Y-%m-%d') if paper.published else None, + 'updated': paper.updated.strftime('%Y-%m-%d') if paper.updated else None, + 'pdf_url': paper.pdf_url, + 'doi': paper.doi, + 'categories': paper.categories, + 'primary_category': paper.primary_category, + 'journal_ref': paper.journal_ref, + 'comment': paper.comment + } + + @classmethod + def _apply_rate_limiting(cls): + """ + Apply rate limiting to respect arXiv guidelines. + + arXiv recommends at least 3 seconds between requests. + This method enforces the configured rate_limit_delay. + Thread-safe: uses _rate_limit_lock to prevent concurrent requests + from bypassing the rate limit. 
+ """ + with cls._rate_limit_lock: + current_time = time.time() + time_since_last_request = current_time - cls._last_request_time + + if time_since_last_request < cls.config.rate_limit_delay: + sleep_time = cls.config.rate_limit_delay - time_since_last_request + log.debug(f"Rate limiting: sleeping for {sleep_time:.2f} seconds") + time.sleep(sleep_time) + + cls._last_request_time = time.time() + + @classmethod + def detect_paper_references(cls, text: str) -> Dict[str, List[str]]: + """ + Utility: Detect paper references in text. + + Searches for arXiv IDs and DOIs in text and returns them. + Useful for preprocessing text to find papers to look up. + + Args: + text: Text to search for paper references + + Returns: + Dict with 'arxiv_ids' and 'dois' keys containing found references + + Example: + text = "See arXiv:1706.03762 and DOI 10.1234/example" + refs = ArxivSearch.detect_paper_references(text) + # refs = { + # 'arxiv_ids': ['1706.03762'], + # 'dois': ['10.1234/example'] + # } + """ + # Find arXiv IDs + arxiv_ids = [] + + # New format: YYMM.NNNNN(vN)? - use non-capturing group to avoid tuple returns + new_pattern = r'\b\d{4}\.\d{4,5}(?:v\d+)?\b' + arxiv_ids.extend(re.findall(new_pattern, text)) + + # Old format: archive/NNNNNNN(vN)? - use non-capturing group + old_pattern = r'\b[a-z\-]+/\d{7}(?:v\d+)?\b' + arxiv_ids.extend(re.findall(old_pattern, text)) + + # Also look for explicit "arXiv:..." mentions + arxiv_prefix_pattern = r'arXiv:\s*(\d{4}\.\d{4,5}(?:v\d+)?|[a-z\-]+/\d{7}(?:v\d+)?)' + arxiv_ids.extend(re.findall(arxiv_prefix_pattern, text, re.IGNORECASE)) + + # Find DOIs + doi_pattern = r'\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b' + dois = re.findall(doi_pattern, text, re.IGNORECASE) + + # Deduplicate + arxiv_ids = list(set(arxiv_ids)) + dois = list(set(dois)) + + return { + 'arxiv_ids': arxiv_ids, + 'dois': dois + } + + @classmethod + def validate_config(cls): + """ + Validate tool configuration. 
+ + arXiv doesn't require an API key, so we override the parent's + api_key validation. + """ + # arXiv is a public API - no API key required + # Just validate that config exists + if not hasattr(cls, 'config'): + raise ValueError(f"{cls.name}: Missing configuration") diff --git a/dingo/model/llm/agent/tools/claims_extractor.py b/dingo/model/llm/agent/tools/claims_extractor.py new file mode 100644 index 00000000..f3204b96 --- /dev/null +++ b/dingo/model/llm/agent/tools/claims_extractor.py @@ -0,0 +1,606 @@ +""" +Claims Extraction Tool + +This module provides LLM-based extraction of verifiable claims from long-form text. +Based on Claimify methodology and ACL 2025 best practices for atomic fact extraction. + +Dependencies: + openai>=1.0.0 (for LLM-based extraction) + +Configuration: + model: LLM model for extraction (default: "gpt-4o-mini") + api_key: OpenAI API key + base_url: Custom API base URL (optional, e.g., "https://api.deepseek.com/v1" for DeepSeek) + max_claims: Maximum number of claims to extract (default: 50, range: 1-200) + claim_types: Types of claims to extract (default: all types) + chunk_size: Text chunk size for processing (default: 2000) + include_context: Include surrounding context (default: True) +""" + +import json +import re +from typing import Any, Dict, List, Optional + +from pydantic import Field + +from dingo.model.llm.agent.tools.base_tool import BaseTool, ToolConfig +from dingo.model.llm.agent.tools.tool_registry import tool_register +from dingo.utils import log + + +class ClaimsExtractorConfig(ToolConfig): + """Configuration for claims extraction tool""" + model: str = Field(default="gpt-4o-mini", description="LLM model for extraction") + api_key: Optional[str] = Field(default=None, description="OpenAI API key") + base_url: Optional[str] = Field(default=None, description="Custom API base URL (e.g., for DeepSeek)") + max_claims: int = Field(default=50, ge=1, le=200) + claim_types: List[str] = Field( + default=[ + # Original claim types + 
"factual", # General facts + "statistical", # Numbers, percentages, metrics + "attribution", # Who said/did/published what + "institutional", # Organizations, affiliations, collaborations + # New claim types for multi-type article support + "temporal", # Time-related claims (dates, durations, "recently") + "comparative", # Comparisons between entities/products + "monetary", # Financial figures, costs, prices + "technical" # Technical specifications, capabilities + ], + description="Types of claims to extract (8 types)" + ) + chunk_size: int = Field(default=2000, ge=500, le=10000, description="Text chunk size") + include_context: bool = Field(default=True, description="Include surrounding context") + temperature: float = Field(default=0.1, ge=0.0, le=1.0, description="LLM temperature") + + +@tool_register +class ClaimsExtractor(BaseTool): + """ + Extract verifiable claims from long-form text (articles, blog posts). + + This tool uses LLM-based extraction to identify atomic, decontextualized claims + that can be independently fact-checked. Based on Claimify (ACL 2025) methodology. + + Features: + - Atomic claim extraction (one fact per claim) + - Decontextualization (claims stand alone) + - Claim type classification + - Context preservation (optional) + - Deduplication and merging + + Claim Types (8 types): + - factual: General facts (e.g., "The tower is 330 meters tall") + - statistical: Numbers, percentages (e.g., "Model has 0.9B parameters") + - attribution: Who said/did what (e.g., "Vaswani et al. 
proposed Transformer") + - institutional: Organizations, affiliations (e.g., "Released by MIT and Stanford") + - temporal: Time-related (e.g., "Released on December 5, 2024") + - comparative: Comparisons (e.g., "GPU improved 20% vs previous gen") + - monetary: Financial figures (e.g., "Priced at $999") + - technical: Technical specs (e.g., "A17 Pro chip with 3nm process") + + Usage: + # Extract all types of claims (using default OpenAI API) + result = ClaimsExtractor.execute(text=article_text) + + # Extract only institutional claims + result = ClaimsExtractor.execute( + text=article_text, + claim_types=["institutional"] + ) + + # Use custom API (e.g., DeepSeek) + ClaimsExtractor.config.model = "deepseek-chat" + ClaimsExtractor.config.base_url = "https://api.deepseek.com/v1" + result = ClaimsExtractor.execute(text=article_text) + + # Result structure: + { + 'success': True, + 'claims': [ + { + 'claim_id': 'claim_001', + 'claim': 'OmniDocBench was released by Tsinghua University', + 'claim_type': 'institutional', + 'context': 'PaddleOCR-VL登顶的OmniDocBench V1.5...', + 'position': {'start': 120, 'end': 180}, + 'verifiable': True, + 'confidence': 0.95 + }, + ... + ], + 'metadata': { + 'total_claims': 25, + 'verifiable_claims': 20, + 'claim_types_distribution': {...} + } + } + """ + + name = "claims_extractor" + description = ( + "Extract verifiable claims from long-form text (articles, blog posts). " + "Returns atomic, decontextualized claims with context and metadata. " + "Useful for fact-checking articles, identifying checkable statements. " + "Supports 8 claim types: factual, statistical, attribution, institutional, " + "temporal, comparative, monetary, technical." + ) + config: ClaimsExtractorConfig = ClaimsExtractorConfig() + + # System prompt for LLM-based extraction + EXTRACTION_SYSTEM_PROMPT = """You are an expert fact-checker specialized in extracting verifiable claims from text. 
+ +Your task is to extract ATOMIC, VERIFIABLE claims that can be independently fact-checked. + +Guidelines: +1. Atomicity: Each claim describes ONE fact, statistic, or attribution +2. Verifiability: Can be checked against authoritative sources +3. Decontextualization: Include necessary context to stand alone +4. Faithfulness: Preserve original meaning +5. Specificity: Extract specific, checkable claims (not opinions or vague statements) + +Claim Types (EXPANDED from 4 to 8 for multi-type article support): +- factual: General facts (e.g., "The tower is 330 meters tall") +- statistical: Numbers, percentages, metrics (e.g., "Model has 0.9B parameters") +- attribution: Who said/did/published what (e.g., "Vaswani et al. proposed Transformer") +- institutional: Organizations, affiliations, collaborations (e.g., "Released by MIT and Stanford") +- temporal: Time-related claims - dates, durations, "recently" (e.g., "Released on Dec 5, 2024") +- comparative: Comparisons between entities/products (e.g., "GPU improved 20% vs A16") +- monetary: Financial figures, costs, prices (e.g., "128GB model priced at $999") +- technical: Technical specifications, capabilities (e.g., "A17 Pro chip with 3nm process") + +Output Format (JSON): +{ + "claims": [ + { + "claim": "具体的声明文本", + "claim_type": "institutional", + "context": "周围的上下文(帮助理解)", + "verifiable": true, + "confidence": 0.95 + } + ] +} + +Examples: + +Example 1 - Academic Article: +Input: "百度刚刚发布的PaddleOCR-VL模型登顶了由清华大学、阿里达摩院等联合发布的OmniDocBench榜单。" + +Output: +{ + "claims": [ + { + "claim": "PaddleOCR-VL model was just released by Baidu", + "claim_type": "attribution", + "context": "百度刚刚发布的PaddleOCR-VL模型...", + "verifiable": true, + "confidence": 0.90 + }, + { + "claim": "PaddleOCR-VL topped the OmniDocBench leaderboard", + "claim_type": "factual", + "context": "模型登顶了...OmniDocBench榜单", + "verifiable": true, + "confidence": 0.95 + }, + { + "claim": "OmniDocBench was jointly released by Tsinghua University and Alibaba DAMO 
Academy", + "claim_type": "institutional", + "context": "由清华大学、阿里达摩院等联合发布的OmniDocBench榜单", + "verifiable": true, + "confidence": 0.95 + } + ] +} + +Example 2 - News Article: +Input: "OpenAI于2024年12月5日正式发布o1推理模型。CEO Sam Altman表示这是AGI道路上的里程碑。ChatGPT Plus月费保持20美元。" + +Output: +{ + "claims": [ + { + "claim": "OpenAI released o1 reasoning model on December 5, 2024", + "claim_type": "temporal", + "context": "OpenAI于2024年12月5日正式发布o1推理模型", + "verifiable": true, + "confidence": 0.98 + }, + { + "claim": "Sam Altman stated o1 is a milestone on the path to AGI", + "claim_type": "attribution", + "context": "CEO Sam Altman表示这是AGI道路上的里程碑", + "verifiable": true, + "confidence": 0.90 + }, + { + "claim": "ChatGPT Plus monthly fee remains $20", + "claim_type": "monetary", + "context": "ChatGPT Plus月费保持20美元", + "verifiable": true, + "confidence": 0.95 + } + ] +} + +Example 3 - Product Review: +Input: "iPhone 15 Pro搭载A17 Pro芯片,采用3纳米工艺。GPU性能相比A16提升20%。国行128GB版售价7999元。" + +Output: +{ + "claims": [ + { + "claim": "iPhone 15 Pro features A17 Pro chip with 3nm process", + "claim_type": "technical", + "context": "iPhone 15 Pro搭载A17 Pro芯片,采用3纳米工艺", + "verifiable": true, + "confidence": 0.98 + }, + { + "claim": "GPU performance improved 20% compared to A16", + "claim_type": "comparative", + "context": "GPU性能相比A16提升20%", + "verifiable": true, + "confidence": 0.90 + }, + { + "claim": "China 128GB model priced at 7999 yuan", + "claim_type": "monetary", + "context": "国行128GB版售价7999元", + "verifiable": true, + "confidence": 0.95 + } + ] +} + +Critical: Extract SPECIFIC claims with verifiable details. Ignore opinions, marketing language, or vague statements. +""" + + @classmethod + def execute( + cls, + text: str, + claim_types: Optional[List[str]] = None, + **kwargs + ) -> Dict[str, Any]: + """ + Extract verifiable claims from text. 
+ + Args: + text: Input text (supports Markdown) + claim_types: Types of claims to extract (default: all types from config) + **kwargs: Optional configuration overrides + - max_claims: Override max_claims config + - include_context: Override include_context config + - chunk_size: Override chunk_size config + + Returns: + Dict with extracted claims: + { + 'success': bool, + 'claims': List[Dict], + 'metadata': Dict + } + + Raises: + ImportError: If openai library is not installed + ValueError: If text is empty or API key is missing + Exception: For API errors + """ + # Validate inputs + if not text or not text.strip(): + log.error("Claims extraction: text cannot be empty") + return { + 'success': False, + 'error': 'Input text cannot be empty', + 'claims': [] + } + + if not cls.config.api_key: + error_msg = ( + "OpenAI API key is required for claims extraction.\n\n" + "Set api_key in tool configuration or environment variable OPENAI_API_KEY" + ) + log.error(error_msg) + return { + 'success': False, + 'error': error_msg, + 'error_type': 'ConfigurationError', + 'claims': [] + } + + # Import OpenAI library (lazy import) + try: + from openai import OpenAI + except ImportError: + error_msg = ( + "openai library is not installed but required for claims extraction.\n\n" + "Install with:\n" + " pip install -r requirements/agent.txt\n" + "Or:\n" + " pip install openai>=1.0.0" + ) + log.error(error_msg) + return { + 'success': False, + 'error': error_msg, + 'error_type': 'DependencyError', + 'claims': [] + } + + # Get configuration + claim_types_filter = claim_types or cls.config.claim_types + max_claims = kwargs.get('max_claims', cls.config.max_claims) + include_context = kwargs.get('include_context', cls.config.include_context) + chunk_size = kwargs.get('chunk_size', cls.config.chunk_size) + + log.info(f"Extracting claims from text ({len(text)} chars, chunk_size={chunk_size})") + + try: + # Create OpenAI client (with optional custom base_url) + client_kwargs = {"api_key": 
cls.config.api_key} + if cls.config.base_url: + client_kwargs["base_url"] = cls.config.base_url + log.info(f"Using custom API base URL: {cls.config.base_url}") + client = OpenAI(**client_kwargs) + + # Chunk text if needed + chunks = cls._chunk_text(text, chunk_size) + log.debug(f"Split text into {len(chunks)} chunks") + + # Extract claims from each chunk + all_claims = [] + for i, chunk_data in enumerate(chunks): + log.debug(f"Processing chunk {i+1}/{len(chunks)}") + + chunk_claims = cls._extract_claims_from_chunk( + client, + chunk_data['text'], + chunk_data['start_pos'], + claim_types_filter, + include_context + ) + all_claims.extend(chunk_claims) + + # Deduplicate and merge similar claims + unique_claims = cls._deduplicate_claims(all_claims) + + # Limit to max_claims + if len(unique_claims) > max_claims: + log.warning(f"Limiting claims from {len(unique_claims)} to {max_claims}") + unique_claims = unique_claims[:max_claims] + + # Add claim IDs + for i, claim in enumerate(unique_claims, 1): + claim['claim_id'] = f"claim_{i:03d}" + + # Build metadata + metadata = cls._build_metadata(unique_claims) + + result = { + 'success': True, + 'claims': unique_claims, + 'metadata': metadata + } + + log.info(f"Claims extraction successful: {len(unique_claims)} claims extracted") + return result + + except Exception as e: + log.error(f"Claims extraction failed: {e}") + + # Sanitize error message + error_str = str(e).lower() + if "api key" in error_str or "authentication" in error_str: + error_msg = "Invalid or missing API key" + elif "rate limit" in error_str: + error_msg = "Rate limit exceeded" + elif "timeout" in error_str: + error_msg = "Request timed out" + else: + error_msg = f"Extraction failed: {type(e).__name__}" + + return { + 'success': False, + 'error': error_msg, + 'error_type': type(e).__name__, + 'claims': [] + } + + @classmethod + def _chunk_text(cls, text: str, chunk_size: int) -> List[Dict[str, Any]]: + """ + Split long text into chunks for processing. 
+ + Args: + text: Input text + chunk_size: Maximum chunk size in characters + + Returns: + List of chunk dictionaries with text and position info + """ + if len(text) <= chunk_size: + return [{'text': text, 'start_pos': 0, 'end_pos': len(text)}] + + chunks = [] + start = 0 + + while start < len(text): + end = start + chunk_size + + # Try to break at sentence boundary + if end < len(text): + # Look for sentence ending within last 20% of chunk + search_start = start + int((end - start) * 0.8) + sentence_end = max( + text.rfind('。', search_start, end), + text.rfind('.', search_start, end), + text.rfind('\n\n', search_start, end) + ) + if sentence_end > start: + end = sentence_end + 1 + + chunk_text = text[start:end] + chunks.append({ + 'text': chunk_text, + 'start_pos': start, + 'end_pos': end + }) + + start = end + + return chunks + + @classmethod + def _extract_claims_from_chunk( + cls, + client, + chunk_text: str, + start_pos: int, + claim_types: List[str], + include_context: bool + ) -> List[Dict]: + """ + Extract claims from a single text chunk using LLM. + + Args: + client: OpenAI client + chunk_text: Text chunk to process + start_pos: Start position of chunk in original text + claim_types: Types of claims to extract + include_context: Whether to include context + + Returns: + List of extracted claims + """ + # Build user prompt + user_prompt = f"""Extract verifiable claims from the following text. + +Focus on these claim types: {', '.join(claim_types)} + +Text: +{chunk_text} + +Return JSON with claims array as specified in the system prompt. 
+""" + + # Call LLM + try: + response = client.chat.completions.create( + model=cls.config.model, + messages=[ + {"role": "system", "content": cls.EXTRACTION_SYSTEM_PROMPT}, + {"role": "user", "content": user_prompt} + ], + temperature=cls.config.temperature, + response_format={"type": "json_object"} # Force JSON output + ) + + output_text = response.choices[0].message.content + + # Parse JSON + result_json = json.loads(output_text) + claims = result_json.get('claims', []) + + # Add position info and filter by type + filtered_claims = [] + for claim in claims: + claim_type = claim.get('claim_type', 'unknown') + if claim_type in claim_types or 'all' in claim_types: + # Add position (approximate - based on chunk) + claim['position'] = { + 'start': start_pos, + 'end': start_pos + len(chunk_text) + } + + # Remove context if not requested + if not include_context: + claim.pop('context', None) + + filtered_claims.append(claim) + + return filtered_claims + + except json.JSONDecodeError as e: + log.warning(f"Failed to parse LLM output as JSON: {e}") + return [] + except Exception as e: + log.error(f"LLM call failed: {e}") + return [] + + @classmethod + def _deduplicate_claims(cls, claims: List[Dict]) -> List[Dict]: + """ + Remove duplicate or highly similar claims. 
+ + Args: + claims: List of claims + + Returns: + Deduplicated claims + """ + if len(claims) <= 1: + return claims + + unique_claims = [] + seen_texts = set() + + for claim in claims: + claim_text = claim.get('claim', '').strip().lower() + + # Skip if empty + if not claim_text: + continue + + # Skip if exact duplicate + if claim_text in seen_texts: + continue + + # Check for very similar claims (simple substring check) + is_duplicate = False + for seen_text in seen_texts: + # If one is substring of other and length difference < 20% + if claim_text in seen_text or seen_text in claim_text: + len_diff = abs(len(claim_text) - len(seen_text)) + if len_diff < 0.2 * max(len(claim_text), len(seen_text)): + is_duplicate = True + break + + if not is_duplicate: + unique_claims.append(claim) + seen_texts.add(claim_text) + + return unique_claims + + @classmethod + def _build_metadata(cls, claims: List[Dict]) -> Dict[str, Any]: + """ + Build metadata summary for extracted claims. + + Args: + claims: List of claims + + Returns: + Metadata dictionary + """ + total_claims = len(claims) + verifiable_claims = sum(1 for c in claims if c.get('verifiable', True)) + + # Count by type + type_distribution = {} + for claim in claims: + claim_type = claim.get('claim_type', 'unknown') + type_distribution[claim_type] = type_distribution.get(claim_type, 0) + 1 + + return { + 'total_claims': total_claims, + 'verifiable_claims': verifiable_claims, + 'claim_types_distribution': type_distribution + } + + @classmethod + def validate_config(cls): + """Validate tool configuration before execution.""" + if not cls.config.api_key: + raise ValueError(f"{cls.name}: OpenAI API key is required") diff --git a/docs/agent_architecture.md b/docs/agent_architecture.md new file mode 100644 index 00000000..c55d34c6 --- /dev/null +++ b/docs/agent_architecture.md @@ -0,0 +1,605 @@ +# Dingo Agent Architecture & Implementation Guide + +## Overview + +Dingo's Agent system extends traditional rule and LLM-based 
evaluation with **multi-step reasoning**, **tool usage**, and **adaptive context gathering** capabilities. This document provides a comprehensive overview of the Agent architecture, file structure, and implementation patterns. + +## Table of Contents + +1. [Architecture Overview](#architecture-overview) +2. [File Structure](#file-structure) +3. [Core Components](#core-components) +4. [Implementation Patterns](#implementation-patterns) +5. [Data Flow](#data-flow) +6. [Configuration](#configuration) +7. [Examples](#examples) + +--- + +## Architecture Overview + +### High-Level Architecture + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Dingo Evaluation System │ +├─────────────────────────────────────────────────────────────┤ +│ Data Input → Executor → [Rules | LLMs | Agents] → Results │ +└─────────────────────────────────────────────────────────────┘ + ▼ + ┌─────────────────────┐ + │ Agent Framework │ + └─────────────────────┘ + │ + ┌─────────────────────┼─────────────────────┐ + ▼ ▼ ▼ + ┌─────────┐ ┌──────────┐ ┌──────────┐ + │ Base │ │ Tools │ │ LangChain│ + │ Agent │◄────────│ Registry │ │ Adapter │ + └─────────┘ └──────────┘ └──────────┘ + │ │ + ▼ ▼ +┌────────────────┐ ┌──────────────────┐ +│ AgentFactCheck │ │ tavily_search │ +│AgentHallucin..│ │ arxiv_search │ +│ArticleFactChk │ │ claims_extractor│ +│ (Custom) │ │ render_tool │ +└────────────────┘ │ mineru_ocr_tool │ + └──────────────────┘ +``` + +### Evaluation Flow Comparison + +``` +Traditional Evaluation: +┌──────┐ ┌─────────┐ ┌────────────┐ +│ Data │─────▶│ Rule/LLM│─────▶│ EvalDetail │ +└──────┘ └─────────┘ └────────────┘ + +Agent-Based Evaluation: +┌──────┐ ┌───────┐ ┌──────────┐ ┌─────┐ ┌────────────┐ +│ Data │─────▶│ Agent │─────▶│Tool Calls│─────▶│ LLM │─────▶│ EvalDetail │ +└──────┘ └───────┘ └──────────┘ └─────┘ └────────────┘ + │ │ + Web Search Reasoning & + OCR Tools Synthesis +``` + +--- + +## File Structure + +### Current Implementation (Latest) + +``` +dingo/ +├── 
model/ +│ ├── llm/ # LLM-based evaluators +│ │ ├── agent/ # ✨ Agent Framework +│ │ │ ├── __init__.py # Package exports (BaseAgent, tools) +│ │ │ ├── base_agent.py # BaseAgent abstract class +│ │ │ ├── agent_fact_check.py # LangChain-based agent (framework-driven) +│ │ │ ├── agent_hallucination.py # Custom workflow agent (imperative) +│ │ │ ├── agent_article_fact_checker.py # Agent-First article fact-checker +│ │ │ ├── agent_wrapper.py # LangChain 1.0 integration wrapper +│ │ │ ├── langchain_adapter.py # Dingo ↔ LangChain tool adapter +│ │ │ └── tools/ # Agent tools +│ │ │ ├── __init__.py # Tool registry exports +│ │ │ ├── base_tool.py # BaseTool abstract class +│ │ │ ├── tool_registry.py # Tool registration & discovery +│ │ │ ├── claims_extractor.py # Claims extraction tool (LLM-based) +│ │ │ ├── arxiv_search.py # Academic paper search tool +│ │ │ ├── tavily_search.py # Web search tool (Tavily API) +│ │ │ ├── render_tool.py # HTML rendering tool +│ │ │ └── mineru_ocr_tool.py # OCR tool (MinerU integration) +│ │ ├── base_openai.py # Base class for OpenAI-compatible LLMs +│ │ └── ... 
# Other LLM evaluators +│ ├── model.py # ✏️ Central registry (@Model decorator) +│ └── rule/ # Rule-based evaluators +│ +├── config/ +│ └── input_args.py # ✏️ Configuration models (Pydantic) +│ # - InputArgs +│ # - EvaluatorArgs (includes agent_config) +│ +├── exec/ +│ ├── local.py # ✏️ Local executor with thread/process pools +│ │ # - Agents run in ThreadPoolExecutor (I/O-bound) +│ └── spark.py # Distributed executor (Spark) +│ +├── io/ +│ ├── input/ +│ │ └── data.py # Data class (standardized input) +│ └── output/ +│ └── eval_detail.py # EvalDetail (evaluation result) +│ +└── utils/ + └── log_util/ # Logging utilities + └── logger.py + +examples/ +└── agent/ # ✨ Agent usage examples + ├── agent_executor_example.py # Basic agent execution + ├── agent_hallucination_example.py # Hallucination detection example + └── agent_article_fact_checking_example.py # Article fact-checking example + +test/ +└── scripts/ + └── model/ + └── llm/ + └── agent/ # ✨ Agent tests + ├── test_agent_fact_check.py + ├── test_agent_hallucination.py + ├── test_article_fact_checker.py # ArticleFactChecker tests (88 tests) + ├── test_async_article_fact_checker.py # Async/parsing tests (30 tests) + ├── test_tool_registry.py + └── tools/ + ├── test_claims_extractor.py + ├── test_arxiv_search.py + ├── test_tavily_search.py + ├── test_render_tool.py + └── test_mineru_ocr_tool.py + +docs/ +├── agent_development_guide.md # Comprehensive development guide +├── agent_architecture.md # This file +├── article_fact_checking_guide.md # ArticleFactChecker guide +└── quick_start_article_fact_checking.md # Quick start for article fact-checking + +requirements/ +└── agent.txt # Agent dependencies + # - langchain>=1.0.0 + # - langchain-openai + # - tavily-python + # - etc. 
+ +.github/ +└── env/ + └── agent_hallucination.json # Example agent configuration +``` + +### Key File Changes from "Old Version" + +| Old Path | New Path | Notes | +|----------|----------|-------| +| `dingo/model/agent/` | `dingo/model/llm/agent/` | Moved under LLM module hierarchy | +| N/A | `agent_wrapper.py` | Added LangChain 1.0 integration | +| N/A | `langchain_adapter.py` | Added Dingo ↔ LangChain adapters | +| `agent_fact_check_web.py` | `agent_fact_check.py` | Simplified naming | +| N/A | `agent_hallucination.py` | Added custom workflow example | +| `tools/web_search.py` | `tools/tavily_search.py` | Specific implementation naming | +| N/A | `tools/render_tool.py` | Added HTML rendering | +| N/A | `tools/mineru_ocr_tool.py` | Added OCR capabilities | + +--- + +## Core Components + +### 1. BaseAgent (base_agent.py) + +**Purpose**: Abstract base class for all agent-based evaluators + +**Key Features**: +- Extends `BaseOpenAI` to inherit LLM functionality +- Supports dual execution paths: Legacy (manual) and LangChain (framework-driven) +- Manages tool execution and configuration injection +- Provides agent orchestration methods + +**Core Methods**: +```python +class BaseAgent(BaseOpenAI): + # Configuration + available_tools: List[str] = [] # Tools this agent can use + max_iterations: int = 5 # Safety limit + use_agent_executor: bool = False # Enable LangChain path + + # Abstract methods (must implement) + @abstractmethod + def plan_execution(cls, input_data: Data) -> List[Dict[str, Any]] + @abstractmethod + def aggregate_results(cls, input_data: Data, results: List[Any]) -> EvalDetail + + # Main evaluation entry point + def eval(cls, input_data: Data) -> EvalDetail + + # Tool execution + def execute_tool(cls, tool_name: str, **kwargs) -> Dict[str, Any] + def configure_tool(cls, tool_name: str, tool_class) + + # LangChain integration + def _eval_with_langchain_agent(cls, input_data: Data) -> EvalDetail + def get_langchain_tools(cls) + def 
_format_agent_input(cls, input_data: Data) -> str + def _get_system_prompt(cls, input_data: Data) -> str +``` + +**Execution Flow**: +``` +eval() +├─ use_agent_executor == True? (standard path) +│ ├─ Yes → _eval_with_langchain_agent() +│ │ ├─ get_langchain_tools() +│ │ ├─ get_langchain_llm() +│ │ ├─ AgentWrapper.create_agent() +│ │ ├─ AgentWrapper.invoke_and_format() +│ │ └─ aggregate_results() +│ │ +│ └─ No → Legacy path +│ ├─ plan_execution() +│ ├─ Loop through plan steps +│ │ ├─ execute_tool() for tool steps +│ │ └─ send_messages() for LLM steps +│ └─ aggregate_results() + +Note: ArticleFactChecker overrides eval() entirely and uses a two-phase +async parallel architecture (asyncio.run → _async_eval) instead of +the above base-class dispatch. See ArticleFactChecker section below. +``` + +### 2. Tool System + +#### BaseTool (tools/base_tool.py) + +**Purpose**: Abstract interface for all agent tools + +```python +class BaseTool(ABC): + name: str # Unique identifier + description: str # For LLM understanding + config: ToolConfig # Tool-specific config + + @abstractmethod + def execute(cls, **kwargs) -> Dict[str, Any] + def validate_config(cls) + def update_config(cls, config_dict: Dict[str, Any]) +``` + +#### ToolRegistry (tools/tool_registry.py) + +**Purpose**: Central registry for tool discovery and management + +**Key Features**: +- Auto-discovery via `@tool_register()` decorator +- Lazy loading (tools loaded on first use) +- Configuration injection from agent config + +```python +@tool_register("tavily_search") +class TavilySearch(BaseTool): + name = "tavily_search" + description = "Search the web using Tavily API" + + @classmethod + def execute(cls, query: str, **kwargs) -> Dict[str, Any]: + # Implementation + return { + 'success': True, + 'results': [...], + 'answer': "..." 
+ } +``` + +**Built-in Tools**: + +| Tool | File | Purpose | Dependencies | +|------|------|---------|--------------| +| `claims_extractor` | `claims_extractor.py` | LLM-based claims extraction | `openai` | +| `arxiv_search` | `arxiv_search.py` | Academic paper search | `arxiv` | +| `tavily_search` | `tavily_search.py` | Web search via Tavily API | `tavily-python` | +| `render_tool` | `render_tool.py` | HTML rendering with Playwright | `playwright` | +| `mineru_ocr_tool` | `mineru_ocr_tool.py` | OCR with MinerU | `magic-pdf` | + +### 3. LangChain Integration + +#### AgentWrapper (agent_wrapper.py) + +**Purpose**: Wrapper for LangChain 1.0 create_agent API + +**Key Methods**: +```python +class AgentWrapper: + @staticmethod + def create_agent(llm, tools, system_prompt, **config) + # Uses langchain.agents.create_agent (LangGraph-based) + + @staticmethod + def invoke_and_format(agent, input_text, input_data, max_iterations) + # Invokes agent and formats results for Dingo + + @staticmethod + def get_openai_llm_from_dingo_config(dynamic_config) + # Creates ChatOpenAI from Dingo config +``` + +**LangChain 1.0 Changes** (Nov 2025): +- Uses `create_agent()` instead of deprecated `AgentExecutor` +- Built on LangGraph for better state management +- `recursion_limit` instead of `max_iterations` +- Message-based invocation interface + +#### LangChain Adapter (langchain_adapter.py) + +**Purpose**: Converts Dingo tools to LangChain StructuredTool format + +```python +def convert_dingo_tools(tool_names: List[str], agent_class) -> List[StructuredTool]: + # Wraps Dingo tools for LangChain compatibility + # Preserves Dingo's configuration injection mechanism +``` + +### 4. 
Agent Implementations + +#### AgentFactCheck (agent_fact_check.py) + +**Pattern**: LangChain-Based (Framework-Driven) + +**Key Characteristics**: +- Sets `use_agent_executor = True` +- Overrides `_format_agent_input()` for custom input formatting +- Overrides `_get_system_prompt()` for task-specific instructions +- LangChain handles autonomous tool calling and reasoning +- Parses structured output in `aggregate_results()` + +**Workflow**: +``` +Input: Question + Response + Context (optional) + ↓ +LangChain Agent decides: + - With context: MAY search for additional verification + - Without context: MUST search to verify facts + ↓ +Agent autonomously: + - Calls tavily_search tool as needed + - Reasons about results + - Returns structured output (HALLUCINATION_DETECTED: YES/NO) + ↓ +aggregate_results() parses output → EvalDetail +``` + +**When to Use**: +- ✅ Complex multi-step reasoning +- ✅ Benefit from LangChain's orchestration +- ✅ Prefer declarative style +- ✅ Rapid prototyping + +#### AgentHallucination (agent_hallucination.py) + +**Pattern**: Custom Workflow (Imperative) + +**Key Characteristics**: +- Implements custom `eval()` with explicit workflow +- Manually calls `execute_tool()` for searches +- Manually calls `send_messages()` for LLM interactions +- Delegates to existing evaluator (LLMHallucination) +- Full control over execution flow + +**Workflow**: +``` +Input: Content + Context (optional) + ↓ +Check context availability + ↓ +├─ Has context? → Delegate to LLMHallucination +│ +└─ No context? → Agent workflow: + 1. Extract factual claims (LLM call) + 2. Search web for each claim (Tavily tool) + 3. Synthesize context (combine results) + 4. 
Evaluate with synthesized context (LLMHallucination) + ↓ +Return EvalDetail with provenance +``` + +**When to Use**: +- Fine-grained control over steps +- Compose with existing evaluators +- Prefer explicit behavior +- Domain-specific workflows +- Conditional logic between steps + +#### ArticleFactChecker (agent_article_fact_checker.py) + +**Pattern**: Agent-First with Context Tracking (LangChain ReAct + Artifact Saving) + +**Key Characteristics**: +- Sets `use_agent_executor = True` (same as AgentFactCheck) +- Overrides `eval()` to add context tracking and file saving +- Uses thread-local storage (`threading.local()`) for concurrent safety +- Extracts claims from tool_calls observation data +- Builds enriched per-claim verification records +- Saves intermediate artifacts (article, claims, verification, report) +- Produces dual-layer `EvalDetail.reason`: `[text_summary, structured_report_dict]` + +**Workflow** (two-phase parallel architecture): +``` +Input: Article text (Markdown) + | +eval() override: + |- Save article content to output_path + |- asyncio.run(_async_eval()) + | +Phase 1 — Claims Extraction: + |- ClaimsExtractor.execute(content) # Direct tool call, not via agent + |- Returns list of factual claims + | +Phase 2 — Parallel Claim Verification: + |- asyncio.gather() with Semaphore(max_concurrent_claims) + |- Each claim → independent LangChain mini-agent + │ |- _async_verify_single_claim() + │ |- AgentWrapper.async_invoke_and_format() + │ |- _parse_claim_json_robust() # 3-tier robust JSON parsing + │ └─ Returns per-claim verdict + | +Aggregation: + |- _aggregate_parallel_results() + |- _recalculate_summary() + |- Save artifacts (claims_extracted.jsonl, claims_verification.jsonl, report.json) + |- Return EvalDetail with dual-layer reason +``` + +**When to Use**: +- Article-level comprehensive fact-checking +- Need intermediate artifacts (claims list, per-claim details, full report) +- Benefit from transparent evidence chains +- Want structured report 
alongside text summary
+
+---
+
+## Data Flow
+
+### Complete Evaluation Pipeline
+
+```
+┌───────────────────────────────────────────────────────────────┐
+│ 1. Configuration Loading                                      │
+└───────────────────────────────────────────────────────────────┘
+  JSON Config → InputArgs (Pydantic) → EvaluatorArgs
+    ├─ name: "AgentFactCheck"
+    ├─ config.key: API key
+    ├─ config.model: "gpt-4"
+    └─ config.parameters.agent_config:
+        ├─ max_iterations: 10
+        └─ tools:
+            └─ tavily_search:
+                └─ api_key: "..."
+
+┌───────────────────────────────────────────────────────────────┐
+│ 2. Data Loading & Conversion                                  │
+└───────────────────────────────────────────────────────────────┘
+  DataSource.load() → Generator[raw_data]
+    ↓
+  Converter.convert() → Data objects
+    ├─ content: str
+    ├─ prompt: Optional[str]
+    ├─ context: Optional[List[str]]
+    └─ raw_data: Dict
+
+┌───────────────────────────────────────────────────────────────┐
+│ 3. Agent Execution (ThreadPoolExecutor)                       │
+└───────────────────────────────────────────────────────────────┘
+  BaseAgent.eval(Data) → EvalDetail
+    │
+    ├─ use_agent_executor?
+    │
+    ├─ YES (LangChain Path):
+    │   ├─ _format_agent_input(Data) → input_text
+    │   ├─ _get_system_prompt(Data) → system_prompt
+    │   ├─ get_langchain_tools() → StructuredTool[]
+    │   ├─ get_langchain_llm() → ChatOpenAI
+    │   ├─ AgentWrapper.create_agent() → CompiledStateGraph
+    │   ├─ AgentWrapper.invoke_and_format()
+    │   │   ├─ Agent reasoning loop (ReAct)
+    │   │   ├─ Tool calls (autonomous)
+    │   │   └─ Final output
+    │   └─ aggregate_results() → EvalDetail
+    │
+    └─ NO (Legacy Path):
+        ├─ plan_execution(Data) → plan: List[step]
+        ├─ Loop through steps:
+        │   ├─ Tool step: execute_tool(name, **args)
+        │   │   ├─ ToolRegistry.get(name)
+        │   │   ├─ configure_tool()
+        │   │   └─ tool.execute()
+        │   └─ LLM step: send_messages(messages)
+        └─ aggregate_results(results) → EvalDetail
+
+┌───────────────────────────────────────────────────────────────┐
+│ 4. 
Result Aggregation │ +└───────────────────────────────────────────────────────────────┘ + EvalDetail + ├─ metric: str # "AgentFactCheck" + ├─ status: bool # True = issue detected + ├─ score: Optional[float] # Numeric score + ├─ label: List[str] # ["QUALITY_BAD.HALLUCINATION"] + └─ reason: List[Any] # Dual-layer reason: + # reason[0]: str (human-readable text) + # reason[1]: Dict (structured report, optional) + # ArticleFactChecker uses this for + # text summary + full report dict + +┌───────────────────────────────────────────────────────────────┐ +│ 5. Summary Generation │ +└───────────────────────────────────────────────────────────────┘ + ResultInfo → SummaryModel + ├─ total_count: int + ├─ good_count: int + ├─ bad_count: int + ├─ type_ratio: Dict[field, Dict[label, count]] + └─ metrics_score_stats: Dict[metric, stats] +``` + +### Tool Execution Flow + +``` +BaseAgent.execute_tool(tool_name, **kwargs) + ↓ +Check if tool in available_tools + ↓ +ToolRegistry.get(tool_name) → tool_class + ↓ +configure_tool(tool_name, tool_class) + ├─ Extract config from dynamic_config.parameters.agent_config.tools.{tool_name} + └─ tool_class.update_config(config_dict) + ↓ +tool_class.execute(**kwargs) + ├─ Tool-specific logic (API calls, processing, etc.) + └─ Return Dict[str, Any] with 'success' key + ↓ +Return to agent for processing +``` + +--- + +## Summary + +### Key Takeaways + +1. **Architecture**: Agents extend `BaseOpenAI` and are registered via `@Model.llm_register()` +2. **Location**: All agent code lives under `dingo/model/llm/agent/` +3. **Three Patterns**: LangChain-based (declarative), Custom Workflow (imperative), Agent-First + Context (hybrid) +4. **Tool System**: Centralized registry with configuration injection +5. **Execution**: Runs in ThreadPoolExecutor alongside other LLMs +6. **Configuration**: Nested under `parameters.agent_config` in evaluator config +7. 
**Artifact Saving**: ArticleFactChecker auto-saves intermediate artifacts to a timestamped directory by default; override via `agent_config.output_path`, or disable with `agent_config.save_artifacts=false` + +### Implementation Checklist + +Creating a new agent: +- [ ] Choose pattern (LangChain vs Custom) +- [ ] Create agent file under `dingo/model/llm/agent/` +- [ ] Extend `BaseAgent` +- [ ] Register with `@Model.llm_register("YourAgent")` +- [ ] Define `available_tools` list +- [ ] Implement required methods based on pattern +- [ ] Add tests under `test/scripts/model/llm/agent/` +- [ ] Update documentation +- [ ] Add example usage under `examples/agent/` + +Creating a new tool: +- [ ] Create tool file under `dingo/model/llm/agent/tools/` +- [ ] Extend `BaseTool` +- [ ] Register with `@tool_register("your_tool")` +- [ ] Implement `execute()` method +- [ ] Define custom `ToolConfig` if needed +- [ ] Add tests under `test/scripts/model/llm/agent/tools/` +- [ ] Update requirements/agent.txt if dependencies needed + +### Next Steps + +- Read `docs/agent_development_guide.md` for detailed implementation guide +- Study `agent_fact_check.py` for LangChain pattern example +- Study `agent_hallucination.py` for custom workflow example +- Study `agent_article_fact_checker.py` for Agent-First + artifact saving pattern +- Review `examples/agent/` for usage examples +- Check `test/scripts/model/llm/agent/` for testing patterns + +--- + +## Reference Links + +- [Agent Development Guide](./agent_development_guide.md) - Comprehensive development guide +- [Article Fact-Checking Guide](./article_fact_checking_guide.md) - ArticleFactChecker usage guide +- [CLAUDE.md](../CLAUDE.md) - Project overview and common commands +- [LangChain Documentation](https://python.langchain.com/docs/concepts/agents/) - Agent concepts +- [Tavily API](https://tavily.com/) - Web search tool documentation diff --git a/docs/agent_development_guide.md b/docs/agent_development_guide.md index 
3a5dc3d0..da071b7c 100644 --- a/docs/agent_development_guide.md +++ b/docs/agent_development_guide.md @@ -48,18 +48,18 @@ Data → Agent → [Tool 1, Tool 2, ...] → LLM Reasoning → EvalDetail ## Agent Implementation Patterns -Dingo supports two complementary patterns for implementing agent-based evaluators. Both patterns share the same configuration interface and are transparent to users, allowing you to choose the approach that best fits your needs. +Dingo supports three complementary patterns for implementing agent-based evaluators. All patterns share the same configuration interface and are transparent to users, allowing you to choose the approach that best fits your needs. ### Pattern Comparison -| Aspect | LangChain-Based | Custom Workflow | -|--------|-----------------|-----------------| -| **Control** | Framework-driven | Developer-driven | -| **Complexity** | Simple (declarative) | Moderate (imperative) | -| **Flexibility** | Limited to LangChain patterns | Unlimited | -| **Code Volume** | Low (~100 lines) | Medium (~200 lines) | -| **Best For** | Multi-step reasoning | Workflow composition | -| **Example** | AgentFactCheck | AgentHallucination | +| Aspect | LangChain-Based | Custom Workflow | Agent-First + Context | +|--------|-----------------|-----------------|----------------------| +| **Control** | Framework-driven | Developer-driven | Framework + override | +| **Complexity** | Simple (declarative) | Moderate (imperative) | Moderate (hybrid) | +| **Flexibility** | Limited to LangChain | Unlimited | LangChain + artifacts | +| **Code Volume** | Low (~100 lines) | Medium (~200 lines) | High (~500+ lines) | +| **Best For** | Multi-step reasoning | Workflow composition | Article-level verification | +| **Example** | AgentFactCheck | AgentHallucination | ArticleFactChecker | ### Pattern 1: LangChain-Based Agents (Framework-Driven) @@ -367,25 +367,140 @@ Provide a concise summary of the key facts.""" --- +### Pattern 3: Agent-First with Context Tracking 
(ArticleFactChecker) + +**Philosophy**: Use LangChain's ReAct pattern for autonomous reasoning, override `eval()` and `aggregate_results()` for context tracking and artifact saving. + +#### When to Use + +- Article-level comprehensive verification (many claims) +- Need intermediate artifacts (claims list, per-claim details, structured report) +- Want dual-layer output: human-readable text + structured data +- Benefit from thread-safe concurrent evaluation + +#### Key Implementation Steps + +1. Set `use_agent_executor = True` (same as Pattern 1) +2. **Override `eval()`** with a two-phase async architecture: + - Save article content to output directory + - Call `asyncio.run(cls._async_eval(input_data, ...))` (bypasses `_eval_with_langchain_agent`) + - Phase 1: Direct `ClaimsExtractor.execute()` call (no agent overhead) + - Phase 2: Per-claim verification via `asyncio.gather()` + `Semaphore(max_concurrent_claims)` +3. **Each claim** gets its own independent LangChain mini-agent: + - `_async_verify_single_claim()` invokes `AgentWrapper.async_invoke_and_format()` + - Results parsed by `_parse_claim_json_robust()` (3-tier robust parser) +4. 
**Aggregation** via `_aggregate_parallel_results()` and `_recalculate_summary()` + - Save artifacts (claims_extracted.jsonl, claims_verification.jsonl, report.json) + - Return EvalDetail with dual-layer reason: `[text_summary, report_dict]` + +#### Async Parallel Execution Pattern + +```python +import asyncio +import threading + +class ArticleFactChecker(BaseAgent): + _thread_local = threading.local() + _claims_extractor_lock = threading.Lock() # Thread-safe config mutation + + @classmethod + def eval(cls, input_data: Data) -> EvalDetail: + start_time = time.time() + output_dir = cls._get_output_dir() + if output_dir and input_data.content: + cls._save_article_content(output_dir, input_data.content) + try: + return asyncio.run(cls._async_eval(input_data, start_time, output_dir)) + except RuntimeError: + # Fallback for already-running event loop (e.g., Jupyter) + loop = asyncio.new_event_loop() + return loop.run_until_complete(cls._async_eval(input_data, start_time, output_dir)) + + @classmethod + async def _async_eval(cls, input_data, start_time, output_dir) -> EvalDetail: + claims = await cls._async_extract_claims(input_data) + semaphore = asyncio.Semaphore(cls._get_max_concurrent_claims()) + tasks = [cls._async_verify_single_claim(c, semaphore, ...) for c in claims] + results = await asyncio.gather(*tasks, return_exceptions=True) + return cls._build_eval_detail(results, start_time, output_dir, input_data) +``` + +#### Output Path Access Pattern + +`_get_output_dir()` uses a three-priority chain (highest to lowest): + +1. **Explicit path** – `agent_config.output_path` is set → use it (backward-compatible) +2. **Opt-out** – `agent_config.save_artifacts=false` → return `None`, skip saving +3. 
**Auto-generate** – default behaviour: `outputs/article_factcheck_<timestamp>_<uuid>/` + - Override the base directory with `agent_config.base_output_path` + +```python +@classmethod +def _get_output_dir(cls) -> Optional[str]: + """ + Get output directory for artifact files (three-priority chain). + Returns output dir path (created if needed), or None if saving disabled. + """ + params = cls.dynamic_config.parameters or {} + agent_cfg = params.get('agent_config') or {} + + explicit_path = agent_cfg.get('output_path') + if explicit_path: + os.makedirs(explicit_path, exist_ok=True) + return explicit_path + + if agent_cfg.get('save_artifacts') is False: + return None # Opted out of artifact saving + + base_output = agent_cfg.get('base_output_path') or 'outputs' + create_time = time.strftime("%Y%m%d_%H%M%S", time.localtime()) + auto_path = os.path.join(base_output, f"article_factcheck_{create_time}_{uuid.uuid4().hex[:6]}") + os.makedirs(auto_path, exist_ok=True) + return auto_path +``` + +#### Dual-Layer EvalDetail.reason + +```python +# reason[0]: Human-readable text summary (str) +# reason[1]: Structured report dict (JSON-serializable, optional) +result.reason = [text_summary] +if report: + result.reason.append(report) # Dict, not str +``` + +This ensures the Dingo standard output contains both readable summaries and full structured data. + +**Full implementation**: `dingo/model/llm/agent/agent_article_fact_checker.py` +**Tests**: `test/scripts/model/llm/agent/test_article_fact_checker.py` (88 tests), +`test/scripts/model/llm/agent/test_async_article_fact_checker.py` (30 tests) +**Guide**: `docs/article_fact_checking_guide.md` + +--- + ### Decision Tree: Which Pattern Should I Use? ``` Start - │ - ├─ Do you need to compose with existing Dingo evaluators? - │ ├─ Yes → Use Custom Pattern (AgentHallucination style) - │ └─ No → Continue - │ - ├─ Is your workflow highly domain-specific? 
- │ ├─ Yes → Use Custom Pattern - │ └─ No → Continue - │ - ├─ Do you prefer explicit control over every step? - │ ├─ Yes → Use Custom Pattern - │ └─ No → Continue - │ - └─ Default → Use LangChain Pattern (AgentFactCheck style) - ✅ Simpler, less code, battle-tested + | + +- Do you need intermediate artifact saving (claims, reports)? + | +- Yes -> Use Agent-First + Context (ArticleFactChecker style) + | +- No -> Continue + | + +- Do you need to compose with existing Dingo evaluators? + | +- Yes -> Use Custom Pattern (AgentHallucination style) + | +- No -> Continue + | + +- Is your workflow highly domain-specific? + | +- Yes -> Use Custom Pattern + | +- No -> Continue + | + +- Do you prefer explicit control over every step? + | +- Yes -> Use Custom Pattern + | +- No -> Continue + | + +- Default -> Use LangChain Pattern (AgentFactCheck style) + Simpler, less code, battle-tested ``` ### Can I Mix Both Patterns? @@ -395,6 +510,7 @@ Start ```json { "evaluator": [{ + "fields": {"content": "content"}, "evals": [ {"name": "AgentFactCheck"}, // LangChain-based {"name": "AgentHallucination"} // Custom workflow @@ -1408,7 +1524,11 @@ class TestMyAgent: - **AgentHallucination**: `dingo/model/llm/agent/agent_hallucination.py` - Production agent with web search - **AgentFactCheck**: `examples/agent/agent_executor_example.py` - LangChain 1.0 agent example +- **ArticleFactChecker**: `dingo/model/llm/agent/agent_article_fact_checker.py` - Agent-First with context tracking and artifact saving +- **ArticleFactChecker Example**: `examples/agent/agent_article_fact_checking_example.py` - Full article fact-checking example - **TavilySearch Tool**: `dingo/model/llm/agent/tools/tavily_search.py` - Web search tool implementation +- **ClaimsExtractor Tool**: `dingo/model/llm/agent/tools/claims_extractor.py` - LLM-based claims extraction tool +- **ArxivSearch Tool**: `dingo/model/llm/agent/tools/arxiv_search.py` - Academic paper search tool **Note**: For complete implementation examples, refer 
to the files above. They demonstrate real-world patterns for agent and tool development. @@ -1525,10 +1645,15 @@ summary = executor.execute() ## Additional Resources - [AgentHallucination Implementation](../dingo/model/llm/agent/agent_hallucination.py) +- [ArticleFactChecker Implementation](../dingo/model/llm/agent/agent_article_fact_checker.py) - [BaseAgent Source](../dingo/model/llm/agent/base_agent.py) - [Tool Registry Source](../dingo/model/llm/agent/tools/tool_registry.py) - [Tavily Search Example](../dingo/model/llm/agent/tools/tavily_search.py) +- [Claims Extractor](../dingo/model/llm/agent/tools/claims_extractor.py) +- [ArxivSearch](../dingo/model/llm/agent/tools/arxiv_search.py) - [Example Usage](../examples/agent/agent_hallucination_example.py) +- [Article Fact-Checking Example](../examples/agent/agent_article_fact_checking_example.py) +- [Article Fact-Checking Guide](./article_fact_checking_guide.md) --- diff --git a/docs/article_fact_checking_guide.md b/docs/article_fact_checking_guide.md new file mode 100644 index 00000000..74410a31 --- /dev/null +++ b/docs/article_fact_checking_guide.md @@ -0,0 +1,860 @@ +# Article Fact-Checking Guide + +This guide explains how to use the `ArticleFactChecker` agent for comprehensive article fact-checking. + +## Overview + +The `ArticleFactChecker` is an Agent-First architecture implementation that autonomously: +1. Extracts verifiable claims from long-form articles +2. Selects appropriate verification tools based on claim types +3. Verifies institutional attributions and factual statements +4. 
Generates structured verification reports with evidence + +**Implementation Pattern:** Agent-First (LangChain 1.0 ReAct) + +## Quick Start + +### Basic Usage (Direct Evaluation) + +```python +import os +from dingo.io.input import Data +from dingo.model.llm.agent import ArticleFactChecker + +# Set API keys (use environment variables) +os.environ["OPENAI_API_KEY"] = "your-openai-api-key" +os.environ["TAVILY_API_KEY"] = "your-tavily-api-key" # Optional + +# Fact-check article +article_text = """ +Your article content here... +""" + +data = Data(content=article_text) +result = ArticleFactChecker.eval(data) + +# View results +print(f"Accuracy: {result.score:.1%}") +print(f"Issues Found: {result.status}") + +# reason[0]: Human-readable text summary (always present) +if result.reason: + print(result.reason[0] if isinstance(result.reason[0], str) else str(result.reason[0])) + + # reason[1]: Structured report dict (always present after evaluation) + if len(result.reason) > 1 and isinstance(result.reason[1], dict): + report = result.reason[1] + print(f"Report Version: {report.get('report_version', 'N/A')}") +``` + +### Advanced Usage (Full Configuration) + +> **Note**: Executor requires `input_path` pointing to a file. The `plaintext` format reads +> line-by-line, splitting the article into separate Data objects per line. Use `jsonl` format +> instead: `json.dumps` encodes newlines as `\n`, keeping the entire article as one Data object. 
+ +```python +import json +import os +import tempfile + +from dingo.config import InputArgs +from dingo.exec import Executor + +# Read article and convert to JSONL (entire article as one Data object) +with open("article.md", "r") as f: + article_text = f.read() + +temp_jsonl = tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False, encoding='utf-8') +temp_jsonl.write(json.dumps({"content": article_text}, ensure_ascii=False) + '\n') +temp_jsonl.close() + +# Configure ArticleFactChecker with full options +config = { + "input_path": temp_jsonl.name, + "dataset": {"source": "local", "format": "jsonl"}, + "executor": {"max_workers": 1}, + "evaluator": [{ + "fields": {"content": "content"}, + "evals": [{ + "name": "ArticleFactChecker", + "config": { + "key": os.getenv("OPENAI_API_KEY"), + "model": "deepseek-chat", # or "gpt-4o-mini" for OpenAI + "parameters": { + "agent_config": { + "max_iterations": 15, + "output_path": "outputs/article_factcheck/", # Optional: save intermediate artifacts + "tools": { + "claims_extractor": { + "api_key": os.getenv("OPENAI_API_KEY"), + "max_claims": 50, + "claim_types": [ + "factual", "statistical", "attribution", "institutional", + "temporal", "comparative", "monetary", "technical" + ] + }, + "tavily_search": { + "api_key": os.getenv("TAVILY_API_KEY") + }, + "arxiv_search": {"max_results": 5} + } + } + } + } + }] + }] +} + +# Execute +input_args = InputArgs(**config) +result = Executor.exec_map["local"](input_args).execute() + +print(f"Total: {result.total_count}, Good: {result.good_count}, Bad: {result.bad_count}") + +# Cleanup +os.unlink(temp_jsonl.name) +``` + +### CLI Usage + +```bash +# 1. Convert article to JSONL format (entire article as one line) +python -c " +import json +with open('path/to/article.md', 'r') as f: + text = f.read() +with open('article_input.jsonl', 'w') as f: + f.write(json.dumps({'content': text}, ensure_ascii=False) + '\n') +" + +# 2. 
Create configuration file +cat > article_check_config.json << EOF +{ + "input_path": "article_input.jsonl", + "dataset": { + "source": "local", + "format": "jsonl" + }, + "evaluator": [{ + "fields": {"content": "content"}, + "evals": [{ + "name": "ArticleFactChecker", + "config": { + "key": "${OPENAI_API_KEY}", + "model": "deepseek-chat", + "parameters": { + "agent_config": { + "max_iterations": 15, + "tools": { + "claims_extractor": { + "api_key": "${OPENAI_API_KEY}", + "max_claims": 50 + }, + "tavily_search": { + "api_key": "${TAVILY_API_KEY}" + }, + "arxiv_search": {} + } + } + } + } + }] + }] +} +EOF + +# 3. Run fact-checking +python -m dingo.run.cli --input article_check_config.json +``` + +## Supported Article Types + +`ArticleFactChecker` is designed to handle various article types with adaptive verification strategies: + +### 1. Academic Articles + +**Characteristics:** Research paper announcements, academic news, conference proceedings + +**Claim Types:** institutional, attribution, statistical, factual + +**Verification Strategy:** +- Use `arxiv_search` for paper metadata (title, authors, abstract) +- Use `tavily_search` for institutional affiliations verification +- Combine both tools for comprehensive verification + +**Example:** +```python +academic_article = """ +百度刚刚发布的PaddleOCR-VL模型登顶了由清华大学、阿里达摩院等联合发布的OmniDocBench榜单。 +""" + +data = Data(content=academic_article) +result = ArticleFactChecker.eval(data) +``` + +**Expected Claims:** +- Attribution: "PaddleOCR-VL released by Baidu" +- Institutional: "OmniDocBench jointly released by Tsinghua and Alibaba DAMO" +- Factual: "PaddleOCR-VL topped OmniDocBench leaderboard" + +--- + +### 2. 
News Articles + +**Characteristics:** Tech news, product launches, current events, announcements + +**Claim Types:** temporal, attribution, factual, statistical, monetary + +**Verification Strategy:** +- Use `tavily_search` with date filters for temporal claims +- Verify attributions through official announcements +- Cross-check statistics with authoritative sources + +**Example:** +```python +news_article = """ +OpenAI于2024年12月5日正式发布o1推理模型。CEO Sam Altman表示这是AGI道路上的里程碑。 +根据技术报告,o1在数学推理任务上的准确率达到89.3%。ChatGPT Plus月费保持20美元。 +""" + +data = Data(content=news_article) +result = ArticleFactChecker.eval(data) +``` + +**Expected Claims:** +- Temporal: "Released on December 5, 2024" +- Attribution: "Sam Altman stated o1 is a milestone" +- Statistical: "89.3% accuracy on math reasoning" +- Monetary: "ChatGPT Plus remains $20/month" + +--- + +### 3. Product Reviews + +**Characteristics:** Gadget reviews, product comparisons, specifications + +**Claim Types:** technical, comparative, monetary, statistical, factual + +**Verification Strategy:** +- Use `tavily_search` for official specifications +- Verify comparative claims with benchmark databases +- Check pricing against official sources + +**Example:** +```python +product_review = """ +iPhone 15 Pro搭载A17 Pro芯片,采用3纳米工艺。 +GPU性能相比A16提升20%。国行128GB版售价7999元。 +在Geekbench 6测试中,单核跑分达到2920。 +""" + +data = Data(content=product_review) +result = ArticleFactChecker.eval(data) +``` + +**Expected Claims:** +- Technical: "A17 Pro chip with 3nm process" +- Comparative: "GPU improved 20% vs A16" +- Monetary: "128GB priced at 7999 yuan" +- Statistical: "Geekbench single-core: 2920" + +--- + +### 4. 
Technical Blogs + +**Characteristics:** Engineering blogs, tutorials, technical analysis + +**Claim Types:** factual, attribution, technical, comparative + +**Verification Strategy:** +- Use `tavily_search` for technical documentation +- Verify code examples and API usage +- Cross-check with official docs and benchmarks + +**Example:** +```python +tech_blog = """ +React 18引入了并发渲染特性,性能提升了3倍。 +根据Dan Abramov的博客,新的Suspense API简化了异步数据加载。 +""" + +data = Data(content=tech_blog) +result = ArticleFactChecker.eval(data) +``` + +**Expected Claims:** +- Factual: "React 18 introduced concurrent rendering" +- Comparative: "Performance improved 3x" +- Attribution: "Dan Abramov stated Suspense simplifies async loading" + +--- + +### Claim Types Reference + +The agent supports **8 claim types** (expanded from original 4): + +| Claim Type | Description | Example | +|------------|-------------|---------| +| **factual** | General facts | "The tower is 330 meters tall" | +| **statistical** | Numbers, percentages, metrics | "Model has 0.9B parameters" | +| **attribution** | Who said/did/published what | "Vaswani et al. 
proposed Transformer" | +| **institutional** | Organizations, affiliations | "Released by MIT and Stanford" | +| **temporal** | Time-related claims | "Released on Dec 5, 2024" | +| **comparative** | Comparisons between entities | "GPU improved 20% vs A16" | +| **monetary** | Financial figures, prices | "Priced at $999" | +| **technical** | Technical specifications | "A17 Pro chip with 3nm process" | + +Note: temporal, comparative, monetary, technical types were added in v0.3.0 for multi-type article support + +--- + +## How It Works + +### Agent-First Architecture + +The `ArticleFactChecker` uses **Agent-First** design with `use_agent_executor = True`: + +``` +┌─────────────────────────────────────────────────┐ +│ ArticleFactChecker (LangChain Agent) │ +│ [Autonomous Decision-Making] │ +└─────────────────────────────────────────────────┘ + ↓ Autonomous Decision + ┌──────────────────────────────┐ + │ Available Tools │ + └──────────────────────────────┘ + ↓ ↓ ↓ +┌──────────┐ ┌─────────┐ ┌──────────┐ +│claims_ │ │arxiv_ │ │tavily_ │ +│extractor │ │search │ │search │ +└──────────┘ └─────────┘ └──────────┘ +``` + +**Key Advantages:** +- **Intelligent Tool Selection**: Agent chooses tools based on claim semantics +- **Multi-Step Reasoning**: Builds evidence chains across multiple verifications +- **Adaptive Strategies**: Adjusts approach based on intermediate results +- **Fallback Mechanisms**: Tries alternative tools if initial verification fails + +### Workflow + +**Step 0: Article Type Analysis** + - Agent first identifies the article type: academic, news, product, blog, policy, opinion + - This classification guides claim extraction and verification strategy + - Different article types emphasize different claim types: + - Academic → institutional, attribution, statistical + - News → temporal, attribution, factual + - Product → technical, comparative, monetary + - Blog → factual, technical, attribution + +**Step 1: Claims Extraction** + - Agent calls 
`claims_extractor` tool on full article + - Extracts atomic, verifiable claims with 8 types: factual, statistical, attribution, + institutional, temporal, comparative, monetary, technical + - Claims are decontextualized (stand-alone) for independent verification + +**Step 2: Autonomous Tool Selection** + - Agent analyzes each claim type and article context + - Selects best verification tool based on principles (not rigid IF-THEN rules): + - **Academic papers** → `arxiv_search` (metadata) + `tavily_search` (institutions) + - **Institutional/organizational claims** → `tavily_search` (primary) + - **Current events/news** → `tavily_search` with date filters + - **Product specs/pricing** → `tavily_search` for official sources + - **Technical documentation** → `tavily_search` for docs + - **Adaptive Strategy:** Combines tools, uses fallbacks, cross-verifies with multiple sources + +**Step 3: Verification** + - Agent calls selected tools to verify each claim + - Collects evidence and sources + - Adapts if initial verification fails + +**Step 4: Report Generation** + - Synthesizes verification results + - Generates structured report with: + - Summary statistics + - False claims comparison table + - Evidence and sources + - Severity ratings + +## Claim Types + +### Institutional Claims + +Claims about organizational affiliations: + +``` +Example: "OmniDocBench was released by Tsinghua University" + +Agent Decision: +1. Recognizes institutional claim +2. Checks if paper mentioned → Yes (OmniDocBench) +3. Selects arxiv_search tool +4. Searches for paper metadata and author affiliations +5. Compares claimed vs actual institutions via LLM reasoning +``` + +### Statistical Claims + +Claims with numbers or percentages: + +``` +Example: "The model has 0.9B parameters" + +Agent Decision: +1. Recognizes statistical claim +2. Selects tavily_search for general verification +3. Searches for official sources +4. 
Verifies number accuracy +``` + +### Factual Claims + +General factual statements: + +``` +Example: "PaddleOCR-VL topped the OmniDocBench leaderboard" + +Agent Decision: +1. Recognizes factual claim +2. Selects tavily_search +3. Searches for leaderboard information +4. Verifies ranking claim +``` + +## Configuration + +### Agent Configuration + +```python +{ + "agent_config": { + "max_iterations": 15, # Maximum reasoning steps + + # Artifact output path (three options, evaluated in priority order): + # 1. "output_path": "path/to/dir" → use explicit path (backward-compatible) + # 2. "save_artifacts": false → disable artifact saving entirely + # 3. (default) → auto-generate outputs/article_factcheck_<timestamp>_<uuid>/ + # Override base dir with "base_output_path": "custom/base/" + + "tools": { + "claims_extractor": { + "api_key": "...", + "max_claims": 50, # Max claims to extract + "claim_types": [ # Types to extract + "factual", + "statistical", + "attribution", + "institutional" + ], + "chunk_size": 2000, # Text chunk size + "include_context": true, # Include surrounding context + "temperature": 0.1 # LLM temperature + }, + "arxiv_search": { + "max_results": 5, # Max search results + "sort_by": "relevance", + "rate_limit_delay": 3.0 # Delay between requests + }, + "tavily_search": { + "api_key": "...", + "max_results": 5, + "search_depth": "advanced" # or "basic" + } + }, + "max_concurrent_claims": 5 # Max parallel claim verifications (asyncio Semaphore) + } +} +``` + +### Output Format + +The `EvalDetail` returned by `ArticleFactChecker` uses a **dual-layer reason** structure: + +- `reason[0]`: Human-readable text summary (always present, `str`) +- `reason[1]`: Structured report dictionary (always present after evaluation, `dict`) + +```python +{ + "metric": "ArticleFactChecker", + "status": true, # true = issues found, false = all good + "score": 0.75, # Overall accuracy (0.0-1.0) + "label": ["QUALITY_BAD_ARTICLE_FACTUAL_ERROR"], # or QUALITY_BAD_ARTICLE_UNVERIFIED_CLAIMS / 
QUALITY_GOOD + "reason": [ + # reason[0]: Human-readable text summary (str) + "Article Fact-Checking Report\n" + "======================================================================\n" + "Total Claims Analyzed: 20\n" + "Verified Claims: 15\n" + "False Claims: 5\n" + "Unverifiable Claims: 0\n" + "Overall Accuracy: 75.0%\n" + "\n" + "Agent Performance:\n" + " Tool Calls: 8\n" + " Reasoning Steps: 10\n" + "\n" + "FALSE CLAIMS DETAILED COMPARISON:\n" + "======================================================================\n" + "\n" + "#1 FALSE CLAIM\n" + " Article Claimed:\n" + " OmniDocBench was released by Tsinghua University...\n" + " Actual Truth:\n" + " OmniDocBench was released by Shanghai AI Lab, Abaka AI, 2077AI\n" + " Evidence:\n" + " Verified via arXiv paper 2412.07626 author list", + + # reason[1]: Structured report dict (always present) + { + "report_version": "2.0", + "generated_at": "2026-02-06T15:30:00", + "article_info": {"content_source": "markdown", "content_length": 5432}, + "claims_extraction": { + "total_extracted": 20, + "verifiable": 18, + "claim_types_distribution": {"factual": 5, "institutional": 3, "...": "..."} + }, + "verification_summary": { + "total_verified": 20, + "verified_true": 15, + "verified_false": 5, + "unverifiable": 0, + "accuracy_score": 0.75 + }, + "detailed_findings": ["..."], + "false_claims_comparison": ["..."], + "agent_metadata": { + "model": "deepseek-chat", + "tool_calls_count": 8, + "reasoning_steps": 10, + "execution_time_seconds": 45.2 + } + } + ] +} +``` + +### Output Files + +ArticleFactChecker auto-saves intermediate artifacts to a timestamped directory by default. + +**Dingo standard output** (saved to executor output_path): + +Default mode (`merge=false`, the default): +- `summary.json` - Aggregated statistics +- `content/