From 8cec9ec9b096cf4021f9d5ff82a39eb9ed3a2f79 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Mon, 11 May 2026 17:11:21 +0800 Subject: [PATCH 1/4] refactor(session): modular system prompts and runner wiring - Extend prompt composition (provider blocks, guidance, caching) and consolidate defaults; trim redundant static template bodies. - Update prompt_strings, Rex/Hephaestus builders, prompt_utils, and memory bootstrap for the new prompt pipeline. - Adjust session runner for Anthropic-style system blocks and related message handling. - Refresh session/agent/memory/integration tests; add Anthropic system blocks unit coverage. Co-authored-by: Cursor --- .../agent/agents/hephaestus/prompt_builder.py | 7 +- flocks/agent/agents/rex/prompt_builder.py | 87 +- flocks/agent/prompt_utils.py | 30 +- flocks/memory/bootstrap.py | 2 +- flocks/session/prompt.py | 814 ++++++++++++++---- flocks/session/prompt/anthropic-20250930.txt | 55 -- flocks/session/prompt/anthropic.txt | 89 -- flocks/session/prompt/beast.txt | 67 -- flocks/session/prompt/codex_header.txt | 14 - flocks/session/prompt/copilot-gpt-5.txt | 57 -- flocks/session/prompt/gemini.txt | 87 -- flocks/session/prompt/qwen.txt | 72 -- flocks/session/prompt_strings.py | 68 +- flocks/session/runner.py | 174 ++-- tests/agent/test_prompt_utils.py | 80 +- .../integration/test_capability_awareness.py | 49 +- tests/memory/test_prompt_memory.py | 85 +- .../provider/test_anthropic_system_blocks.py | 44 + tests/session/test_prompt_tokens.py | 53 +- tests/session/test_runner_step.py | 414 ++++++++- .../test_session_runner_tool_only_message.py | 11 +- 21 files changed, 1403 insertions(+), 956 deletions(-) create mode 100644 tests/provider/test_anthropic_system_blocks.py diff --git a/flocks/agent/agents/hephaestus/prompt_builder.py b/flocks/agent/agents/hephaestus/prompt_builder.py index ff1606d8..459dcb04 100644 --- a/flocks/agent/agents/hephaestus/prompt_builder.py +++ b/flocks/agent/agents/hephaestus/prompt_builder.py @@ -44,6 +44,7 
@@ def build_hephaestus_prompt( use_task_system: bool = False, ) -> str: from flocks.agent.prompt_utils import ( + build_agent_selection_table, build_key_triggers_section, build_tool_selection_table, build_explore_section, @@ -56,7 +57,8 @@ def build_hephaestus_prompt( ) key_triggers = build_key_triggers_section(available_agents, available_skills) - tool_selection = build_tool_selection_table(available_agents, available_tools, available_skills) + tool_selection = build_tool_selection_table([], available_tools, available_skills) + agent_selection = build_agent_selection_table(available_agents) explore_section = build_explore_section(available_agents) librarian_section = build_librarian_section(available_agents) category_skills_guide = build_category_skills_delegation_guide(available_categories, available_skills) @@ -184,6 +186,8 @@ def build_hephaestus_prompt( __TOOL_SELECTION__ +__AGENT_SELECTION__ + __EXPLORE_SECTION__ __LIBRARIAN_SECTION__ @@ -235,6 +239,7 @@ def build_hephaestus_prompt( prompt = template prompt = prompt.replace("__KEY_TRIGGERS__", key_triggers) prompt = prompt.replace("__TOOL_SELECTION__", tool_selection) + prompt = prompt.replace("__AGENT_SELECTION__", agent_selection) prompt = prompt.replace("__EXPLORE_SECTION__", explore_section) prompt = prompt.replace("__LIBRARIAN_SECTION__", librarian_section) prompt = prompt.replace("__CATEGORY_SKILLS_GUIDE__", category_skills_guide) diff --git a/flocks/agent/agents/rex/prompt_builder.py b/flocks/agent/agents/rex/prompt_builder.py index 42f03ea2..51e0fa2a 100644 --- a/flocks/agent/agents/rex/prompt_builder.py +++ b/flocks/agent/agents/rex/prompt_builder.py @@ -1,8 +1,7 @@ """ Rex agent dynamic prompt builder. -Builds the complete Rex system prompt including available agent delegation -tables, tool selection guides, and category/skill delegation instructions. +Builds Rex's stable orchestration policy plus agent-selection context. Called by agent_factory.inject_dynamic_prompts() after all agents are loaded. 
""" @@ -28,18 +27,6 @@ def inject( workflows: Optional[List["AvailableWorkflow"]] = None, ) -> None: """Build and inject Rex's dynamic system prompt.""" - from flocks.agent.prompt_utils import ( - build_key_triggers_section, - build_tool_selection_table, - build_explore_section, - build_librarian_section, - build_category_skills_delegation_guide, - build_delegation_table, - build_oracle_section, - build_hard_blocks_section, - build_anti_patterns_section, - ) - agent_info.prompt = build_dynamic_rex_prompt( available_agents=available_agents, available_tools=tools, @@ -59,32 +46,28 @@ def build_dynamic_rex_prompt( use_task_system: bool = False, ) -> str: from flocks.agent.prompt_utils import ( + build_agent_selection_table, build_key_triggers_section, - build_tool_selection_table, build_explore_section, build_librarian_section, - build_category_skills_delegation_guide, - build_delegation_table, build_oracle_section, build_hard_blocks_section, build_anti_patterns_section, - build_workflows_section, ) + _ = available_tools, available_categories, available_workflows + key_triggers = build_key_triggers_section(available_agents, available_skills) + agent_selection = build_agent_selection_table(available_agents) security_priority = _build_security_priority_section(available_agents) im_send_section = _build_im_send_section() - tool_selection = build_tool_selection_table(available_agents, available_tools, available_skills) explore_section = build_explore_section(available_agents) librarian_section = build_librarian_section(available_agents) - category_skills_guide = build_category_skills_delegation_guide(available_categories, available_skills) - delegation_table = build_delegation_table(available_agents) oracle_section = build_oracle_section(available_agents) hard_blocks = build_hard_blocks_section() anti_patterns = build_anti_patterns_section() slash_commands_section = _build_slash_commands_section() task_management_section = _task_management_section(use_task_system) - 
workflows_section = build_workflows_section(available_workflows or []) todo_hook_note = ( "YOUR TASK CREATION WOULD BE TRACKED BY HOOK([SYSTEM REMINDER - TASK CONTINUATION])" if use_task_system @@ -146,6 +129,8 @@ def build_dynamic_rex_prompt( - Do I have any implicit assumptions that might affect the outcome? - Is the search scope clear? +__AGENT_SELECTION__ + **Direct Tool Check (MANDATORY before delegating):** 1. Is this a simple, single-step request that I can complete with direct tools? 2. Is there a clear tool path now, or a short `tool_search` -> tool-call path, without needing specialist judgment? @@ -154,10 +139,10 @@ def build_dynamic_rex_prompt( **Delegation Check (MANDATORY before acting directly):** 1. Is there a specialized agent that perfectly matches this request? -2. If not, is there a `delegate_task` category best describes this task? (visual-engineering, ultrabrain, quick etc.) What skills are available to equip the agent with? - - If delegating by `category=...`, you MUST evaluate relevant skills and pass them via `load_skills=[...]`. +2. If not, should I use `delegate_task(category=...)` for a generic execution path, or continue with direct tools? + - If delegating by `category=...`, load only skills that are clearly relevant to the task. - If delegating by `subagent_type=...`, `load_skills` may be omitted unless a specific skill is clearly needed. - - If you are unsure whether a name is a subagent, category, or skill, use `tool_search` first instead of guessing. + - If you are unsure whether something is a subagent, category, or tool, use `tool_search` first instead of guessing. 3. Does this request require specialist judgment, multi-step investigation, attribution, correlation, batching, or a structured expert report? **Default Bias: Direct execution for super simple and single-step tasks. 
Delegate when specialization clearly improves quality or efficiency.** @@ -196,34 +181,7 @@ def build_dynamic_rex_prompt( --- -## Phase 1 - Codebase Assessment (for Open-ended tasks) - -Before following existing patterns, assess whether they're worth following. - -### Quick Assessment: -1. Check config files: linter, formatter, type config -2. Sample 2-3 similar files for consistency -3. Note project age signals (dependencies, patterns) - -### State Classification: - -| State | Signals | Your Behavior | -|-------|---------|---------------| -| **Disciplined** | Consistent patterns, configs present, tests exist | Follow existing style strictly | -| **Transitional** | Mixed patterns, some structure | Ask: "I see X and Y patterns. Which to follow?" | -| **Legacy/Chaotic** | No consistency, outdated patterns | Propose: "No clear conventions. I suggest [X]. OK?" | -| **Greenfield** | New/empty project | Apply modern best practices | - -IMPORTANT: If codebase appears undisciplined, verify before assuming: -- Different patterns may serve different purposes (intentional) -- Migration might be in progress -- You might be looking at the wrong reference files - ---- - -## Phase 2A - Exploration & Research - -__TOOL_SELECTION__ +## Phase 1 - Exploration & Research __EXPLORE_SECTION__ @@ -266,28 +224,22 @@ def build_dynamic_rex_prompt( --- -## Phase 2B - Implementation +## Phase 2 - Implementation ### Pre-Implementation: 1. If task has 2+ steps -> Create todo list IMMEDIATELY, IN SUPER DETAIL. No announcements-just create it. 2. Mark current task `in_progress` before starting 3. Mark `completed` as soon as done (don't batch) - OBSESSIVELY TRACK YOUR WORK USING TODO TOOLS -__CATEGORY_SKILLS_GUIDE__ - -__DELEGATION_TABLE__ - -### Delegation Prompt Structure (MANDATORY - ALL 6 sections): +### Delegation Prompt Structure (MANDATORY - ALL 4 sections): When delegating, your prompt MUST include: ``` 1. TASK: Atomic, specific goal (one action per delegation) -2. 
EXPECTED OUTCOME: Concrete deliverables with success criteria -3. REQUIRED TOOLS: Explicit tool whitelist (prevents tool sprawl) -4. MUST DO: Exhaustive requirements - leave NOTHING implicit -5. MUST NOT DO: Forbidden actions - anticipate and block rogue behavior -6. CONTEXT: File paths, existing patterns, constraints +2. OUTPUT: Concrete deliverables with success criteria +3. CONSTRAINTS: Must-do / must-not-do requirements that matter for correctness +4. CONTEXT: File paths, existing patterns, constraints ``` AFTER THE WORK YOU DELEGATED SEEMS DONE, ALWAYS VERIFY THE RESULTS AS FOLLOWING: @@ -409,8 +361,6 @@ def build_dynamic_rex_prompt( __ORACLE_SECTION__ -__AVAILABLE_WORKFLOWS__ - __TASK_MANAGEMENT_SECTION__ @@ -473,15 +423,12 @@ def build_dynamic_rex_prompt( prompt = template prompt = prompt.replace("__KEY_TRIGGERS__", key_triggers) + prompt = prompt.replace("__AGENT_SELECTION__", agent_selection) prompt = prompt.replace("__SECURITY_PRIORITY__", security_priority) prompt = prompt.replace("__IM_SEND_SECTION__", im_send_section) - prompt = prompt.replace("__TOOL_SELECTION__", tool_selection) prompt = prompt.replace("__EXPLORE_SECTION__", explore_section) prompt = prompt.replace("__LIBRARIAN_SECTION__", librarian_section) - prompt = prompt.replace("__CATEGORY_SKILLS_GUIDE__", category_skills_guide) - prompt = prompt.replace("__DELEGATION_TABLE__", delegation_table) prompt = prompt.replace("__ORACLE_SECTION__", oracle_section) - prompt = prompt.replace("__AVAILABLE_WORKFLOWS__", workflows_section) prompt = prompt.replace("__HARD_BLOCKS__", hard_blocks) prompt = prompt.replace("__ANTI_PATTERNS__", anti_patterns) prompt = prompt.replace("__SLASH_COMMANDS__", slash_commands_section) diff --git a/flocks/agent/prompt_utils.py b/flocks/agent/prompt_utils.py index 7f0cc7e8..c7fd70e6 100644 --- a/flocks/agent/prompt_utils.py +++ b/flocks/agent/prompt_utils.py @@ -105,40 +105,44 @@ def build_key_triggers_section( def build_tool_selection_table( - agents: 
List[AvailableAgent], + _agents: List[AvailableAgent], tools: Optional[List[AvailableTool]] = None, _skills: Optional[List[AvailableSkill]] = None, ) -> str: tools = tools or [] - rows: List[str] = ["### Tool & Agent Selection:"] + rows: List[str] = ["### Available Tools:"] if tools: tools_block = _format_tools_for_prompt(tools) if tools_block: - rows += [ - "", - "**Available Tools**:", - tools_block, - ] + rows += ["", tools_block] + return "\n".join(rows) + +def build_agent_selection_table(agents: List[AvailableAgent]) -> str: cost_order = {"FREE": 0, "CHEAP": 1, "EXPENSIVE": 2} sorted_agents = [a for a in agents if a.metadata.category != "utility"] sorted_agents.sort(key=lambda a: cost_order.get(a.metadata.cost, 99)) + rows: List[str] = ["### Available Agents:"] if sorted_agents: rows += [ "", - "**Agents** (delegate when task is complex or specialised):", - "", - "| Agent | Cost | When to Use |", - "|-------|------|-------------|", + "| Agent | Cost | When to Use | Trigger Signals |", + "|-------|------|-------------|-----------------|", ] for agent in sorted_agents: short_desc = agent.description.split(".")[0] or agent.description - rows.append(f"| `{agent.name}` | {agent.metadata.cost} | {short_desc} |") + trigger_text = ", ".join(t.trigger for t in (agent.metadata.triggers or [])[:2]) + if not trigger_text and agent.metadata.use_when: + trigger_text = ", ".join(agent.metadata.use_when[:2]) + trigger_text = trigger_text or "-" + rows.append( + f"| `{agent.name}` | {agent.metadata.cost} | {short_desc} | {trigger_text} |" + ) rows.append("") - rows.append("**Default flow**: explore/librarian (background) + tools → oracle (if required)") + rows.append("**Default flow**: direct tools first → `explore` / `librarian` for research → `oracle` if required") return "\n".join(rows) diff --git a/flocks/memory/bootstrap.py b/flocks/memory/bootstrap.py index 516e7691..68ba1457 100644 --- a/flocks/memory/bootstrap.py +++ b/flocks/memory/bootstrap.py @@ -23,7 +23,7 @@ # 
Default instructions for agent (similar to OpenClaw's AGENTS.md) # Uses global storage paths for Flocks MEMORY_INSTRUCTIONS = """ -## Memory System +## Memory System Guidance You have access to a persistent memory system for continuity across sessions. Memory is stored in a global location and accessible across all your sessions. diff --git a/flocks/session/prompt.py b/flocks/session/prompt.py index ddf15f74..4d7140e0 100644 --- a/flocks/session/prompt.py +++ b/flocks/session/prompt.py @@ -5,21 +5,35 @@ Based on Flocks' ported src/session/prompt.ts and src/session/system.ts """ -from typing import List, Optional, Dict, Any, Union +from dataclasses import dataclass +from typing import Awaitable, Callable, Dict, Any, Iterable, List, Optional, TYPE_CHECKING, Union from pydantic import BaseModel, Field +import hashlib +import json import os +import sys from pathlib import Path from datetime import datetime import platform +from . import prompt_strings from flocks.utils.log import Log log = Log.create(service="session.prompt") +if TYPE_CHECKING: + from flocks.session.features.memory import SessionMemory + # Output token maximum OUTPUT_TOKEN_MAX = int(os.getenv("FLOCKS_OUTPUT_TOKEN_MAX", "32000")) +BASH_GUIDANCE_TOOL_NAMES = frozenset({"bash"}) +MEMORY_GUIDANCE_TOOL_NAMES = frozenset({"memory_get", "memory_search", "memory_write"}) + +SystemPromptCache = Dict[str, Any] +AsyncPromptFactory = Callable[[], Awaitable[Optional[str]]] +StringPromptFactory = Callable[[], Optional[str]] # Prompt template directory (same structure as Flocks) @@ -37,6 +51,15 @@ def _load_prompt_file(filename: str) -> str: return "" +def _compose_provider_block(identity: str, guidance: str = "") -> str: + """Compose a single Block 0 prompt string.""" + identity_text = identity.strip() + guidance_text = guidance.strip() + if not guidance_text: + return identity_text + return f"{identity_text}\n\n{guidance_text}" + + # Lazy-loaded prompt templates (loaded from files like Flocks) def 
get_prompt_anthropic() -> str: return _load_prompt_file("anthropic.txt") @@ -58,54 +81,38 @@ def get_prompt_codex() -> str: return _load_prompt_file("codex_header.txt") -# Fallback prompts if files not found -PROMPT_ANTHROPIC = """You are Flocks, an AI-Native SecOps Platform. - -You specialize in cybersecurity operations including threat detection, incident response, vulnerability assessment, log analysis, detection rule creation, and security automation. - -When asked about your capabilities, respond that you are an AI-Native SecOps Platform specializing in: -- Threat Detection & Analysis (log analysis, IOC identification, threat hunting) -- Incident Response (investigation, containment, remediation) -- Vulnerability Assessment (scan analysis, prioritization, configuration reviews) -- Security Automation (SIGMA, YARA, Snort, Suricata detection rules) -- Malware & Forensics (artifact analysis, malware identification) -- Compliance & Hardening (CIS, NIST, PCI-DSS, configuration audits) - -IMPORTANT: Assist with defensive security tasks only. Refuse to create malicious tools or exploits. Support security analysis, detection rules, vulnerability explanations, defensive tools, and security automation. -""" - -PROMPT_GPT = """You are Flocks, a SecOps agent - please keep going until the user's security query is completely resolved. -Your security analysis should be thorough. You MUST iterate and keep going until the security problem is solved. - -IMPORTANT: Assist with defensive security only. Support threat detection, incident response, vulnerability assessment, and security automation. -""" - -PROMPT_GEMINI = """You are Flocks, an advanced AI SecOps agent specializing in cybersecurity operations. -Focus on threat detection, security analysis, incident response, vulnerability assessment, and defensive automation. - -IMPORTANT: Defensive security only - no malicious tools or exploits. -""" - PROMPT_DEFAULT = """You are Flocks, an AI-Native SecOps Platform. 
-When asked about your capabilities, respond that you are an AI-Native SecOps Platform specializing in: -- Threat Detection & Analysis (log analysis, IOC identification, threat hunting) -- Incident Response (investigation, containment, remediation) -- Vulnerability Assessment (scan analysis, prioritization, configuration reviews) -- Security Automation (SIGMA, YARA, Snort, Suricata detection rules) -- Malware & Forensics (artifact analysis, malware identification) -- Compliance & Hardening (CIS, NIST, PCI-DSS, configuration audits) - You specialize in cybersecurity operations including: - Threat detection and analysis (log analysis, IOC identification, behavioral detection) - Incident response (investigation, containment, remediation recommendations) - Vulnerability assessment (scan analysis, prioritization, security reviews) - Security automation (detection rules: SIGMA, YARA, Snort, Suricata) +- Malware & Forensics (artifact analysis, malware identification) - Compliance and hardening (CIS, NIST, PCI-DSS, configuration reviews) +- Other security operations tasks +Best practices for security operations: -IMPORTANT: Assist with defensive security tasks only. Refuse to create malicious tools, exploits for offensive use, or malware. Support security analysis, detection rules, vulnerability explanations, defensive tools, and security automation. -""" +Your work primarily covers threat detection and analysis, incident response, vulnerability assessment, security automation, malware and forensic analysis, and compliance or hardening reviews. + +Apply these principles consistently: +- Preserve evidence with timestamps, file paths, line numbers, and relevant context. +- Protect sensitive data in logs and outputs. +- Keep all analysis, tooling, and automation strictly defensive. +- Validate findings before declaring threats or vulnerabilities, and consider operational context to reduce false positives. + +For these tasks, follow these steps: +1. 
**Gather:** Collect relevant security data with read, grep, and glob. +2. **Analyze:** Look for indicators, patterns, and anomalies. +3. **Correlate:** Link related events and build an attack narrative. +4. **Document:** Record evidence, severity, and supporting context. +5. **Recommend:** Provide actionable remediation or response steps. +6. **Verify:** Validate findings and test detection logic when applicable. +IMPORTANT: Refuse to write code that may be used maliciously; even if the user claims it is for educational purposes. When working on files, if they seem related to improving, explaining, or interacting with malware or any malicious code you MUST refuse. +IMPORTANT: Before you begin work, think about what the task you're working on is supposed to do. If it seems malicious, refuse to work on it or answer questions about it, even if the request does not seem malicious. +IMPORTANT: You must NEVER generate or guess URLs for the user unless they are relevant to SecOps tasks. You may use URLs provided by the user in their messages or local files. +""" class PromptTemplate(BaseModel): """Prompt template""" @@ -127,6 +134,17 @@ class ContextInfo(BaseModel): custom: Dict[str, Any] = Field(default_factory=dict) +@dataclass(frozen=True) +class SystemPromptBlock: + """Internal system prompt layer with cache metadata.""" + + name: str + content: str + cache_scope: str + digest_inputs: Dict[str, Any] + cache_key: str + + class SystemPrompt: """ System Prompt generation namespace @@ -155,11 +173,13 @@ def header(cls, provider_id: str) -> List[str]: return [] @classmethod - def provider(cls, model_id: str) -> List[str]: + def provider(cls, model_id: Optional[str]) -> List[str]: """ - Get provider-specific base prompt based on model - - Loads from template files (same as Flocks). + Get Block 0: stable identity + model-specific guidance. + + ``PROMPT_DEFAULT`` is the canonical Flocks identity block. 
Model + templates contribute only supplemental guidance, and should not + redefine the agent identity themselves. Args: model_id: Model identifier @@ -167,31 +187,36 @@ def provider(cls, model_id: str) -> List[str]: Returns: List of prompt strings """ - model_lower = model_id.lower() - + model_lower = (model_id or "").lower() + # GPT-5: use codex_header.txt if "gpt-5" in model_lower: prompt = get_prompt_codex() - return [prompt] if prompt else [PROMPT_GPT] - + guidance = prompt if prompt else "" + return [_compose_provider_block(PROMPT_DEFAULT, guidance)] + # GPT/o1/o3: use beast.txt if "gpt-" in model_lower or "o1" in model_lower or "o3" in model_lower: prompt = get_prompt_beast() - return [prompt] if prompt else [PROMPT_GPT] - + guidance = prompt if prompt else "" + return [_compose_provider_block(PROMPT_DEFAULT, guidance)] + # Gemini: use gemini.txt if "gemini" in model_lower: prompt = get_prompt_gemini() - return [prompt] if prompt else [PROMPT_GEMINI] - + guidance = prompt if prompt else "" + return [_compose_provider_block(PROMPT_DEFAULT, guidance)] + # Claude: use anthropic.txt if "claude" in model_lower: prompt = get_prompt_anthropic() - return [prompt] if prompt else [PROMPT_ANTHROPIC] - + guidance = prompt if prompt else "" + return [_compose_provider_block(PROMPT_DEFAULT, guidance)] + # Other models: use qwen.txt prompt = get_prompt_qwen() - return [prompt] if prompt else [PROMPT_DEFAULT] + guidance = prompt if prompt else "" + return [_compose_provider_block(PROMPT_DEFAULT, guidance)] @classmethod async def environment( @@ -209,26 +234,110 @@ async def environment( Returns: List of environment info strings """ + stable = cls.environment_stable(directory=directory, vcs=vcs) + runtime = cls.runtime_metadata(directory=directory, vcs=vcs) + return stable + runtime + + @classmethod + def environment_stable( + cls, + directory: Optional[str] = None, + vcs: Optional[str] = None, + ) -> List[str]: + """Build stable workspace metadata that should stay 
cache-friendly.""" working_dir = directory or os.getcwd() is_git = vcs == "git" - - from flocks.workspace.manager import WorkspaceManager - ws = WorkspaceManager.get_instance() - today = datetime.now().strftime("%Y-%m-%d") - outputs_dir = str(ws.get_workspace_dir() / "outputs" / today) - env_info = [ "Here is some useful information about the environment you are running in:", "", - f" Workspace outputs directory: {outputs_dir}", f" Source code directory: {working_dir}", f" Is directory a git repo: {'yes' if is_git else 'no'}", f" Platform: {platform.system().lower()}", - f" Today's date: {datetime.now().strftime('%A %b %d, %Y')}", "", ] - return ["\n".join(env_info)] + + @classmethod + def runtime_metadata( + cls, + directory: Optional[str] = None, + vcs: Optional[str] = None, + session_id: Optional[str] = None, + ) -> List[str]: + """Build dynamic runtime metadata that should stay near the prompt tail.""" + del directory, vcs # Reserved for future runtime metadata. + + from flocks.workspace.manager import WorkspaceManager + + ws = WorkspaceManager.get_instance() + now = datetime.now() + outputs_dir = str(ws.get_workspace_dir() / "outputs" / now.strftime("%Y-%m-%d")) + + lines = [ + "## Runtime Metadata", + f"Workspace outputs directory: {outputs_dir}", + f"Today's date: {now.strftime('%A %b %d, %Y')}", + f"Platform hint: {platform.system().lower()}", + ] + if session_id: + lines.append(f"Session ID: {session_id}") + return ["\n".join(lines)] + + @classmethod + def resolve_custom_instruction_paths( + cls, + directory: Optional[str] = None, + worktree: Optional[str] = None, + config_instructions: Optional[List[str]] = None, + ) -> List[str]: + """Resolve custom instruction file paths without reading them.""" + resolved_paths: List[str] = [] + seen_paths: set[str] = set() + + search_dir = directory or os.getcwd() + root_dir = worktree or search_dir + + for rule_file in cls.LOCAL_RULE_FILES: + path = cls._find_file_up(rule_file, search_dir, root_dir) + if path and 
path not in seen_paths: + seen_paths.add(path) + resolved_paths.append(path) + break + + if config_instructions: + for instruction_path in config_instructions: + if instruction_path.startswith(("http://", "https://")): + continue + if instruction_path.startswith("~/"): + instruction_path = os.path.expanduser(instruction_path) + if not os.path.isabs(instruction_path): + instruction_path = os.path.join(search_dir, instruction_path) + if instruction_path not in seen_paths and os.path.exists(instruction_path): + seen_paths.add(instruction_path) + resolved_paths.append(instruction_path) + + return resolved_paths + + @classmethod + def custom_signature( + cls, + directory: Optional[str] = None, + worktree: Optional[str] = None, + config_instructions: Optional[List[str]] = None, + ) -> List[tuple[str, int, int]]: + """Return lightweight signatures for custom instruction files.""" + signatures: List[tuple[str, int, int]] = [] + for path in cls.resolve_custom_instruction_paths( + directory=directory, + worktree=worktree, + config_instructions=config_instructions, + ): + try: + stat = Path(path).stat() + signatures.append((path, stat.st_mtime_ns, stat.st_size)) + except OSError: + continue + return signatures @classmethod async def custom( @@ -251,46 +360,17 @@ async def custom( List of custom instruction strings """ results = [] - found_paths = set() - - search_dir = directory or os.getcwd() - root_dir = worktree or search_dir - - # Search for local rule files - for rule_file in cls.LOCAL_RULE_FILES: - path = cls._find_file_up(rule_file, search_dir, root_dir) - if path and path not in found_paths: - found_paths.add(path) - try: - content = Path(path).read_text(encoding="utf-8") - results.append(f"Instructions from: {path}\n{content}") - except Exception as e: - log.warn("custom.read_error", {"path": path, "error": str(e)}) - break # Only load first found local rule file - - # Load additional instruction files from config - if config_instructions: - for instruction_path in 
config_instructions: - # Handle URL instructions (skip for now) - if instruction_path.startswith(("http://", "https://")): - continue - - # Expand ~ to home directory - if instruction_path.startswith("~/"): - instruction_path = os.path.expanduser(instruction_path) - - # Resolve path - if not os.path.isabs(instruction_path): - instruction_path = os.path.join(search_dir, instruction_path) - - if instruction_path not in found_paths and os.path.exists(instruction_path): - found_paths.add(instruction_path) - try: - content = Path(instruction_path).read_text(encoding="utf-8") - results.append(f"Instructions from: {instruction_path}\n{content}") - except Exception as e: - log.warn("custom.read_error", {"path": instruction_path, "error": str(e)}) - + for instruction_path in cls.resolve_custom_instruction_paths( + directory=directory, + worktree=worktree, + config_instructions=config_instructions, + ): + try: + content = Path(instruction_path).read_text(encoding="utf-8") + results.append(f"Instructions from: {instruction_path}\n{content}") + except Exception as e: + log.warn("custom.read_error", {"path": instruction_path, "error": str(e)}) + return results @staticmethod @@ -570,86 +650,464 @@ async def build_memory_context( except Exception as e: log.warn("prompt.memory.failed", {"error": str(e)}) return None - + @classmethod - async def build_system_prompt( + def _system_prompt_cache_digest(cls, payload: Dict[str, Any]) -> str: + """Create a stable digest for prompt inputs that can be large.""" + encoded = json.dumps(payload, sort_keys=True, default=str).encode("utf-8") + return hashlib.sha256(encoded).hexdigest()[:16] + + @classmethod + def _layer_cache_key( cls, - agent_name: str = "assistant", - model_id: Optional[str] = None, - provider_id: Optional[str] = None, - context: Optional[ContextInfo] = None, - custom_instructions: Optional[str] = None, - include_environment: bool = True, - include_custom: bool = True, - include_memory: bool = True, - session_memory: 
Optional["SessionMemory"] = None, - user_message: Optional[str] = None, + *, + name: str, + digest_inputs: Dict[str, Any], ) -> str: + """Build a layer cache key for one prompt block.""" + return f"system_prompt_block:{name}:{cls._system_prompt_cache_digest(digest_inputs)}" + + @classmethod + def _system_prompt_cache_key( + cls, + *, + session_id: str, + agent_name: str, + provider_id: str, + model_id: str, + block_keys: Iterable[str], + ) -> str: + """Build the cache key for the composed system prompt snapshot.""" + cache_digest = cls._system_prompt_cache_digest({ + "block_keys": tuple(block_keys), + }) + return f"system_prompts:{session_id}:{agent_name}:{provider_id}:{model_id}:{cache_digest}" + + @classmethod + def _read_system_prompt_cache( + cls, + static_cache: Optional[SystemPromptCache], + cache_key: Optional[str], + ) -> Optional[List[str]]: + """Return a defensive copy of cached prompt blocks when available.""" + if static_cache is None or cache_key is None: + return None + + cached = static_cache.get(cache_key) + if cached is None: + return None + return list(cached) + + @classmethod + def _write_system_prompt_cache( + cls, + static_cache: Optional[SystemPromptCache], + cache_key: Optional[str], + prompts: List[str], + ) -> None: + """Store a defensive copy of prompt blocks in the session cache.""" + if static_cache is None or cache_key is None: + return + static_cache[cache_key] = list(prompts) + + @classmethod + def _read_cached_prompt_block( + cls, + static_cache: Optional[SystemPromptCache], + cache_key: str, + ) -> Optional[str]: + """Return a cached prompt block when available.""" + if static_cache is None: + return None + cached = static_cache.get(cache_key) + if not isinstance(cached, str): + return None + return cached + + @classmethod + def _write_cached_prompt_block( + cls, + static_cache: Optional[SystemPromptCache], + cache_key: str, + content: str, + ) -> None: + """Store a single prompt block in the shared cache.""" + if static_cache is 
None: + return + static_cache[cache_key] = content + + @classmethod + def _normalize_prompt_text(cls, content: Optional[str]) -> str: + """Trim prompt text and normalize empty values.""" + return (content or "").strip() + + @classmethod + def _join_prompt_parts(cls, parts: Iterable[str]) -> str: + """Join prompt fragments while discarding empty values.""" + return "\n\n".join( + part.strip() + for part in parts + if isinstance(part, str) and part.strip() + ) + + @classmethod + def _build_cached_prompt_block( + cls, + *, + static_cache: Optional[SystemPromptCache], + name: str, + cache_scope: str, + digest_inputs: Dict[str, Any], + builder: Callable[[], str], + ) -> Optional[SystemPromptBlock]: + """Build or reuse a cached sync prompt block.""" + cache_key = cls._layer_cache_key(name=name, digest_inputs=digest_inputs) + content = cls._read_cached_prompt_block(static_cache, cache_key) + if content is None: + content = cls._normalize_prompt_text(builder()) + cls._write_cached_prompt_block(static_cache, cache_key, content) + if not content: + return None + return SystemPromptBlock( + name=name, + content=content, + cache_scope=cache_scope, + digest_inputs=digest_inputs, + cache_key=cache_key, + ) + + @classmethod + async def _build_cached_async_prompt_block( + cls, + *, + static_cache: Optional[SystemPromptCache], + name: str, + cache_scope: str, + digest_inputs: Dict[str, Any], + builder: AsyncPromptFactory, + ) -> Optional[SystemPromptBlock]: + """Build or reuse a cached async prompt block.""" + cache_key = cls._layer_cache_key(name=name, digest_inputs=digest_inputs) + content = cls._read_cached_prompt_block(static_cache, cache_key) + if content is None: + content = cls._normalize_prompt_text(await builder()) + cls._write_cached_prompt_block(static_cache, cache_key, content) + if not content: + return None + return SystemPromptBlock( + name=name, + content=content, + cache_scope=cache_scope, + digest_inputs=digest_inputs, + cache_key=cache_key, + ) + + @classmethod + 
def _build_tool_guidance_prompt( + cls, + use_text_tool_call_mode: bool = False, + ) -> str: + """Build stable protocol guidance for tool use.""" + return ( + prompt_strings._build_minimax_tool_instructions() + if use_text_tool_call_mode + else prompt_strings._build_tool_instructions() + ) + + @classmethod + def _build_bash_guidance_prompt( + cls, + prompt_tool_names: Iterable[str], + ) -> Optional[str]: + """Build bash-specific guidance only when the tool is callable.""" + if not (set(prompt_tool_names) & BASH_GUIDANCE_TOOL_NAMES): + return None + return prompt_strings._build_bash_tool_guidance() + + @classmethod + def _build_memory_guidance_prompt( + cls, + prompt_tool_names: Iterable[str], + memory_bootstrap_data: Optional[Dict[str, Any]], + ) -> Optional[str]: + """Build memory tool guidance separately from the frozen memory snapshot.""" + if not memory_bootstrap_data: + return None + if not (set(prompt_tool_names) & MEMORY_GUIDANCE_TOOL_NAMES): + return None + instructions = memory_bootstrap_data.get("instructions", "") + return cls._normalize_prompt_text(instructions) + + @classmethod + def _build_memory_bootstrap_prompts( + cls, + *, + session_id: str, + memory_bootstrap_data: Optional[Dict[str, Any]], + ) -> List[str]: + """Build memory snapshot prompt blocks from bootstrap data.""" + if not memory_bootstrap_data: + return [] + + prompts: List[str] = [] + main_memory = memory_bootstrap_data.get("main_memory") + if main_memory and main_memory.get("inject"): + memory_content = main_memory.get("content", "") + if memory_content: + prompts.append(f"## {main_memory['path']}\n\n{memory_content}") + + log.debug("prompt.memory_injected", { + "session_id": session_id, + "has_main": main_memory is not None, + }) + return prompts + + @classmethod + def _prompt_blocks_to_list( + cls, + blocks: Iterable[Optional[SystemPromptBlock]], + ) -> List[str]: + """Convert prompt blocks back to the external List[str] API.""" + return [ + block.content + for block in blocks + if 
block is not None and block.content.strip() + ] + + @classmethod + async def _build_optional_async_prompt( + cls, + prompt_factory: Optional[AsyncPromptFactory], + ) -> Optional[str]: + """Run an optional async prompt factory.""" + if not prompt_factory: + return None + return await prompt_factory() + + @classmethod + def _build_optional_prompt( + cls, + prompt_factory: Optional[StringPromptFactory], + ) -> Optional[str]: + """Run an optional synchronous prompt factory.""" + if not prompt_factory: + return None + return prompt_factory() + + @classmethod + def _print_system_prompts_for_debug( + cls, + *, + session_id: str, + agent_name: str, + provider_id: str, + model_id: str, + prompts: List[str], + ) -> None: + """Print prompt blocks when FLOCKS_PRINT_SYSTEM_PROMPT is enabled.""" + if os.getenv("FLOCKS_PRINT_SYSTEM_PROMPT", "").lower() not in ("1", "true", "yes"): + return + + header = ( + f"\n=== system_prompt session={session_id} " + f"agent={agent_name} model={provider_id}/{model_id} ===" + ) + print(header, file=sys.stderr) + for idx, prompt in enumerate(prompts): + print(f"\n--- prompt[{idx}] ---\n{prompt}\n", file=sys.stderr) + print("=== end system_prompt ===\n", file=sys.stderr) + + @classmethod + async def build_system_prompts( + cls, + *, + session_id: str, + session_directory: Optional[str], + agent_name: str, + agent_prompt: Optional[str], + provider_id: str, + model_id: str, + prompt_tool_names: Iterable[str] = (), + tool_revision: Optional[int] = None, + memory_bootstrap_data: Optional[Dict[str, Any]] = None, + static_cache: Optional[SystemPromptCache] = None, + sandbox_prompt_factory: Optional[AsyncPromptFactory] = None, + channel_context_prompt_factory: Optional[AsyncPromptFactory] = None, + tool_catalog_prompt_factory: Optional[StringPromptFactory] = None, + use_text_tool_call_mode: bool = False, + ) -> List[str]: + """Build the runtime system prompt blocks for a session turn. 
+ + The ordering mirrors Hermes' layered assembly style: stable identity + first, then session-scoped snapshots and runtime context, then tool + protocol/catalog guidance. Cache mechanics are intentionally kept out + of the block construction below so this method reads as an ordered list + of prompt layers. """ - Build complete system prompt with all components - - Args: - agent_name: Agent name - model_id: Model identifier for provider-specific prompts - provider_id: Provider identifier - context: Context information - custom_instructions: Additional custom instructions - include_environment: Whether to include environment info - include_custom: Whether to include custom instruction files - include_memory: Whether to include memory context (NEW) - session_memory: SessionMemory instance (NEW) - user_message: Current user message for memory search (NEW) - - Returns: - Complete system prompt - """ - parts = [] - - # Provider-specific base prompt - if model_id: - parts.extend(SystemPrompt.provider(model_id)) - else: - parts.append(f"You are {agent_name}, an AI assistant for software development.") - - # Header (provider-specific) - if provider_id: - parts.extend(SystemPrompt.header(provider_id)) - - # Environment information - if include_environment: - directory = context.project_path if context else None - vcs = context.vcs if context else None - env_parts = await SystemPrompt.environment(directory=directory, vcs=vcs) - parts.extend(env_parts) - - # Context injection - if context: - context_parts = cls._build_context_section(context) - if context_parts: - parts.append(context_parts) - - # Memory context (NEW) - if include_memory and session_memory and user_message: - memory_context = await cls.build_memory_context( - session_memory=session_memory, - user_message=user_message, - max_results=3, + normalized_tool_names = tuple(sorted(prompt_tool_names)) + vcs = "git" if session_directory else None + runtime_day = datetime.now().strftime("%Y-%m-%d") + custom_signature = 
SystemPrompt.custom_signature(directory=session_directory) + memory_guidance = cls._build_memory_guidance_prompt( + normalized_tool_names, + memory_bootstrap_data, + ) + memory_snapshot = cls._join_prompt_parts(cls._build_memory_bootstrap_prompts( + session_id=session_id, + memory_bootstrap_data=memory_bootstrap_data, + )) + + async def build_custom_context() -> Optional[str]: + return cls._join_prompt_parts( + await SystemPrompt.custom(directory=session_directory), ) - if memory_context: - parts.append(memory_context) - - # Custom instructions from files - if include_custom: - directory = context.project_path if context else None - custom_parts = await SystemPrompt.custom(directory=directory) - parts.extend(custom_parts) - - # Additional custom instructions - if custom_instructions: - parts.append(f"\n## Additional Instructions\n{custom_instructions}") - - return "\n\n".join(parts) - + + blocks: List[Optional[SystemPromptBlock]] = [ + cls._build_cached_prompt_block( + static_cache=static_cache, + name="provider_identity", + cache_scope="global", + digest_inputs={"model_id": model_id}, + builder=lambda: cls._join_prompt_parts(SystemPrompt.provider(model_id)), + ), + cls._build_cached_prompt_block( + static_cache=static_cache, + name="agent_identity", + cache_scope="agent", + digest_inputs={"agent_name": agent_name, "agent_prompt": agent_prompt or ""}, + builder=lambda: cls._normalize_prompt_text(agent_prompt), + ), + cls._build_cached_prompt_block( + static_cache=static_cache, + name="tool_protocol", + cache_scope="provider", + digest_inputs={"use_text_tool_call_mode": use_text_tool_call_mode}, + builder=lambda: cls._build_tool_guidance_prompt( + use_text_tool_call_mode=use_text_tool_call_mode, + ), + ), + cls._build_cached_prompt_block( + static_cache=static_cache, + name="bash_guidance", + cache_scope="toolset", + digest_inputs={ + "tool_names": normalized_tool_names, + "platform": platform.system().lower(), + }, + builder=lambda: 
cls._build_bash_guidance_prompt(normalized_tool_names) or "", + ), + cls._build_cached_prompt_block( + static_cache=static_cache, + name="memory_guidance", + cache_scope="session", + digest_inputs={ + "tool_names": normalized_tool_names, + "instructions": memory_guidance or "", + }, + builder=lambda: memory_guidance or "", + ), + cls._build_cached_prompt_block( + static_cache=static_cache, + name="memory_snapshot", + cache_scope="session", + digest_inputs={"session_id": session_id, "snapshot": memory_snapshot}, + builder=lambda: memory_snapshot, + ), + cls._build_cached_prompt_block( + static_cache=static_cache, + name="environment_stable", + cache_scope="workspace", + digest_inputs={ + "directory": session_directory, + "vcs": vcs, + "platform": platform.system().lower(), + }, + builder=lambda: cls._join_prompt_parts( + SystemPrompt.environment_stable(directory=session_directory, vcs=vcs), + ), + ), + ] + + custom_block = await cls._build_cached_async_prompt_block( + static_cache=static_cache, + name="context_files", + cache_scope="workspace", + digest_inputs={"directory": session_directory, "signature": custom_signature}, + builder=build_custom_context, + ) + blocks.append(custom_block) + + tool_catalog_block = cls._build_cached_prompt_block( + static_cache=static_cache, + name="tool_catalog_awareness", + cache_scope="catalog", + digest_inputs={ + "agent_name": agent_name, + "tool_revision": tool_revision, + }, + builder=lambda: cls._build_optional_prompt(tool_catalog_prompt_factory) or "", + ) + blocks.append(tool_catalog_block) + + if sandbox_prompt_factory: + blocks.append(await cls._build_cached_async_prompt_block( + static_cache=static_cache, + name="sandbox_context", + cache_scope="runtime", + digest_inputs={"session_id": session_id, "agent_name": agent_name}, + builder=sandbox_prompt_factory, + )) + + if channel_context_prompt_factory: + blocks.append(await cls._build_cached_async_prompt_block( + static_cache=static_cache, + name="channel_context", + 
cache_scope="runtime", + digest_inputs={"session_id": session_id}, + builder=channel_context_prompt_factory, + )) + + blocks.append(cls._build_cached_prompt_block( + static_cache=static_cache, + name="runtime_metadata", + cache_scope="runtime", + digest_inputs={ + "session_id": session_id, + "directory": session_directory, + "runtime_day": runtime_day, + }, + builder=lambda: cls._join_prompt_parts( + SystemPrompt.runtime_metadata( + directory=session_directory, + vcs=vcs, + session_id=session_id, + ), + ), + )) + + cache_key = cls._system_prompt_cache_key( + session_id=session_id, + agent_name=agent_name, + provider_id=provider_id, + model_id=model_id, + block_keys=[block.cache_key for block in blocks if block is not None], + ) + cached_prompts = cls._read_system_prompt_cache(static_cache, cache_key) + if cached_prompts is not None: + return cached_prompts + + prompts = cls._prompt_blocks_to_list(blocks) + cls._print_system_prompts_for_debug( + session_id=session_id, + agent_name=agent_name, + provider_id=provider_id, + model_id=model_id, + prompts=prompts, + ) + + cls._write_system_prompt_cache(static_cache, cache_key, prompts) + return list(prompts) + @classmethod def _build_context_section(cls, context: ContextInfo) -> str: """Build context section for prompt""" diff --git a/flocks/session/prompt/anthropic-20250930.txt b/flocks/session/prompt/anthropic-20250930.txt index fd442d2b..b986692e 100644 --- a/flocks/session/prompt/anthropic-20250930.txt +++ b/flocks/session/prompt/anthropic-20250930.txt @@ -1,14 +1,7 @@ -You are Flocks, an advanced AI SecOps agent that helps users with their SecOps tasks. Use the instructions below and the tools available to you to assist the user. - -IMPORTANT: Assist with defensive security tasks only. Refuse to create, modify, or improve code that may be used maliciously. Do not assist with credential discovery or harvesting, including bulk crawling for SSH keys, browser cookies, or cryptocurrency wallets. 
Allow security analysis, detection rules, vulnerability explanations, defensive tools, and security documentation. -IMPORTANT: You must NEVER generate or guess URLs for the user unless they are relevant to SecOps tasks. You may use URLs provided by the user in their messages or local files. - If the user asks for help or wants to give feedback inform them of the following: - /help: Get help with using Flocks SecOps - To give feedback, users should report the issue on the project repository -When the user directly asks about Flocks capabilities (eg. "can Flocks do...", "does Flocks support..."), or asks in second person (eg. "are you able...", "can you do..."), provide information about SecOps capabilities including threat detection, incident response, vulnerability assessment, security automation, and compliance checking. - # Tone and style You should be concise, direct, and to the point, while providing complete information and matching the level of detail you provide in your response with the level of complexity of the user's query or the work you have completed. A concise response is generally less than 4 lines, not including tool calls or code generated. You should provide more detail when the task is complex or when the user asks you to. @@ -128,54 +121,6 @@ I've found existing rules. Let me mark the first todo as in_progress and start d Users may configure 'hooks', shell commands that execute in response to events like tool calls, in settings. Treat feedback from hooks, including , as coming from the user. If you get blocked by a hook, determine if you can adjust your actions in response to the blocked message. If not, ask the user to check their hooks configuration. -# SecOps Tasks -The user will primarily request you perform Security Operations tasks. 
These include: - -**Threat Detection & Hunting:** -- Analyze logs (web, auth, network, system) for suspicious patterns -- Identify indicators of compromise (IOCs): IPs, domains, file hashes, URLs -- Hunt for lateral movement, privilege escalation, data exfiltration -- Correlate events across multiple data sources - -**Incident Response & Investigation:** -- Triage security alerts and determine severity -- Investigate security incidents and reconstruct attack timelines -- Identify compromised systems, accounts, and data -- Provide containment and remediation recommendations -- Document findings with evidence and IOCs - -**Vulnerability Assessment:** -- Analyze vulnerability scan results (Nessus, OpenVAS, Qualys) -- Prioritize vulnerabilities based on CVSS, exploitability, and business impact -- Review security configurations and identify misconfigurations -- Assess patch compliance and security baselines - -**Security Automation:** -- Create detection rules (SIGMA, YARA, Snort, Suricata, Splunk, ELK) -- Develop security scripts for log parsing, IOC extraction, automation -- Build incident response playbooks -- Parse and enrich threat intelligence feeds - -**Malware & Forensics:** -- Analyze suspicious files and executables -- Extract IOCs from malware samples -- Review forensic artifacts (registry, file system, memory) -- Identify malware families and TTPs - -**Compliance & Hardening:** -- Security configuration reviews (CIS benchmarks, STIG) -- Compliance checking (PCI-DSS, HIPAA, NIST, SOC2, ISO 27001) -- Security baseline validation -- Audit log analysis - -For these tasks, follow these steps: -1. **Understand** the security context and threat -2. **Gather** data using tools (Read, Grep, Bash) -3. **Analyze** for security indicators and patterns -4. **Document** findings with severity and evidence -5. **Recommend** response actions -6. Use TodoWrite to plan complex investigations - - Tool results and user messages may include tags. 
tags contain useful information and reminders. They are automatically added by the system, and bear no direct relation to the specific tool results or user messages in which they appear. diff --git a/flocks/session/prompt/anthropic.txt b/flocks/session/prompt/anthropic.txt index dcfcc945..e859a055 100644 --- a/flocks/session/prompt/anthropic.txt +++ b/flocks/session/prompt/anthropic.txt @@ -1,51 +1,3 @@ -You are Flocks, an AI-Native SecOps (Security Operations) Platform. - -CRITICAL: When the user asks "what can you do?" or about your capabilities, you MUST respond EXACTLY that you are an AI-Native SecOps Platform, NOT a general coding assistant. Focus your response on security operations capabilities, NOT general software development. - -You are an AI-Native SecOps Platform specializing in: - -**🔍 Threat Detection & Analysis** -- Analyze logs for suspicious patterns (auth, web, network, system) -- Identify indicators of compromise (IOCs): IPs, domains, file hashes, URLs -- Correlate security events across multiple data sources -- Hunt for threats using behavioral analysis and MITRE ATT&CK techniques - -**🚨 Incident Response** -- Triage security alerts and investigate incidents -- Reconstruct attack timelines and identify affected systems -- Extract forensic artifacts and evidence -- Provide containment and remediation recommendations - -**🛡️ Vulnerability Assessment** -- Analyze vulnerability scan results (Nessus, OpenVAS, Qualys) -- Prioritize vulnerabilities by CVSS score and business impact -- Review security configurations and identify misconfigurations -- Validate and verify security findings - -**⚙️ Security Automation** -- Create detection rules (SIGMA, YARA, Snort, Suricata, Splunk, ELK) -- Develop security scripts for log parsing and IOC extraction -- Build incident response playbooks -- Parse and enrich threat intelligence feeds - -**🔬 Malware & Forensics** -- Analyze suspicious files and extract indicators -- Review forensic artifacts (registry, 
filesystem, memory) -- Identify malware families and associated TTPs - -**📋 Compliance & Hardening** -- Security configuration reviews (CIS, STIG, NIST) -- Compliance checking (PCI-DSS, HIPAA, SOC2, ISO 27001) -- Security baseline validation and audit - -DO NOT describe yourself as a "versatile AI assistant" or "development environment assistant". You are specifically an AI-Native SecOps Platform focused on cybersecurity operations. - -You are an interactive tool specialized in cybersecurity operations, helping users with threat detection, security analysis, incident response, vulnerability assessment, and security automation. Use the instructions below and the tools available to you to assist the user. - -IMPORTANT: Assist with defensive security tasks only. Refuse to create, modify, or improve code that may be used maliciously. Do not assist with credential harvesting, exploit development for malicious purposes, or creation of malware. Support security analysis, detection rules, vulnerability explanations, defensive tools, security automation, and compliance. - -IMPORTANT: You must NEVER generate or guess URLs for the user unless they are relevant to SecOps tasks. You may use URLs provided by the user in their messages or local files. - If the user asks for help or wants to give feedback inform them of the following: - ctrl+p to list available actions - To give feedback, users should report issues on the project repository @@ -113,47 +65,6 @@ I've found existing SIGMA rules. Let me mark the first todo as in_progress and s -# SecOps Tasks -The user will primarily request you perform SecOps tasks. 
These include: - -**Threat Detection & Analysis:** -- Analyzing logs for suspicious patterns (authentication, network, system) -- Identifying indicators of compromise (IOCs) -- Correlating security events across multiple sources -- Behavioral analysis of users, processes, or network traffic - -**Incident Response:** -- Triaging security alerts and incidents -- Investigating security events and attack chains -- Identifying affected systems and data -- Providing remediation recommendations - -**Vulnerability Assessment:** -- Analyzing vulnerability scan results -- Prioritizing vulnerabilities based on risk -- Reviewing security configurations -- Identifying security misconfigurations - -**Security Automation:** -- Creating detection rules (SIGMA, YARA, Snort, Suricata) -- Developing security scripts for automation -- Building response playbooks -- Parsing and enriching security data - -**Compliance & Hardening:** -- Security configuration reviews -- Compliance checking (CIS, NIST, PCI-DSS) -- Security baseline validation -- Audit log analysis - -For these tasks, follow these steps: -1. **Understand** the security context and threat landscape -2. **Gather** relevant logs, configs, or data using available tools -3. **Analyze** for security indicators, patterns, or vulnerabilities -4. **Document** findings with severity, impact, and evidence -5. **Recommend** remediation or response actions -6. Use the TodoWrite tool to plan complex investigations - - Tool results and user messages may include tags. tags contain useful information and reminders. They are automatically added by the system, and bear no direct relation to the specific tool results or user messages in which they appear. 
diff --git a/flocks/session/prompt/beast.txt b/flocks/session/prompt/beast.txt index af88982f..41423a2e 100644 --- a/flocks/session/prompt/beast.txt +++ b/flocks/session/prompt/beast.txt @@ -1,5 +1,3 @@ -You are opencode, an agent - please keep going until the user’s query is completely resolved, before ending your turn and yielding back to the user. - Your thinking should be thorough and so it's fine if it's very long. However, avoid unnecessary repetition and verbosity. You should be concise, but thorough. You MUST iterate and keep going until the problem is solved. @@ -29,71 +27,6 @@ You MUST keep working until the security problem is completely solved, and all i You are a highly capable and autonomous security agent, and you can definitely solve this security problem without needing to ask the user for further input. -# SecOps Workflow -1. Fetch any threat intelligence URLs provided by the user using the `webfetch` tool (IOC feeds, CVE details, security advisories, MITRE ATT&CK). -2. Understand the security problem deeply. Carefully read the security request and think critically about what is required. Use sequential thinking to break down the security problem into manageable parts. Consider: - - What is the security concern or incident? - - What are potential attack vectors and threat actors? - - What security indicators should you look for? - - How does this fit into the larger security context? - - What are the dependencies and interactions with other security controls? -3. Gather security data. Read relevant logs (auth, system, network, application), configs, scan results, or forensic artifacts using available tools. -4. Research the threat on the internet by reading threat intelligence, vulnerability databases (CVE, NVD), security blogs, and MITRE ATT&CK techniques. -5. Develop a clear, step-by-step investigation plan. Break down the analysis into manageable, incremental steps. 
Display those steps in a simple todo list using emoji's to indicate the status of each item. -6. Analyze incrementally. Look for security indicators, patterns, and anomalies step by step. -7. Correlate findings. Link related security events across multiple data sources to build attack narratives. -8. Validate thoroughly. Verify findings to avoid false positives and consider alternative benign explanations. -9. Iterate until the security issue is fully understood and documented with evidence. -10. Provide actionable recommendations. Include remediation steps, detection rules, IOCs, and response actions with complete context. - -Refer to the detailed sections below for more information on each step. - -## 1. Fetch Provided URLs -- If the user provides a URL, use the `webfetch` tool to retrieve the content of the provided URL. -- After fetching, review the content returned by the webfetch tool. -- If you find any additional URLs or links that are relevant, use the `webfetch` tool again to retrieve those links. -- Recursively gather all relevant information by fetching additional links until you have all the information you need. - -## 2. Deeply Understand the Problem -Carefully read the issue and think hard about a plan to solve it before coding. - -## 3. Codebase Investigation -- Explore relevant files and directories. -- Search for key functions, classes, or variables related to the issue. -- Read and understand relevant code snippets. -- Identify the root cause of the problem. -- Validate and update your understanding continuously as you gather more context. - -## 4. Internet Research -- Use the `webfetch` tool to search google by fetching the URL `https://www.google.com/search?q=your+search+query`. -- After fetching, review the content returned by the fetch tool. -- You MUST fetch the contents of the most relevant links to gather information. Do not rely on the summary that you find in the search results. 
-- As you fetch each link, read the content thoroughly and fetch any additional links that you find within the content that are relevant to the problem. -- Recursively gather all relevant information by fetching links until you have all the information you need. - -## 5. Develop a Detailed Plan -- Outline a specific, simple, and verifiable sequence of steps to fix the problem. -- Create a todo list in markdown format to track your progress. -- Each time you complete a step, check it off using `[x]` syntax. -- Each time you check off a step, display the updated todo list to the user. -- Make sure that you ACTUALLY continue on to the next step after checkin off a step instead of ending your turn and asking the user what they want to do next. - -## 6. Making Code Changes -- Before editing, always read the relevant file contents or section to ensure complete context. -- Always read 2000 lines of code at a time to ensure you have enough context. -- If a patch is not applied correctly, attempt to reapply it. -- Make small, testable, incremental changes that logically follow from your investigation and plan. -- Whenever you detect that a project requires an environment variable (such as an API key or secret), always check if a .env file exists in the project root. If it does not exist, automatically create a .env file with a placeholder for the required variable(s) and inform the user. Do this proactively, without waiting for the user to request it. - -## 7. 
Debugging -- Make code changes only if you have high confidence they can solve the problem -- When debugging, try to determine the root cause rather than addressing symptoms -- Debug for as long as needed to identify the root cause and identify a fix -- Use print statements, logs, or temporary code to inspect program state, including descriptive statements or error messages to understand what's happening -- To test hypotheses, you can also add test statements or functions -- Revisit your assumptions if unexpected behavior occurs. - - # Communication Guidelines Always communicate clearly and concisely in a casual, friendly yet professional tone. diff --git a/flocks/session/prompt/codex_header.txt b/flocks/session/prompt/codex_header.txt index fa1ad0b4..724643d8 100644 --- a/flocks/session/prompt/codex_header.txt +++ b/flocks/session/prompt/codex_header.txt @@ -1,17 +1,3 @@ -You are Flocks, an AI-Native SecOps Platform. - -You are an interactive CLI tool that helps users with cybersecurity operations including threat detection, incident response, vulnerability assessment, log analysis, detection rule creation, and security automation. Use the instructions below and the tools available to you to assist the user. - -IMPORTANT: Assist with defensive security tasks only. Refuse to create malicious tools, exploits for offensive use, or malware. Support threat detection, security analysis, incident investigation, detection rules, vulnerability assessment, and defensive automation. 
- -When asked about your capabilities ("what can you do?"), respond that you are an AI-Native SecOps Platform specializing in: -- Threat Detection & Analysis (log analysis, IOC identification, threat hunting) -- Incident Response (investigation, containment, remediation) -- Vulnerability Assessment (scan analysis, prioritization, configuration reviews) -- Security Automation (SIGMA, YARA, Snort, Suricata detection rules) -- Malware & Forensics (artifact analysis, malware identification) -- Compliance & Hardening (CIS, NIST, PCI-DSS, configuration audits) - ## Security Analysis Constraints - When analyzing security data, preserve timestamps and maintain chronological ordering for incident timelines. - When extracting IOCs (Indicators of Compromise), format them consistently: IPs, domains, file hashes, URLs. diff --git a/flocks/session/prompt/copilot-gpt-5.txt b/flocks/session/prompt/copilot-gpt-5.txt index 2c28ee3a..ea3210d3 100644 --- a/flocks/session/prompt/copilot-gpt-5.txt +++ b/flocks/session/prompt/copilot-gpt-5.txt @@ -1,11 +1,5 @@ -You are Flocks, an expert AI SecOps (Security Operations) assistant -Your name is Flocks Keep your answers short and impersonal. -You are a highly sophisticated SecOps agent with expert-level knowledge across cybersecurity operations, threat detection, incident response, vulnerability assessment, and defensive security practices. - -IMPORTANT: Assist with defensive security tasks only. Refuse to create malicious tools, exploits for offensive use, or malware. Support threat detection, security analysis, incident investigation, detection rules, vulnerability assessment, defensive automation, and compliance. - You are an agent - you must keep going until the user's security investigation or task is completely resolved, before ending your turn and yielding back to the user. Your security analysis should be thorough. Avoid unnecessary repetition and verbosity. You should be concise, but thorough. 
You MUST iterate and keep going until the security problem is solved. @@ -25,57 +19,6 @@ NEVER print codeblocks for detection rules or scripts unless explicitly requeste Do not repeat yourself after tool calls; continue from where you left off. You must use webfetch tool to gather threat intelligence or security documentation from URLs provided by the user, as well as any links you find in the content of those pages. - -# SecOps Workflow -1. Understand the security context deeply. Carefully read the security request and think critically about what is required. -2. Gather security data. Read logs, configurations, scan results, or code relevant to the security task. -3. Develop a clear, step-by-step analysis plan. Break down the investigation into manageable steps - use the todo tool to track your progress. -4. Analyze incrementally. Look for security indicators, patterns, and anomalies step by step. -5. Correlate findings. Link related security events and build a coherent narrative. -6. Validate thoroughly. Verify findings to avoid false positives and consider alternative explanations. -7. Iterate until the security issue is fully understood and documented. -8. Provide actionable recommendations. Include remediation steps, detection rules, or response actions. -**CRITICAL - Before ending your turn:** -- Review and update the todo list, marking completed, skipped (with explanations), or blocked items. - -## 1. Deeply Understand the Security Context -- Carefully read the security request and think about the threat landscape before analyzing. -- Break down the security problem into manageable parts. Consider: -- What is the security concern or incident? -- What are potential attack vectors or threat actors? -- What are the security indicators to look for? -- What is the business impact and scope? -- What data sources are available for analysis? - -## 2. Security Data Gathering -- Identify relevant data sources (logs, configs, scan results, forensic artifacts). 
-- Use read, grep, glob tools to collect security data systematically. -- Look for authentication logs, system logs, network traffic, file changes, security alerts. -- Gather threat intelligence if relevant (IOCs, CVEs, MITRE ATT&CK techniques). -- Validate and update your understanding continuously as you gather more evidence. - -## 3. Develop a Detailed Analysis Plan -- Outline a specific, verifiable sequence of steps for security analysis. -- Create a todo list to track investigation progress. -- Each time you check off a step, update the todo list. -- Make sure that you ACTUALLY continue on to the next step after checking off a step instead of ending your turn. - -## 4. Security Analysis & Detection -- Analyze logs and data for suspicious patterns (failed logins, unusual commands, network anomalies). -- Extract indicators of compromise (IOCs): IPs, domains, file hashes, URLs, user accounts. -- Map findings to MITRE ATT&CK techniques when applicable. -- Consider false positives and business context before declaring threats. -- Document findings with timestamps, file paths, line numbers, and evidence. - -## 5. Validation & Correlation -- Verify findings across multiple data sources. -- Determine the root cause rather than addressing symptoms. -- Correlate security events to reconstruct attack timelines. -- Consider alternative benign explanations for anomalies. -- Validate detection rules against sample data for accuracy. -- Revisit assumptions if findings don't align with expected patterns. - - Always communicate clearly and concisely in a warm and friendly yet professional tone. Use upbeat language and sprinkle in light, witty humor where appropriate. If the user corrects you, do not immediately assume they are right. Think deeply about their feedback and how you can incorporate it into your solution. Stand your ground if you have the evidence to support your conclusion. 
diff --git a/flocks/session/prompt/gemini.txt b/flocks/session/prompt/gemini.txt index 3ffb1049..ed59600b 100644 --- a/flocks/session/prompt/gemini.txt +++ b/flocks/session/prompt/gemini.txt @@ -1,15 +1,3 @@ -You are Flocks, an AI-Native SecOps Platform specializing in cybersecurity operations. Your primary goal is to help users with threat detection, security analysis, incident response, and defensive security operations, adhering strictly to the following instructions and utilizing your available tools. - -IMPORTANT: Assist with defensive security tasks only. Refuse to create, modify, or improve code that may be used maliciously. Do not assist with exploit development for malicious purposes, credential harvesting, or malware creation. Support security analysis, detection rules, vulnerability explanations, defensive tools, security automation, and compliance checking. - -When asked about your capabilities ("what can you do?"), respond that you are an AI-Native SecOps Platform specializing in: -- Threat Detection & Analysis (log analysis, IOC identification, threat hunting) -- Incident Response (investigation, containment, remediation) -- Vulnerability Assessment (scan analysis, prioritization, configuration reviews) -- Security Automation (SIGMA, YARA, Snort, Suricata detection rules) -- Malware & Forensics (artifact analysis, malware identification) -- Compliance & Hardening (CIS, NIST, PCI-DSS, configuration audits) - # Core Mandates for Security Operations - **Security Context:** Always understand the security implications of the task. Analyze logs, configurations, and code with a security-first mindset. @@ -21,81 +9,6 @@ When asked about your capabilities ("what can you do?"), respond that you are an - **Proactiveness:** Fulfill the user's security request thoroughly, including reasonable follow-up analysis or recommendations. 
- **Confirm Ambiguity:** For security-sensitive actions, confirm with the user before executing commands that could modify systems or data. -# Primary Workflows - -## SecOps Tasks - Primary Workflows - -When requested to perform security operations tasks, follow this sequence: - -### Threat Detection & Analysis -1. **Gather Data:** Read relevant logs, configs, or files using 'read', 'grep', and 'glob' tools -2. **Identify Patterns:** Look for suspicious indicators - failed logins, unusual commands, network anomalies, file modifications -3. **Correlate Events:** Link related security events across multiple sources -4. **Document IOCs:** Extract and document indicators of compromise (IPs, domains, file hashes, patterns) -5. **Assess Severity:** Determine threat level and potential impact - -### Incident Response & Investigation -1. **Understand Incident:** Clarify the security event, timeline, and affected systems -2. **Collect Evidence:** Gather logs, configurations, forensic artifacts -3. **Analyze Attack Chain:** Reconstruct the sequence of attacker actions -4. **Identify Scope:** Determine what systems, accounts, or data were compromised -5. **Provide Remediation:** Recommend containment, eradication, and recovery steps - -### Vulnerability Assessment -1. **Scan/Review:** Analyze vulnerability scan results or code for security issues -2. **Prioritize:** Rank vulnerabilities by CVSS score, exploitability, and business impact -3. **Validate:** Verify that vulnerabilities are actual risks (not false positives) -4. **Document:** Provide clear descriptions with affected files/lines -5. **Remediate:** Recommend patches, configuration changes, or code fixes - -### Security Automation -1. **Define Requirements:** Understand what needs to be detected or automated -2. **Research Patterns:** Study attack techniques, MITRE ATT&CK, or threat intel -3. **Draft Rules:** Create detection rules (SIGMA, YARA, Snort, Suricata, Splunk queries) -4. 
**Test Logic:** Validate against sample data and check for false positives -5. **Document:** Explain detection logic, coverage, and potential limitations - -### Security Tool Development -1. **Understand Need:** Clarify what security automation or tool is needed -2. **Plan Functionality:** Define inputs, processing logic, and outputs -3. **Implement:** Write defensive security code (parsers, analyzers, automation) -4. **Test:** Validate with sample security data -5. **Document Usage:** Provide clear instructions and examples - -## Security Tool Development - -**Goal:** Develop defensive security tools, automation scripts, or detection systems. All tools must be for defensive/monitoring purposes only. - -1. **Understand Security Need:** Analyze the user's request to identify the security problem - what needs to be detected, monitored, analyzed, or automated. Clarify data sources (logs, network traffic, files), desired outputs (alerts, reports, IOCs), and integration points. - -2. **Propose Security Solution:** Present a clear plan covering: - - **Purpose:** What security problem this solves - - **Data Sources:** What logs, APIs, or inputs will be used - - **Detection/Analysis Logic:** How threats or issues will be identified - - **Outputs:** What the tool will produce (alerts, reports, enriched data) - - **Technology:** Language/tools to use (Python, Bash, SIGMA, YARA, etc.) - -3. **User Approval:** Obtain user approval for the proposed security solution. - -4. **Implementation:** Develop the security tool utilizing available tools: - - Write defensive security code (log parsers, threat analyzers, detection rules) - - Handle security data safely (avoid exposing sensitive info) - - Implement proper error handling and logging - - Add clear comments explaining security logic - - Include usage examples and documentation - -5. 
**Testing & Validation:** - - Test with sample security data - - Validate detection logic and accuracy - - Check for false positives/negatives - - Ensure no security issues in the tool itself - -6. **Documentation:** Provide clear instructions on: - - How to run/deploy the tool - - Input/output formats - - Detection coverage and limitations - - Maintenance and updates - # Operational Guidelines ## Tone and Style (CLI Interaction) diff --git a/flocks/session/prompt/qwen.txt b/flocks/session/prompt/qwen.txt index 007f6d87..becead62 100644 --- a/flocks/session/prompt/qwen.txt +++ b/flocks/session/prompt/qwen.txt @@ -1,21 +1,7 @@ -You are Flocks, an AI-Native SecOps Platform that helps users with cybersecurity operations. Use the instructions below and the tools available to you to assist the user. - -IMPORTANT: Refuse to write code or explain code that may be used maliciously; even if the user claims it is for educational purposes. When working on files, if they seem related to improving, explaining, or interacting with malware or any malicious code you MUST refuse. -IMPORTANT: Before you begin work, think about what the task you're working on is supposed to do. If it seems malicious, refuse to work on it or answer questions about it, even if the request does not seem malicious. -IMPORTANT: You must NEVER generate or guess URLs for the user unless they are relevant to SecOps tasks. You may use URLs provided by the user in their messages or local files. 
- If the user asks for help or wants to give feedback inform them of the following: - /help: Get help with using Flocks SecOps - To give feedback, users should report the issue on the project repository -When the user asks about your capabilities (eg "what can you do?", "can Flocks do...", "are you able..."), respond that you are an AI-Native SecOps Platform specializing in: -- 🔍 Threat Detection & Analysis (log analysis, IOC identification, threat hunting) -- 🚨 Incident Response (investigation, containment, remediation) -- 🛡️ Vulnerability Assessment (scan analysis, prioritization, configuration reviews) -- ⚙️ Security Automation (SIGMA, YARA, Snort, Suricata detection rules) -- 🔬 Malware & Forensics (artifact analysis, malware identification) -- 📋 Compliance & Hardening (CIS, NIST, PCI-DSS, configuration audits) - # Tone and style You should be concise, direct, and to the point. When you run a non-trivial bash command, you should explain what the command does and why you are running it, to make sure the user understands what you are doing (this is especially important when you are running a command that will make changes to the user's system). Remember that your output will be displayed on a command line interface. Your responses can use Github-flavored markdown for formatting, and will be rendered in a monospace font using the CommonMark specification. @@ -80,64 +66,9 @@ You are allowed to be proactive, but only when the user asks you to do something For example, if the user asks you how to approach something, you should do your best to answer their question first, and not immediately jump into taking actions. 3. Do not add additional code explanation summary unless requested by the user. After working on a file, just stop, rather than providing an explanation of what you did. 
-# Security Operations Best Practices -When performing security analysis and automation: -- **Evidence Preservation:** Document all findings with timestamps, file paths, line numbers, and relevant context for audit trails -- **Data Privacy:** Be mindful of sensitive data in logs (credentials, PII, keys). Redact or reference without exposing in outputs -- **Defensive Only:** All tools, scripts, and automation must be for defensive purposes - detection, monitoring, incident response, or compliance -- **Verify Findings:** Validate potential security issues before declaring them as confirmed threats or vulnerabilities -- **Context Matters:** Understand the security context - not all anomalies are malicious, consider business operations and environment -- **Detection Quality:** When creating rules (SIGMA, YARA, Snort), balance detection coverage with false positive rates -- **Secure Code:** When developing security tools, follow secure coding practices. Never expose secrets, use parameterized queries, validate inputs - # Code style - IMPORTANT: DO NOT ADD ***ANY*** COMMENTS unless asked -# SecOps Tasks -The user will primarily request you perform Security Operations tasks including: - -**Threat Detection & Analysis:** -- Analyze logs (auth, web, network, system) for suspicious patterns and anomalies -- Identify indicators of compromise (IOCs): malicious IPs, domains, file hashes, URLs -- Hunt for threats using behavioral analysis and correlation across data sources -- Detect attack techniques mapped to MITRE ATT&CK framework - -**Incident Response:** -- Triage security alerts and determine severity/priority -- Investigate security incidents and reconstruct attack timelines -- Identify compromised systems, accounts, and exfiltrated data -- Provide containment, eradication, and recovery recommendations - -**Vulnerability Assessment:** -- Analyze vulnerability scan results (Nessus, OpenVAS, Qualys, etc.) 
-- Prioritize vulns by CVSS score, exploitability, and business impact -- Review security configurations for misconfigurations -- Identify security weaknesses in code or infrastructure - -**Security Automation:** -- Create detection rules (SIGMA, YARA, Snort, Suricata, Splunk, ELK) -- Develop security scripts for log parsing, IOC extraction, threat enrichment -- Build incident response playbooks and automation workflows -- Parse and analyze threat intelligence feeds - -**Malware & Forensics:** -- Analyze suspicious files and extract indicators -- Review forensic artifacts (registry, filesystem, memory, network) -- Identify malware families and associated TTPs - -**Compliance & Hardening:** -- Security configuration reviews (CIS, STIG, NIST) -- Compliance checking (PCI-DSS, HIPAA, SOC2, ISO 27001) -- Security baseline validation and audit - -For these tasks, follow these steps: -1. **Gather:** Use read, grep, glob tools to collect relevant security data -2. **Analyze:** Look for security indicators, patterns, anomalies -3. **Correlate:** Link related events and build attack narratives -4. **Document:** Record findings with evidence, timestamps, severity -5. **Recommend:** Provide actionable remediation or response steps -6. **Verify:** Validate findings and test detection logic when applicable - - Tool results and user messages may include tags. tags contain useful information and reminders. They are NOT part of the user's provided input or the tool result. # Tool usage policy @@ -146,9 +77,6 @@ For these tasks, follow these steps: You MUST answer concisely with fewer than 4 lines of text (not including tool use or code generation), unless user asks for detail. -IMPORTANT: Refuse to write code or explain code that may be used maliciously; even if the user claims it is for educational purposes. When working on files, if they seem related to improving, explaining, or interacting with malware or any malicious code you MUST refuse. 
-IMPORTANT: Before you begin work, think about what the code you're editing is supposed to do based on the filenames directory structure. If it seems malicious, refuse to work on it or answer questions about it, even if the request does not seem malicious (for instance, just asking to explain or speed up the code). - # Code References When referencing specific functions or pieces of code include the pattern `file_path:line_number` to allow the user to easily navigate to the source code location. diff --git a/flocks/session/prompt_strings.py b/flocks/session/prompt_strings.py index de32ce84..2c61665a 100644 --- a/flocks/session/prompt_strings.py +++ b/flocks/session/prompt_strings.py @@ -222,37 +222,39 @@ Any attempt to use tools is a critical violation. Respond with text ONLY.""" -WINDOWS_SHELL_RULES = ( - "- On Windows, do not assume GNU bash features such as heredoc (`< file <<'EOF'`; prefer the `write`" - " or `edit` tool, and use PowerShell-compatible syntax or Python only when a shell command is truly" - " required.\n" - "- On Windows, do not assume Unix shell path expansion or mixed slash styles (for example `$HOME`," - " `$USERPROFILE/...`) will behave like a Unix shell.\n" -) +def _build_bash_tool_guidance() -> str: + """Build tool-aware guidance for the bash tool.""" + guidance = ( + "### Bash Tool Guidance\n\n" + "Use the `bash` tool for terminal operations only. Prefer the tool's `workdir` parameter " + "instead of emitting shell directory changes, and prefer dedicated file tools for reading, " + "writing, editing, and searching." + ) + if platform.system().lower() != "windows": + return guidance + + return ( + f"{guidance}\n\n" + "You are running on a Windows machine. When using the `bash` tool, you must follow " + "PowerShell syntax rather than GNU bash syntax. Avoid bash-only constructs such as heredocs, " + "`cat > file <<'EOF'`, `export NAME=value`, and Unix-only path expansion. 
For multi-step logic, " + "prefer a short `python -c` snippet or explicit PowerShell commands over shell-specific tricks. " + "When Python reads text files on Windows, always pass `encoding=...`; prefer " + "`Path(path).read_text(encoding=\"utf-8-sig\")`." + ) def _build_tool_instructions() -> str: - windows_rules = WINDOWS_SHELL_RULES if platform.system().lower() == "windows" else "" - return f""" -You have access to tools to help accomplish tasks. When you need to: -- Read files: use the 'read' tool -- Write files: use the 'write' tool -- Edit files: use the 'edit' tool -- Run commands: use the 'bash' tool -- Search code: use the 'grep' tool -- List files: use the 'list' or 'glob' tool + return """ +## Tool Calling Rules + +You have access to tools to help accomplish tasks. Use the callable schema exposed for this turn as the authoritative source for available tool names and parameters. IMPORTANT RULES: -- Call each tool ONLY ONCE per request unless explicitly asked to retry - NEVER call the same tool multiple times with identical parameters in a single response -- After calling a tool, wait for its result before proceeding -- After receiving a tool result, respond to the user with a direct answer - Do not repeat tool calls just to explain what you're doing - call the tool once and explain after -- Schema precheck before calling a tool: read the callable schema for that tool and copy parameter names EXACTLY (including case). -- Never guess parameter names from semantics. If uncertain, use `tool_search` first, then call only with names shown in the callable schema. +- Never guess parameter names from semantics. If uncertain, use `tool_search` first, then read the callable schema for that tool and copy parameter names EXACTLY (including case). - For all tools, treat schema as strict: unknown parameter names will fail. -{windows_rules}- On Windows, any Python command that reads text files must explicitly specify encoding. 
Never generate commands like `yaml.safe_load(open(path))`, `json.load(open(path))`, or `open(path).read()` without `encoding=...`; prefer `Path(path).read_text(encoding="utf-8-sig")`. CRITICAL - TOOL CALLING FORMAT: - ALWAYS invoke tools using the native API tool-calling mechanism ONLY @@ -267,6 +269,26 @@ def _build_tool_instructions() -> str: """ +def _build_minimax_tool_instructions() -> str: + return ( + "## Tool Calling Rules\n\n" + "You have access to tools, but for this model you MUST call them using " + "MiniMax XML embedded in text instead of native API tool-calling.\n\n" + "Required format:\n" + "\n" + "\n" + "json_or_string_value\n" + "\n" + "\n\n" + "Rules:\n" + "- Emit exactly one tool call block when you need a tool.\n" + "- Use valid tool names only.\n" + "- Parameter values must be valid JSON scalars/objects/arrays when appropriate.\n" + "- After tool results are returned, continue the task instead of repeating the same call.\n" + "- Do not use native API tool-calling for this model.\n" + ) + + PROMPT_TOOL_INSTRUCTIONS = _build_tool_instructions() # Markers used to detect system-generated content in user messages diff --git a/flocks/session/runner.py b/flocks/session/runner.py index ca671307..cc8323a5 100644 --- a/flocks/session/runner.py +++ b/flocks/session/runner.py @@ -12,17 +12,16 @@ import asyncio import json import os -import sys import time from datetime import datetime -from typing import Optional, Dict, Any, List, Callable, Awaitable, Set, Tuple +from typing import Optional, Dict, Any, List, Callable, Awaitable, Tuple from dataclasses import dataclass, field from flocks.utils.log import Log from flocks.utils.id import Identifier from flocks.session.session import Session, SessionInfo from flocks.session.message import Message, MessageInfo, MessageRole -from flocks.session.prompt import SystemPrompt, SessionPrompt +from flocks.session.prompt import SessionPrompt from flocks.session.core.status import SessionStatus, SessionStatusRetry, 
SessionStatusBusy from flocks.session.lifecycle.retry import SessionRetry from flocks.session.lifecycle.compaction import SessionCompaction, CompactionPolicy @@ -223,6 +222,15 @@ async def _list_callable_tool_infos_for_turn( ) return result.tool_infos, dict(result.metadata) + async def _get_prompt_tool_names(self, agent: AgentInfo) -> Tuple[str, ...]: + """Resolve callable tool names used to gate system prompt guidance.""" + result = await list_session_callable_tool_infos( + session_id=self.session.id, + declared_tool_names=getattr(agent, "tools", None), + step=self._step, + ) + return tuple(sorted(tool_info.name for tool_info in result.tool_infos)) + async def _publish_turn_tools_event(self, selection_metadata: Dict[str, Any]) -> None: if not self.callbacks.event_publish_callback: return @@ -808,7 +816,30 @@ async def _process_step( return StepResult(action="stop", error=error) # Build prompts and tools - system_prompts = await self._build_system_prompts(agent) + prompt_tool_names = await self._get_prompt_tool_names(agent) + + async def sandbox_prompt_factory() -> Optional[str]: + return await self._build_sandbox_prompt(agent) + + async def channel_context_prompt_factory() -> Optional[str]: + return await self._build_channel_context_prompt() + + system_prompts = await SessionPrompt.build_system_prompts( + session_id=self.session.id, + session_directory=self.session.directory, + agent_name=agent.name, + agent_prompt=getattr(agent, "prompt", None), + provider_id=self.provider_id, + model_id=self.model_id, + prompt_tool_names=prompt_tool_names, + tool_revision=ToolRegistry.revision(), + memory_bootstrap_data=self._memory_bootstrap_data, + static_cache=self._static_cache, + sandbox_prompt_factory=sandbox_prompt_factory, + channel_context_prompt_factory=channel_context_prompt_factory, + tool_catalog_prompt_factory=lambda: self._build_tool_catalog_prompt(agent), + use_text_tool_call_mode=self._should_use_text_tool_call_mode(), + ) tools = await 
self._build_callable_tool_schema(agent, messages) if self._should_use_text_tool_call_mode() and tools: text_tool_catalog = self._build_text_tool_call_catalog_prompt(tools) @@ -1263,89 +1294,6 @@ async def _record_usage_if_available( "error": str(exc), }) - async def _build_system_prompts(self, agent: AgentInfo) -> List[str]: - """Build system prompts.""" - tool_revision = ToolRegistry.revision() - cache_key = ( - f"system_prompts:{self.session.id}:{agent.name}:{self.provider_id}:{self.model_id}:{tool_revision}" - ) - cached = self._static_cache.get(cache_key) - if cached is not None: - return list(cached) - - prompts = [] - - # Provider-specific base prompt (from anthropic.txt, gemini.txt, etc.) - provider_prompts = SystemPrompt.provider(self.model_id) - prompts.extend(provider_prompts) - - # Memory bootstrap context (matching OpenClaw's injection) - if self._memory_bootstrap_data: - # Add memory instructions - instructions = self._memory_bootstrap_data.get("instructions", "") - if instructions: - prompts.append(instructions) - - # Inject main MEMORY.md content - main_memory = self._memory_bootstrap_data.get("main_memory") - if main_memory and main_memory.get("inject"): - memory_content = main_memory.get("content", "") - if memory_content: - prompts.append(f"## {main_memory['path']}\n\n{memory_content}") - - # Note: daily files are NOT injected, agent reads them per instructions - log.debug("runner.memory_injected", { - "session_id": self.session.id, - "has_main": main_memory is not None, - }) - - # Environment info - env_prompts = await SystemPrompt.environment( - directory=self.session.directory, - vcs="git" if self.session.directory else None, - ) - prompts.extend(env_prompts) - - # Custom instructions - custom_prompts = await SystemPrompt.custom(directory=self.session.directory) - prompts.extend(custom_prompts) - - # Agent-specific prompt (if any) - if agent.prompt: - prompts.append(agent.prompt) - - # Sandbox runtime context for better tool/path awareness - 
sandbox_prompt = await self._build_sandbox_prompt(agent) - if sandbox_prompt: - prompts.append(sandbox_prompt) - - # Channel context: inject the IM channel and session info when this - # session originates from an IM channel (Feishu / WeCom / DingTalk). - channel_ctx_prompt = await self._build_channel_context_prompt() - if channel_ctx_prompt: - prompts.append(channel_ctx_prompt) - - # Tool instructions - prompts.append(self._get_tool_instructions()) - - tool_catalog_prompt = self._build_tool_catalog_prompt(agent) - if tool_catalog_prompt: - prompts.append(tool_catalog_prompt) - - # Debug: optionally print system prompt during execution - if os.getenv("FLOCKS_PRINT_SYSTEM_PROMPT", "").lower() in ("1", "true", "yes"): - header = ( - f"\n=== system_prompt session={self.session.id} " - f"agent={agent.name} model={self.provider_id}/{self.model_id} ===" - ) - print(header, file=sys.stderr) - for idx, prompt in enumerate(prompts): - print(f"\n--- prompt[{idx}] ---\n{prompt}\n", file=sys.stderr) - print("=== end system_prompt ===\n", file=sys.stderr) - - self._static_cache[cache_key] = list(prompts) - return list(prompts) - async def _build_sandbox_prompt(self, agent: AgentInfo) -> Optional[str]: """Build sandbox context prompt when sandboxing is active.""" try: @@ -1424,27 +1372,6 @@ async def _build_channel_context_prompt(self) -> Optional[str]: log.debug("runner.channel_context_prompt.error", {"error": str(e)}) return None - def _get_tool_instructions(self) -> str: - from flocks.session.prompt_strings import PROMPT_TOOL_INSTRUCTIONS - if self._should_use_text_tool_call_mode(): - return ( - "You have access to tools, but for this model you MUST call them using " - "MiniMax XML embedded in text instead of native API tool-calling.\n\n" - "Required format:\n" - "\n" - "\n" - "json_or_string_value\n" - "\n" - "\n\n" - "Rules:\n" - "- Emit exactly one tool call block when you need a tool.\n" - "- Use valid tool names only.\n" - "- Parameter values must be valid JSON 
scalars/objects/arrays when appropriate.\n" - "- After tool results are returned, continue the task instead of repeating the same call.\n" - "- Do not use native API tool-calling for this model.\n" - ) - return PROMPT_TOOL_INSTRUCTIONS - def _list_catalog_tool_infos(self, agent: AgentInfo) -> List[Any]: tool_infos: List[Any] = [] is_rex = getattr(agent, "name", "") == "rex" @@ -1667,6 +1594,35 @@ def _get_context_window_tokens(self) -> int: pass return 128_000 + def _build_system_message_content( + self, + system_prompts: List[str], + ) -> str | list[dict[str, Any]]: + """Format system prompts for the active provider. + + Anthropic supports structured system blocks, which lets us place a + conservative cache breakpoint before the dynamic runtime tail. + """ + prompt_parts = [prompt for prompt in system_prompts if prompt and prompt.strip()] + if not prompt_parts: + return "" + + provider_lower = (self.provider_id or "").lower() + if "anthropic" not in provider_lower: + return "\n\n".join(prompt_parts) + + cache_break_index = max(0, len(prompt_parts) - 3) + blocks: list[dict[str, Any]] = [] + for index, prompt in enumerate(prompt_parts): + block: dict[str, Any] = { + "type": "text", + "text": prompt, + } + if index == cache_break_index: + block["cache_control"] = {"type": "ephemeral"} + blocks.append(block) + return blocks + async def _to_chat_messages( self, messages: List[MessageInfo], @@ -1702,7 +1658,7 @@ async def _to_chat_messages( if system_prompts: chat_messages.append(ChatMessage( role="system", - content="\n\n".join(system_prompts), + content=self._build_system_message_content(system_prompts), )) # Convert each message with parts diff --git a/tests/agent/test_prompt_utils.py b/tests/agent/test_prompt_utils.py index 15d3850a..3e2fae84 100644 --- a/tests/agent/test_prompt_utils.py +++ b/tests/agent/test_prompt_utils.py @@ -2,10 +2,11 @@ tests/agent/test_prompt_utils.py 单元测试:flocks.agent.prompt_utils 中被修改 / 新增的函数 -- categorize_tools() : 使用 ToolRegistry 真实 
category -- _format_tools_for_prompt() : 按 ToolCategory 分组显示所有工具 -- build_tool_selection_table(): 新格式(工具分组 + agent 表格) -- build_workflows_section() : workflow 列表渲染 +- categorize_tools() : 使用 ToolRegistry 真实 category +- _format_tools_for_prompt() : 按 ToolCategory 分组显示所有工具 +- build_tool_selection_table() : 仅渲染工具目录 +- build_agent_selection_table(): 单独渲染 agent 调度表 +- build_workflows_section() : workflow 列表渲染 """ from __future__ import annotations @@ -18,6 +19,7 @@ from flocks.agent.agent import AvailableAgent, AvailableCategory, AvailableSkill, AvailableTool, AvailableWorkflow from flocks.agent.prompt_utils import ( _format_tools_for_prompt, + build_agent_selection_table, build_tool_selection_table, build_workflows_section, categorize_tools, @@ -194,15 +196,6 @@ def test_multiple_tools_same_category_comma_separated(self): class TestBuildToolSelectionTable: - def _make_agent(self, name: str, cost: str = "CHEAP") -> AvailableAgent: - meta = MagicMock() - meta.cost = cost - meta.category = "general" - meta.triggers = [] - meta.key_trigger = None - agent = AvailableAgent(name=name, description=f"{name} agent.", metadata=meta) - return agent - def test_contains_available_tools_header(self): tools = [AvailableTool(name="bash", category="terminal")] output = build_tool_selection_table([], tools) @@ -217,33 +210,72 @@ def test_tools_rendered_in_output(self): assert "`read`" in output assert "`bash`" in output + def test_agents_not_rendered_in_tools_output(self): + meta = MagicMock() + meta.cost = "CHEAP" + meta.category = "general" + meta.triggers = [] + meta.key_trigger = None + agents = [AvailableAgent(name="explore", description="explore agent.", metadata=meta)] + output = build_tool_selection_table(agents, []) + assert "explore" not in output + assert "When to Use" not in output + + def test_empty_inputs_no_crash(self): + output = build_tool_selection_table([], []) + assert isinstance(output, str) + + def test_default_flow_hint_removed(self): + output = 
build_tool_selection_table([], []) + assert "Default flow" not in output + + +# =========================================================================== +# build_agent_selection_table +# =========================================================================== + +class TestBuildAgentSelectionTable: + + def _make_agent(self, name: str, cost: str = "CHEAP") -> AvailableAgent: + meta = MagicMock() + meta.cost = cost + meta.category = "general" + meta.triggers = [] + meta.key_trigger = None + agent = AvailableAgent(name=name, description=f"{name} agent.", metadata=meta) + return agent + def test_agents_table_present_when_agents_exist(self): agents = [self._make_agent("explore")] - output = build_tool_selection_table(agents, []) + output = build_agent_selection_table(agents) assert "explore" in output assert "CHEAP" in output + assert "Trigger Signals" in output def test_utility_agents_excluded(self): normal = self._make_agent("explore") utility = self._make_agent("utility_agent") utility.metadata.category = "utility" - output = build_tool_selection_table([normal, utility], []) + output = build_agent_selection_table([normal, utility]) assert "explore" in output assert "utility_agent" not in output - def test_empty_tools_still_renders_agents(self): - agents = [self._make_agent("oracle", "EXPENSIVE")] - output = build_tool_selection_table(agents, []) - assert "oracle" in output - - def test_empty_inputs_no_crash(self): - output = build_tool_selection_table([], []) - assert isinstance(output, str) + def test_empty_agents_still_returns_header(self): + output = build_agent_selection_table([]) + assert "Available Agents" in output def test_default_flow_hint_present(self): - output = build_tool_selection_table([], []) + output = build_agent_selection_table([]) assert "Default flow" in output + def test_triggers_rendered_when_available(self): + agent = self._make_agent("explore") + trigger = MagicMock() + trigger.trigger = "Find Y" + agent.metadata.triggers = [trigger] + 
output = build_agent_selection_table([agent]) + assert "Find Y" in output + # =========================================================================== # build_workflows_section diff --git a/tests/integration/test_capability_awareness.py b/tests/integration/test_capability_awareness.py index bdc9c391..8b608750 100644 --- a/tests/integration/test_capability_awareness.py +++ b/tests/integration/test_capability_awareness.py @@ -14,8 +14,9 @@ - build_workflows_section() 能把它渲染成 prompt 片段 - inject_dynamic_prompts() 将 AvailableWorkflow 传递给 prompt builder -3. Rex prompt 包含 workflow 段落 - - Rex agent 加载后,其 prompt 包含 "Available Workflows" 字样(前提:存在工作流) +3. Rex prompt 保留 agent 调度上下文,但不再内嵌完整 tools/skills/workflows 目录 + - Rex agent 加载后,其 prompt 包含常见 subagent / specialist 信息 + - Rex agent prompt 不包含 "Available Workflows" 这类完整目录段落 4. /skills slash command 端到端(真实 Skill 扫描) - run_slash_command_tool("skills") 成功返回技能列表,不报错 @@ -257,15 +258,15 @@ def inject(agent_info, available_agents, tools, skills, categories, workflows=No # =========================================================================== -# 3. Rex prompt 包含 workflow 段落(存在工作流时) +# 3. 
Rex prompt 保留 agent 调度上下文,但移除完整目录段落 # =========================================================================== @pytest.mark.integration -class TestRexPromptWorkflowAwareness: +class TestRexPromptAwareness: @pytest.mark.asyncio async def test_rex_prompt_contains_tools_section(self): - """Rex prompt 必须包含工具信息(基础能力感知)。""" + """Rex prompt 仍应是非空且包含基础调度说明。""" from flocks.agent.registry import Agent rex = await Agent.get("rex") assert rex is not None @@ -286,20 +287,46 @@ async def test_rex_prompt_contains_subagents_section(self): @pytest.mark.asyncio async def test_rex_prompt_contains_skills_section(self): - """Rex prompt 必须包含 skills 相关信息。""" + """Rex prompt 不再内嵌完整 skills 目录。""" from flocks.agent.registry import Agent rex = await Agent.get("rex") prompt = (rex.prompt or "").lower() - assert "skill" in prompt + assert "category + skills delegation system" not in prompt @pytest.mark.asyncio - async def test_rex_prompt_has_workflow_placeholder_or_section(self): - """Rex prompt 应包含 workflow 相关内容(已注入或有占位结构)。""" + async def test_rex_prompt_does_not_embed_full_workflow_section(self): + """Rex prompt 不再内嵌 workflow 目录段落。""" from flocks.agent.registry import Agent rex = await Agent.get("rex") prompt = (rex.prompt or "").lower() - # Either the section was injected or the 'run_workflow' tool is mentioned - assert "workflow" in prompt or "run_workflow" in prompt + assert "### available workflows" not in prompt + + @pytest.mark.asyncio + async def test_rex_prompt_does_not_embed_full_tools_table(self): + """Rex prompt 不再内嵌完整工具目录表。""" + from flocks.agent.registry import Agent + rex = await Agent.get("rex") + prompt = rex.prompt or "" + assert "### Available Tools:" not in prompt + assert "Tool & Agent Selection" not in prompt + + @pytest.mark.asyncio + async def test_rex_prompt_does_not_include_codebase_assessment_phase(self): + """Rex prompt 不再包含独立的 Codebase Assessment 阶段。""" + from flocks.agent.registry import Agent + rex = await Agent.get("rex") + prompt = rex.prompt or "" + assert 
"## Phase 1 - Codebase Assessment" not in prompt + + @pytest.mark.asyncio + async def test_rex_prompt_merges_agent_selection_and_delegation(self): + """Rex prompt 使用单一 agent 路由表,不再单独保留 Delegation Table。""" + from flocks.agent.registry import Agent + rex = await Agent.get("rex") + prompt = rex.prompt or "" + assert "### Available Agents:" in prompt + assert "Trigger Signals" in prompt + assert "### Delegation Table:" not in prompt @pytest.mark.asyncio async def test_rex_prompt_prefers_direct_ioc_lookup_before_delegation(self): diff --git a/tests/memory/test_prompt_memory.py b/tests/memory/test_prompt_memory.py index d2d38c5d..147b047b 100644 --- a/tests/memory/test_prompt_memory.py +++ b/tests/memory/test_prompt_memory.py @@ -50,19 +50,22 @@ async def test_prompt_memory(): print(f"❌ Test failed: {e}") return False - # Test 3: Test build_system_prompt without memory - print("\n[3/4] Testing build_system_prompt without memory...") + # Test 3: Test runtime system prompt builder without memory bootstrap + print("\n[3/4] Testing build_system_prompts without memory bootstrap...") try: - prompt = await SessionPrompt.build_system_prompt( + prompt_parts = await SessionPrompt.build_system_prompts( + session_id="test", + session_directory=None, agent_name="test_agent", - include_environment=False, - include_custom=False, - include_memory=False, + agent_prompt="agent prompt", + provider_id="test-provider", + model_id="test-model", ) + prompt = "\n\n".join(prompt_parts) print(f" Prompt length: {len(prompt)} chars") assert len(prompt) > 0, "Should generate prompt" - assert "test_agent" in prompt, "Should include agent name" + assert "agent prompt" in prompt, "Should include agent prompt" print("✅ System prompt generation working") except Exception as e: @@ -71,32 +74,34 @@ async def test_prompt_memory(): traceback.print_exc() return False - # Test 4: Test build_system_prompt with memory (disabled) - print("\n[4/4] Testing build_system_prompt with memory (disabled)...") + # Test 
4: Test runtime system prompt builder with disabled memory bootstrap injection + print("\n[4/4] Testing build_system_prompts with memory bootstrap disabled...") try: - import tempfile - with tempfile.TemporaryDirectory() as tmpdir: - memory = SessionMemory( - session_id="test", - project_id="proj", - workspace_dir=tmpdir, - enabled=False, - ) - - prompt = await SessionPrompt.build_system_prompt( - agent_name="test_agent", - include_environment=False, - include_custom=False, - include_memory=True, - session_memory=memory, - user_message="test query", - ) - - print(f" Prompt length: {len(prompt)} chars") - assert len(prompt) > 0, "Should generate prompt" - assert "Relevant Memory" not in prompt, "Should not include memory section when disabled" - - print("✅ Memory integration working correctly") + prompt_parts = await SessionPrompt.build_system_prompts( + session_id="test", + session_directory=None, + agent_name="test_agent", + agent_prompt="agent prompt", + provider_id="test-provider", + model_id="test-model", + memory_bootstrap_data={ + "instructions": "memory guidance", + "main_memory": { + "path": "MEMORY.md", + "content": "remembered context", + "inject": False, + }, + }, + prompt_tool_names=("read",), + ) + prompt = "\n\n".join(prompt_parts) + + print(f" Prompt length: {len(prompt)} chars") + assert len(prompt) > 0, "Should generate prompt" + assert "Relevant Memory" not in prompt, "Should not include memory section when disabled" + assert "remembered context" not in prompt, "Should not inject disabled memory snapshot" + + print("✅ Memory integration working correctly") except Exception as e: print(f"❌ Test failed: {e}") import traceback @@ -109,16 +114,18 @@ async def test_prompt_memory(): print("\n📋 Prompt Memory Integration Ready:") print(" ✅ build_memory_context() method") - print(" ✅ build_system_prompt() with memory support") - print(" ✅ Automatic memory retrieval") + print(" ✅ build_system_prompts() runtime prompt builder") + print(" ✅ Memory bootstrap 
injection control") print(" ✅ Graceful disabled handling") print("\n🎯 Usage Example:") - print(" memory = await Session.get_memory(project_id, session_id)") - print(" prompt = await SessionPrompt.build_system_prompt(") - print(" include_memory=True,") - print(" session_memory=memory,") - print(" user_message='How do I use transformers?'") + print(" prompt_parts = await SessionPrompt.build_system_prompts(") + print(" session_id=session.id,") + print(" session_directory=session.directory,") + print(" agent_name=agent.name,") + print(" agent_prompt=agent.prompt,") + print(" provider_id=provider_id,") + print(" model_id=model_id,") print(" )") return True diff --git a/tests/provider/test_anthropic_system_blocks.py b/tests/provider/test_anthropic_system_blocks.py new file mode 100644 index 00000000..1a899db9 --- /dev/null +++ b/tests/provider/test_anthropic_system_blocks.py @@ -0,0 +1,44 @@ +import pytest +from types import SimpleNamespace +from unittest.mock import AsyncMock + +from flocks.provider.provider import ChatMessage +from flocks.provider.sdk.anthropic import AnthropicProvider + + +@pytest.mark.asyncio +async def test_anthropic_chat_forwards_structured_system_blocks(): + provider = AnthropicProvider() + create_mock = AsyncMock(return_value=SimpleNamespace( + id="resp_1", + model="claude-sonnet", + stop_reason="end_turn", + content=[SimpleNamespace(type="text", text="ok")], + usage=SimpleNamespace( + input_tokens=12, + output_tokens=4, + cache_read_input_tokens=3, + cache_creation_input_tokens=5, + ), + )) + provider._client = SimpleNamespace(messages=SimpleNamespace(create=create_mock)) + + system_blocks = [ + {"type": "text", "text": "provider prompt"}, + { + "type": "text", + "text": "context prompt", + "cache_control": {"type": "ephemeral"}, + }, + {"type": "text", "text": "runtime prompt"}, + ] + messages = [ + ChatMessage(role="system", content=system_blocks), + ChatMessage(role="user", content="hello"), + ] + + response = await 
provider.chat("claude-sonnet", messages) + + assert response.content == "ok" + request_kwargs = create_mock.await_args.kwargs + assert request_kwargs["system"] == system_blocks diff --git a/tests/session/test_prompt_tokens.py b/tests/session/test_prompt_tokens.py index 8709c2e4..dc284560 100644 --- a/tests/session/test_prompt_tokens.py +++ b/tests/session/test_prompt_tokens.py @@ -17,7 +17,12 @@ import pytest -from flocks.session.prompt import SessionPrompt, SystemPrompt, PromptTemplate +from flocks.session.prompt import ( + PROMPT_DEFAULT, + PromptTemplate, + SessionPrompt, + SystemPrompt, +) from flocks.session import prompt_strings @@ -219,18 +224,26 @@ class TestSystemPromptProvider: def test_anthropic_model_returns_list(self): result = SystemPrompt.provider("claude-3-5-sonnet-20241022") assert isinstance(result, list) + assert len(result) == 1 + assert result[0].startswith(PROMPT_DEFAULT.strip()) def test_gemini_model_returns_list(self): result = SystemPrompt.provider("gemini-1.5-pro") assert isinstance(result, list) + assert len(result) == 1 + assert result[0].startswith(PROMPT_DEFAULT.strip()) def test_gpt_model_returns_list(self): result = SystemPrompt.provider("gpt-4o") assert isinstance(result, list) + assert len(result) == 1 + assert result[0].startswith(PROMPT_DEFAULT.strip()) def test_unknown_model_returns_list(self): result = SystemPrompt.provider("totally-unknown-model") assert isinstance(result, list) + assert len(result) == 1 + assert result[0].startswith(PROMPT_DEFAULT.strip()) def test_none_model_returns_list(self): # provider() may raise on None; just verify it returns a list or handle gracefully @@ -242,21 +255,39 @@ def test_none_model_returns_list(self): class TestPromptToolInstructions: - def test_windows_includes_shell_rules(self): + def test_windows_bash_guidance_mentions_powershell(self): with patch.object(prompt_strings.platform, "system", return_value="Windows"): - instructions = prompt_strings._build_tool_instructions() + guidance = 
prompt_strings._build_bash_tool_guidance() - assert "do not assume GNU bash features" in instructions - assert "cat > file <<'EOF'" in instructions - assert "PowerShell-compatible syntax or Python" in instructions + assert "Windows machine" in guidance + assert "must follow PowerShell syntax" in guidance + assert 'Path(path).read_text(encoding="utf-8-sig")' in guidance - def test_non_windows_keeps_default_strategy(self): + def test_non_windows_bash_guidance_stays_generic(self): with patch.object(prompt_strings.platform, "system", return_value="Darwin"): - instructions = prompt_strings._build_tool_instructions() + guidance = prompt_strings._build_bash_tool_guidance() - assert "do not assume GNU bash features" not in instructions - assert "PowerShell-compatible syntax or Python" not in instructions - assert "must explicitly specify encoding" in instructions + assert "Bash Tool Guidance" in guidance + assert "Windows machine" not in guidance + assert "must follow PowerShell syntax" not in guidance + + def test_tool_instructions_are_platform_agnostic(self): + with patch.object(prompt_strings.platform, "system", return_value="Windows"): + windows_instructions = prompt_strings._build_tool_instructions() + with patch.object(prompt_strings.platform, "system", return_value="Darwin"): + darwin_instructions = prompt_strings._build_tool_instructions() + + assert windows_instructions == darwin_instructions + assert "PowerShell" not in windows_instructions + assert "must explicitly specify encoding" not in windows_instructions + + def test_tool_instructions_do_not_hardcode_tool_name_mapping(self): + instructions = prompt_strings._build_tool_instructions() + + assert "callable schema" in instructions + assert "Read files: use the 'read' tool" not in instructions + assert "Run commands: use the 'bash' tool" not in instructions + assert "Search code: use the 'grep' tool" not in instructions # --------------------------------------------------------------------------- diff --git 
a/tests/session/test_runner_step.py b/tests/session/test_runner_step.py index d3bff186..d81be6c4 100644 --- a/tests/session/test_runner_step.py +++ b/tests/session/test_runner_step.py @@ -22,6 +22,7 @@ StepResult, ToolCall, ) +from flocks.session.prompt import SessionPrompt from flocks.session.session import SessionInfo from flocks.tool.registry import ToolCategory, ToolInfo @@ -52,6 +53,13 @@ def _make_runner(session_id="ses_runner_test"): return SessionRunner(session=session) +def _make_callable_schema_result(*tool_names): + return SimpleNamespace( + tool_infos=[SimpleNamespace(name=name) for name in tool_names], + metadata={}, + ) + + # --------------------------------------------------------------------------- # ToolCall dataclass # --------------------------------------------------------------------------- @@ -453,27 +461,77 @@ async def test_build_system_prompts_reuses_loop_static_cache(self): agent = _make_agent(name="rex") agent.prompt = "agent prompt" - env_mock = AsyncMock(return_value=["env prompt"]) + env_mock = MagicMock(return_value=["env prompt"]) + runtime_mock = MagicMock(return_value=["runtime prompt"]) custom_mock = AsyncMock(return_value=["custom prompt"]) sandbox_mock = AsyncMock(return_value="sandbox prompt") channel_mock = AsyncMock(return_value="channel prompt") - with patch("flocks.session.runner.SystemPrompt.provider", return_value=["provider prompt"]), \ - patch("flocks.session.runner.SystemPrompt.environment", env_mock), \ - patch("flocks.session.runner.SystemPrompt.custom", custom_mock), \ - patch.object(SessionRunner, "_build_sandbox_prompt", sandbox_mock), \ - patch.object(SessionRunner, "_build_channel_context_prompt", channel_mock), \ - patch.object(SessionRunner, "_get_tool_instructions", return_value="tool instructions"), \ - patch.object(SessionRunner, "_build_tool_catalog_prompt", return_value="tool catalog"): - prompts1 = await runner1._build_system_prompts(agent) - prompts2 = await runner2._build_system_prompts(agent) + with 
patch("flocks.session.prompt.SystemPrompt.provider", return_value=["provider prompt"]), \ + patch("flocks.session.prompt.SystemPrompt.environment_stable", env_mock), \ + patch("flocks.session.prompt.SystemPrompt.runtime_metadata", runtime_mock), \ + patch("flocks.session.prompt.SystemPrompt.custom", custom_mock): + prompts1 = await SessionPrompt.build_system_prompts( + session_id=session.id, + session_directory=session.directory, + agent_name=agent.name, + agent_prompt=agent.prompt, + provider_id=runner1.provider_id, + model_id=runner1.model_id, + prompt_tool_names=("read",), + tool_revision=1, + static_cache=shared_cache, + sandbox_prompt_factory=sandbox_mock, + channel_context_prompt_factory=channel_mock, + tool_catalog_prompt_factory=lambda: "tool catalog", + ) + prompts2 = await SessionPrompt.build_system_prompts( + session_id=session.id, + session_directory=session.directory, + agent_name=agent.name, + agent_prompt=agent.prompt, + provider_id=runner2.provider_id, + model_id=runner2.model_id, + prompt_tool_names=("read",), + tool_revision=1, + static_cache=shared_cache, + sandbox_prompt_factory=sandbox_mock, + channel_context_prompt_factory=channel_mock, + tool_catalog_prompt_factory=lambda: "tool catalog", + ) assert prompts1 == prompts2 - env_mock.assert_awaited_once() + env_mock.assert_called_once() + runtime_mock.assert_called_once() custom_mock.assert_awaited_once() sandbox_mock.assert_awaited_once() channel_mock.assert_awaited_once() + @pytest.mark.asyncio + async def test_build_system_prompts_orders_stable_prefix_before_runtime_tail(self): + session = _make_session("ses_prompts_order") + runner = SessionRunner(session=session) + agent = _make_agent(name="rex") + agent.prompt = "agent prompt" + + with patch("flocks.session.prompt.SystemPrompt.provider", return_value=["provider prompt"]), \ + patch("flocks.session.prompt.SystemPrompt.environment_stable", return_value=["env prompt"]), \ + patch("flocks.session.prompt.SystemPrompt.runtime_metadata", 
return_value=["runtime prompt"]), \ + patch("flocks.session.prompt.SystemPrompt.custom", AsyncMock(return_value=["custom prompt"])): + prompts = await SessionPrompt.build_system_prompts( + session_id=session.id, + session_directory=session.directory, + agent_name=agent.name, + agent_prompt=agent.prompt, + provider_id=runner.provider_id, + model_id=runner.model_id, + prompt_tool_names=("read",), + ) + + assert prompts.index("agent prompt") < prompts.index("env prompt") + assert prompts.index("custom prompt") < prompts.index("runtime prompt") + assert prompts[-1] == "runtime prompt" + @pytest.mark.asyncio async def test_build_system_prompts_rebuilds_when_tool_revision_changes(self): shared_cache = {} @@ -482,30 +540,257 @@ async def test_build_system_prompts_rebuilds_when_tool_revision_changes(self): agent = _make_agent(name="rex") agent.prompt = "agent prompt v1" - env_mock = AsyncMock(return_value=["env prompt"]) + env_mock = MagicMock(return_value=["env prompt"]) + runtime_mock = MagicMock(return_value=["runtime prompt"]) custom_mock = AsyncMock(return_value=["custom prompt"]) sandbox_mock = AsyncMock(return_value="sandbox prompt") channel_mock = AsyncMock(return_value="channel prompt") - with patch("flocks.session.runner.ToolRegistry.revision", side_effect=[1, 2]), \ - patch("flocks.session.runner.SystemPrompt.provider", return_value=["provider prompt"]), \ - patch("flocks.session.runner.SystemPrompt.environment", env_mock), \ - patch("flocks.session.runner.SystemPrompt.custom", custom_mock), \ - patch.object(SessionRunner, "_build_sandbox_prompt", sandbox_mock), \ - patch.object(SessionRunner, "_build_channel_context_prompt", channel_mock), \ - patch.object(SessionRunner, "_get_tool_instructions", return_value="tool instructions"), \ - patch.object(SessionRunner, "_build_tool_catalog_prompt", side_effect=["tool catalog v1", "tool catalog v2"]): - prompts1 = await runner._build_system_prompts(agent) + catalog_prompts = iter(["tool catalog v1", "tool catalog v2"]) 
+ + with patch("flocks.session.prompt.SystemPrompt.provider", return_value=["provider prompt"]), \ + patch("flocks.session.prompt.SystemPrompt.environment_stable", env_mock), \ + patch("flocks.session.prompt.SystemPrompt.runtime_metadata", runtime_mock), \ + patch("flocks.session.prompt.SystemPrompt.custom", custom_mock): + prompts1 = await SessionPrompt.build_system_prompts( + session_id=session.id, + session_directory=session.directory, + agent_name=agent.name, + agent_prompt=agent.prompt, + provider_id=runner.provider_id, + model_id=runner.model_id, + prompt_tool_names=("read",), + tool_revision=1, + static_cache=shared_cache, + sandbox_prompt_factory=sandbox_mock, + channel_context_prompt_factory=channel_mock, + tool_catalog_prompt_factory=lambda: next(catalog_prompts), + ) + agent.prompt = "agent prompt v2" + prompts2 = await SessionPrompt.build_system_prompts( + session_id=session.id, + session_directory=session.directory, + agent_name=agent.name, + agent_prompt=agent.prompt, + provider_id=runner.provider_id, + model_id=runner.model_id, + prompt_tool_names=("read",), + tool_revision=2, + static_cache=shared_cache, + sandbox_prompt_factory=sandbox_mock, + channel_context_prompt_factory=channel_mock, + tool_catalog_prompt_factory=lambda: next(catalog_prompts), + ) + + assert prompts1 != prompts2 + assert "agent prompt v1" in prompts1 + assert "agent prompt v2" in prompts2 + assert "tool catalog v1" in prompts1 + assert "tool catalog v2" in prompts2 + env_mock.assert_called_once() + runtime_mock.assert_called_once() + custom_mock.assert_awaited_once() + sandbox_mock.assert_awaited_once() + channel_mock.assert_awaited_once() + + @pytest.mark.asyncio + async def test_build_system_prompts_rebuilds_when_agent_prompt_changes(self): + shared_cache = {} + session = _make_session("ses_prompts_agent_prompt") + runner = SessionRunner(session=session, static_cache=shared_cache) + agent = _make_agent(name="rex") + agent.prompt = "agent prompt v1" + + env_mock = 
MagicMock(return_value=["env prompt"]) + runtime_mock = MagicMock(return_value=["runtime prompt"]) + custom_mock = AsyncMock(return_value=["custom prompt"]) + + with patch("flocks.session.prompt.SystemPrompt.provider", return_value=["provider prompt"]), \ + patch("flocks.session.prompt.SystemPrompt.environment_stable", env_mock), \ + patch("flocks.session.prompt.SystemPrompt.runtime_metadata", runtime_mock), \ + patch("flocks.session.prompt.SystemPrompt.custom", custom_mock): + prompts1 = await SessionPrompt.build_system_prompts( + session_id=session.id, + session_directory=session.directory, + agent_name=agent.name, + agent_prompt=agent.prompt, + provider_id=runner.provider_id, + model_id=runner.model_id, + prompt_tool_names=("read",), + tool_revision=1, + static_cache=shared_cache, + ) agent.prompt = "agent prompt v2" - prompts2 = await runner._build_system_prompts(agent) + prompts2 = await SessionPrompt.build_system_prompts( + session_id=session.id, + session_directory=session.directory, + agent_name=agent.name, + agent_prompt=agent.prompt, + provider_id=runner.provider_id, + model_id=runner.model_id, + prompt_tool_names=("read",), + tool_revision=1, + static_cache=shared_cache, + ) assert prompts1 != prompts2 assert "agent prompt v1" in prompts1 assert "agent prompt v2" in prompts2 - assert env_mock.await_count == 2 - assert custom_mock.await_count == 2 - assert sandbox_mock.await_count == 2 - assert channel_mock.await_count == 2 + env_mock.assert_called_once() + runtime_mock.assert_called_once() + custom_mock.assert_awaited_once() + + @pytest.mark.asyncio + async def test_build_system_prompts_includes_memory_guidance_when_memory_tools_loaded(self): + session = _make_session("ses_prompts_memory_guidance") + runner = SessionRunner( + session=session, + memory_bootstrap_data={ + "instructions": "memory guidance", + "main_memory": { + "path": "MEMORY.md", + "content": "remembered context", + "inject": True, + }, + }, + ) + agent = _make_agent(name="rex") + 
agent.prompt = "agent prompt" + + with patch("flocks.session.prompt.SystemPrompt.provider", return_value=["provider prompt"]), \ + patch("flocks.session.prompt.SystemPrompt.environment_stable", return_value=["env prompt"]), \ + patch("flocks.session.prompt.SystemPrompt.runtime_metadata", return_value=["runtime prompt"]), \ + patch("flocks.session.prompt.SystemPrompt.custom", AsyncMock(return_value=["custom prompt"])): + prompts = await SessionPrompt.build_system_prompts( + session_id=session.id, + session_directory=session.directory, + agent_name=agent.name, + agent_prompt=agent.prompt, + provider_id=runner.provider_id, + model_id=runner.model_id, + prompt_tool_names=("memory_search", "read"), + memory_bootstrap_data=runner._memory_bootstrap_data, + ) + + assert "memory guidance" in "\n\n".join(prompts) + assert "## MEMORY.md\n\nremembered context" in prompts + + @pytest.mark.asyncio + async def test_build_system_prompts_includes_bash_guidance_when_bash_loaded_on_windows(self): + session = _make_session("ses_prompts_bash_guidance") + runner = SessionRunner(session=session) + agent = _make_agent(name="rex") + agent.prompt = "agent prompt" + + with patch("flocks.session.prompt.SystemPrompt.provider", return_value=["provider prompt"]), \ + patch("flocks.session.prompt.SystemPrompt.environment_stable", return_value=["env prompt"]), \ + patch("flocks.session.prompt.SystemPrompt.runtime_metadata", return_value=["runtime prompt"]), \ + patch("flocks.session.prompt.SystemPrompt.custom", AsyncMock(return_value=["custom prompt"])), \ + patch("flocks.session.prompt_strings.platform.system", return_value="Windows"): + prompts = await SessionPrompt.build_system_prompts( + session_id=session.id, + session_directory=session.directory, + agent_name=agent.name, + agent_prompt=agent.prompt, + provider_id=runner.provider_id, + model_id=runner.model_id, + prompt_tool_names=("bash", "read"), + ) + + combined = "\n\n".join(prompts) + assert "## Bash Tool Guidance" in combined + assert 
"must follow PowerShell syntax" in combined + assert "explicit PowerShell commands" in combined + + @pytest.mark.asyncio + async def test_build_system_prompts_skips_memory_guidance_without_memory_tools(self): + session = _make_session("ses_prompts_no_memory_guidance") + runner = SessionRunner( + session=session, + memory_bootstrap_data={ + "instructions": "memory guidance", + "main_memory": { + "path": "MEMORY.md", + "content": "remembered context", + "inject": True, + }, + }, + ) + agent = _make_agent(name="rex") + agent.prompt = "agent prompt" + + with patch("flocks.session.prompt.SystemPrompt.provider", return_value=["provider prompt"]), \ + patch("flocks.session.prompt.SystemPrompt.environment_stable", return_value=["env prompt"]), \ + patch("flocks.session.prompt.SystemPrompt.runtime_metadata", return_value=["runtime prompt"]), \ + patch("flocks.session.prompt.SystemPrompt.custom", AsyncMock(return_value=["custom prompt"])): + prompts = await SessionPrompt.build_system_prompts( + session_id=session.id, + session_directory=session.directory, + agent_name=agent.name, + agent_prompt=agent.prompt, + provider_id=runner.provider_id, + model_id=runner.model_id, + prompt_tool_names=("read",), + memory_bootstrap_data=runner._memory_bootstrap_data, + ) + + assert "memory guidance" not in "\n\n".join(prompts) + assert "## MEMORY.md\n\nremembered context" in prompts + + @pytest.mark.asyncio + async def test_build_system_prompts_rebuilds_when_prompt_tool_names_change(self): + shared_cache = {} + session = _make_session("ses_prompts_tool_names") + runner = SessionRunner( + session=session, + static_cache=shared_cache, + memory_bootstrap_data={ + "instructions": "memory guidance", + "main_memory": None, + }, + ) + agent = _make_agent(name="rex") + agent.prompt = "agent prompt" + + env_mock = MagicMock(return_value=["env prompt"]) + runtime_mock = MagicMock(return_value=["runtime prompt"]) + custom_mock = AsyncMock(return_value=["custom prompt"]) + + with 
patch("flocks.session.prompt.SystemPrompt.provider", return_value=["provider prompt"]), \ + patch("flocks.session.prompt.SystemPrompt.environment_stable", env_mock), \ + patch("flocks.session.prompt.SystemPrompt.runtime_metadata", runtime_mock), \ + patch("flocks.session.prompt.SystemPrompt.custom", custom_mock): + prompts_with_memory = await SessionPrompt.build_system_prompts( + session_id=session.id, + session_directory=session.directory, + agent_name=agent.name, + agent_prompt=agent.prompt, + provider_id=runner.provider_id, + model_id=runner.model_id, + prompt_tool_names=("memory_search", "read"), + tool_revision=1, + memory_bootstrap_data=runner._memory_bootstrap_data, + static_cache=shared_cache, + ) + prompts_without_memory = await SessionPrompt.build_system_prompts( + session_id=session.id, + session_directory=session.directory, + agent_name=agent.name, + agent_prompt=agent.prompt, + provider_id=runner.provider_id, + model_id=runner.model_id, + prompt_tool_names=("read",), + tool_revision=1, + memory_bootstrap_data=runner._memory_bootstrap_data, + static_cache=shared_cache, + ) + + assert prompts_with_memory != prompts_without_memory + assert "memory guidance" in "\n\n".join(prompts_with_memory) + assert "memory guidance" not in "\n\n".join(prompts_without_memory) + env_mock.assert_called_once() + runtime_mock.assert_called_once() + custom_mock.assert_awaited_once() def test_build_tool_catalog_prompt_for_rex(self): runner = _make_runner() @@ -663,16 +948,32 @@ def test_disabled_for_other_models(self): ) assert runner._should_use_text_tool_call_mode() is False - def test_tool_instructions_switch_to_minimax_xml(self): + @pytest.mark.asyncio + async def test_system_prompts_switch_tool_guidance_to_minimax_xml(self): session = _make_session("ses_minimax_prompt") runner = SessionRunner( session=session, provider_id="custom-tb-inner", model_id="minimax:MiniMax-M2.5", ) - instructions = runner._get_tool_instructions() - assert "" in instructions - assert "native API 
tool-calling" in instructions + + with patch("flocks.session.prompt.SystemPrompt.provider", return_value=["provider prompt"]), \ + patch("flocks.session.prompt.SystemPrompt.environment", AsyncMock(return_value=["env prompt"])), \ + patch("flocks.session.prompt.SystemPrompt.custom", AsyncMock(return_value=["custom prompt"])): + prompts = await SessionPrompt.build_system_prompts( + session_id=session.id, + session_directory=session.directory, + agent_name="rex", + agent_prompt=None, + provider_id=runner.provider_id, + model_id=runner.model_id, + prompt_tool_names=("read",), + use_text_tool_call_mode=runner._should_use_text_tool_call_mode(), + ) + + combined = "\n\n".join(prompts) + assert "" in combined + assert "native API tool-calling" in combined def test_build_text_tool_call_catalog_prompt(self): session = _make_session("ses_minimax_catalog") @@ -707,6 +1008,50 @@ def test_build_text_tool_call_catalog_prompt(self): assert "required" in prompt +@pytest.mark.asyncio +async def test_to_chat_messages_uses_structured_anthropic_system_blocks(monkeypatch): + runner = SessionRunner( + session=_make_session("ses_anthropic_system_blocks"), + provider_id="anthropic", + model_id="claude-sonnet", + ) + message = SimpleNamespace(id="msg_user", role="user", content="hello") + + monkeypatch.setattr(runner_mod.Message, "parts", AsyncMock(return_value=[])) + monkeypatch.setattr(runner_mod.Message, "get_text_content", AsyncMock(return_value="hello")) + + chat_messages = await runner._to_chat_messages( + [message], + ["provider prompt", "agent prompt", "context prompt", "runtime prompt"], + ) + + assert chat_messages[0].role == "system" + assert isinstance(chat_messages[0].content, list) + assert chat_messages[0].content[1]["cache_control"] == {"type": "ephemeral"} + assert chat_messages[0].content[-1]["text"] == "runtime prompt" + + +@pytest.mark.asyncio +async def test_to_chat_messages_keeps_joined_system_prompt_for_openai(monkeypatch): + runner = SessionRunner( + 
session=_make_session("ses_openai_system_blocks"), + provider_id="openai", + model_id="gpt-5", + ) + message = SimpleNamespace(id="msg_user", role="user", content="hello") + + monkeypatch.setattr(runner_mod.Message, "parts", AsyncMock(return_value=[])) + monkeypatch.setattr(runner_mod.Message, "get_text_content", AsyncMock(return_value="hello")) + + chat_messages = await runner._to_chat_messages( + [message], + ["provider prompt", "agent prompt"], + ) + + assert chat_messages[0].role == "system" + assert chat_messages[0].content == "provider prompt\n\nagent prompt" + + @pytest.mark.asyncio async def test_process_step_creates_assistant_message_with_provider_and_model(monkeypatch): runner = _make_runner("ses_runner_provider_model") @@ -736,7 +1081,8 @@ async def fake_create(*args, **kwargs): monkeypatch.setattr(runner_mod.Agent, "get", AsyncMock(return_value=agent)) monkeypatch.setattr(runner_mod.Provider, "get", lambda provider_id: provider) monkeypatch.setattr(runner_mod.Provider, "apply_config", AsyncMock(return_value=None)) - monkeypatch.setattr(runner, "_build_system_prompts", AsyncMock(return_value=[])) + monkeypatch.setattr(runner, "_get_prompt_tool_names", AsyncMock(return_value=())) + monkeypatch.setattr(runner_mod.SessionPrompt, "build_system_prompts", AsyncMock(return_value=[])) monkeypatch.setattr(runner, "_build_callable_tool_schema", AsyncMock(return_value=[])) monkeypatch.setattr( runner, @@ -778,7 +1124,8 @@ async def test_process_step_records_usage_after_success(monkeypatch): monkeypatch.setattr(runner_mod.Agent, "get", AsyncMock(return_value=agent)) monkeypatch.setattr(runner_mod.Provider, "get", lambda provider_id: provider) monkeypatch.setattr(runner_mod.Provider, "apply_config", AsyncMock(return_value=None)) - monkeypatch.setattr(runner, "_build_system_prompts", AsyncMock(return_value=[])) + monkeypatch.setattr(runner, "_get_prompt_tool_names", AsyncMock(return_value=())) + monkeypatch.setattr(runner_mod.SessionPrompt, "build_system_prompts", 
AsyncMock(return_value=[])) monkeypatch.setattr(runner, "_build_callable_tool_schema", AsyncMock(return_value=[])) monkeypatch.setattr( runner, @@ -830,7 +1177,8 @@ async def test_process_step_empty_retry_records_usage_per_attempt(monkeypatch): monkeypatch.setattr(runner_mod.Agent, "get", AsyncMock(return_value=agent)) monkeypatch.setattr(runner_mod.Provider, "get", lambda provider_id: provider) monkeypatch.setattr(runner_mod.Provider, "apply_config", AsyncMock(return_value=None)) - monkeypatch.setattr(runner, "_build_system_prompts", AsyncMock(return_value=[])) + monkeypatch.setattr(runner, "_get_prompt_tool_names", AsyncMock(return_value=())) + monkeypatch.setattr(runner_mod.SessionPrompt, "build_system_prompts", AsyncMock(return_value=[])) monkeypatch.setattr(runner, "_build_callable_tool_schema", AsyncMock(return_value=[])) monkeypatch.setattr( runner, diff --git a/tests/session/test_session_runner_tool_only_message.py b/tests/session/test_session_runner_tool_only_message.py index 0d3a1193..28e879e4 100644 --- a/tests/session/test_session_runner_tool_only_message.py +++ b/tests/session/test_session_runner_tool_only_message.py @@ -4,6 +4,7 @@ from flocks.agent.registry import Agent from flocks.provider.provider import ChatMessage, Provider from flocks.session.message import Message, MessageRole, ToolPart, ToolStateCompleted +from flocks.session.prompt import SessionPrompt from flocks.session.runner import SessionRunner, StepResult from flocks.session.session import Session from flocks.utils.id import Identifier @@ -76,7 +77,12 @@ async def fake_agent_get(name: str): sentinel_tools = [{"type": "function", "function": {"name": "write", "description": "", "parameters": {}}}] captured = {} - async def fake_build_system_prompts(self, agent): # noqa: ANN001 + async def fake_get_prompt_tool_names(self, agent): # noqa: ANN001 + del self, agent + return () + + async def fake_build_system_prompts(*args, **kwargs): # noqa: ANN002, ANN003 + del args, kwargs return [] async 
def fake_build_callable_tool_schema(self, agent, messages=None): # noqa: ANN001 @@ -93,7 +99,8 @@ async def fake_call_llm(self, provider, messages, tools, agent, assistant_msg): monkeypatch.setattr(Provider, "get", lambda _provider_id: DummyProvider()) monkeypatch.setattr(Provider, "apply_config", fake_apply_config) monkeypatch.setattr(Agent, "get", fake_agent_get) - monkeypatch.setattr(SessionRunner, "_build_system_prompts", fake_build_system_prompts) + monkeypatch.setattr(SessionRunner, "_get_prompt_tool_names", fake_get_prompt_tool_names) + monkeypatch.setattr(SessionPrompt, "build_system_prompts", fake_build_system_prompts) monkeypatch.setattr(SessionRunner, "_build_callable_tool_schema", fake_build_callable_tool_schema) monkeypatch.setattr(SessionRunner, "_to_chat_messages", fake_to_chat_messages) monkeypatch.setattr(SessionRunner, "_call_llm", fake_call_llm) From d3afade88b9dea372e0784101a9e668461e13c09 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Tue, 12 May 2026 15:25:53 +0800 Subject: [PATCH 2/4] refactor(rex): streamline prompt builder; ship flocks_mcp in core - Relocate flocks_mcp from ~/.flocks plugin path to flocks/tool/system. - Rex prompt_builder and agent.yaml/toolset updates; agent_factory hooks. - Session prompt/runner and prompt_strings tweaks; registry and skill wiring. - Refresh AGENTS.md; extend tests for factory, runner, toolset, builtins. 
Co-authored-by: Cursor --- AGENTS.md | 21 +- flocks/agent/agent_factory.py | 18 +- flocks/agent/agents/rex/agent.yaml | 21 +- flocks/agent/agents/rex/prompt_builder.py | 611 +++++------------- flocks/agent/toolset.py | 21 + flocks/session/prompt.py | 38 +- flocks/session/prompt_strings.py | 4 +- flocks/session/runner.py | 48 +- flocks/tool/registry.py | 4 +- .../tool/system}/flocks_mcp.py | 175 +++-- flocks/tool/system/skill.py | 1 + tests/agent/test_agent_factory.py | 26 + .../integration/test_capability_awareness.py | 36 +- tests/session/test_runner_step.py | 99 ++- tests/tool/test_agent_toolset.py | 46 ++ tests/tool/test_builtin_management_tools.py | 21 + 16 files changed, 559 insertions(+), 631 deletions(-) rename {.flocks/plugins/tools/python => flocks/tool/system}/flocks_mcp.py (69%) create mode 100644 tests/tool/test_builtin_management_tools.py diff --git a/AGENTS.md b/AGENTS.md index e0283b0a..877b39a3 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -14,25 +14,6 @@ | LLM 中间推理落盘(workflow 节点内) | `~/.flocks/workspace/outputs//artifacts/` | | 临时调试/草稿文件 | `/tmp/` | -### 代码示例(workflow 节点 / Python 脚本) - -```python -import os, datetime -from flocks.workspace.manager import WorkspaceManager - -# 在执行时动态取当日日期,不依赖 session 启动时的注入值 -ws = WorkspaceManager.get_instance() -output_dir = str(ws.get_workspace_dir() / 'outputs' / datetime.date.today().isoformat()) -os.makedirs(output_dir, exist_ok=True) - -# 写报告 -tool.run('write', filePath=os.path.join(output_dir, 'final_report.md'), content=report) -# 写 LLM 中间输出 -artifacts_dir = os.path.join(output_dir, 'artifacts') -os.makedirs(artifacts_dir, exist_ok=True) -tool.run('write', filePath=os.path.join(artifacts_dir, 'payload_analysis.md'), content=llm_output) -``` - ### 何时可以使用其他路径 - 用户在 prompt 中**明确指定**了输出路径(优先尊重用户指定) @@ -169,4 +150,4 @@ Rex has a dedicated `flocks_skills` tool for managing agent skills. 
## Important - 涉及 `tdp`、`onesec`、`skyeye`、`qingteng` 的任务时,必须先读取并遵循对应的 skill。 -- 对上述系统,禁止绕过对应 skill 直接调用相关 tools;也不要直接使用 `agent-browser`。 \ No newline at end of file +- 对上述系统,禁止绕过对应 skill 直接调用相关 tools;也不要直接使用 `browser`。 \ No newline at end of file diff --git a/flocks/agent/agent_factory.py b/flocks/agent/agent_factory.py index 6d6f4851..a8b5bc45 100644 --- a/flocks/agent/agent_factory.py +++ b/flocks/agent/agent_factory.py @@ -6,7 +6,9 @@ Resolves each agent to a concrete ``tools`` list. If legacy ``permission`` is present, it is expanded against the current tool registry for compatibility. If neither ``tools`` nor ``permission`` is declared, the static tool list stays -empty and runtime exposure falls back to always-load tools only. +empty and runtime exposure falls back to always-load tools only. ``rex`` is the +special case: an explicitly empty ``tools: []`` expands to all enabled built-in +tools so the primary orchestrator keeps broad native capabilities by default. Extension point: Built-in agents: flocks/agent/agents// native=True @@ -85,6 +87,8 @@ def load_agent(agent_dir: Path, native: bool = False) -> Optional[AgentInfo]: Permission is generated from the ``tools`` list in agent.yaml. If ``tools`` is absent, the agent keeps an empty static tool list and only runtime always-load tools remain available until the session expands them. + ``rex`` is the exception: an explicit ``tools: []`` expands to all enabled + built-in tools. Args: agent_dir: Path to the agent folder. 
@@ -133,7 +137,11 @@ def load_agent(agent_dir: Path, native: bool = False) -> Optional[AgentInfo]: # ── Tools / legacy permission compatibility ───────────────────────────── tools_list_raw: Optional[List[str]] = raw.get("tools") perm_raw = raw.get("permission") - tools_list, legacy_permission = resolve_agent_initial_tools(tools_list_raw, perm_raw) + tools_list, legacy_permission = resolve_agent_initial_tools( + tools_list_raw, + perm_raw, + agent_name=name, + ) # ── Model ──────────────────────────────────────────────────────────────── model_raw = raw.get("model") @@ -342,7 +350,11 @@ def yaml_to_agent_info(raw: dict, yaml_path: Path) -> AgentInfo: # Tools: prefer new tools list; fall back to old permission dict tools_list_raw: Optional[List[str]] = raw.get("tools") perm_raw = raw.get("permission") - tools_list, legacy_permission = resolve_agent_initial_tools(tools_list_raw, perm_raw) + tools_list, legacy_permission = resolve_agent_initial_tools( + tools_list_raw, + perm_raw, + agent_name=name, + ) desc_cn = raw.get("description_cn") if desc_cn is None and isinstance(raw.get("descriptionCn"), str): diff --git a/flocks/agent/agents/rex/agent.yaml b/flocks/agent/agents/rex/agent.yaml index 45f87ef2..c10003b2 100644 --- a/flocks/agent/agents/rex/agent.yaml +++ b/flocks/agent/agents/rex/agent.yaml @@ -6,24 +6,5 @@ mode: primary hidden: false color: "#00CED1" delegatable: false -tools: - - read - - list - - glob - - grep - - edit - - write - - bash - - skill - - delegate_task - - task - - todoread - - todowrite - - run_workflow - - run_workflow_node - - background_output - - background_cancel - - session_list - - channel_message - - tool_search +tools: [] # prompt is left empty – injected dynamically by prompt_builder.py diff --git a/flocks/agent/agents/rex/prompt_builder.py b/flocks/agent/agents/rex/prompt_builder.py index 51e0fa2a..029fcf88 100644 --- a/flocks/agent/agents/rex/prompt_builder.py +++ b/flocks/agent/agents/rex/prompt_builder.py @@ -48,24 +48,19 @@ def 
build_dynamic_rex_prompt( from flocks.agent.prompt_utils import ( build_agent_selection_table, build_key_triggers_section, - build_explore_section, - build_librarian_section, - build_oracle_section, - build_hard_blocks_section, + build_workflows_section, build_anti_patterns_section, ) - _ = available_tools, available_categories, available_workflows + _ = available_tools, available_categories key_triggers = build_key_triggers_section(available_agents, available_skills) agent_selection = build_agent_selection_table(available_agents) + skills_section = _build_rex_skills_section(available_skills) + workflows_section = build_workflows_section(available_workflows or []) security_priority = _build_security_priority_section(available_agents) im_send_section = _build_im_send_section() - explore_section = build_explore_section(available_agents) - librarian_section = build_librarian_section(available_agents) - oracle_section = build_oracle_section(available_agents) - hard_blocks = build_hard_blocks_section() - anti_patterns = build_anti_patterns_section() + anti_patterns = _build_rex_anti_patterns_section() slash_commands_section = _build_slash_commands_section() task_management_section = _task_management_section(use_task_system) todo_hook_note = ( @@ -77,25 +72,18 @@ def build_dynamic_rex_prompt( template = """ You are "Rex" - Powerful AI orchestrator for security operations. -**Why Rex?**: Humans roll their boulder every day. So do you. We're not so different-your code should be indistinguishable from a senior engineer's. - -**Identity**: SF Bay Area engineer. Work, delegate, verify, ship. No AI slop. - -**Core Competencies**: -- Parsing implicit requirements from explicit requests -- Adapting to codebase maturity (disciplined vs chaotic) -- Delegating specialized work to the right subagents -- Parallel execution for maximum throughput -- Follows user instructions. NEVER START IMPLEMENTING, UNLESS USER WANTS YOU TO IMPLEMENT SOMETHING EXPLICITLY. 
- - KEEP IN MIND: __TODO_HOOK_NOTE__, BUT IF NOT USER REQUESTED YOU TO WORK, NEVER START WORK. -- Your response should always be consistent with the user's language. - -**Operating Mode**: Execute simple, single-step work directly when a clear tool path exists. Delegate when specialist context, deep analysis, or parallel exploration will materially improve the result. Frontend work often benefits from delegation. Deep research -> parallel background agents (async subagents). Complex architecture -> consult Oracle. +**Identity**: Senior engineer. Work, delegate, verify, ship. No AI slop. +**Operating Principles**: +- Follow the user's intent and language. +- NEVER start implementing unless the user explicitly wants execution. +- Keep in mind: __TODO_HOOK_NOTE__. If the user only wants analysis or planning, do not start work. +- Prefer direct execution for simple, single-step tasks with a clear tool path. +- Delegate when specialist context, deep analysis, or parallelism clearly improves quality. 
- -## Phase 0 - Intent Gate (EVERY message) + +## Intent Gate __KEY_TRIGGERS__ @@ -103,319 +91,107 @@ def build_dynamic_rex_prompt( __IM_SEND_SECTION__ -### Step 1: Classify Request Type +### Request Classification -| Type | Signal | Action | -|------|--------|--------| -| **Trivial** | Single file, known location, direct answer | Direct tools only (UNLESS Key Trigger applies) | -| **Explicit** | Specific file/line, clear command | Execute directly | -| **Exploratory** | "How does X work?", "Find Y" | Fire explore (1-3) + tools in parallel | -| **Open-ended** | "Improve", "Refactor", "Add feature" | Assess codebase first | -| **Ambiguous** | Unclear scope, multiple interpretations | Ask ONE clarifying question | +| Type | Signal | Default Action | +|------|--------|----------------| +| **Trivial** | Single file, known location, direct answer | Direct tools | +| **Explicit** | Specific file or command | Execute directly | +| **Exploratory** | "How does X work?", "Find Y" | Explore first, then act | +| **Open-ended** | "Improve", "Refactor", "Add feature" | Explore, plan, then execute | +| **Ambiguous** | Multiple valid interpretations | Ask one focused question | -### Step 2: Check for Ambiguity +### Ambiguity Rules | Situation | Action | |-----------|--------| | Single valid interpretation | Proceed | -| Multiple interpretations, similar effort | Proceed with reasonable default, note assumption | -| Multiple interpretations, 2x+ effort difference | **MUST ask** | -| Missing critical info (file, error, context) | **MUST ask** | -| User's design seems flawed or suboptimal | **MUST raise concern** before implementing | - -### Step 3: Validate Before Acting +| Multiple interpretations, similar effort | Proceed with a reasonable default and state it briefly | +| Multiple interpretations with materially different scope or effort | Ask | +| Missing critical file, error, or environment context | Ask | +| User approach seems flawed | Raise the concern before implementing | 
-**Assumptions Check:** -- Do I have any implicit assumptions that might affect the outcome? -- Is the search scope clear? +### Available Specialists __AGENT_SELECTION__ -**Direct Tool Check (MANDATORY before delegating):** -1. Is this a simple, single-step request that I can complete with direct tools? -2. Is there a clear tool path now, or a short `tool_search` -> tool-call path, without needing specialist judgment? -3. For single IOC lookups (one IP / domain / URL / hash) that only need basic threat-intelligence results, prefer direct lookup instead of delegation. -4. If yes, execute directly. Do NOT delegate just because a matching specialist exists. - -**Delegation Check (MANDATORY before acting directly):** -1. Is there a specialized agent that perfectly matches this request? -2. If not, should I use `delegate_task(category=...)` for a generic execution path, or continue with direct tools? - - If delegating by `category=...`, load only skills that are clearly relevant to the task. - - If delegating by `subagent_type=...`, `load_skills` may be omitted unless a specific skill is clearly needed. - - If you are unsure whether something is a subagent, category, or tool, use `tool_search` first instead of guessing. -3. Does this request require specialist judgment, multi-step investigation, attribution, correlation, batching, or a structured expert report? - -**Default Bias: Direct execution for super simple and single-step tasks. Delegate when specialization clearly improves quality or efficiency.** - -### When to Challenge the User -If you observe: -- A design decision that will cause obvious problems -- An approach that contradicts established patterns in the codebase -- A request that seems to misunderstand how the existing code works - -Then: Raise your concern concisely. Propose an alternative. Ask if they want to proceed anyway. - -``` -I notice [observation]. This might cause [problem] because [reason]. -Alternative: [your suggestion]. 
-Should I proceed with your original request, or try the alternative? -``` - -### Visual / Image Input Handling -You may receive images as multimodal `image_url` content blocks attached to a user message. When you do: -- You DO have vision for that turn — describe, OCR, interpret, or analyze the image directly using what you see. Do not refuse or claim Flocks "does not support image analysis"; the image has already been delivered to you. -- Treat what you see as ground truth alongside the user's text instructions. -- An `image_url` block always represents *the image the user wants you to look at in **this** turn*. -- Do NOT confuse the current image(s) with anything from earlier turns. Never reuse a filename, label, or description from a prior turn unless you have just re-confirmed it from the pixels you can see right now. - -**Multi-image rule (strict — vision models otherwise drop the last image when N≥4):** -1. Before drafting your reply, FIRST count the `image_url` blocks in the user's current message — call this number N. -2. Begin your response with an opener that explicitly states the count, e.g. `您发送了 N 张图片,逐一解读如下:` (or `I will analyze all N images one by one:`). Anchoring N up front prevents the model from stopping early. -3. Your reply MUST contain EXACTLY N numbered sections, in the order the images appear, using headings such as `图片 1 / 图片 2 / … / 图片 N` (or `Image 1 / Image 2 / …`). Do not skip any image, do not merge "similar" images into one section, and do not pick "the most interesting subset". -4. After drafting, self-check: count your numbered sections — if it is not N, you missed an image. Add the missing section(s) before finalizing. - -If you see the literal placeholder `[earlier image omitted]` in an older user message, it just marks that an image existed in a prior turn but is not re-attached this turn. Treat it as opaque — you cannot re-inspect it. 
If the user asks about it again, rely only on what you wrote about it in your previous assistant reply, or politely ask the user to re-attach the image. - -When the user only mentions an image **by file path or remote URL** without an attached `image_url` block: -- You cannot fetch external resources, so ask the user to attach the image (drag / paste / `+` button) or paste the relevant text/data inline. - ---- - -## Phase 1 - Exploration & Research - -__EXPLORE_SECTION__ - -__LIBRARIAN_SECTION__ - -### Execution (DEFAULT behavior — synchronous) - -**Explore/Librarian = Grep, not consultants. - -```typescript -// CORRECT: Synchronous by default (run_in_background defaults to false, can be omitted) -// Prompt structure: [CONTEXT: what I'm doing] + [GOAL: what I'm trying to achieve] + [QUESTION: what I need to know] + [REQUEST: what to find] -// Contextual Grep (internal) -delegate_task(subagent_type="explore", prompt="I'm implementing user authentication for our API. I need to understand how auth is currently structured in this codebase. Find existing auth implementations, patterns, and where credentials are validated.") -delegate_task(subagent_type="explore", prompt="I'm adding error handling to the auth flow. I want to follow existing project conventions for consistency. Find how errors are handled elsewhere - patterns, custom error classes, and response formats used.") -// Reference Grep (external) -delegate_task(subagent_type="librarian", prompt="I'm implementing JWT-based auth and need to ensure security best practices. Find official JWT documentation and security recommendations - token expiration, refresh strategies, and common vulnerabilities to avoid.") -delegate_task(subagent_type="librarian", prompt="I'm building Express middleware for auth and want production-quality patterns. 
Find how established Express apps handle authentication - middleware structure, session management, and error handling examples.") - -// OPTIONAL: Use run_in_background=true only when you explicitly need async parallel execution -delegate_task(subagent_type="explore", run_in_background=true, prompt="...") -// Collect with background_output when needed. -``` - -### Background Result Collection (only when run_in_background=true): -1. Launch parallel agents -> receive task_ids -2. Continue immediate work -3. When results needed: `background_output(task_id="...")` -4. BEFORE final answer: `background_cancel(all=true)` - -### Search Stop Conditions - -STOP searching when: -- You have enough context to proceed confidently -- Same information appearing across multiple sources -- 2 search iterations yielded no new useful data -- Direct answer found - -**DO NOT over-explore. Time is precious.** - ---- - -## Phase 2 - Implementation - -### Pre-Implementation: -1. If task has 2+ steps -> Create todo list IMMEDIATELY, IN SUPER DETAIL. No announcements-just create it. -2. Mark current task `in_progress` before starting -3. Mark `completed` as soon as done (don't batch) - OBSESSIVELY TRACK YOUR WORK USING TODO TOOLS - -### Delegation Prompt Structure (MANDATORY - ALL 4 sections): - -When delegating, your prompt MUST include: - -``` -1. TASK: Atomic, specific goal (one action per delegation) -2. OUTPUT: Concrete deliverables with success criteria -3. CONSTRAINTS: Must-do / must-not-do requirements that matter for correctness -4. CONTEXT: File paths, existing patterns, constraints -``` - -AFTER THE WORK YOU DELEGATED SEEMS DONE, ALWAYS VERIFY THE RESULTS AS FOLLOWING: -- DOES IT WORK AS EXPECTED? -- DOES IT FOLLOWED THE EXISTING CODEBASE PATTERN? -- EXPECTED RESULT CAME OUT? -- DID THE AGENT FOLLOWED "MUST DO" AND "MUST NOT DO" REQUIREMENTS? +__SKILLS_SECTION__ -**Vague prompts = rejected. Be exhaustive.** +__WORKFLOWS_SECTION__ + -### Session Continuity (MANDATORY) + +## 1. 
Understand -Every `delegate_task()` output includes a session_id. **USE IT.** +- Parse explicit requirements and implicit constraints before acting. +- If the user attached images in the current turn, analyze them directly instead of refusing. +- If the request conflicts with the codebase or is likely to cause obvious problems, state the concern and propose an alternative. -**ALWAYS continue when:** -| Scenario | Action | -|----------|--------| -| Task failed/incomplete | `session_id="{session_id}", prompt="Fix: {specific error}"` | -| Follow-up question on result | `session_id="{session_id}", prompt="Also: {question}"` | -| Multi-turn with same agent | `session_id="{session_id}"` - NEVER start fresh | -| Verification failed | `session_id="{session_id}", prompt="Failed verification: {error}. Fix."` | +## 2. Path Selection -**Why session_id is CRITICAL:** -- Subagent has FULL conversation context preserved -- No repeated file reads, exploration, or setup -- Saves 70%+ tokens on follow-ups -- Subagent knows what it already tried/learned +Use this order every time: +1. **Direct tools first**: if there is a short tool path, execute directly. +2. **Security exception**: for one IOC that only needs basic TI facts, prefer direct lookup. +3. **Delegate when needed**: use specialists for deep investigation, attribution, correlation, batching, external docs, or structured expert output. +4. **Do not guess**: if unsure whether something is a tool, skill, category, or subagent, use `tool_search` first. -```typescript -// WRONG: Starting fresh loses all context -delegate_task(category="quick", load_skills=[], run_in_background=false, prompt="Fix the type error in auth.ts...") +## 3. 
Delegation -// CORRECT: Resume preserves everything -delegate_task(session_id="ses_abc123", prompt="Fix: Type error on line 42") -``` - -**After EVERY delegation, STORE the session_id for potential continuation.** - -### Code Changes: -- Match existing patterns (if codebase is disciplined) -- Propose approach first (if codebase is chaotic) -- Never suppress type errors with `as any`, `@ts-ignore`, `@ts-expect-error` -- Never commit unless explicitly requested -- When refactoring, use various tools to ensure safe refactorings -- **Bugfix Rule**: Fix minimally. NEVER refactor while fixing. - -### Where to Write Files: +Every delegation prompt must include: +- `TASK`: atomic objective +- `OUTPUT`: concrete deliverable with success criteria +- `CONSTRAINTS`: must-do and must-not-do requirements +- `CONTEXT`: relevant files, patterns, prior findings -Your block provides two key directories. Use the correct one for each file: +Reuse `session_id` when follow-up work belongs to the same delegated thread. Do not restart a subagent unless context reuse would hurt quality. -| File type | Which directory from | -|-----------|--------------------------| -| **Agent-generated output** — scripts, reports, examples, analysis results, drafts requested by the user | **Workspace outputs directory** | -| **Project source** — editing/creating Flocks source code, tests, configs that belong to the project | **Source code directory** | +## 4. Execute -**Rules (non-negotiable):** -- User asks "write a hello world / generate an example / summarize to a file" → use the **Workspace outputs directory** from , NEVER the Source code directory -- You are editing/adding a file that belongs to the Flocks project → use the **Source code directory** from +- Match existing codebase patterns when editing. +- Fix bugs minimally; do not refactor during a bugfix unless required. 
+- Keep search bounded: stop when you have enough context, when results repeat, or when direct evidence already answers the question. +- Use parallel background delegation only when you will benefit from independent branches of work. -### Verification: +## 5. Verify -Run `lsp_diagnostics` on changed files at: -- End of a logical task unit -- Before marking a todo item complete -- Before reporting completion to user +- Run `lsp_diagnostics` on changed files before considering the work complete. +- Run relevant build or test commands before finalizing when the affected area has them. +- Verification evidence is mandatory: clean diagnostics, successful commands, or an explicit note about pre-existing failures. +- Verify delegated work against expected behavior, codebase patterns, and any `must-do` / `must-not-do` requirements. -If project has build/test commands, run them at task completion. +## 6. Failure Handling -### Evidence Requirements (task NOT complete without these): - -| Action | Required Evidence | -|--------|-------------------| -| File edit | `lsp_diagnostics` clean on changed files | -| Build command | Exit code 0 | -| Test run | Pass (or explicit note of pre-existing failures) | -| Delegation | Agent result received and verified | - -**NO EVIDENCE = NOT COMPLETE.** - ---- +- Fix root causes, not symptoms. +- Re-verify after each fix attempt. +- Do not shotgun-debug or leave the codebase in a broken state. +- After repeated failed attempts, stop, summarize the blocker, and ask for direction. -## Phase 2C - Failure Recovery +## 7. Output Placement -### When Fixes Fail: - -1. Fix root causes, not symptoms -2. Re-verify after EVERY fix attempt -3. Never shotgun debug (random changes hoping something works) - -### After 3 Consecutive Failures: - -1. **STOP** all further edits immediately -2. **REVERT** to last known working state (git checkout / undo edits) -3. **DOCUMENT** what was attempted and what failed -4. 
**CONSULT** Oracle with full failure context -5. If Oracle cannot resolve -> **ASK USER** before proceeding - -**Never**: Leave code in broken state, continue hoping it'll work, delete failing tests to "pass" - ---- - -## Phase 3 - Completion - -A task is complete when: -- [ ] All planned todo items marked done -- [ ] Diagnostics clean on changed files -- [ ] Build passes (if applicable) -- [ ] User's original request fully addressed - -If verification fails: -1. Fix issues caused by your changes -2. Do NOT fix pre-existing issues unless asked -3. Report: "Done. Note: found N pre-existing lint errors unrelated to my changes." - -### Before Delivering Final Answer: -- Cancel ALL running background tasks: `background_cancel(all=true)` -- This conserves resources and ensures clean workflow completion - - -__ORACLE_SECTION__ +- User-requested reports, drafts, and generated artifacts go to the workspace outputs directory from ``. +- Source changes that belong to the project go to the source code directory from ``. + __TASK_MANAGEMENT_SECTION__ - -## Communication Style - -### Be Concise -- Start work immediately. No acknowledgments ("I'm on it", "Let me...", "I'll start...") -- Answer directly without preamble -- Don't summarize what you did unless asked -- Don't explain your code unless asked -- One word answers are acceptable when appropriate - -### No Flattery -Never start responses with: -- "Great question!" -- "That's a really good idea!" -- "Excellent choice!" -- Any praise of the user's input - -Just respond directly to the substance. - -### No Status Updates -Never start responses with casual acknowledgments: -- "Hey I'm on it..." -- "I'm working on this..." -- "Let me start by..." -- "I'll get to work on..." -- "I'm going to..." - -Just start working. Use todos for progress tracking-that's what they're for. 
- -### When User is Wrong -If the user's approach seems problematic: -- Don't blindly implement it -- Don't lecture or be preachy -- Concisely state your concern and alternative -- Ask if they want to proceed anyway - -### Match User's Style -- If user is terse, be terse -- If user wants detail, provide detail -- Adapt to their communication preference - + +## Style - -__HARD_BLOCKS__ +- Start with substance. No flattery, no filler. +- Be concise unless the user asks for depth. +- Match the user's tone and language. +- If the user's direction seems wrong, state the concern, suggest a better option, and ask whether to proceed anyway. + + __ANTI_PATTERNS__ -## Soft Guidelines +## Additional Guardrails -- Prefer existing libraries over new dependencies -- Prefer small, focused changes over large refactors -- When uncertain about scope, ask -- If a user query matches a skill along with its relevant tools, always load the skill first, then execute tool calls according to the skill’s guidance. +- Prefer existing libraries over new dependencies. +- Prefer small, focused changes over large refactors. +- When uncertain about scope, ask. +- If a user query matches a skill and the relevant tools, load the skill first and follow its guidance. 
__SLASH_COMMANDS__ @@ -424,12 +200,10 @@ def build_dynamic_rex_prompt( prompt = template prompt = prompt.replace("__KEY_TRIGGERS__", key_triggers) prompt = prompt.replace("__AGENT_SELECTION__", agent_selection) + prompt = prompt.replace("__SKILLS_SECTION__", skills_section) + prompt = prompt.replace("__WORKFLOWS_SECTION__", workflows_section) prompt = prompt.replace("__SECURITY_PRIORITY__", security_priority) prompt = prompt.replace("__IM_SEND_SECTION__", im_send_section) - prompt = prompt.replace("__EXPLORE_SECTION__", explore_section) - prompt = prompt.replace("__LIBRARIAN_SECTION__", librarian_section) - prompt = prompt.replace("__ORACLE_SECTION__", oracle_section) - prompt = prompt.replace("__HARD_BLOCKS__", hard_blocks) prompt = prompt.replace("__ANTI_PATTERNS__", anti_patterns) prompt = prompt.replace("__SLASH_COMMANDS__", slash_commands_section) prompt = prompt.replace("__TASK_MANAGEMENT_SECTION__", task_management_section) @@ -437,6 +211,41 @@ def build_dynamic_rex_prompt( return prompt +def _build_rex_skills_section(available_skills: List["AvailableSkill"]) -> str: + """Build a lightweight skills summary for Rex orchestration.""" + if not available_skills: + return "" + + lines = [ + "### Available Skills", + "", + "Load a skill when the task clearly matches its domain expertise.", + "", + ] + for skill in available_skills: + short_desc = (skill.description or "").split(".")[0].strip() or skill.name + lines.append(f"- `{skill.name}`: {short_desc}") + return "\n".join(lines) + + +def _build_rex_anti_patterns_section() -> str: + """Merge hard blocks and anti-patterns into one Rex section.""" + from flocks.agent.prompt_utils import build_anti_patterns_section + + base_section = build_anti_patterns_section() + if not base_section: + return "" + + hard_block_rows = [ + "| **Hard Block** | Type error suppression (`as any`, `@ts-ignore`) |", + "| **Hard Block** | Commit without explicit request |", + "| **Hard Block** | Speculate about unread code |", + "| **Hard 
Block** | Leave code in broken state after failures |", + ] + + return base_section + "\n" + "\n".join(hard_block_rows) + + def _build_slash_commands_section() -> str: """Build a section describing available slash commands for Rex.""" try: @@ -472,49 +281,8 @@ def _build_slash_commands_section() -> str: return "" -def _task_management_section(use_task_system: bool) -> str: - if use_task_system: - return """ -## Task Management (CRITICAL) - -**DEFAULT BEHAVIOR**: Create tasks BEFORE starting any non-trivial task. This is your PRIMARY coordination mechanism. - -### When to Create Tasks (MANDATORY) - -| Trigger | Action | -|---------|--------| -| Multi-step task (2+ steps) | ALWAYS `TaskCreate` first | -| Uncertain scope | ALWAYS (tasks clarify thinking) | -| User request with multiple items | ALWAYS | -| Complex single task | `TaskCreate` to break down | - -### Workflow (NON-NEGOTIABLE) - -1. **IMMEDIATELY on receiving request**: `TaskCreate` to plan atomic steps. - - ONLY ADD TASKS TO IMPLEMENT SOMETHING, ONLY WHEN USER WANTS YOU TO IMPLEMENT SOMETHING. -2. **Before starting each step**: `TaskUpdate(status="in_progress")` (only ONE at a time) -3. **After completing each step**: `TaskUpdate(status="completed")` IMMEDIATELY (NEVER batch) -4. 
**If scope changes**: Update tasks before proceeding - -### Why This Is Non-Negotiable - -- **User visibility**: User sees real-time progress, not a black box -- **Prevents drift**: Tasks anchor you to the actual request -- **Recovery**: If interrupted, tasks enable seamless continuation -- **Accountability**: Each task = explicit commitment - -### Anti-Patterns (BLOCKING) - -| Violation | Why It's Bad | -|-----------|--------------| -| Skipping tasks on multi-step tasks | User has no visibility, steps get forgotten | -| Batch-completing multiple tasks | Defeats real-time tracking purpose | -| Proceeding without marking in_progress | No indication of what you're working on | -| Finishing without completing tasks | Task appears incomplete | - -**FAILURE TO USE TASKS ON NON-TRIVIAL TASKS = INCOMPLETE WORK.** - -### Clarification Protocol (when asking): +def _build_clarification_protocol() -> str: + return """### Clarification Protocol ``` I want to make sure I understand correctly. @@ -528,64 +296,57 @@ def _task_management_section(use_task_system: bool) -> str: **My recommendation**: [suggestion with reasoning] Should I proceed with [recommendation], or would you prefer differently? -``` -""" - - return """ -## Todo Management (CRITICAL) - -**DEFAULT BEHAVIOR**: Create todos BEFORE starting any non-trivial task. This is your PRIMARY coordination mechanism. - -### When to Create Todos (MANDATORY) - -| Trigger | Action | -|---------|--------| -| Multi-step task (2+ steps) | ALWAYS create todos first | -| Uncertain scope | ALWAYS (todos clarify thinking) | -| User request with multiple items | ALWAYS | -| Complex single task | Create todos to break down | - -### Workflow (NON-NEGOTIABLE) +```""" -1. **IMMEDIATELY on receiving request**: `todowrite` to plan atomic steps. - - ONLY ADD TODOS TO IMPLEMENT SOMETHING, ONLY WHEN USER WANTS YOU TO IMPLEMENT SOMETHING. -2. **Before starting each step**: Mark `in_progress` (only ONE at a time) -3. 
**After completing each step**: Mark `completed` IMMEDIATELY (NEVER batch) -4. **If scope changes**: Update todos before proceeding -### Why This Is Non-Negotiable +def _task_management_section(use_task_system: bool) -> str: + title = "Task Management" if use_task_system else "Todo Management" + unit = "tasks" if use_task_system else "todos" + create_action = "`TaskCreate`" if use_task_system else "`todowrite`" + progress_action = ( + '`TaskUpdate(status="in_progress")`' + if use_task_system + else "mark `in_progress`" + ) + complete_action = ( + '`TaskUpdate(status="completed")`' + if use_task_system + else "mark `completed`" + ) + clarification_protocol = _build_clarification_protocol() -- **User visibility**: User sees real-time progress, not a black box -- **Prevents drift**: Todos anchor you to the actual request -- **Recovery**: If interrupted, todos enable seamless continuation -- **Accountability**: Each todo = explicit commitment + return f""" +## {title} -### Anti-Patterns (BLOCKING) +Use {unit} as the primary coordination mechanism for non-trivial execution work. -| Violation | Why It's Bad | -|-----------|--------------| -| Skipping todos on multi-step tasks | User has no visibility, steps get forgotten | -| Batch-completing multiple todos | Defeats real-time tracking purpose | -| Proceeding without marking in_progress | No indication of what you're working on | -| Finishing without completing todos | Task appears incomplete | +### When They Are Mandatory -**FAILURE TO USE TODOS ON NON-TRIVIAL TASKS = INCOMPLETE WORK.** +| Trigger | Action | +|---------|--------| +| Multi-step work (2+ steps) | Create {unit} first | +| Uncertain scope | Create {unit} to structure the work | +| User request with multiple items | Create {unit} first | +| Complex single task | Break it into {unit} | -### Clarification Protocol (when asking): +### Operating Rules -``` -I want to make sure I understand correctly. +1. 
Start with {create_action} before implementation work begins. +2. ONLY add {unit} when the user wants execution, not when they only want analysis or planning. +3. Before each step, {progress_action}. Keep only one item in progress. +4. After each step, {complete_action} immediately. Never batch updates. +5. If scope changes, update the {unit} before continuing. -**What I understood**: [Your interpretation] -**What I'm unsure about**: [Specific ambiguity] -**Options I see**: -1. [Option A] - [effort/implications] -2. [Option B] - [effort/implications] +### Failure Modes -**My recommendation**: [suggestion with reasoning] +| Violation | Why It Breaks the Workflow | +|-----------|----------------------------| +| Skipping {unit} on non-trivial work | The user loses progress visibility and steps get dropped | +| Batch-completing multiple {unit} | Real-time tracking becomes meaningless | +| Proceeding without an in-progress item | It is unclear what is being worked on | +| Finishing without closing items | The work appears incomplete | -Should I proceed with [recommendation], or would you prefer differently? -``` +{clarification_protocol} """ @@ -654,57 +415,21 @@ def _build_security_priority_section(available_agents: List["AvailableAgent"]) - routing_table = "\n".join(rows) agent_names = ", ".join(f"`{a.name}`" for a in security_agents) - return f"""### Security Sub-Agent Priority (Phase 0 — MANDATORY CHECK) + return f"""### Security Routing -**当用户问题涉及网络安全主题时,必须先判断这是“轻量直查”还是“专家研判”。不要一律委派。** +当用户问题涉及网络安全主题时,先判断这是“轻量直查”还是“专家研判”,不要一律委派。 Available security specialists: {agent_names} | 用户意图 | 优先委派 | 触发信号 | |---------|---------|---------| {routing_table} -**⚠️ CRITICAL: Sub-Agent vs Skill — NEVER confuse these two:** - -| Concept | What it is | How to call | -|---------|-----------|-------------| -| **Sub-Agent** (e.g. 
`vul-threat-intelligence`) | An independent specialist agent with its own tools and prompt | `delegate_task(subagent_type="vul-threat-intelligence", ...)` | -| **Skill** (e.g. `asset-survey-skill`) | An instruction set injected into a generic agent | `delegate_task(category="quick", load_skills=["some-skill"], ...)` | - -Security specialists listed above are **Sub-Agents** — use `subagent_type=`. Do not put agent names in `load_skills=[]`. - -**Correct example:** -``` -delegate_task( - subagent_type="vul-threat-intelligence", - description="query OA vulnerabilities", - prompt="...", - run_in_background=false -) -``` - -**WRONG (will fail or produce wrong results):** -``` -delegate_task(category="quick", load_skills=["vul-threat-intelligence"], ...) // ← agent name in load_skills is WRONG -``` - -**Lightweight direct lookup rules (Rex handles directly):** -- Single IOC basic lookup only: one IP, domain, URL, or hash -- User intent is direct querying, checking reputation, or fetching basic TI facts -- No batching, attribution, multi-indicator correlation, campaign analysis, or expert report required -- Prefer: `tool_search` if needed -> direct TI query tool -> answer - -**Mandatory delegation rules (use the specialist):** -- The request needs attribution, correlation, deep analysis, or expert judgment -- The user provides multiple IOCs, alert context, evidence, or asks for a structured security assessment -- The request matches one of the above specialist domains beyond a single direct lookup -- When ambiguous between two security agents, pick the more specific one and add a brief note - -**Decision examples:** -- "查询 8.8.8.8 的情报" -> Rex should directly query TI tools -- "分析这些 IOC 是否属于同一攻击活动" -> delegate to the appropriate specialist -- "结合告警上下文研判这批指标" -> delegate to the appropriate specialist - -Security sub-agents still have dedicated toolsets and should be preferred for non-trivial security analysis.""" +**Routing rules:** +- Security specialists are subagents. 
Call them with `subagent_type=...`; do not place agent names inside `load_skills=[]`. +- Direct path: exactly one IOC, basic reputation or TI facts only, and no attribution, correlation, batching, or formal assessment needed. +- Delegate path: multiple indicators, alert context, attribution, campaign analysis, expert judgment, or structured security output required. +- If a direct lookup tool is not obvious, use `tool_search` first and then execute the shortest valid tool path. +- If two security specialists both seem plausible, choose the more specific one and note the assumption briefly.""" def _build_im_send_section() -> str: diff --git a/flocks/agent/toolset.py b/flocks/agent/toolset.py index 5607f268..138212e7 100644 --- a/flocks/agent/toolset.py +++ b/flocks/agent/toolset.py @@ -28,6 +28,24 @@ def get_all_enabled_tool_names() -> List[str]: ] +def get_all_enabled_builtin_tool_names() -> List[str]: + """Return enabled built-in tool names, excluding plugins and dynamic tools.""" + from flocks.tool.registry import ToolRegistry + + ToolRegistry.init() + builtin_tool_names: List[str] = [] + for tool in ToolRegistry.list_tools(): + if tool.name in {"invalid", "_noop"} or not getattr(tool, "enabled", True): + continue + if not getattr(tool, "native", False): + continue + source = getattr(tool, "source", None) + if source not in {None, "builtin"}: + continue + builtin_tool_names.append(tool.name) + return builtin_tool_names + + def normalize_declared_tool_names( tool_names: Iterable[str], available_tool_names: Optional[Iterable[str]] = None, @@ -78,10 +96,13 @@ def expand_legacy_permission_to_tool_names( def resolve_agent_initial_tools( raw_tools: Optional[List[str]], legacy_permission_config: Any, + agent_name: Optional[str] = None, available_tool_names: Optional[Iterable[str]] = None, ) -> Tuple[List[str], Any]: available = list(available_tool_names or get_all_enabled_tool_names()) if raw_tools is not None: + if agent_name == "rex" and not raw_tools: + return 
get_all_enabled_builtin_tool_names(), [] return normalize_declared_tool_names(raw_tools, available), [] if isinstance(legacy_permission_config, dict): return expand_legacy_permission_to_tool_names(legacy_permission_config, available) diff --git a/flocks/session/prompt.py b/flocks/session/prompt.py index 4d7140e0..71b68336 100644 --- a/flocks/session/prompt.py +++ b/flocks/session/prompt.py @@ -93,7 +93,7 @@ def get_prompt_codex() -> str: - Other security operations tasks Best practices for security operations: -Your work primarily covers threat detection and analysis, incident response, vulnerability assessment, security automation, malware and forensic analysis, and compliance or hardening reviews. +Your work primarily covers threat detection and analysis, incident response, vulnerability assessment, security automation, malware and forensic analysis, and compliance or hardening reviews. Using tools to solve tasks is a core part of your capabilities. Apply these principles consistently: - Preserve evidence with timestamps, file paths, line numbers, and relevant context. 
@@ -970,13 +970,6 @@ async def build_custom_context() -> Optional[str]: digest_inputs={"model_id": model_id}, builder=lambda: cls._join_prompt_parts(SystemPrompt.provider(model_id)), ), - cls._build_cached_prompt_block( - static_cache=static_cache, - name="agent_identity", - cache_scope="agent", - digest_inputs={"agent_name": agent_name, "agent_prompt": agent_prompt or ""}, - builder=lambda: cls._normalize_prompt_text(agent_prompt), - ), cls._build_cached_prompt_block( static_cache=static_cache, name="tool_protocol", @@ -986,6 +979,16 @@ async def build_custom_context() -> Optional[str]: use_text_tool_call_mode=use_text_tool_call_mode, ), ), + cls._build_cached_prompt_block( + static_cache=static_cache, + name="tool_catalog_awareness", + cache_scope="catalog", + digest_inputs={ + "agent_name": agent_name, + "tool_revision": tool_revision, + }, + builder=lambda: cls._build_optional_prompt(tool_catalog_prompt_factory) or "", + ), cls._build_cached_prompt_block( static_cache=static_cache, name="bash_guidance", @@ -1006,6 +1009,13 @@ async def build_custom_context() -> Optional[str]: }, builder=lambda: memory_guidance or "", ), + cls._build_cached_prompt_block( + static_cache=static_cache, + name="agent_identity", + cache_scope="agent", + digest_inputs={"agent_name": agent_name, "agent_prompt": agent_prompt or ""}, + builder=lambda: cls._normalize_prompt_text(agent_prompt), + ), cls._build_cached_prompt_block( static_cache=static_cache, name="memory_snapshot", @@ -1037,18 +1047,6 @@ async def build_custom_context() -> Optional[str]: ) blocks.append(custom_block) - tool_catalog_block = cls._build_cached_prompt_block( - static_cache=static_cache, - name="tool_catalog_awareness", - cache_scope="catalog", - digest_inputs={ - "agent_name": agent_name, - "tool_revision": tool_revision, - }, - builder=lambda: cls._build_optional_prompt(tool_catalog_prompt_factory) or "", - ) - blocks.append(tool_catalog_block) - if sandbox_prompt_factory: blocks.append(await 
cls._build_cached_async_prompt_block( static_cache=static_cache, diff --git a/flocks/session/prompt_strings.py b/flocks/session/prompt_strings.py index 2c61665a..34216280 100644 --- a/flocks/session/prompt_strings.py +++ b/flocks/session/prompt_strings.py @@ -248,13 +248,13 @@ def _build_tool_instructions() -> str: return """ ## Tool Calling Rules -You have access to tools to help accomplish tasks. Use the callable schema exposed for this turn as the authoritative source for available tool names and parameters. +You have access to tools to help accomplish tasks. IMPORTANT RULES: - NEVER call the same tool multiple times with identical parameters in a single response - Do not repeat tool calls just to explain what you're doing - call the tool once and explain after - Never guess parameter names from semantics. If uncertain, use `tool_search` first, then read the callable schema for that tool and copy parameter names EXACTLY (including case).. -- For all tools, treat schema as strict: unknown parameter names will fail. +- For all tools, treat schema as strict: unknown parameter names will fail, empty parameter values will fail. 
CRITICAL - TOOL CALLING FORMAT: - ALWAYS invoke tools using the native API tool-calling mechanism ONLY diff --git a/flocks/session/runner.py b/flocks/session/runner.py index 03bd7bcc..00b76c23 100644 --- a/flocks/session/runner.py +++ b/flocks/session/runner.py @@ -44,7 +44,11 @@ from flocks.agent.toolset import agent_declares_tool from flocks.provider.provider import Provider, ChatMessage from flocks.hooks.pipeline import HookPipeline -from flocks.tool.catalog import get_tool_catalog_metadata, list_tool_catalog_infos +from flocks.tool.catalog import ( + get_always_load_tool_names, + get_tool_catalog_metadata, + list_tool_catalog_infos, +) from flocks.tool.registry import ToolRegistry, ToolResult from flocks.utils.langfuse import generation_scope, trace_scope from flocks.session.utils.file_extractor import ( @@ -1419,9 +1423,6 @@ def _list_catalog_tool_infos(self, agent: AgentInfo) -> List[Any]: tool_infos.append(tool_info) continue - if not isinstance(getattr(agent, "tools", None), (list, tuple, set)): - tool_infos.append(tool_info) - continue metadata = get_tool_catalog_metadata(tool_info.name, tool_info) if not agent_declares_tool(agent, tool_info.name) and not metadata.always_load: continue @@ -1431,8 +1432,22 @@ def _list_catalog_tool_infos(self, agent: AgentInfo) -> List[Any]: def _build_tool_catalog_prompt(self, agent: AgentInfo) -> Optional[str]: from flocks.tool.system.slash_command import format_tools_catalog_summary + from flocks.agent.toolset import get_all_enabled_builtin_tool_names + + is_rex = getattr(agent, "name", "") == "rex" + if not is_rex: + return None catalog_tools = self._list_catalog_tool_infos(agent) + excluded_tool_names = ( + set(get_all_enabled_builtin_tool_names()) + | get_always_load_tool_names() + ) + catalog_tools = [ + tool_info + for tool_info in catalog_tools + if tool_info.name not in excluded_tool_names + ] if not catalog_tools: return None @@ -1444,25 +1459,12 @@ def _build_tool_catalog_prompt(self, agent: AgentInfo) -> 
Optional[str]: if not catalog_summary: return None - is_rex = getattr(agent, "name", "") == "rex" - if is_rex: - rules = ( - "You can see the full tool catalog for awareness. " - "This catalog is reference-only and does not define parameter names. " - "Only tools exposed in the current callable schema may be called directly. " - "If a tool appears in the catalog but is not exposed this turn, use `tool_search` first. " - "Use the current callable schema as the sole source of truth for parameters. " - "Do not invent parameters for tools that are not currently exposed." - ) - else: - rules = ( - "You can see a tool catalog derived from your configured callable tool set. " - "This catalog is reference-only and does not define parameter names. " - "Only tools exposed in the current callable schema may be called directly. " - "Use the current callable schema as the sole source of truth for parameters. " - "Do not infer argument names from the catalog. " - "Do not invent parameters for tools that are not currently exposed." - ) + rules = ( + "The following deferred tools are available via `tool_search`. " + "Their schemas are NOT loaded - calling them directly will fail " + "with `InputValidationError`. 
Use `tool_search` with query " + "`select:[,...]` to load tool schemas before calling them:" + ) return ( "## Tool Catalog Awareness\n\n" diff --git a/flocks/tool/registry.py b/flocks/tool/registry.py index 42303504..90054d62 100644 --- a/flocks/tool/registry.py +++ b/flocks/tool/registry.py @@ -1190,8 +1190,8 @@ def _register_builtin_tools(cls) -> None: ("flocks.tool.task", ["task", "task_center", "todo", "plan", "run_workflow", "run_workflow_node"]), # security/ — SSH forensics + threat intelligence (optional: asyncssh) ("flocks.tool.security", ["ssh_host_cmd", "ssh_run_script"]), - # system/ — background tasks, questions, model config, memory, skill, batch, session management, slash commands - ("flocks.tool.system", ["background_output", "background_cancel", "question", "model_config", "memory", "skill", "batch", "session_manage", "slash_command", "tool_search"]), + # system/ — background tasks, questions, model config, memory, skill, MCP management, batch, session management, slash commands + ("flocks.tool.system", ["background_output", "background_cancel", "question", "model_config", "memory", "skill", "flocks_mcp", "batch", "session_manage", "slash_command", "tool_search"]), # skill/ — skill management (search, install, status, deps, remove) ("flocks.tool.skill", ["flocks_skills"]), # channel/ — IM platform messaging diff --git a/.flocks/plugins/tools/python/flocks_mcp.py b/flocks/tool/system/flocks_mcp.py similarity index 69% rename from .flocks/plugins/tools/python/flocks_mcp.py rename to flocks/tool/system/flocks_mcp.py index 9d9d19c8..92b39622 100644 --- a/.flocks/plugins/tools/python/flocks_mcp.py +++ b/flocks/tool/system/flocks_mcp.py @@ -1,5 +1,5 @@ """ -flocks_mcp — MCP server management tool for Rex. +flocks_mcp - MCP server management tool for Flocks. Supports: list, add, remove, connect, disconnect. """ @@ -33,11 +33,13 @@ description=( "Manage MCP servers registered in Flocks. " "Use 'list' to see all servers and their status. 
" - "Use 'add' to register and connect a new MCP server (persists to flocks.json and ~/.flocks/plugins/tools/mcp/). " + "Use 'add' to register and connect a new MCP server " + "(persists to flocks.json and ~/.flocks/plugins/tools/mcp/). " "Use 'remove' to delete a server from config and disconnect it. " "Use 'connect' / 'disconnect' to control an already-configured server's connection." ), category=ToolCategory.SYSTEM, + native=True, parameters=[ ToolParameter( name="subcommand", @@ -59,8 +61,10 @@ type=ParameterType.OBJECT, description=( "Server configuration dict. Required for 'add'. " - "Local example: {\"type\": \"local\", \"command\": [\"python\", \"-m\", \"pkg\"], \"enabled\": true}. " - "Remote example: {\"type\": \"remote\", \"url\": \"https://...\", \"enabled\": true}. " + "Local example: {\"type\": \"local\", \"command\": [\"python\", \"-m\", \"pkg\"], " + "\"enabled\": true}. " + "Remote example: {\"type\": \"remote\", \"url\": \"https://...\", " + "\"enabled\": true}. " "Use {secret:key_name} for sensitive values in environment/headers." 
), required=False, @@ -77,34 +81,31 @@ async def flocks_mcp( if subcommand == "list": return await _list() - elif subcommand == "add": + if subcommand == "add": if not name: return ToolResult(success=False, error="'add' requires a server name.") if not config: return ToolResult(success=False, error="'add' requires a config dict.") return await _add(name, config) - elif subcommand == "remove": + if subcommand == "remove": if not name: return ToolResult(success=False, error="'remove' requires a server name.") return await _remove(name) - elif subcommand == "connect": + if subcommand == "connect": if not name: return ToolResult(success=False, error="'connect' requires a server name.") return await _connect(name) - elif subcommand == "disconnect": + if subcommand == "disconnect": if not name: return ToolResult(success=False, error="'disconnect' requires a server name.") return await _disconnect(name) - else: - return ToolResult( - success=False, - error=f"Unknown subcommand '{subcommand}'. Valid: list | add | remove | connect | disconnect", - ) - - -# --------------------------------------------------------------------------- -# Sub-command implementations -# --------------------------------------------------------------------------- + return ToolResult( + success=False, + error=( + f"Unknown subcommand '{subcommand}'. 
" + "Valid: list | add | remove | connect | disconnect" + ), + ) async def _list() -> ToolResult: @@ -114,29 +115,45 @@ async def _list() -> ToolResult: status = await MCP.status() result: Dict[str, Any] = {} - for sname, info in status.items(): - result[sname] = { + for server_name, info in status.items(): + result[server_name] = { "status": info.status.value if hasattr(info.status, "value") else str(info.status), "tools_count": getattr(info, "tools_count", 0), "error": getattr(info, "error", None), - "connected_at": str(info.connected_at) if getattr(info, "connected_at", None) else None, + "connected_at": ( + str(info.connected_at) + if getattr(info, "connected_at", None) + else None + ), } - # Include servers in config but not yet in memory configured = ConfigWriter.list_mcp_servers() - for sname, scfg in configured.items(): - if sname not in result: - enabled = scfg.get("enabled", True) if isinstance(scfg, dict) else True - if enabled: - result[sname] = {"status": "disconnected", "tools_count": 0, "error": None, "connected_at": None} + for server_name, server_config in configured.items(): + if server_name in result: + continue + enabled = ( + server_config.get("enabled", True) + if isinstance(server_config, dict) + else True + ) + if enabled: + result[server_name] = { + "status": "disconnected", + "tools_count": 0, + "error": None, + "connected_at": None, + } if not result: return ToolResult(success=True, output="No MCP servers configured.") lines = [f"{'Server':<30} {'Status':<15} {'Tools':>5}", "-" * 55] - for sname, info in sorted(result.items()): - err = f" ({info['error']})" if info.get("error") else "" - lines.append(f"{sname:<30} {info['status']:<15} {info['tools_count']:>5}{err}") + for server_name, info in sorted(result.items()): + error_suffix = f" ({info['error']})" if info.get("error") else "" + lines.append( + f"{server_name:<30} {info['status']:<15} " + f"{info['tools_count']:>5}{error_suffix}" + ) return ToolResult(success=True, 
output="\n".join(lines)) @@ -146,12 +163,11 @@ async def _add(name: str, config: Dict[str, Any]) -> ToolResult: from flocks.mcp import MCP from flocks.tool.tool_loader import save_mcp_config - # config may arrive as JSON string when passed through LLM tool call if isinstance(config, str): try: config = json.loads(config) - except json.JSONDecodeError as e: - return ToolResult(success=False, error=f"config is not valid JSON: {e}") + except json.JSONDecodeError as exc: + return ToolResult(success=False, error=f"config is not valid JSON: {exc}") config = extract_api_key_from_mcp_url(name, normalize_mcp_config(config)) @@ -167,25 +183,30 @@ async def _add(name: str, config: Dict[str, Any]) -> ToolResult: ), "connected": False, "pending_credentials": True, - "persisted_to": ["flocks.json", f"~/.flocks/plugins/tools/mcp/{name.replace('-', '_')}.yaml"], + "persisted_to": [ + "flocks.json", + f"~/.flocks/plugins/tools/mcp/{name.replace('-', '_')}.yaml", + ], }, ) - # Attempt connection try: success = await MCP.connect(name, config) - except Exception as e: - return ToolResult(success=False, error=f"Connection failed: {e}") + except Exception as exc: + return ToolResult(success=False, error=f"Connection failed: {exc}") if not success: status = await MCP.status() info = status.get(name) - err = getattr(info, "error", None) if info else None - if should_allow_unconnected_add(config, err): + error = getattr(info, "error", None) if info else None + if should_allow_unconnected_add(config, error): await MCP.remove(name) ConfigWriter.add_mcp_server(name, config) save_mcp_config(name, config) - log.info("flocks_mcp.add.deferred", {"name": name, "reason": err or "auth_pending"}) + log.info( + "flocks_mcp.add.deferred", + {"name": name, "reason": error or "auth_pending"}, + ) return ToolResult( success=True, output={ @@ -195,22 +216,24 @@ async def _add(name: str, config: Dict[str, Any]) -> ToolResult: ), "connected": False, "pending_credentials": True, - "persisted_to": 
["flocks.json", f"~/.flocks/plugins/tools/mcp/{name.replace('-', '_')}.yaml"], - "error": err, + "persisted_to": [ + "flocks.json", + f"~/.flocks/plugins/tools/mcp/{name.replace('-', '_')}.yaml", + ], + "error": error, }, ) return ToolResult( success=False, - error=f"Failed to connect to '{name}'. {err or 'Check the config and server availability.'}", + error=( + f"Failed to connect to '{name}'. " + f"{error or 'Check the config and server availability.'}" + ), ) - # Persist to flocks.json ConfigWriter.add_mcp_server(name, config) - - # Persist YAML description to ~/.flocks/plugins/tools/mcp/ save_mcp_config(name, config) - # Get tool count status = await MCP.status() info = status.get(name) tools_count = getattr(info, "tools_count", 0) if info else 0 @@ -221,7 +244,10 @@ async def _add(name: str, config: Dict[str, Any]) -> ToolResult: output={ "message": f"MCP server '{name}' added and connected successfully.", "tools_count": tools_count, - "persisted_to": ["flocks.json", f"~/.flocks/plugins/tools/mcp/{name.replace('-', '_')}.yaml"], + "persisted_to": [ + "flocks.json", + f"~/.flocks/plugins/tools/mcp/{name.replace('-', '_')}.yaml", + ], }, ) @@ -241,10 +267,16 @@ async def _remove(name: str) -> ToolResult: delete_mcp_config(name) if not removed_config and not in_memory: - return ToolResult(success=False, error=f"MCP server '{name}' not found in config or memory.") + return ToolResult( + success=False, + error=f"MCP server '{name}' not found in config or memory.", + ) log.info("flocks_mcp.remove.success", {"name": name}) - return ToolResult(success=True, output=f"MCP server '{name}' removed successfully.") + return ToolResult( + success=True, + output=f"MCP server '{name}' removed successfully.", + ) async def _connect(name: str) -> ToolResult: @@ -255,9 +287,13 @@ async def _connect(name: str) -> ToolResult: try: config = await Config.get() mcp_config = getattr(config, "mcp", None) or {} - server_config = mcp_config.get(name) if isinstance(mcp_config, dict) else 
None - except Exception as e: - return ToolResult(success=False, error=f"Failed to load MCP config: {e}") + server_config = ( + mcp_config.get(name) + if isinstance(mcp_config, dict) + else None + ) + except Exception as exc: + return ToolResult(success=False, error=f"Failed to load MCP config: {exc}") if server_config is None: server_config = ConfigWriter.get_mcp_server(name) @@ -289,14 +325,17 @@ async def _connect(name: str) -> ToolResult: success=False, error=f"Connection timed out while connecting to '{name}'.", ) - except Exception as e: - return ToolResult(success=False, error=f"Connection failed: {e}") + except Exception as exc: + return ToolResult(success=False, error=f"Connection failed: {exc}") if not success: status = await MCP.status() info = status.get(name) - err = getattr(info, "error", None) if info else None - return ToolResult(success=False, error=f"Failed to connect to '{name}'. {err or ''}") + error = getattr(info, "error", None) if info else None + return ToolResult( + success=False, + error=f"Failed to connect to '{name}'. {error or ''}", + ) status = await MCP.status() info = status.get(name) @@ -313,14 +352,26 @@ async def _disconnect(name: str) -> ToolResult: status = await MCP.status() if name not in status: - return ToolResult(success=False, error=f"MCP server '{name}' is not in memory (already disconnected or not found).") + return ToolResult( + success=False, + error=( + f"MCP server '{name}' is not in memory " + "(already disconnected or not found)." 
+ ), + ) try: success = await MCP.disconnect(name) - except Exception as e: - return ToolResult(success=False, error=f"Disconnect failed: {e}") + except Exception as exc: + return ToolResult(success=False, error=f"Disconnect failed: {exc}") if not success: - return ToolResult(success=False, error=f"Failed to disconnect from '{name}'.") + return ToolResult( + success=False, + error=f"Failed to disconnect from '{name}'.", + ) - return ToolResult(success=True, output=f"MCP server '{name}' disconnected.") \ No newline at end of file + return ToolResult( + success=True, + output=f"MCP server '{name}' disconnected.", + ) diff --git a/flocks/tool/system/skill.py b/flocks/tool/system/skill.py index 3a9a9636..5ba57c54 100644 --- a/flocks/tool/system/skill.py +++ b/flocks/tool/system/skill.py @@ -253,6 +253,7 @@ async def get_skill(name: str) -> dict | None: name="skill", description="Load a skill to get detailed instructions for a specific task. Available skills are listed in the description.", category=ToolCategory.SYSTEM, + native=True, parameters=[ ToolParameter( name="name", diff --git a/tests/agent/test_agent_factory.py b/tests/agent/test_agent_factory.py index 2859b4b4..296e6c7e 100644 --- a/tests/agent/test_agent_factory.py +++ b/tests/agent/test_agent_factory.py @@ -191,6 +191,21 @@ def test_no_tools_defaults_to_empty_declared_toolset(self, tmp_path): assert agent is not None assert agent.tools == [] + def test_rex_empty_tools_expand_to_builtin_toolset(self, tmp_path, monkeypatch): + monkeypatch.setattr( + "flocks.agent.toolset.get_all_enabled_builtin_tool_names", + lambda: ["read", "bash", "tool_search"], + ) + agent_dir = _write_agent_dir(tmp_path, """ + name: rex + tools: [] + """) + + agent = load_agent(agent_dir) + + assert agent is not None + assert agent.tools == ["read", "bash", "tool_search"] + def test_loads_model(self, tmp_path): agent_dir = _write_agent_dir(tmp_path, """ name: model_agent @@ -509,6 +524,17 @@ def 
test_missing_tools_defaults_to_empty_declared_toolset(self, tmp_path): agent = yaml_to_agent_info({"name": "no_tools"}, yaml_path) assert agent.tools == [] + def test_rex_empty_tools_expand_to_builtin_toolset(self, tmp_path, monkeypatch): + monkeypatch.setattr( + "flocks.agent.toolset.get_all_enabled_builtin_tool_names", + lambda: ["read", "bash", "tool_search"], + ) + yaml_path = self._make_yaml_path(tmp_path) + + agent = yaml_to_agent_info({"name": "rex", "tools": []}, yaml_path) + + assert agent.tools == ["read", "bash", "tool_search"] + def test_model_parsed(self, tmp_path): yaml_path = self._make_yaml_path(tmp_path) raw = { diff --git a/tests/integration/test_capability_awareness.py b/tests/integration/test_capability_awareness.py index 8b608750..4d0fecd3 100644 --- a/tests/integration/test_capability_awareness.py +++ b/tests/integration/test_capability_awareness.py @@ -14,9 +14,9 @@ - build_workflows_section() 能把它渲染成 prompt 片段 - inject_dynamic_prompts() 将 AvailableWorkflow 传递给 prompt builder -3. Rex prompt 保留 agent 调度上下文,但不再内嵌完整 tools/skills/workflows 目录 +3. Rex prompt 保留 agent 调度上下文,并恢复轻量 skills/workflows 能力目录 - Rex agent 加载后,其 prompt 包含常见 subagent / specialist 信息 - - Rex agent prompt 不包含 "Available Workflows" 这类完整目录段落 + - Rex agent prompt 包含轻量 skills/workflows section,但仍不内嵌完整 tools 目录 4. /skills slash command 端到端(真实 Skill 扫描) - run_slash_command_tool("skills") 成功返回技能列表,不报错 @@ -266,12 +266,14 @@ class TestRexPromptAwareness: @pytest.mark.asyncio async def test_rex_prompt_contains_tools_section(self): - """Rex prompt 仍应是非空且包含基础调度说明。""" + """Rex prompt 仍应是非空且包含基础编排骨架。""" from flocks.agent.registry import Agent rex = await Agent.get("rex") assert rex is not None assert rex.prompt is not None assert len(rex.prompt) > 100 + assert "" in rex.prompt + assert "## 2. 
Path Selection" in rex.prompt @pytest.mark.asyncio async def test_rex_prompt_contains_subagents_section(self): @@ -287,19 +289,22 @@ async def test_rex_prompt_contains_subagents_section(self): @pytest.mark.asyncio async def test_rex_prompt_contains_skills_section(self): - """Rex prompt 不再内嵌完整 skills 目录。""" + """Rex prompt 应包含轻量 skills section。""" from flocks.agent.registry import Agent rex = await Agent.get("rex") - prompt = (rex.prompt or "").lower() - assert "category + skills delegation system" not in prompt + prompt = rex.prompt or "" + assert "### Available Skills" in prompt + assert "Load a skill when the task clearly matches its domain expertise." in prompt + assert "Category + Skills Delegation System" not in prompt @pytest.mark.asyncio - async def test_rex_prompt_does_not_embed_full_workflow_section(self): - """Rex prompt 不再内嵌 workflow 目录段落。""" + async def test_rex_prompt_contains_workflow_section(self): + """Rex prompt 应包含 workflow section。""" from flocks.agent.registry import Agent rex = await Agent.get("rex") - prompt = (rex.prompt or "").lower() - assert "### available workflows" not in prompt + prompt = rex.prompt or "" + assert "### Available Workflows" in prompt + assert "run_workflow" in prompt @pytest.mark.asyncio async def test_rex_prompt_does_not_embed_full_tools_table(self): @@ -325,18 +330,19 @@ async def test_rex_prompt_merges_agent_selection_and_delegation(self): rex = await Agent.get("rex") prompt = rex.prompt or "" assert "### Available Agents:" in prompt - assert "Trigger Signals" in prompt + assert "Default flow" in prompt assert "### Delegation Table:" not in prompt @pytest.mark.asyncio async def test_rex_prompt_prefers_direct_ioc_lookup_before_delegation(self): - """单 IOC 情报查询应在提示词中优先走 Rex 直查路径。""" + """单 IOC 情报查询应保留“先直查、再委派”的稳定语义。""" from flocks.agent.registry import Agent rex = await Agent.get("rex") prompt = rex.prompt or "" - assert "Single IOC basic lookup only" in prompt - assert '"查询 8.8.8.8 的情报" -> Rex should directly query TI 
tools' in prompt - assert "tool_search` if needed -> direct TI query tool -> answer" in prompt + assert "### Security Routing" in prompt + assert "Direct path: exactly one IOC" in prompt + assert "Delegate path: multiple indicators" in prompt + assert "`tool_search` first" in prompt # =========================================================================== diff --git a/tests/session/test_runner_step.py b/tests/session/test_runner_step.py index 575813da..812c5f10 100644 --- a/tests/session/test_runner_step.py +++ b/tests/session/test_runner_step.py @@ -687,6 +687,8 @@ async def test_build_system_prompts_includes_memory_guidance_when_memory_tools_l assert "memory guidance" in "\n\n".join(prompts) assert "## MEMORY.md\n\nremembered context" in prompts + assert prompts.index("memory guidance") < prompts.index("agent prompt") + assert prompts.index("agent prompt") < prompts.index("## MEMORY.md\n\nremembered context") @pytest.mark.asyncio async def test_build_system_prompts_includes_bash_guidance_when_bash_loaded_on_windows(self): @@ -813,49 +815,71 @@ def test_build_tool_catalog_prompt_for_rex(self): with patch( "flocks.session.runner.SessionRunner._list_catalog_tool_infos", return_value=[ToolInfo( - name="read", - description="Read file contents", - category=ToolCategory.FILE, - native=True, + name="plugin_memory", + description="Access project memory", + category=ToolCategory.CUSTOM, + native=False, enabled=True, )], + ), patch( + "flocks.agent.toolset.get_all_enabled_builtin_tool_names", + return_value=["read", "bash"], + ), patch( + "flocks.session.runner.get_always_load_tool_names", + return_value={"question", "tool_search"}, ), patch( "flocks.tool.system.slash_command.format_tools_catalog_summary", - return_value="Available Tools (grouped by category):\n\n**file**\n- read: Read file contents", + return_value="Available Tools (grouped by category):\n\n**custom**\n- plugin_memory: Access project memory", ): prompt = runner._build_tool_catalog_prompt(agent) assert 
prompt is not None assert "Tool Catalog Awareness" in prompt assert "tool_search" in prompt - assert "full tool catalog" in prompt - assert "reference-only" in prompt - assert "sole source of truth for parameters" in prompt - assert "- read: Read file contents" in prompt + assert "InputValidationError" in prompt + assert "select:[,...]" in prompt + assert "- plugin_memory: Access project memory" in prompt - def test_build_tool_catalog_prompt_for_subagent_uses_filtered_catalog(self): + def test_build_tool_catalog_prompt_for_subagent_returns_none(self): runner = _make_runner() agent = _make_agent(name="plan") agent.mode = "subagent" + prompt = runner._build_tool_catalog_prompt(agent) + + assert prompt is None + + def test_build_tool_catalog_prompt_for_rex_excludes_builtin_and_always_load_tools(self): + runner = _make_runner() + agent = _make_agent(name="rex") + agent.mode = "primary" + catalog_tools = [ + ToolInfo(name="bash", description="Run commands", category=ToolCategory.CODE, native=True, enabled=True), + ToolInfo(name="question", description="Ask user a question", category=ToolCategory.SYSTEM, native=True, enabled=True), + ToolInfo(name="plugin_memory", description="Access project memory", category=ToolCategory.CUSTOM, native=False, enabled=True), + ] + with patch( "flocks.session.runner.SessionRunner._list_catalog_tool_infos", - return_value=[ToolInfo( - name="read", - description="Read file contents", - category=ToolCategory.FILE, - native=True, - enabled=True, - )], + return_value=catalog_tools, + ), patch( + "flocks.agent.toolset.get_all_enabled_builtin_tool_names", + return_value=["bash", "read"], + ), patch( + "flocks.session.runner.get_always_load_tool_names", + return_value={"question", "tool_search"}, ), patch( "flocks.tool.system.slash_command.format_tools_catalog_summary", - return_value="Available Tools (grouped by category):\n\n**file**\n- read: Read file contents", - ): + side_effect=lambda tools, **_: "\n".join(tool.name for tool in tools), + ) 
as formatter_mock: prompt = runner._build_tool_catalog_prompt(agent) assert prompt is not None - assert "derived from your configured callable tool set" in prompt - assert "use `tool_search` first" not in prompt + assert "plugin_memory" in prompt + assert "bash" not in prompt + assert "question" not in prompt + formatter_tools = formatter_mock.call_args.kwargs["tools"] + assert [tool.name for tool in formatter_tools] == ["plugin_memory"] def test_list_catalog_tool_infos_returns_full_catalog_for_rex(self): runner = _make_runner() @@ -900,6 +924,39 @@ def test_list_catalog_tool_infos_filters_subagent_boundaries(self): assert [tool.name for tool in infos] == ["read"] + def test_list_catalog_tool_infos_keeps_always_load_tools_for_subagent(self): + runner = _make_runner() + agent = _make_agent(name="plan") + agent.mode = "subagent" + agent.tools = ["read"] + tool_infos = [ + ToolInfo(name="read", description="Read file contents", category=ToolCategory.FILE, native=True, enabled=True), + ToolInfo(name="question", description="Ask user a question", category=ToolCategory.SYSTEM, native=True, enabled=True), + ToolInfo(name="tool_search", description="Search tools", category=ToolCategory.SYSTEM, native=True, enabled=True), + ToolInfo(name="bash", description="Run commands", category=ToolCategory.CODE, native=True, enabled=True), + ] + + with patch("flocks.session.runner.list_tool_catalog_infos", return_value=tool_infos): + infos = runner._list_catalog_tool_infos(agent) + + assert [tool.name for tool in infos] == ["read", "question", "tool_search"] + + def test_list_catalog_tool_infos_does_not_fall_back_to_full_catalog_when_tools_missing(self): + runner = _make_runner() + agent = _make_agent(name="plan", tools=None) + agent.mode = "subagent" + tool_infos = [ + ToolInfo(name="read", description="Read file contents", category=ToolCategory.FILE, native=True, enabled=True), + ToolInfo(name="question", description="Ask user a question", category=ToolCategory.SYSTEM, native=True, 
enabled=True), + ToolInfo(name="tool_search", description="Search tools", category=ToolCategory.SYSTEM, native=True, enabled=True), + ToolInfo(name="bash", description="Run commands", category=ToolCategory.CODE, native=True, enabled=True), + ] + + with patch("flocks.session.runner.list_tool_catalog_infos", return_value=tool_infos): + infos = runner._list_catalog_tool_infos(agent) + + assert [tool.name for tool in infos] == ["question", "tool_search"] + class TestMiniMaxTextToolMode: def test_enabled_for_custom_threatbook_minimax(self): diff --git a/tests/tool/test_agent_toolset.py b/tests/tool/test_agent_toolset.py index d173d7ab..a9ef7af1 100644 --- a/tests/tool/test_agent_toolset.py +++ b/tests/tool/test_agent_toolset.py @@ -2,6 +2,7 @@ from flocks.agent.toolset import ( agent_declares_tool, + get_all_enabled_builtin_tool_names, normalize_declared_tool_names, resolve_agent_initial_tools, ) @@ -38,3 +39,48 @@ def test_resolve_agent_initial_tools_defaults_to_empty_when_unset() -> None: assert tools == [] assert permission == [] + + +def test_get_all_enabled_builtin_tool_names_excludes_plugins_and_disabled(monkeypatch) -> None: + tools = [ + SimpleNamespace(name="read", enabled=True, native=True, source=None), + SimpleNamespace(name="bash", enabled=True, native=True, source="builtin"), + SimpleNamespace(name="project_tool", enabled=True, native=True, source="plugin_yaml"), + SimpleNamespace(name="user_tool", enabled=True, native=False, source="plugin_py"), + SimpleNamespace(name="mcp_lookup", enabled=True, native=False, source="mcp"), + SimpleNamespace(name="disabled_tool", enabled=False, native=True, source=None), + SimpleNamespace(name="invalid", enabled=True, native=True, source=None), + ] + + monkeypatch.setattr("flocks.tool.registry.ToolRegistry.init", lambda: None) + monkeypatch.setattr("flocks.tool.registry.ToolRegistry.list_tools", lambda: tools) + + assert get_all_enabled_builtin_tool_names() == ["read", "bash"] + + +def 
test_resolve_agent_initial_tools_expands_empty_rex_tools_to_builtin_tools(monkeypatch) -> None: + monkeypatch.setattr( + "flocks.agent.toolset.get_all_enabled_builtin_tool_names", + lambda: ["read", "bash", "tool_search"], + ) + + tools, permission = resolve_agent_initial_tools( + raw_tools=[], + legacy_permission_config=None, + agent_name="rex", + ) + + assert tools == ["read", "bash", "tool_search"] + assert permission == [] + + +def test_resolve_agent_initial_tools_keeps_empty_non_rex_tools_empty() -> None: + tools, permission = resolve_agent_initial_tools( + raw_tools=[], + legacy_permission_config=None, + agent_name="plan", + available_tool_names=["read", "bash"], + ) + + assert tools == [] + assert permission == [] diff --git a/tests/tool/test_builtin_management_tools.py b/tests/tool/test_builtin_management_tools.py new file mode 100644 index 00000000..137da65c --- /dev/null +++ b/tests/tool/test_builtin_management_tools.py @@ -0,0 +1,21 @@ +from flocks.tool.registry import ToolRegistry + + +def test_flocks_mcp_is_registered_as_builtin_tool() -> None: + ToolRegistry.init() + + tool = ToolRegistry.get("flocks_mcp") + + assert tool is not None + assert tool.info.native is True + assert tool.info.source in {None, "builtin"} + + +def test_skill_remains_registered_as_builtin_tool() -> None: + ToolRegistry.init() + + tool = ToolRegistry.get("skill") + + assert tool is not None + assert tool.info.native is True + assert tool.info.source in {None, "builtin"} From c5bbd5043485cc89a4a8d530b9c7edbe5f240384 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Tue, 12 May 2026 15:51:32 +0800 Subject: [PATCH 3/4] fix(session): reorder system prompt blocks; keep lsp tool non-native - Move agent_identity and tool_catalog layers for clearer guidance order. - Exclude lsp from bulk native=True classification in ToolRegistry. - Minor lsp_tool tweak; extend runner and builtin management tests. 
Co-authored-by: Cursor --- flocks/session/prompt.py | 44 ++++++++++----------- flocks/tool/code/lsp_tool.py | 1 + flocks/tool/registry.py | 7 +++- tests/session/test_runner_step.py | 39 +++++++++++++++--- tests/tool/test_builtin_management_tools.py | 9 +++++ 5 files changed, 71 insertions(+), 29 deletions(-) diff --git a/flocks/session/prompt.py b/flocks/session/prompt.py index 71b68336..8416d5c6 100644 --- a/flocks/session/prompt.py +++ b/flocks/session/prompt.py @@ -938,11 +938,11 @@ async def build_system_prompts( ) -> List[str]: """Build the runtime system prompt blocks for a session turn. - The ordering mirrors Hermes' layered assembly style: stable identity - first, then session-scoped snapshots and runtime context, then tool - protocol/catalog guidance. Cache mechanics are intentionally kept out - of the block construction below so this method reads as an ordered list - of prompt layers. + Stable identity and execution guidance come first, followed by + session/workspace context, with runtime-only metadata kept at the + prompt tail. Cache mechanics are intentionally kept out of the block + construction below so this method reads as an ordered list of prompt + layers. 
""" normalized_tool_names = tuple(sorted(prompt_tool_names)) vcs = "git" if session_directory else None @@ -979,16 +979,6 @@ async def build_custom_context() -> Optional[str]: use_text_tool_call_mode=use_text_tool_call_mode, ), ), - cls._build_cached_prompt_block( - static_cache=static_cache, - name="tool_catalog_awareness", - cache_scope="catalog", - digest_inputs={ - "agent_name": agent_name, - "tool_revision": tool_revision, - }, - builder=lambda: cls._build_optional_prompt(tool_catalog_prompt_factory) or "", - ), cls._build_cached_prompt_block( static_cache=static_cache, name="bash_guidance", @@ -999,6 +989,13 @@ async def build_custom_context() -> Optional[str]: }, builder=lambda: cls._build_bash_guidance_prompt(normalized_tool_names) or "", ), + cls._build_cached_prompt_block( + static_cache=static_cache, + name="agent_identity", + cache_scope="agent", + digest_inputs={"agent_name": agent_name, "agent_prompt": agent_prompt or ""}, + builder=lambda: cls._normalize_prompt_text(agent_prompt), + ), cls._build_cached_prompt_block( static_cache=static_cache, name="memory_guidance", @@ -1009,13 +1006,6 @@ async def build_custom_context() -> Optional[str]: }, builder=lambda: memory_guidance or "", ), - cls._build_cached_prompt_block( - static_cache=static_cache, - name="agent_identity", - cache_scope="agent", - digest_inputs={"agent_name": agent_name, "agent_prompt": agent_prompt or ""}, - builder=lambda: cls._normalize_prompt_text(agent_prompt), - ), cls._build_cached_prompt_block( static_cache=static_cache, name="memory_snapshot", @@ -1023,6 +1013,16 @@ async def build_custom_context() -> Optional[str]: digest_inputs={"session_id": session_id, "snapshot": memory_snapshot}, builder=lambda: memory_snapshot, ), + cls._build_cached_prompt_block( + static_cache=static_cache, + name="tool_catalog_awareness", + cache_scope="catalog", + digest_inputs={ + "agent_name": agent_name, + "tool_revision": tool_revision, + }, + builder=lambda: 
cls._build_optional_prompt(tool_catalog_prompt_factory) or "", + ), cls._build_cached_prompt_block( static_cache=static_cache, name="environment_stable", diff --git a/flocks/tool/code/lsp_tool.py b/flocks/tool/code/lsp_tool.py index 2b5407bb..77f2c4e9 100644 --- a/flocks/tool/code/lsp_tool.py +++ b/flocks/tool/code/lsp_tool.py @@ -61,6 +61,7 @@ name="lsp", description=DESCRIPTION, category=ToolCategory.CODE, + native=False, parameters=[ ToolParameter( name="operation", diff --git a/flocks/tool/registry.py b/flocks/tool/registry.py index 90054d62..53f8eece 100644 --- a/flocks/tool/registry.py +++ b/flocks/tool/registry.py @@ -1206,11 +1206,16 @@ def _register_builtin_tools(cls) -> None: except ImportError as e: log.warn("builtin_tools.import_failed", {"module": f"{package}.{mod_name}", "error": str(e)}) - # Mark every tool registered during this call as native=True. + # Mark every tool registered during this call as native=True, except + # for built-in modules that should remain non-native by policy. # This is done in bulk here so individual @register_function call # sites don't need to pass native=True, and user plugin files using # the same decorator won't be misclassified. 
+ builtin_native_exceptions = {"lsp"} for name in set(cls._tools.keys()) - before: + if name in builtin_native_exceptions: + cls._tools[name].info.native = False + continue cls._tools[name].info.native = True # Sample tools for testing (only register if not already registered) diff --git a/tests/session/test_runner_step.py b/tests/session/test_runner_step.py index 812c5f10..457d2e84 100644 --- a/tests/session/test_runner_step.py +++ b/tests/session/test_runner_step.py @@ -526,8 +526,20 @@ async def test_build_system_prompts_orders_stable_prefix_before_runtime_tail(sel runner = SessionRunner(session=session) agent = _make_agent(name="rex") agent.prompt = "agent prompt" + memory_bootstrap_data = { + "instructions": "memory guidance", + "main_memory": { + "path": "MEMORY.md", + "content": "remembered context", + "inject": True, + }, + } + sandbox_mock = AsyncMock(return_value="sandbox prompt") + channel_mock = AsyncMock(return_value="channel prompt") with patch("flocks.session.prompt.SystemPrompt.provider", return_value=["provider prompt"]), \ + patch.object(SessionPrompt, "_build_tool_guidance_prompt", return_value="tool protocol"), \ + patch.object(SessionPrompt, "_build_bash_guidance_prompt", return_value="bash guidance"), \ patch("flocks.session.prompt.SystemPrompt.environment_stable", return_value=["env prompt"]), \ patch("flocks.session.prompt.SystemPrompt.runtime_metadata", return_value=["runtime prompt"]), \ patch("flocks.session.prompt.SystemPrompt.custom", AsyncMock(return_value=["custom prompt"])): @@ -538,12 +550,27 @@ async def test_build_system_prompts_orders_stable_prefix_before_runtime_tail(sel agent_prompt=agent.prompt, provider_id=runner.provider_id, model_id=runner.model_id, - prompt_tool_names=("read",), + prompt_tool_names=("bash", "memory_search", "read"), + memory_bootstrap_data=memory_bootstrap_data, + tool_catalog_prompt_factory=lambda: "tool catalog", + sandbox_prompt_factory=sandbox_mock, + channel_context_prompt_factory=channel_mock, ) - 
assert prompts.index("agent prompt") < prompts.index("env prompt") - assert prompts.index("custom prompt") < prompts.index("runtime prompt") - assert prompts[-1] == "runtime prompt" + assert prompts == [ + "provider prompt", + "tool protocol", + "bash guidance", + "agent prompt", + "memory guidance", + "## MEMORY.md\n\nremembered context", + "tool catalog", + "env prompt", + "custom prompt", + "sandbox prompt", + "channel prompt", + "runtime prompt", + ] @pytest.mark.asyncio async def test_build_system_prompts_rebuilds_when_tool_revision_changes(self): @@ -687,8 +714,8 @@ async def test_build_system_prompts_includes_memory_guidance_when_memory_tools_l assert "memory guidance" in "\n\n".join(prompts) assert "## MEMORY.md\n\nremembered context" in prompts - assert prompts.index("memory guidance") < prompts.index("agent prompt") - assert prompts.index("agent prompt") < prompts.index("## MEMORY.md\n\nremembered context") + assert prompts.index("agent prompt") < prompts.index("memory guidance") + assert prompts.index("memory guidance") < prompts.index("## MEMORY.md\n\nremembered context") @pytest.mark.asyncio async def test_build_system_prompts_includes_bash_guidance_when_bash_loaded_on_windows(self): diff --git a/tests/tool/test_builtin_management_tools.py b/tests/tool/test_builtin_management_tools.py index 137da65c..ac7b70ad 100644 --- a/tests/tool/test_builtin_management_tools.py +++ b/tests/tool/test_builtin_management_tools.py @@ -19,3 +19,12 @@ def test_skill_remains_registered_as_builtin_tool() -> None: assert tool is not None assert tool.info.native is True assert tool.info.source in {None, "builtin"} + + +def test_lsp_remains_non_native_by_default() -> None: + ToolRegistry.init() + + tool = ToolRegistry.get("lsp") + + assert tool is not None + assert tool.info.native is False From c3b824189805d39f0634775616fc428343049349 Mon Sep 17 00:00:00 2001 From: xiami762 <> Date: Tue, 12 May 2026 17:58:13 +0800 Subject: [PATCH 4/4] refactor(session): trim prompt strings; 
tighten bash tool copy - Remove redundant prompt.py / prompt_strings content where duplicated. - Refine bash tool description (dedicated-tool bullets and usage notes). - Align prompt_tokens, runner_step, and bash registry tests. Co-authored-by: Cursor --- flocks/session/prompt.py | 21 --------- flocks/session/prompt_strings.py | 24 ---------- flocks/tool/code/bash.py | 70 ++++++++++++++++++----------- tests/session/test_prompt_tokens.py | 29 +++--------- tests/session/test_runner_step.py | 14 +++--- tests/tool/test_tools.py | 33 +++++++++++++- 6 files changed, 85 insertions(+), 106 deletions(-) diff --git a/flocks/session/prompt.py b/flocks/session/prompt.py index 8416d5c6..83bf9bd4 100644 --- a/flocks/session/prompt.py +++ b/flocks/session/prompt.py @@ -28,7 +28,6 @@ # Output token maximum OUTPUT_TOKEN_MAX = int(os.getenv("FLOCKS_OUTPUT_TOKEN_MAX", "32000")) -BASH_GUIDANCE_TOOL_NAMES = frozenset({"bash"}) MEMORY_GUIDANCE_TOOL_NAMES = frozenset({"memory_get", "memory_search", "memory_write"}) SystemPromptCache = Dict[str, Any] @@ -814,16 +813,6 @@ def _build_tool_guidance_prompt( else prompt_strings._build_tool_instructions() ) - @classmethod - def _build_bash_guidance_prompt( - cls, - prompt_tool_names: Iterable[str], - ) -> Optional[str]: - """Build bash-specific guidance only when the tool is callable.""" - if not (set(prompt_tool_names) & BASH_GUIDANCE_TOOL_NAMES): - return None - return prompt_strings._build_bash_tool_guidance() - @classmethod def _build_memory_guidance_prompt( cls, @@ -979,16 +968,6 @@ async def build_custom_context() -> Optional[str]: use_text_tool_call_mode=use_text_tool_call_mode, ), ), - cls._build_cached_prompt_block( - static_cache=static_cache, - name="bash_guidance", - cache_scope="toolset", - digest_inputs={ - "tool_names": normalized_tool_names, - "platform": platform.system().lower(), - }, - builder=lambda: cls._build_bash_guidance_prompt(normalized_tool_names) or "", - ), cls._build_cached_prompt_block( static_cache=static_cache, 
name="agent_identity", diff --git a/flocks/session/prompt_strings.py b/flocks/session/prompt_strings.py index 34216280..8ccb18b5 100644 --- a/flocks/session/prompt_strings.py +++ b/flocks/session/prompt_strings.py @@ -9,8 +9,6 @@ belong to session management, not to the agent orchestration layer. """ -import platform - # ============================================================================= # Compaction prompt # ============================================================================= @@ -222,28 +220,6 @@ Any attempt to use tools is a critical violation. Respond with text ONLY.""" -def _build_bash_tool_guidance() -> str: - """Build tool-aware guidance for the bash tool.""" - guidance = ( - "### Bash Tool Guidance\n\n" - "Use the `bash` tool for terminal operations only. Prefer the tool's `workdir` parameter " - "instead of emitting shell directory changes, and prefer dedicated file tools for reading, " - "writing, editing, and searching." - ) - if platform.system().lower() != "windows": - return guidance - - return ( - f"{guidance}\n\n" - "You are running on a Windows machine. When using the `bash` tool, you must follow " - "PowerShell syntax rather than GNU bash syntax. Avoid bash-only constructs such as heredocs, " - "`cat > file <<'EOF'`, `export NAME=value`, and Unix-only path expansion. For multi-step logic, " - "prefer a short `python -c` snippet or explicit PowerShell commands over shell-specific tricks. " - "When Python reads text files on Windows, always pass `encoding=...`; prefer " - "`Path(path).read_text(encoding=\"utf-8-sig\")`." 
- ) - - def _build_tool_instructions() -> str: return """ ## Tool Calling Rules diff --git a/flocks/tool/code/bash.py b/flocks/tool/code/bash.py index 230005a0..2d09539d 100644 --- a/flocks/tool/code/bash.py +++ b/flocks/tool/code/bash.py @@ -42,39 +42,55 @@ def get_description(directory: str) -> str: """Get tool description with directory placeholder replaced""" - return f"""Executes a given bash command in a persistent shell session with optional timeout, ensuring proper handling and security measures. + platform_guidance = _build_platform_shell_guidance() + return f"""Execute shell commands in a persistent shell session with optional timeout. -All commands run in {directory} by default. Use the `workdir` parameter if you need to run a command in a different directory. AVOID using `cd && ` patterns - use `workdir` instead. +All commands run in {directory} by default. Use the `workdir` parameter if you need a different directory. Avoid `cd && ` patterns and set `workdir` instead. -IMPORTANT: This tool is for terminal operations like git, npm, docker, etc. DO NOT use it for file operations (reading, writing, editing, searching, finding files) - use the specialized tools for this instead. +Use this tool for terminal work such as git, uv/pip/npm, docker, builds, tests, servers, scripts, process inspection, system status, networking commands, and shell pipelines or compound commands. -Before executing the command, please follow these steps: +Do not use this tool when a dedicated tool is a better fit: +- Read file contents -> `read` +- Write a new file -> `write` +- Edit an existing file -> `edit` +- Search file names or directories -> `glob` +- Search file contents -> `grep` +- Navigate symbols or code structure -> `lsp` -1. 
Directory Verification: - - If the command will create new directories or files, first use `ls` to verify the parent directory exists and is the correct location - - For example, before running "mkdir foo/bar", first use `ls foo` to check that "foo" exists and is the intended parent directory - -2. Command Execution: - - Always quote file paths that contain spaces with double quotes (e.g., rm "path with spaces/file.txt") - - Examples of proper quoting: - - mkdir "/Users/name/My Documents" (correct) - - mkdir /Users/name/My Documents (incorrect - will fail) - - python "/path/with spaces/script.py" (correct) - - python /path/with spaces/script.py (incorrect - will fail) - - After ensuring proper quoting, execute the command. - - Capture the output of the command. +Before executing commands: +1. If the command will create files or directories, verify the target location. +2. Always quote file paths that contain spaces with double quotes. + - `mkdir "/Users/name/My Documents"` (correct) + - `mkdir /Users/name/My Documents` (incorrect) + - `python "/path/with spaces/script.py"` (correct) + - `python /path/with spaces/script.py` (incorrect) Usage notes: - - The command argument is required. - - You can specify an optional timeout in milliseconds. If not specified, commands will time out after 120000ms (2 minutes). - - It is very helpful if you write a clear, concise description of what this command does in 5-10 words. - - If the output exceeds {MAX_OUTPUT_LINES} lines or {MAX_OUTPUT_BYTES} bytes, it will be truncated and the full output will be written to a file. - - Avoid using Bash with the `find`, `grep`, `cat`, `head`, `tail`, `sed`, `awk`, or `echo` commands. Instead, use the dedicated tools: Glob, Grep, Read, Edit, Write. - - When issuing multiple commands: - - If the commands are independent and can run in parallel, make multiple Bash tool calls in a single message. - - If the commands depend on each other, use a single Bash call with '&&' to chain them together. 
- - Use ';' only when you need to run commands sequentially but don't care if earlier commands fail - - AVOID using `cd && `. Use the `workdir` parameter to change directories instead.""" +- The `command` argument is required. +- You can specify an optional timeout in milliseconds. If not specified, commands time out after {DEFAULT_TIMEOUT_MS}ms. +- It is very helpful to write a clear, concise `description` in 5-10 words. +- If the output exceeds {MAX_OUTPUT_LINES} lines or {MAX_OUTPUT_BYTES} bytes, it will be truncated and the full output will be written to a file. +- Prefer dedicated tools instead of shell equivalents: use `glob`/`file_search` instead of `find` or `ls`, `grep` instead of shell `grep`/`rg`, `read` instead of `cat`/`head`/`tail`, `edit` instead of `sed`/`awk`, and `write` instead of shell redirection or `echo`-based file creation. +- If commands are independent, make multiple bash tool calls in one message so they can run in parallel. +- If commands depend on each other, use a single bash tool call with `&&` to chain them together. +- Use `;` only when you want sequential commands and do not care if earlier ones fail. +{platform_guidance}""" + + +def _build_platform_shell_guidance() -> str: + """Return platform-specific guidance for the bash tool description.""" + if sys.platform != "win32": + return "" + + return """ + +Windows usage: +- On Windows this tool prefers `pwsh` or `powershell`; if neither is available it falls back to `cmd`. +- Write commands in PowerShell syntax rather than GNU bash syntax. +- Avoid bash-only constructs such as heredocs, `cat > file <<'EOF'`, `export NAME=value`, and Unix-only path expansion. +- For multi-step logic, prefer explicit PowerShell commands or a short `python -c` snippet over shell-specific tricks. +- When Python reads text files on Windows, always pass `encoding=...`; prefer `Path(path).read_text(encoding="utf-8-sig")`. 
+""" def _build_error_message( diff --git a/tests/session/test_prompt_tokens.py b/tests/session/test_prompt_tokens.py index dc284560..141a067b 100644 --- a/tests/session/test_prompt_tokens.py +++ b/tests/session/test_prompt_tokens.py @@ -255,31 +255,12 @@ def test_none_model_returns_list(self): class TestPromptToolInstructions: - def test_windows_bash_guidance_mentions_powershell(self): - with patch.object(prompt_strings.platform, "system", return_value="Windows"): - guidance = prompt_strings._build_bash_tool_guidance() - - assert "Windows machine" in guidance - assert "must follow PowerShell syntax" in guidance - assert 'Path(path).read_text(encoding="utf-8-sig")' in guidance - - def test_non_windows_bash_guidance_stays_generic(self): - with patch.object(prompt_strings.platform, "system", return_value="Darwin"): - guidance = prompt_strings._build_bash_tool_guidance() - - assert "Bash Tool Guidance" in guidance - assert "Windows machine" not in guidance - assert "must follow PowerShell syntax" not in guidance - def test_tool_instructions_are_platform_agnostic(self): - with patch.object(prompt_strings.platform, "system", return_value="Windows"): - windows_instructions = prompt_strings._build_tool_instructions() - with patch.object(prompt_strings.platform, "system", return_value="Darwin"): - darwin_instructions = prompt_strings._build_tool_instructions() - - assert windows_instructions == darwin_instructions - assert "PowerShell" not in windows_instructions - assert "must explicitly specify encoding" not in windows_instructions + instructions = prompt_strings._build_tool_instructions() + + assert "PowerShell" not in instructions + assert "must explicitly specify encoding" not in instructions + assert "Bash Tool Guidance" not in instructions def test_tool_instructions_do_not_hardcode_tool_name_mapping(self): instructions = prompt_strings._build_tool_instructions() diff --git a/tests/session/test_runner_step.py b/tests/session/test_runner_step.py index 
457d2e84..e892359e 100644 --- a/tests/session/test_runner_step.py +++ b/tests/session/test_runner_step.py @@ -539,7 +539,6 @@ async def test_build_system_prompts_orders_stable_prefix_before_runtime_tail(sel with patch("flocks.session.prompt.SystemPrompt.provider", return_value=["provider prompt"]), \ patch.object(SessionPrompt, "_build_tool_guidance_prompt", return_value="tool protocol"), \ - patch.object(SessionPrompt, "_build_bash_guidance_prompt", return_value="bash guidance"), \ patch("flocks.session.prompt.SystemPrompt.environment_stable", return_value=["env prompt"]), \ patch("flocks.session.prompt.SystemPrompt.runtime_metadata", return_value=["runtime prompt"]), \ patch("flocks.session.prompt.SystemPrompt.custom", AsyncMock(return_value=["custom prompt"])): @@ -560,7 +559,6 @@ async def test_build_system_prompts_orders_stable_prefix_before_runtime_tail(sel assert prompts == [ "provider prompt", "tool protocol", - "bash guidance", "agent prompt", "memory guidance", "## MEMORY.md\n\nremembered context", @@ -718,8 +716,8 @@ async def test_build_system_prompts_includes_memory_guidance_when_memory_tools_l assert prompts.index("memory guidance") < prompts.index("## MEMORY.md\n\nremembered context") @pytest.mark.asyncio - async def test_build_system_prompts_includes_bash_guidance_when_bash_loaded_on_windows(self): - session = _make_session("ses_prompts_bash_guidance") + async def test_build_system_prompts_does_not_add_bash_guidance_prompt_when_bash_loaded(self): + session = _make_session("ses_prompts_no_bash_guidance") runner = SessionRunner(session=session) agent = _make_agent(name="rex") agent.prompt = "agent prompt" @@ -727,8 +725,7 @@ async def test_build_system_prompts_includes_bash_guidance_when_bash_loaded_on_w with patch("flocks.session.prompt.SystemPrompt.provider", return_value=["provider prompt"]), \ patch("flocks.session.prompt.SystemPrompt.environment_stable", return_value=["env prompt"]), \ patch("flocks.session.prompt.SystemPrompt.runtime_metadata", 
return_value=["runtime prompt"]), \ - patch("flocks.session.prompt.SystemPrompt.custom", AsyncMock(return_value=["custom prompt"])), \ - patch("flocks.session.prompt_strings.platform.system", return_value="Windows"): + patch("flocks.session.prompt.SystemPrompt.custom", AsyncMock(return_value=["custom prompt"])): prompts = await SessionPrompt.build_system_prompts( session_id=session.id, session_directory=session.directory, @@ -740,9 +737,8 @@ async def test_build_system_prompts_includes_bash_guidance_when_bash_loaded_on_w ) combined = "\n\n".join(prompts) - assert "## Bash Tool Guidance" in combined - assert "must follow PowerShell syntax" in combined - assert "explicit PowerShell commands" in combined + assert "## Bash Tool Guidance" not in combined + assert "PowerShell syntax" not in combined @pytest.mark.asyncio async def test_build_system_prompts_skips_memory_guidance_without_memory_tools(self): diff --git a/tests/tool/test_tools.py b/tests/tool/test_tools.py index a7e3b0ca..527a2fa7 100644 --- a/tests/tool/test_tools.py +++ b/tests/tool/test_tools.py @@ -32,6 +32,7 @@ ToolCategory, ParameterType, ) +from flocks.tool.code import bash as bash_module # ============================================================================= @@ -363,7 +364,37 @@ async def test_edit_replace_all(self, tool_context, temp_dir): class TestBashTool: - """Test the bash tool""" + """Test the bash tool.""" + + def test_registered_description_references_dedicated_tools(self): + tool = ToolRegistry.get("bash") + + assert tool is not None + description = tool.info.description + assert "Read file contents -> `read`" in description + assert "Write a new file -> `write`" in description + assert "Edit an existing file -> `edit`" in description + assert "Search file names or directories -> `glob`" in description + assert "Search file contents -> `grep`" in description + assert "Navigate symbols or code structure -> `lsp`" in description + assert "`glob`/`file_search`" in description + + def 
test_get_description_windows_mentions_powershell_guidance(self, monkeypatch): + monkeypatch.setattr(bash_module.sys, "platform", "win32") + + description = bash_module.get_description("/workspace") + + assert "prefers `pwsh` or `powershell`" in description + assert "PowerShell syntax rather than GNU bash syntax" in description + assert 'Path(path).read_text(encoding="utf-8-sig")' in description + + def test_get_description_non_windows_omits_windows_guidance(self, monkeypatch): + monkeypatch.setattr(bash_module.sys, "platform", "linux") + + description = bash_module.get_description("/workspace") + + assert "PowerShell syntax rather than GNU bash syntax" not in description + assert 'Path(path).read_text(encoding="utf-8-sig")' not in description @pytest.mark.asyncio async def test_bash_simple_command(self, tool_context):