From c633b1ca552c290de8419b413bbced14996eeb8b Mon Sep 17 00:00:00 2001 From: vinci Date: Wed, 11 Mar 2026 17:32:50 +0800 Subject: [PATCH 1/4] feat: add ContextCompressor for context overflow handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new context compression mechanism inspired by opencode's compaction approach. Features include: - Token-based overflow detection - Tool output pruning to reduce context size - LLM-based conversation summarization 🤖 Generated with [Qoder](https://qoder.com) --- ms_agent/memory/condenser/__init__.py | 6 + .../memory/condenser/context_compressor.py | 210 ++++++++++++++++++ ms_agent/memory/utils.py | 2 + 3 files changed, 218 insertions(+) create mode 100644 ms_agent/memory/condenser/context_compressor.py diff --git a/ms_agent/memory/condenser/__init__.py b/ms_agent/memory/condenser/__init__.py index e69de29bb..0ece6c459 100644 --- a/ms_agent/memory/condenser/__init__.py +++ b/ms_agent/memory/condenser/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) ModelScope Contributors. All rights reserved. +from .code_condenser import CodeCondenser +from .context_compressor import ContextCompressor +from .refine_condenser import RefineCondenser + +__all__ = ['CodeCondenser', 'RefineCondenser', 'ContextCompressor'] diff --git a/ms_agent/memory/condenser/context_compressor.py b/ms_agent/memory/condenser/context_compressor.py new file mode 100644 index 000000000..855157015 --- /dev/null +++ b/ms_agent/memory/condenser/context_compressor.py @@ -0,0 +1,210 @@ +# Copyright (c) ModelScope Contributors. All rights reserved. +""" +Context Compressor - Inspired by opencode's context compaction mechanism. + +Core concepts: +1. Token overflow detection - Monitor token usage against context limits +2. Tool output pruning - Compress old tool call outputs to save context +3. 
Summary compaction - Use LLM to generate conversation summary + +Reference: desktop/opencode/packages/opencode/src/session/compaction.ts +""" + +from typing import List, Optional + +import json +from ms_agent.llm import LLM, Message +from ms_agent.memory import Memory +from ms_agent.utils.logger import logger + + +# Default summary prompt template (from opencode) +SUMMARY_PROMPT = """Summarize this conversation to help continue the work. + +Focus on: +- Goal: What is the user trying to accomplish? +- Instructions: Important user requirements or constraints +- Discoveries: Notable findings during the conversation +- Accomplished: What's done, in progress, and remaining +- Relevant files: Files read, edited, or created + +Keep it concise but comprehensive enough for another agent to continue.""" + + +class ContextCompressor(Memory): + """Context compression tool inspired by opencode's compaction mechanism. + + Features: + 1. Token-based overflow detection + 2. Tool output pruning for old tool calls + 3. LLM-based conversation summarization + """ + + def __init__(self, config): + super().__init__(config) + mem_config = getattr(config.memory, 'context_compressor', None) + if mem_config is None: + mem_config = config.memory + + # Token thresholds (inspired by opencode's PRUNE constants) + self.context_limit = getattr(mem_config, 'context_limit', 128000) + self.prune_protect = getattr(mem_config, 'prune_protect', 40000) + self.prune_minimum = getattr(mem_config, 'prune_minimum', 20000) + self.reserved_buffer = getattr(mem_config, 'reserved_buffer', 20000) + + # Summary prompt + self.summary_prompt = getattr(mem_config, 'summary_prompt', + SUMMARY_PROMPT) + + # LLM for summarization + self.llm: Optional[LLM] = None + if getattr(mem_config, 'enable_summary', True): + try: + self.llm = LLM.from_config(config) + except Exception as e: + logger.warning(f'Failed to init LLM for summary: {e}') + + def estimate_tokens(self, text: str) -> int: + """Estimate token count from text. 
+ Simple heuristic: ~4 chars per token for mixed content. + """ + if not text: + return 0 + return len(text) // 4 + + def estimate_message_tokens(self, msg: Message) -> int: + """Estimate tokens for a single message.""" + total = 0 + if msg.content: + content = msg.content if isinstance(msg.content, + str) else json.dumps( + msg.content, + ensure_ascii=False) + total += self.estimate_tokens(content) + if msg.tool_calls: + total += self.estimate_tokens(json.dumps(msg.tool_calls)) + if msg.reasoning_content: + total += self.estimate_tokens(msg.reasoning_content) + return total + + def estimate_total_tokens(self, messages: List[Message]) -> int: + """Estimate total tokens for all messages.""" + return sum(self.estimate_message_tokens(m) for m in messages) + + def is_overflow(self, messages: List[Message]) -> bool: + """Check if messages exceed context limit.""" + total = self.estimate_total_tokens(messages) + usable = self.context_limit - self.reserved_buffer + return total >= usable + + def prune_tool_outputs(self, messages: List[Message]) -> List[Message]: + """Prune old tool outputs to reduce context size. 
+ + Strategy (from opencode): + - Scan backwards through messages + - Protect the most recent tool outputs (prune_protect tokens) + - Truncate older tool outputs + """ + result = [] + total_tool_tokens = 0 + pruned_count = 0 + + # Process in reverse to protect recent outputs + for msg in reversed(messages): + if msg.role == 'tool' and msg.content: + content_str = msg.content if isinstance( + msg.content, str) else json.dumps(msg.content, + ensure_ascii=False) + tokens = self.estimate_tokens(content_str) + total_tool_tokens += tokens + + # Prune if beyond protection threshold + if total_tool_tokens > self.prune_protect: + msg = Message( + role=msg.role, + content='[Output truncated to save context]', + tool_call_id=msg.tool_call_id, + name=msg.name, + ) + pruned_count += 1 + result.append(msg) + + if pruned_count > 0: + logger.info(f'Pruned {pruned_count} tool outputs') + + return list(reversed(result)) + + def summarize(self, messages: List[Message]) -> Optional[str]: + """Generate conversation summary using LLM.""" + if not self.llm: + return None + + # Build conversation text for summarization + conv_parts = [] + for msg in messages: + role = msg.role.upper() + content = msg.content if isinstance(msg.content, str) else str( + msg.content) + if content: + conv_parts.append(f'{role}: {content[:2000]}') + + conversation = '\n'.join(conv_parts) + query = f'{self.summary_prompt}\n\n---\n{conversation}' + + try: + response = self.llm.generate( + [Message(role='user', content=query)], stream=False) + return response.content + except Exception as e: + logger.error(f'Summary generation failed: {e}') + return None + + def compress(self, messages: List[Message]) -> List[Message]: + """Compress messages when context overflows. + + Steps: + 1. Try pruning tool outputs first + 2. 
If still overflow, generate summary and replace history + """ + if not self.is_overflow(messages): + return messages + + logger.info('Context overflow detected, starting compression') + + # Step 1: Prune tool outputs + pruned = self.prune_tool_outputs(messages) + if not self.is_overflow(pruned): + return pruned + + # Step 2: Generate summary + summary = self.summarize(messages) + if not summary: + logger.warning('Summary failed, returning pruned messages') + return pruned + + # Keep system prompt and replace history with summary + result = [] + for msg in messages: + if msg.role == 'system': + result.append(msg) + break + + result.append( + Message( + role='user', + content=f'[Conversation Summary]\n{summary}\n\n' + 'Please continue based on this summary.')) + + # Keep the most recent user message if different + if messages and messages[-1].role == 'user': + last_user = messages[-1] + if last_user.content and last_user.content != result[-1].content: + result.append(last_user) + + logger.info( + f'Compressed {len(messages)} messages to {len(result)} messages') + return result + + async def run(self, messages: List[Message]) -> List[Message]: + """Main entry point for context compression.""" + return self.compress(messages) diff --git a/ms_agent/memory/utils.py b/ms_agent/memory/utils.py index ae0e1223f..b7e20ad30 100644 --- a/ms_agent/memory/utils.py +++ b/ms_agent/memory/utils.py @@ -2,6 +2,7 @@ from omegaconf import DictConfig, OmegaConf from .condenser.code_condenser import CodeCondenser +from .condenser.context_compressor import ContextCompressor from .condenser.refine_condenser import RefineCondenser from .default_memory import DefaultMemory from .diversity import Diversity @@ -11,6 +12,7 @@ 'diversity': Diversity, 'code_condenser': CodeCondenser, 'refine_condenser': RefineCondenser, + 'context_compressor': ContextCompressor, } From 02809649f2fc89b090a27b419b24a49ff4bc057b Mon Sep 17 00:00:00 2001 From: vinci Date: Fri, 20 Mar 2026 14:50:53 +0800 Subject: 
[PATCH 2/4] fix lint error --- ms_agent/memory/condenser/context_compressor.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/ms_agent/memory/condenser/context_compressor.py b/ms_agent/memory/condenser/context_compressor.py index 855157015..06a7ce75e 100644 --- a/ms_agent/memory/condenser/context_compressor.py +++ b/ms_agent/memory/condenser/context_compressor.py @@ -17,7 +17,6 @@ from ms_agent.memory import Memory from ms_agent.utils.logger import logger - # Default summary prompt template (from opencode) SUMMARY_PROMPT = """Summarize this conversation to help continue the work. @@ -76,10 +75,9 @@ def estimate_message_tokens(self, msg: Message) -> int: """Estimate tokens for a single message.""" total = 0 if msg.content: - content = msg.content if isinstance(msg.content, - str) else json.dumps( - msg.content, - ensure_ascii=False) + content = msg.content if isinstance( + msg.content, str) else json.dumps( + msg.content, ensure_ascii=False) total += self.estimate_tokens(content) if msg.tool_calls: total += self.estimate_tokens(json.dumps(msg.tool_calls)) @@ -113,8 +111,8 @@ def prune_tool_outputs(self, messages: List[Message]) -> List[Message]: for msg in reversed(messages): if msg.role == 'tool' and msg.content: content_str = msg.content if isinstance( - msg.content, str) else json.dumps(msg.content, - ensure_ascii=False) + msg.content, str) else json.dumps( + msg.content, ensure_ascii=False) tokens = self.estimate_tokens(content_str) total_tool_tokens += tokens @@ -152,8 +150,8 @@ def summarize(self, messages: List[Message]) -> Optional[str]: query = f'{self.summary_prompt}\n\n---\n{conversation}' try: - response = self.llm.generate( - [Message(role='user', content=query)], stream=False) + response = self.llm.generate([Message(role='user', content=query)], + stream=False) return response.content except Exception as e: logger.error(f'Summary generation failed: {e}') From 0702ce84546bb74f345f145240adde9e63ba97e2 Mon Sep 17 
00:00:00 2001 From: "vinci.grape" Date: Mon, 23 Mar 2026 17:41:41 +0800 Subject: [PATCH 3/4] fix typos --- .../memory/condenser/context_compressor.py | 22 ++++++------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/ms_agent/memory/condenser/context_compressor.py b/ms_agent/memory/condenser/context_compressor.py index 06a7ce75e..88ab8f997 100644 --- a/ms_agent/memory/condenser/context_compressor.py +++ b/ms_agent/memory/condenser/context_compressor.py @@ -1,14 +1,4 @@ # Copyright (c) ModelScope Contributors. All rights reserved. -""" -Context Compressor - Inspired by opencode's context compaction mechanism. - -Core concepts: -1. Token overflow detection - Monitor token usage against context limits -2. Tool output pruning - Compress old tool call outputs to save context -3. Summary compaction - Use LLM to generate conversation summary - -Reference: desktop/opencode/packages/opencode/src/session/compaction.ts -""" from typing import List, Optional @@ -31,12 +21,14 @@ class ContextCompressor(Memory): - """Context compression tool inspired by opencode's compaction mechanism. + """Context Compressor - Inspired by opencode's context compaction mechanism. + + Core concepts: + 1. Token overflow detection - Monitor token usage against context limits + 2. Tool output pruning - Compress old tool call outputs to save context + 3. Summary compaction - Use LLM to generate conversation summary - Features: - 1. Token-based overflow detection - 2. Tool output pruning for old tool calls - 3. 
LLM-based conversation summarization + Reference: opencode/packages/opencode/src/session/compaction.ts """ def __init__(self, config): From 89d7e4b69a591cb4b81255b27b9c535d6ac7b048 Mon Sep 17 00:00:00 2001 From: "vinci.grape" Date: Mon, 23 Mar 2026 20:15:26 +0800 Subject: [PATCH 4/4] fix memory --- .../memory/condenser/context_compressor.py | 67 ++++++++++++------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/ms_agent/memory/condenser/context_compressor.py b/ms_agent/memory/condenser/context_compressor.py index 88ab8f997..9bec9bcf1 100644 --- a/ms_agent/memory/condenser/context_compressor.py +++ b/ms_agent/memory/condenser/context_compressor.py @@ -63,8 +63,8 @@ def estimate_tokens(self, text: str) -> int: return 0 return len(text) // 4 - def estimate_message_tokens(self, msg: Message) -> int: - """Estimate tokens for a single message.""" + def _estimate_message_tokens_from_content(self, msg: Message) -> int: + """Heuristic token count from message body (no API usage fields).""" total = 0 if msg.content: content = msg.content if isinstance( @@ -77,8 +77,34 @@ def estimate_message_tokens(self, msg: Message) -> int: total += self.estimate_tokens(msg.reasoning_content) return total + def estimate_message_tokens(self, msg: Message) -> int: + """Tokens for one message: prefer ``Message`` usage, else content heuristic.""" + pt = int(getattr(msg, 'prompt_tokens', 0) or 0) + ct = int(getattr(msg, 'completion_tokens', 0) or 0) + if pt or ct: + return pt + ct + return self._estimate_message_tokens_from_content(msg) + def estimate_total_tokens(self, messages: List[Message]) -> int: - """Estimate total tokens for all messages.""" + """Total tokens for the conversation.""" + last_usage_idx = -1 + for i in range(len(messages) - 1, -1, -1): + m = messages[i] + if m.role != 'assistant': + continue + pt = int(getattr(m, 'prompt_tokens', 0) or 0) + ct = int(getattr(m, 'completion_tokens', 0) or 0) + if pt or ct: + last_usage_idx = i + break + if last_usage_idx >= 
0: + m = messages[last_usage_idx] + base = int(getattr(m, 'prompt_tokens', 0) or 0) + int( + getattr(m, 'completion_tokens', 0) or 0) + tail = sum( + self._estimate_message_tokens_from_content(x) + for x in messages[last_usage_idx + 1:]) + return base + tail return sum(self.estimate_message_tokens(m) for m in messages) def is_overflow(self, messages: List[Message]) -> bool: @@ -95,34 +121,27 @@ def prune_tool_outputs(self, messages: List[Message]) -> List[Message]: - Protect the most recent tool outputs (prune_protect tokens) - Truncate older tool outputs """ - result = [] total_tool_tokens = 0 pruned_count = 0 - # Process in reverse to protect recent outputs - for msg in reversed(messages): - if msg.role == 'tool' and msg.content: - content_str = msg.content if isinstance( - msg.content, str) else json.dumps( - msg.content, ensure_ascii=False) - tokens = self.estimate_tokens(content_str) - total_tool_tokens += tokens - - # Prune if beyond protection threshold - if total_tool_tokens > self.prune_protect: - msg = Message( - role=msg.role, - content='[Output truncated to save context]', - tool_call_id=msg.tool_call_id, - name=msg.name, - ) - pruned_count += 1 - result.append(msg) + for idx in range(len(messages) - 1, -1, -1): + msg = messages[idx] + if msg.role != 'tool' or not msg.content: + continue + content_str = msg.content if isinstance( + msg.content, str) else json.dumps( + msg.content, ensure_ascii=False) + tokens = self.estimate_tokens(content_str) + total_tool_tokens += tokens + + if total_tool_tokens > self.prune_protect: + msg.content = '[Output truncated to save context]' + pruned_count += 1 if pruned_count > 0: logger.info(f'Pruned {pruned_count} tool outputs') - return list(reversed(result)) + return messages def summarize(self, messages: List[Message]) -> Optional[str]: """Generate conversation summary using LLM."""