62 changes: 62 additions & 0 deletions openhands-sdk/openhands/sdk/context/condenser/base.py
@@ -57,6 +57,68 @@ def handles_condensation_requests(self) -> bool:
"""
return False

# Shared token-budget utilities (for condenser authors)
@staticmethod
def compute_token_budget(llm, token_margin_ratio: float) -> int | None:
"""Compute usable input-token budget for a target LLM.

Returns an integer budget (>= 0) or None if limits are unknown.
"""
try:
max_input = getattr(llm, "max_input_tokens", None)
if not max_input:
return None
max_output = getattr(llm, "max_output_tokens", 0) or 0
headroom = int(max_input * token_margin_ratio)
return max(0, int(max_input) - int(max_output) - headroom)
except Exception:
return None

@staticmethod
def estimate_token_count(llm, events) -> int:
"""Estimate tokens for a sequence of LLMConvertibleEvent using the given LLM.

Falls back to 0 on failure.
"""
try:
from openhands.sdk.event.base import LLMConvertibleEvent

messages = LLMConvertibleEvent.events_to_messages(list(events))
return int(llm.get_token_count(messages))
except Exception:
return 0

@staticmethod
def max_tail_within_budget(view: View, llm, keep_first: int, budget: int) -> int:
"""Binary-search the longest tail we can keep under the token budget.

Counts tokens using the provided LLM. The head is fixed to the first keep_first events.
Returns the number of tail events to keep (>= 0).
"""
from openhands.sdk.event.base import LLMConvertibleEvent

head = view[:keep_first]
total_len = len(view)
max_tail_possible = max(0, total_len - keep_first)
low, high = 0, max_tail_possible
best = 0
while low <= high:
mid = (low + high) // 2
kept_events = list(head) + (list(view[-mid:]) if mid > 0 else [])
msgs = LLMConvertibleEvent.events_to_messages(kept_events)
try:
t = int(llm.get_token_count(msgs))
except Exception:
# If counting fails, be conservative and stop expanding
high = mid - 1
continue
if t <= budget:
best = mid
low = mid + 1
else:
high = mid - 1
return best


class PipelinableCondenserBase(CondenserBase):
"""Abstract condenser interface which may be pipelined. (Since a pipeline
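The three static helpers above are intended as shared utilities for condenser authors. Below is a minimal sketch of how they might be combined, assuming the import paths shown in this diff; `StubLLM` and `plan_tail` are hypothetical names used only for illustration, and the stub models just the attributes the helpers actually read (`max_input_tokens`, `max_output_tokens`, `get_token_count`):

```python
from openhands.sdk.context.condenser.base import CondenserBase
from openhands.sdk.context.view import View


class StubLLM:
    """Hypothetical stand-in exposing only what the helpers above rely on."""

    max_input_tokens = 8_000
    max_output_tokens = 1_000

    def get_token_count(self, messages) -> int:
        # Crude estimate: roughly 4 characters per token of rendered text.
        return sum(len(str(m)) for m in messages) // 4


def plan_tail(view: View, keep_first: int = 4) -> int | None:
    """Return how many tail events to keep, or None if no trimming is needed."""
    llm = StubLLM()
    # With the stub limits: 8000 - 1000 - int(8000 * 0.1) = 6200 usable tokens.
    budget = CondenserBase.compute_token_budget(llm, token_margin_ratio=0.1)
    if budget is None:
        return None  # model limits unknown; a count-based policy would apply
    if CondenserBase.estimate_token_count(llm, view.events) <= budget:
        return None  # the whole view already fits in the budget
    # Longest tail (after the fixed head) that still fits within the budget.
    return CondenserBase.max_tail_within_budget(
        view=view, llm=llm, keep_first=keep_first, budget=budget
    )
```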
@@ -2,7 +2,7 @@

from pydantic import Field, model_validator

from openhands.sdk.context.condenser.base import RollingCondenser
from openhands.sdk.context.condenser.base import CondenserBase, RollingCondenser
from openhands.sdk.context.prompts import render_template
from openhands.sdk.context.view import View
from openhands.sdk.event.condenser import Condensation
@@ -15,6 +15,9 @@ class LLMSummarizingCondenser(RollingCondenser):
llm: LLM
max_size: int = Field(default=120, gt=0)
keep_first: int = Field(default=4, ge=0)
token_margin_ratio: float = Field(
default=0.1, ge=0.0, le=0.5
)  # fraction of the context window reserved as headroom

@model_validator(mode="after")
def validate_keep_first_vs_max_size(self):
@@ -32,19 +35,50 @@ def handles_condensation_requests(self) -> bool:
def should_condense(self, view: View) -> bool:
if view.unhandled_condensation_request:
return True

# Prefer the token-aware check when the LLM exposes context-window limits
# and we can estimate message tokens; fall back to event-count otherwise.
try:
budget = CondenserBase.compute_token_budget(
self.llm, self.token_margin_ratio
)
if budget is not None:
total_tokens = CondenserBase.estimate_token_count(self.llm, view.events)
return total_tokens > budget
except Exception:
# Any failure falls back to count-based behavior
pass

return len(view) > self.max_size
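For a concrete sense of when the token-aware trigger fires, here is a worked sketch of the budget arithmetic; the model limits are illustrative assumptions, not values taken from this PR:

```python
# Illustrative limits; a model without max_input_tokens skips this path.
max_input_tokens = 128_000
max_output_tokens = 8_000
token_margin_ratio = 0.1  # default declared on the condenser above

headroom = int(max_input_tokens * token_margin_ratio)             # 12_800
budget = max(0, max_input_tokens - max_output_tokens - headroom)  # 107_200

# should_condense() returns True once the estimated token count of the view's
# messages exceeds 107_200. If max_input_tokens were unset, the budget would be
# None and the original `len(view) > max_size` check (default 120) applies.
```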

@observe(ignore_inputs=["view"])
def get_condensation(self, view: View) -> Condensation:
head = view[: self.keep_first]
target_size = self.max_size // 2
if view.unhandled_condensation_request:
# Condensation triggered by a condensation request
# should be calculated based on the view size.
target_size = len(view) // 2
# Number of events to keep from the tail -- target size, minus however many
# prefix events from the head, minus one for the summarization event
events_from_tail = target_size - len(head) - 1

# Prefer token-aware trimming if we have model limits; otherwise
# fall back to event-count based trimming as before.
events_from_tail: int | None = None
try:
budget = CondenserBase.compute_token_budget(
self.llm, self.token_margin_ratio
)
if budget is not None:
# Binary search the max tail we can keep within budget
events_from_tail = CondenserBase.max_tail_within_budget(
view=view, llm=self.llm, keep_first=self.keep_first, budget=budget
)
except Exception:
events_from_tail = None

if events_from_tail is None:
target_size = self.max_size // 2
if view.unhandled_condensation_request:
# Condensation triggered by a condensation request
# should be calculated based on the view size.
target_size = len(view) // 2
# Number of events to keep from the tail -- target size, minus however many
# prefix events from the head, minus one for the summarization event
events_from_tail = max(0, target_size - len(head) - 1)

summary_event_content: str = ""

@@ -55,7 +89,11 @@ def get_condensation(self, view: View) -> Condensation:
summary_event_content = message_content.text

# Identify events to be forgotten (those not in head or tail)
forgotten_events = view[self.keep_first : -events_from_tail]
forgotten_events = (
view[self.keep_first : -events_from_tail]
if events_from_tail > 0
else view[self.keep_first :]
)

# Convert events to strings for the template
event_strings = [str(forgotten_event) for forgotten_event in forgotten_events]
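To make the head/tail/forgotten split concrete, here is a small sketch with hypothetical sizes; it also shows why the guard around the negative slice matters:

```python
# Hypothetical sizes: a 50-event view, keep_first=4, and a budget for which
# max_tail_within_budget() reports that a 20-event tail still fits.
keep_first = 4
events_from_tail = 20

head = slice(None, keep_first)                    # view[:4]    -> events 0..3
forgotten = slice(keep_first, -events_from_tail)  # view[4:-20] -> events 4..29
tail = slice(-events_from_tail, None)             # view[-20:]  -> events 30..49

# Events 4..29 are summarized into a single Condensation event; the head and
# tail are kept verbatim. If events_from_tail were 0, the old expression
# view[4:-0] would evaluate as view[4:0] and forget nothing, which is why the
# new code falls back to view[4:] in that case.
```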