Skip to content

Commit 5c8e056

Browse files
committed
perf(cache): upgrade LlamaRAMCache to O(1) eviction and set LlamaTrieCache as default
Addressed severe performance bottlenecks in legacy RAM caching components:

- Refactored `LlamaRAMCache` to use an O(1) `_current_size` tracker instead of an O(N) dynamic sum. This eliminates massive CPU spikes and O(N^2) complexity during LRU eviction cycles.
- Added strict OOM safeguards to `LlamaRAMCache`: the current size is explicitly clamped to 0 during evictions, and hard-reset to 0 if the cache empties, preventing catastrophic capacity drift.
- Introduced early-exit O(1) short-circuits in `__getitem__` and `__contains__` to bypass expensive prefix searches when the cache is empty.
- Updated the `LlamaCache` backward-compatibility alias to point to the highly optimized `LlamaTrieCache` instead of the legacy `LlamaRAMCache`.

Signed-off-by: JamePeng <jame_peng@sina.com>
1 parent 4bec165 commit 5c8e056

1 file changed

Lines changed: 23 additions & 5 deletions

File tree

llama_cpp/llama_cache.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -107,18 +107,22 @@ def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"):
107107

108108

109109
class LlamaRAMCache(BaseLlamaCache):
110-
"""Cache for a llama.cpp model using RAM."""
110+
"""
111+
RAM cache for a llama.cpp model.
112+
Maintains an LRU eviction policy with O(1) size tracking.
113+
"""
111114

112115
def __init__(self, capacity_bytes: int = (2 << 30)):
113116
super().__init__(capacity_bytes)
114117
self.capacity_bytes = capacity_bytes
115118
self.cache_state: OrderedDict[
116119
Tuple[int, ...], "llama_core.LlamaState"
117120
] = OrderedDict()
121+
self._current_size = 0
118122

119123
@property
120124
def cache_size(self):
121-
return sum([state.llama_state_size for state in self.cache_state.values()])
125+
return self._current_size
122126

123127
def _find_longest_prefix_key(
124128
self,
@@ -137,6 +141,9 @@ def _find_longest_prefix_key(
137141
return min_key
138142

139143
def __getitem__(self, key: Sequence[int]) -> "llama_core.LlamaState":
144+
if not self.cache_state:
145+
raise KeyError("Cache is empty")
146+
140147
key = tuple(key)
141148
_key = self._find_longest_prefix_key(key)
142149
if _key is None:
@@ -146,15 +153,26 @@ def __getitem__(self, key: Sequence[int]) -> "llama_core.LlamaState":
146153
return value
147154

148155
def __contains__(self, key: Sequence[int]) -> bool:
156+
if not self.cache_state:
157+
return False
158+
149159
return self._find_longest_prefix_key(tuple(key)) is not None
150160

151161
def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"):
152162
key = tuple(key)
153163
if key in self.cache_state:
154164
del self.cache_state[key]
165+
155166
self.cache_state[key] = value
156-
while self.cache_size > self.capacity_bytes and len(self.cache_state) > 0:
157-
self.cache_state.popitem(last=False)
167+
self._current_size += value.llama_state_size
168+
169+
while self._current_size > self.capacity_bytes and len(self.cache_state) > 0:
170+
_, popped_state = self.cache_state.popitem(last=False)
171+
self._current_size -= popped_state.llama_state_size
172+
self._current_size = max(0, self._current_size)
173+
174+
if len(self.cache_state) == 0:
175+
self._current_size = 0
158176

159177

160178
class TrieNode:
@@ -316,7 +334,7 @@ def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"):
316334
self._prune(evicted_key)
317335

318336
# Alias for backwards compatibility: LlamaCache now resolves to the
# trie-based implementation rather than the legacy LlamaRAMCache.
LlamaCache = LlamaTrieCache
320338

321339

322340
@dataclass

0 commit comments

Comments
 (0)