Skip to content

Commit 5c8e056

Browse files
committed
perf(cache): upgrade LlamaRAMCache to O(1) eviction and set LlamaTrieCache as default
Addressed severe performance bottlenecks in legacy RAM caching components:

- Refactored `LlamaRAMCache` to use an O(1) `_current_size` tracker instead of an O(N) dynamic sum. This eliminates massive CPU spikes and O(N^2) complexity during LRU eviction cycles.
- Added strict OOM safeguards to `LlamaRAMCache`: the current size is explicitly clamped to 0 during evictions, and hard-reset to 0 if the cache empties, preventing catastrophic capacity drift.
- Introduced early-exit O(1) short-circuits in `__getitem__` and `__contains__` to bypass expensive prefix searches when the cache is empty.
- Updated the `LlamaCache` backward-compatibility alias to point to the highly optimized `LlamaTrieCache` instead of the legacy `LlamaRAMCache`.

Signed-off-by: JamePeng <jame_peng@sina.com>
1 parent 4bec165 commit 5c8e056

1 file changed

Lines changed: 23 additions & 5 deletions

File tree

llama_cpp/llama_cache.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -107,18 +107,22 @@ def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"):
107107

108108

109109
class LlamaRAMCache(BaseLlamaCache):
110-
"""Cache for a llama.cpp model using RAM."""
110+
"""
111+
RAM cache for a llama.cpp model.
112+
Maintains an LRU eviction policy with O(1) size tracking.
113+
"""
111114

112115
def __init__(self, capacity_bytes: int = (2 << 30)):
113116
super().__init__(capacity_bytes)
114117
self.capacity_bytes = capacity_bytes
115118
self.cache_state: OrderedDict[
116119
Tuple[int, ...], "llama_core.LlamaState"
117120
] = OrderedDict()
121+
self._current_size = 0
118122

119123
@property
120124
def cache_size(self):
121-
return sum([state.llama_state_size for state in self.cache_state.values()])
125+
return self._current_size
122126

123127
def _find_longest_prefix_key(
124128
self,
@@ -137,6 +141,9 @@ def _find_longest_prefix_key(
137141
return min_key
138142

139143
def __getitem__(self, key: Sequence[int]) -> "llama_core.LlamaState":
144+
if not self.cache_state:
145+
raise KeyError("Cache is empty")
146+
140147
key = tuple(key)
141148
_key = self._find_longest_prefix_key(key)
142149
if _key is None:
@@ -146,15 +153,26 @@ def __getitem__(self, key: Sequence[int]) -> "llama_core.LlamaState":
146153
return value
147154

148155
def __contains__(self, key: Sequence[int]) -> bool:
156+
if not self.cache_state:
157+
return False
158+
149159
return self._find_longest_prefix_key(tuple(key)) is not None
150160

151161
def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"):
152162
key = tuple(key)
153163
if key in self.cache_state:
154164
del self.cache_state[key]
165+
155166
self.cache_state[key] = value
156-
while self.cache_size > self.capacity_bytes and len(self.cache_state) > 0:
157-
self.cache_state.popitem(last=False)
167+
self._current_size += value.llama_state_size
168+
169+
while self._current_size > self.capacity_bytes and len(self.cache_state) > 0:
170+
_, popped_state = self.cache_state.popitem(last=False)
171+
self._current_size -= popped_state.llama_state_size
172+
self._current_size = max(0, self._current_size)
173+
174+
if len(self.cache_state) == 0:
175+
self._current_size = 0
158176

159177

160178
class TrieNode:
@@ -316,7 +334,7 @@ def __setitem__(self, key: Sequence[int], value: "llama_core.LlamaState"):
316334
self._prune(evicted_key)
317335

318336
# Alias for backwards compatibility: LlamaCache now resolves to the
# trie-based implementation rather than the legacy LlamaRAMCache.
LlamaCache = LlamaTrieCache
320338

321339

322340
@dataclass

0 commit comments

Comments
 (0)