Skip to content

Commit d2318a4

Browse files
committed
chore: enhance hybrid cache logging and document M-RoPE token usage
- Added explanatory comments detailing why `n_tokens` is used instead of `chunk_n_pos` for M-RoPE models (to prevent the system from skipping evaluation).
- Added verbose logging for hybrid-cache clearance scenarios (when checkpoints are missing, restore fails, or `max_checkpoints` is 0).

Signed-off-by: JamePeng <jame_peng@sina.com>
1 parent bdc2d7c commit d2318a4

1 file changed

Lines changed: 8 additions & 0 deletions

File tree

llama_cpp/llama_chat_format.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3185,6 +3185,10 @@ def _create_bitmap_func(idx: int, item: str):
31853185
self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO
31863186
]:
31873187
# Extract media properties
3188+
# Note(JamePeng):
3189+
# The M-RoPE model uses `n_pos` instead of `n_tokens` (there is no difference between the two for non-M-RoPE models).
3190+
# However, I still keep `n_tokens`, because if `n_pos` were used, the underlying system would assume a full match and skip eval and sampling.
3191+
# chunk_n_pos = self._mtmd_cpp.mtmd_input_chunk_get_n_pos(chunk) # equals to max(t,h,w) for M-RoPE; equals to `n_tokens` otherwise
31883192
chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk)
31893193

31903194
if media_items_cur < media_items_count:
@@ -3318,10 +3322,14 @@ def __call__(
33183322
if self.verbose:
33193323
print(f"{self.log_prefix}(__call__): Successfully rolled back to checkpoint at pos {llama.n_tokens}.", file=sys.stderr)
33203324
else:
3325+
if self.verbose:
3326+
print(f"{self.log_prefix}(__call__): No suitable checkpoint found or restore failed. Clearing hybrid cache entirely.", file=sys.stderr)
33213327
llama._hybrid_cache_mgr.clear()
33223328
llama._ctx.memory_clear(True)
33233329
llama.n_tokens = 0
33243330
else:
3331+
if self.verbose:
3332+
print(f"{self.log_prefix}(__call__): Hybrid cache enabled but max_checkpoints is 0. Clearing cache entirely.", file=sys.stderr)
33253333
llama._hybrid_cache_mgr.clear()
33263334
llama._ctx.memory_clear(True)
33273335
llama.n_tokens = 0

0 commit comments

Comments (0)