Skip to content

Commit 18aa31e

Browse files
authored
feat: Update llama.cpp to ggml-org/llama.cpp@49bfdde (abetlen#2151)
* Update llama.cpp and sync bindings
* Clean up binding compatibility shims
* Remove flash attention property shim
* Remove mtmd verbosity shim
* Add docstrings for new bindings
* Format Ruff files and add changelog entry
1 parent a9b4a06 commit 18aa31e

File tree

10 files changed

+714
-329
lines changed

10 files changed

+714
-329
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
- feat: Update llama.cpp to ggerganov/llama.cpp@49bfddeca18e62fa3d39114a23e9fcbdf8a22388 and sync Python bindings by @abetlen in #2151
1011
- chore(dev): Add Ruff-based formatting and a safe lint baseline, and run it in CI for pull requests and pushes to `main`
1112
- fix(ci): Run macOS CI on supported Apple Silicon and Intel runners by @abetlen in #2150
1213
- fix(ci): Use the `hf` CLI instead of the deprecated `huggingface-cli` name in GitHub Actions and docs by @abetlen in #2149

CMakeLists.txt

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,26 @@ if (LLAMA_BUILD)
153153
add_compile_definitions(GGML_USE_METAL)
154154
endif()
155155

156+
# Upstream mtmd expects LLAMA_INSTALL_VERSION to be set by llama.cpp's
157+
# top-level CMakeLists.txt. When we include tools/mtmd directly from the
158+
# Python package build, that directory scope is skipped.
159+
if (NOT DEFINED LLAMA_INSTALL_VERSION OR "${LLAMA_INSTALL_VERSION}" STREQUAL "")
160+
set(LLAMA_INSTALL_VERSION 0.0.0)
161+
find_package(Git QUIET)
162+
if (Git_FOUND)
163+
execute_process(
164+
COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
165+
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp
166+
OUTPUT_VARIABLE LLAMA_MTMD_BUILD_NUMBER
167+
OUTPUT_STRIP_TRAILING_WHITESPACE
168+
RESULT_VARIABLE LLAMA_MTMD_BUILD_NUMBER_RESULT
169+
)
170+
if (LLAMA_MTMD_BUILD_NUMBER_RESULT EQUAL 0)
171+
set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_MTMD_BUILD_NUMBER})
172+
endif()
173+
endif()
174+
endif()
175+
156176
# Building llava
157177
add_subdirectory(vendor/llama.cpp/tools/mtmd)
158178

Makefile

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,6 @@ run-server:
8282
python3 -m llama_cpp.server --model ${MODEL}
8383

8484
clean:
85-
- cd vendor/llama.cpp && make clean
86-
- cd vendor/llama.cpp && rm libllama.so
8785
- rm -rf _skbuild
8886
- rm llama_cpp/lib/*.so
8987
- rm llama_cpp/lib/*.dylib

llama_cpp/_internals.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import os
44
import ctypes
5+
import warnings
56

67
from typing import (
78
Dict,
@@ -699,8 +700,11 @@ def add_dist(self, seed: int):
699700
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
700701

701702
def add_softmax(self):
702-
sampler = llama_cpp.llama_sampler_init_softmax()
703-
llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
703+
warnings.warn(
704+
"add_softmax is deprecated; llama_sampler_init_dist now samples directly from logits",
705+
DeprecationWarning,
706+
stacklevel=2,
707+
)
704708

705709
def add_top_k(self, k: int):
706710
sampler = llama_cpp.llama_sampler_init_top_k(k)

llama_cpp/llama.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,11 @@ def __init__(
341341
self._logits_all = logits_all if draft_model is None else True
342342
self.context_params.embeddings = embedding # TODO: Rename to embeddings
343343
self.context_params.offload_kqv = offload_kqv
344-
self.context_params.flash_attn = flash_attn
344+
self.context_params.flash_attn_type = (
345+
llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
346+
if flash_attn
347+
else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
348+
)
345349

346350
if op_offload is not None:
347351
self.context_params.op_offload = op_offload
@@ -431,9 +435,9 @@ def free_lora_adapter():
431435

432436
self._stack.callback(free_lora_adapter)
433437

434-
if llama_cpp.llama_set_adapter_lora(
435-
self._ctx.ctx, self._lora_adapter, self.lora_scale
436-
):
438+
adapters = (llama_cpp.llama_adapter_lora_p_ctypes * 1)(self._lora_adapter)
439+
scales = (ctypes.c_float * 1)(self.lora_scale)
440+
if llama_cpp.llama_set_adapters_lora(self._ctx.ctx, adapters, 1, scales):
437441
raise RuntimeError(
438442
f"Failed to set LoRA adapter from lora path: {self.lora_path}"
439443
)
@@ -726,7 +730,6 @@ def apply_func(token_data_array: llama_cpp.llama_token_data_array_p):
726730
sampler.add_grammar(self._model, grammar)
727731

728732
if temp < 0.0:
729-
sampler.add_softmax()
730733
sampler.add_dist(self._seed)
731734
elif temp == 0.0:
732735
sampler.add_greedy()
@@ -1042,7 +1045,7 @@ def embed(
10421045
data: Union[List[List[float]], List[List[List[float]]]] = []
10431046

10441047
def decode_batch(seq_sizes: List[int]):
1045-
llama_cpp.llama_kv_self_clear(self._ctx.ctx)
1048+
self._ctx.kv_cache_clear()
10461049
self._ctx.decode(self._batch)
10471050
self._batch.reset()
10481051

@@ -1113,7 +1116,7 @@ def decode_batch(seq_sizes: List[int]):
11131116

11141117
output = data[0] if isinstance(input, str) else data
11151118

1116-
llama_cpp.llama_kv_self_clear(self._ctx.ctx)
1119+
self._ctx.kv_cache_clear()
11171120
self.reset()
11181121

11191122
if return_count:
@@ -2100,7 +2103,10 @@ def __getstate__(self):
21002103
logits_all=self._logits_all,
21012104
embedding=self.context_params.embeddings,
21022105
offload_kqv=self.context_params.offload_kqv,
2103-
flash_attn=self.context_params.flash_attn,
2106+
flash_attn=(
2107+
self.context_params.flash_attn_type
2108+
== llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
2109+
),
21042110
op_offload=self.context_params.op_offload,
21052111
swa_full=self.context_params.swa_full,
21062112
# Sampling Params

llama_cpp/llama_chat_format.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2755,7 +2755,14 @@ def _init_mtmd_context(self, llama_model: llama.Llama):
27552755
ctx_params.use_gpu = True # TODO: Make this configurable
27562756
ctx_params.print_timings = self.verbose
27572757
ctx_params.n_threads = llama_model.n_threads
2758-
ctx_params.verbosity = 2 if self.verbose else 0 # GGML_LOG_LEVEL_INFO = 2
2758+
ctx_params.flash_attn_type = (
2759+
llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
2760+
if (
2761+
llama_model.context_params.flash_attn_type
2762+
== llama_cpp.LLAMA_FLASH_ATTN_TYPE_ENABLED
2763+
)
2764+
else llama_cpp.LLAMA_FLASH_ATTN_TYPE_DISABLED
2765+
)
27592766

27602767
# Initialize mtmd context
27612768
self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file(

0 commit comments

Comments (0)