From f750fa94447b334040b7d5adff0927007c559f84 Mon Sep 17 00:00:00 2001 From: abetlen Date: Tue, 23 Jun 2026 07:43:52 -0700 Subject: [PATCH] feat: update llama.cpp to 92e854ab8 --- CHANGELOG.md | 1 + llama_cpp/llama_cpp.py | 5 +++++ llama_cpp/llama_cpp_ext.py | 19 +++++++++++++++++++ llama_cpp/mtmd_cpp.py | 7 +++++++ vendor/llama.cpp | 2 +- 5 files changed, 33 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b1d5fb880..925e941d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat: update llama.cpp to ggml-org/llama.cpp@92e854ab8 - fix: preserve recurrent/hybrid model state when the full prompt is already cached by @allthatido and @abetlen in #2306 ## [0.3.31] diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 21f85c81c..176709d96 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -1744,6 +1744,11 @@ def llama_model_n_embd_out(model: llama_model_p, /) -> int: def llama_model_n_layer(model: llama_model_p, /) -> int: ... +# LLAMA_API int32_t llama_model_n_layer_nextn(const struct llama_model * model); +@ctypes_function("llama_model_n_layer_nextn", [llama_model_p_ctypes], ctypes.c_int32) +def llama_model_n_layer_nextn(model: llama_model_p, /) -> int: ... + + # LLAMA_API int32_t llama_model_n_head (const struct llama_model * model); @ctypes_function("llama_model_n_head", [llama_model_p_ctypes], ctypes.c_int32) def llama_model_n_head(model: llama_model_p, /) -> int: ... diff --git a/llama_cpp/llama_cpp_ext.py b/llama_cpp/llama_cpp_ext.py index 284811086..a4b424eb6 100644 --- a/llama_cpp/llama_cpp_ext.py +++ b/llama_cpp/llama_cpp_ext.py @@ -62,6 +62,25 @@ def llama_set_embeddings_nextn( ... +# LLAMA_API void llama_set_nextn_layer_offset(struct llama_context * ctx, int32_t offset); +@_ctypes_function_from_names( + ( + "llama_set_nextn_layer_offset", + "_Z28llama_set_nextn_layer_offsetP13llama_contexti", + "?llama_set_nextn_layer_offset@@YAXPEAUllama_context@@H@Z", + ), + [llama_cpp.llama_context_p_ctypes, ctypes.c_int32], + None, +) +def llama_set_nextn_layer_offset( + ctx: llama_cpp.llama_context_p, + offset: Union[ctypes.c_int32, int], + /, +): + """Select which appended NextN block the decoder MTP graph runs.""" + ... + + # LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx); @_ctypes_function_from_names( ( diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py index 78f068aa9..35357a327 100644 --- a/llama_cpp/mtmd_cpp.py +++ b/llama_cpp/mtmd_cpp.py @@ -20,6 +20,7 @@ ) import pathlib from typing import ( + Callable, Union, NewType, Optional, @@ -84,6 +85,8 @@ MTMD_INPUT_CHUNK_TYPE_IMAGE = 1 MTMD_INPUT_CHUNK_TYPE_AUDIO = 2 +mtmd_progress_callback = CFUNCTYPE(c_bool, c_float, c_void_p) + # Structures class mtmd_context_params(Structure): @@ -106,6 +109,8 @@ class mtmd_context_params(Structure): cb_eval: llama_cpp.ggml_backend_sched_eval_callback cb_eval_user_data: c_void_p batch_max_tokens: int + progress_callback: Callable[[float, c_void_p], bool] + progress_callback_user_data: c_void_p _fields_ = [ ("use_gpu", c_bool), @@ -120,6 +125,8 @@ class mtmd_context_params(Structure): ("cb_eval", llama_cpp.ggml_backend_sched_eval_callback), ("cb_eval_user_data", c_void_p), ("batch_max_tokens", c_int), + ("progress_callback", mtmd_progress_callback), + ("progress_callback_user_data", c_void_p), ] diff --git a/vendor/llama.cpp b/vendor/llama.cpp index f449e0553..92e854ab8 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit f449e0553708b895adbd94a301431cef691f632d +Subproject commit 92e854ab836254bb7f2eb49babd5613474bdb700