From f750fa94447b334040b7d5adff0927007c559f84 Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Tue, 23 Jun 2026 07:43:52 -0700
Subject: [PATCH] feat: update llama.cpp to 92e854ab8

---
 CHANGELOG.md               |  1 +
 llama_cpp/llama_cpp.py     |  5 +++++
 llama_cpp/llama_cpp_ext.py | 19 +++++++++++++++++++
 llama_cpp/mtmd_cpp.py      |  7 +++++++
 vendor/llama.cpp           |  2 +-
 5 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b1d5fb880..925e941d8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat: update llama.cpp to ggml-org/llama.cpp@92e854ab8
 - fix: preserve recurrent/hybrid model state when the full prompt is already cached by @allthatido and @abetlen in #2306
 
 ## [0.3.31]
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 21f85c81c..176709d96 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -1744,6 +1744,11 @@ def llama_model_n_embd_out(model: llama_model_p, /) -> int:
 def llama_model_n_layer(model: llama_model_p, /) -> int: ...
 
 
+# LLAMA_API int32_t llama_model_n_layer_nextn(const struct llama_model * model);
+@ctypes_function("llama_model_n_layer_nextn", [llama_model_p_ctypes], ctypes.c_int32)
+def llama_model_n_layer_nextn(model: llama_model_p, /) -> int: ...
+
+
 # LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
 @ctypes_function("llama_model_n_head", [llama_model_p_ctypes], ctypes.c_int32)
 def llama_model_n_head(model: llama_model_p, /) -> int: ...
diff --git a/llama_cpp/llama_cpp_ext.py b/llama_cpp/llama_cpp_ext.py
index 284811086..a4b424eb6 100644
--- a/llama_cpp/llama_cpp_ext.py
+++ b/llama_cpp/llama_cpp_ext.py
@@ -62,6 +62,25 @@ def llama_set_embeddings_nextn(
     ...
 
 
+# LLAMA_API void llama_set_nextn_layer_offset(struct llama_context * ctx, int32_t offset);
+@_ctypes_function_from_names(
+    (
+        "llama_set_nextn_layer_offset",
+        "_Z28llama_set_nextn_layer_offsetP13llama_contexti",
+        "?llama_set_nextn_layer_offset@@YAXPEAUllama_context@@H@Z",
+    ),
+    [llama_cpp.llama_context_p_ctypes, ctypes.c_int32],
+    None,
+)
+def llama_set_nextn_layer_offset(
+    ctx: llama_cpp.llama_context_p,
+    offset: Union[ctypes.c_int32, int],
+    /,
+):
+    """Select, by offset, which appended NextN block the decoder MTP graph runs."""
+    ...
+
+
 # LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx);
 @_ctypes_function_from_names(
     (
diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
index 78f068aa9..35357a327 100644
--- a/llama_cpp/mtmd_cpp.py
+++ b/llama_cpp/mtmd_cpp.py
@@ -20,6 +20,7 @@
 )
 import pathlib
 from typing import (
+    Callable,
     Union,
     NewType,
     Optional,
@@ -84,6 +85,8 @@
 MTMD_INPUT_CHUNK_TYPE_IMAGE = 1
 MTMD_INPUT_CHUNK_TYPE_AUDIO = 2
 
+mtmd_progress_callback = CFUNCTYPE(c_bool, c_float, c_void_p)
+
 
 # Structures
 class mtmd_context_params(Structure):
@@ -106,6 +109,8 @@ class mtmd_context_params(Structure):
         cb_eval: llama_cpp.ggml_backend_sched_eval_callback
         cb_eval_user_data: c_void_p
         batch_max_tokens: int
+        progress_callback: Callable[[float, c_void_p], bool]
+        progress_callback_user_data: c_void_p
 
     _fields_ = [
         ("use_gpu", c_bool),
@@ -120,6 +125,8 @@ class mtmd_context_params(Structure):
         ("cb_eval", llama_cpp.ggml_backend_sched_eval_callback),
         ("cb_eval_user_data", c_void_p),
         ("batch_max_tokens", c_int),
+        ("progress_callback", mtmd_progress_callback),
+        ("progress_callback_user_data", c_void_p),
     ]
 
 
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index f449e0553..92e854ab8 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit f449e0553708b895adbd94a301431cef691f632d
+Subproject commit 92e854ab836254bb7f2eb49babd5613474bdb700