Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

- feat: update llama.cpp to ggml-org/llama.cpp@92e854ab8
- fix: preserve recurrent/hybrid model state when the full prompt is already cached by @allthatido and @abetlen in #2306

## [0.3.31]
Expand Down
5 changes: 5 additions & 0 deletions llama_cpp/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -1744,6 +1744,11 @@ def llama_model_n_embd_out(model: llama_model_p, /) -> int:
def llama_model_n_layer(model: llama_model_p, /) -> int: ...


# LLAMA_API int32_t llama_model_n_layer_nextn(const struct llama_model * model);
@ctypes_function(
    "llama_model_n_layer_nextn",
    [llama_model_p_ctypes],
    ctypes.c_int32,
)
def llama_model_n_layer_nextn(model: llama_model_p, /) -> int:
    """Binding for the native ``llama_model_n_layer_nextn`` symbol.

    The native function is declared to take a model pointer and return an
    ``int32``.

    NOTE(review): presumably the result is the number of NextN layers in
    the model -- confirm against ``llama.h``.
    """
    ...


# LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
@ctypes_function(
    "llama_model_n_head",
    [llama_model_p_ctypes],
    ctypes.c_int32,
)
def llama_model_n_head(model: llama_model_p, /) -> int:
    """Binding for the native ``llama_model_n_head`` symbol.

    The native function is declared to take a model pointer and return an
    ``int32``.

    NOTE(review): presumably the result is the model's attention head
    count -- confirm against ``llama.h``.
    """
    ...
Expand Down
19 changes: 19 additions & 0 deletions llama_cpp/llama_cpp_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,25 @@ def llama_set_embeddings_nextn(
...


# LLAMA_API void llama_set_nextn_layer_offset(struct llama_context * ctx, int32_t offset);
@_ctypes_function_from_names(
    (
        # Plain C symbol, then the Itanium- and MSVC-mangled C++ spellings.
        "llama_set_nextn_layer_offset",
        "_Z28llama_set_nextn_layer_offsetP13llama_contexti",
        "?llama_set_nextn_layer_offset@@YAXPEAUllama_context@@H@Z",
    ),
    [llama_cpp.llama_context_p_ctypes, ctypes.c_int32],
    None,
)
def llama_set_nextn_layer_offset(
    ctx: llama_cpp.llama_context_p,
    offset: Union[ctypes.c_int32, int],
    /,
):
    """Select which appended NextN block the decoder MTP graph runs.

    Args:
        ctx: Context pointer passed as the first native argument.
        offset: Layer offset, passed to the native function as an ``int32``.

    The native function is declared with no return value.
    """
    ...


# LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx);
@_ctypes_function_from_names(
(
Expand Down
7 changes: 7 additions & 0 deletions llama_cpp/mtmd_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
)
import pathlib
from typing import (
Callable,
Union,
NewType,
Optional,
Expand Down Expand Up @@ -84,6 +85,8 @@
MTMD_INPUT_CHUNK_TYPE_IMAGE = 1
MTMD_INPUT_CHUNK_TYPE_AUDIO = 2

# ctypes prototype for the mtmd progress callback: called with
# (c_float, c_void_p) and returning c_bool.
# NOTE(review): presumably the float is a progress value and the pointer is
# the caller's user data -- confirm against mtmd.h.
mtmd_progress_callback = CFUNCTYPE(c_bool, c_float, c_void_p)


# Structures
class mtmd_context_params(Structure):
Expand All @@ -106,6 +109,8 @@ class mtmd_context_params(Structure):
cb_eval: llama_cpp.ggml_backend_sched_eval_callback
cb_eval_user_data: c_void_p
batch_max_tokens: int
progress_callback: Callable[[float, c_void_p], bool]
progress_callback_user_data: c_void_p

_fields_ = [
("use_gpu", c_bool),
Expand All @@ -120,6 +125,8 @@ class mtmd_context_params(Structure):
("cb_eval", llama_cpp.ggml_backend_sched_eval_callback),
("cb_eval_user_data", c_void_p),
("batch_max_tokens", c_int),
("progress_callback", mtmd_progress_callback),
("progress_callback_user_data", c_void_p),
]


Expand Down
2 changes: 1 addition & 1 deletion vendor/llama.cpp
Submodule llama.cpp updated 128 files
Loading