
Commit bc283d3

Update llama.cpp API 20250717
1 parent 67046e1 commit bc283d3

6 files changed: +47 -7 lines changed

llama_cpp/_internals.py

Lines changed: 3 additions & 0 deletions
@@ -146,6 +146,9 @@ def token_nl(self) -> int:
     def token_pad(self) -> int:
         return llama_cpp.llama_vocab_pad(self.vocab)

+    def token_mask(self) -> int:
+        return llama_cpp.llama_vocab_mask(self.vocab)
+
     def token_cls(self) -> int:
         return llama_cpp.llama_vocab_cls(self.vocab)
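
The new helper mirrors token_pad/token_cls and forwards to the llama_vocab_mask binding added in llama_cpp/llama_cpp.py below. A minimal usage sketch, assuming a locally available GGUF file; the model path and the LlamaModel constructor arguments shown here are illustrative assumptions, not part of this diff:

import llama_cpp
from llama_cpp import _internals as internals

# Hypothetical example: load a model through the internal wrapper and query
# the new mask-token helper (path and params are placeholders).
model = internals.LlamaModel(
    path_model="/path/to/model.gguf",
    params=llama_cpp.llama_model_default_params(),
)
print("mask token id:", model.token_mask())  # new helper from this commit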

llama_cpp/llama.py

Lines changed: 10 additions & 0 deletions
@@ -95,6 +95,7 @@ def __init__(
         flash_attn: bool = False,
         op_offload: Optional[bool] = None,
         swa_full: Optional[bool] = None,
+        kv_unified: Optional[bool] = None,
         # Sampling Params
         no_perf: bool = False,
         last_n_tokens_size: int = 64,

@@ -178,6 +179,7 @@ def __init__(
             flash_attn: Use flash attention.
             op_offload: whether to offload host tensor operations to device
             swa_full: whether to use full-size SWA cache
+            kv_unified: use single unified KV buffer for the KV cache of all sequences
             no_perf: Measure performance timings.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.

@@ -352,6 +354,9 @@ def __init__(
         if swa_full is not None:
             self.context_params.swa_full = swa_full

+        if kv_unified is not None:
+            self.context_params.kv_unified = kv_unified
+
         # KV cache quantization
         if type_k is not None:
             self.context_params.type_k = type_k

@@ -2213,6 +2218,7 @@ def __getstate__(self):
             flash_attn=self.context_params.flash_attn,
             op_offload=self.context_params.op_offload,
             swa_full=self.context_params.swa_full,
+            kv_unified= self.context_params.kv_unified,
             # Sampling Params
             no_perf=self.context_params.no_perf,
             last_n_tokens_size=self.last_n_tokens_size,

@@ -2327,6 +2333,10 @@ def token_pad(self) -> int:
         """Return the padding token."""
         return self._model.token_pad()

+    def token_mask(self) -> int:
+        """Return the mask token."""
+        return self._model.token_mask()
+
     def pooling_type(self) -> str:
         """Return the pooling type."""
         return self._ctx.pooling_type()
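
Taken together, the high-level API picks up both changes: kv_unified can be passed to the Llama constructor (it is only forwarded to context_params when not None), and the mask token can be queried with token_mask(). A rough sketch, assuming a local GGUF model; the path and n_ctx value are placeholders, not part of this diff:

from llama_cpp import Llama

llm = Llama(
    model_path="/path/to/model.gguf",  # placeholder path
    n_ctx=4096,
    kv_unified=True,   # new in this commit; sets context_params.kv_unified
)
print("mask token id:", llm.token_mask())  # new in this commit; delegates to the internal vocab wrapper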

llama_cpp/llama_cpp.py

Lines changed: 26 additions & 6 deletions
@@ -197,12 +197,13 @@


 # enum llama_vocab_type {
-#     LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-#     LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
-#     LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
-#     LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
-#     LLAMA_VOCAB_TYPE_UGM  = 4, // T5 tokenizer based on Unigram
-#     LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
+#     LLAMA_VOCAB_TYPE_NONE   = 0, // For models without vocab
+#     LLAMA_VOCAB_TYPE_SPM    = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+#     LLAMA_VOCAB_TYPE_BPE    = 2, // GPT-2 tokenizer based on byte-level BPE
+#     LLAMA_VOCAB_TYPE_WPM    = 3, // BERT tokenizer based on WordPiece
+#     LLAMA_VOCAB_TYPE_UGM    = 4, // T5 tokenizer based on Unigram
+#     LLAMA_VOCAB_TYPE_RWKV   = 5, // RWKV tokenizer based on greedy tokenization
+#     LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
 # };
 LLAMA_VOCAB_TYPE_NONE = 0
 """For models without vocab"""

@@ -216,8 +217,11 @@
 """T5 tokenizer based on Unigram"""
 LLAMA_VOCAB_TYPE_RWKV = 5
 """RWKV tokenizer based on greedy tokenization"""
+LLAMA_VOCAB_TYPE_PLAMO2 = 6
+"""PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming"""


+# NOTE: Deprecated and will be removed in the future. (already gone in llama.cpp)
 # // pre-tokenization types
 # enum llama_vocab_pre_type {
 #     LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,

@@ -257,6 +261,7 @@
 #     LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
 #     LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
 #     LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
+#     LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1

@@ -295,6 +300,7 @@
 LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34
 LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35
 LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36
+LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37


 # // note: these values should be synchronized with ggml_rope

@@ -814,6 +820,9 @@ class llama_model_params(ctypes.Structure):
 #     bool swa_full;   // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
 #                      // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
 #                      // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+#     bool kv_unified; // use a unified buffer across the input sequences when computing the attention
+#                      // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+#                      // ref: https://github.com/ggml-org/llama.cpp/pull/14363
 # };
 class llama_context_params(ctypes.Structure):
     """Parameters for llama_context

@@ -848,6 +857,7 @@ class llama_context_params(ctypes.Structure):
         no_perf (bool): whether to measure performance timings
         op_offload(bool): whether to offload host tensor operations to device
         swa_full(bool): whether to use full-size SWA cache
+        kv_unified(bool): use a unified buffer across the input sequences when computing the attention
     """

     if TYPE_CHECKING:

@@ -880,6 +890,7 @@ class llama_context_params(ctypes.Structure):
         no_perf: bool
         op_offload:bool
         swa_full:bool
+        kv_unified:bool

     _fields_ = [
         ("n_ctx", ctypes.c_uint32),

@@ -911,6 +922,7 @@ class llama_context_params(ctypes.Structure):
         ("no_perf", ctypes.c_bool),
         ("op_offload", ctypes.c_bool),
         ("swa_full", ctypes.c_bool),
+        ("kv_unified", ctypes.c_bool),
     ]


@@ -2796,6 +2808,14 @@ def llama_vocab_pad(vocab: llama_vocab_p, /) -> llama_token:
     """padding"""
     ...

+
+# LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
+@ctypes_function("llama_vocab_mask", [llama_vocab_p_ctypes], llama_token)
+def llama_vocab_mask(vocab: llama_vocab_p, /) -> llama_token:
+    """mask"""
+    ...
+
+
 # LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
 @ctypes_function(
     "llama_vocab_get_add_bos",

llama_cpp/server/model.py

Lines changed: 1 addition & 0 deletions
@@ -282,6 +282,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         flash_attn=settings.flash_attn,
         op_offload=settings.op_offload,
         swa_full=settings.swa_full,
+        kv_unified=settings.kv_unified,
         # Sampling Params
         last_n_tokens_size=settings.last_n_tokens_size,
         # LoRA Params

llama_cpp/server/settings.py

Lines changed: 3 additions & 0 deletions
@@ -113,6 +113,9 @@ class ModelSettings(BaseSettings):
     swa_full: bool = Field(
         default=True, description="Whether to use full-size SWA cache"
     )
+    kv_unified: bool = Field(
+        default=True, description="use single unified KV buffer for the KV cache of all sequences"
+    )
     # Sampling Params
     last_n_tokens_size: int = Field(
         default=64,
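
On the server side the new field flows from ModelSettings into load_llama_from_model_settings (see the llama_cpp/server/model.py change above). A small sketch of overriding it programmatically; the model path is a placeholder:

from llama_cpp.server.settings import ModelSettings
from llama_cpp.server.model import load_llama_from_model_settings

# kv_unified defaults to True per the Field definition above; override it here.
settings = ModelSettings(model="/path/to/model.gguf", kv_unified=False)
llama = load_llama_from_model_settings(settings)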

tests/test_llama.py

Lines changed: 4 additions & 1 deletion
@@ -88,6 +88,7 @@ def test_real_model(llama_cpp_model_path):
     cparams.n_threads_batch = multiprocessing.cpu_count()
     cparams.flash_attn = True
     cparams.swa_full = True
+    cparams.kv_unified = True

     context = internals.LlamaContext(model=model, params=cparams)
     tokens = model.tokenize(b"Hello, world!", add_bos=True, special=True)

@@ -129,7 +130,8 @@ def test_real_llama(llama_cpp_model_path):
         n_threads_batch=multiprocessing.cpu_count(),
         logits_all=False,
         flash_attn=True,
-        swa_full = True
+        swa_full=True,
+        kv_unified=True,
     )

     output = model.create_completion(

@@ -234,6 +236,7 @@ def test_real_llama_embeddings(llama_cpp_model_path):
         logits_all=False,
         flash_attn=True,
         swa_full=True,
+        kv_unified=True,
         embedding=True
     )
     # Smoke test for now
