
Commit bc283d3

Update llama.cpp API 20250717
1 parent 67046e1 commit bc283d3

6 files changed: +47 -7 lines changed

llama_cpp/_internals.py

Lines changed: 3 additions & 0 deletions
@@ -146,6 +146,9 @@ def token_nl(self) -> int:
     def token_pad(self) -> int:
         return llama_cpp.llama_vocab_pad(self.vocab)

+    def token_mask(self) -> int:
+        return llama_cpp.llama_vocab_mask(self.vocab)
+
     def token_cls(self) -> int:
         return llama_cpp.llama_vocab_cls(self.vocab)
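
The new helper mirrors token_pad/token_cls and forwards to the llama_vocab_mask binding added in llama_cpp/llama_cpp.py below. A minimal usage sketch, assuming a locally available GGUF file; the model path and the LlamaModel constructor arguments shown here are illustrative assumptions, not part of this diff:

import llama_cpp
from llama_cpp import _internals as internals

# Hypothetical example: load a model through the internal wrapper and query
# the new mask-token helper (path and params are placeholders).
model = internals.LlamaModel(
    path_model="/path/to/model.gguf",
    params=llama_cpp.llama_model_default_params(),
)
print("mask token id:", model.token_mask())  # new helper from this commit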

llama_cpp/llama.py

Lines changed: 10 additions & 0 deletions
@@ -95,6 +95,7 @@ def __init__(
         flash_attn: bool = False,
         op_offload: Optional[bool] = None,
         swa_full: Optional[bool] = None,
+        kv_unified: Optional[bool] = None,
         # Sampling Params
         no_perf: bool = False,
         last_n_tokens_size: int = 64,

@@ -178,6 +179,7 @@ def __init__(
             flash_attn: Use flash attention.
             op_offload: whether to offload host tensor operations to device
             swa_full: whether to use full-size SWA cache
+            kv_unified: use single unified KV buffer for the KV cache of all sequences
             no_perf: Measure performance timings.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.

@@ -352,6 +354,9 @@ def __init__(
         if swa_full is not None:
             self.context_params.swa_full = swa_full

+        if kv_unified is not None:
+            self.context_params.kv_unified = kv_unified
+
         # KV cache quantization
         if type_k is not None:
             self.context_params.type_k = type_k

@@ -2213,6 +2218,7 @@ def __getstate__(self):
             flash_attn=self.context_params.flash_attn,
             op_offload=self.context_params.op_offload,
             swa_full=self.context_params.swa_full,
+            kv_unified= self.context_params.kv_unified,
             # Sampling Params
             no_perf=self.context_params.no_perf,
             last_n_tokens_size=self.last_n_tokens_size,

@@ -2327,6 +2333,10 @@ def token_pad(self) -> int:
         """Return the padding token."""
         return self._model.token_pad()

+    def token_mask(self) -> int:
+        """Return the mask token."""
+        return self._model.token_mask()
+
     def pooling_type(self) -> str:
         """Return the pooling type."""
         return self._ctx.pooling_type()
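
Taken together, the high-level API picks up both changes: kv_unified can be passed to the Llama constructor (it is only forwarded to context_params when not None), and the mask token can be queried with token_mask(). A rough sketch, assuming a local GGUF model; the path and n_ctx value are placeholders, not part of this diff:

from llama_cpp import Llama

llm = Llama(
    model_path="/path/to/model.gguf",  # placeholder path
    n_ctx=4096,
    kv_unified=True,   # new in this commit; sets context_params.kv_unified
)
print("mask token id:", llm.token_mask())  # new in this commit; delegates to the internal vocab wrapper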

llama_cpp/llama_cpp.py

Lines changed: 26 additions & 6 deletions
@@ -197,12 +197,13 @@


 # enum llama_vocab_type {
-#     LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
-#     LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
-#     LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
-#     LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
-#     LLAMA_VOCAB_TYPE_UGM  = 4, // T5 tokenizer based on Unigram
-#     LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
+#     LLAMA_VOCAB_TYPE_NONE   = 0, // For models without vocab
+#     LLAMA_VOCAB_TYPE_SPM    = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
+#     LLAMA_VOCAB_TYPE_BPE    = 2, // GPT-2 tokenizer based on byte-level BPE
+#     LLAMA_VOCAB_TYPE_WPM    = 3, // BERT tokenizer based on WordPiece
+#     LLAMA_VOCAB_TYPE_UGM    = 4, // T5 tokenizer based on Unigram
+#     LLAMA_VOCAB_TYPE_RWKV   = 5, // RWKV tokenizer based on greedy tokenization
+#     LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
 # };
 LLAMA_VOCAB_TYPE_NONE = 0
 """For models without vocab"""

@@ -216,8 +217,11 @@
 """T5 tokenizer based on Unigram"""
 LLAMA_VOCAB_TYPE_RWKV = 5
 """RWKV tokenizer based on greedy tokenization"""
+LLAMA_VOCAB_TYPE_PLAMO2 = 6
+"""PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming"""


+# NOTE: Deprecated and will be removed in the future. (already gone in llama.cpp)
 # // pre-tokenization types
 # enum llama_vocab_pre_type {
 #     LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,

@@ -257,6 +261,7 @@
 #     LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
 #     LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
 #     LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
+#     LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1

@@ -295,6 +300,7 @@
 LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34
 LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35
 LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36
+LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37


 # // note: these values should be synchronized with ggml_rope

@@ -814,6 +820,9 @@ class llama_model_params(ctypes.Structure):
 #     bool swa_full;   // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
 #                      // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
 #                      // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+#     bool kv_unified; // use a unified buffer across the input sequences when computing the attention
+#                      // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+#                      // ref: https://github.com/ggml-org/llama.cpp/pull/14363
 # };
 class llama_context_params(ctypes.Structure):
     """Parameters for llama_context

@@ -848,6 +857,7 @@ class llama_context_params(ctypes.Structure):
         no_perf (bool): whether to measure performance timings
         op_offload(bool): whether to offload host tensor operations to device
         swa_full(bool): whether to use full-size SWA cache
+        kv_unified(bool): use a unified buffer across the input sequences when computing the attention
     """

     if TYPE_CHECKING:

@@ -880,6 +890,7 @@ class llama_context_params(ctypes.Structure):
         no_perf: bool
         op_offload:bool
         swa_full:bool
+        kv_unified:bool

     _fields_ = [
         ("n_ctx", ctypes.c_uint32),

@@ -911,6 +922,7 @@ class llama_context_params(ctypes.Structure):
         ("no_perf", ctypes.c_bool),
         ("op_offload", ctypes.c_bool),
         ("swa_full", ctypes.c_bool),
+        ("kv_unified", ctypes.c_bool),
     ]


@@ -2796,6 +2808,14 @@ def llama_vocab_pad(vocab: llama_vocab_p, /) -> llama_token:
     """padding"""
     ...

+
+# LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
+@ctypes_function("llama_vocab_mask", [llama_vocab_p_ctypes], llama_token)
+def llama_vocab_mask(vocab: llama_vocab_p, /) -> llama_token:
+    """mask"""
+    ...
+
+
 # LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
 @ctypes_function(
     "llama_vocab_get_add_bos",

llama_cpp/server/model.py

Lines changed: 1 addition & 0 deletions
@@ -282,6 +282,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         flash_attn=settings.flash_attn,
         op_offload=settings.op_offload,
         swa_full=settings.swa_full,
+        kv_unified=settings.kv_unified,
         # Sampling Params
         last_n_tokens_size=settings.last_n_tokens_size,
         # LoRA Params

llama_cpp/server/settings.py

Lines changed: 3 additions & 0 deletions
@@ -113,6 +113,9 @@ class ModelSettings(BaseSettings):
     swa_full: bool = Field(
         default=True, description="Whether to use full-size SWA cache"
     )
+    kv_unified: bool = Field(
+        default=True, description="use single unified KV buffer for the KV cache of all sequences"
+    )
     # Sampling Params
     last_n_tokens_size: int = Field(
         default=64,
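
On the server side the new field flows from ModelSettings into load_llama_from_model_settings (see the llama_cpp/server/model.py change above). A small sketch of overriding it programmatically; the model path is a placeholder:

from llama_cpp.server.settings import ModelSettings
from llama_cpp.server.model import load_llama_from_model_settings

# kv_unified defaults to True per the Field definition above; override it here.
settings = ModelSettings(model="/path/to/model.gguf", kv_unified=False)
llama = load_llama_from_model_settings(settings)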

tests/test_llama.py

Lines changed: 4 additions & 1 deletion
@@ -88,6 +88,7 @@ def test_real_model(llama_cpp_model_path):
     cparams.n_threads_batch = multiprocessing.cpu_count()
     cparams.flash_attn = True
     cparams.swa_full = True
+    cparams.kv_unified = True

     context = internals.LlamaContext(model=model, params=cparams)
     tokens = model.tokenize(b"Hello, world!", add_bos=True, special=True)

@@ -129,7 +130,8 @@ def test_real_llama(llama_cpp_model_path):
         n_threads_batch=multiprocessing.cpu_count(),
         logits_all=False,
         flash_attn=True,
-        swa_full = True
+        swa_full=True,
+        kv_unified=True,
     )

     output = model.create_completion(

@@ -234,6 +236,7 @@ def test_real_llama_embeddings(llama_cpp_model_path):
         logits_all=False,
         flash_attn=True,
         swa_full=True,
+        kv_unified=True,
         embedding=True
     )
     # Smoke test for now
