|
197 | 197 |
|
198 | 198 |
|
199 | 199 | # enum llama_vocab_type { |
200 | | -# LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab |
201 | | -# LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback |
202 | | -# LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE |
203 | | -# LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece |
204 | | -# LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram |
205 | | -# LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization |
| 200 | +# LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab |
| 201 | +# LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback |
| 202 | +# LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE |
| 203 | +# LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece |
| 204 | +# LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram |
| 205 | +# LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization |
| 206 | +# LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming |
206 | 207 | # }; |
207 | 208 | LLAMA_VOCAB_TYPE_NONE = 0 |
208 | 209 | """For models without vocab""" |
|
216 | 217 | """T5 tokenizer based on Unigram""" |
217 | 218 | LLAMA_VOCAB_TYPE_RWKV = 5 |
218 | 219 | """RWKV tokenizer based on greedy tokenization""" |
| 220 | +LLAMA_VOCAB_TYPE_PLAMO2 = 6 |
| 221 | +"""PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming""" |
219 | 222 |
|
220 | 223 |
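As an aside (not part of the diff): the new constant can be checked against a loaded model's vocabulary the same way as the existing vocab types. A minimal sketch, assuming the `llama_backend_init`, `llama_model_default_params`, `llama_model_load_from_file`, `llama_model_get_vocab`, `llama_vocab_type`, and `llama_model_free` bindings defined elsewhere in this module, and a hypothetical local GGUF path:

```python
import llama_cpp

# Sketch: detect whether a loaded model uses the PLaMo-2 tokenizer.
llama_cpp.llama_backend_init()
mparams = llama_cpp.llama_model_default_params()
model = llama_cpp.llama_model_load_from_file(b"plamo-2.gguf", mparams)  # hypothetical path
vocab = llama_cpp.llama_model_get_vocab(model)
if llama_cpp.llama_vocab_type(vocab) == llama_cpp.LLAMA_VOCAB_TYPE_PLAMO2:
    print("model uses the PLaMo-2 (Aho-Corasick) tokenizer")
llama_cpp.llama_model_free(model)
```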
|
| 224 | +# NOTE: Deprecated and will be removed in the future. (already gone in llama.cpp) |
221 | 225 | # // pre-tokenization types |
222 | 226 | # enum llama_vocab_pre_type { |
223 | 227 | # LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0, |
|
257 | 261 | # LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, |
258 | 262 | # LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35, |
259 | 263 | # LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36, |
| 264 | +# LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37, |
260 | 265 | # }; |
261 | 266 | LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 |
262 | 267 | LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 |
|
295 | 300 | LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34 |
296 | 301 | LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35 |
297 | 302 | LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36 |
| 303 | +LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37 |
298 | 304 |
|
299 | 305 |
|
300 | 306 | # // note: these values should be synchronized with ggml_rope |
@@ -814,6 +820,9 @@ class llama_model_params(ctypes.Structure): |
814 | 820 | # bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) |
815 | 821 | #     // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases |
816 | 822 | # // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573 |
| 823 | +# bool kv_unified; // use a unified buffer across the input sequences when computing the attention |
| 824 | +# // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix |
| 825 | +# // ref: https://github.com/ggml-org/llama.cpp/pull/14363 |
817 | 826 | # }; |
818 | 827 | class llama_context_params(ctypes.Structure): |
819 | 828 | """Parameters for llama_context |
@@ -848,6 +857,7 @@ class llama_context_params(ctypes.Structure): |
848 | 857 | no_perf (bool): whether to measure performance timings |
849 | 858 | op_offload(bool): whether to offload host tensor operations to device |
850 | 859 | swa_full(bool): whether to use full-size SWA cache |
| 860 | + kv_unified(bool): use a unified buffer across the input sequences when computing the attention |
851 | 861 | """ |
852 | 862 |
|
853 | 863 | if TYPE_CHECKING: |
@@ -880,6 +890,7 @@ class llama_context_params(ctypes.Structure): |
880 | 890 | no_perf: bool |
881 | 891 | op_offload:bool |
882 | 892 | swa_full:bool |
| 893 | + kv_unified:bool |
883 | 894 |
|
884 | 895 | _fields_ = [ |
885 | 896 | ("n_ctx", ctypes.c_uint32), |
@@ -911,6 +922,7 @@ class llama_context_params(ctypes.Structure): |
911 | 922 | ("no_perf", ctypes.c_bool), |
912 | 923 | ("op_offload", ctypes.c_bool), |
913 | 924 | ("swa_full", ctypes.c_bool), |
| 925 | + ("kv_unified", ctypes.c_bool), |
914 | 926 | ] |
915 | 927 |
|
916 | 928 |
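Usage note (not part of the diff): the new `kv_unified` field can be toggled on the default context params before a context is created, along the lines of the upstream comment about `n_seq_max > 1`. A minimal sketch, assuming the existing `llama_context_default_params` and `llama_init_from_model` bindings and an already loaded `model` handle:

```python
import llama_cpp

# Sketch: explicitly disable the unified KV buffer for multi-sequence
# decoding, per the comment above (may help when n_seq_max > 1 and the
# sequences do not share a large prefix). `model` is assumed to be an
# existing llama_model_p.
cparams = llama_cpp.llama_context_default_params()
cparams.n_seq_max = 4
cparams.kv_unified = False  # field added in this change
ctx = llama_cpp.llama_init_from_model(model, cparams)
```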
|
@@ -2796,6 +2808,14 @@ def llama_vocab_pad(vocab: llama_vocab_p, /) -> llama_token: |
2796 | 2808 | """padding""" |
2797 | 2809 | ... |
2798 | 2810 |
|
| 2811 | + |
| 2812 | +# LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask |
| 2813 | +@ctypes_function("llama_vocab_mask", [llama_vocab_p_ctypes], llama_token) |
| 2814 | +def llama_vocab_mask(vocab: llama_vocab_p, /) -> llama_token: |
| 2815 | + """mask""" |
| 2816 | + ... |
| 2817 | + |
| 2818 | + |
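A short usage sketch for the new binding (not part of the diff): it mirrors the other special-token helpers above. Per upstream llama.cpp the call returns `LLAMA_TOKEN_NULL` (-1) when the vocabulary defines no mask token; `vocab` is assumed to be a handle obtained from `llama_model_get_vocab`, and `LLAMA_TOKEN_NULL` is assumed to be defined in this module as in llama.h:

```python
# Sketch: query the mask token (e.g. [MASK] for WPM/BERT vocabularies).
mask_id = llama_cpp.llama_vocab_mask(vocab)
if mask_id == llama_cpp.LLAMA_TOKEN_NULL:  # -1: no mask token in this vocab
    print("no mask token in this vocabulary")
else:
    print("mask token id:", mask_id)
```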
2799 | 2819 | # LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab); |
2800 | 2820 | @ctypes_function( |
2801 | 2821 | "llama_vocab_get_add_bos", |
|