@@ -396,7 +396,7 @@ def generate(
             and tuple(self.eval_tokens) == tuple(tokens[: len(self.eval_tokens)])
         ):
             if self.verbose:
-                print("generate cache hit", file=sys.stderr)
+                print("Llama.generate: cache hit", file=sys.stderr)
             reset = False
             tokens = tokens[len(self.eval_tokens) :]
@@ -518,7 +518,7 @@ def _create_completion(

         if self.cache and prompt_tokens in self.cache:
             if self.verbose:
-                print("cache hit", file=sys.stderr)
+                print("Llama._create_completion: cache hit", file=sys.stderr)
             self.load_state(self.cache[prompt_tokens])

         finish_reason = "length"
@@ -538,7 +538,7 @@ def _create_completion(
             if self.cache and len(completion_tokens) == 0:
                 if prompt_tokens not in self.cache:
                     if self.verbose:
-                        print("cache miss", file=sys.stderr)
+                        print("Llama._create_completion: cache miss", file=sys.stderr)
                     self.cache[prompt_tokens] = self.save_state()

             completion_tokens.append(token)
@@ -957,6 +957,8 @@ def save_state(self) -> LlamaState:
             raise RuntimeError("Failed to copy llama state data")
         llama_state_compact = (llama_cpp.c_uint8 * int(n_bytes))()
         llama_cpp.ctypes.memmove(llama_state_compact, llama_state, int(n_bytes))
+        if self.verbose:
+            print(f"Llama.save_state: saving {n_bytes} bytes of llama state", file=sys.stderr)
         return LlamaState(
             eval_tokens=self.eval_tokens.copy(),
             eval_logits=self.eval_logits.copy(),