@@ -317,7 +317,15 @@ def _create_completion(
         if self.verbose:
             llama_cpp.llama_reset_timings(self.ctx)
 
-        if len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
+        if max_tokens <= 0:
+            # Unlimited, depending on n_ctx.
+            if len(prompt_tokens) >= int(llama_cpp.llama_n_ctx(self.ctx)):
+                raise ValueError(
+                    f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
+                )
+            else:
+                max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens)
+        elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
             raise ValueError(
                 f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
             )
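In effect, a non-positive max_tokens is clamped to whatever room is left in the context window. A quick sketch of the arithmetic, with hypothetical values standing in for llama_n_ctx and the tokenized prompt:

    n_ctx = 2048      # context window size (hypothetical)
    n_prompt = 10     # tokens in the prompt (hypothetical)

    max_tokens = -1   # caller asks for "unlimited"
    if max_tokens <= 0:
        if n_prompt >= n_ctx:
            # No room left even for the prompt itself.
            raise ValueError("Requested tokens exceed context window")
        max_tokens = n_ctx - n_prompt   # 2038 tokens of headroom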
@@ -455,7 +463,7 @@ def create_completion(
         Args:
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for sampling.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
@@ -510,7 +518,7 @@ def __call__(
         Args:
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for sampling.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
@@ -619,7 +627,7 @@ def create_chat_completion(
             top_k: The top-k value to use for sampling.
             stream: Whether to stream the results.
             stop: A list of strings to stop generation when encountered.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             repeat_penalty: The penalty to apply to repeated tokens.
 
         Returns:
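For reference, a minimal usage sketch of the new behavior, assuming the package's high-level Llama API; the model path and prompts below are placeholders:

    from llama_cpp import Llama

    llm = Llama(model_path="./models/7B/ggml-model.bin")  # hypothetical path

    # max_tokens <= 0 now means "generate until the context window is full":
    # the effective limit becomes n_ctx - len(prompt_tokens).
    out = llm("Q: Name the planets in the solar system. A: ", max_tokens=-1)

    # A positive max_tokens behaves as before and still raises ValueError
    # when len(prompt_tokens) + max_tokens exceeds n_ctx.
    out = llm("Q: Name the planets in the solar system. A: ", max_tokens=32)
    print(out["choices"][0]["text"])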