@@ -824,7 +824,15 @@ def _create_completion(
         if self.verbose:
             llama_cpp.llama_reset_timings(self.ctx)

-        if len(prompt_tokens) > self._n_ctx:
+        if max_tokens <= 0:
+            # Unlimited, depending on n_ctx.
+            if len(prompt_tokens) >= int(llama_cpp.llama_n_ctx(self.ctx)):
+                raise ValueError(
+                    f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
+                )
+            else:
+                max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens)
+        elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
             raise ValueError(
                 f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}"
             )
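
With this change, a non-positive `max_tokens` means "fill whatever context remains after the prompt" instead of raising. A minimal usage sketch (the model path and prompt are hypothetical; `Llama`, `n_ctx`, and `max_tokens` are this library's existing parameters):

```python
from llama_cpp import Llama

# Hypothetical model path; any GGML model file works here.
llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=512)

# max_tokens=-1 now means "generate until the 512-token context window
# is exhausted" rather than requesting a fixed token budget.
output = llm("Q: Name the planets in the solar system. A: ", max_tokens=-1)
print(output["choices"][0]["text"])
```
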
@@ -1231,7 +1239,7 @@ def create_completion(
         Args:
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for sampling.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
@@ -1304,7 +1312,7 @@ def __call__(
         Args:
             prompt: The prompt to generate text from.
             suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             temperature: The temperature to use for sampling.
             top_p: The top-p value to use for sampling.
             logprobs: The number of logprobs to return. If None, no logprobs are returned.
@@ -1432,7 +1440,7 @@ def create_chat_completion(
             top_k: The top-k value to use for sampling.
             stream: Whether to stream the results.
             stop: A list of strings to stop generation when encountered.
-            max_tokens: The maximum number of tokens to generate.
+            max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx.
             repeat_penalty: The penalty to apply to repeated tokens.

         Returns:
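
The clamping arithmetic introduced by the first hunk, restated as a self-contained sketch (the helper name `effective_max_tokens` is illustrative, not part of the patch):

```python
def effective_max_tokens(n_ctx: int, n_prompt: int, max_tokens: int) -> int:
    """Mirror of the patched check: a non-positive max_tokens means
    'use whatever context remains after the prompt'."""
    if max_tokens <= 0:
        if n_prompt >= n_ctx:
            raise ValueError(f"Requested tokens exceed context window of {n_ctx}")
        return n_ctx - n_prompt
    if n_prompt + max_tokens > n_ctx:
        raise ValueError(f"Requested tokens ({n_prompt}) exceed context window of {n_ctx}")
    return max_tokens

# e.g. a 512-token context with a 16-token prompt leaves 496 tokens of headroom
assert effective_max_tokens(512, 16, -1) == 496
```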