From 3e67b1778a612f433f65abd4fc7adc215b603d14 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Sun, 14 Sep 2025 21:04:58 -0500 Subject: [PATCH 01/16] Cleaned llm/ type errors --- nemoguardrails/llm/filters.py | 8 +-- nemoguardrails/llm/helpers.py | 23 +++---- nemoguardrails/llm/models/initializer.py | 7 +- nemoguardrails/llm/params.py | 15 +++-- .../llm/providers/huggingface/pipeline.py | 39 ++++++++--- .../llm/providers/huggingface/streamers.py | 26 ++++++-- nemoguardrails/llm/providers/trtllm/client.py | 66 ++++++++++++------- nemoguardrails/llm/providers/trtllm/llm.py | 10 ++- nemoguardrails/llm/taskmanager.py | 21 ++++-- 9 files changed, 149 insertions(+), 66 deletions(-) diff --git a/nemoguardrails/llm/filters.py b/nemoguardrails/llm/filters.py index c195d5b01..a78110919 100644 --- a/nemoguardrails/llm/filters.py +++ b/nemoguardrails/llm/filters.py @@ -140,7 +140,7 @@ def to_messages(colang_history: str) -> List[dict]: # a message from the user, and the rest gets translated to messages from the assistant. lines = colang_history.split("\n") - bot_lines = [] + bot_lines: list[str] = [] for i, line in enumerate(lines): if line.startswith('user "'): # If we have bot lines in the buffer, we first add a bot message. @@ -181,8 +181,8 @@ def to_messages_v2(colang_history: str) -> List[dict]: # a message from the user, and the rest gets translated to messages from the assistant. lines = colang_history.split("\n") - user_lines = [] - bot_lines = [] + user_lines: list[str] = [] + bot_lines: list[str] = [] for line in lines: if line.startswith("user action:"): if len(bot_lines) > 0: @@ -275,7 +275,7 @@ def verbose_v1(colang_history: str) -> str: return "\n".join(lines) -def to_chat_messages(events: List[dict]) -> str: +def to_chat_messages(events: List[dict]) -> List[dict]: """Filter that turns an array of events into a sequence of user/assistant messages. Properly handles multimodal content by preserving the structure when the content diff --git a/nemoguardrails/llm/helpers.py b/nemoguardrails/llm/helpers.py index 04835d669..23d001e90 100644 --- a/nemoguardrails/llm/helpers.py +++ b/nemoguardrails/llm/helpers.py @@ -13,18 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Optional, Type, Union +from typing import List, Optional, Type from langchain.callbacks.manager import ( AsyncCallbackManagerForLLMRun, CallbackManagerForLLMRun, ) -from langchain_core.language_models.llms import LLM, BaseLLM +from langchain_core.language_models.llms import LLM -def get_llm_instance_wrapper( - llm_instance: Union[LLM, BaseLLM], llm_type: str -) -> Type[LLM]: +def get_llm_instance_wrapper(llm_instance: LLM, llm_type: str) -> Type[LLM]: """Wraps an LLM instance in a class that can be registered with LLMRails. This is useful to create specific types of LLMs using a generic LLM provider @@ -47,7 +45,7 @@ def model_kwargs(self): These are needed to allow changes to the arguments of the LLM calls. 
""" if hasattr(llm_instance, "model_kwargs"): - return llm_instance.model_kwargs + return getattr(llm_instance, "model_kwargs") return {} @property @@ -66,26 +64,29 @@ def _modify_instance_kwargs(self): """ if hasattr(llm_instance, "model_kwargs"): - if isinstance(llm_instance.model_kwargs, dict): - llm_instance.model_kwargs["temperature"] = self.temperature - llm_instance.model_kwargs["streaming"] = self.streaming + model_kwargs = getattr(llm_instance, "model_kwargs") + if isinstance(model_kwargs, dict): + model_kwargs["temperature"] = self.temperature + model_kwargs["streaming"] = self.streaming def _call( self, prompt: str, stop: Optional[List[str]] = None, run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs, ) -> str: self._modify_instance_kwargs() - return llm_instance._call(prompt, stop, run_manager) + return llm_instance._call(prompt, stop, run_manager, **kwargs) async def _acall( self, prompt: str, stop: Optional[List[str]] = None, run_manager: Optional[AsyncCallbackManagerForLLMRun] = None, + **kwargs, ) -> str: self._modify_instance_kwargs() - return await llm_instance._acall(prompt, stop, run_manager) + return await llm_instance._acall(prompt, stop, run_manager, **kwargs) return WrapperLLM diff --git a/nemoguardrails/llm/models/initializer.py b/nemoguardrails/llm/models/initializer.py index 09071920c..bd97d03fd 100644 --- a/nemoguardrails/llm/models/initializer.py +++ b/nemoguardrails/llm/models/initializer.py @@ -20,12 +20,15 @@ from langchain_core.language_models import BaseChatModel from langchain_core.language_models.llms import BaseLLM -from .langchain_initializer import ModelInitializationError, init_langchain_model +from nemoguardrails.llm.models.langchain_initializer import ( + ModelInitializationError, + init_langchain_model, +) # later we can easily conver it to a class def init_llm_model( - model_name: Optional[str], + model_name: str, provider_name: str, mode: Literal["chat", "text"], kwargs: Dict[str, Any], diff --git a/nemoguardrails/llm/params.py b/nemoguardrails/llm/params.py index 7a4cf13f6..3cdf948c0 100644 --- a/nemoguardrails/llm/params.py +++ b/nemoguardrails/llm/params.py @@ -36,7 +36,7 @@ import logging import warnings -from typing import Dict, Type +from typing import Any, Dict, Type from langchain.base_language import BaseLanguageModel @@ -61,18 +61,18 @@ def __init__(self, llm: BaseLanguageModel, **kwargs): warnings.warn(_DEPRECATION_MESSAGE, DeprecationWarning, stacklevel=2) self.llm = llm self.altered_params = kwargs - self.original_params = {} + self.original_params: dict[str, Any] = {} def __enter__(self): # Here we can access and modify the global language model parameters. - self.original_params = {} for param, value in self.altered_params.items(): if hasattr(self.llm, param): self.original_params[param] = getattr(self.llm, param) setattr(self.llm, param, value) elif hasattr(self.llm, "model_kwargs"): - if param not in self.llm.model_kwargs: + model_kwargs = getattr(self.llm, "model_kwargs", {}) + if param not in model_kwargs: log.warning( "Parameter %s does not exist for %s. 
Passing to model_kwargs", param, @@ -81,9 +81,10 @@ def __enter__(self): self.original_params[param] = None else: - self.original_params[param] = self.llm.model_kwargs[param] + self.original_params[param] = model_kwargs[param] - self.llm.model_kwargs[param] = value + model_kwargs[param] = value + setattr(self.llm, "model_kwargs", model_kwargs) else: log.warning( @@ -92,7 +93,7 @@ def __enter__(self): self.llm.__class__.__name__, ) - def __exit__(self, type, value, traceback): + def __exit__(self, exc_type, value, traceback): # Restore original parameters when exiting the context for param, value in self.original_params.items(): if hasattr(self.llm, param): diff --git a/nemoguardrails/llm/providers/huggingface/pipeline.py b/nemoguardrails/llm/providers/huggingface/pipeline.py index 8745a109d..918837693 100644 --- a/nemoguardrails/llm/providers/huggingface/pipeline.py +++ b/nemoguardrails/llm/providers/huggingface/pipeline.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import asyncio from typing import Any, List, Optional from langchain.callbacks.manager import ( @@ -20,7 +21,25 @@ CallbackManagerForLLMRun, ) from langchain.schema.output import GenerationChunk -from langchain_community.llms import HuggingFacePipeline + +# Import HuggingFacePipeline with fallbacks for different LangChain versions +HuggingFacePipeline = None # type: ignore[assignment] + +try: + from langchain_community.llms import ( + HuggingFacePipeline, # type: ignore[attr-defined,no-redef] + ) +except ImportError: + # Fallback for older versions of langchain + try: + from langchain.llms import ( + HuggingFacePipeline, # type: ignore[attr-defined,no-redef] + ) + except ImportError: + # Create a dummy class if HuggingFacePipeline is not available + class HuggingFacePipeline: # type: ignore[misc,no-redef] + def __init__(self, *args, **kwargs): + raise ImportError("HuggingFacePipeline is not available") class HuggingFacePipelineCompatible(HuggingFacePipeline): @@ -47,12 +66,13 @@ def _call( ) # Streaming for NeMo Guardrails is not supported in sync calls. - if self.model_kwargs and self.model_kwargs.get("streaming"): - raise Exception( + model_kwargs = getattr(self, "model_kwargs", {}) + if model_kwargs and model_kwargs.get("streaming"): + raise NotImplementedError( "Streaming mode not supported for HuggingFacePipeline in NeMo Guardrails!" ) - llm_result = self._generate( + llm_result = getattr(self, "_generate")( [prompt], stop=stop, run_manager=run_manager, @@ -78,11 +98,12 @@ async def _acall( ) # Handle streaming, if the flag is set - if self.model_kwargs and self.model_kwargs.get("streaming"): + model_kwargs = getattr(self, "model_kwargs", {}) + if model_kwargs and model_kwargs.get("streaming"): # Retrieve the streamer object, needs to be set in model_kwargs - streamer = self.model_kwargs.get("streamer") + streamer = model_kwargs.get("streamer") if not streamer: - raise Exception( + raise ValueError( "Cannot stream, please add HuggingFace streamer object to model_kwargs!" ) @@ -99,7 +120,7 @@ async def _acall( run_manager=run_manager, **kwargs, ) - loop.create_task(self._agenerate(**generation_kwargs)) + loop.create_task(getattr(self, "_agenerate")(**generation_kwargs)) # And start waiting for the chunks to come in. 
completion = "" @@ -111,7 +132,7 @@ async def _acall( return completion - llm_result = await self._agenerate( + llm_result = await getattr(self, "_agenerate")( [prompt], stop=stop, run_manager=run_manager, diff --git a/nemoguardrails/llm/providers/huggingface/streamers.py b/nemoguardrails/llm/providers/huggingface/streamers.py index c163b8217..7ed5a3beb 100644 --- a/nemoguardrails/llm/providers/huggingface/streamers.py +++ b/nemoguardrails/llm/providers/huggingface/streamers.py @@ -14,11 +14,27 @@ # limitations under the License. import asyncio +from typing import TYPE_CHECKING, Optional -from transformers.generation.streamers import TextStreamer +TRANSFORMERS_AVAILABLE = True +try: + from transformers.generation.streamers import ( # type: ignore[import-untyped] + TextStreamer, + ) +except ImportError: + # Fallback if transformers is not available + TRANSFORMERS_AVAILABLE = False + class TextStreamer: # type: ignore[no-redef] + def __init__(self, *args, **kwargs): + pass -class AsyncTextIteratorStreamer(TextStreamer): + +if TYPE_CHECKING: + from transformers import AutoTokenizer # type: ignore[import-untyped] + + +class AsyncTextIteratorStreamer(TextStreamer): # type: ignore[misc] """ Simple async implementation for HuggingFace Transformers streamers. @@ -30,12 +46,14 @@ def __init__( self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, **decode_kwargs ): super().__init__(tokenizer, skip_prompt, **decode_kwargs) - self.text_queue = asyncio.Queue() + self.text_queue: asyncio.Queue[str] = asyncio.Queue() self.stop_signal = None - self.loop = None + self.loop: Optional[asyncio.AbstractEventLoop] = None def on_finalized_text(self, text: str, stream_end: bool = False): """Put the new text in the queue. If the stream is ending, also put a stop signal in the queue.""" + if self.loop is None: + return if len(text) > 0: asyncio.run_coroutine_threadsafe(self.text_queue.put(text), self.loop) diff --git a/nemoguardrails/llm/providers/trtllm/client.py b/nemoguardrails/llm/providers/trtllm/client.py index 46fd2ff3f..9e74d72c1 100644 --- a/nemoguardrails/llm/providers/trtllm/client.py +++ b/nemoguardrails/llm/providers/trtllm/client.py @@ -19,7 +19,25 @@ import queue import time from functools import partial -from typing import Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +# Try to import tritonclient dependencies, with fallbacks for type checking +try: + import tritonclient.grpc as grpcclient + from tritonclient.grpc.service_pb2 import ( + ModelInferResponse, # type: ignore[attr-defined] + ) + + TRITONCLIENT_AVAILABLE = True +except ImportError: + # Create dummy types when tritonclient is not available + grpcclient = Any # type: ignore + ModelInferResponse = Any # type: ignore + TRITONCLIENT_AVAILABLE = False + +if TYPE_CHECKING and not TRITONCLIENT_AVAILABLE: + import tritonclient.grpc as grpcclient # type: ignore + from tritonclient.grpc.service_pb2 import ModelInferResponse # type: ignore STOP_WORDS = [""] BAD_WORDS = [""] @@ -31,11 +49,11 @@ class TritonClient: def __init__(self, server_url: str) -> None: """Initialize the client.""" - # pylint: disable-next=import-outside-toplevel - import tritonclient.grpc as grpcclient + if not TRITONCLIENT_AVAILABLE: + raise ImportError("tritonclient is required for TensorRT-LLM support") self.server_url = server_url - self.client = grpcclient.InferenceServerClient(server_url) + self.client = grpcclient.InferenceServerClient(server_url) # type: ignore def load_model(self, model_name: str, timeout: 
int = 1000) -> None: """Load a model into the server.""" @@ -54,29 +72,33 @@ def load_model(self, model_name: str, timeout: int = 1000) -> None: def get_model_list(self) -> List[str]: """Get a list of models loaded in the triton server.""" res = self.client.get_model_repository_index(as_json=True) + if res is None or "models" not in res: + return [] return [model["name"] for model in res["models"]] def get_model_concurrency(self, model_name: str, timeout: int = 1000) -> int: """Get the modle concurrency.""" self.load_model(model_name, timeout) - instances = self.client.get_model_config(model_name, as_json=True)["config"][ - "instance_group" - ] + config_result = self.client.get_model_config(model_name, as_json=True) + if config_result is None or "config" not in config_result: + return 0 + instances = config_result["config"].get("instance_group", []) return sum(instance["count"] * len(instance["gpus"]) for instance in instances) @staticmethod def process_result(result: Dict[str, str]) -> Dict[str, str]: """Post-process the result from the server.""" - import google.protobuf.json_format # pylint: disable=import-outside-toplevel - import tritonclient.grpc as grpcclient # pylint: disable=import-outside-toplevel + if not TRITONCLIENT_AVAILABLE: + raise ImportError("tritonclient is required for TensorRT-LLM support") - # pylint: disable-next=import-outside-toplevel - from tritonclient.grpc.service_pb2 import ModelInferResponse + import google.protobuf.json_format - message = ModelInferResponse() + message = ModelInferResponse() # type: ignore[misc] google.protobuf.json_format.Parse(json.dumps(result), message) - infer_result = grpcclient.InferResult(message) + infer_result = grpcclient.InferResult(message) # type: ignore np_res = infer_result.as_numpy("OUTPUT_0") + if np_res is None: + return {"OUTPUT_0": ""} if np_res.ndim == 2: generated_text = np_res[0, 0].decode() else: @@ -140,21 +162,21 @@ def close_streaming(self) -> None: self.client.stop_stream() @staticmethod - def generate_outputs() -> List["grpcclient.InferRequestedOutput"]: + def generate_outputs() -> List[Any]: """Generate the expected output structure.""" - import tritonclient.grpc as grpcclient # pylint: disable=import-outside-toplevel - - return [grpcclient.InferRequestedOutput("OUTPUT_0")] + if not TRITONCLIENT_AVAILABLE: + raise ImportError("tritonclient is required for TensorRT-LLM support") + return [grpcclient.InferRequestedOutput("OUTPUT_0")] # type: ignore @staticmethod - def prepare_tensor(name: str, input_data: Any) -> "grpcclient.InferInput": + def prepare_tensor(name: str, input_data: Any) -> Any: """Prepare an input data structure.""" - import tritonclient.grpc as grpcclient # pylint: disable=import-outside-toplevel + if not TRITONCLIENT_AVAILABLE: + raise ImportError("tritonclient is required for TensorRT-LLM support") - # pylint: disable-next=import-outside-toplevel from tritonclient.utils import np_to_triton_dtype - t = grpcclient.InferInput( + t = grpcclient.InferInput( # type: ignore name, input_data.shape, np_to_triton_dtype(input_data.dtype) ) t.set_data_from_numpy(input_data) @@ -170,7 +192,7 @@ def generate_inputs( # pylint: disable=too-many-arguments,too-many-locals beam_width: int = 1, repetition_penalty: float = 1, length_penalty: float = 1.0, - ) -> List["grpcclient.InferInput"]: + ) -> List[Any]: """Create the input for the triton inference server.""" import numpy as np # pylint: disable=import-outside-toplevel diff --git a/nemoguardrails/llm/providers/trtllm/llm.py 
b/nemoguardrails/llm/providers/trtllm/llm.py index cec6a5fe1..ea332d7cc 100644 --- a/nemoguardrails/llm/providers/trtllm/llm.py +++ b/nemoguardrails/llm/providers/trtllm/llm.py @@ -18,7 +18,13 @@ import queue from functools import partial -from typing import Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +if TYPE_CHECKING: + try: + from tritonclient.utils import InferenceServerException + except ImportError: + InferenceServerException = Exception from langchain.callbacks.manager import CallbackManagerForLLMRun from langchain_core.language_models.llms import BaseLLM @@ -107,7 +113,7 @@ def _llm_type(self) -> str: def _call( self, prompt: str, - stop: Optional[List[str]] = None, + stop: Optional[List[str]] = None, # pylint: disable=unused-argument run_manager: Optional[CallbackManagerForLLMRun] = None, **kwargs: Any, ) -> str: diff --git a/nemoguardrails/llm/taskmanager.py b/nemoguardrails/llm/taskmanager.py index 49e39cc24..1cf5850bb 100644 --- a/nemoguardrails/llm/taskmanager.py +++ b/nemoguardrails/llm/taskmanager.py @@ -95,6 +95,8 @@ def __init__(self, config: RailsConfig): def _get_general_instructions(self): """Helper to extract the general instructions.""" text = "" + if self.config.instructions is None: + return text for instruction in self.config.instructions: if instruction.type == "general": text = instruction.content @@ -266,7 +268,9 @@ def render_task_prompt( task_prompt = self._render_string( prompt.content, context=context, events=events ) - while len(task_prompt) > prompt.max_length: + while ( + prompt.max_length is not None and len(task_prompt) > prompt.max_length + ): if not events: raise Exception( f"Prompt exceeds max length of {prompt.max_length} characters even without history" @@ -288,20 +292,27 @@ def render_task_prompt( return task_prompt else: + if prompt.messages is None: + return [] task_messages = self._render_messages( prompt.messages, context=context, events=events ) task_prompt_length = self._get_messages_text_length(task_messages) - while task_prompt_length > prompt.max_length: + while ( + prompt.max_length is not None and task_prompt_length > prompt.max_length + ): if not events: raise Exception( f"Prompt exceeds max length of {prompt.max_length} characters even without history" ) # Remove events from the beginning of the history until the prompt fits. 
events = events[1:] - task_messages = self._render_messages( - prompt.messages, context=context, events=events - ) + if prompt.messages is not None: + task_messages = self._render_messages( + prompt.messages, context=context, events=events + ) + else: + task_messages = [] task_prompt_length = self._get_messages_text_length(task_messages) return task_messages From d91749c2e1e8098db2073ba60b8a5e8be55b310f Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 13 Oct 2025 14:57:46 -0500 Subject: [PATCH 02/16] Add nemoguardrails/llm to the pyright pre-commit check --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 2e79e544d..bf31b0532 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -157,6 +157,7 @@ pyright = "^1.1.405" include = [ "nemoguardrails/rails/**", "nemoguardrails/actions/**", + "nemoguardrails/llm/**", "nemoguardrails/embeddings/**", "nemoguardrails/cli/**", "nemoguardrails/kb/**", From 75e60009fcf91f8a5f7a238189fd67b5902cf2c3 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 13 Oct 2025 15:18:34 -0500 Subject: [PATCH 03/16] Fix types in nemoguardrails/rails module --- nemoguardrails/rails/llm/llmrails.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/nemoguardrails/rails/llm/llmrails.py b/nemoguardrails/rails/llm/llmrails.py index 96647158e..e6c4a9d5d 100644 --- a/nemoguardrails/rails/llm/llmrails.py +++ b/nemoguardrails/rails/llm/llmrails.py @@ -458,7 +458,7 @@ def _init_llms(self): (model for model in self.config.models if model.type == "main"), None ) - if main_model: + if main_model and main_model.model: kwargs = self._prepare_model_kwargs(main_model) self.llm = init_llm_model( model_name=main_model.model, @@ -489,7 +489,16 @@ def _init_llms(self): continue try: - model_name = llm_config.model + model_name = ( + llm_config.model + if llm_config.model + else llm_config.parameters["model"] + ) + if not model_name: + raise ModelInitializationError( + f"No model name provided in {llm_config}" + ) + provider_name = llm_config.engine kwargs = self._prepare_model_kwargs(llm_config) mode = llm_config.mode From 092db00230a9759d9aea580c53c385b796fd94a1 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 13 Oct 2025 16:02:11 -0500 Subject: [PATCH 04/16] Use poetry install --all-extras --with dev to install langchain_nvidia_ai_endpoints for Github CI tests --- .github/workflows/pr-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr-tests.yml b/.github/workflows/pr-tests.yml index 35652aba8..037a508d3 100644 --- a/.github/workflows/pr-tests.yml +++ b/.github/workflows/pr-tests.yml @@ -21,6 +21,7 @@ jobs: os: ${{ matrix.os }} image: ${{ matrix.image }} python-version: ${{ matrix.python-version }} + upgrade-deps: true pr-tests-summary: name: PR Tests Summary needs: pr-tests-matrix From 5ea4eff486d854926c33e5e41969ec53d1332e23 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 13 Oct 2025 16:17:37 -0500 Subject: [PATCH 05/16] Install extras in test-coverage-report so the langchain_nvidia_ai_endpoints work for pyright type-checking --- .github/workflows/test-coverage-report.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-coverage-report.yml b/.github/workflows/test-coverage-report.yml index 30b4e1dc8..3d072edad 100644 --- 
a/.github/workflows/test-coverage-report.yml +++ b/.github/workflows/test-coverage-report.yml @@ -28,7 +28,7 @@ jobs: run: poetry config virtualenvs.in-project true - name: Install dependencies - run: poetry install --with dev + run: poetry install --with dev --all-extras - name: Run pre-commit hooks run: poetry run make pre_commit From b32e10ced5358772a0f6fef42cb464c57953f277 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 13 Oct 2025 16:21:10 -0500 Subject: [PATCH 06/16] Remove tritonclient from type-checking (should this be deprecated? --- pyproject.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index bf31b0532..5c4cb5094 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -167,6 +167,11 @@ include = [ "tests/test_callbacks.py", ] +# tritonclient is only supported for Python <= 3.8, imports fail pyright-checking +exclude = [ + "nemoguardrails/llm/providers/trtllm/**" +] + [tool.poetry.group.docs] optional = true From 4f2053298b2bc6e413c5aa6f2207aa8ce65c506e Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 13 Oct 2025 16:49:46 -0500 Subject: [PATCH 07/16] Add upgrade-deps to the full-tests.yml file in Github CI/CD --- .github/workflows/full-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/full-tests.yml b/.github/workflows/full-tests.yml index 56898a41c..02b0a278f 100644 --- a/.github/workflows/full-tests.yml +++ b/.github/workflows/full-tests.yml @@ -32,6 +32,7 @@ jobs: os: ${{ matrix.os }} image: ${{ matrix.image }} python-version: ${{ matrix.python-version }} + upgrade-deps: true full-tests-summary: name: Full Tests Summary needs: full-tests-matrix From f5b5d75aeabaa68c620d76bd67fb9904a72e7414 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Fri, 24 Oct 2025 12:10:14 -0500 Subject: [PATCH 08/16] Exclude providers/trtllm/** and providers/_langchain_nvidia_ai_endpoints_patch.py from type-checking --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5c4cb5094..4558978d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -169,7 +169,8 @@ include = [ # tritonclient is only supported for Python <= 3.8, imports fail pyright-checking exclude = [ - "nemoguardrails/llm/providers/trtllm/**" + "nemoguardrails/llm/providers/trtllm/**", + "nemoguardrails/llm/providers/_langchain_nvidia_ai_endpoints_patch.py" ] [tool.poetry.group.docs] From 8cb9bf5c10c7be409424cd17d79a93af11581953 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Fri, 24 Oct 2025 12:12:16 -0500 Subject: [PATCH 09/16] Roll back type cleaning under llm/providers/trtllm now they're excluded from type-checking --- nemoguardrails/llm/providers/trtllm/client.py | 66 +++++++------------ nemoguardrails/llm/providers/trtllm/llm.py | 10 +-- 2 files changed, 24 insertions(+), 52 deletions(-) diff --git a/nemoguardrails/llm/providers/trtllm/client.py b/nemoguardrails/llm/providers/trtllm/client.py index 9e74d72c1..46fd2ff3f 100644 --- a/nemoguardrails/llm/providers/trtllm/client.py +++ b/nemoguardrails/llm/providers/trtllm/client.py @@ -19,25 +19,7 @@ import queue import time from functools import partial -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union - -# Try to import tritonclient dependencies, with fallbacks for type checking -try: - import tritonclient.grpc as grpcclient - from 
tritonclient.grpc.service_pb2 import ( - ModelInferResponse, # type: ignore[attr-defined] - ) - - TRITONCLIENT_AVAILABLE = True -except ImportError: - # Create dummy types when tritonclient is not available - grpcclient = Any # type: ignore - ModelInferResponse = Any # type: ignore - TRITONCLIENT_AVAILABLE = False - -if TYPE_CHECKING and not TRITONCLIENT_AVAILABLE: - import tritonclient.grpc as grpcclient # type: ignore - from tritonclient.grpc.service_pb2 import ModelInferResponse # type: ignore +from typing import Any, Dict, List, Optional, Union STOP_WORDS = [""] BAD_WORDS = [""] @@ -49,11 +31,11 @@ class TritonClient: def __init__(self, server_url: str) -> None: """Initialize the client.""" - if not TRITONCLIENT_AVAILABLE: - raise ImportError("tritonclient is required for TensorRT-LLM support") + # pylint: disable-next=import-outside-toplevel + import tritonclient.grpc as grpcclient self.server_url = server_url - self.client = grpcclient.InferenceServerClient(server_url) # type: ignore + self.client = grpcclient.InferenceServerClient(server_url) def load_model(self, model_name: str, timeout: int = 1000) -> None: """Load a model into the server.""" @@ -72,33 +54,29 @@ def load_model(self, model_name: str, timeout: int = 1000) -> None: def get_model_list(self) -> List[str]: """Get a list of models loaded in the triton server.""" res = self.client.get_model_repository_index(as_json=True) - if res is None or "models" not in res: - return [] return [model["name"] for model in res["models"]] def get_model_concurrency(self, model_name: str, timeout: int = 1000) -> int: """Get the modle concurrency.""" self.load_model(model_name, timeout) - config_result = self.client.get_model_config(model_name, as_json=True) - if config_result is None or "config" not in config_result: - return 0 - instances = config_result["config"].get("instance_group", []) + instances = self.client.get_model_config(model_name, as_json=True)["config"][ + "instance_group" + ] return sum(instance["count"] * len(instance["gpus"]) for instance in instances) @staticmethod def process_result(result: Dict[str, str]) -> Dict[str, str]: """Post-process the result from the server.""" - if not TRITONCLIENT_AVAILABLE: - raise ImportError("tritonclient is required for TensorRT-LLM support") + import google.protobuf.json_format # pylint: disable=import-outside-toplevel + import tritonclient.grpc as grpcclient # pylint: disable=import-outside-toplevel - import google.protobuf.json_format + # pylint: disable-next=import-outside-toplevel + from tritonclient.grpc.service_pb2 import ModelInferResponse - message = ModelInferResponse() # type: ignore[misc] + message = ModelInferResponse() google.protobuf.json_format.Parse(json.dumps(result), message) - infer_result = grpcclient.InferResult(message) # type: ignore + infer_result = grpcclient.InferResult(message) np_res = infer_result.as_numpy("OUTPUT_0") - if np_res is None: - return {"OUTPUT_0": ""} if np_res.ndim == 2: generated_text = np_res[0, 0].decode() else: @@ -162,21 +140,21 @@ def close_streaming(self) -> None: self.client.stop_stream() @staticmethod - def generate_outputs() -> List[Any]: + def generate_outputs() -> List["grpcclient.InferRequestedOutput"]: """Generate the expected output structure.""" - if not TRITONCLIENT_AVAILABLE: - raise ImportError("tritonclient is required for TensorRT-LLM support") - return [grpcclient.InferRequestedOutput("OUTPUT_0")] # type: ignore + import tritonclient.grpc as grpcclient # pylint: disable=import-outside-toplevel + + return 
[grpcclient.InferRequestedOutput("OUTPUT_0")] @staticmethod - def prepare_tensor(name: str, input_data: Any) -> Any: + def prepare_tensor(name: str, input_data: Any) -> "grpcclient.InferInput": """Prepare an input data structure.""" - if not TRITONCLIENT_AVAILABLE: - raise ImportError("tritonclient is required for TensorRT-LLM support") + import tritonclient.grpc as grpcclient # pylint: disable=import-outside-toplevel + # pylint: disable-next=import-outside-toplevel from tritonclient.utils import np_to_triton_dtype - t = grpcclient.InferInput( # type: ignore + t = grpcclient.InferInput( name, input_data.shape, np_to_triton_dtype(input_data.dtype) ) t.set_data_from_numpy(input_data) @@ -192,7 +170,7 @@ def generate_inputs( # pylint: disable=too-many-arguments,too-many-locals beam_width: int = 1, repetition_penalty: float = 1, length_penalty: float = 1.0, - ) -> List[Any]: + ) -> List["grpcclient.InferInput"]: """Create the input for the triton inference server.""" import numpy as np # pylint: disable=import-outside-toplevel diff --git a/nemoguardrails/llm/providers/trtllm/llm.py b/nemoguardrails/llm/providers/trtllm/llm.py index ea332d7cc..cec6a5fe1 100644 --- a/nemoguardrails/llm/providers/trtllm/llm.py +++ b/nemoguardrails/llm/providers/trtllm/llm.py @@ -18,13 +18,7 @@ import queue from functools import partial -from typing import TYPE_CHECKING, Any, Dict, List, Optional - -if TYPE_CHECKING: - try: - from tritonclient.utils import InferenceServerException - except ImportError: - InferenceServerException = Exception +from typing import Any, Dict, List, Optional from langchain.callbacks.manager import CallbackManagerForLLMRun from langchain_core.language_models.llms import BaseLLM @@ -113,7 +107,7 @@ def _llm_type(self) -> str: def _call( self, prompt: str, - stop: Optional[List[str]] = None, # pylint: disable=unused-argument + stop: Optional[List[str]] = None, run_manager: Optional[CallbackManagerForLLMRun] = None, **kwargs: Any, ) -> str: From 3655b1bd92e6f61909be2f3e6c08c4a0ecb0f469 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Fri, 24 Oct 2025 12:27:14 -0500 Subject: [PATCH 10/16] Type-clean the LFU cache implementation --- nemoguardrails/llm/cache/lfu.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/nemoguardrails/llm/cache/lfu.py b/nemoguardrails/llm/cache/lfu.py index 2c48ecb75..968aacf0e 100644 --- a/nemoguardrails/llm/cache/lfu.py +++ b/nemoguardrails/llm/cache/lfu.py @@ -54,7 +54,8 @@ def append(self, node: LFUNode) -> None: """Add node to the end of the list (before tail).""" node.prev = self.tail.prev node.next = self.tail - self.tail.prev.next = node + if self.tail.prev: + self.tail.prev.next = node self.tail.prev = node self.size += 1 @@ -67,8 +68,10 @@ def pop(self, node: Optional[LFUNode] = None) -> Optional[LFUNode]: node = self.head.next # Remove node from the list - node.prev.next = node.next - node.next.prev = node.prev + if node and node.prev: + node.prev.next = node.next + if node and node.next: + node.next.prev = node.prev self.size -= 1 return node @@ -121,6 +124,7 @@ def __init__( "evictions": 0, "puts": 0, "updates": 0, + "hit_rate": 0.0, } def _update_node_freq(self, node: LFUNode) -> None: @@ -272,7 +276,7 @@ def get_stats(self) -> dict: # Calculate hit rate total_requests = stats["hits"] + stats["misses"] - stats["hit_rate"] = ( + stats["hit_rate"] = float( stats["hits"] / total_requests if total_requests > 0 else 0.0 ) @@ -288,6 +292,7 @@ def reset_stats(self) -> None: 
"evictions": 0, "puts": 0, "updates": 0, + "hit_rate": 0.0, } def _check_and_log_stats(self) -> None: From 59c1da0a2da265b5660ef4bb3ada26cbc6ca66c4 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Fri, 24 Oct 2025 12:38:04 -0500 Subject: [PATCH 11/16] Address Pouyan's feedback. Removed Model.model Optional and default value --- nemoguardrails/llm/helpers.py | 2 +- nemoguardrails/llm/providers/huggingface/pipeline.py | 2 +- nemoguardrails/rails/llm/config.py | 3 +-- nemoguardrails/rails/llm/llmrails.py | 11 +---------- 4 files changed, 4 insertions(+), 14 deletions(-) diff --git a/nemoguardrails/llm/helpers.py b/nemoguardrails/llm/helpers.py index 23d001e90..88488dcf2 100644 --- a/nemoguardrails/llm/helpers.py +++ b/nemoguardrails/llm/helpers.py @@ -45,7 +45,7 @@ def model_kwargs(self): These are needed to allow changes to the arguments of the LLM calls. """ if hasattr(llm_instance, "model_kwargs"): - return getattr(llm_instance, "model_kwargs") + return llm_instance.model_kwargs # type: ignore[attr-defined] (We check in line above) return {} @property diff --git a/nemoguardrails/llm/providers/huggingface/pipeline.py b/nemoguardrails/llm/providers/huggingface/pipeline.py index 918837693..b81cafb90 100644 --- a/nemoguardrails/llm/providers/huggingface/pipeline.py +++ b/nemoguardrails/llm/providers/huggingface/pipeline.py @@ -72,7 +72,7 @@ def _call( "Streaming mode not supported for HuggingFacePipeline in NeMo Guardrails!" ) - llm_result = getattr(self, "_generate")( + llm_result = self._generate( # type: ignore[attr-defined] [prompt], stop=stop, run_manager=run_manager, diff --git a/nemoguardrails/rails/llm/config.py b/nemoguardrails/rails/llm/config.py index 90d24bdc7..4248b2be1 100644 --- a/nemoguardrails/rails/llm/config.py +++ b/nemoguardrails/rails/llm/config.py @@ -112,8 +112,7 @@ class Model(BaseModel): type: str engine: str - model: Optional[str] = Field( - default=None, + model: str = Field( description="The name of the model. 
If not specified, it should be specified through the parameters attribute.", ) api_key_env_var: Optional[str] = Field( diff --git a/nemoguardrails/rails/llm/llmrails.py b/nemoguardrails/rails/llm/llmrails.py index e6c4a9d5d..f8b293f09 100644 --- a/nemoguardrails/rails/llm/llmrails.py +++ b/nemoguardrails/rails/llm/llmrails.py @@ -489,16 +489,7 @@ def _init_llms(self): continue try: - model_name = ( - llm_config.model - if llm_config.model - else llm_config.parameters["model"] - ) - if not model_name: - raise ModelInitializationError( - f"No model name provided in {llm_config}" - ) - + model_name = llm_config.model provider_name = llm_config.engine kwargs = self._prepare_model_kwargs(llm_config) mode = llm_config.mode From a4aad26a3903fc66b00c7a5a85c84348038feac8 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Fri, 24 Oct 2025 12:39:11 -0500 Subject: [PATCH 12/16] fix typo --- nemoguardrails/llm/models/initializer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemoguardrails/llm/models/initializer.py b/nemoguardrails/llm/models/initializer.py index bd97d03fd..fc0f4b5d6 100644 --- a/nemoguardrails/llm/models/initializer.py +++ b/nemoguardrails/llm/models/initializer.py @@ -26,7 +26,7 @@ ) -# later we can easily conver it to a class +# later we can easily convert it to a class def init_llm_model( model_name: str, provider_name: str, From 1ad6b73633584fc5a8c9a128228cd7f73ff57ab1 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Fri, 24 Oct 2025 12:40:43 -0500 Subject: [PATCH 13/16] Revert github workflow changes (not needed now we exclude trtllm from type-checking) --- .github/workflows/full-tests.yml | 1 - .github/workflows/pr-tests.yml | 1 - .github/workflows/test-coverage-report.yml | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/full-tests.yml b/.github/workflows/full-tests.yml index 02b0a278f..56898a41c 100644 --- a/.github/workflows/full-tests.yml +++ b/.github/workflows/full-tests.yml @@ -32,7 +32,6 @@ jobs: os: ${{ matrix.os }} image: ${{ matrix.image }} python-version: ${{ matrix.python-version }} - upgrade-deps: true full-tests-summary: name: Full Tests Summary needs: full-tests-matrix diff --git a/.github/workflows/pr-tests.yml b/.github/workflows/pr-tests.yml index 037a508d3..35652aba8 100644 --- a/.github/workflows/pr-tests.yml +++ b/.github/workflows/pr-tests.yml @@ -21,7 +21,6 @@ jobs: os: ${{ matrix.os }} image: ${{ matrix.image }} python-version: ${{ matrix.python-version }} - upgrade-deps: true pr-tests-summary: name: PR Tests Summary needs: pr-tests-matrix diff --git a/.github/workflows/test-coverage-report.yml b/.github/workflows/test-coverage-report.yml index 3d072edad..30b4e1dc8 100644 --- a/.github/workflows/test-coverage-report.yml +++ b/.github/workflows/test-coverage-report.yml @@ -28,7 +28,7 @@ jobs: run: poetry config virtualenvs.in-project true - name: Install dependencies - run: poetry install --with dev --all-extras + run: poetry install --with dev - name: Run pre-commit hooks run: poetry run make pre_commit From a2b64d65e97be781e8d752927dd8d9998d250585 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Fri, 24 Oct 2025 12:43:24 -0500 Subject: [PATCH 14/16] Remove comment from pyproject.toml --- pyproject.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4558978d8..5c743052e 100644 --- a/pyproject.toml +++ b/pyproject.toml 
@@ -166,8 +166,6 @@ include = [ "nemoguardrails/server/**", "tests/test_callbacks.py", ] - -# tritonclient is only supported for Python <= 3.8, imports fail pyright-checking exclude = [ "nemoguardrails/llm/providers/trtllm/**", "nemoguardrails/llm/providers/_langchain_nvidia_ai_endpoints_patch.py" From 93dd296a6a916b6e3d7feb55cca9d2e038040c05 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Fri, 24 Oct 2025 12:52:08 -0500 Subject: [PATCH 15/16] Revert mandatory Model name field change, add None-guard back --- nemoguardrails/rails/llm/config.py | 3 ++- nemoguardrails/rails/llm/llmrails.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/nemoguardrails/rails/llm/config.py b/nemoguardrails/rails/llm/config.py index 4248b2be1..90d24bdc7 100644 --- a/nemoguardrails/rails/llm/config.py +++ b/nemoguardrails/rails/llm/config.py @@ -112,7 +112,8 @@ class Model(BaseModel): type: str engine: str - model: str = Field( + model: Optional[str] = Field( + default=None, description="The name of the model. If not specified, it should be specified through the parameters attribute.", ) api_key_env_var: Optional[str] = Field( diff --git a/nemoguardrails/rails/llm/llmrails.py b/nemoguardrails/rails/llm/llmrails.py index f8b293f09..187300aa2 100644 --- a/nemoguardrails/rails/llm/llmrails.py +++ b/nemoguardrails/rails/llm/llmrails.py @@ -490,6 +490,9 @@ def _init_llms(self): try: model_name = llm_config.model + if not model_name: + raise ValueError("LLM Config model field not set") + provider_name = llm_config.engine kwargs = self._prepare_model_kwargs(llm_config) mode = llm_config.mode From 721a71b3f90b3b7a4611e60ebb6ab731d5aa27fd Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 27 Oct 2025 17:11:09 -0500 Subject: [PATCH 16/16] Address last feedback --- nemoguardrails/llm/cache/lfu.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nemoguardrails/llm/cache/lfu.py b/nemoguardrails/llm/cache/lfu.py index 968aacf0e..755f84b17 100644 --- a/nemoguardrails/llm/cache/lfu.py +++ b/nemoguardrails/llm/cache/lfu.py @@ -54,7 +54,7 @@ def append(self, node: LFUNode) -> None: """Add node to the end of the list (before tail).""" node.prev = self.tail.prev node.next = self.tail - if self.tail.prev: + if self.tail.prev is not None: self.tail.prev.next = node self.tail.prev = node self.size += 1 @@ -68,9 +68,9 @@ def pop(self, node: Optional[LFUNode] = None) -> Optional[LFUNode]: node = self.head.next # Remove node from the list - if node and node.prev: + if node is not None and node.prev is not None: node.prev.next = node.next - if node and node.next: + if node is not None and node.next is not None: node.next.prev = node.prev self.size -= 1 @@ -276,7 +276,7 @@ def get_stats(self) -> dict: # Calculate hit rate total_requests = stats["hits"] + stats["misses"] - stats["hit_rate"] = float( + stats["hit_rate"] = ( stats["hits"] / total_requests if total_requests > 0 else 0.0 )
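
Note: two patterns recur throughout this series when satisfying pyright — wrapping an optional dependency import in a try/except with a stub fallback (as done for HuggingFacePipeline, TextStreamer, and tritonclient), and guarding Optional attributes with explicit None checks before dereferencing them (as done for the LFU cache's linked-list pointers). The sketch below is a minimal, self-contained illustration of both; `fancylib` and the `Node` class are hypothetical placeholders, not part of the patch.

    # Sketch only: the optional-import fallback and Optional-attribute guard patterns.
    from typing import Optional

    try:
        # "fancylib" stands in for an optional dependency such as transformers or tritonclient.
        from fancylib import Widget
        FANCYLIB_AVAILABLE = True
    except ImportError:
        FANCYLIB_AVAILABLE = False

        class Widget:  # type: ignore[no-redef]
            """Stub so the module still imports (and type-checks) without the dependency."""

            def __init__(self, *args, **kwargs):
                raise ImportError("fancylib is required for this feature")


    class Node:
        """Toy linked-list node with an Optional successor, mirroring LFUNode."""

        def __init__(self, value: int) -> None:
            self.value = value
            self.next: Optional["Node"] = None

        def second(self) -> Optional["Node"]:
            # Guard the Optional link before dereferencing it, as in the lfu.py fix.
            if self.next is not None:
                return self.next.next
            return None


    if __name__ == "__main__":
        a, b = Node(1), Node(2)
        a.next = b
        print(a.second())            # None, since b has no successor
        print(FANCYLIB_AVAILABLE)    # False unless the optional dependency is installed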