Commit 3b5d09e

feat(content_safety): add auto-selected multilingual refusal bot message support

Detect the language of the user input and return refusal messages in that same language when content safety rails block unsafe content. Supports nine languages: English, Spanish, Chinese, German, French, Hindi, Japanese, Arabic, and Thai.

1 parent d3fb3d6, commit 3b5d09e

File tree: 8 files changed, +398 −11 lines
Lines changed: 30 additions & 0 deletions (new file)

```yaml
models:
  - type: main
    engine: nim
    model: meta/llama-3.3-70b-instruct

  - type: content_safety
    engine: nim
    model: nvidia/llama-3.1-nemotron-safety-guard-8b-v3

rails:
  config:
    content_safety:
      multilingual:
        enabled: true
        # Optional: customize refusal messages per language.
        # If not specified, built-in defaults are used.
        refusal_messages:
          # en: "Sorry, I cannot help with that request."
          # es: "Lo siento, no puedo ayudar con esa solicitud."
          # zh: "抱歉,我无法处理该请求。"

  input:
    flows:
      - content safety check input $model=content_safety

  output:
    flows:
      - content safety check output $model=content_safety

# 你很笨 (nǐ hěn bèn) — "You are stupid / you are foolish."
```
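The selection logic implied by `refusal_messages` (user override first, then built-in default, then English) can be sanity-checked in plain Python. This is an illustrative sketch, not the library's API: `pick_refusal` and the trimmed `DEFAULTS` dict are hypothetical names introduced here.

```python
# Minimal sketch of per-language refusal selection with fallback.
# DEFAULTS mirrors three of the commit's built-in messages; pick_refusal
# is a hypothetical helper, not part of nemoguardrails.

DEFAULTS = {
    "en": "I'm sorry, I can't respond to that.",
    "es": "Lo siento, no puedo responder a eso.",
    "zh": "抱歉,我无法回应。",
}


def pick_refusal(lang, custom=None):
    """Prefer a custom message, then the built-in one, then English."""
    custom = custom or {}
    return custom.get(lang) or DEFAULTS.get(lang) or custom.get("en") or DEFAULTS["en"]


print(pick_refusal("es"))                                 # built-in Spanish
print(pick_refusal("zh", {"zh": "抱歉,我无法处理该请求。"}))  # custom override wins
print(pick_refusal("xx"))                                 # unknown code falls back to English
```

The `or`-chain matches the order the commit's `_get_refusal_message` checks its sources in.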
Lines changed: 90 additions & 0 deletions (new file)

```yaml
prompts:
  - task: content_safety_check_input $model=content_safety
    content: |
      Task: Check if there is unsafe content in the user or agent messages in the below conversation in accordance with our safety policy with the below categories. Provide a 'safe' or 'unsafe' rating for both the user message and the agent response. If there is unsafe content, please also provide a list of violated categories according to our safety policy below.

      <BEGIN UNSAFE CONTENT CATEGORIES>
      S1: Violence.
      S2: Sexual.
      S3: Criminal Planning/Confessions.
      S4: Guns and Illegal Weapons.
      S5: Controlled/Regulated Substances.
      S6: Suicide and Self Harm.
      S7: Sexual (minor).
      S8: Hate/Identity Hate.
      S9: PII/Privacy.
      S10: Harassment.
      S11: Threat.
      S12: Profanity.
      S13: Needs Caution.
      S14: Other.
      S15: Manipulation.
      S16: Fraud/Deception.
      S17: Malware.
      S18: High Risk Gov Decision Making.
      S19: Political/Misinformation/Conspiracy.
      S20: Copyright/Trademark/Plagiarism.
      S21: Unauthorized Advice.
      S22: Illegal Activity.
      S23: Immoral/Unethical.
      <END UNSAFE CONTENT CATEGORIES>

      <BEGIN CONVERSATION>

      user: {{ user_input }}

      <END CONVERSATION>

      Provide your safety assessment for the above conversation in the below output JSON format:
      {"User Safety": safe or unsafe rating for the user message, "Response Safety": safe or unsafe rating for the agent response. Omit if no agent response present. "Safety Categories": a comma-separated list of applicable safety categories from the provided taxonomy. Omit if all safe.}

      Do not include anything other than the output JSON in your response.
      Output JSON:
    output_parser: nemoguard_parse_prompt_safety
    max_tokens: 50

  - task: content_safety_check_output $model=content_safety
    content: |
      Task: Check if there is unsafe content in the user or agent messages in the below conversation in accordance with our safety policy with the below categories. Provide a 'safe' or 'unsafe' rating for both the user message and the agent response. If there is unsafe content, please also provide a list of violated categories according to our safety policy below.

      <BEGIN UNSAFE CONTENT CATEGORIES>
      S1: Violence.
      S2: Sexual.
      S3: Criminal Planning/Confessions.
      S4: Guns and Illegal Weapons.
      S5: Controlled/Regulated Substances.
      S6: Suicide and Self Harm.
      S7: Sexual (minor).
      S8: Hate/Identity Hate.
      S9: PII/Privacy.
      S10: Harassment.
      S11: Threat.
      S12: Profanity.
      S13: Needs Caution.
      S14: Other.
      S15: Manipulation.
      S16: Fraud/Deception.
      S17: Malware.
      S18: High Risk Gov Decision Making.
      S19: Political/Misinformation/Conspiracy.
      S20: Copyright/Trademark/Plagiarism.
      S21: Unauthorized Advice.
      S22: Illegal Activity.
      S23: Immoral/Unethical.
      <END UNSAFE CONTENT CATEGORIES>

      <BEGIN CONVERSATION>

      user: {{ user_input }}

      response: agent: {{ bot_response }}

      <END CONVERSATION>

      Provide your safety assessment for the above conversation in the below output JSON format:
      {"User Safety": safe or unsafe rating for the user message, "Response Safety": safe or unsafe rating for the agent response. Omit if no agent response present. "Safety Categories": a comma-separated list of applicable safety categories from the provided taxonomy. Omit if all safe.}

      Do not include anything other than the output JSON in your response.
      Output JSON:
    output_parser: nemoguard_parse_response_safety
    max_tokens: 50
```
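The prompt instructs the model to emit only a JSON assessment object. In the library, the `nemoguard_parse_prompt_safety` / `nemoguard_parse_response_safety` parsers consume that output; the sketch below is a simplified, hypothetical stand-in showing how such a response could be turned into the `allowed` / `policy_violations` fields the flows read (assuming the model returns well-formed, quoted JSON, which real output may not always be).

```python
import json


def parse_assessment(raw: str) -> dict:
    """Parse a safety-assessment JSON string and derive an 'allowed' flag."""
    data = json.loads(raw)
    allowed = (
        data.get("User Safety") == "safe"
        and data.get("Response Safety", "safe") == "safe"
    )
    # "Safety Categories" is a comma-separated string; it is omitted when safe.
    categories = [c.strip() for c in data.get("Safety Categories", "").split(",") if c.strip()]
    return {"allowed": allowed, "policy_violations": categories}


sample = '{"User Safety": "unsafe", "Safety Categories": "S8, S10"}'
print(parse_assessment(sample))
# {'allowed': False, 'policy_violations': ['S8', 'S10']}
```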

nemoguardrails/library/content_safety/actions.py

Lines changed: 77 additions & 1 deletion

```diff
@@ -14,7 +14,7 @@
 # limitations under the License.

 import logging
-from typing import Dict, Optional
+from typing import Dict, FrozenSet, Optional

 from langchain_core.language_models import BaseLLM

@@ -220,3 +220,79 @@ async def content_safety_check_output(
     log.debug(f"Content safety output result cached for model '{model_name}'")

     return final_result
+
+
+SUPPORTED_LANGUAGES: FrozenSet[str] = frozenset({"en", "es", "zh", "de", "fr", "hi", "ja", "ar", "th"})
+
+DEFAULT_REFUSAL_MESSAGES: Dict[str, str] = {
+    "en": "I'm sorry, I can't respond to that.",
+    "es": "Lo siento, no puedo responder a eso.",
+    "zh": "抱歉,我无法回应。",
+    "de": "Es tut mir leid, darauf kann ich nicht antworten.",
+    "fr": "Je suis désolé, je ne peux pas répondre à cela.",
+    "hi": "मुझे खेद है, मैं इसका जवाब नहीं दे सकता।",
+    "ja": "申し訳ありませんが、それには回答できません。",
+    "ar": "عذراً، لا أستطيع الرد على ذلك.",
+    "th": "ขออภัย ฉันไม่สามารถตอบได้",
+}
+
+
+def _detect_language(text: str) -> Optional[str]:
+    try:
+        from fast_langdetect import detect
+
+        result = detect(text, k=1)
+        if result and len(result) > 0:
+            return result[0].get("lang")
+        return None
+    except ImportError:
+        log.warning("fast-langdetect not installed, skipping")
+        return None
+    except Exception as e:
+        log.warning(f"fast-langdetect detection failed: {e}")
+        return None
+
+
+def _get_refusal_message(lang: str, custom_messages: Optional[Dict[str, str]]) -> str:
+    if custom_messages and lang in custom_messages:
+        return custom_messages[lang]
+    if lang in DEFAULT_REFUSAL_MESSAGES:
+        return DEFAULT_REFUSAL_MESSAGES[lang]
+    if custom_messages and "en" in custom_messages:
+        return custom_messages["en"]
+    return DEFAULT_REFUSAL_MESSAGES["en"]
+
+
+@action()
+async def detect_language(
+    context: Optional[dict] = None,
+    config: Optional[dict] = None,
+) -> dict:
+    user_message = ""
+    if context is not None:
+        user_message = context.get("user_message", "")
+
+    custom_messages = None
+    if config is not None:
+        multilingual_config = (
+            config.rails.config.content_safety.multilingual
+            if hasattr(config, "rails")
+            and hasattr(config.rails, "config")
+            and hasattr(config.rails.config, "content_safety")
+            and hasattr(config.rails.config.content_safety, "multilingual")
+            else None
+        )
+        if multilingual_config:
+            custom_messages = multilingual_config.refusal_messages
+
+    lang = _detect_language(user_message) or "en"
+
+    if lang not in SUPPORTED_LANGUAGES:
+        lang = "en"
+
+    refusal_message = _get_refusal_message(lang, custom_messages)
+
+    return {
+        "language": lang,
+        "refusal_message": refusal_message,
+    }
```
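One detail worth noting in `detect_language` is the normalization step: the detector may return `None` (library missing or detection failed) or a language code outside the supported set, and both cases are coerced to English. A standalone sketch of just that step (`normalize_lang` and `SUPPORTED` are illustrative names, not the library's):

```python
# Sketch of the language-normalization step: whatever the detector returns
# is coerced into the supported set, defaulting to English.
SUPPORTED = {"en", "es", "zh", "de", "fr", "hi", "ja", "ar", "th"}


def normalize_lang(detected):
    # detected may be None (detector unavailable or failed) or unsupported
    lang = detected or "en"
    return lang if lang in SUPPORTED else "en"


print(normalize_lang(None))   # detection failed -> "en"
print(normalize_lang("ko"))   # Korean is not in the supported set -> "en"
print(normalize_lang("ja"))   # supported -> "ja"
```

This keeps the action total: it always returns a usable refusal message even when `fast-langdetect` is not installed.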

nemoguardrails/library/content_safety/flows.co

Lines changed: 12 additions & 3 deletions

```diff
@@ -3,15 +3,19 @@ flow content safety check input $model

   global $allowed
   $allowed = $response["allowed"]
-  # Policy violations are currently unused, but can be used to better phrase the bot output
   global $policy_violations
   $policy_violations = $response["policy_violations"]

   if not $allowed
     if $system.config.enable_rails_exceptions
       send ContentSafetyCheckInputException(message="Input not allowed. The input was blocked by the 'content safety check input $model='{$model}'' flow.")
     else
-      bot refuse to respond
+      if $system.config.rails.config.content_safety.multilingual.enabled
+        $lang_result = await DetectLanguageAction()
+        $refusal_message = $lang_result["refusal_message"]
+        bot $refusal_message
+      else
+        bot refuse to respond
     abort

 flow content safety check output $model
@@ -25,5 +29,10 @@ flow content safety check output $model
     if $system.config.enable_rails_exceptions
       send ContentSafetyCheckOuputException(message="Output not allowed. The output was blocked by the 'content safety check output $model='{$model}'' flow.")
     else
-      bot refuse to respond
+      if $system.config.rails.config.content_safety.multilingual.enabled
+        $lang_result = await DetectLanguageAction()
+        $refusal_message = $lang_result["refusal_message"]
+        bot $refusal_message
+      else
+        bot refuse to respond
   abort
```
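For readers unfamiliar with Colang, the refusal branch in these flows can be mirrored in Python. Everything below is illustrative (`refusal_response` and its return shapes are hypothetical, not a nemoguardrails API); it only shows the order in which the flow decides what to do when content is blocked.

```python
# Mirrors the blocked-content branch of the flows above: exceptions take
# priority, then the multilingual refusal, then the canned English refusal.
def refusal_response(allowed, raise_exceptions, multilingual_enabled, lang_result=None):
    if allowed:
        return None  # nothing blocked; let the message through
    if raise_exceptions:
        return ("exception", "Input not allowed.")
    if multilingual_enabled and lang_result:
        return ("bot", lang_result["refusal_message"])
    return ("bot", "I'm sorry, I can't respond to that.")


print(refusal_response(False, False, True, {"refusal_message": "抱歉,我无法回应。"}))
# ('bot', '抱歉,我无法回应。')
```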
Lines changed: 12 additions & 5 deletions

```diff
@@ -1,20 +1,22 @@
-# TODO: add support for enable_rails_exceptions
-
 define bot refuse to respond
   "I'm sorry, I can't respond to that."

 define flow content safety check input
   $response = execute content_safety_check_input

   $allowed = $response["allowed"]
-  # Policy violations are currently unused, but can be used to better phrase the bot output
   $policy_violations = $response["policy_violations"]

   if not $allowed
     if $config.enable_rails_exceptions
       create event ContentSafetyCheckInputException(message="Input not allowed. The input was blocked by the 'content safety check input $model='{$model}'' flow.")
     else
-      bot refuse to respond
+      if $config.rails.config.content_safety.multilingual.enabled
+        $lang_result = execute detect_language
+        $refusal_message = $lang_result["refusal_message"]
+        bot $refusal_message
+      else
+        bot refuse to respond
     stop

 define flow content safety check output
@@ -26,5 +28,10 @@ define flow content safety check output
     if $config.enable_rails_exceptions
       create event ContentSafetyCheckOuputException(message="Output not allowed. The output was blocked by the 'content safety check output $model='{$model}'' flow.")
     else
-      bot refuse to respond
+      if $config.rails.config.content_safety.multilingual.enabled
+        $lang_result = execute detect_language
+        $refusal_message = $lang_result["refusal_message"]
+        bot $refusal_message
+      else
+        bot refuse to respond
   stop
```

nemoguardrails/rails/llm/config.py

Lines changed: 31 additions & 0 deletions

```diff
@@ -887,6 +887,32 @@ class AIDefenseRailConfig(BaseModel):
     )


+class MultilingualConfig(BaseModel):
+    """Configuration for multilingual refusal messages."""
+
+    enabled: bool = Field(
+        default=False,
+        description="If True, detect the language of user input and return refusal messages in the same language. "
+        "Supported languages: en (English), es (Spanish), zh (Chinese), de (German), fr (French), "
+        "hi (Hindi), ja (Japanese), ar (Arabic), th (Thai).",
+    )
+    refusal_messages: Optional[Dict[str, str]] = Field(
+        default=None,
+        description="Custom refusal messages per language code. "
+        "If not specified, built-in defaults are used. "
+        "Example: {'en': 'Sorry, I cannot help.', 'es': 'Lo siento, no puedo ayudar.'}",
+    )
+
+
+class ContentSafetyConfig(BaseModel):
+    """Configuration data for content safety rails."""
+
+    multilingual: MultilingualConfig = Field(
+        default_factory=MultilingualConfig,
+        description="Configuration for multilingual refusal messages.",
+    )
+
+
 class RailsConfigData(BaseModel):
     """Configuration data for specific rails that are supported out-of-the-box."""

@@ -955,6 +981,11 @@ class RailsConfigData(BaseModel):
         description="Configuration for Cisco AI Defense.",
     )

+    content_safety: Optional[ContentSafetyConfig] = Field(
+        default_factory=ContentSafetyConfig,
+        description="Configuration for content safety rails.",
+    )
+

 class Rails(BaseModel):
     """Configuration of specific rails."""
```
