From 80cf80b545f30c3d059f55e33ef21e5ca4010bb8 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 5 Dec 2025 16:21:37 +0000 Subject: [PATCH 01/29] [Refactor] Separate out `RendererConfig` from `ModelConfig` Signed-off-by: DarkLight1337 --- docs/contributing/model/transcription.md | 10 +- .../entrypoints/openai/test_chat_template.py | 22 +- tests/entrypoints/openai/test_vision.py | 2 +- tests/entrypoints/test_chat_utils.py | 194 +++++++----------- .../test_model_load_with_params.py | 22 +- .../multimodal/processing/test_common.py | 22 +- .../multimodal/processing/test_glm4_1v.py | 4 +- .../multimodal/processing/test_h2ovl.py | 2 +- .../multimodal/processing/test_idefics3.py | 2 +- .../multimodal/processing/test_internvl.py | 2 +- .../multimodal/processing/test_llama4.py | 2 +- .../multimodal/processing/test_llava_next.py | 6 +- .../processing/test_llava_onevision.py | 6 +- .../processing/test_minimax_vl_01.py | 4 +- .../multimodal/processing/test_mllama4.py | 2 +- .../multimodal/processing/test_nemotron_vl.py | 2 +- .../multimodal/processing/test_phi3v.py | 2 +- .../multimodal/processing/test_phi4mm.py | 2 +- .../multimodal/processing/test_qwen2_vl.py | 2 +- .../multimodal/processing/test_smolvlm.py | 2 +- .../processing/test_tensor_schema.py | 18 +- .../processing/test_transformers.py | 5 +- tests/models/multimodal/test_mapping.py | 33 +-- tests/models/registry.py | 31 ++- tests/models/utils.py | 18 +- tests/multimodal/test_processing.py | 24 ++- tests/test_config.py | 131 +++++++----- tests/test_inputs.py | 7 +- .../test_reasoning_structured_output.py | 35 ++-- vllm/config/__init__.py | 3 + vllm/config/model.py | 131 +++--------- vllm/config/renderer.py | 89 ++++++++ vllm/config/speculative.py | 5 - vllm/config/vllm.py | 21 +- vllm/engine/arg_utils.py | 85 +++++--- vllm/entrypoints/chat_utils.py | 72 ++++--- vllm/entrypoints/llm.py | 11 +- vllm/entrypoints/openai/serving_completion.py | 2 +- vllm/entrypoints/openai/serving_engine.py | 11 +- vllm/entrypoints/openai/serving_models.py | 1 + vllm/entrypoints/openai/speech_to_text.py | 4 +- vllm/entrypoints/pooling/pooling/serving.py | 2 +- vllm/inputs/preprocess.py | 9 +- vllm/model_executor/models/adapters.py | 20 +- vllm/model_executor/models/deepseek_ocr.py | 4 +- vllm/model_executor/models/deepseek_vl2.py | 4 +- vllm/model_executor/models/granite_speech.py | 14 +- vllm/model_executor/models/gritlm.py | 14 +- vllm/model_executor/models/interfaces.py | 8 +- vllm/model_executor/models/interns1.py | 2 +- .../model_executor/models/nano_nemotron_vl.py | 13 +- vllm/model_executor/models/nemotron_vl.py | 2 +- vllm/model_executor/models/pixtral.py | 2 +- vllm/model_executor/models/voxtral.py | 22 +- vllm/model_executor/models/whisper.py | 14 +- vllm/multimodal/processing.py | 28 ++- vllm/multimodal/registry.py | 53 +++-- vllm/tokenizers/registry.py | 24 +-- vllm/transformers_utils/processor.py | 28 ++- vllm/v1/core/encoder_cache_manager.py | 8 +- vllm/v1/core/sched/scheduler.py | 2 +- vllm/v1/engine/async_llm.py | 7 +- vllm/v1/engine/input_processor.py | 5 +- vllm/v1/engine/llm_engine.py | 7 +- vllm/v1/structured_output/__init__.py | 18 +- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/v1/worker/tpu_model_runner.py | 2 +- vllm/v1/worker/utils.py | 9 +- 68 files changed, 697 insertions(+), 675 deletions(-) create mode 100644 vllm/config/renderer.py diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md index fca941acd507..baaa790d611e 100644 --- a/docs/contributing/model/transcription.md +++ 
b/docs/contributing/model/transcription.md @@ -22,7 +22,7 @@ Declare supported languages and capabilities: import torch from torch import nn - from vllm.config import ModelConfig, SpeechToTextConfig + from vllm.config import RendererConfig, SpeechToTextConfig from vllm.inputs.data import PromptType from vllm.model_executor.models.interfaces import SupportsTranscription @@ -52,7 +52,7 @@ This is for controlling general behavior of the API when serving your model: @classmethod def get_speech_to_text_config( cls, - model_config: ModelConfig, + renderer_config: RendererConfig, task_type: Literal["transcribe", "translate"], ) -> SpeechToTextConfig: return SpeechToTextConfig( @@ -83,7 +83,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt cls, audio: np.ndarray, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, @@ -120,7 +120,7 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: cls, audio: np.ndarray, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, @@ -183,7 +183,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics: cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> int | None: # Return None if unknown; otherwise return an estimate. return int(audio_duration_s * stt_config.sample_rate // 320) # example diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py index 77087ac21ea8..ba7cb9328155 100644 --- a/tests/entrypoints/openai/test_chat_template.py +++ b/tests/entrypoints/openai/test_chat_template.py @@ -3,7 +3,6 @@ import pytest -from vllm.config import ModelConfig from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.tokenizers import get_tokenizer @@ -107,24 +106,11 @@ def test_get_gen_prompt( model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - model_config = ModelConfig( - model, - tokenizer=model_info.tokenizer or model, - tokenizer_mode=model_info.tokenizer_mode, - trust_remote_code=model_info.trust_remote_code, - revision=model_info.revision, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, - ) + renderer_config = model_info.build_renderer_config() - # Initialize the tokenizer tokenizer = get_tokenizer( - tokenizer_name=model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + trust_remote_code=renderer_config.trust_remote_code, ) template_content = load_chat_template(chat_template=template) @@ -143,7 +129,7 @@ def test_get_gen_prompt( tokenizer=tokenizer, conversation=mock_request.messages, chat_template=mock_request.chat_template or template_content, - model_config=model_config, + renderer_config=renderer_config, tools=None, add_generation_prompt=mock_request.add_generation_prompt, continue_final_message=mock_request.continue_final_message, diff --git 
a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index ae8860ee877b..869f6eeb4bfa 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -118,7 +118,7 @@ def get_hf_prompt_tokens(model_name, content, image_url): image = image.media images = [image] - prompt = processor.tokenizer.apply_chat_template( + prompt = processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) inputs = processor(prompt, images, return_tensors="pt") diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 75be34820bcd..5f5580fa181b 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -11,7 +11,7 @@ from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.entrypoints.chat_utils import ( _try_extract_ast, apply_mistral_chat_template, @@ -232,7 +232,7 @@ def test_parse_chat_messages_single_image( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -264,7 +264,7 @@ def test_parse_chat_messages_single_image_with_uuid( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -294,7 +294,7 @@ def test_parse_chat_messages_single_empty_image_with_uuid( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -327,7 +327,7 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -368,7 +368,7 @@ def test_parse_chat_messages_multiple_images_with_uuids( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -408,7 +408,7 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -450,7 +450,7 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -484,7 +484,7 @@ async def test_parse_chat_messages_single_image_with_uuid_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -515,7 +515,7 @@ async def test_parse_chat_messages_empty_image_with_uuid_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -553,7 +553,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -594,7 +594,7 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -633,7 +633,7 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -659,7 +659,7 @@ def test_parse_chat_messages_empty_system( "content": [{"type": "text", "text": "Who are you?"}], }, ], - mistral_model_config, + 
RendererConfig(model_config=mistral_model_config), content_format="string", ) assert conversation == [ @@ -676,7 +676,7 @@ def test_parse_chat_messages_empty_system( "content": [{"type": "text", "text": "Who are you?"}], }, ], - mistral_model_config, + RendererConfig(model_config=mistral_model_config), content_format="openai", ) assert conversation == [ @@ -700,7 +700,7 @@ async def test_parse_chat_messages_single_image_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -729,7 +729,7 @@ def test_parse_chat_messages_multiple_images( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -757,7 +757,7 @@ def test_parse_chat_messages_empty_pil_image_with_uuid( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -785,7 +785,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid( ], } ], - phi3v_model_config_image_embeds, + RendererConfig(model_config=phi3v_model_config_image_embeds), content_format="string", ) @@ -817,7 +817,7 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid( ], } ], - audio_embeds_model_config, + RendererConfig(model_config=audio_embeds_model_config), content_format="string", ) @@ -857,7 +857,7 @@ def test_parse_chat_messages_audio_embeds_with_string( ], } ], - audio_embeds_model_config, + RendererConfig(model_config=audio_embeds_model_config), content_format="string", ) @@ -899,7 +899,7 @@ async def test_parse_chat_messages_audio_embeds_async( ], } ], - audio_embeds_model_config, + RendererConfig(model_config=audio_embeds_model_config), content_format="string", ) @@ -930,7 +930,7 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async( ], } ], - phi3v_model_config_image_embeds, + RendererConfig(model_config=phi3v_model_config_image_embeds), content_format="string", ) @@ -966,7 +966,7 @@ async def test_parse_chat_messages_multiple_images_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -998,7 +998,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) assert conversation == [ @@ -1030,7 +1030,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1067,7 +1067,7 @@ def test_parse_chat_messages_multiple_images_across_messages( ], }, ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1111,7 +1111,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages( ], }, ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1136,7 +1136,7 @@ def test_parse_chat_messages_context_text_format( {"role": "assistant", "content": "Some stuff."}, {"role": "user", "content": "What about this one?"}, ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="openai", ) @@ -1189,7 +1189,7 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1232,7 +1232,7 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages( ], }, ], - phi3v_model_config, + 
RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1252,7 +1252,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1286,7 +1286,7 @@ def test_parse_chat_messages_multiple_images_interleave( ], } ], - phi3v_model_config_mm_interleaved, + RendererConfig(model_config=phi3v_model_config_mm_interleaved), content_format="string", ) @@ -1322,7 +1322,7 @@ async def test_parse_chat_messages_multiple_images_interleave_async( ], } ], - phi3v_model_config_mm_interleaved, + RendererConfig(model_config=phi3v_model_config_mm_interleaved), content_format="string", ) @@ -1367,7 +1367,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async( ], } ], - phi3v_model_config_mm_interleaved, + RendererConfig(model_config=phi3v_model_config_mm_interleaved), content_format="string", ) @@ -1405,7 +1405,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave( ], }, ], - phi3v_model_config_mm_interleaved, + RendererConfig(model_config=phi3v_model_config_mm_interleaved), content_format="string", ) @@ -1453,7 +1453,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl ], }, ], - phi3v_model_config_mm_interleaved, + RendererConfig(model_config=phi3v_model_config_mm_interleaved), content_format="string", ) @@ -1497,7 +1497,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( ], }, ], - qwen25omni_model_config_mm_interleaved, + RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), content_format="string", ) @@ -1565,7 +1565,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl ], }, ], - qwen25omni_model_config_mm_interleaved, + RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), content_format="string", ) @@ -1635,7 +1635,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes ], }, ], - qwen25omni_model_config_mm_interleaved, + RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), content_format="string", ) @@ -1701,7 +1701,7 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message ], }, ], - qwen25omni_model_config_mm_interleaved, + RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), content_format="string", ) @@ -1749,7 +1749,7 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders( ], } ], - phi3v_model_config_mm_interleaved, + RendererConfig(model_config=phi3v_model_config_mm_interleaved), content_format="string", ) @@ -1767,24 +1767,11 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - model_config = ModelConfig( - model, - tokenizer=model_info.tokenizer or model, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, - ) - - # Build the tokenizer + renderer_config = model_info.build_renderer_config() + tokenizer = get_tokenizer( - model, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + 
trust_remote_code=renderer_config.trust_remote_code, ) tools = ( @@ -1807,7 +1794,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): tokenizer, chat_template=None, tools=tools, - model_config=model_config, + renderer_config=renderer_config, ) assert isinstance(chat_template, str) @@ -1869,24 +1856,11 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa "enable_thinking": True, } - model_config = ModelConfig( - model, - tokenizer=model_info.tokenizer or model, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, - ) - - # Build the tokenizer + renderer_config = model_info.build_renderer_config() + tokenizer = get_tokenizer( - model, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + trust_remote_code=renderer_config.trust_remote_code, ) # Test detecting the tokenizer's chat_template @@ -1894,7 +1868,7 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa tokenizer, chat_template=None, tools=tools, - model_config=model_config, + renderer_config=renderer_config, ) with pytest.raises( ValueError, match="Found unexpected chat template kwargs from request" @@ -1965,23 +1939,11 @@ def test_resolve_content_format_hf_defined(model, expected_format): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - model_config = ModelConfig( - model, - tokenizer=model_info.tokenizer or model, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, - ) + renderer_config = model_info.build_renderer_config() tokenizer = get_tokenizer( - model, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + trust_remote_code=renderer_config.trust_remote_code, ) # Test detecting the tokenizer's chat_template @@ -1989,7 +1951,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): tokenizer, chat_template=None, tools=None, - model_config=model_config, + renderer_config=renderer_config, ) assert isinstance(chat_template, str) @@ -2003,7 +1965,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): None, "auto", tokenizer, - model_config=model_config, + renderer_config=renderer_config, ) assert resolved_format == expected_format @@ -2025,23 +1987,11 @@ def test_resolve_content_format_fallbacks(model, expected_format): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - model_config = ModelConfig( - model, - tokenizer=model_info.tokenizer or model, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - 
enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, - ) + renderer_config = model_info.build_renderer_config() tokenizer = get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + trust_remote_code=renderer_config.trust_remote_code, ) # Test detecting the tokenizer's chat_template @@ -2049,7 +1999,7 @@ def test_resolve_content_format_fallbacks(model, expected_format): tokenizer, chat_template=None, tools=None, - model_config=model_config, + renderer_config=renderer_config, ) assert isinstance(chat_template, str) @@ -2063,7 +2013,7 @@ def test_resolve_content_format_fallbacks(model, expected_format): None, "auto", tokenizer, - model_config=model_config, + renderer_config=renderer_config, ) assert resolved_format == expected_format @@ -2094,15 +2044,13 @@ def test_resolve_content_format_fallbacks(model, expected_format): ], ) def test_resolve_content_format_examples(template_path, expected_format): - model_config = ModelConfig( - PHI3V_MODEL_ID, # Dummy - tokenizer=PHI3V_MODEL_ID, # Dummy - trust_remote_code=True, - ) + model = PHI3V_MODEL_ID # Dummy + model_config = ModelConfig(model, trust_remote_code=True) + renderer_config = RendererConfig(model_config=model_config, tokenizer=model) dummy_tokenizer = get_tokenizer( - PHI3V_MODEL_ID, # Dummy - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + trust_remote_code=renderer_config.trust_remote_code, ) dummy_tokenizer.chat_template = None @@ -2119,7 +2067,7 @@ def test_resolve_content_format_examples(template_path, expected_format): None, "auto", dummy_tokenizer, - model_config=model_config, + renderer_config=renderer_config, ) assert resolved_format == expected_format @@ -2154,7 +2102,7 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config): conversation_with_thinking, _, _ = parse_chat_messages( messages, - mistral_model_config, + RendererConfig(model_config=mistral_model_config), content_format="openai", ) @@ -2254,7 +2202,7 @@ def test_parse_chat_messages_single_empty_audio_with_uuid( ], } ], - qwen2_audio_model_config, + RendererConfig(model_config=qwen2_audio_model_config), content_format="string", ) @@ -2288,7 +2236,7 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async( ], } ], - qwen2_audio_model_config, + RendererConfig(model_config=qwen2_audio_model_config), content_format="string", ) diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 489ac1e6475b..e368671078fd 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -42,8 +42,10 @@ def test_model_loading_with_params(vllm_runner, monkeypatch): "Write a short story about a robot that dreams for the first time.\n" ) - model_config = vllm_model.llm.llm_engine.model_config - model_tokenizer = vllm_model.llm.llm_engine.tokenizer + llm_engine = vllm_model.llm.llm_engine + model_config = llm_engine.model_config + renderer_config = llm_engine.renderer_config + tokenizer = llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -54,8 +56,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch): assert model_config.pooler_config.normalize # asserts on the tokenizer loaded - assert model_config.tokenizer == "BAAI/bge-base-en-v1.5" - assert model_tokenizer.model_max_length == 512 + assert renderer_config.tokenizer == 
"BAAI/bge-base-en-v1.5" + assert tokenizer.model_max_length == 512 def check_model(model): assert isinstance(model, BertEmbeddingModel) @@ -86,8 +88,10 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch): "Write a short story about a robot that dreams for the first time.\n" ) - model_config = vllm_model.llm.llm_engine.model_config - model_tokenizer = vllm_model.llm.llm_engine.tokenizer + llm_engine = vllm_model.llm.llm_engine + model_config = llm_engine.model_config + renderer_config = llm_engine.renderer_config + tokenizer = llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -98,8 +102,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch): assert model_config.pooler_config.normalize # asserts on the tokenizer loaded - assert model_config.tokenizer == "intfloat/multilingual-e5-base" - assert model_tokenizer.model_max_length == 512 + assert renderer_config.tokenizer == "intfloat/multilingual-e5-base" + assert tokenizer.model_max_length == 512 def check_model(model): assert isinstance(model, RobertaEmbeddingModel) @@ -128,7 +132,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch): "Write a short story about a robot that dreams for the first time.\n" ) - assert vllm_model.llm.llm_engine.model_config.tokenizer == model_name + assert vllm_model.llm.llm_engine.renderer_config.tokenizer == model_name def check_model(model): assert isinstance(model, RobertaEmbeddingModel) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 2e032ac4ca52..9b2b29b75876 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -25,7 +25,6 @@ from vllm.tokenizers import ( MistralTokenizer, TokenizerLike, - cached_tokenizer_from_config, ) from ....multimodal.utils import random_audio, random_image, random_video @@ -212,31 +211,20 @@ def _test_processing_correctness( else: model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch) model_id = model_id_or_arch + model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") - model_config = ModelConfig( - model_id, - tokenizer=model_info.tokenizer or model_id, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, - hf_overrides=model_info.hf_overrides, + renderer_config = model_info.build_renderer_config( + model=model_id, # Ensure that the cache can fit all of the data mm_processor_cache_gb=2048, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, ) + model_config = renderer_config.model_config model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) factories = model_cls._processor_factory - ctx = InputProcessingContext( - model_config, - tokenizer=cached_tokenizer_from_config(model_config), - ) + ctx = InputProcessingContext.from_config(renderer_config) cache = MultiModalProcessorOnlyCache(model_config) processing_info = factories.info(ctx) diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py index 51071c93531d..fdc6352e2ec8 100644 --- a/tests/models/multimodal/processing/test_glm4_1v.py +++ 
b/tests/models/multimodal/processing/test_glm4_1v.py @@ -40,7 +40,7 @@ def test_processor_override( mm_processor_kwargs=None, limit_mm_per_prompt={"video": 1}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) tokenizer = processor.info.get_tokenizer() hf_processor_mm_kwargs = {"fps": fps} @@ -79,7 +79,7 @@ def test_video_loader_consistency( mm_processor_kwargs=None, limit_mm_per_prompt={"video": 1}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {"fps": fps} # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py index 1701d9dd8f01..1263d663e6af 100644 --- a/tests/models/multimodal/processing/test_h2ovl.py +++ b/tests/models/multimodal/processing/test_h2ovl.py @@ -162,7 +162,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": len(size_factors)}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs min_num = min_dynamic_patch if dynamic_image_size else 1 diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py index 351b9d018eec..bf12e79a718b 100644 --- a/tests/models/multimodal/processing/test_idefics3.py +++ b/tests/models/multimodal/processing/test_idefics3.py @@ -38,7 +38,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py index b4994295d3a8..51f0d2e891b3 100644 --- a/tests/models/multimodal/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -116,7 +116,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": len(size_factors)}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs min_num = min_dynamic_patch if dynamic_image_size else 1 diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py index b73246b68b36..04bc8d3f5381 100644 --- a/tests/models/multimodal/processing/test_llama4.py +++ b/tests/models/multimodal/processing/test_llama4.py @@ -30,7 +30,7 @@ def test_processor_override( limit_mm_per_prompt={"image": num_imgs}, mm_processor_cache_gb=mm_processor_cache_gb, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) config = processor.info.get_hf_config() tokenizer = processor.info.get_tokenizer() hf_processor = processor.info.get_hf_processor() 
diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py index ffe7ca17b5d6..cd01002a32af 100644 --- a/tests/models/multimodal/processing/test_llava_next.py +++ b/tests/models/multimodal/processing/test_llava_next.py @@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id): mm_processor_kwargs=None, limit_mm_per_prompt={"image": 1}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) info = processor.info seen_aspect_ratios = set[float]() @@ -140,7 +140,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) image_ratios = [ (171, 152), @@ -173,7 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) seen_aspect_ratios = set[float]() image_sizes = list[ImageSize]() diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py index f5c552fe6476..be505d95a500 100644 --- a/tests/models/multimodal/processing/test_llava_onevision.py +++ b/tests/models/multimodal/processing/test_llava_onevision.py @@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id): mm_processor_kwargs=None, limit_mm_per_prompt={"image": 1}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) info = processor.info seen_aspect_ratios = set[float]() @@ -138,7 +138,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) image_ratios = [ (171, 152), @@ -171,7 +171,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) seen_aspect_ratios = set[float]() image_sizes = list[ImageSize]() diff --git a/tests/models/multimodal/processing/test_minimax_vl_01.py b/tests/models/multimodal/processing/test_minimax_vl_01.py index 11e000123511..17ac54fdd0a4 100644 --- a/tests/models/multimodal/processing/test_minimax_vl_01.py +++ b/tests/models/multimodal/processing/test_minimax_vl_01.py @@ -24,7 +24,7 @@ def test_processor_override( mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) prompt = "" * num_imgs image = Image.new("RGB", size=(364, 364)) mm_data = {"image": [image] * num_imgs} @@ -83,7 +83,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = 
MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) image_ratios = [ (171, 152), diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py index e5ff2d1391b6..9a65e2ddc85c 100644 --- a/tests/models/multimodal/processing/test_mllama4.py +++ b/tests/models/multimodal/processing/test_mllama4.py @@ -25,7 +25,7 @@ def test_profiling(model_id: str, max_model_len: int): limit_mm_per_prompt=mm_counts, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) profiler = MultiModalProfiler(processor) decoder_dummy_data = profiler.get_decoder_dummy_data( diff --git a/tests/models/multimodal/processing/test_nemotron_vl.py b/tests/models/multimodal/processing/test_nemotron_vl.py index 5311ab1b78c6..f3609743b7c8 100644 --- a/tests/models/multimodal/processing/test_nemotron_vl.py +++ b/tests/models/multimodal/processing/test_nemotron_vl.py @@ -118,7 +118,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": len(size_factors)}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs min_num = min_dynamic_patch if dynamic_image_size else 1 diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py index 8faff2611e6f..f51bd9786178 100644 --- a/tests/models/multimodal/processing/test_phi3v.py +++ b/tests/models/multimodal/processing/test_phi3v.py @@ -39,7 +39,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_phi4mm.py b/tests/models/multimodal/processing/test_phi4mm.py index 5391555c2667..271357b0d150 100644 --- a/tests/models/multimodal/processing/test_phi4mm.py +++ b/tests/models/multimodal/processing/test_phi4mm.py @@ -39,7 +39,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py index 9f4cdb6789b2..d65a270a7da3 100644 --- a/tests/models/multimodal/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -34,7 +34,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) tokenizer = processor.info.get_tokenizer() hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs diff --git 
a/tests/models/multimodal/processing/test_smolvlm.py b/tests/models/multimodal/processing/test_smolvlm.py index 6f77d5516d14..e0e6264de4e3 100644 --- a/tests/models/multimodal/processing/test_smolvlm.py +++ b/tests/models/multimodal/processing/test_smolvlm.py @@ -38,7 +38,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 5d489549c5b4..7eda0e6bdb85 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -31,7 +31,6 @@ from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.platforms import current_platform -from vllm.tokenizers import cached_tokenizer_from_config from vllm.utils.collection_utils import is_list_of from vllm.utils.torch_utils import set_default_torch_dtype @@ -182,19 +181,11 @@ def test_model_tensor_schema(model_id: str): else: dtype = model_info.dtype - model_config = ModelConfig( - model_id, - tokenizer=model_info.tokenizer or model_id, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, + renderer_config = model_info.build_renderer_config( hf_overrides=hf_overrides_fn, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, dtype=dtype, ) + model_config = renderer_config.model_config model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) assert supports_multimodal(model_cls) @@ -212,10 +203,7 @@ def test_model_tensor_schema(model_id: str): if not any(inputs_parse_methods): pytest.skip(f"{model_arch} does not support tensor schema validation.") - ctx = InputProcessingContext( - model_config, - tokenizer=cached_tokenizer_from_config(model_config), - ) + ctx = InputProcessingContext.from_config(renderer_config) processing_info = factories.info(ctx) supported_mm_limits = processing_info.get_supported_mm_limits() limit_mm_per_prompt = { diff --git a/tests/models/multimodal/processing/test_transformers.py b/tests/models/multimodal/processing/test_transformers.py index e2a2186f470b..c9a90eb882da 100644 --- a/tests/models/multimodal/processing/test_transformers.py +++ b/tests/models/multimodal/processing/test_transformers.py @@ -3,7 +3,7 @@ import pytest from vllm.assets.image import ImageAsset -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.multimodal import MULTIMODAL_REGISTRY @@ -13,8 +13,9 @@ def test_multimodal_processor(model_id): model=model_id, model_impl="transformers", ) + renderer_config = RendererConfig(model_config=model_config) - mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config) + mm_processor = MULTIMODAL_REGISTRY.create_processor(renderer_config) image_pil = ImageAsset("cherry_blossom").pil_image mm_data = {"image": image_pil} diff --git a/tests/models/multimodal/test_mapping.py 
b/tests/models/multimodal/test_mapping.py index 0d2eaca95504..73de6b5f7d2f 100644 --- a/tests/models/multimodal/test_mapping.py +++ b/tests/models/multimodal/test_mapping.py @@ -7,7 +7,6 @@ import transformers from transformers import AutoConfig, PreTrainedModel -from vllm.config import ModelConfig from vllm.model_executor.models.utils import WeightsMapper from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.transformers_utils.config import try_get_safetensors_metadata @@ -50,37 +49,11 @@ def test_hf_model_weights_mapper(model_arch: str): model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") - is_mistral_model = model_arch in [ - "Mistral3ForConditionalGeneration", - "PixtralForConditionalGeneration", - "VoxtralForConditionalGeneration", - ] - - if not is_mistral_model or model_info.tokenizer_mode == "mistral": - tokenizer_mode = model_info.tokenizer_mode - else: - tokenizer_mode = "hf" - - model_id = model_info.default - - model_config = ModelConfig( - model_id, - tokenizer=model_info.tokenizer or model_id, - tokenizer_mode=tokenizer_mode, - config_format="hf", - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, - ) + model_config = model_info.build_model_config(config_format="hf") model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) - original_weights = create_repo_dummy_weights(model_id) - hf_dummy_model = create_dummy_model(model_id, model_arch) + original_weights = create_repo_dummy_weights(model_config.model) + hf_dummy_model = create_dummy_model(model_config.model, model_arch) hf_converted_weights = hf_dummy_model.named_parameters() hf_converted_buffers = hf_dummy_model.named_buffers() mapper: WeightsMapper = model_cls.hf_to_vllm_mapper diff --git a/tests/models/registry.py b/tests/models/registry.py index 352abdd2da9a..7349d4a3f35d 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -9,7 +9,8 @@ from packaging.version import Version from transformers import __version__ as TRANSFORMERS_VERSION -from vllm.config.model import ModelDType, TokenizerMode +from vllm.config.model import ModelConfig, ModelDType +from vllm.config.renderer import RendererConfig, TokenizerMode @dataclass(frozen=True) @@ -170,6 +171,34 @@ def check_available_online( else: pytest.skip(msg) + def build_model_config(self, model: str | None = None, **kwargs) -> ModelConfig: + if model is None: + model = self.default + + return ModelConfig( + model=model, + revision=self.revision, + trust_remote_code=self.trust_remote_code, + hf_overrides=self.hf_overrides, + enable_prompt_embeds=self.require_embed_inputs, + enable_mm_embeds=self.require_embed_inputs, + enforce_eager=self.enforce_eager, + dtype=self.dtype, + **kwargs, + ) + + def build_renderer_config( + self, model: str | None = None, **kwargs + ) -> RendererConfig: + model_config = self.build_model_config(model, **kwargs) + + return RendererConfig( + model_config=model_config, + tokenizer=self.tokenizer or model_config.model, + tokenizer_mode=self.tokenizer_mode, + skip_tokenizer_init=self.require_embed_inputs, + ) + _TEXT_GENERATION_EXAMPLE_MODELS = { # [Decoder-only] diff --git a/tests/models/utils.py b/tests/models/utils.py index d84b4b820533..d92188f40eea 100644 --- 
a/tests/models/utils.py +++ b/tests/models/utils.py @@ -13,7 +13,6 @@ from vllm.config.model import ModelConfig, ModelDType, RunnerOption from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs from vllm.multimodal.processing import InputProcessingContext -from vllm.tokenizers import cached_tokenizer_from_config from .. import ci_envs from .registry import HF_EXAMPLE_MODELS @@ -296,30 +295,17 @@ def build_model_context( model_config_kwargs = model_config_kwargs or {} limit_mm_per_prompt = limit_mm_per_prompt or {} - model_config = ModelConfig( - model_id, + renderer_config = model_info.build_renderer_config( runner=runner, - tokenizer=model_info.tokenizer or model_id, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, dtype=dtype, seed=0, mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt=limit_mm_per_prompt, mm_processor_cache_gb=mm_processor_cache_gb, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, **model_config_kwargs, ) - return InputProcessingContext( - model_config, - tokenizer=cached_tokenizer_from_config(model_config), - ) + return InputProcessingContext.from_config(renderer_config) def check_embeddings_close( diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 262ea42e4d0f..adff572524a9 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -7,7 +7,7 @@ import numpy as np import pytest -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.processing import ( InputProcessingContext, @@ -920,8 +920,9 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): model=model_id, limit_mm_per_prompt=limit_mm_per_prompt, ) + renderer_config = RendererConfig(model_config=model_config) - processor = MULTIMODAL_REGISTRY.create_processor(model_config) + processor = MULTIMODAL_REGISTRY.create_processor(renderer_config) processor._supported_mm_limits = {"image": num_supported} profiler = MultiModalProfiler(processor) @@ -955,8 +956,9 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): model=model_id, limit_mm_per_prompt=limit_mm_per_prompt, ) + renderer_config = RendererConfig(model_config=model_config) - processor = MULTIMODAL_REGISTRY.create_processor(model_config) + processor = MULTIMODAL_REGISTRY.create_processor(renderer_config) rng = np.random.RandomState(0) image = random_image(rng, min_wh=128, max_wh=256) @@ -1012,11 +1014,13 @@ def test_hf_processor_init_kwargs( inference_kwargs, expected_kwargs, ): - ctx = InputProcessingContext( - model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs), - tokenizer=None, + model_config = ModelConfig(model_id, mm_processor_kwargs=config_kwargs) + renderer_config = RendererConfig( + model_config=model_config, + tokenizer=model_id, ) + ctx = InputProcessingContext.from_config(renderer_config) processor = ctx.get_hf_processor( DummyProcessor, # type: ignore[arg-type] **inference_kwargs, @@ -1045,11 +1049,13 @@ def test_hf_processor_call_kwargs( inference_kwargs, expected_kwargs, ): - ctx = InputProcessingContext( - model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs), - tokenizer=None, + model_config = 
ModelConfig(model_id, mm_processor_kwargs=config_kwargs) + renderer_config = RendererConfig( + model_config=model_config, + tokenizer=model_id, ) + ctx = InputProcessingContext.from_config(renderer_config) processor = ctx.get_hf_processor(DummyProcessor) # type: ignore[arg-type] result = ctx.call_hf_processor(processor, {}, inference_kwargs) diff --git a/tests/test_config.py b/tests/test_config.py index 203447cd531f..7464fcd1e9fe 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -13,6 +13,7 @@ CompilationConfig, ModelConfig, PoolerConfig, + RendererConfig, SchedulerConfig, VllmConfig, update_config, @@ -476,27 +477,41 @@ def test_load_config_pt_load_map_location(pt_load_map_location): ("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 131073, 131072, True), ], ) -def test_get_and_verify_max_len( +def test_recalculate_max_model_len( model_id, max_model_len, expected_max_len, should_raise ): - """Test get_and_verify_max_len with different configurations.""" + """Test recalculate_max_model_len with different configurations.""" model_config = ModelConfig(model_id) if should_raise: with pytest.raises(ValueError): - model_config.get_and_verify_max_len(max_model_len) + model_config.recalculate_max_model_len( + max_model_len, + tokenizer=model_id, + tokenizer_revision=None, + ) else: - actual_max_len = model_config.get_and_verify_max_len(max_model_len) - assert actual_max_len == expected_max_len + model_config.recalculate_max_model_len( + max_model_len, + tokenizer=model_id, + tokenizer_revision=None, + ) + assert model_config.max_model_len == expected_max_len -class MockConfig: - """Simple mock object for testing maybe_pull_model_tokenizer_for_runai""" +class MockModelConfig: + """Simple mock object for testing maybe_pull_model_for_runai""" - def __init__(self, model: str, tokenizer: str): + def __init__(self, model: str): self.model = model - self.tokenizer = tokenizer - self.model_weights = None + + +class MockRendererConfig: + """Simple mock object for testing maybe_pull_tokenizer_for_runai""" + + def __init__(self, model_config: MockModelConfig): + self.model_config = model_config + self.tokenizer = model_config.model @pytest.mark.parametrize( @@ -514,59 +529,65 @@ def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url): mock_pull_files.return_value = None # Create first mock and run the method - config1 = MockConfig(model=s3_url, tokenizer=s3_url) - ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url, s3_url) + model_config1 = MockModelConfig(model=s3_url) + renderer_config1 = MockRendererConfig(model_config=model_config1) + ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url) + RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url) # Check that model and tokenizer point to existing directories - assert os.path.exists(config1.model), ( - f"Model directory does not exist: {config1.model}" + assert os.path.exists(model_config1.model), ( + f"Model directory does not exist: {model_config1.model}" ) - assert os.path.isdir(config1.model), ( - f"Model path is not a directory: {config1.model}" + assert os.path.isdir(model_config1.model), ( + f"Model path is not a directory: {model_config1.model}" ) - assert os.path.exists(config1.tokenizer), ( - f"Tokenizer directory does not exist: {config1.tokenizer}" + assert os.path.exists(renderer_config1.tokenizer), ( + f"Tokenizer directory does not exist: {renderer_config1.tokenizer}" ) - assert os.path.isdir(config1.tokenizer), ( - f"Tokenizer path is not a directory: {config1.tokenizer}" + assert 
os.path.isdir(renderer_config1.tokenizer), ( + f"Tokenizer path is not a directory: {renderer_config1.tokenizer}" ) # Verify that the paths are different from the original S3 URL - assert config1.model != s3_url, "Model path should be converted to local directory" - assert config1.tokenizer != s3_url, ( + assert model_config1.model != s3_url, ( + "Model path should be converted to local directory" + ) + assert renderer_config1.tokenizer != s3_url, ( "Tokenizer path should be converted to local directory" ) # Store the original paths - created_model_dir = config1.model - create_tokenizer_dir = config1.tokenizer + created_model_dir = model_config1.model + create_tokenizer_dir = renderer_config1.tokenizer # Create a new mock and run the method with the same S3 URL - config2 = MockConfig(model=s3_url, tokenizer=s3_url) - ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url, s3_url) + model_config2 = MockModelConfig(model=s3_url) + renderer_config2 = MockRendererConfig(model_config=model_config2) + ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url) + RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url) # Check that the new directories exist - assert os.path.exists(config2.model), ( - f"Model directory does not exist: {config2.model}" + assert os.path.exists(model_config2.model), ( + f"Model directory does not exist: {model_config2.model}" ) - assert os.path.isdir(config2.model), ( - f"Model path is not a directory: {config2.model}" + assert os.path.isdir(model_config2.model), ( + f"Model path is not a directory: {model_config2.model}" ) - assert os.path.exists(config2.tokenizer), ( - f"Tokenizer directory does not exist: {config2.tokenizer}" + assert os.path.exists(renderer_config2.tokenizer), ( + f"Tokenizer directory does not exist: {renderer_config2.tokenizer}" ) - assert os.path.isdir(config2.tokenizer), ( - f"Tokenizer path is not a directory: {config2.tokenizer}" + assert os.path.isdir(renderer_config2.tokenizer), ( + f"Tokenizer path is not a directory: {renderer_config2.tokenizer}" ) # Verify that the paths are deterministic (same as before) - assert config2.model == created_model_dir, ( + assert model_config2.model == created_model_dir, ( f"Model paths are not deterministic. " - f"Original: {created_model_dir}, New: {config2.model}" + f"Original: {created_model_dir}, New: {model_config2.model}" ) - assert config2.tokenizer == create_tokenizer_dir, ( + assert renderer_config2.tokenizer == create_tokenizer_dir, ( f"Tokenizer paths are not deterministic. 
" - f"Original: {create_tokenizer_dir}, New: {config2.tokenizer}" + f"Original: {create_tokenizer_dir}, New: {renderer_config2.tokenizer}" ) @@ -580,28 +601,36 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files): s3_url2 = "s3://example-bucket-2/model/" # Create mocks with different S3 URLs and run the method - config1 = MockConfig(model=s3_url1, tokenizer=s3_url1) - ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url1, s3_url1) + model_config1 = MockModelConfig(model=s3_url1) + renderer_config1 = MockRendererConfig(model_config=model_config1) + ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url1) + RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url1) - config2 = MockConfig(model=s3_url2, tokenizer=s3_url2) - ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url2, s3_url2) + model_config2 = MockModelConfig(model=s3_url2) + renderer_config2 = MockRendererConfig(model_config=model_config2) + ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url2) + RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url2) # Verify that different URLs produce different directories - assert config1.model != config2.model, ( + assert model_config1.model != model_config2.model, ( f"Different S3 URLs should create different model directories. " - f"URL1 model: {config1.model}, URL2 model: {config2.model}" + f"URL1 model: {model_config1.model}, URL2 model: {model_config2.model}" ) - assert config1.tokenizer != config2.tokenizer, ( + assert renderer_config1.tokenizer != renderer_config2.tokenizer, ( f"Different S3 URLs should create different tokenizer directories. " - f"URL1 tokenizer: {config1.tokenizer}, " - f"URL2 tokenizer: {config2.tokenizer}" + f"URL1 tokenizer: {renderer_config1.tokenizer}, " + f"URL2 tokenizer: {renderer_config2.tokenizer}" ) # Verify that both sets of directories exist - assert os.path.exists(config1.model) and os.path.isdir(config1.model) - assert os.path.exists(config1.tokenizer) and os.path.isdir(config1.tokenizer) - assert os.path.exists(config2.model) and os.path.isdir(config2.model) - assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer) + assert os.path.exists(model_config1.model) and os.path.isdir(model_config1.model) + assert os.path.exists(renderer_config1.tokenizer) and os.path.isdir( + renderer_config1.tokenizer + ) + assert os.path.exists(model_config2.model) and os.path.isdir(model_config2.model) + assert os.path.exists(renderer_config2.tokenizer) and os.path.isdir( + renderer_config2.tokenizer + ) @pytest.mark.parametrize( diff --git a/tests/test_inputs.py b/tests/test_inputs.py index c4339827de8b..48fd076ab3c6 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -3,7 +3,7 @@ import pytest -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.inputs import zip_enc_dec_prompts from vllm.inputs.parse import parse_raw_prompts from vllm.inputs.preprocess import InputPreprocessor @@ -108,8 +108,9 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs): ) def test_preprocessor_always_mm_code_path(model_id, prompt): model_config = ModelConfig(model=model_id) - tokenizer = init_tokenizer_from_config(model_config) - input_preprocessor = InputPreprocessor(model_config, tokenizer) + renderer_config = RendererConfig(model_config=model_config) + tokenizer = init_tokenizer_from_config(renderer_config) + input_preprocessor = InputPreprocessor(renderer_config, tokenizer) # HF processor 
adds sep token sep_token_id = tokenizer.vocab[tokenizer.sep_token] diff --git a/tests/v1/structured_output/test_reasoning_structured_output.py b/tests/v1/structured_output/test_reasoning_structured_output.py index 70047a993c3f..5901d38d1b78 100644 --- a/tests/v1/structured_output/test_reasoning_structured_output.py +++ b/tests/v1/structured_output/test_reasoning_structured_output.py @@ -7,7 +7,7 @@ import pytest -from vllm.config import ModelConfig, SchedulerConfig, VllmConfig +from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig from vllm.reasoning import ReasoningParser from vllm.v1.request import Request from vllm.v1.structured_output import StructuredOutputManager @@ -17,19 +17,26 @@ class TestReasoningStructuredOutput: """Test reasoning-aware structured output functionality.""" @pytest.fixture - def mock_model_config(self): - """Create a mock ModelConfig.""" - config = Mock(spec=ModelConfig) - config.skip_tokenizer_init = True # Skip tokenizer init to avoid network calls - config.get_vocab_size = Mock(return_value=50000) + def mock_renderer_config(self): + """Create a mock RendererConfig.""" + renderer_config = Mock(spec=RendererConfig) + renderer_config.skip_tokenizer_init = ( + True # Skip tokenizer init to avoid network calls + ) + + model_config = Mock(spec=ModelConfig) + model_config.get_vocab_size = Mock(return_value=50000) + model_config.trust_remote_code = False # Add missing runner_type attribute that tokenizer initialization expects - config.runner_type = "generate" + model_config.runner_type = "generate" + renderer_config.model_config = model_config + # Add other attributes that tokenizer initialization might need - config.tokenizer = "test-tokenizer" - config.tokenizer_mode = "auto" - config.trust_remote_code = False - config.tokenizer_revision = None - return config + renderer_config.tokenizer = "test-tokenizer" + renderer_config.tokenizer_mode = "auto" + renderer_config.tokenizer_revision = None + + return renderer_config @pytest.fixture def mock_scheduler_config(self): @@ -39,10 +46,10 @@ def mock_scheduler_config(self): return config @pytest.fixture - def mock_vllm_config(self, mock_model_config, mock_scheduler_config): + def mock_vllm_config(self, mock_renderer_config, mock_scheduler_config): """Create a mock VllmConfig.""" config = Mock(spec=VllmConfig) - config.model_config = mock_model_config + config.renderer_config = mock_renderer_config config.scheduler_config = mock_scheduler_config config.structured_outputs_config = Mock() config.structured_outputs_config.reasoning_parser = None diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index dd76a722106e..3ac792139fb9 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -23,6 +23,7 @@ from vllm.config.observability import ObservabilityConfig from vllm.config.parallel import EPLBConfig, ParallelConfig from vllm.config.pooler import PoolerConfig +from vllm.config.renderer import RendererConfig from vllm.config.scheduler import SchedulerConfig from vllm.config.speculative import SpeculativeConfig from vllm.config.speech_to_text import SpeechToTextConfig @@ -78,6 +79,8 @@ "ParallelConfig", # From vllm.config.pooler "PoolerConfig", + # From vllm.config.renderer + "RendererConfig", # From vllm.config.scheduler "SchedulerConfig", # From vllm.config.speculative diff --git a/vllm/config/model.py b/vllm/config/model.py index ae5189ce68d9..63bdee96957f 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -37,7 +37,6 @@ uses_xdrope_dim, ) from 
vllm.transformers_utils.gguf_utils import ( - is_gguf, is_remote_gguf, maybe_patch_hf_config_from_gguf, split_remote_gguf, @@ -84,7 +83,6 @@ "transcription", "draft", ] -TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] LogprobsMode = Literal[ "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs" @@ -132,17 +130,6 @@ class ModelConfig: Note that the model may support other tasks using the same model runner. """ - tokenizer: SkipValidation[str] = None # type: ignore - """Name or path of the Hugging Face tokenizer to use. If unspecified, model - name or path will be used.""" - tokenizer_mode: TokenizerMode | str = "auto" - """Tokenizer mode:\n - - "auto" will use "hf" tokenizer if Mistral's tokenizer is not available.\n - - "hf" will use the fast tokenizer if available.\n - - "slow" will always use the slow tokenizer.\n - - "mistral" will always use the tokenizer from `mistral_common`.\n - - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n - - Other custom values can be supported via plugins.""" trust_remote_code: bool = False """Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer.""" @@ -168,13 +155,6 @@ class ModelConfig: hf_config_path: str | None = None """Name or path of the Hugging Face config to use. If unspecified, model name or path will be used.""" - allowed_local_media_path: str = "" - """Allowing API requests to read local images or videos from directories - specified by the server file system. This is a security risk. Should only - be enabled in trusted environments.""" - allowed_media_domains: list[str] | None = None - """If set, only media URLs that belong to this domain can be used for - multi-modal inputs. """ revision: str | None = None """The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" @@ -182,10 +162,6 @@ class ModelConfig: """The specific revision to use for the model code on the Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" - tokenizer_revision: str | None = None - """The specific revision to use for the tokenizer on the Hugging Face Hub. - It can be a branch name, a tag name, or a commit id. If unspecified, will - use the default version.""" max_model_len: SkipValidation[int] = None # type: ignore """Model context length (prompt and output). If unspecified, will be automatically derived from the model config. @@ -230,10 +206,6 @@ class ModelConfig: preventing potential numerical issues. Note that even if this is set to False, cascade attention will be only used when the heuristic tells that it's beneficial.""" - skip_tokenizer_init: bool = False - """Skip initialization of tokenizer and detokenizer. Expects valid - `prompt_token_ids` and `None` for prompt from the input. The generated - output will contain token ids.""" enable_prompt_embeds: bool = False """If `True`, enables passing text embeddings as inputs via the `prompt_embeds` key. 
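The tokenizer- and media-related fields removed from `ModelConfig` above now live on the new `RendererConfig` (added later in this patch as `vllm/config/renderer.py`). A minimal sketch of how the two configs are expected to be wired together after this change, mirroring the construction used in `tests/test_inputs.py` in this patch; the model name below is only an illustrative placeholder, and all other arguments use the defaults declared on the new dataclass:

    from vllm.config import ModelConfig, RendererConfig

    # Placeholder model name, chosen only for this example.
    model_config = ModelConfig(model="facebook/opt-125m")

    # Tokenizer-related settings are now supplied here rather than on ModelConfig.
    renderer_config = RendererConfig(
        model_config=model_config,
        tokenizer_mode="auto",
        skip_tokenizer_init=False,
    )

    # RendererConfig.__post_init__ falls back to the model path and revision
    # when the tokenizer fields are left unset (absent any model redirects).
    print(renderer_config.tokenizer)  # expected: same path as model_config.model
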
@@ -294,8 +266,6 @@ class ModelConfig: logits_processors: list[str | type[LogitsProcessor]] | None = None """One or more logits processors' fully-qualified class names or class definitions""" - io_processor_plugin: str | None = None - """IOProcessor plugin name to load at model startup""" # Pooler config pooler_config: PoolerConfig | None = None @@ -335,18 +305,12 @@ def compute_hash(self) -> str: "runner", "convert", "task", - "tokenizer", - "tokenizer_mode", "seed", "hf_config_path", - "allowed_local_media_path", - "allowed_media_domains", - "tokenizer_revision", "spec_target_max_model_len", "enforce_eager", "logprobs_mode", "disable_cascade_attn", - "skip_tokenizer_init", "served_model_name", "config_format", "hf_token", @@ -354,7 +318,6 @@ def compute_hash(self) -> str: "logits_processor_pattern", "override_attention_dtype", "logits_processors", - "io_processor_plugin", "pooler_config", "multimodal_config", "limit_mm_per_prompt", @@ -439,12 +402,6 @@ def __post_init__( self.model, self.served_model_name ) self.model = maybe_model_redirect(self.model) - # The tokenizer is consistent with the model by default. - if self.tokenizer is None: - self.tokenizer = self.model - if self.tokenizer_revision is None: - self.tokenizer_revision = self.revision - self.tokenizer = maybe_model_redirect(self.tokenizer) if isinstance(self.hf_config_path, str): self.hf_config_path = maybe_model_redirect(self.hf_config_path) @@ -465,7 +422,7 @@ def __post_init__( hf_overrides_kw[key] = value hf_overrides_fn = None - self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer) + self.maybe_pull_model_for_runai(self.model) if ( (backend := envs.VLLM_ATTENTION_BACKEND) @@ -660,7 +617,8 @@ def _task_to_convert(task: TaskOption) -> ConvertType: ) self.original_max_model_len = self.max_model_len - self.max_model_len = self.get_and_verify_max_len(self.max_model_len) + self.recalculate_max_model_len(self.original_max_model_len) + # Init multimodal config if needed if self._model_info.supports_multimodal: if ( @@ -694,16 +652,8 @@ def _task_to_convert(task: TaskOption) -> ConvertType: self.multimodal_config = MultiModalConfig(**mm_config_kwargs) - # Multimodal GGUF models must use original repo for mm processing - if is_gguf(self.tokenizer) and self.is_multimodal_model: - raise ValueError( - "Loading a multimodal GGUF model needs to use original " - "tokenizer. Please specify the unquantized hf model's " - "repo name or path using the --tokenizer argument." - ) - if self.disable_sliding_window: - # Set after get_and_verify_max_len to ensure that max_model_len + # Set after recalculate_max_model_len to ensure that max_model_len # can be correctly capped to sliding window size self.hf_text_config.sliding_window = None @@ -727,10 +677,9 @@ def validate_quantization_before(cls, value: Any) -> Any: @model_validator(mode="after") def validate_model_config_after(self: "ModelConfig") -> "ModelConfig": - if not isinstance(self.tokenizer, str): - raise ValueError("tokenizer must be a string after __post_init__.") if not isinstance(self.max_model_len, int): raise ValueError("max_model_len must be an integer after __post_init__.") + return self def _get_transformers_backend_cls(self) -> str: @@ -779,49 +728,17 @@ def architecture(self) -> str: """The architecture vllm actually used.""" return self._architecture - def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None: - """Pull model/tokenizer from Object Storage to temporary - directory when needed. 
- - Args: - model: Model name or path - tokenizer: Tokenizer name or path - """ - - if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)): + def maybe_pull_model_for_runai(self, model: str) -> None: + """Pull model from Object Storage to temporary directory when needed.""" + if not is_runai_obj_uri(model): return - if is_runai_obj_uri(model): - object_storage_model = ObjectStorageModel(url=model) - object_storage_model.pull_files( - model, allow_pattern=["*.model", "*.py", "*.json"] - ) - self.model_weights = model - self.model = object_storage_model.dir - - # If tokenizer is same as model, download to same directory - if model == tokenizer: - object_storage_model.pull_files( - model, - ignore_pattern=[ - "*.pt", - "*.safetensors", - "*.bin", - "*.tensors", - "*.pth", - ], - ) - self.tokenizer = object_storage_model.dir - return - - # Only download tokenizer if needed and not already handled - if is_runai_obj_uri(tokenizer): - object_storage_tokenizer = ObjectStorageModel(url=tokenizer) - object_storage_tokenizer.pull_files( - model, - ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"], - ) - self.tokenizer = object_storage_tokenizer.dir + object_storage_model = ObjectStorageModel(url=model) + object_storage_model.pull_files( + model, allow_pattern=["*.model", "*.py", "*.json"] + ) + self.model_weights = model + self.model = object_storage_model.dir def _get_encoder_config(self): model = self.model @@ -1724,20 +1641,29 @@ def embedding_size(self): return dense_modules[-1]["out_features"] return self.get_hidden_size() - def get_and_verify_max_len(self, max_model_len: int): + def recalculate_max_model_len( + self, + max_model_len: int, + *, + tokenizer: str | None = None, + tokenizer_revision: str | None = None, + ) -> None: # Consider max_model_len in tokenizer_config only when # pooling models use absolute position_embedding. 
+ # NOTE: For simplicity we assume `args.model == args.tokenizer` + # since this is tokenizer_config = None if ( self.runner_type == "pooling" and getattr(self.hf_config, "position_embedding_type", "") == "absolute" ): tokenizer_config = try_get_tokenizer_config( - self.tokenizer, + tokenizer or self.model, trust_remote_code=self.trust_remote_code, - revision=self.tokenizer_revision, + revision=tokenizer_revision or self.revision, ) - max_model_len = _get_and_verify_max_len( + + self.max_model_len = _get_and_verify_max_len( hf_config=self.hf_text_config, tokenizer_config=tokenizer_config, max_model_len=max_model_len, @@ -1746,8 +1672,7 @@ def get_and_verify_max_len(self, max_model_len: int): spec_target_max_model_len=self.spec_target_max_model_len, encoder_config=self.encoder_config, ) - logger.info("Using max model len %s", max_model_len) - return max_model_len + logger.info("Using max model len %s", self.max_model_len) @property def attn_type(self) -> AttnTypeStr: diff --git a/vllm/config/renderer.py b/vllm/config/renderer.py new file mode 100644 index 000000000000..f5d75f86719e --- /dev/null +++ b/vllm/config/renderer.py @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Literal + +from pydantic import Field +from pydantic.dataclasses import dataclass + +from vllm.config.model import ModelConfig +from vllm.config.utils import config +from vllm.transformers_utils.gguf_utils import is_gguf +from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri +from vllm.transformers_utils.utils import maybe_model_redirect + +TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"] + + +@config +@dataclass +class RendererConfig: + """Configuration for the renderer.""" + + model_config: ModelConfig = Field(default_factory=ModelConfig) + """Provides model context to the renderer.""" + + tokenizer: str = "" + """Name or path of the Hugging Face tokenizer to use. If unspecified, model + name or path will be used.""" + tokenizer_mode: TokenizerMode | str = "auto" + """Tokenizer mode:\n + - "auto" will use "hf" tokenizer if Mistral's tokenizer is not available.\n + - "hf" will use the fast tokenizer if available.\n + - "slow" will always use the slow tokenizer.\n + - "mistral" will always use the tokenizer from `mistral_common`.\n + - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n + - Other custom values can be supported via plugins.""" + tokenizer_revision: str | None = None + """The specific revision to use for the tokenizer on the Hugging Face Hub. + It can be a branch name, a tag name, or a commit id. If unspecified, will + use the default version.""" + skip_tokenizer_init: bool = False + """Skip initialization of tokenizer and detokenizer. Expects valid + `prompt_token_ids` and `None` for prompt from the input. The generated + output will contain token ids.""" + + io_processor_plugin: str | None = None + """IOProcessor plugin name to load at model startup.""" + + # Security-related + allowed_local_media_path: str = "" + """Allowing API requests to read local images or videos from directories + specified by the server file system. This is a security risk. Should only + be enabled in trusted environments.""" + allowed_media_domains: list[str] | None = None + """If set, only media URLs that belong to this domain can be used for + multi-modal inputs. 
""" + + @property + def trust_remote_code(self) -> bool: + return self.model_config.trust_remote_code + + def __post_init__(self) -> None: + # The tokenizer is consistent with the model by default. + if not self.tokenizer: + self.tokenizer = self.model_config.model + if not self.tokenizer_revision: + self.tokenizer_revision = self.model_config.revision + + self.tokenizer = maybe_model_redirect(self.tokenizer) + self.maybe_pull_tokenizer_for_runai(self.tokenizer) + + # Multimodal GGUF models must use original repo for mm processing + if is_gguf(self.tokenizer) and self.model_config.is_multimodal_model: + raise ValueError( + "Loading a multimodal GGUF model needs to use original " + "tokenizer. Please specify the unquantized hf model's " + "repo name or path using the --tokenizer argument." + ) + + def maybe_pull_tokenizer_for_runai(self, tokenizer: str) -> None: + """Pull tokenizer from Object Storage to temporary directory when needed.""" + if not is_runai_obj_uri(tokenizer): + return + + object_storage_tokenizer = ObjectStorageModel(url=tokenizer) + object_storage_tokenizer.pull_files( + self.model_config.model, + ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"], + ) + self.tokenizer = object_storage_tokenizer.dir diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index c6d6f705f535..93ff16a58be0 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -322,16 +322,11 @@ def __post_init__(self): self.draft_model_config = ModelConfig( model=self.model, runner="draft", - tokenizer=self.target_model_config.tokenizer, - tokenizer_mode=self.target_model_config.tokenizer_mode, trust_remote_code=self.target_model_config.trust_remote_code, - allowed_local_media_path=self.target_model_config.allowed_local_media_path, - allowed_media_domains=self.target_model_config.allowed_media_domains, dtype=self.target_model_config.dtype, seed=self.target_model_config.seed, revision=self.revision, code_revision=self.code_revision, - tokenizer_revision=self.target_model_config.tokenizer_revision, spec_target_max_model_len=self.target_model_config.max_model_len, quantization=self.quantization, enforce_eager=self.target_model_config.enforce_eager, diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 823bd96db9ac..1c882741c88b 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -38,6 +38,7 @@ from .model import ModelConfig from .observability import ObservabilityConfig from .parallel import ParallelConfig +from .renderer import RendererConfig from .scheduler import SchedulerConfig from .speculative import SpeculativeConfig from .structured_outputs import StructuredOutputsConfig @@ -180,6 +181,8 @@ class VllmConfig: # try to download a model model_config: ModelConfig = Field(default=None) """Model configuration.""" + renderer_config: RendererConfig = Field(default_factory=RendererConfig) + """Renderer configuration.""" cache_config: CacheConfig = Field(default_factory=CacheConfig) """Cache configuration.""" parallel_config: ParallelConfig = Field(default_factory=ParallelConfig) @@ -1120,10 +1123,12 @@ def _set_cudagraph_sizes(self): self.compilation_config.post_init_cudagraph_sizes() def recalculate_max_model_len(self, max_model_len: int): - # Can only be called in try_verify_and_update_config - model_config = self.model_config - max_model_len = model_config.get_and_verify_max_len(max_model_len) - self.model_config.max_model_len = max_model_len + # Can only be called during try_verify_and_update_config + 
self.model_config.recalculate_max_model_len( + max_model_len, + tokenizer=self.renderer_config.tokenizer, + tokenizer_revision=self.renderer_config.tokenizer_revision, + ) def try_verify_and_update_config(self): if self.model_config is None: @@ -1197,11 +1202,11 @@ def __str__(self): return ( f"model={self.model_config.model!r}, " f"speculative_config={self.speculative_config!r}, " - f"tokenizer={self.model_config.tokenizer!r}, " - f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, " - f"tokenizer_mode={self.model_config.tokenizer_mode}, " + f"tokenizer={self.renderer_config.tokenizer!r}, " + f"skip_tokenizer_init={self.renderer_config.skip_tokenizer_init}, " + f"tokenizer_mode={self.renderer_config.tokenizer_mode}, " f"revision={self.model_config.revision}, " - f"tokenizer_revision={self.model_config.tokenizer_revision}, " + f"tokenizer_revision={self.renderer_config.tokenizer_revision}, " f"trust_remote_code={self.model_config.trust_remote_code}, " f"dtype={self.model_config.dtype}, " f"max_seq_len={self.model_config.max_model_len}, " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index fd07cded7bc5..ba299ca4df4b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -70,11 +70,11 @@ ModelDType, RunnerOption, TaskOption, - TokenizerMode, ) from vllm.config.multimodal import MMCacheType, MMEncoderTPMode from vllm.config.observability import DetailedTraceModules from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy +from vllm.config.renderer import RendererConfig, TokenizerMode from vllm.config.scheduler import SchedulerPolicy from vllm.config.utils import get_field from vllm.config.vllm import OptimizationLevel @@ -354,17 +354,12 @@ class EngineArgs: model: str = ModelConfig.model served_model_name: str | list[str] | None = ModelConfig.served_model_name - tokenizer: str | None = ModelConfig.tokenizer hf_config_path: str | None = ModelConfig.hf_config_path runner: RunnerOption = ModelConfig.runner convert: ConvertOption = ModelConfig.convert task: TaskOption | None = ModelConfig.task - skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds - tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode trust_remote_code: bool = ModelConfig.trust_remote_code - allowed_local_media_path: str = ModelConfig.allowed_local_media_path - allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains download_dir: str | None = LoadConfig.download_dir safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy load_format: str | LoadFormats = LoadConfig.load_format @@ -448,7 +443,6 @@ class EngineArgs: code_revision: str | None = ModelConfig.code_revision hf_token: bool | str | None = ModelConfig.hf_token hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides") - tokenizer_revision: str | None = ModelConfig.tokenizer_revision quantization: QuantizationMethods | None = ModelConfig.quantization enforce_eager: bool = ModelConfig.enforce_eager disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce @@ -473,9 +467,16 @@ class EngineArgs: mm_encoder_attn_backend: AttentionBackendEnum | str | None = ( MultiModalConfig.mm_encoder_attn_backend ) - io_processor_plugin: str | None = None skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling video_pruning_rate: float = MultiModalConfig.video_pruning_rate + # Renderer fields + tokenizer: str | None = RendererConfig.tokenizer + tokenizer_mode: TokenizerMode | 
str = RendererConfig.tokenizer_mode + tokenizer_revision: str | None = RendererConfig.tokenizer_revision + skip_tokenizer_init: bool = RendererConfig.skip_tokenizer_init + io_processor_plugin: str | None = None + allowed_local_media_path: str = RendererConfig.allowed_local_media_path + allowed_media_domains: list[str] | None = RendererConfig.allowed_media_domains # LoRA fields enable_lora: bool = False max_loras: int = LoRAConfig.max_loras @@ -619,25 +620,14 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: model_group.add_argument("--runner", **model_kwargs["runner"]) model_group.add_argument("--convert", **model_kwargs["convert"]) model_group.add_argument("--task", **model_kwargs["task"], deprecated=True) - model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"]) - model_group.add_argument("--tokenizer-mode", **model_kwargs["tokenizer_mode"]) model_group.add_argument( "--trust-remote-code", **model_kwargs["trust_remote_code"] ) model_group.add_argument("--dtype", **model_kwargs["dtype"]) model_group.add_argument("--seed", **model_kwargs["seed"]) model_group.add_argument("--hf-config-path", **model_kwargs["hf_config_path"]) - model_group.add_argument( - "--allowed-local-media-path", **model_kwargs["allowed_local_media_path"] - ) - model_group.add_argument( - "--allowed-media-domains", **model_kwargs["allowed_media_domains"] - ) model_group.add_argument("--revision", **model_kwargs["revision"]) model_group.add_argument("--code-revision", **model_kwargs["code_revision"]) - model_group.add_argument( - "--tokenizer-revision", **model_kwargs["tokenizer_revision"] - ) model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"]) model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"]) model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"]) @@ -649,9 +639,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: model_group.add_argument( "--disable-cascade-attn", **model_kwargs["disable_cascade_attn"] ) - model_group.add_argument( - "--skip-tokenizer-init", **model_kwargs["skip_tokenizer_init"] - ) model_group.add_argument( "--enable-prompt-embeds", **model_kwargs["enable_prompt_embeds"] ) @@ -690,8 +677,31 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: model_group.add_argument( "--logits-processors", **model_kwargs["logits_processors"] ) - model_group.add_argument( - "--io-processor-plugin", **model_kwargs["io_processor_plugin"] + + # Renderer arguments + renderer_kwargs = get_kwargs(RendererConfig) + renderer_group = parser.add_argument_group( + title="RendererConfig", + description=RendererConfig.__doc__, + ) + renderer_group.add_argument("--tokenizer", **renderer_kwargs["tokenizer"]) + renderer_group.add_argument( + "--tokenizer-mode", **renderer_kwargs["tokenizer_mode"] + ) + renderer_group.add_argument( + "--tokenizer-revision", **renderer_kwargs["tokenizer_revision"] + ) + renderer_group.add_argument( + "--skip-tokenizer-init", **renderer_kwargs["skip_tokenizer_init"] + ) + renderer_group.add_argument( + "--allowed-local-media-path", **renderer_kwargs["allowed_local_media_path"] + ) + renderer_group.add_argument( + "--allowed-media-domains", **renderer_kwargs["allowed_media_domains"] + ) + renderer_group.add_argument( + "--io-processor-plugin", **renderer_kwargs["io_processor_plugin"] ) # Model loading arguments @@ -1230,18 +1240,13 @@ def create_model_config(self) -> ModelConfig: runner=self.runner, convert=self.convert, 
task=self.task, - tokenizer=self.tokenizer, - tokenizer_mode=self.tokenizer_mode, trust_remote_code=self.trust_remote_code, - allowed_local_media_path=self.allowed_local_media_path, - allowed_media_domains=self.allowed_media_domains, dtype=self.dtype, seed=self.seed, revision=self.revision, code_revision=self.code_revision, hf_token=self.hf_token, hf_overrides=self.hf_overrides, - tokenizer_revision=self.tokenizer_revision, max_model_len=self.max_model_len, quantization=self.quantization, enforce_eager=self.enforce_eager, @@ -1249,7 +1254,6 @@ def create_model_config(self) -> ModelConfig: logprobs_mode=self.logprobs_mode, disable_sliding_window=self.disable_sliding_window, disable_cascade_attn=self.disable_cascade_attn, - skip_tokenizer_init=self.skip_tokenizer_init, enable_prompt_embeds=self.enable_prompt_embeds, served_model_name=self.served_model_name, limit_mm_per_prompt=self.limit_mm_per_prompt, @@ -1273,7 +1277,6 @@ def create_model_config(self) -> ModelConfig: override_attention_dtype=self.override_attention_dtype, logits_processors=self.logits_processors, video_pruning_rate=self.video_pruning_rate, - io_processor_plugin=self.io_processor_plugin, ) def validate_tensorizer_args(self): @@ -1369,9 +1372,24 @@ def create_engine_config( ) model_config = self.create_model_config() - self.model = model_config.model - self.tokenizer = model_config.tokenizer + renderer_config = RendererConfig( + model_config=model_config, + tokenizer=self.tokenizer, + tokenizer_mode=self.tokenizer_mode, + tokenizer_revision=self.tokenizer_revision, + skip_tokenizer_init=self.skip_tokenizer_init, + io_processor_plugin=self.io_processor_plugin, + allowed_local_media_path=self.allowed_local_media_path, + allowed_media_domains=self.allowed_media_domains, + ) + + model_config.recalculate_max_model_len( + model_config.original_max_model_len, + tokenizer=renderer_config.tokenizer, + tokenizer_revision=renderer_config.tokenizer_revision, + ) + self.model = model_config.model self._check_feature_supported(model_config) self._set_default_chunked_prefill_and_prefix_caching_args(model_config) self._set_default_max_num_seqs_and_batched_tokens_args( @@ -1738,6 +1756,7 @@ def create_engine_config( ) config = VllmConfig( model_config=model_config, + renderer_config=renderer_config, cache_config=cache_config, parallel_config=parallel_config, scheduler_config=scheduler_config, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 077fe681bc5b..6b9c8844ea39 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -44,7 +44,7 @@ from typing_extensions import Required, TypedDict from vllm import envs -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.logger import init_logger from vllm.model_executor.models import SupportsMultiModal from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict @@ -452,9 +452,9 @@ def resolve_mistral_chat_template( def _try_get_processor_chat_template( tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> str | None: - cache_key = (tokenizer.name_or_path, model_config.trust_remote_code) + cache_key = (tokenizer.name_or_path, renderer_config.trust_remote_code) if cache_key in _PROCESSOR_CHAT_TEMPLATES: return _PROCESSOR_CHAT_TEMPLATES[cache_key] @@ -466,7 +466,7 @@ def _try_get_processor_chat_template( PreTrainedTokenizerFast, ProcessorMixin, ), - trust_remote_code=model_config.trust_remote_code, 
+ trust_remote_code=renderer_config.trust_remote_code, ) if ( isinstance(processor, ProcessorMixin) @@ -491,7 +491,7 @@ def resolve_hf_chat_template( chat_template: str | None, tools: list[dict[str, Any]] | None, *, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> str | None: # 1st priority: The given chat template if chat_template is not None: @@ -499,7 +499,7 @@ def resolve_hf_chat_template( # 2nd priority: AutoProcessor chat template, unless tool calling is enabled if tools is None: - chat_template = _try_get_processor_chat_template(tokenizer, model_config) + chat_template = _try_get_processor_chat_template(tokenizer, renderer_config) if chat_template is not None: return chat_template @@ -515,8 +515,8 @@ def resolve_hf_chat_template( # 4th priority: Predefined fallbacks path = get_chat_template_fallback_path( - model_type=model_config.hf_config.model_type, - tokenizer_name_or_path=model_config.tokenizer, + model_type=renderer_config.model_config.hf_config.model_type, + tokenizer_name_or_path=renderer_config.tokenizer, ) if path is not None: logger.info_once( @@ -538,14 +538,14 @@ def _resolve_chat_template_content_format( tools: list[dict[str, Any]] | None, tokenizer: TokenizerLike | None, *, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> _ChatTemplateContentFormat: if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)): hf_chat_template = resolve_hf_chat_template( tokenizer, chat_template=chat_template, tools=tools, - model_config=model_config, + renderer_config=renderer_config, ) else: hf_chat_template = None @@ -595,7 +595,7 @@ def resolve_chat_template_content_format( given_format: ChatTemplateContentFormatOption, tokenizer: TokenizerLike | None, *, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> _ChatTemplateContentFormat: if given_format != "auto": return given_format @@ -604,7 +604,7 @@ def resolve_chat_template_content_format( chat_template, tools, tokenizer, - model_config=model_config, + renderer_config=renderer_config, ) _log_chat_template_content_format( @@ -627,32 +627,32 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): maximum per prompt. 
""" - def __init__(self, model_config: ModelConfig): + def __init__(self, renderer_config: RendererConfig): super().__init__() - self._model_config = model_config + self._renderer_config = renderer_config self._items_by_modality = defaultdict[str, list[_T | None]](list) self._uuids_by_modality = defaultdict[str, list[str | None]](list) @property - def model_config(self) -> ModelConfig: - return self._model_config + def renderer_config(self) -> RendererConfig: + return self._renderer_config @cached_property def model_cls(self) -> type[SupportsMultiModal]: from vllm.model_executor.model_loader import get_model_cls - model_cls = get_model_cls(self.model_config) + model_cls = get_model_cls(self.renderer_config.model_config) return cast(type[SupportsMultiModal], model_cls) @property def allowed_local_media_path(self): - return self._model_config.allowed_local_media_path + return self._renderer_config.allowed_local_media_path @property def allowed_media_domains(self): - return self._model_config.allowed_media_domains + return self._renderer_config.allowed_media_domains @property def mm_registry(self): @@ -660,7 +660,7 @@ def mm_registry(self): @cached_property def mm_processor(self): - return self.mm_registry.create_processor(self.model_config) + return self.mm_registry.create_processor(self.renderer_config) def add( self, @@ -857,7 +857,7 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: super().__init__() self._tracker = tracker - multimodal_config = self._tracker.model_config.multimodal_config + multimodal_config = self.model_config.multimodal_config media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None) self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load( @@ -867,9 +867,13 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: allowed_media_domains=tracker.allowed_media_domains, ) + @property + def renderer_config(self) -> RendererConfig: + return self._tracker.renderer_config + @property def model_config(self) -> ModelConfig: - return self._tracker.model_config + return self.renderer_config.model_config def parse_image(self, image_url: str | None, uuid: str | None = None) -> None: image = self._connector.fetch_image(image_url) if image_url else None @@ -969,7 +973,7 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: super().__init__() self._tracker = tracker - multimodal_config = self._tracker.model_config.multimodal_config + multimodal_config = self.model_config.multimodal_config media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None) self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load( envs.VLLM_MEDIA_CONNECTOR, @@ -978,9 +982,13 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: allowed_media_domains=tracker.allowed_media_domains, ) + @property + def renderer_config(self) -> RendererConfig: + return self._tracker.renderer_config + @property def model_config(self) -> ModelConfig: - return self._tracker.model_config + return self.renderer_config.model_config def parse_image(self, image_url: str | None, uuid: str | None = None) -> None: image_coro = self._connector.fetch_image_async(image_url) if image_url else None @@ -1610,15 +1618,17 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None: def parse_chat_messages( messages: list[ChatCompletionMessageParam], - model_config: ModelConfig, + renderer_config: RendererConfig, content_format: _ChatTemplateContentFormat, ) -> tuple[ list[ConversationMessage], MultiModalDataDict | None, MultiModalUUIDDict | None, ]: + 
model_config = renderer_config.model_config + conversation: list[ConversationMessage] = [] - mm_tracker = MultiModalItemTracker(model_config) + mm_tracker = MultiModalItemTracker(renderer_config) for msg in messages: sub_messages = _parse_chat_message_content( @@ -1641,15 +1651,17 @@ def parse_chat_messages( def parse_chat_messages_futures( messages: list[ChatCompletionMessageParam], - model_config: ModelConfig, + renderer_config: RendererConfig, content_format: _ChatTemplateContentFormat, ) -> tuple[ list[ConversationMessage], Awaitable[MultiModalDataDict | None], MultiModalUUIDDict | None, ]: + model_config = renderer_config.model_config + conversation: list[ConversationMessage] = [] - mm_tracker = AsyncMultiModalItemTracker(model_config) + mm_tracker = AsyncMultiModalItemTracker(renderer_config) for msg in messages: sub_messages = _parse_chat_message_content( @@ -1754,14 +1766,14 @@ def apply_hf_chat_template( chat_template: str | None, tools: list[dict[str, Any]] | None, *, - model_config: ModelConfig, + renderer_config: RendererConfig, **kwargs: Any, ) -> str: hf_chat_template = resolve_hf_chat_template( tokenizer, chat_template=chat_template, tools=tools, - model_config=model_config, + renderer_config=renderer_config, ) if hf_chat_template is None: diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 481a47a97f7d..62c80e41db11 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -29,8 +29,8 @@ HfOverrides, ModelDType, RunnerOption, - TokenizerMode, ) +from vllm.config.renderer import TokenizerMode from vllm.engine.arg_utils import EngineArgs from vllm.entrypoints.chat_utils import ( ChatCompletionMessageParam, @@ -343,6 +343,7 @@ def __init__( logger.info("Supported tasks: %s", supported_tasks) self.supported_tasks = supported_tasks + self.renderer_config = self.llm_engine.renderer_config self.model_config = self.llm_engine.model_config self.input_processor = self.llm_engine.input_processor self.io_processor = self.llm_engine.io_processor @@ -808,13 +809,13 @@ def preprocess_chat( list_of_messages = [cast(list[ChatCompletionMessageParam], messages)] tokenizer = self.get_tokenizer() - model_config = self.model_config + renderer_config = self.renderer_config resolved_content_format = resolve_chat_template_content_format( chat_template, tools, chat_template_content_format, tokenizer, - model_config=model_config, + renderer_config=renderer_config, ) _chat_template_kwargs: dict[str, Any] = dict( @@ -833,7 +834,7 @@ def preprocess_chat( # the chat message parsing for it. 
conversation, mm_data, mm_uuids = parse_chat_messages( msgs, - model_config, + renderer_config, content_format=resolved_content_format, ) @@ -847,7 +848,7 @@ def preprocess_chat( prompt_str = apply_hf_chat_template( tokenizer=tokenizer, conversation=conversation, - model_config=model_config, + renderer_config=renderer_config, **_chat_template_kwargs, ) # Special tokens are already included in chat templates so diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 3e421e21e3e8..a9e72fb00c5b 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -122,7 +122,7 @@ async def create_completion( try: lora_request = self._maybe_get_adapters(request) - if self.model_config.skip_tokenizer_init: + if self.renderer_config.skip_tokenizer_init: tokenizer = None else: tokenizer = await self.engine_client.get_tokenizer() diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index bfa98f29a064..e66197b5bca1 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -279,6 +279,7 @@ def __init__( self.input_processor = self.models.input_processor self.io_processor = self.models.io_processor + self.renderer_config = self.models.renderer_config self.model_config = self.models.model_config self.max_model_len = self.model_config.max_model_len @@ -1088,18 +1089,18 @@ async def _preprocess_chat( Sequence[RequestPrompt], list[EngineTokensPrompt], ]: - model_config = self.model_config + renderer_config = self.renderer_config resolved_content_format = resolve_chat_template_content_format( chat_template, tool_dicts, chat_template_content_format, tokenizer, - model_config=model_config, + renderer_config=renderer_config, ) conversation, mm_data_future, mm_uuids = parse_chat_messages_futures( messages, - model_config, + renderer_config, content_format=resolved_content_format, ) @@ -1126,14 +1127,14 @@ async def _preprocess_chat( request_prompt = tokenizer.apply_chat_template( conversation=conversation, messages=messages, - model_config=model_config, + model_config=renderer_config.model_config, **_chat_template_kwargs, ) else: request_prompt = apply_hf_chat_template( tokenizer=tokenizer, conversation=conversation, - model_config=model_config, + renderer_config=renderer_config, **_chat_template_kwargs, ) diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 953398a9a72a..ec65e659383d 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -71,6 +71,7 @@ def __init__( self.input_processor = self.engine_client.input_processor self.io_processor = self.engine_client.io_processor + self.renderer_config = self.engine_client.renderer_config self.model_config = self.engine_client.model_config self.max_model_len = self.model_config.max_model_len diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index cea9924ebbac..1ab83983eced 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -101,8 +101,8 @@ def __init__( self.tokenizer = cast( PreTrainedTokenizerBase, get_tokenizer( - tokenizer_name=self.model_config.tokenizer, - tokenizer_mode=self.model_config.tokenizer_mode, + tokenizer_name=self.renderer_config.tokenizer, + tokenizer_mode=self.renderer_config.tokenizer_mode, ), ) diff --git 
a/vllm/entrypoints/pooling/pooling/serving.py b/vllm/entrypoints/pooling/pooling/serving.py index 7fb767e26d01..cd28ccba9ef9 100644 --- a/vllm/entrypoints/pooling/pooling/serving.py +++ b/vllm/entrypoints/pooling/pooling/serving.py @@ -94,7 +94,7 @@ async def create_pooling( try: lora_request = self._maybe_get_adapters(request) - if self.model_config.skip_tokenizer_init: + if self.renderer_config.skip_tokenizer_init: tokenizer = None else: tokenizer = await self.engine_client.get_tokenizer() diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 2893a56b1190..e886b2e23855 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -6,7 +6,7 @@ from typing_extensions import assert_never -from vllm.config import ModelConfig +from vllm.config import RendererConfig from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.cache import BaseMultiModalProcessorCache @@ -45,14 +45,15 @@ class InputPreprocessor: def __init__( self, - model_config: ModelConfig, + renderer_config: RendererConfig, tokenizer: TokenizerLike | None, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, mm_processor_cache: BaseMultiModalProcessorCache | None = None, ) -> None: super().__init__() - self.model_config = model_config + self.renderer_config = renderer_config + self.model_config = renderer_config.model_config self.tokenizer = tokenizer self.mm_registry = mm_registry self.mm_processor_cache = mm_processor_cache @@ -231,7 +232,7 @@ def _tokenize_prompt( def _get_mm_processor(self) -> BaseMultiModalProcessor: if not hasattr(self, "_mm_processor"): self._mm_processor = self.mm_registry.create_processor( - self.model_config, + self.renderer_config, tokenizer=self.tokenizer, cache=self.mm_processor_cache, ) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 007d847ac3b7..a2700bd5a501 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -415,7 +415,7 @@ def load_weights_using_from_2_way_softmax( from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader - model_config = model.vllm_config.model_config + renderer_config = model.vllm_config.renderer_config quant_config = model.vllm_config.quant_config text_config = model.config.get_text_config() @@ -447,10 +447,10 @@ def load_weights_using_from_2_way_softmax( from vllm.tokenizers import get_tokenizer tokenizer = get_tokenizer( - model_config.tokenizer, - revision=model_config.tokenizer_revision, - tokenizer_mode=model_config.tokenizer_mode, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + revision=renderer_config.tokenizer_revision, + tokenizer_mode=renderer_config.tokenizer_mode, + trust_remote_code=renderer_config.trust_remote_code, ) false_id = tokenizer.convert_tokens_to_ids(tokens[0]) @@ -473,7 +473,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader - model_config = model.vllm_config.model_config + renderer_config = model.vllm_config.renderer_config quant_config = model.vllm_config.quant_config text_config = model.config.get_text_config() @@ -501,10 +501,10 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te from 
vllm.tokenizers import get_tokenizer tokenizer = get_tokenizer( - model_config.tokenizer, - revision=model_config.tokenizer_revision, - tokenizer_mode=model_config.tokenizer_mode, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + revision=renderer_config.tokenizer_revision, + tokenizer_mode=renderer_config.tokenizer_mode, + trust_remote_code=renderer_config.trust_remote_code, ) token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens] diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py index a612ebd95628..0cd676fdcf10 100644 --- a/vllm/model_executor/models/deepseek_ocr.py +++ b/vllm/model_executor/models/deepseek_ocr.py @@ -379,8 +379,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.projector_config = config.projector_config self.text_config = config.text_config - model_config = vllm_config.model_config - tokenizer = cached_tokenizer_from_config(model_config) + renderer_config = vllm_config.renderer_config + tokenizer = cached_tokenizer_from_config(renderer_config) self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN] self.sam_model = build_sam_vit_b() diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 56c1a87a2540..3eb750c0317b 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -372,8 +372,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.projector_config = config.projector_config self.text_config = config.text_config - model_config = vllm_config.model_config - tokenizer = cached_tokenizer_from_config(model_config) + renderer_config = vllm_config.renderer_config + tokenizer = cached_tokenizer_from_config(renderer_config) self.image_token_id: int = tokenizer.vocab[_IMAGE_TOKEN] self.vision = self._init_vision_module( diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index accf7e6ef2f4..b5c1dc827916 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -34,7 +34,7 @@ from torch import nn from transformers import BatchFeature, PretrainedConfig -from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig +from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.inputs.data import PromptType from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear @@ -841,7 +841,7 @@ def get_mm_mapping(self) -> MultiModelKeys: def get_generation_prompt( cls, audio: np.ndarray, - model_config: ModelConfig, + renderer_config: RendererConfig, stt_config: SpeechToTextConfig, language: str | None, task_type: Literal["transcribe", "translate"], @@ -862,7 +862,7 @@ def get_generation_prompt( else: raise ValueError(f"Unsupported task type {task_type}") - tokenizer = cached_tokenizer_from_config(model_config) + tokenizer = cached_tokenizer_from_config(renderer_config) chat = [dict(role="user", content=user_prompt)] prompt = tokenizer.apply_chat_template( chat, @@ -883,10 +883,10 @@ def get_num_audio_tokens( cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> int | None: """Get the number of audio tokens for an audio duration in sec.""" - processor = cached_processor_from_config(model_config) + processor = 
cached_processor_from_config(renderer_config) hop_length = processor.audio_processor.melspec_kwargs["hop_length"] proj_win_size = processor.audio_processor.projector_window_size ds_rate = processor.audio_processor.projector_downsample_rate @@ -904,7 +904,9 @@ def get_num_audio_tokens( @classmethod def get_speech_to_text_config( - cls, model_config: ModelConfig, task_type: str + cls, + renderer_config: RendererConfig, + task_type: str, ) -> SpeechToTextConfig: """Get the stt config for this model.""" # Default settings are reasonable for this model and we don't currently diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index 2aba626a7c73..b9f3ac8aee5f 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -6,7 +6,7 @@ import torch import torch.nn as nn -from vllm.config import ModelConfig, VllmConfig +from vllm.config import RendererConfig, VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.pooler import ( DispatchPooler, @@ -29,12 +29,12 @@ class GritLMMeanPool(nn.Module): """As `MeanPool`, but only includes non-instruction tokens.""" - def __init__(self, model_config: ModelConfig): + def __init__(self, renderer_config: RendererConfig): super().__init__() - self.model_config = model_config + self.renderer_config = renderer_config - tokenizer = cached_tokenizer_from_config(self.model_config) + tokenizer = cached_tokenizer_from_config(self.renderer_config) # Collect the tokens needed for pattern matching. # "▁<" is different from "_<". The former uses "▁" to indicate that @@ -174,10 +174,10 @@ def forward( class GritLMPooler(Pooler): - def __init__(self, model_config: ModelConfig): + def __init__(self, renderer_config: RendererConfig): super().__init__() - self.pooling = GritLMMeanPool(model_config) + self.pooling = GritLMMeanPool(renderer_config) self.head = PoolerHead(PoolerNormalize()) def get_supported_tasks(self) -> Set[PoolingTask]: @@ -238,6 +238,6 @@ def __init__( self.pooler = DispatchPooler( { "token_embed": Pooler.for_token_embed(pooler_config), - "embed": GritLMPooler(vllm_config.model_config), + "embed": GritLMPooler(vllm_config.renderer_config), } ) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 01b3e7827424..34a4104185d7 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -19,7 +19,7 @@ from transformers.models.whisper.tokenization_whisper import LANGUAGES from typing_extensions import Self, TypeIs -from vllm.config import ModelConfig, SpeechToTextConfig +from vllm.config import ModelConfig, RendererConfig, SpeechToTextConfig from vllm.inputs import TokensPrompt from vllm.inputs.data import PromptType from vllm.logger import init_logger @@ -902,7 +902,9 @@ def validate_language(cls, language: str | None) -> str | None: @classmethod def get_speech_to_text_config( - cls, model_config: ModelConfig, task_type: Literal["transcribe", "translate"] + cls, + renderer_config: RendererConfig, + task_type: Literal["transcribe", "translate"], ) -> SpeechToTextConfig: """Get the speech to text config for the ASR model.""" ... 
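These hunks repeat the same call-site change: helpers that previously received a `ModelConfig` now receive the `RendererConfig`, which carries the tokenizer settings and exposes the underlying `ModelConfig` via `.model_config`. A rough sketch of the resulting pattern inside model code, assuming the imports shown elsewhere in this patch; the function name is made up for the example:

    from vllm.config import VllmConfig
    from vllm.tokenizers import cached_tokenizer_from_config
    from vllm.transformers_utils.processor import cached_processor_from_config

    def _init_text_helpers(vllm_config: VllmConfig):
        # Tokenizer/processor caches are now keyed off the renderer config.
        tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config)
        processor = cached_processor_from_config(vllm_config.renderer_config)

        # Model-level information remains reachable through the renderer config.
        hf_config = vllm_config.renderer_config.model_config.hf_config
        return tokenizer, processor, hf_config
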
@@ -912,7 +914,7 @@ def get_num_audio_tokens( cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> int | None: """ Map from audio duration to number of audio tokens produced by the ASR diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index c2195fd0cb88..46a0f64c449c 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -182,7 +182,7 @@ class InternS1ProcessingInfo(BaseProcessingInfo): def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: hf_processor = self.ctx.get_hf_processor(InternVLProcessor, **kwargs) hf_processor.video_processor = cached_video_processor_from_config( - self.ctx.model_config, + self.ctx.renderer_config, processor_cls=InternVLVideoProcessor, size=hf_processor.image_processor.size, **kwargs, diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index c4198d36b392..3169760a78c5 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -1171,16 +1171,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.mlp1 = self.mlp1.to(self.language_model.config.dtype) self.config = config - self.model_config = vllm_config.model_config # Pre-tokenize special tokens for video processing # to avoid repeated tokenization - tokenizer = cached_tokenizer_from_config(vllm_config.model_config) - self._img_start_token_ids = tokenizer.encode( + self._tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config) + self._img_start_token_ids = self._tokenizer.encode( IMG_START, add_special_tokens=False ) - self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False) - self._img_context_token_ids = tokenizer.encode( + self._img_end_token_ids = self._tokenizer.encode( + IMG_END, add_special_tokens=False + ) + self._img_context_token_ids = self._tokenizer.encode( IMG_CONTEXT, add_special_tokens=False ) @@ -1366,7 +1367,7 @@ def _create_final_video_embeddings( input_embeds for the LLM. 
""" device = video_embeddings.device - tokenizer = cached_tokenizer_from_config(self.model_config) + tokenizer = self._tokenizer # Generate video replacement token IDs using get_video_repl # This tokenizes each frame separator independently, then uses pre-tokenized diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index a57668b21fb8..bfa97303ac9a 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -347,7 +347,7 @@ def get_hf_processor(self, **kwargs: object) -> NemotronVLProcessor: def get_image_processor(self, **kwargs: object): return cached_image_processor_from_config( - self.ctx.model_config, + self.ctx.renderer_config, **kwargs, ) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index cad241842cd3..9a6ecd0dee76 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -193,7 +193,7 @@ def __call__( class PixtralProcessingInfo(BaseProcessingInfo): def get_tokenizer(self) -> MistralTokenizer: - tokenizer = cached_tokenizer_from_config(self.ctx.model_config) + tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config) if not isinstance(tokenizer, MistralTokenizer): raise ValueError("This model requires `--tokenizer-mode mistral`") diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 45f8fa079c71..ee0cc8e97a67 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -20,7 +20,7 @@ from transformers import BatchFeature, TensorType, WhisperConfig from transformers.tokenization_utils_base import TextInput -from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig +from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.inputs.data import PromptType from vllm.logger import init_logger @@ -176,7 +176,7 @@ def __call__( class VoxtralProcessingInfo(BaseProcessingInfo): def get_tokenizer(self) -> MistralTokenizer: - tokenizer = cached_tokenizer_from_config(self.ctx.model_config) + tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config) if not isinstance(tokenizer, MistralTokenizer): raise ValueError("This model requires `--tokenizer-mode mistral`") @@ -341,7 +341,7 @@ class VoxtralForConditionalGeneration( def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config) + self.tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config) # update quant config to so that ignored module and target module names # match the vLLM model names @@ -452,9 +452,11 @@ def compute_logits( @classmethod def get_speech_to_text_config( - cls, model_config: ModelConfig, task_type: str + cls, + renderer_config: RendererConfig, + task_type: str, ) -> SpeechToTextConfig: - tokenizer = cached_tokenizer_from_config(model_config) + tokenizer = cached_tokenizer_from_config(renderer_config) audio_config = tokenizer.instruct.audio_encoder.audio_config max_audio_clip_s = audio_config.chunk_length_s sample_rate = audio_config.sampling_rate @@ -470,17 +472,17 @@ def get_speech_to_text_config( def get_generation_prompt( cls, audio: np.ndarray, - model_config: ModelConfig, + renderer_config: RendererConfig, # not needed here stt_config: SpeechToTextConfig, language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, to_language: 
str | None, ) -> PromptType: - tokenizer = cached_tokenizer_from_config(model_config) + tokenizer = cached_tokenizer_from_config(renderer_config) audio = Audio(audio, int(stt_config.sample_rate), format="wav") # lossless req = TranscriptionRequest( - model=model_config.model, + model=renderer_config.model_config.model, audio=RawAudio.from_audio(audio), language=language, ) @@ -496,14 +498,14 @@ def get_num_audio_tokens( cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> int | None: """ Map from audio duration to number of audio tokens produced by the ASR model, without running a forward pass. This is used for estimating the amount of processing for this audio. """ - tokenizer = cached_tokenizer_from_config(model_config) + tokenizer = cached_tokenizer_from_config(renderer_config) adapter = VoxtralProcessorAdapter(tokenizer) return adapter.get_num_audio_tokens( int(audio_duration_s * stt_config.sample_rate) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 0daf6bda61cc..618acb6f998d 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -19,7 +19,7 @@ from vllm.attention.backends.abstract import AttentionType from vllm.attention.layer import Attention, MultiHeadAttention from vllm.attention.layers.cross_attention import CrossAttention -from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig +from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.inputs.data import PromptType @@ -812,7 +812,7 @@ def validate_language(cls, language: str | None) -> str | None: def get_generation_prompt( cls, audio: np.ndarray, - model_config: ModelConfig, # not needed here + renderer_config: RendererConfig, # not needed here stt_config: SpeechToTextConfig, language: str | None, task_type: Literal["transcribe", "translate"], @@ -848,9 +848,11 @@ def get_placeholder_str(cls, modality: str, i: int) -> str | None: @classmethod def get_speech_to_text_config( - cls, model_config: ModelConfig, task_type: str + cls, + renderer_config: RendererConfig, + task_type: str, ) -> SpeechToTextConfig: - processor = cached_processor_from_config(model_config) + processor = cached_processor_from_config(renderer_config) return SpeechToTextConfig( max_audio_clip_s=processor.feature_extractor.chunk_length, @@ -862,9 +864,9 @@ def get_num_audio_tokens( cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> int | None: - processor = cached_processor_from_config(model_config) + processor = cached_processor_from_config(renderer_config) hop_length = processor.feature_extractor.hop_length assert hop_length is not None # NOTE(NickLucche) user can't pass encoder diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 039077378396..81ceb76a4b96 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -23,7 +23,7 @@ from typing_extensions import TypeVar, assert_never from vllm.logger import init_logger -from vllm.tokenizers import TokenizerLike +from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config from vllm.transformers_utils.processor import cached_processor_from_config from vllm.utils.collection_utils import flatten_2d_lists, full_groupby from 
vllm.utils.func_utils import get_allowed_kwarg_only_overrides @@ -53,7 +53,7 @@ from transformers.feature_extraction_utils import BatchFeature from transformers.processing_utils import ProcessorMixin - from vllm.config import ModelConfig + from vllm.config import ModelConfig, RendererConfig from .cache import BaseMultiModalProcessorCache from .profiling import BaseDummyInputsBuilder @@ -63,6 +63,7 @@ ProcessorMixin = object ModelConfig = object + RendererConfig = object BaseMultiModalProcessorCache = object @@ -945,12 +946,29 @@ class InputProcessingContext: modify the inputs. """ - model_config: ModelConfig - """The configuration of the model.""" + renderer_config: RendererConfig + """The configuration of the renderer.""" tokenizer: TokenizerLike | None """The tokenizer used to tokenize the inputs.""" + @classmethod + def from_config( + cls, + renderer_config: RendererConfig, + *, + tokenizer: TokenizerLike | None = None, + ): + if tokenizer is None and not renderer_config.skip_tokenizer_init: + tokenizer = cached_tokenizer_from_config(renderer_config) + + return cls(renderer_config, tokenizer) + + @property + def model_config(self) -> ModelConfig: + """The configuration of the model.""" + return self.renderer_config.model_config + def get_tokenizer(self) -> TokenizerLike: if self.tokenizer is None: raise ValueError( @@ -1047,7 +1065,7 @@ def get_hf_processor( typ = ProcessorMixin return cached_processor_from_config( - self.model_config, + self.renderer_config, processor_cls=typ, tokenizer=self.tokenizer, **kwargs, diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 00a84f9dec4f..31e1c9e6af96 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -6,7 +6,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.logger import init_logger -from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config +from vllm.tokenizers import TokenizerLike from .cache import BaseMultiModalProcessorCache from .processing import ( @@ -22,7 +22,7 @@ ) if TYPE_CHECKING: - from vllm.config import ModelConfig + from vllm.config import ModelConfig, RendererConfig from vllm.model_executor.models.interfaces import SupportsMultiModal logger = init_logger(__name__) @@ -144,7 +144,7 @@ def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool: def get_max_tokens_per_item_by_modality( self, - model_config: "ModelConfig", + renderer_config: "RendererConfig", *, cache: BaseMultiModalProcessorCache | None = None, profiler_limits: Mapping[str, int] | None = None, @@ -153,10 +153,11 @@ def get_max_tokens_per_item_by_modality( Get the maximum number of tokens per data item from each modality based on underlying model configuration. """ + model_config = renderer_config.model_config if not model_config.is_multimodal_model: return {} - processor = self.create_processor(model_config, cache=cache) + processor = self.create_processor(renderer_config, cache=cache) profiler: MultiModalProfiler = MultiModalProfiler(processor) seq_len = model_config.max_model_len @@ -171,7 +172,7 @@ def get_max_tokens_per_item_by_modality( def get_mm_limits_per_prompt( self, - model_config: "ModelConfig", + renderer_config: "RendererConfig", *, cache: BaseMultiModalProcessorCache | None = None, ) -> Mapping[str, int]: @@ -179,10 +180,11 @@ def get_mm_limits_per_prompt( Get the maximum number of multi-modal input instances for each modality that are allowed per prompt for a model class. 
""" + model_config = renderer_config.model_config if not model_config.is_multimodal_model: return {} - processor = self.create_processor(model_config, cache=cache) + processor = self.create_processor(renderer_config, cache=cache) profiler: MultiModalProfiler = MultiModalProfiler(processor) return profiler.get_mm_limits() @@ -228,30 +230,21 @@ def _get_model_cls(self, model_config: "ModelConfig") -> "SupportsMultiModal": assert hasattr(model_cls, "_processor_factory") return cast("SupportsMultiModal", model_cls) - def _create_processing_ctx( - self, - model_config: "ModelConfig", - tokenizer: TokenizerLike | None = None, - ) -> InputProcessingContext: - if tokenizer is None and not model_config.skip_tokenizer_init: - tokenizer = cached_tokenizer_from_config(model_config) - - return InputProcessingContext(model_config, tokenizer) - def _create_processing_info( self, - model_config: "ModelConfig", + renderer_config: "RendererConfig", *, tokenizer: TokenizerLike | None = None, ) -> BaseProcessingInfo: - model_cls = self._get_model_cls(model_config) + model_cls = self._get_model_cls(renderer_config.model_config) factories = model_cls._processor_factory - ctx = self._create_processing_ctx(model_config, tokenizer) + + ctx = InputProcessingContext.from_config(renderer_config, tokenizer=tokenizer) return factories.info(ctx) def create_processor( self, - model_config: "ModelConfig", + renderer_config: "RendererConfig", *, tokenizer: TokenizerLike | None = None, cache: BaseMultiModalProcessorCache | None = None, @@ -259,19 +252,19 @@ def create_processor( """ Create a multi-modal processor for a specific model and tokenizer. """ + model_config = renderer_config.model_config if not model_config.is_multimodal_model: raise ValueError(f"{model_config.model} is not a multimodal model") model_cls = self._get_model_cls(model_config) factories = model_cls._processor_factory - ctx = self._create_processing_ctx(model_config, tokenizer) - + ctx = InputProcessingContext.from_config(renderer_config, tokenizer=tokenizer) return factories.build_processor(ctx, cache=cache) def get_decoder_dummy_data( self, - model_config: "ModelConfig", + renderer_config: "RendererConfig", seq_len: int, mm_counts: Mapping[str, int] | None = None, *, @@ -280,15 +273,15 @@ def get_decoder_dummy_data( """ Create dummy data for profiling the memory usage of a model. - The model is identified by `model_config`. + The model is identified by `renderer_config`. """ - processor = self.create_processor(model_config, cache=cache) + processor = self.create_processor(renderer_config, cache=cache) profiler: MultiModalProfiler = MultiModalProfiler(processor) # Extract configurable options from multimodal config. # Only include modalities that use advanced option types so legacy # count-only behavior remains unchanged. - mm_options = self._extract_mm_options(model_config) + mm_options = self._extract_mm_options(renderer_config.model_config) dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts, mm_options) @@ -304,7 +297,7 @@ def get_decoder_dummy_data( def get_encoder_dummy_data( self, - model_config: "ModelConfig", + renderer_config: "RendererConfig", seq_len: int, mm_counts: Mapping[str, int] | None = None, *, @@ -313,15 +306,15 @@ def get_encoder_dummy_data( """ Create dummy data for profiling the memory usage of a model. - The model is identified by `model_config`. + The model is identified by `renderer_config`. 
""" - processor = self.create_processor(model_config, cache=cache) + processor = self.create_processor(renderer_config, cache=cache) profiler: MultiModalProfiler = MultiModalProfiler(processor) # Extract configurable options from multimodal config. # Only include modalities that use advanced option types so legacy # count-only behavior remains unchanged. - mm_options = self._extract_mm_options(model_config) + mm_options = self._extract_mm_options(renderer_config.model_config) dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts, mm_options) diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py index 87048f2ec784..614dcc3fe01c 100644 --- a/vllm/tokenizers/registry.py +++ b/vllm/tokenizers/registry.py @@ -24,7 +24,7 @@ from .protocol import TokenizerLike if TYPE_CHECKING: - from vllm.config import ModelConfig + from vllm.config import RendererConfig logger = init_logger(__name__) @@ -205,18 +205,18 @@ def get_tokenizer( cached_get_tokenizer = lru_cache(get_tokenizer) -def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs): +def cached_tokenizer_from_config(renderer_config: "RendererConfig", **kwargs): return cached_get_tokenizer( - model_config.tokenizer, - tokenizer_mode=model_config.tokenizer_mode, - revision=model_config.tokenizer_revision, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + tokenizer_mode=renderer_config.tokenizer_mode, + revision=renderer_config.tokenizer_revision, + trust_remote_code=renderer_config.trust_remote_code, **kwargs, ) -def init_tokenizer_from_config(model_config: "ModelConfig"): - runner_type = model_config.runner_type +def init_tokenizer_from_config(renderer_config: "RendererConfig"): + runner_type = renderer_config.model_config.runner_type if runner_type == "generate" or runner_type == "draft": truncation_side = "left" elif runner_type == "pooling": @@ -225,9 +225,9 @@ def init_tokenizer_from_config(model_config: "ModelConfig"): assert_never(runner_type) return get_tokenizer( - model_config.tokenizer, - tokenizer_mode=model_config.tokenizer_mode, - trust_remote_code=model_config.trust_remote_code, - revision=model_config.tokenizer_revision, + renderer_config.tokenizer, + tokenizer_mode=renderer_config.tokenizer_mode, + trust_remote_code=renderer_config.trust_remote_code, + revision=renderer_config.tokenizer_revision, truncation_side=truncation_side, ) diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index e9864b0c1531..bdebd2686bae 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -23,7 +23,7 @@ from vllm.utils.func_utils import get_allowed_kwarg_only_overrides if TYPE_CHECKING: - from vllm.config import ModelConfig + from vllm.config import ModelConfig, RendererConfig _P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin) _V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor) @@ -233,17 +233,18 @@ def cached_get_processor_without_dynamic_kwargs( def cached_processor_from_config( - model_config: "ModelConfig", + renderer_config: "RendererConfig", processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin, **kwargs: Any, ) -> _P: + model_config = renderer_config.model_config if is_gguf(model_config.model): - assert not is_gguf(model_config.tokenizer), ( + assert not is_gguf(renderer_config.tokenizer), ( "For multimodal GGUF models, the original tokenizer " "should be used to correctly load processor." 
) - model = model_config.tokenizer - revision = model_config.tokenizer_revision + model = renderer_config.tokenizer + revision = renderer_config.tokenizer_revision else: model = model_config.model revision = model_config.revision @@ -297,9 +298,11 @@ def get_feature_extractor( def cached_feature_extractor_from_config( - model_config: "ModelConfig", + renderer_config: "RendererConfig", **kwargs: Any, ): + model_config = renderer_config.model_config + return cached_get_feature_extractor( model_config.model, revision=model_config.revision, @@ -348,16 +351,17 @@ def get_image_processor( def cached_image_processor_from_config( - model_config: "ModelConfig", + renderer_config: "RendererConfig", **kwargs: Any, ): + model_config = renderer_config.model_config if is_gguf(model_config.model): - assert not is_gguf(model_config.tokenizer), ( + assert not is_gguf(renderer_config.tokenizer), ( "For multimodal GGUF models, the original tokenizer " "should be used to correctly load image processor." ) - model = model_config.tokenizer - revision = model_config.tokenizer_revision + model = renderer_config.tokenizer + revision = renderer_config.tokenizer_revision else: model = model_config.model revision = model_config.revision @@ -411,10 +415,12 @@ def get_video_processor( def cached_video_processor_from_config( - model_config: "ModelConfig", + renderer_config: "RendererConfig", processor_cls: type[_V] | None = None, **kwargs: Any, ): + model_config = renderer_config.model_config + return cached_get_video_processor( model_config.model, revision=model_config.revision, diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 3959e9a59a53..21315b85f22a 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -10,7 +10,7 @@ from vllm.v1.request import Request if TYPE_CHECKING: - from vllm.config import ModelConfig, SchedulerConfig + from vllm.config import RendererConfig, SchedulerConfig logger = init_logger(__name__) @@ -250,7 +250,7 @@ def get_freed_mm_hashes(self) -> list[str]: def compute_encoder_budget( - model_config: "ModelConfig", + renderer_config: "RendererConfig", scheduler_config: "SchedulerConfig", mm_registry: MultiModalRegistry, ) -> tuple[int, int]: @@ -263,9 +263,9 @@ def compute_encoder_budget( - Space budget for encoder cache size, measured in number of tokens from the input sequence. """ - if mm_registry.supports_multimodal_inputs(model_config): + if mm_registry.supports_multimodal_inputs(renderer_config): max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality( - model_config + renderer_config ) return compute_mm_encoder_budget( diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 75a7385df38b..0fadcca988f6 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -164,7 +164,7 @@ def __init__( # This can be changed when we make encoder cache for embedding caching # across requests. 
encoder_compute_budget, encoder_cache_size = compute_encoder_budget( - model_config=vllm_config.model_config, + renderer_config=vllm_config.renderer_config, scheduler_config=vllm_config.scheduler_config, mm_registry=mm_registry, ) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index ec5d6e95ce3a..f0a044d29c61 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -91,6 +91,7 @@ def __init__( # Ensure we can serialize custom transformer configs maybe_register_config_serialize_by_value() + self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.vllm_config = vllm_config self.observability_config = vllm_config.observability_config @@ -108,15 +109,15 @@ def __init__( "enabling logging without default stat loggers." ) - if self.model_config.skip_tokenizer_init: + if self.renderer_config.skip_tokenizer_init: tokenizer = None else: - tokenizer = init_tokenizer_from_config(self.model_config) + tokenizer = init_tokenizer_from_config(self.renderer_config) self.input_processor = InputProcessor(self.vllm_config, tokenizer) self.io_processor = get_io_processor( self.vllm_config, - self.model_config.io_processor_plugin, + self.renderer_config.io_processor_plugin, ) # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index e6a94f4e3de5..f965a8bc380b 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -43,6 +43,7 @@ def __init__( mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ) -> None: self.vllm_config = vllm_config + self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.lora_config = vllm_config.lora_config @@ -54,7 +55,7 @@ def __init__( self.mm_processor_cache = processor_cache_from_config(vllm_config, mm_registry) self.input_preprocessor = InputPreprocessor( - self.model_config, + self.renderer_config, tokenizer, mm_registry, mm_processor_cache=self.mm_processor_cache, @@ -252,7 +253,7 @@ def _validate_structured_output(self, params: SamplingParams) -> None: if not params.structured_outputs or not self.structured_outputs_config: return - if self.model_config.skip_tokenizer_init and params.structured_outputs: + if self.renderer_config.skip_tokenizer_init and params.structured_outputs: raise ValueError( "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'" # noqa: E501 ) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 8772f2e488dc..de154f7c7ec3 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -60,6 +60,7 @@ def __init__( ) -> None: self.vllm_config = vllm_config self.observability_config = vllm_config.observability_config + self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config @@ -83,15 +84,15 @@ def __init__( self.dp_group = None self.should_execute_dummy_batch = False - if self.model_config.skip_tokenizer_init: + if self.renderer_config.skip_tokenizer_init: tokenizer = None else: - tokenizer = init_tokenizer_from_config(self.model_config) + tokenizer = init_tokenizer_from_config(self.renderer_config) self.input_processor = InputProcessor(self.vllm_config, tokenizer) self.io_processor = get_io_processor( self.vllm_config, - self.model_config.io_processor_plugin, + 
self.renderer_config.io_processor_plugin, ) # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 5ee88178cdf6..36aa3d9bb3f9 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -63,7 +63,7 @@ def __init__(self, vllm_config: VllmConfig): max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8)) self.executor_for_fillmask = ThreadPoolExecutor(max_workers=max_workers) - if not self.vllm_config.model_config.skip_tokenizer_init: + if not vllm_config.renderer_config.skip_tokenizer_init: # The default max_workers if not specified is the number of # CPUs * 5, which is way too high since these tasks are CPU-bound, # not I/O bound. We also know we would never dominate CPU usage @@ -71,21 +71,15 @@ def __init__(self, vllm_config: VllmConfig): # of CPUs. max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2) self.executor = ThreadPoolExecutor(max_workers=max_workers) - self.tokenizer = init_tokenizer_from_config( - model_config=self.vllm_config.model_config - ) - reasoning_parser = ( - self.vllm_config.structured_outputs_config.reasoning_parser - ) + self.tokenizer = init_tokenizer_from_config(vllm_config.renderer_config) + reasoning_parser = vllm_config.structured_outputs_config.reasoning_parser reasoning_parser_plugin = ( - self.vllm_config.structured_outputs_config.reasoning_parser_plugin + vllm_config.structured_outputs_config.reasoning_parser_plugin ) if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3: ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin) - reasoning_parser = ( - self.vllm_config.structured_outputs_config.reasoning_parser - ) + reasoning_parser = vllm_config.structured_outputs_config.reasoning_parser if reasoning_parser: reasoner_cls = ReasoningParserManager.get_reasoning_parser( reasoning_parser @@ -93,7 +87,7 @@ def __init__(self, vllm_config: VllmConfig): self.reasoner = reasoner_cls(tokenizer=self.tokenizer) self.enable_in_reasoning = ( - self.vllm_config.structured_outputs_config.enable_in_reasoning + vllm_config.structured_outputs_config.enable_in_reasoning ) def grammar_init(self, request: Request) -> None: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 152bea2c0975..ad399f4c3f9c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -557,7 +557,7 @@ def __init__( self.mm_budget = ( MultiModalBudget( - self.model_config, + vllm_config.renderer_config, self.scheduler_config, self.mm_registry, ) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 292f12969aae..2827017ecbae 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -353,7 +353,7 @@ def __init__( self.mm_budget = ( MultiModalBudget( - self.model_config, + vllm_config.renderer_config, self.scheduler_config, self.mm_registry, ) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 427a0d296b25..22d98cff14d0 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -7,7 +7,7 @@ from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.layer import Attention -from vllm.config import ModelConfig, SchedulerConfig, VllmConfig +from vllm.config import RendererConfig, SchedulerConfig, VllmConfig from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import 
extract_layer_index from vllm.multimodal.cache import processor_only_cache_from_config @@ -23,13 +23,14 @@ class MultiModalBudget: def __init__( self, - model_config: ModelConfig, + renderer_config: RendererConfig, scheduler_config: SchedulerConfig, mm_registry: MultiModalRegistry, ) -> None: super().__init__() - self.model_config = model_config + self.renderer_config = renderer_config + self.model_config = model_config = renderer_config.model_config self.scheduler_config = scheduler_config self.mm_registry = mm_registry self.cache = cache = processor_only_cache_from_config(model_config, mm_registry) @@ -40,7 +41,7 @@ def __init__( self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config, cache=cache) max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality( - model_config, + renderer_config, cache=cache, profiler_limits=self.mm_limits, ) From 5d7b31eea28ae11b9caa182d7899b7551f12255d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 5 Dec 2025 16:29:42 +0000 Subject: [PATCH 02/29] Also move `media_io_kwargs` Signed-off-by: DarkLight1337 --- vllm/config/model.py | 4 ---- vllm/config/multimodal.py | 4 ---- vllm/config/renderer.py | 7 +++++-- vllm/engine/arg_utils.py | 14 +++++++------- vllm/entrypoints/chat_utils.py | 9 ++------- 5 files changed, 14 insertions(+), 24 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 63bdee96957f..2cf89ed1da80 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -278,7 +278,6 @@ class ModelConfig: from the architecture of `self.model`.""" limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None enable_mm_embeds: InitVar[bool | None] = None - media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None mm_processor_kwargs: InitVar[dict[str, Any] | None] = None mm_processor_cache_gb: InitVar[float | None] = None mm_processor_cache_type: InitVar[MMCacheType | None] = None @@ -321,7 +320,6 @@ def compute_hash(self) -> str: "pooler_config", "multimodal_config", "limit_mm_per_prompt", - "media_io_kwargs", "mm_processor_kwargs", "mm_processor_cache_gb", "mm_processor_cache_type", @@ -386,7 +384,6 @@ def __post_init__( # Multimodal config init vars limit_mm_per_prompt: dict[str, int | dict[str, int]] | None, enable_mm_embeds: bool | None, - media_io_kwargs: dict[str, dict[str, Any]] | None, mm_processor_kwargs: dict[str, Any] | None, mm_processor_cache_gb: float | None, mm_processor_cache_type: MMCacheType | None, @@ -634,7 +631,6 @@ def _task_to_convert(task: TaskOption) -> ConvertType: mm_config_kwargs = dict( limit_per_prompt=limit_mm_per_prompt, enable_mm_embeds=enable_mm_embeds, - media_io_kwargs=media_io_kwargs, mm_processor_kwargs=mm_processor_kwargs, mm_processor_cache_gb=mm_processor_cache_gb, mm_processor_cache_type=mm_processor_cache_type, diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 8a2936de96d6..37e2f6b4d419 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -79,10 +79,6 @@ class MultiModalConfig: WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed. Only enable this flag for trusted users!""" - media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict) - """Additional args passed to process media inputs, keyed by modalities. 
- For example, to set num_frames for video, set - `--media-io-kwargs '{"video": {"num_frames": 40} }'`""" mm_processor_kwargs: dict[str, object] | None = None """Arguments to be forwarded to the model's processor for multi-modal data, e.g., image processor. Overrides for the multi-modal processor obtained diff --git a/vllm/config/renderer.py b/vllm/config/renderer.py index f5d75f86719e..804774149b10 100644 --- a/vllm/config/renderer.py +++ b/vllm/config/renderer.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Literal +from typing import Any, Literal from pydantic import Field from pydantic.dataclasses import dataclass @@ -45,7 +45,10 @@ class RendererConfig: io_processor_plugin: str | None = None """IOProcessor plugin name to load at model startup.""" - # Security-related + media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict) + """Additional args passed to process media inputs, keyed by modalities. + For example, to set num_frames for video, set + `--media-io-kwargs '{"video": {"num_frames": 40} }'`""" allowed_local_media_path: str = "" """Allowing API requests to read local images or videos from directories specified by the server file system. This is a security risk. Should only diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ba299ca4df4b..77b6555a0add 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -451,9 +451,6 @@ class EngineArgs: ) enable_mm_embeds: bool = MultiModalConfig.enable_mm_embeds interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings - media_io_kwargs: dict[str, dict[str, Any]] = get_field( - MultiModalConfig, "media_io_kwargs" - ) mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs disable_mm_preprocessor_cache: bool = False # DEPRECATED mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb @@ -475,6 +472,9 @@ class EngineArgs: tokenizer_revision: str | None = RendererConfig.tokenizer_revision skip_tokenizer_init: bool = RendererConfig.skip_tokenizer_init io_processor_plugin: str | None = None + media_io_kwargs: dict[str, dict[str, Any]] = get_field( + RendererConfig, "media_io_kwargs" + ) allowed_local_media_path: str = RendererConfig.allowed_local_media_path allowed_media_domains: list[str] | None = RendererConfig.allowed_media_domains # LoRA fields @@ -694,6 +694,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: renderer_group.add_argument( "--skip-tokenizer-init", **renderer_kwargs["skip_tokenizer_init"] ) + renderer_group.add_argument( + "--media-io-kwargs", **renderer_kwargs["media_io_kwargs"] + ) renderer_group.add_argument( "--allowed-local-media-path", **renderer_kwargs["allowed_local_media_path"] ) @@ -941,9 +944,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: multimodal_group.add_argument( "--enable-mm-embeds", **multimodal_kwargs["enable_mm_embeds"] ) - multimodal_group.add_argument( - "--media-io-kwargs", **multimodal_kwargs["media_io_kwargs"] - ) multimodal_group.add_argument( "--mm-processor-kwargs", **multimodal_kwargs["mm_processor_kwargs"] ) @@ -1259,7 +1259,6 @@ def create_model_config(self) -> ModelConfig: limit_mm_per_prompt=self.limit_mm_per_prompt, enable_mm_embeds=self.enable_mm_embeds, interleave_mm_strings=self.interleave_mm_strings, - media_io_kwargs=self.media_io_kwargs, skip_mm_profiling=self.skip_mm_profiling, config_format=self.config_format, 
mm_processor_kwargs=self.mm_processor_kwargs, @@ -1379,6 +1378,7 @@ def create_engine_config( tokenizer_revision=self.tokenizer_revision, skip_tokenizer_init=self.skip_tokenizer_init, io_processor_plugin=self.io_processor_plugin, + media_io_kwargs=self.media_io_kwargs, allowed_local_media_path=self.allowed_local_media_path, allowed_media_domains=self.allowed_media_domains, ) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 6b9c8844ea39..bfa5929b1396 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -857,12 +857,9 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: super().__init__() self._tracker = tracker - multimodal_config = self.model_config.multimodal_config - media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None) - self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load( envs.VLLM_MEDIA_CONNECTOR, - media_io_kwargs=media_io_kwargs, + media_io_kwargs=self.renderer_config.media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, allowed_media_domains=tracker.allowed_media_domains, ) @@ -973,11 +970,9 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: super().__init__() self._tracker = tracker - multimodal_config = self.model_config.multimodal_config - media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None) self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load( envs.VLLM_MEDIA_CONNECTOR, - media_io_kwargs=media_io_kwargs, + media_io_kwargs=self.renderer_config.media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, allowed_media_domains=tracker.allowed_media_domains, ) From 34f6d8c6891409d20145b5c72dd5a23ae787b627 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 5 Dec 2025 16:32:58 +0000 Subject: [PATCH 03/29] Fix Signed-off-by: DarkLight1337 --- vllm/entrypoints/openai/speech_to_text.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index 1ab83983eced..5fd79eed1909 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -91,7 +91,7 @@ def __init__( self.task_type = task_type self.asr_config = self.model_cls.get_speech_to_text_config( - self.model_config, task_type + self.renderer_config, task_type ) self.enable_force_include_usage = enable_force_include_usage @@ -154,7 +154,7 @@ async def _preprocess_speech_to_text( prompt = self.model_cls.get_generation_prompt( audio=chunk, stt_config=self.asr_config, - model_config=self.model_config, + renderer_config=self.renderer_config, language=language, task_type=self.task_type, request_prompt=request.prompt, @@ -428,7 +428,7 @@ async def _speech_to_text_stream_generator( if res.prompt_token_ids is not None: num_prompt_tokens = len(res.prompt_token_ids) if audio_tokens := self.model_cls.get_num_audio_tokens( - audio_duration_s, self.asr_config, self.model_config + audio_duration_s, self.asr_config, self.renderer_config ): num_prompt_tokens += audio_tokens From 33e0d97db76fbb3a51602780d64a12a45e5074dd Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 5 Dec 2025 16:40:14 +0000 Subject: [PATCH 04/29] Fix Signed-off-by: DarkLight1337 --- vllm/config/model.py | 3 ++- vllm/config/renderer.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 2cf89ed1da80..6231d5721273 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ 
-398,7 +398,8 @@ def __post_init__( self.served_model_name = get_served_model_name( self.model, self.served_model_name ) - self.model = maybe_model_redirect(self.model) + self.original_model = self.model + self.model = maybe_model_redirect(self.original_model) if isinstance(self.hf_config_path, str): self.hf_config_path = maybe_model_redirect(self.hf_config_path) diff --git a/vllm/config/renderer.py b/vllm/config/renderer.py index 804774149b10..0ac4367f72ef 100644 --- a/vllm/config/renderer.py +++ b/vllm/config/renderer.py @@ -68,7 +68,8 @@ def __post_init__(self) -> None: if not self.tokenizer_revision: self.tokenizer_revision = self.model_config.revision - self.tokenizer = maybe_model_redirect(self.tokenizer) + self.original_tokenizer = self.tokenizer + self.tokenizer = maybe_model_redirect(self.original_tokenizer) self.maybe_pull_tokenizer_for_runai(self.tokenizer) # Multimodal GGUF models must use original repo for mm processing @@ -86,7 +87,7 @@ def maybe_pull_tokenizer_for_runai(self, tokenizer: str) -> None: object_storage_tokenizer = ObjectStorageModel(url=tokenizer) object_storage_tokenizer.pull_files( - self.model_config.model, + self.model_config.original_model, ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"], ) self.tokenizer = object_storage_tokenizer.dir From cd3fa6fccb8047c936dac1fce25fdfa248b1f857 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 5 Dec 2025 16:41:12 +0000 Subject: [PATCH 05/29] Fix Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 5 +++-- vllm/v1/worker/tpu_model_runner.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ad399f4c3f9c..3b0bdcc6b280 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -270,6 +270,7 @@ def __init__( device: torch.device, ): self.vllm_config = vllm_config + self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.compilation_config = vllm_config.compilation_config @@ -557,7 +558,7 @@ def __init__( self.mm_budget = ( MultiModalBudget( - vllm_config.renderer_config, + self.renderer_config, self.scheduler_config, self.mm_registry, ) @@ -3838,7 +3839,7 @@ def _get_mm_dummy_batch( assert self.mm_budget is not None dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( - model_config=self.model_config, + model_config=self.renderer_config, seq_len=self.max_model_len, mm_counts={modality: 1}, cache=self.mm_budget.cache, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 2827017ecbae..35f3591fa25a 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -143,6 +143,7 @@ def __init__( original_parallel_config: ParallelConfig | None = None, ): self.vllm_config = vllm_config + self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.lora_config = vllm_config.lora_config @@ -353,7 +354,7 @@ def __init__( self.mm_budget = ( MultiModalBudget( - vllm_config.renderer_config, + self.renderer_config, self.scheduler_config, self.mm_registry, ) @@ -2039,7 +2040,7 @@ def _get_mm_dummy_batch( assert self.mm_budget is not None dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( - model_config=self.model_config, + model_config=self.renderer_config, seq_len=self.max_model_len, mm_counts={modality: 1}, 
cache=self.mm_budget.cache, From 4fe9c075a026509653ca260386c590b27f244bc2 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 5 Dec 2025 16:44:01 +0000 Subject: [PATCH 06/29] Fixes Signed-off-by: DarkLight1337 --- vllm/entrypoints/openai/api_server.py | 2 +- vllm/entrypoints/utils.py | 8 ++++---- vllm/model_executor/models/gemma3n_mm.py | 8 +++++--- vllm/v1/engine/input_processor.py | 2 +- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2fa6afa2bacb..91491038ca8e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1086,7 +1086,7 @@ async def init_app_state( logger.info("Supported tasks: %s", supported_tasks) resolved_chat_template = await process_chat_template( - args.chat_template, engine_client, vllm_config.model_config + args.chat_template, engine_client, vllm_config.renderer_config ) if args.tool_server == "demo": diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index daeeb995bc74..4fcf03a400d0 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -13,7 +13,7 @@ from fastapi.responses import JSONResponse, StreamingResponse from starlette.background import BackgroundTask, BackgroundTasks -from vllm.config import ModelConfig +from vllm.config import RendererConfig from vllm.engine.arg_utils import EngineArgs from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ( @@ -288,7 +288,7 @@ def process_lora_modules( async def process_chat_template( args_chat_template: Path | str | None, engine_client: EngineClient, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> str | None: resolved_chat_template = load_chat_template(args_chat_template) if resolved_chat_template is not None: @@ -305,7 +305,7 @@ async def process_chat_template( tokenizer=tokenizer, chat_template=None, tools=None, - model_config=model_config, + renderer_config=renderer_config, ) if hf_chat_template != resolved_chat_template: @@ -314,6 +314,6 @@ async def process_chat_template( "It is different from official chat template '%s'. 
" "This discrepancy may lead to performance degradation.", resolved_chat_template, - model_config.model, + renderer_config.model_config.model, ) return resolved_chat_template diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 6ae76976eb46..072a7d5dd1f3 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -18,7 +18,7 @@ ) from transformers.models.siglip import SiglipImageProcessorFast -from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig +from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.inputs.data import PromptType from vllm.logger import init_logger @@ -761,7 +761,7 @@ def get_generation_prompt( cls, audio: np.ndarray, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, language: Optional[str], task_type: Literal["transcribe", "translate"], request_prompt: str, @@ -799,7 +799,9 @@ def get_generation_prompt( @classmethod def get_speech_to_text_config( - cls, model_config: ModelConfig, task_type: str + cls, + renderer_config: RendererConfig, + task_type: str, ) -> SpeechToTextConfig: return SpeechToTextConfig( # Let's set this to 30 as suggested in the docs for now, although diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index f965a8bc380b..a2f6ba5be8c1 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -583,7 +583,7 @@ def _validate_model_input( if prompt_type == "encoder" and model_config.is_multimodal_model: mm_registry = self.input_preprocessor.mm_registry mm_processor = mm_registry.create_processor( - model_config, + self.renderer_config, tokenizer=tokenizer, ) assert isinstance(mm_processor, EncDecMultiModalProcessor) From 9eb6d283698483545541ee4e0b9669bbb9f9338a Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 5 Dec 2025 16:49:58 +0000 Subject: [PATCH 07/29] Fixes Signed-off-by: DarkLight1337 --- tests/multimodal/test_registry.py | 4 +++- vllm/entrypoints/llm.py | 3 ++- vllm/entrypoints/pooling/score/serving.py | 4 +--- vllm/entrypoints/score_utils.py | 13 ++++++++----- vllm/multimodal/cache.py | 22 ++++++++++------------ vllm/multimodal/registry.py | 5 +++-- vllm/v1/spec_decode/eagle.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/v1/worker/tpu_model_runner.py | 2 +- vllm/v1/worker/utils.py | 12 ++++++++---- 10 files changed, 38 insertions(+), 31 deletions(-) diff --git a/tests/multimodal/test_registry.py b/tests/multimodal/test_registry.py index 3b01bda7f54c..8127fac09968 100644 --- a/tests/multimodal/test_registry.py +++ b/tests/multimodal/test_registry.py @@ -31,4 +31,6 @@ def test_supports_multimodal_inputs(model_id, limit_mm_per_prompt, expected): model_id, limit_mm_per_prompt=limit_mm_per_prompt, ) - assert MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.model_config) is expected + assert ( + MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.renderer_config) is expected + ) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 62c80e41db11..5de6e1d2dc0f 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1281,6 +1281,7 @@ def _cross_encoding_score( pooling_params: PoolingParams | None = None, lora_request: list[LoRARequest] | LoRARequest | None = None, ) -> list[ScoringRequestOutput]: + renderer_config = self.renderer_config model_config = self.model_config if isinstance(tokenizer, MistralTokenizer): @@ 
-1307,7 +1308,7 @@ def _cross_encoding_score( for q, d in input_pairs: _, engine_prompt = get_score_prompt( - model_config=model_config, + renderer_config=renderer_config, data_1=q, data_2=d, tokenizer=tokenizer, diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py index e5a66783005a..f657fcefd3a8 100644 --- a/vllm/entrypoints/pooling/score/serving.py +++ b/vllm/entrypoints/pooling/score/serving.py @@ -160,10 +160,8 @@ def _preprocess_score( data_1: str | ScoreContentPartParam, data_2: str | ScoreContentPartParam, ) -> tuple[str, TokensPrompt]: - model_config = self.model_config - full_prompt, engine_prompt = get_score_prompt( - model_config=model_config, + renderer_config=self.renderer_config, data_1=data_1, data_2=data_2, tokenizer=tokenizer, diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index 072ddd4c90b1..26b6b6bf0975 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -5,7 +5,7 @@ from torch.nn import CosineSimilarity from typing_extensions import Required, TypedDict -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.entrypoints.chat_utils import ( BaseMultiModalItemTracker, ChatCompletionContentPartImageEmbedsParam, @@ -88,9 +88,9 @@ def _validate_score_input_lens( def parse_score_data( data_1: str | ScoreContentPartParam, data_2: str | ScoreContentPartParam, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> tuple[str, str, MultiModalDataDict | None]: - mm_tracker = MultiModalItemTracker(model_config) + mm_tracker = MultiModalItemTracker(renderer_config) content_1 = _parse_score_content(data_1, mm_tracker) content_2 = _parse_score_content(data_2, mm_tracker) @@ -176,7 +176,7 @@ def post_process_tokens( def get_score_prompt( - model_config: ModelConfig, + renderer_config: RendererConfig, tokenizer: TokenizerLike, tokenization_kwargs: dict[str, Any], data_1: str | ScoreContentPartParam, @@ -185,11 +185,14 @@ def get_score_prompt( prompt_1, prompt_2, mm_data = parse_score_data( data_1, data_2, - model_config, + renderer_config, ) + from vllm.model_executor.model_loader import get_model_cls + model_config = renderer_config.renderer_config model = get_model_cls(model_config) + if supports_score_template(model): full_prompt = apply_score_template(model_config, prompt_1, prompt_2) prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs) diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 67bdf5e1557f..9c838fe67958 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -31,7 +31,7 @@ ) if TYPE_CHECKING: - from vllm.config import ModelConfig, VllmConfig + from vllm.config import ModelConfig, RendererConfig, VllmConfig from .processing import ResolvedPromptUpdate from .registry import MultiModalRegistry @@ -561,13 +561,13 @@ def address_as_item( def _enable_processor_cache( - model_config: "ModelConfig", + renderer_config: "RendererConfig", mm_registry: "MultiModalRegistry", ) -> bool: - if not mm_registry.supports_multimodal_inputs(model_config): + if not mm_registry.supports_multimodal_inputs(renderer_config): return False - mm_config = model_config.get_multimodal_config() + mm_config = renderer_config.model_config.get_multimodal_config() return mm_config.mm_processor_cache_gb > 0 @@ -599,7 +599,7 @@ def processor_cache_from_config( """Return a `BaseMultiModalProcessorCache`, if enabled.""" model_config = vllm_config.model_config - if not 
_enable_processor_cache(model_config, mm_registry): + if not _enable_processor_cache(vllm_config.renderer_config, mm_registry): return None if not _enable_ipc_cache(vllm_config): @@ -611,14 +611,14 @@ def processor_cache_from_config( def processor_only_cache_from_config( - model_config: "ModelConfig", + renderer_config: "RendererConfig", mm_registry: "MultiModalRegistry", ): """Return a `MultiModalProcessorOnlyCache`, if enabled.""" - if not _enable_processor_cache(model_config, mm_registry): + if not _enable_processor_cache(renderer_config, mm_registry): return None - return MultiModalProcessorOnlyCache(model_config) + return MultiModalProcessorOnlyCache(renderer_config.model_config) class BaseMultiModalReceiverCache( @@ -787,7 +787,7 @@ def engine_receiver_cache_from_config( """ model_config = vllm_config.model_config - if not _enable_processor_cache(model_config, mm_registry): + if not _enable_processor_cache(vllm_config.renderer_config, mm_registry): return None if not _enable_ipc_cache(vllm_config): @@ -809,9 +809,7 @@ def worker_receiver_cache_from_config( Return a `BaseMultiModalReceiverCache` only when IPC caching is enabled and mm_processor_cache_type=="shm". """ - model_config = vllm_config.model_config - - if not _enable_processor_cache(model_config, mm_registry): + if not _enable_processor_cache(vllm_config.renderer_config, mm_registry): return None if not _enable_ipc_cache(vllm_config): diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 31e1c9e6af96..7d636ef17895 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -114,17 +114,18 @@ def _extract_mm_options( return mm_options if len(mm_options) > 0 else None - def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool: + def supports_multimodal_inputs(self, renderer_config: "RendererConfig") -> bool: """ Checks if the model supports multimodal inputs. Returns True if the model is multimodal with any non-zero supported modalities, otherwise returns False, effectively running in text-only mode. 
""" + model_config = renderer_config.model_config if not model_config.is_multimodal_model: return False - info = self._create_processing_info(model_config, tokenizer=None) + info = self._create_processing_info(renderer_config, tokenizer=None) supported_modalities = info.get_supported_mm_limits() mm_config = model_config.get_multimodal_config() diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 31428db2d3af..7976418510aa 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -85,7 +85,7 @@ def __init__( # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( - vllm_config.model_config + vllm_config.renderer_config ) self.attn_metadata_builder: AttentionMetadataBuilder | None = None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3b0bdcc6b280..f824f99f9fcd 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -335,7 +335,7 @@ def __init__( self.uses_mrope = model_config.uses_mrope self.uses_xdrope_dim = model_config.uses_xdrope_dim self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( - model_config + self.renderer_config ) if self.model_config.is_encoder_decoder: diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 35f3591fa25a..4d411e52b13e 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -223,7 +223,7 @@ def __init__( self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( - model_config + self.model_config ) # TODO: Support M-RoPE (e.g, Qwen2-VL) assert not self.uses_mrope, "TPU does not support M-RoPE yet." 
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 22d98cff14d0..ba8260927f25 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -30,15 +30,19 @@ def __init__( super().__init__() self.renderer_config = renderer_config - self.model_config = model_config = renderer_config.model_config + self.model_config = renderer_config.model_config self.scheduler_config = scheduler_config self.mm_registry = mm_registry - self.cache = cache = processor_only_cache_from_config(model_config, mm_registry) + self.cache = cache = processor_only_cache_from_config( + renderer_config, mm_registry + ) - self.max_model_len = model_config.max_model_len + self.max_model_len = self.model_config.max_model_len self.max_num_reqs = scheduler_config.max_num_seqs - self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config, cache=cache) + self.mm_limits = mm_registry.get_mm_limits_per_prompt( + renderer_config, cache=cache + ) max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality( renderer_config, From b88ad6058752298be04222f73c632459457627a3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 5 Dec 2025 16:54:26 +0000 Subject: [PATCH 08/29] Skip validation to pass doc build Signed-off-by: DarkLight1337 --- vllm/config/renderer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/config/renderer.py b/vllm/config/renderer.py index 0ac4367f72ef..dc87c780f871 100644 --- a/vllm/config/renderer.py +++ b/vllm/config/renderer.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Literal -from pydantic import Field +from pydantic import Field, SkipValidation from pydantic.dataclasses import dataclass from vllm.config.model import ModelConfig @@ -19,7 +19,7 @@ class RendererConfig: """Configuration for the renderer.""" - model_config: ModelConfig = Field(default_factory=ModelConfig) + model_config: SkipValidation[ModelConfig] = None # type: ignore """Provides model context to the renderer.""" tokenizer: str = "" From da64ef34b3080d68a960f460a724ec88965a37dd Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 5 Dec 2025 16:56:52 +0000 Subject: [PATCH 09/29] Typo Signed-off-by: DarkLight1337 --- docs/contributing/model/transcription.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md index baaa790d611e..9cff901767e3 100644 --- a/docs/contributing/model/transcription.md +++ b/docs/contributing/model/transcription.md @@ -183,7 +183,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics: cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - renderer_config: renderer_config, + renderer_config: RendererConfig, ) -> int | None: # Return None if unknown; otherwise return an estimate. 
return int(audio_duration_s * stt_config.sample_rate // 320) # example From 9bff3a0460f3bc7e54a7852cc7f12db401865957 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 5 Dec 2025 16:57:33 +0000 Subject: [PATCH 10/29] Fix arg Signed-off-by: DarkLight1337 --- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/v1/worker/tpu_model_runner.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f824f99f9fcd..2424f6d404de 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3839,7 +3839,7 @@ def _get_mm_dummy_batch( assert self.mm_budget is not None dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( - model_config=self.renderer_config, + renderer_config=self.renderer_config, seq_len=self.max_model_len, mm_counts={modality: 1}, cache=self.mm_budget.cache, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 4d411e52b13e..7e408ae69dd6 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -2040,7 +2040,7 @@ def _get_mm_dummy_batch( assert self.mm_budget is not None dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( - model_config=self.renderer_config, + renderer_config=self.renderer_config, seq_len=self.max_model_len, mm_counts={modality: 1}, cache=self.mm_budget.cache, From e46256e425d2219e3db4974641808a479f360077 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 5 Dec 2025 16:58:04 +0000 Subject: [PATCH 11/29] Fix Signed-off-by: DarkLight1337 --- vllm/v1/worker/tpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 7e408ae69dd6..8c2be79bdfb2 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -223,7 +223,7 @@ def __init__( self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( - self.model_config + self.renderer_config ) # TODO: Support M-RoPE (e.g, Qwen2-VL) assert not self.uses_mrope, "TPU does not support M-RoPE yet." 
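
Taken together, the commits above converge on one call-site pattern: helpers that only need tokenizer or rendering settings accept a `RendererConfig` and reach model-level attributes through its `model_config` back-reference. A minimal sketch of that pattern follows, assuming only the signatures visible in the diffs (`cached_tokenizer_from_config` taking a `RendererConfig`, and `renderer_config.model_config` exposing the wrapped `ModelConfig`); the `describe_renderer` helper itself is hypothetical and not part of this series.

    # Hypothetical helper, for illustration of the call-site pattern only.
    from vllm.config import RendererConfig
    from vllm.tokenizers import cached_tokenizer_from_config


    def describe_renderer(renderer_config: RendererConfig) -> str:
        # Tokenizer settings (name, mode, revision, trust_remote_code) now live
        # on RendererConfig itself, so a single argument is enough here.
        tokenizer = cached_tokenizer_from_config(renderer_config)
        # Model-level attributes are reached via the back-reference rather than
        # being duplicated on the renderer config.
        model_name = renderer_config.model_config.model
        return f"{model_name} renders with {type(tokenizer).__name__}"
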
From 468274f5d422cca8cc93f55a1f0f6133a6e364d4 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 5 Dec 2025 17:00:21 +0000 Subject: [PATCH 12/29] Fix protocol Signed-off-by: DarkLight1337 --- docs/contributing/model/transcription.md | 2 +- vllm/engine/protocol.py | 3 ++- vllm/model_executor/models/interfaces.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md index 9cff901767e3..c5605789022d 100644 --- a/docs/contributing/model/transcription.md +++ b/docs/contributing/model/transcription.md @@ -216,7 +216,7 @@ Relevant server logic: prompt = self.model_cls.get_generation_prompt( audio=chunk, stt_config=self.asr_config, - model_config=self.model_config, + renderer_config=self.renderer_config, language=language, task_type=self.task_type, request_prompt=request.prompt, diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 1b6330c9f9b6..795f991836fc 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -5,7 +5,7 @@ from collections.abc import AsyncGenerator, Iterable, Mapping from typing import Any -from vllm.config import ModelConfig, VllmConfig +from vllm.config import ModelConfig, RendererConfig, VllmConfig from vllm.inputs.data import PromptType from vllm.lora.request import LoRARequest from vllm.outputs import PoolingRequestOutput, RequestOutput @@ -22,6 +22,7 @@ class EngineClient(ABC): """Protocol class for Clients to Engine""" vllm_config: VllmConfig + renderer_config: RendererConfig model_config: ModelConfig input_processor: InputProcessor io_processor: IOProcessor | None diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 34a4104185d7..a13e8575ced8 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -19,7 +19,7 @@ from transformers.models.whisper.tokenization_whisper import LANGUAGES from typing_extensions import Self, TypeIs -from vllm.config import ModelConfig, RendererConfig, SpeechToTextConfig +from vllm.config import RendererConfig, SpeechToTextConfig from vllm.inputs import TokensPrompt from vllm.inputs.data import PromptType from vllm.logger import init_logger @@ -859,7 +859,7 @@ def get_generation_prompt( cls, audio: np.ndarray, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, From 916b051b5560d8b2f74965b538858daea5554b52 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 5 Dec 2025 17:01:11 +0000 Subject: [PATCH 13/29] Fix Signed-off-by: DarkLight1337 --- vllm/config/vllm.py | 2 +- vllm/multimodal/registry.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 1c882741c88b..d72d7f47d933 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -726,7 +726,7 @@ def has_blocked_weights(): from vllm.multimodal import MULTIMODAL_REGISTRY self.scheduler_config.max_num_encoder_input_tokens = ( - MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config) + MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.renderer_config) ) logger.debug( "Encoder-decoder model detected: setting " diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 7d636ef17895..e49aaa5045c6 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -330,13 +330,15 @@ def get_encoder_dummy_data( return dummy_data 
- def get_encdec_max_encoder_len(self, model_config: "ModelConfig") -> int: + def get_encdec_max_encoder_len(self, renderer_config: "RendererConfig") -> int: """ Get the maximum length of the encoder input for encoder-decoder models. """ + model_config = renderer_config.model_config if not model_config.is_encoder_decoder: return 0 - max_tokens = self.get_max_tokens_per_item_by_modality(model_config) + + max_tokens = self.get_max_tokens_per_item_by_modality(renderer_config) if not max_tokens: # TODO - this function assumes encoder-decoder models are # multimodal. This will need to change when adding support for more From 578527ad64e6f39c116bf42501636c3fcd561e22 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 5 Dec 2025 17:04:19 +0000 Subject: [PATCH 14/29] Typo Signed-off-by: DarkLight1337 --- vllm/entrypoints/score_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index 26b6b6bf0975..561adbe454f3 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -190,7 +190,7 @@ def get_score_prompt( from vllm.model_executor.model_loader import get_model_cls - model_config = renderer_config.renderer_config + model_config = renderer_config.model_config model = get_model_cls(model_config) if supports_score_template(model): From b299583bbb118f3ab31147dfae479bbc20852306 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 5 Dec 2025 17:28:06 +0000 Subject: [PATCH 15/29] Update Signed-off-by: DarkLight1337 --- vllm/config/renderer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/config/renderer.py b/vllm/config/renderer.py index dc87c780f871..c230a67fa9b1 100644 --- a/vllm/config/renderer.py +++ b/vllm/config/renderer.py @@ -64,7 +64,7 @@ def trust_remote_code(self) -> bool: def __post_init__(self) -> None: # The tokenizer is consistent with the model by default. 
if not self.tokenizer: - self.tokenizer = self.model_config.model + self.tokenizer = self.model_config.original_model if not self.tokenizer_revision: self.tokenizer_revision = self.model_config.revision @@ -87,7 +87,7 @@ def maybe_pull_tokenizer_for_runai(self, tokenizer: str) -> None: object_storage_tokenizer = ObjectStorageModel(url=tokenizer) object_storage_tokenizer.pull_files( - self.model_config.original_model, + tokenizer, ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"], ) self.tokenizer = object_storage_tokenizer.dir From 713a0c6626015935b3e6f9fd37d3cd1f403ffc46 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 5 Dec 2025 17:38:51 +0000 Subject: [PATCH 16/29] Improve type annotation Signed-off-by: DarkLight1337 --- vllm/config/model.py | 4 ++-- vllm/config/vllm.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 6231d5721273..f4d8d8fb3492 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1640,7 +1640,7 @@ def embedding_size(self): def recalculate_max_model_len( self, - max_model_len: int, + original_max_model_len: int | None, *, tokenizer: str | None = None, tokenizer_revision: str | None = None, @@ -1663,7 +1663,7 @@ def recalculate_max_model_len( self.max_model_len = _get_and_verify_max_len( hf_config=self.hf_text_config, tokenizer_config=tokenizer_config, - max_model_len=max_model_len, + max_model_len=original_max_model_len, disable_sliding_window=self.disable_sliding_window, sliding_window=self.get_sliding_window(), spec_target_max_model_len=self.spec_target_max_model_len, diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index d72d7f47d933..a611751713f0 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1122,10 +1122,10 @@ def _set_cudagraph_sizes(self): # complete the remaining process. 
self.compilation_config.post_init_cudagraph_sizes() - def recalculate_max_model_len(self, max_model_len: int): + def recalculate_max_model_len(self, original_max_model_len: int | None) -> None: # Can only be called during try_verify_and_update_config self.model_config.recalculate_max_model_len( - max_model_len, + original_max_model_len, tokenizer=self.renderer_config.tokenizer, tokenizer_revision=self.renderer_config.tokenizer_revision, ) From 7c2913d1ecce422e6a54ff87f707fc359331d079 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 6 Dec 2025 03:16:20 +0000 Subject: [PATCH 17/29] Fix intialization for tests Signed-off-by: DarkLight1337 --- .../distributed/test_sequence_parallelism.py | 2 ++ tests/compile/test_functionalization.py | 6 ++++- tests/compile/test_fusion.py | 6 ++++- tests/compile/test_fusion_attn.py | 2 ++ tests/compile/test_pass_manager.py | 8 ++++-- tests/compile/test_qk_norm_rope_fusion.py | 5 +++- tests/distributed/test_kvlayout.py | 3 +++ tests/lora/test_lora_manager.py | 14 +++++++--- tests/lora/test_worker.py | 2 ++ tests/models/language/pooling/test_gritlm.py | 5 ++-- .../processing/test_tensor_schema.py | 7 +++-- tests/multimodal/test_cache.py | 27 +++++++++++-------- tests/v1/attention/utils.py | 2 ++ tests/v1/core/test_kv_cache_utils.py | 20 +++++++++++--- tests/v1/core/test_scheduler.py | 2 ++ tests/v1/core/utils.py | 2 ++ tests/v1/engine/test_engine_core.py | 2 ++ .../engine/test_process_multi_modal_uuids.py | 9 ++++++- tests/v1/kv_connector/unit/utils.py | 2 ++ tests/v1/spec_decode/test_eagle.py | 2 ++ tests/v1/spec_decode/test_mtp.py | 2 ++ tests/v1/spec_decode/test_ngram.py | 2 ++ .../test_backend_guidance.py | 12 ++++++--- tests/v1/tpu/worker/test_tpu_model_runner.py | 2 ++ tests/v1/worker/test_gpu_model_runner.py | 3 +++ vllm/config/renderer.py | 2 ++ vllm/engine/arg_utils.py | 2 +- 27 files changed, 121 insertions(+), 32 deletions(-) diff --git a/tests/compile/distributed/test_sequence_parallelism.py b/tests/compile/distributed/test_sequence_parallelism.py index d9fdc3acc3d6..77d3a24d4292 100644 --- a/tests/compile/distributed/test_sequence_parallelism.py +++ b/tests/compile/distributed/test_sequence_parallelism.py @@ -17,6 +17,7 @@ DeviceConfig, ModelConfig, PassConfig, + RendererConfig, VllmConfig, get_current_vllm_config, set_current_vllm_config, @@ -276,6 +277,7 @@ def sequence_parallelism_pass_on_test_model( vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), device_config=device_config, compilation_config=compilation_config, ) diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 758591589270..52d6fd1e5d75 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -15,6 +15,7 @@ CompilationConfig, ModelConfig, PassConfig, + RendererConfig, VllmConfig, set_current_vllm_config, ) @@ -219,8 +220,11 @@ def test_fix_functionalization( torch.set_default_device("cuda") torch.set_default_dtype(dtype) + model_config = ModelConfig(dtype=dtype) + vllm_config = VllmConfig( - model_config=ModelConfig(dtype=dtype), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), compilation_config=CompilationConfig( custom_ops=["all"], pass_config=PassConfig( diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index d0ba8385f4a0..bb4ee6b8e3ec 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -15,6 +15,7 @@ CompilationMode, ModelConfig, 
PassConfig, + RendererConfig, VllmConfig, ) from vllm.model_executor.layers.layernorm import RMSNorm @@ -154,8 +155,11 @@ def test_fusion_rmsnorm_quant( custom_ops.append("+rms_norm") if enable_quant_fp8_custom_op: custom_ops.append("+quant_fp8") + + model_config = ModelConfig(dtype=dtype) vllm_config = VllmConfig( - model_config=ModelConfig(dtype=dtype), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, custom_ops=custom_ops, diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index db95dff5e0fc..f87825db2981 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -24,6 +24,7 @@ CompilationMode, ModelConfig, PassConfig, + RendererConfig, SchedulerConfig, VllmConfig, set_current_vllm_config, @@ -325,6 +326,7 @@ def test_attention_quant_pattern( ) vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), scheduler_config=SchedulerConfig( max_num_seqs=1024, max_model_len=model_config.max_model_len, diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py index 6d0ba6b65503..c95e9e3ff8ae 100644 --- a/tests/compile/test_pass_manager.py +++ b/tests/compile/test_pass_manager.py @@ -7,7 +7,7 @@ from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass from vllm.compilation.pass_manager import PostGradPassManager -from vllm.config import ModelConfig, VllmConfig +from vllm.config import ModelConfig, RendererConfig, VllmConfig # dummy custom pass that doesn't inherit @@ -43,7 +43,11 @@ def __call__(self, graph: torch.fx.graph.Graph) -> None: ) def test_pass_manager_uuid(callable): # Some passes need dtype to be set - config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16)) + model_config = ModelConfig(dtype=torch.bfloat16) + config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + ) pass_manager = PostGradPassManager() pass_manager.configure(config) diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py index e0968ac79925..4d109015be48 100644 --- a/tests/compile/test_qk_norm_rope_fusion.py +++ b/tests/compile/test_qk_norm_rope_fusion.py @@ -19,6 +19,7 @@ CompilationMode, ModelConfig, PassConfig, + RendererConfig, VllmConfig, set_current_vllm_config, ) @@ -133,8 +134,10 @@ def test_qk_norm_rope_fusion( if enable_rope_custom_op: custom_ops.append("+rotary_embedding") + model_config = ModelConfig(dtype=dtype) vllm_config = VllmConfig( - model_config=ModelConfig(dtype=dtype), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, custom_ops=custom_ops, diff --git a/tests/distributed/test_kvlayout.py b/tests/distributed/test_kvlayout.py index b190b2820451..0d51a51a5080 100644 --- a/tests/distributed/test_kvlayout.py +++ b/tests/distributed/test_kvlayout.py @@ -5,6 +5,7 @@ DeviceConfig, KVTransferConfig, ModelConfig, + RendererConfig, VllmConfig, set_current_vllm_config, ) @@ -47,6 +48,7 @@ def test_get_kv_connector_cache_layout_with_nixl_connector(): vllm_config = VllmConfig( device_config=DeviceConfig("cpu"), model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), kv_transfer_config=kv_transfer_config, ) with set_current_vllm_config(vllm_config): @@ -70,6 +72,7 @@ def 
test_get_kv_connector_cache_layout_with_multi_connector(): vllm_config = VllmConfig( device_config=DeviceConfig("cpu"), model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), kv_transfer_config=kv_transfer_config, ) with set_current_vllm_config(vllm_config): diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 081f14d6fabf..7158120fc021 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -8,7 +8,7 @@ from safetensors.torch import load_file from torch import nn -from vllm.config import ModelConfig, VllmConfig +from vllm.config import ModelConfig, RendererConfig, VllmConfig from vllm.config.lora import LoRAConfig from vllm.lora.layers import ( ColumnParallelLinearWithLoRA, @@ -422,7 +422,11 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa ) model_config = ModelConfig(max_model_len=16) - vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config) + vllm_config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + lora_config=lora_config, + ) vllm_config.scheduler_config.max_num_seqs = 4 vllm_config.scheduler_config.max_num_batched_tokens = 2 @@ -525,7 +529,11 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path ) model_config = ModelConfig(max_model_len=16) - vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config) + vllm_config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + lora_config=lora_config, + ) vllm_config.scheduler_config.max_num_seqs = 4 vllm_config.scheduler_config.max_num_batched_tokens = 2 diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 54059ec56190..42d8c6202e79 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -11,6 +11,7 @@ DeviceConfig, ModelConfig, ParallelConfig, + RendererConfig, SchedulerConfig, VllmConfig, ) @@ -43,6 +44,7 @@ def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]): vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), load_config=LoadConfig( download_dir=None, load_format="dummy", diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py index 0adc9b5cf25f..11ee00358548 100644 --- a/tests/models/language/pooling/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -6,7 +6,7 @@ from scipy.spatial.distance import cosine from vllm import LLM, SamplingParams -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from ....utils import RemoteOpenAIServer @@ -31,7 +31,8 @@ def test_find_array(): dtype="bfloat16", seed=0, ) - pooling = GritLMMeanPool(model_config=model_config) + renderer_config = RendererConfig(model_config=model_config) + pooling = GritLMMeanPool(renderer_config=renderer_config) arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 7eda0e6bdb85..ec515069fa54 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -11,7 +11,7 @@ import torch.nn as nn from PIL import Image -from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config +from vllm.config import ModelConfig, RendererConfig, VllmConfig, 
set_current_vllm_config from vllm.config.multimodal import ( AudioDummyOptions, BaseDummyOptions, @@ -149,7 +149,10 @@ def initialize_dummy_model( backend="nccl", ) initialize_model_parallel(tensor_model_parallel_size=1) - vllm_config = VllmConfig(model_config=model_config) + vllm_config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + ) with set_current_vllm_config(vllm_config=vllm_config): with set_default_torch_dtype(model_config.dtype): model = model_cls(vllm_config=vllm_config) diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index e4fcc34740ed..77c8eb6a7474 100644 --- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -6,7 +6,7 @@ import pytest import torch -from vllm.config import ModelConfig, ParallelConfig, VllmConfig +from vllm.config import ModelConfig, ParallelConfig, RendererConfig, VllmConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import ( BaseMultiModalProcessorCache, @@ -110,11 +110,14 @@ def _create_vllm_config( mm_processor_cache_gb: float, enable_ipc: bool, ): + model_config = ModelConfig( + model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", + mm_processor_cache_gb=mm_processor_cache_gb, + ) + return VllmConfig( - model_config=ModelConfig( - model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", - mm_processor_cache_gb=mm_processor_cache_gb, - ), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), parallel_config=ParallelConfig(data_parallel_size=1 if enable_ipc else 2), ) @@ -506,13 +509,15 @@ def _run_test_cache_eviction_shm( def test_cache_eviction_shm_cache(): + model_config = ModelConfig( + model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", + mm_processor_cache_type="shm", + mm_shm_cache_max_object_size_mb=6, + mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes, + ) vllm_config = VllmConfig( - model_config=ModelConfig( - model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", - mm_processor_cache_type="shm", - mm_shm_cache_max_object_size_mb=6, - mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes, - ), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), ) sender_cache = ShmObjectStoreSenderCache(vllm_config) receiver_cache = ShmObjectStoreReceiverCache(vllm_config, mp.Lock()) diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index 6cab129c116c..49307e3e5437 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -16,6 +16,7 @@ LoadConfig, ModelConfig, ParallelConfig, + RendererConfig, SchedulerConfig, VllmConfig, ) @@ -216,6 +217,7 @@ def create_vllm_config( return VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, parallel_config=parallel_config, scheduler_config=scheduler_config, diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index fd5cf6d3e74a..4a414bca591d 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -8,7 +8,7 @@ import torch import vllm.v1.core.kv_cache_utils as kv_cache_utils -from vllm.config import ModelConfig, SchedulerConfig, VllmConfig +from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import ( MultiModalFeatureSpec, @@ -667,7 +667,10 @@ def test_metrics_empty_stats(): def test_get_kv_cache_configs_multiple_workers(): model_config = 
ModelConfig(max_model_len=16) - vllm_config = VllmConfig(model_config=model_config) + vllm_config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + ) ref_kv_cache_spec = new_kv_cache_spec() same_kv_cache_specs = [ @@ -1136,6 +1139,7 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len) vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), scheduler_config=scheduler_config, ) @@ -1175,6 +1179,7 @@ def test_get_max_concurrency_for_kv_cache_config(): vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), scheduler_config=scheduler_config, ) @@ -1293,7 +1298,10 @@ def test_allocate_with_lookahead(): def test_get_kv_cache_config_one_worker(): # pass max_model_len to pass check_enough_kv_cache_memory model_config = ModelConfig(max_model_len=16) - vllm_config = VllmConfig(model_config=model_config) + vllm_config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + ) mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2 # all layers are full attention -> single group @@ -1584,7 +1592,11 @@ def test_get_kv_cache_config_one_worker(): def test_get_kv_cache_configs_attention_free(): kv_cache_specs: dict[str, KVCacheSpec] = {} - vllm_config = VllmConfig(model_config=ModelConfig(max_model_len=16)) + model_config = ModelConfig(max_model_len=16) + vllm_config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + ) kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [0]) assert kv_cache_configs == [ KVCacheConfig( diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index c6c4a5085bff..1505415a6361 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -11,6 +11,7 @@ ECTransferConfig, KVTransferConfig, ModelConfig, + RendererConfig, SchedulerConfig, SpeculativeConfig, VllmConfig, @@ -1563,6 +1564,7 @@ def create_scheduler_with_priority( vllm_config = VllmConfig( scheduler_config=scheduler_config, model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, kv_transfer_config=kv_transfer_config, speculative_config=speculative_config, diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index f5ba613d38db..086885c29814 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -9,6 +9,7 @@ ECTransferConfig, KVTransferConfig, ModelConfig, + RendererConfig, SchedulerConfig, SpeculativeConfig, VllmConfig, @@ -132,6 +133,7 @@ def create_scheduler( vllm_config = VllmConfig( scheduler_config=scheduler_config, model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, kv_transfer_config=kv_transfer_config, speculative_config=speculative_config, diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 48be8c15aba9..c606100a12bf 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -15,6 +15,7 @@ ECTransferConfig, KVTransferConfig, ModelConfig, + RendererConfig, SchedulerConfig, VllmConfig, ) @@ -522,6 +523,7 @@ def test_encoder_instance_zero_kv_cache( vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, scheduler_config=scheduler_config, 
kv_transfer_config=kv_transfer_config, diff --git a/tests/v1/engine/test_process_multi_modal_uuids.py b/tests/v1/engine/test_process_multi_modal_uuids.py index 1b11b8af49d1..fa079179fead 100644 --- a/tests/v1/engine/test_process_multi_modal_uuids.py +++ b/tests/v1/engine/test_process_multi_modal_uuids.py @@ -5,7 +5,13 @@ from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig +from vllm.config import ( + CacheConfig, + DeviceConfig, + ModelConfig, + RendererConfig, + VllmConfig, +) from vllm.sampling_params import SamplingParams from vllm.v1.engine import input_processor as input_processor_mod from vllm.v1.engine.input_processor import InputProcessor @@ -60,6 +66,7 @@ def __init__(self, gb: float): model_config.multimodal_config = _MockMMConfig(mm_cache_gb) # type: ignore[attr-defined] vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching), device_config=DeviceConfig(device="cpu"), ) diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 58f1a7282352..768b338b5fe5 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -15,6 +15,7 @@ DeviceConfig, KVTransferConfig, ModelConfig, + RendererConfig, SchedulerConfig, VllmConfig, ) @@ -127,6 +128,7 @@ def create_vllm_config( return VllmConfig( scheduler_config=scheduler_config, model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, kv_transfer_config=kv_transfer_config, device_config=DeviceConfig("cpu"), diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 616e57de339e..888ea0169b75 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -19,6 +19,7 @@ DeviceConfig, ModelConfig, ParallelConfig, + RendererConfig, SchedulerConfig, SpeculativeConfig, VllmConfig, @@ -61,6 +62,7 @@ def _create_proposer( vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=CacheConfig(), speculative_config=speculative_config, device_config=DeviceConfig(device=current_platform.device_type), diff --git a/tests/v1/spec_decode/test_mtp.py b/tests/v1/spec_decode/test_mtp.py index 3b8813ceb818..4483c8243853 100644 --- a/tests/v1/spec_decode/test_mtp.py +++ b/tests/v1/spec_decode/test_mtp.py @@ -18,6 +18,7 @@ DeviceConfig, ModelConfig, ParallelConfig, + RendererConfig, SchedulerConfig, SpeculativeConfig, VllmConfig, @@ -46,6 +47,7 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer: vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=CacheConfig(), speculative_config=speculative_config, device_config=DeviceConfig(device=current_platform.device_type), diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py index 6bc412abe869..2e365e08a4e7 100644 --- a/tests/v1/spec_decode/test_ngram.py +++ b/tests/v1/spec_decode/test_ngram.py @@ -4,6 +4,7 @@ from vllm.config import ( ModelConfig, + RendererConfig, SpeculativeConfig, VllmConfig, ) @@ -69,6 +70,7 @@ def get_ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer: return NgramProposer( vllm_config=VllmConfig( model_config=model_config, + 
renderer_config=RendererConfig(model_config=model_config), speculative_config=SpeculativeConfig( prompt_lookup_min=min_n, prompt_lookup_max=max_n, diff --git a/tests/v1/structured_output/test_backend_guidance.py b/tests/v1/structured_output/test_backend_guidance.py index 4c01560fc88c..baef2459f8df 100644 --- a/tests/v1/structured_output/test_backend_guidance.py +++ b/tests/v1/structured_output/test_backend_guidance.py @@ -6,7 +6,7 @@ import pytest from transformers import AutoTokenizer -from vllm.config import StructuredOutputsConfig, VllmConfig +from vllm.config import RendererConfig, StructuredOutputsConfig, VllmConfig from vllm.config.model import ModelConfig from vllm.config.parallel import ParallelConfig from vllm.config.speculative import SpeculativeConfig @@ -72,8 +72,11 @@ def test_backend_guidance_rollback_terminated(): def test_grammar_bitmask_with_specdec(): tokenizer = AutoTokenizer.from_pretrained(TOKENIZER) prompt = tokenizer.encode('{"a": "b"}') + + model_config = ModelConfig(tokenizer=TOKENIZER) vllm_config = VllmConfig( - model_config=ModelConfig(tokenizer=TOKENIZER), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER), structured_outputs_config=StructuredOutputsConfig(backend="guidance"), speculative_config=SpeculativeConfig(model="[ngram]", num_speculative_tokens=3), ) @@ -137,8 +140,11 @@ def test_grammar_init_async_and_sync(async_grammar): # Use "external_launcher" for sync mode, None for async mode executor_backend = None if async_grammar else "external_launcher" + + model_config = ModelConfig(tokenizer=TOKENIZER) vllm_config = VllmConfig( - model_config=ModelConfig(tokenizer=TOKENIZER), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER), structured_outputs_config=StructuredOutputsConfig(backend="guidance"), parallel_config=ParallelConfig(distributed_executor_backend=executor_backend), ) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index cfc06666e798..080d23863652 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -7,6 +7,7 @@ from vllm.config import ( CacheConfig, ModelConfig, + RendererConfig, SchedulerConfig, VllmConfig, set_current_vllm_config, @@ -45,6 +46,7 @@ def get_vllm_config(): ) vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, scheduler_config=scheduler_config, ) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 7b8c4268a523..464e3ab99c76 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -13,6 +13,7 @@ CacheConfig, ModelConfig, ParallelConfig, + RendererConfig, SchedulerConfig, VllmConfig, set_current_vllm_config, @@ -101,6 +102,7 @@ def get_vllm_config(): parallel_config = ParallelConfig() vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, scheduler_config=scheduler_config, parallel_config=parallel_config, @@ -811,6 +813,7 @@ def test_hybrid_attention_mamba_tensor_shapes(): attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASHINFER) vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, scheduler_config=scheduler_config, 
parallel_config=parallel_config, diff --git a/vllm/config/renderer.py b/vllm/config/renderer.py index c230a67fa9b1..d0182f5b9b08 100644 --- a/vllm/config/renderer.py +++ b/vllm/config/renderer.py @@ -19,6 +19,8 @@ class RendererConfig: """Configuration for the renderer.""" + # NOTE: In reality, this is a required argument. + # We provide a dummy default value here to generate the CLI args. model_config: SkipValidation[ModelConfig] = None # type: ignore """Provides model context to the renderer.""" diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 152887eb35b9..1b8258eaf05d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1398,7 +1398,7 @@ def create_engine_config( model_config = self.create_model_config() renderer_config = RendererConfig( model_config=model_config, - tokenizer=self.tokenizer, + tokenizer=self.tokenizer or "", tokenizer_mode=self.tokenizer_mode, tokenizer_revision=self.tokenizer_revision, skip_tokenizer_init=self.skip_tokenizer_init, From 2055a7f2acfb8a93a8be7548fde066b922c339f1 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 6 Dec 2025 03:19:59 +0000 Subject: [PATCH 18/29] Fix Signed-off-by: DarkLight1337 --- vllm/config/renderer.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/vllm/config/renderer.py b/vllm/config/renderer.py index d0182f5b9b08..46836b700bee 100644 --- a/vllm/config/renderer.py +++ b/vllm/config/renderer.py @@ -64,18 +64,31 @@ def trust_remote_code(self) -> bool: return self.model_config.trust_remote_code def __post_init__(self) -> None: + model_config = self.model_config + # The tokenizer is consistent with the model by default. if not self.tokenizer: - self.tokenizer = self.model_config.original_model + self.tokenizer = ( + ModelConfig.model + if model_config is None + else model_config.original_model + ) if not self.tokenizer_revision: - self.tokenizer_revision = self.model_config.revision + self.tokenizer_revision = ( + ModelConfig.revision if model_config is None else model_config.revision + ) self.original_tokenizer = self.tokenizer self.tokenizer = maybe_model_redirect(self.original_tokenizer) self.maybe_pull_tokenizer_for_runai(self.tokenizer) # Multimodal GGUF models must use original repo for mm processing - if is_gguf(self.tokenizer) and self.model_config.is_multimodal_model: + is_multimodal_model = ( + ModelConfig.is_multimodal_model + if model_config is None + else model_config.is_multimodal_model + ) + if is_gguf(self.tokenizer) and is_multimodal_model: raise ValueError( "Loading a multimodal GGUF model needs to use original " "tokenizer. 
Please specify the unquantized hf model's " From fbc6e718d88d009b75108041d744724876ad0fee Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 6 Dec 2025 03:53:10 +0000 Subject: [PATCH 19/29] Fix Signed-off-by: DarkLight1337 --- tests/models/registry.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 1d3e15207d70..e2cb5bcbc6c9 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -176,15 +176,17 @@ def build_model_config(self, model: str | None = None, **kwargs) -> ModelConfig: model = self.default return ModelConfig( - model=model, - revision=self.revision, - trust_remote_code=self.trust_remote_code, - hf_overrides=self.hf_overrides, - enable_prompt_embeds=self.require_embed_inputs, - enable_mm_embeds=self.require_embed_inputs, - enforce_eager=self.enforce_eager, - dtype=self.dtype, - **kwargs, + **{ + "model": model, + "revision": self.revision, + "trust_remote_code": self.trust_remote_code, + "hf_overrides": self.hf_overrides, + "enable_prompt_embeds": self.require_embed_inputs, + "enable_mm_embeds": self.require_embed_inputs, + "enforce_eager": self.enforce_eager, + "dtype": self.dtype, + **kwargs, + } ) def build_renderer_config( From 7498f243697823631794a99e568a75cbfdf93114 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 6 Dec 2025 04:58:08 +0000 Subject: [PATCH 20/29] Avoid breaking compat with lm-eval Signed-off-by: DarkLight1337 --- tests/entrypoints/test_chat_utils.py | 8 ++++---- vllm/entrypoints/chat_utils.py | 22 +++++++++++++--------- vllm/entrypoints/utils.py | 2 +- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 5f5580fa181b..884817901839 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -1794,7 +1794,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): tokenizer, chat_template=None, tools=tools, - renderer_config=renderer_config, + model_config=renderer_config.model_config, ) assert isinstance(chat_template, str) @@ -1868,7 +1868,7 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa tokenizer, chat_template=None, tools=tools, - renderer_config=renderer_config, + model_config=renderer_config.model_config, ) with pytest.raises( ValueError, match="Found unexpected chat template kwargs from request" @@ -1951,7 +1951,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): tokenizer, chat_template=None, tools=None, - renderer_config=renderer_config, + model_config=renderer_config.model_config, ) assert isinstance(chat_template, str) @@ -1999,7 +1999,7 @@ def test_resolve_content_format_fallbacks(model, expected_format): tokenizer, chat_template=None, tools=None, - renderer_config=renderer_config, + model_config=renderer_config.model_config, ) assert isinstance(chat_template, str) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index bfa5929b1396..ea1ca09a0c9a 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -452,9 +452,10 @@ def resolve_mistral_chat_template( def _try_get_processor_chat_template( tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast, - renderer_config: RendererConfig, + *, + trust_remote_code: bool, ) -> str | None: - cache_key = (tokenizer.name_or_path, renderer_config.trust_remote_code) + cache_key = (tokenizer.name_or_path, trust_remote_code) if cache_key 
in _PROCESSOR_CHAT_TEMPLATES: return _PROCESSOR_CHAT_TEMPLATES[cache_key] @@ -466,7 +467,7 @@ def _try_get_processor_chat_template( PreTrainedTokenizerFast, ProcessorMixin, ), - trust_remote_code=renderer_config.trust_remote_code, + trust_remote_code=trust_remote_code, ) if ( isinstance(processor, ProcessorMixin) @@ -491,7 +492,7 @@ def resolve_hf_chat_template( chat_template: str | None, tools: list[dict[str, Any]] | None, *, - renderer_config: RendererConfig, + model_config: ModelConfig, ) -> str | None: # 1st priority: The given chat template if chat_template is not None: @@ -499,7 +500,10 @@ def resolve_hf_chat_template( # 2nd priority: AutoProcessor chat template, unless tool calling is enabled if tools is None: - chat_template = _try_get_processor_chat_template(tokenizer, renderer_config) + chat_template = _try_get_processor_chat_template( + tokenizer, + trust_remote_code=model_config.trust_remote_code, + ) if chat_template is not None: return chat_template @@ -515,8 +519,8 @@ def resolve_hf_chat_template( # 4th priority: Predefined fallbacks path = get_chat_template_fallback_path( - model_type=renderer_config.model_config.hf_config.model_type, - tokenizer_name_or_path=renderer_config.tokenizer, + model_type=model_config.hf_config.model_type, + tokenizer_name_or_path=tokenizer.name_or_path, ) if path is not None: logger.info_once( @@ -545,7 +549,7 @@ def _resolve_chat_template_content_format( tokenizer, chat_template=chat_template, tools=tools, - renderer_config=renderer_config, + model_config=renderer_config.model_config, ) else: hf_chat_template = None @@ -1768,7 +1772,7 @@ def apply_hf_chat_template( tokenizer, chat_template=chat_template, tools=tools, - renderer_config=renderer_config, + model_config=renderer_config.model_config, ) if hf_chat_template is None: diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index 4fcf03a400d0..a81f73ac9e61 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -305,7 +305,7 @@ async def process_chat_template( tokenizer=tokenizer, chat_template=None, tools=None, - renderer_config=renderer_config, + model_config=renderer_config.model_config, ) if hf_chat_template != resolved_chat_template: From 423b2cae680dbddba9baa39fd3a8bacbef272a28 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 6 Dec 2025 06:17:40 +0000 Subject: [PATCH 21/29] Fixes Signed-off-by: DarkLight1337 --- .../entrypoints/openai/test_lora_resolvers.py | 21 ++++++++++---- tests/entrypoints/openai/test_serving_chat.py | 28 ++++++++++++++----- vllm/engine/arg_utils.py | 2 +- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index ea6b3d812d8f..7310c2610ce3 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -33,26 +33,34 @@ class MockModelConfig: """Minimal mock ModelConfig for testing.""" model: str = MODEL_NAME - tokenizer: str = MODEL_NAME trust_remote_code: bool = False - tokenizer_mode: str = "auto" max_model_len: int = 100 - tokenizer_revision: str | None = None multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig) hf_config: MockHFConfig = field(default_factory=MockHFConfig) logits_processors: list[str] | None = None logits_processor_pattern: str | None = None diff_sampling_param: dict | None = None - allowed_local_media_path: str = "" - allowed_media_domains: list[str] | None = None encoder_config = None generation_config: str = "auto" - 
skip_tokenizer_init: bool = False def get_diff_sampling_param(self): return self.diff_sampling_param or {} +@dataclass +class MockRendererConfig: + """Minimal mock RendererConfig for testing.""" + + model_config: MockModelConfig + + tokenizer: str = MODEL_NAME + tokenizer_mode: str = "auto" + tokenizer_revision: str | None = None + skip_tokenizer_init: bool = False + allowed_local_media_path: str = "" + allowed_media_domains: list[str] | None = None + + class MockLoRAResolver(LoRAResolver): async def resolve_lora( self, base_model_name: str, lora_name: str @@ -114,6 +122,7 @@ async def mock_generate(*args, **kwargs): mock_engine.add_lora.reset_mock() mock_engine.model_config = MockModelConfig() + mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 9ea65f9fa6e7..964abe6ffe98 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -346,27 +346,33 @@ class MockHFConfig: class MockModelConfig: task = "generate" runner_type = "generate" - tokenizer = MODEL_NAME trust_remote_code = False - tokenizer_mode = "auto" max_model_len = 100 - tokenizer_revision = None multimodal_config = MultiModalConfig() hf_config = MockHFConfig() logits_processors: list[str] | None = None logits_processor_pattern = None diff_sampling_param: dict | None = None - allowed_local_media_path: str = "" - allowed_media_domains: list[str] | None = None encoder_config = None generation_config: str = "auto" - media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - skip_tokenizer_init = False def get_diff_sampling_param(self): return self.diff_sampling_param or {} +@dataclass +class MockRendererConfig: + model_config: MockModelConfig = MockModelConfig() + + tokenizer = MODEL_NAME + tokenizer_mode = "auto" + tokenizer_revision = None + skip_tokenizer_init = False + media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) + allowed_local_media_path: str = "" + allowed_media_domains: list[str] | None = None + + def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: models = OpenAIServingModels( engine_client=engine, @@ -399,6 +405,7 @@ async def _fake_process_inputs( @dataclass class MockEngine: model_config: MockModelConfig = field(default_factory=MockModelConfig) + renderer_config: MockRendererConfig = field(default_factory=MockRendererConfig) input_processor: MagicMock = field(default_factory=MagicMock) io_processor: MagicMock = field(default_factory=MagicMock) @@ -429,6 +436,7 @@ async def test_serving_chat_returns_correct_model_name(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = MockModelConfig() + mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -459,6 +467,7 @@ async def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = MockModelConfig() + mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -492,6 +501,7 @@ async def test_serving_chat_should_set_correct_max_tokens(): 
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = mock_model_config + mock_engine.renderer_config = MockRendererConfig(mock_model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -537,6 +547,7 @@ async def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = mock_model_config + mock_engine.renderer_config = MockRendererConfig(mock_model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -583,6 +594,7 @@ async def test_serving_chat_could_load_correct_generation_config(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = mock_model_config + mock_engine.renderer_config = MockRendererConfig(mock_model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -629,6 +641,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = mock_model_config + mock_engine.renderer_config = MockRendererConfig(mock_model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -662,6 +675,7 @@ async def test_serving_chat_data_parallel_rank_extraction(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = MockModelConfig() + mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e3786b87767a..bd398abb0bf8 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -468,7 +468,7 @@ class EngineArgs: skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling video_pruning_rate: float = MultiModalConfig.video_pruning_rate # Renderer fields - tokenizer: str | None = RendererConfig.tokenizer + tokenizer: str | None = None tokenizer_mode: TokenizerMode | str = RendererConfig.tokenizer_mode tokenizer_revision: str | None = RendererConfig.tokenizer_revision skip_tokenizer_init: bool = RendererConfig.skip_tokenizer_init From e1e05d4b7b0a85799a08ba81b61950aa1e266ca4 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 6 Dec 2025 07:22:21 +0000 Subject: [PATCH 22/29] Fix mutable default Signed-off-by: DarkLight1337 --- tests/entrypoints/openai/test_serving_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 964abe6ffe98..9df8f886edd9 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -362,7 +362,7 @@ def get_diff_sampling_param(self): @dataclass class MockRendererConfig: - model_config: MockModelConfig = MockModelConfig() + model_config: MockModelConfig = field(default_factory=MockModelConfig) tokenizer = MODEL_NAME tokenizer_mode = "auto" From 9382291f089324102fb27142d2cc92aef24e269d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 6 Dec 2025 09:29:54 +0000 Subject: [PATCH 23/29] Fix Signed-off-by: DarkLight1337 --- tests/entrypoints/openai/test_serving_engine.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git 
a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_engine.py index 956a06dc5487..6ab0942b58da 100644 --- a/tests/entrypoints/openai/test_serving_engine.py +++ b/tests/entrypoints/openai/test_serving_engine.py @@ -7,7 +7,7 @@ import pytest -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.tokenizers import MistralTokenizer @@ -19,10 +19,16 @@ def serving() -> OpenAIServing: # Create minimal mocks engine_client = Mock() + model_config = Mock(spec=ModelConfig) model_config.max_model_len = 32768 + + renderer_config = Mock(spec=RendererConfig) + renderer_config.model_config = model_config + models = Mock(spec=OpenAIServingModels) models.model_config = model_config + models.renderer_config = renderer_config models.input_processor = Mock() models.io_processor = Mock() From b94a40743cd704f51bedee86f40651aaa5d4f6e0 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 6 Dec 2025 09:32:04 +0000 Subject: [PATCH 24/29] Fix Signed-off-by: DarkLight1337 --- tests/v1/engine/test_process_multi_modal_uuids.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tests/v1/engine/test_process_multi_modal_uuids.py b/tests/v1/engine/test_process_multi_modal_uuids.py index fa079179fead..8524b1849852 100644 --- a/tests/v1/engine/test_process_multi_modal_uuids.py +++ b/tests/v1/engine/test_process_multi_modal_uuids.py @@ -50,23 +50,19 @@ def _mock_input_processor( monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True) model_config = ModelConfig( - skip_tokenizer_init=True, max_model_len=128, mm_processor_cache_gb=mm_cache_gb, generation_config="vllm", + ) + renderer_config = RendererConfig( + model_config=model_config, tokenizer="dummy", + skip_tokenizer_init=True, ) - # Minimal multimodal_config to satisfy references in - # Processor.process_inputs. 
- class _MockMMConfig: - def __init__(self, gb: float): - self.mm_processor_cache_gb = gb - - model_config.multimodal_config = _MockMMConfig(mm_cache_gb) # type: ignore[attr-defined] vllm_config = VllmConfig( model_config=model_config, - renderer_config=RendererConfig(model_config=model_config), + renderer_config=renderer_config, cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching), device_config=DeviceConfig(device="cpu"), ) From ee458d6463b386b5ec85880e36e9ee0232ff571d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 6 Dec 2025 13:37:22 +0000 Subject: [PATCH 25/29] Fix entrypoints test Signed-off-by: DarkLight1337 --- tests/entrypoints/openai/test_serving_models.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py index b585835a0667..376df6cfecb9 100644 --- a/tests/entrypoints/openai/test_serving_models.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -6,7 +6,7 @@ import pytest -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.openai.protocol import ( ErrorResponse, @@ -27,9 +27,15 @@ async def _async_serving_models_init() -> OpenAIServingModels: mock_engine_client = MagicMock(spec=EngineClient) # Set the max_model_len attribute to avoid missing attribute + mock_model_config = MagicMock(spec=ModelConfig) mock_model_config.max_model_len = 2048 + + mock_renderer_config = MagicMock(spec=RendererConfig) + mock_renderer_config.model_config = mock_model_config + mock_engine_client.model_config = mock_model_config + mock_engine_client.renderer_config = mock_renderer_config mock_engine_client.input_processor = MagicMock() mock_engine_client.io_processor = MagicMock() From 1d3ca9bfc206530590474f79e25d1830a15a8181 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 6 Dec 2025 13:50:33 +0000 Subject: [PATCH 26/29] Fix Signed-off-by: DarkLight1337 --- tests/v1/engine/test_process_multi_modal_uuids.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/v1/engine/test_process_multi_modal_uuids.py b/tests/v1/engine/test_process_multi_modal_uuids.py index 8524b1849852..11eee647b795 100644 --- a/tests/v1/engine/test_process_multi_modal_uuids.py +++ b/tests/v1/engine/test_process_multi_modal_uuids.py @@ -50,6 +50,7 @@ def _mock_input_processor( monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True) model_config = ModelConfig( + model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", max_model_len=128, mm_processor_cache_gb=mm_cache_gb, generation_config="vllm", From 5da9f1a5e2df1e75228f181aa9e1a90caa1e77d4 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 6 Dec 2025 13:54:45 +0000 Subject: [PATCH 27/29] Pass the test Signed-off-by: DarkLight1337 --- tests/v1/engine/test_process_multi_modal_uuids.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/v1/engine/test_process_multi_modal_uuids.py b/tests/v1/engine/test_process_multi_modal_uuids.py index 11eee647b795..85fab3a855fd 100644 --- a/tests/v1/engine/test_process_multi_modal_uuids.py +++ b/tests/v1/engine/test_process_multi_modal_uuids.py @@ -9,6 +9,7 @@ CacheConfig, DeviceConfig, ModelConfig, + MultiModalConfig, RendererConfig, VllmConfig, ) @@ -50,11 +51,12 @@ def _mock_input_processor( monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True) model_config = ModelConfig( - 
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", max_model_len=128, mm_processor_cache_gb=mm_cache_gb, generation_config="vllm", ) + model_config.multimodal_config = MultiModalConfig(mm_processor_cache_gb=mm_cache_gb) + renderer_config = RendererConfig( model_config=model_config, tokenizer="dummy", From 3e9e9cfd1afa137aa9e2d6e38b3d579610b025ef Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 6 Dec 2025 16:20:41 +0000 Subject: [PATCH 28/29] Fix Signed-off-by: DarkLight1337 --- tests/entrypoints/openai/test_vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 869f6eeb4bfa..ae8860ee877b 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -118,7 +118,7 @@ def get_hf_prompt_tokens(model_name, content, image_url): image = image.media images = [image] - prompt = processor.apply_chat_template( + prompt = processor.tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) inputs = processor(prompt, images, return_tensors="pt") From 78b918b4fe2a274bd2c60059a6ca2b41ee35d8ca Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 7 Dec 2025 03:19:36 +0000 Subject: [PATCH 29/29] Fix wrong model ID Signed-off-by: DarkLight1337 --- tests/entrypoints/openai/test_chat_template.py | 2 +- tests/entrypoints/test_chat_utils.py | 8 ++++---- tests/models/multimodal/processing/test_tensor_schema.py | 1 + tests/models/utils.py | 1 + vllm/entrypoints/chat_utils.py | 2 +- 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py index ba7cb9328155..b050cfdb561c 100644 --- a/tests/entrypoints/openai/test_chat_template.py +++ b/tests/entrypoints/openai/test_chat_template.py @@ -106,7 +106,7 @@ def test_get_gen_prompt( model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - renderer_config = model_info.build_renderer_config() + renderer_config = model_info.build_renderer_config(model) tokenizer = get_tokenizer( renderer_config.tokenizer, diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 884817901839..2740779c95e1 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -1767,7 +1767,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - renderer_config = model_info.build_renderer_config() + renderer_config = model_info.build_renderer_config(model) tokenizer = get_tokenizer( renderer_config.tokenizer, @@ -1856,7 +1856,7 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa "enable_thinking": True, } - renderer_config = model_info.build_renderer_config() + renderer_config = model_info.build_renderer_config(model) tokenizer = get_tokenizer( renderer_config.tokenizer, @@ -1939,7 +1939,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - renderer_config = model_info.build_renderer_config() + renderer_config = model_info.build_renderer_config(model) tokenizer = get_tokenizer( renderer_config.tokenizer, @@ -1987,7 +1987,7 @@ def test_resolve_content_format_fallbacks(model, expected_format): model_info = 
HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - renderer_config = model_info.build_renderer_config() + renderer_config = model_info.build_renderer_config(model) tokenizer = get_tokenizer( renderer_config.tokenizer, diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index ec515069fa54..24959fa48ad6 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -185,6 +185,7 @@ def test_model_tensor_schema(model_id: str): dtype = model_info.dtype renderer_config = model_info.build_renderer_config( + model_id, hf_overrides=hf_overrides_fn, dtype=dtype, ) diff --git a/tests/models/utils.py b/tests/models/utils.py index d92188f40eea..87292cc4538d 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -296,6 +296,7 @@ def build_model_context( model_config_kwargs = model_config_kwargs or {} limit_mm_per_prompt = limit_mm_per_prompt or {} renderer_config = model_info.build_renderer_config( + model_id, runner=runner, dtype=dtype, seed=0, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index ea1ca09a0c9a..664d04a195a4 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -517,7 +517,7 @@ def resolve_hf_chat_template( exc_info=True, ) - # 4th priority: Predefined fallbacks + # 4th priority: Predefined fallbacks] path = get_chat_template_fallback_path( model_type=model_config.hf_config.model_type, tokenizer_name_or_path=tokenizer.name_or_path,
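
A brief usage sketch of the configuration split exercised by the patches above, assuming only what the diffs show (the specific `ModelConfig` arguments are illustrative, not part of the series):

    from vllm.config import ModelConfig, RendererConfig, VllmConfig

    # Pair the model config with an explicit renderer config, as the updated
    # tests do. When `tokenizer` is omitted, RendererConfig.__post_init__
    # falls back to the model path, and `trust_remote_code` is delegated to
    # the wrapped ModelConfig.
    model_config = ModelConfig(max_model_len=16)
    renderer_config = RendererConfig(model_config=model_config)

    vllm_config = VllmConfig(
        model_config=model_config,
        renderer_config=renderer_config,
    )

Chat-template resolution keeps accepting a `ModelConfig` (restored in the lm-eval compatibility patch), so callers holding a `RendererConfig` pass its wrapped model config through. A sketch, assuming `resolve_hf_chat_template` is importable from `vllm.entrypoints.chat_utils` and that `tokenizer` was built from `renderer_config.tokenizer`:

    from vllm.entrypoints.chat_utils import resolve_hf_chat_template

    chat_template = resolve_hf_chat_template(
        tokenizer,
        chat_template=None,
        tools=None,
        model_config=renderer_config.model_config,
    )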