Changes from all commits
33 commits
80cf80b
[Refactor] Separate out `RendererConfig` from `ModelConfig`
DarkLight1337 Dec 5, 2025
5d7b31e
Also move `media_io_kwargs`
DarkLight1337 Dec 5, 2025
34f6d8c
Fix
DarkLight1337 Dec 5, 2025
33e0d97
Fix
DarkLight1337 Dec 5, 2025
cd3fa6f
Fix
DarkLight1337 Dec 5, 2025
4fe9c07
Fixes
DarkLight1337 Dec 5, 2025
9eb6d28
Fixes
DarkLight1337 Dec 5, 2025
b88ad60
Skip validation to pass doc build
DarkLight1337 Dec 5, 2025
da64ef3
Typo
DarkLight1337 Dec 5, 2025
9bff3a0
Fix arg
DarkLight1337 Dec 5, 2025
e46256e
Fix
DarkLight1337 Dec 5, 2025
468274f
Fix protocol
DarkLight1337 Dec 5, 2025
916b051
Fix
DarkLight1337 Dec 5, 2025
578527a
Typo
DarkLight1337 Dec 5, 2025
b299583
Update
DarkLight1337 Dec 5, 2025
713a0c6
Improve type annotation
DarkLight1337 Dec 5, 2025
e4df022
Merge branch 'main' into renderer-config
DarkLight1337 Dec 5, 2025
bee2e25
Merge branch 'main' into renderer-config
DarkLight1337 Dec 6, 2025
7c2913d
Fix initialization for tests
DarkLight1337 Dec 6, 2025
2055a7f
Fix
DarkLight1337 Dec 6, 2025
fbc6e71
Fix
DarkLight1337 Dec 6, 2025
7498f24
Avoid breaking compat with lm-eval
DarkLight1337 Dec 6, 2025
6879564
Merge branch 'main' into renderer-config
DarkLight1337 Dec 6, 2025
423b2ca
Fixes
DarkLight1337 Dec 6, 2025
e1e05d4
Fix mutable default
DarkLight1337 Dec 6, 2025
9382291
Fix
DarkLight1337 Dec 6, 2025
b94a407
Fix
DarkLight1337 Dec 6, 2025
ee458d6
Fix entrypoints test
DarkLight1337 Dec 6, 2025
c1db821
Merge branch 'main' into renderer-config
DarkLight1337 Dec 6, 2025
b20a2aa
Merge branch 'main' into renderer-config
DarkLight1337 Dec 6, 2025
1d3ca9b
Fix
DarkLight1337 Dec 6, 2025
5da9f1a
Pass the test
DarkLight1337 Dec 6, 2025
3e9e9cf
Fix
DarkLight1337 Dec 6, 2025
12 changes: 6 additions & 6 deletions docs/contributing/model/transcription.md
@@ -22,7 +22,7 @@ Declare supported languages and capabilities:
import torch
from torch import nn

from vllm.config import ModelConfig, SpeechToTextConfig
from vllm.config import RendererConfig, SpeechToTextConfig
from vllm.inputs.data import PromptType
from vllm.model_executor.models.interfaces import SupportsTranscription

@@ -52,7 +52,7 @@ This is for controlling general behavior of the API when serving your model:
@classmethod
def get_speech_to_text_config(
cls,
model_config: ModelConfig,
renderer_config: RendererConfig,
task_type: Literal["transcribe", "translate"],
) -> SpeechToTextConfig:
return SpeechToTextConfig(
@@ -83,7 +83,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
cls,
audio: np.ndarray,
stt_config: SpeechToTextConfig,
model_config: ModelConfig,
renderer_config: RendererConfig,
language: str | None,
task_type: Literal["transcribe", "translate"],
request_prompt: str,
@@ -120,7 +120,7 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
cls,
audio: np.ndarray,
stt_config: SpeechToTextConfig,
model_config: ModelConfig,
renderer_config: RendererConfig,
language: str | None,
task_type: Literal["transcribe", "translate"],
request_prompt: str,
@@ -183,7 +183,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
cls,
audio_duration_s: float,
stt_config: SpeechToTextConfig,
model_config: ModelConfig,
renderer_config: RendererConfig,
) -> int | None:
# Return None if unknown; otherwise return an estimate.
return int(audio_duration_s * stt_config.sample_rate // 320) # example
@@ -216,7 +216,7 @@ Relevant server logic:
prompt = self.model_cls.get_generation_prompt(
audio=chunk,
stt_config=self.asr_config,
model_config=self.model_config,
renderer_config=self.renderer_config,
language=language,
task_type=self.task_type,
request_prompt=request.prompt,
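Taken together, the documentation diff above replaces every `ModelConfig` parameter in the `SupportsTranscription` hooks with a `RendererConfig`. A minimal sketch of what an implementing model looks like after this change; the class name and the elided bodies are illustrative, not part of the PR:

```python
from typing import Literal

import numpy as np

from vllm.config import RendererConfig, SpeechToTextConfig
from vllm.inputs.data import PromptType
from vllm.model_executor.models.interfaces import SupportsTranscription


class MyASRModel(SupportsTranscription):  # hypothetical implementer
    @classmethod
    def get_speech_to_text_config(
        cls,
        renderer_config: RendererConfig,  # was: model_config: ModelConfig
        task_type: Literal["transcribe", "translate"],
    ) -> SpeechToTextConfig:
        # Return the same SpeechToTextConfig as before; only the config
        # parameter's name and type changed.
        ...

    @classmethod
    def get_generation_prompt(
        cls,
        audio: np.ndarray,
        stt_config: SpeechToTextConfig,
        renderer_config: RendererConfig,  # was: model_config: ModelConfig
        language: str | None,
        task_type: Literal["transcribe", "translate"],
        request_prompt: str,
    ) -> PromptType:
        # Build the prompt dict exactly as before, reading tokenizer/media
        # settings from renderer_config instead of model_config.
        ...
```

Call sites follow suit: the server logic shown at the end of the file now passes `renderer_config=self.renderer_config` instead of `model_config=self.model_config`.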
2 changes: 2 additions & 0 deletions tests/compile/distributed/test_sequence_parallelism.py
@@ -17,6 +17,7 @@
DeviceConfig,
ModelConfig,
PassConfig,
RendererConfig,
VllmConfig,
get_current_vllm_config,
set_current_vllm_config,
@@ -276,6 +277,7 @@ def sequence_parallelism_pass_on_test_model(

vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
device_config=device_config,
compilation_config=compilation_config,
)
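The same two-line fixup recurs in the compile, distributed, and KV-layout tests below: build the `ModelConfig` once, then hand the same instance to both `VllmConfig` and an explicit `RendererConfig`. A minimal, self-contained sketch of the pattern (the bfloat16 dtype and the omission of the other sub-configs are illustrative):

```python
import torch

from vllm.config import ModelConfig, RendererConfig, VllmConfig

# Build the ModelConfig once so the same instance backs both fields.
model_config = ModelConfig(dtype=torch.bfloat16)

vllm_config = VllmConfig(
    model_config=model_config,
    # RendererConfig is now a separate top-level config that wraps the
    # ModelConfig it renders prompts for.
    renderer_config=RendererConfig(model_config=model_config),
)
```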
6 changes: 5 additions & 1 deletion tests/compile/test_functionalization.py
@@ -15,6 +15,7 @@
CompilationConfig,
ModelConfig,
PassConfig,
RendererConfig,
VllmConfig,
set_current_vllm_config,
)
@@ -219,8 +220,11 @@ def test_fix_functionalization(
torch.set_default_device("cuda")
torch.set_default_dtype(dtype)

model_config = ModelConfig(dtype=dtype)

vllm_config = VllmConfig(
model_config=ModelConfig(dtype=dtype),
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
compilation_config=CompilationConfig(
custom_ops=["all"],
pass_config=PassConfig(
6 changes: 5 additions & 1 deletion tests/compile/test_fusion.py
@@ -15,6 +15,7 @@
CompilationMode,
ModelConfig,
PassConfig,
RendererConfig,
VllmConfig,
)
from vllm.model_executor.layers.layernorm import RMSNorm
@@ -154,8 +155,11 @@ def test_fusion_rmsnorm_quant(
custom_ops.append("+rms_norm")
if enable_quant_fp8_custom_op:
custom_ops.append("+quant_fp8")

model_config = ModelConfig(dtype=dtype)
vllm_config = VllmConfig(
model_config=ModelConfig(dtype=dtype),
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
custom_ops=custom_ops,
2 changes: 2 additions & 0 deletions tests/compile/test_fusion_attn.py
@@ -24,6 +24,7 @@
CompilationMode,
ModelConfig,
PassConfig,
RendererConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
@@ -325,6 +326,7 @@ def test_attention_quant_pattern(
)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
scheduler_config=SchedulerConfig(
max_num_seqs=1024,
max_model_len=model_config.max_model_len,
8 changes: 6 additions & 2 deletions tests/compile/test_pass_manager.py
@@ -7,7 +7,7 @@

from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
from vllm.compilation.pass_manager import PostGradPassManager
from vllm.config import ModelConfig, VllmConfig
from vllm.config import ModelConfig, RendererConfig, VllmConfig


# dummy custom pass that doesn't inherit
@@ -43,7 +43,11 @@ def __call__(self, graph: torch.fx.graph.Graph) -> None:
)
def test_pass_manager_uuid(callable):
# Some passes need dtype to be set
config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
model_config = ModelConfig(dtype=torch.bfloat16)
config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
)

pass_manager = PostGradPassManager()
pass_manager.configure(config)
5 changes: 4 additions & 1 deletion tests/compile/test_qk_norm_rope_fusion.py
@@ -19,6 +19,7 @@
CompilationMode,
ModelConfig,
PassConfig,
RendererConfig,
VllmConfig,
set_current_vllm_config,
)
@@ -133,8 +134,10 @@ def test_qk_norm_rope_fusion(
if enable_rope_custom_op:
custom_ops.append("+rotary_embedding")

model_config = ModelConfig(dtype=dtype)
vllm_config = VllmConfig(
model_config=ModelConfig(dtype=dtype),
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
custom_ops=custom_ops,
3 changes: 3 additions & 0 deletions tests/distributed/test_kvlayout.py
@@ -5,6 +5,7 @@
DeviceConfig,
KVTransferConfig,
ModelConfig,
RendererConfig,
VllmConfig,
set_current_vllm_config,
)
@@ -47,6 +48,7 @@ def test_get_kv_connector_cache_layout_with_nixl_connector():
vllm_config = VllmConfig(
device_config=DeviceConfig("cpu"),
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
kv_transfer_config=kv_transfer_config,
)
with set_current_vllm_config(vllm_config):
@@ -70,6 +72,7 @@ def test_get_kv_connector_cache_layout_with_multi_connector():
vllm_config = VllmConfig(
device_config=DeviceConfig("cpu"),
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
kv_transfer_config=kv_transfer_config,
)
with set_current_vllm_config(vllm_config):
22 changes: 4 additions & 18 deletions tests/entrypoints/openai/test_chat_template.py
@@ -3,7 +3,6 @@

import pytest

from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.tokenizers import get_tokenizer
@@ -107,24 +106,11 @@ def test_get_gen_prompt(
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")

model_config = ModelConfig(
model,
tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
revision=model_info.revision,
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype,
)
renderer_config = model_info.build_renderer_config()

# Initialize the tokenizer
tokenizer = get_tokenizer(
tokenizer_name=model_config.tokenizer,
trust_remote_code=model_config.trust_remote_code,
renderer_config.tokenizer,
trust_remote_code=renderer_config.trust_remote_code,
)
template_content = load_chat_template(chat_template=template)

@@ -143,7 +129,7 @@ def test_get_gen_prompt(
tokenizer=tokenizer,
conversation=mock_request.messages,
chat_template=mock_request.chat_template or template_content,
model_config=model_config,
renderer_config=renderer_config,
tools=None,
add_generation_prompt=mock_request.add_generation_prompt,
continue_final_message=mock_request.continue_final_message,
21 changes: 15 additions & 6 deletions tests/entrypoints/openai/test_lora_resolvers.py
@@ -33,26 +33,34 @@ class MockModelConfig:
"""Minimal mock ModelConfig for testing."""

model: str = MODEL_NAME
tokenizer: str = MODEL_NAME
trust_remote_code: bool = False
tokenizer_mode: str = "auto"
max_model_len: int = 100
tokenizer_revision: str | None = None
multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
hf_config: MockHFConfig = field(default_factory=MockHFConfig)
logits_processors: list[str] | None = None
logits_processor_pattern: str | None = None
diff_sampling_param: dict | None = None
allowed_local_media_path: str = ""
allowed_media_domains: list[str] | None = None
encoder_config = None
generation_config: str = "auto"
skip_tokenizer_init: bool = False

def get_diff_sampling_param(self):
return self.diff_sampling_param or {}


@dataclass
class MockRendererConfig:
"""Minimal mock RendererConfig for testing."""

model_config: MockModelConfig

tokenizer: str = MODEL_NAME
tokenizer_mode: str = "auto"
tokenizer_revision: str | None = None
skip_tokenizer_init: bool = False
allowed_local_media_path: str = ""
allowed_media_domains: list[str] | None = None


class MockLoRAResolver(LoRAResolver):
async def resolve_lora(
self, base_model_name: str, lora_name: str
@@ -114,6 +122,7 @@ async def mock_generate(*args, **kwargs):
mock_engine.add_lora.reset_mock()

mock_engine.model_config = MockModelConfig()
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()

28 changes: 21 additions & 7 deletions tests/entrypoints/openai/test_serving_chat.py
@@ -346,27 +346,33 @@ class MockHFConfig:
class MockModelConfig:
task = "generate"
runner_type = "generate"
tokenizer = MODEL_NAME
trust_remote_code = False
tokenizer_mode = "auto"
max_model_len = 100
tokenizer_revision = None
multimodal_config = MultiModalConfig()
hf_config = MockHFConfig()
logits_processors: list[str] | None = None
logits_processor_pattern = None
diff_sampling_param: dict | None = None
allowed_local_media_path: str = ""
allowed_media_domains: list[str] | None = None
encoder_config = None
generation_config: str = "auto"
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
skip_tokenizer_init = False

def get_diff_sampling_param(self):
return self.diff_sampling_param or {}


@dataclass
class MockRendererConfig:
model_config: MockModelConfig = field(default_factory=MockModelConfig)

tokenizer = MODEL_NAME
tokenizer_mode = "auto"
tokenizer_revision = None
skip_tokenizer_init = False
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
allowed_local_media_path: str = ""
allowed_media_domains: list[str] | None = None


def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
models = OpenAIServingModels(
engine_client=engine,
@@ -399,6 +405,7 @@ async def _fake_process_inputs(
@dataclass
class MockEngine:
model_config: MockModelConfig = field(default_factory=MockModelConfig)
renderer_config: MockRendererConfig = field(default_factory=MockRendererConfig)
input_processor: MagicMock = field(default_factory=MagicMock)
io_processor: MagicMock = field(default_factory=MagicMock)

@@ -429,6 +436,7 @@ async def test_serving_chat_returns_correct_model_name():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()

@@ -459,6 +467,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()

@@ -492,6 +501,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()

@@ -537,6 +547,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()

@@ -583,6 +594,7 @@ async def test_serving_chat_could_load_correct_generation_config():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()

@@ -629,6 +641,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()

@@ -662,6 +675,7 @@ async def test_serving_chat_data_parallel_rank_extraction():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()

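Across the serving tests above, every mocked engine now exposes a `renderer_config` next to its `model_config`, with the tokenizer- and media-related fields living on the renderer side. A condensed sketch of the wiring, assuming the test-local `MockModelConfig` and `MockRendererConfig` dataclasses defined in the diff:

```python
from unittest.mock import MagicMock

# MockModelConfig and MockRendererConfig refer to the test-local dataclasses
# defined in tests/entrypoints/openai/test_serving_chat.py above; this block
# only illustrates the recurring mock-engine wiring.
mock_engine = MagicMock()
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
```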