11# SPDX-License-Identifier: Apache-2.0
22# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
33import copy
4+ import logging
45from contextlib import nullcontext
56from unittest .mock import patch
67
1011from vllm .compilation .counter import compilation_counter
1112from vllm .compilation .fix_functionalization import FixFunctionalizationPass
1213from vllm .config import CompilationConfig , CUDAGraphMode , VllmConfig
13- from vllm .config .compilation import CompilationMode
14+ from vllm .config .compilation import CompilationMode , PassConfig
1415from vllm .engine .arg_utils import EngineArgs
16+ from vllm .logger import _print_warning_once
1517from vllm .platforms import current_platform
1618from vllm .utils .torch_utils import _is_torch_equal_or_newer
1719
@@ -191,7 +193,7 @@ def test_splitting_ops_dynamic():
191193 config = VllmConfig (
192194 compilation_config = CompilationConfig (
193195 mode = CompilationMode .VLLM_COMPILE ,
194- pass_config = { "enable_attn_fusion" : True , "enable_noop" : True } ,
196+ pass_config = PassConfig ( fuse_attn_quant = True , eliminate_noops = True ) ,
195197 custom_ops = ["+quant_fp8" ],
196198 cudagraph_mode = CUDAGraphMode .PIECEWISE ,
197199 )
@@ -206,7 +208,7 @@ def test_splitting_ops_dynamic():
206208 config = VllmConfig (
207209 compilation_config = CompilationConfig (
208210 mode = CompilationMode .VLLM_COMPILE ,
209- pass_config = { "enable_attn_fusion" : True , "enable_noop" : True } ,
211+ pass_config = PassConfig ( fuse_attn_quant = True , eliminate_noops = True ) ,
210212 custom_ops = ["+quant_fp8" ],
211213 cudagraph_mode = CUDAGraphMode .PIECEWISE ,
212214 # work around for accessing all attention ops
@@ -219,15 +221,15 @@ def test_splitting_ops_dynamic():
219221 compilation_config = CompilationConfig (
220222 mode = CompilationMode .VLLM_COMPILE ,
221223 use_inductor_graph_partition = True ,
222- pass_config = { "enable_attn_fusion" : True , "enable_noop" : True } ,
224+ pass_config = PassConfig ( fuse_attn_quant = True , eliminate_noops = True ) ,
223225 custom_ops = ["+quant_fp8" ],
224226 cudagraph_mode = CUDAGraphMode .PIECEWISE ,
225227 )
226228 )
227229 # With inductor graph partition, attn_fusion and splitting_ops
228230 # work together. Default splitting_ops include attention ops.
229231 assert config .compilation_config .splitting_ops_contain_attention ()
230- # enable_attn_fusion is directly supported under
232+ # fuse_attn_quant is directly supported under
231233 # use_inductor_graph_partition=True, and cudagraph_mode
232234 # is unchanged.
233235 assert config .compilation_config .cudagraph_mode == CUDAGraphMode .PIECEWISE
@@ -301,7 +303,7 @@ def test_should_split():
301303 "cudagraph_capture_sizes" ,
302304 "max_cudagraph_capture_size" ,
303305 "tp_size" ,
304- "enable_sequence_parallelism " ,
306+ "enable_sp " ,
305307 "max_num_batched_tokens" ,
306308 "cudagraph_mode" ,
307309 "expected_max_size" ,
@@ -339,7 +341,7 @@ def test_cudagraph_sizes_post_init(
339341 cudagraph_capture_sizes ,
340342 max_cudagraph_capture_size ,
341343 tp_size ,
342- enable_sequence_parallelism ,
344+ enable_sp ,
343345 max_num_batched_tokens ,
344346 cudagraph_mode ,
345347 expected_max_size ,
@@ -355,11 +357,12 @@ def test_cudagraph_sizes_post_init(
355357 compilation_config = CompilationConfig (
356358 cudagraph_capture_sizes = cudagraph_capture_sizes ,
357359 max_cudagraph_capture_size = max_cudagraph_capture_size ,
358- pass_config = {
359- "enable_sequence_parallelism" : enable_sequence_parallelism ,
360- "enable_fusion" : True ,
361- "enable_noop" : True ,
362- },
360+ pass_config = PassConfig (
361+ enable_sp = enable_sp ,
362+ fuse_norm_quant = True ,
363+ fuse_act_quant = True ,
364+ eliminate_noops = True ,
365+ ),
363366 cudagraph_mode = cudagraph_mode ,
364367 )
365368 engine_args = EngineArgs (
@@ -375,3 +378,53 @@ def test_cudagraph_sizes_post_init(
375378 vllm_config .compilation_config .max_cudagraph_capture_size
376379 == expected_max_size
377380 )
381+
382+
def test_pass_config_deprecation(caplog_vllm):
    """Check that each deprecated PassConfig flag warns and maps to its new flag(s).

    For every (deprecated flag, replacement flags) pair, a ``PassConfig`` is
    built with only the deprecated flag set to ``True``. The test then checks
    that:

    * the captured log text contains ``"<deprecated flag> is deprecated"``,
    * every replacement flag on the resulting config is ``True``,
    * the deprecated attribute itself reads back as ``None``.

    Args:
        caplog_vllm: log-capture fixture; its level is raised to WARNING and
            its records are cleared before each case so that each assertion
            only sees the warning produced by that case.
    """
    caplog_vllm.set_level(logging.WARNING)

    # Clear cache to ensure warnings are re-issued
    _print_warning_once.cache_clear()

    # (deprecated flag, flags it is expected to turn on), checked in order.
    renamed_flags = (
        ("enable_fusion", ("fuse_norm_quant", "fuse_act_quant")),
        ("enable_attn_fusion", ("fuse_attn_quant",)),
        ("enable_noop", ("eliminate_noops",)),
        ("enable_sequence_parallelism", ("enable_sp",)),
        ("enable_async_tp", ("fuse_gemm_comms",)),
        ("enable_fi_allreduce_fusion", ("fuse_allreduce_rms",)),
    )

    for old_flag, new_flags in renamed_flags:
        # Drop records from the previous case before constructing the config.
        caplog_vllm.clear()
        config = PassConfig(**{old_flag: True})

        assert f"{old_flag} is deprecated" in caplog_vllm.text
        for new_flag in new_flags:
            assert getattr(config, new_flag) is True
        assert getattr(config, old_flag) is None
0 commit comments