[Fusion] normalize fusion naming and enable e2e test (#4693)
### What this PR does / why we need it?
This PR standardizes the fusion naming, changing
`enable_quantization_fusion` to `fuse_norm_quant`, and enables e2e
testing.
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
CI passed with newly added and existing tests.
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
---------
Signed-off-by: wxsIcey <1790571317@qq.com>
This commit is contained in:
@@ -190,19 +190,18 @@ class AscendCompilationConfig:
|
||||
deployed on Ascend platforms.
|
||||
"""
|
||||
|
||||
def __init__(self, enable_quantization_fusion: bool = True, **kwargs):
|
||||
def __init__(self, fuse_norm_quant: bool = True, **kwargs):
|
||||
"""
|
||||
Initialize the configuration.
|
||||
|
||||
Args:
|
||||
enable_quantization_fusion (bool): Whether to enable quantization fusion optimization.
|
||||
When set to True, the system will optimize quantization-related operations,
|
||||
reducing the number of quantization/dequantization nodes.
|
||||
fuse_norm_quant (bool): Whether to enable norm and quant fusion optimization.
|
||||
When set to True, the system will optimize norm and quant operations.
|
||||
Default: True
|
||||
|
||||
**kwargs: Additional optional parameters for forward compatibility and configuration extension.
|
||||
"""
|
||||
self.enable_quantization_fusion = enable_quantization_fusion
|
||||
self.fuse_norm_quant = fuse_norm_quant
|
||||
# Add more compilation related configs here as needed
|
||||
|
||||
|
||||
|
||||
@@ -46,8 +46,8 @@ class GraphFusionPassManager:
|
||||
# By default, we enable the graph fusion and quantization fusion pass.
|
||||
self.ascend_compilation_config: dict = config.additional_config.get(
|
||||
"ascend_compilation_config", {})
|
||||
if self.ascend_compilation_config.get("enable_quantization_fusion",
|
||||
True):
|
||||
from .passes.quant_fusion_pass import AddRMSNormQuantFusionPass
|
||||
if self.ascend_compilation_config.get("fuse_norm_quant", True):
|
||||
from .passes.norm_quant_fusion_pass import \
|
||||
AddRMSNormQuantFusionPass
|
||||
self.passes.append(AddRMSNormQuantFusionPass(config))
|
||||
# Add more passes here as needed
|
||||
|
||||
@@ -88,8 +88,7 @@ class NPUPlatform(Platform):
|
||||
Get the custom compile backend. Previously, we used EagerAdaptor by default.
|
||||
To use graph fusion operations, we defined our own backend compiler.
|
||||
"""
|
||||
from vllm_ascend.compilation.compiler_interface import AscendCompiler
|
||||
return AscendCompiler.__module__ + "." + AscendCompiler.__name__
|
||||
return "vllm_ascend.compilation.compiler_interface.AscendCompiler"
|
||||
|
||||
@classmethod
|
||||
def pre_register_and_update(cls,
|
||||
@@ -225,8 +224,8 @@ class NPUPlatform(Platform):
|
||||
if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
|
||||
compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
|
||||
|
||||
from vllm_ascend.compilation.compiler_interface import AscendCompiler
|
||||
compilation_config.oot_compiler = AscendCompiler.__module__ + "." + AscendCompiler.__name__
|
||||
# get custom compile backend for graph fusion
|
||||
compilation_config.oot_compiler = cls.get_compile_backend()
|
||||
|
||||
if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
|
||||
compilation_config.mode = CompilationMode.NONE
|
||||
|
||||
Reference in New Issue
Block a user