[Bugfix] fix mtp profile run error where main model and mtp model use different quantization (#4102)
### What this PR does / why we need it?
In PR https://github.com/vllm-project/vllm-ascend/pull/3420, we
initially placed the quantization type (quant_type) in the MoECommMethod
class. However, since MoECommMethod follows a singleton pattern, it
couldn't accommodate scenarios where different layers in the model might
use different quantization approaches (e.g., MTP modules using
floating-point computation while the main model employs quantized
computation).
In this PR, we've moved the quantization type to the AscendFusedMoe
class and pass it as a parameter to MoECommMethod.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
```bash
export HCCL_BUFFSIZE=1024
export VLLM_VERSION=0.11.0
vllm serve /home/data/DeepSeek-R1_w8a8/ \
--data-parallel-size 2 \
--tensor-parallel-size 8 \
--enable-expert-parallel \
--served-model-name dsv3 \
--max-model-len 32768 \
--max-num-batched-tokens 4096 \
--max-num-seqs 16 \
--quantization ascend \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
```
- vLLM version: v0.11.0
- vLLM main:
83f478bb19
---------
Signed-off-by: realliujiaxu <realliujiaxu@163.com>
This commit is contained in:
@@ -37,6 +37,11 @@ from vllm_ascend.eplb.core.eplb_utils import (determine_default_expert_map,
|
||||
from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
|
||||
from vllm_ascend.ops.fused_moe.experts_selector import select_experts
|
||||
from vllm_ascend.ops.fused_moe.moe_comm_method import setup_moe_comm_method
|
||||
from vllm_ascend.ops.fused_moe.prepare_finalize import QuantType
|
||||
from vllm_ascend.quantization.w4a8_dynamic import \
|
||||
AscendW4A8DynamicFusedMoEMethod
|
||||
from vllm_ascend.quantization.w8a8_dynamic import \
|
||||
AscendW8A8DynamicFusedMoEMethod
|
||||
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, enable_sp, is_310p,
|
||||
is_enable_nz, npu_stream_switch,
|
||||
shared_expert_dp_enabled,
|
||||
@@ -289,7 +294,23 @@ class AscendFusedMoE(FusedMoE):
|
||||
|
||||
self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
|
||||
|
||||
setup_moe_comm_method(self.moe_config, self.quant_method)
|
||||
setup_moe_comm_method(self.moe_config)
|
||||
self.quant_type = self._get_quant_type()
|
||||
|
||||
def _get_quant_type(self) -> QuantType:
|
||||
quant_method = self.quant_method
|
||||
if not hasattr(quant_method,
|
||||
"quant_method") or quant_method.quant_method is None:
|
||||
return QuantType.NONE
|
||||
|
||||
method = quant_method.quant_method
|
||||
|
||||
if isinstance(method, AscendW8A8DynamicFusedMoEMethod):
|
||||
return QuantType.W8A8
|
||||
elif isinstance(method, AscendW4A8DynamicFusedMoEMethod):
|
||||
return QuantType.W4A8
|
||||
else:
|
||||
return QuantType.NONE
|
||||
|
||||
def update_expert_map(self, new_expert_map):
|
||||
self.expert_map = new_expert_map
|
||||
@@ -334,7 +355,8 @@ class AscendFusedMoE(FusedMoE):
|
||||
hidden_states=hidden_states,
|
||||
router_logits=router_logits,
|
||||
replace_allreduce=forward_context.sp_enabled,
|
||||
enable_shared_expert_dp=self.enable_shared_expert_dp)
|
||||
enable_shared_expert_dp=self.enable_shared_expert_dp,
|
||||
quant_type=self.quant_type)
|
||||
|
||||
if isinstance(hidden_states, tuple):
|
||||
hidden_states, pertoken_scale = hidden_states
|
||||
|
||||
Reference in New Issue
Block a user