Revert "GMM custom operator optimization in small batch scenarios (vllm-project#7100)" (#7557)
### What this PR does / why we need it? This reverts commit42bcad7e9b. The commit cause accuracy decrease of qwen3Next, 150 items of gsm8k, 98 -> 91. - vLLM version: v0.18.0 - vLLM main:6a9cceb219Signed-off-by: Your Name <you@example.com> Co-authored-by: Your Name <you@example.com>
This commit is contained in:
@@ -17,7 +17,6 @@
|
||||
#
|
||||
import torch
|
||||
import torch_npu
|
||||
from vllm.forward_context import get_forward_context
|
||||
|
||||
from vllm_ascend.device.mxfp_compat import (
|
||||
FLOAT4_E2M1FN_X2_DTYPE,
|
||||
@@ -28,8 +27,6 @@ from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
|
||||
|
||||
|
||||
class BaseDeviceAdaptor:
|
||||
small_batch_gmm_batch_num = 16
|
||||
|
||||
@classmethod
|
||||
def reshape_and_cache(cls, key, value, key_cache, value_cache, slot_mapping):
|
||||
torch_npu._npu_reshape_and_cache(
|
||||
@@ -49,32 +46,17 @@ class BaseDeviceAdaptor:
|
||||
active_expert_range=None,
|
||||
quant_mode: int = -1,
|
||||
):
|
||||
# In small batch and non-quantization scenarios, npu_moe_init_routing_v2 is more efficient.
|
||||
# It is expected that further improvements will be made after it is incorporated into CANN on June 30th.
|
||||
if quant_mode == -1 and get_forward_context().num_tokens <= DeviceOperator.small_batch_gmm_batch_num:
|
||||
return torch_npu.npu_moe_init_routing_v2(
|
||||
hidden_states,
|
||||
topk_ids,
|
||||
scale=scale,
|
||||
active_num=active_num,
|
||||
expert_num=expert_num,
|
||||
expert_tokens_num_type=2,
|
||||
expert_tokens_num_flag=expert_tokens_num_flag,
|
||||
active_expert_range=active_expert_range,
|
||||
quant_mode=quant_mode,
|
||||
)
|
||||
else:
|
||||
return torch.ops._C_ascend.npu_moe_init_routing_custom(
|
||||
hidden_states,
|
||||
topk_ids,
|
||||
scale=scale,
|
||||
active_num=active_num,
|
||||
expert_num=expert_num,
|
||||
expert_tokens_num_type=expert_tokens_num_type,
|
||||
expert_tokens_num_flag=expert_tokens_num_flag,
|
||||
active_expert_range=active_expert_range,
|
||||
quant_mode=quant_mode,
|
||||
)
|
||||
return torch.ops._C_ascend.npu_moe_init_routing_custom(
|
||||
hidden_states,
|
||||
topk_ids,
|
||||
scale=scale,
|
||||
active_num=active_num,
|
||||
expert_num=expert_num,
|
||||
expert_tokens_num_type=expert_tokens_num_type,
|
||||
expert_tokens_num_flag=expert_tokens_num_flag,
|
||||
active_expert_range=active_expert_range,
|
||||
quant_mode=quant_mode,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def npu_dynamic_quant(
|
||||
|
||||
Reference in New Issue
Block a user