[Perf] Delete redundant operations in model_runner and forward_context (#3677)
### What this PR does / why we need it? Remove redundant operations from `model_runner` and `forward_context`. This optimization can significantly reduce the idle time (bubble) before decoding when running models with small parameter counts (e.g., Qwen/Qwen2.5-0.5B). Testing on 800I A2, bubble is reduced from 3.8ms to 2.8ms : Before <img width="1655" height="696" alt="image" src="https://github.com/user-attachments/assets/d7608e52-2438-46dd-8fc9-391fd6274495" /> After <img width="1607" height="774" alt="image" src="https://github.com/user-attachments/assets/56daf081-2dba-4d2e-99d4-e055187d9806" /> ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.1 --------- Signed-off-by: realliujiaxu <realliujiaxu@163.com>
This commit is contained in:
@@ -68,6 +68,8 @@ def test_select_moe_comm_method(soc_version, enable_expert_parallel,
|
||||
with patch('vllm_ascend.worker.model_runner_v1.get_ascend_soc_version',
|
||||
return_value=soc_version), \
|
||||
patch('vllm_ascend.worker.model_runner_v1.is_global_first_rank',
|
||||
return_value=True), \
|
||||
patch('vllm_ascend.worker.model_runner_v1.is_moe_model',
|
||||
return_value=True):
|
||||
|
||||
# Bind the real method to the mock object
|
||||
@@ -102,6 +104,8 @@ def test_select_moe_comm_method_unsupported_soc():
|
||||
return_value=unsupported_soc), \
|
||||
patch('vllm_ascend.worker.model_runner_v1.is_global_first_rank',
|
||||
return_value=True), \
|
||||
patch('vllm_ascend.worker.model_runner_v1.is_moe_model',
|
||||
return_value=True), \
|
||||
pytest.raises(ValueError, match=f"Unsupported soc_version: {unsupported_soc}"):
|
||||
|
||||
NPUModelRunner._select_moe_comm_method(mock_runner, 100, False)
|
||||
|
||||
@@ -11,7 +11,8 @@ from vllm.forward_context import (BatchDescriptor, get_forward_context,
|
||||
set_forward_context)
|
||||
|
||||
import vllm_ascend.envs as envs_ascend
|
||||
from vllm_ascend.utils import enable_sp, is_moe_model, version_check
|
||||
from vllm_ascend.utils import (enable_sp, has_layer_idx, is_moe_model,
|
||||
version_check)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm_ascend.ops.weight_prefetch import WeightPrefetchMethod
|
||||
@@ -137,9 +138,7 @@ def set_ascend_forward_context(
|
||||
# set layer_idx to enable optimization features that depend on this information.
|
||||
# This is only applicable to models that contain these necessary attributes.
|
||||
forward_context.layer_idx = None
|
||||
if model_instance is not None and \
|
||||
hasattr(model_instance, "model") and \
|
||||
hasattr(model_instance.model, "start_layer"):
|
||||
if has_layer_idx(model_instance):
|
||||
forward_context.layer_idx = model_instance.model.start_layer
|
||||
|
||||
# TODO(rjg-lyh): refactor mlp weight prefetch method
|
||||
|
||||
@@ -37,7 +37,7 @@ _MoECommMethods: Dict[Optional[MoECommType], MoECommMethod] = {}
|
||||
|
||||
def get_moe_comm_method(
|
||||
moe_comm_type: Optional[MoECommType]) -> Optional[MoECommMethod]:
|
||||
return _MoECommMethods.get(moe_comm_type)
|
||||
return _MoECommMethods.get(moe_comm_type, None)
|
||||
|
||||
|
||||
def setup_moe_comm_method(moe_config):
|
||||
|
||||
@@ -58,6 +58,7 @@ _DEFAULT_BUFFER_SIZE = 200
|
||||
_MIN_DP_BUFFER_SIZE = 50
|
||||
_IS_MOE_MODEL = None
|
||||
_ENABLE_SP = None
|
||||
_HAS_LAYER_IDX = None
|
||||
|
||||
|
||||
def is_310p():
|
||||
@@ -807,3 +808,14 @@ def version_check():
|
||||
if full_date >= "20250919":
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def has_layer_idx(model_instance: torch.nn.Module) -> bool:
|
||||
if model_instance is None:
|
||||
return False
|
||||
|
||||
global _HAS_LAYER_IDX
|
||||
if _HAS_LAYER_IDX is None:
|
||||
_HAS_LAYER_IDX = hasattr(model_instance, "model") and \
|
||||
hasattr(model_instance.model, "start_layer")
|
||||
return _HAS_LAYER_IDX
|
||||
|
||||
@@ -136,7 +136,7 @@ from vllm_ascend.spec_decode.mtp_proposer import MtpProposer
|
||||
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
|
||||
AscendSocVersion, ProfileExecuteDuration,
|
||||
enable_sp, get_ascend_soc_version, is_310p,
|
||||
is_enable_nz, lmhead_tp_enable,
|
||||
is_enable_nz, is_moe_model, lmhead_tp_enable,
|
||||
prefill_context_parallel_enable,
|
||||
vllm_version_is)
|
||||
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
|
||||
@@ -515,11 +515,14 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
self.in_profile_run = False
|
||||
|
||||
self._init_mc2_tokens_capacity()
|
||||
if is_moe_model(vllm_config):
|
||||
self.reserved_mc2_mask = torch.zeros(
|
||||
self.mc2_tokens_capacity,
|
||||
dtype=torch.bool,
|
||||
device=self.device,
|
||||
)
|
||||
else:
|
||||
self.reserved_mc2_mask = None
|
||||
self.dynamic_eplb = self.ascend_config.dynamic_eplb or self.ascend_config.expert_map_record_path
|
||||
if self.dynamic_eplb:
|
||||
EPLBParamUtils.check_dynamic_eplb(self.ascend_config.dynamic_eplb)
|
||||
@@ -1497,9 +1500,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
self.query_lens = torch.from_numpy(num_scheduled_tokens)
|
||||
|
||||
# Copy the tensors to the NPU.
|
||||
self.input_ids[:total_num_scheduled_tokens].copy_(
|
||||
self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True)
|
||||
|
||||
self._prepare_input_ids(total_num_scheduled_tokens, cu_num_tokens)
|
||||
self.positions_cpu[total_num_scheduled_tokens:num_input_tokens].zero_()
|
||||
self.positions[:num_input_tokens].copy_(
|
||||
self.positions_cpu[:num_input_tokens], non_blocking=True)
|
||||
@@ -1521,16 +1522,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
self._update_graph_pad_size(with_prefill, maybe_padded_num_tokens)
|
||||
attn_metadata: dict[str, Any] = {}
|
||||
|
||||
# Prepare input_ids
|
||||
token_indices = (positions_np +
|
||||
req_indices * self.input_batch.token_ids_cpu.shape[1])
|
||||
torch.index_select(self.input_batch.token_ids_cpu_tensor.flatten(),
|
||||
0,
|
||||
torch.from_numpy(token_indices),
|
||||
out=self.input_ids_cpu[:total_num_scheduled_tokens])
|
||||
# Copy the tensors to the NPU.
|
||||
self._prepare_input_ids(total_num_scheduled_tokens, cu_num_tokens)
|
||||
|
||||
# _prepare_inputs may reorder the batch, so we must gather
|
||||
# multi-modal outputs after that to ensure the correct order
|
||||
if self.is_multimodal_model:
|
||||
@@ -2075,7 +2066,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
)
|
||||
|
||||
def _select_moe_comm_method(self, num_tokens: int,
|
||||
with_prefill: bool) -> MoECommType:
|
||||
with_prefill: bool) -> Optional[MoECommType]:
|
||||
"""1. If expert parallel is not enabled, we use all-gather since MC2 and all-to-all
|
||||
are designed for expert parallelism.
|
||||
2. If expert parallel is enabled, we need to consider the soc version and the
|
||||
@@ -2098,6 +2089,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
Returns:
|
||||
MoECommType: The selected MoE communication method.
|
||||
"""
|
||||
if not is_moe_model(self.vllm_config):
|
||||
return None
|
||||
|
||||
soc_version = get_ascend_soc_version()
|
||||
quant_type = getattr(self.vllm_config.model_config.hf_config,
|
||||
'moe_quantize', None)
|
||||
|
||||
Reference in New Issue
Block a user