[Perf] Delete redundant operations in model_runner and forward_context (#3677)
### What this PR does / why we need it? Remove redundant operations from `model_runner` and `forward_context`. This optimization can significantly reduce the idle time (bubble) before decoding when running models with small parameter counts (e.g., Qwen/Qwen2.5-0.5B). Testing on 800I A2, bubble is reduced from 3.8ms to 2.8ms : Before <img width="1655" height="696" alt="image" src="https://github.com/user-attachments/assets/d7608e52-2438-46dd-8fc9-391fd6274495" /> After <img width="1607" height="774" alt="image" src="https://github.com/user-attachments/assets/56daf081-2dba-4d2e-99d4-e055187d9806" /> ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.1 --------- Signed-off-by: realliujiaxu <realliujiaxu@163.com>
This commit is contained in:
@@ -11,7 +11,8 @@ from vllm.forward_context import (BatchDescriptor, get_forward_context,
|
||||
set_forward_context)
|
||||
|
||||
import vllm_ascend.envs as envs_ascend
|
||||
from vllm_ascend.utils import enable_sp, is_moe_model, version_check
|
||||
from vllm_ascend.utils import (enable_sp, has_layer_idx, is_moe_model,
|
||||
version_check)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm_ascend.ops.weight_prefetch import WeightPrefetchMethod
|
||||
@@ -137,9 +138,7 @@ def set_ascend_forward_context(
|
||||
# set layer_idx to enable optimization features that depend on this information.
|
||||
# This is only applicable to models that contain these necessary attributes.
|
||||
forward_context.layer_idx = None
|
||||
if model_instance is not None and \
|
||||
hasattr(model_instance, "model") and \
|
||||
hasattr(model_instance.model, "start_layer"):
|
||||
if has_layer_idx(model_instance):
|
||||
forward_context.layer_idx = model_instance.model.start_layer
|
||||
|
||||
# TODO(rjg-lyh): refactor mlp weight prefetch method
|
||||
|
||||
Reference in New Issue
Block a user