[Fix] Fix update_aclgraph_sizes when running MoE models (#913)
### What this PR does / why we need it? Fix update_aclgraph_sizes when running MoE models. --------- Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
This commit is contained in:
@@ -119,6 +119,26 @@ class NPUPlatform(Platform):
|
||||
from vllm.config import CompilationLevel # noqa: E402
|
||||
compilation_config = vllm_config.compilation_config
|
||||
model_config = vllm_config.model_config
|
||||
additional_config = vllm_config.additional_config
|
||||
parallel_config = vllm_config.parallel_config
|
||||
cache_config = vllm_config.cache_config
|
||||
|
||||
if parallel_config:
|
||||
# Default value for expert tensor parallel size
|
||||
parallel_config.expert_tensor_parallel_size = parallel_config.tensor_parallel_size
|
||||
|
||||
# NOTE: When enable_expert_parallel is True, we follow vLLM convention:
|
||||
# ep_size = world_size, which means expert_tensor_parallel_size must be 1
|
||||
if (additional_config
|
||||
and "expert_tensor_parallel_size" in additional_config
|
||||
and not parallel_config.enable_expert_parallel):
|
||||
parallel_config.expert_tensor_parallel_size = int(
|
||||
additional_config["expert_tensor_parallel_size"])
|
||||
|
||||
# Calculate expert parallel size based on world size
|
||||
parallel_config.expert_parallel_size = (
|
||||
parallel_config.world_size //
|
||||
parallel_config.expert_tensor_parallel_size)
|
||||
|
||||
if model_config is None:
|
||||
logger.warning("Model config is missing. This may indicate "
|
||||
@@ -127,9 +147,9 @@ class NPUPlatform(Platform):
|
||||
else:
|
||||
enforce_eager = getattr(model_config, "enforce_eager", False)
|
||||
|
||||
if vllm_config.additional_config is not None:
|
||||
enable_graph_mode = vllm_config.additional_config.get(
|
||||
"enable_graph_mode", False)
|
||||
if additional_config is not None:
|
||||
enable_graph_mode = additional_config.get("enable_graph_mode",
|
||||
False)
|
||||
if enable_graph_mode:
|
||||
if enforce_eager:
|
||||
raise RuntimeError(
|
||||
@@ -139,7 +159,7 @@ class NPUPlatform(Platform):
|
||||
logger.warning(
|
||||
"NPU graph mode is still experimental and not supported for V1 without mla currently, "
|
||||
"it has been disabled automatically.")
|
||||
vllm_config.additional_config["enable_graph_mode"] = False
|
||||
additional_config["enable_graph_mode"] = False
|
||||
if model_config:
|
||||
model_type = model_config.hf_config.model_type
|
||||
if "deepseek" not in model_type:
|
||||
@@ -178,7 +198,6 @@ class NPUPlatform(Platform):
|
||||
["vllm.unified_ascend_attention_with_output"])
|
||||
update_aclgraph_sizes(vllm_config)
|
||||
|
||||
parallel_config = vllm_config.parallel_config
|
||||
if parallel_config and parallel_config.worker_cls == "auto":
|
||||
if envs.VLLM_USE_V1:
|
||||
parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"
|
||||
@@ -190,7 +209,6 @@ class NPUPlatform(Platform):
|
||||
else:
|
||||
parallel_config.worker_cls = "vllm_ascend.worker.worker.NPUWorker"
|
||||
|
||||
cache_config = vllm_config.cache_config
|
||||
if cache_config:
|
||||
if cache_config.block_size is None:
|
||||
cache_config.block_size = 128
|
||||
@@ -202,11 +220,10 @@ class NPUPlatform(Platform):
|
||||
|
||||
if envs.VLLM_USE_V1:
|
||||
# Activate custom ops for v1.
|
||||
vllm_config.compilation_config.custom_ops = ["all"]
|
||||
compilation_config.custom_ops = ["all"]
|
||||
# If ascend_scheduler_config exists in additional_config,
|
||||
# extents original scheduler_config to use AscendScheduler.
|
||||
|
||||
additional_config = vllm_config.additional_config
|
||||
if additional_config and additional_config.get(
|
||||
"ascend_scheduler_config", None) is not None:
|
||||
additional_scheduler_config = additional_config.get(
|
||||
|
||||
Reference in New Issue
Block a user