[V1] Make V1 engine backward compatible (#637)

### What this PR does / why we need it?
Enforce eager mode in the V1 engine until the upcoming CANN and
torch_npu releases add support for graph compilation on NPU.

### Does this PR introduce _any_ user-facing change?
After this change, users will no longer need to manually set
enforce_eager=True.
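
For illustration, a minimal offline-inference sketch (the model name is a placeholder, not part of this PR). With this patch the NPU platform plugin overrides enforce_eager itself, so the flag can simply be omitted:

```python
# Minimal offline-inference sketch (model name is a placeholder).
# After this patch, enforce_eager no longer needs to be passed explicitly:
# the NPU platform plugin overrides it to True until CANN/torch_npu
# ship graph-compilation support.
from vllm import LLM, SamplingParams

prompts = ["Hello, my name is", "The future of AI is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")  # no enforce_eager=True needed
for output in llm.generate(prompts, sampling_params):
    print(output.outputs[0].text)
```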

### How was this patch tested?
Tested with regular offline inference examples.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>

Commit d785e78563 by yiz-liu, committed via GitHub on 2025-04-24 17:20:11 +08:00 (parent bd70ce828c).
4 changed files with 43 additions and 46 deletions.


```diff
@@ -47,7 +47,6 @@ def test_models_distributed(model: str,
     with VllmRunner(model,
                     dtype=dtype,
                     tensor_parallel_size=4,
                     distributed_executor_backend=distributed_executor_backend,
-                    enforce_eager=True,
                     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
```


```diff
@@ -22,7 +22,6 @@ Run `pytest tests/ops/test_fused_moe.py`.
 """
 
 import pytest
 import torch
-from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm_ascend.ops.fused_moe import fused_experts
@@ -68,36 +67,31 @@ def test_fused_experts(
     dtype: torch.dtype,
     device: str,
 ):
-    vllm_config = VllmConfig()
-    with set_current_vllm_config(vllm_config):
-        a = torch.randn((m, k), device=device, dtype=dtype) / 10
-        w1 = torch.randn((e, 2 * n, k), device=device, dtype=dtype) / 10
-        w2 = torch.randn((e, k, n), device=device, dtype=dtype) / 10
+    a = torch.randn((m, k), device=device, dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device=device, dtype=dtype) / 10
+    w2 = torch.randn((e, k, n), device=device, dtype=dtype) / 10
 
-        score = torch.randn((m, e), device=device, dtype=dtype)
-        if ep_size > 1:
-            local_e = e // ep_size
-            e_ids = torch.randint(0,
-                                  e, (local_e, ),
-                                  device=device,
-                                  dtype=torch.int32)
-            e_map = torch.full((e, ), -1, device=device, dtype=torch.int32)
-            e_map[e_ids] = torch.arange(local_e,
-                                        device=device,
-                                        dtype=torch.int32)
-            w1 = w1[e_ids]
-            w2 = w2[e_ids]
-        else:
-            e_map = None
+    score = torch.randn((m, e), device=device, dtype=dtype)
+    if ep_size > 1:
+        local_e = e // ep_size
+        e_ids = torch.randint(0,
+                              e, (local_e, ),
+                              device=device,
+                              dtype=torch.int32)
+        e_map = torch.full((e, ), -1, device=device, dtype=torch.int32)
+        e_map[e_ids] = torch.arange(local_e, device=device, dtype=torch.int32)
+        w1 = w1[e_ids]
+        w2 = w2[e_ids]
+    else:
+        e_map = None
 
-        score = torch.softmax(score, dim=-1, dtype=dtype)
-        topk_weights, topk_ids = torch.topk(score, topk)
-        topk_ids = topk_ids.to(torch.int32)
+    score = torch.softmax(score, dim=-1, dtype=dtype)
+    topk_weights, topk_ids = torch.topk(score, topk)
+    topk_ids = topk_ids.to(torch.int32)
 
-        output = fused_experts(a, w1, w2, topk_weights, topk_ids, topk, e_map)
-        torch_output = torch_moe(a, w1, w2, topk_weights, topk_ids, topk,
-                                 e_map)
-        # TODO: The native params are: atol=2e-2, rtol=0, maybe related to the nan problem
-        torch.testing.assert_close(output, torch_output, atol=4e-2, rtol=1)
+    output = fused_experts(a, w1, w2, topk_weights, topk_ids, topk, e_map)
+    torch_output = torch_moe(a, w1, w2, topk_weights, topk_ids, topk, e_map)
+    # TODO: The native params are: atol=2e-2, rtol=0, maybe related to the nan problem
+    torch.testing.assert_close(output, torch_output, atol=4e-2, rtol=1)
 
     torch.npu.empty_cache()
```
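
For context on what this test exercises: `e_map` maps global expert ids to local ones, with -1 marking experts owned by other ranks. Below is a hedged sketch of how such a map is typically consumed by a dense reference implementation (an illustration only, not the repository's actual `torch_moe` helper):

```python
# Hedged sketch of consuming an expert map like e_map (not the actual
# torch_moe helper): tokens routed to experts that are not present on
# this rank (e_map[expert] == -1) contribute nothing locally.
import torch

def reference_moe(a, w1, w2, topk_weights, topk_ids, e_map=None):
    out = torch.zeros_like(a)
    for token in range(a.shape[0]):
        for slot in range(topk_ids.shape[1]):
            expert = int(topk_ids[token, slot])
            if e_map is not None:
                expert = int(e_map[expert])
                if expert < 0:
                    continue  # expert lives on another rank
            # SwiGLU-style expert MLP: w1 yields gate and up projections
            gate_up = a[token] @ w1[expert].t()
            gate, up = gate_up.chunk(2)
            h = torch.nn.functional.silu(gate) * up
            out[token] += topk_weights[token, slot] * (h @ w2[expert].t())
    return out
```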


```diff
@@ -52,7 +52,7 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
 
     with VllmRunner(model,
                     max_model_len=8192,
                     dtype=dtype,
-                    enforce_eager=True,
+                    enforce_eager=False,
                     gpu_memory_utilization=0.7) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
```


```diff
@@ -115,29 +115,33 @@ class NPUPlatform(Platform):
         from vllm.config import CompilationLevel  # noqa: E402
         compilation_config = vllm_config.compilation_config
 
-        enforce_eager_flag = False
-        # Check whether the eager mode is configured
-        try:
-            enforce_eager_flag = vllm_config.model_config.enforce_eager
-        except Exception:
-            logger.warning(
-                "There is currently no enforce_eager mode configured, the default value of enforce_eager=False is used"
-            )
+        if vllm_config.model_config is None:
+            logger.warning("Model config is missing. This may indicate "
+                           "that we are running a test case")
+            enforce_eager = False
+        else:
+            enforce_eager = getattr(vllm_config.model_config, "enforce_eager",
+                                    False)
 
-        if enforce_eager_flag or compilation_config.level == CompilationLevel.NO_COMPILATION:
-            logger.warning(
-                "Compilation level PIECEWISE is not enable on NPU now, current compilation level to NO_COMPILATION"
-            )
+        # TODO(Yizhou): Override the value of enforce_eager to True before
+        # the CANN and torch_npu support NPU compilation.
+        enforce_eager = True
+        logger.warning(
+            "NPU compilation support pending. Will be available in future CANN and "
+            "torch_npu releases. Using default: enforce_eager=True")
+
+        if enforce_eager or compilation_config.level == CompilationLevel.NO_COMPILATION:
+            logger.info("Compilation disabled, using eager mode by default")
             compilation_config.level = CompilationLevel.NO_COMPILATION
         elif compilation_config.level != CompilationLevel.PIECEWISE:
             logger.warning(
-                "Compilation level %s is not enable on NPU now, forcing compilation level to NO_COMPILATION",
+                "NPU does not support %s compilation level. Setting level to NO_COMPILATION",
                 compilation_config.level)
             compilation_config.level = CompilationLevel.NO_COMPILATION
         else:
             logger.info(
-                "Compilation level PIECEWISE is enable on NPU now, But use_inductor is no support, only use npu_graph now"
-            )
+                "PIECEWISE compilation enabled on NPU. use_inductor not supported - "
+                "using only ACL Graph mode")
             compilation_config.use_inductor = False
             compilation_config.splitting_ops.extend(
                 ["vllm.unified_ascend_attention_with_output"])
```