[V1] Make V1 engine backward compatible (#637)
### What this PR does / why we need it?

Enforce eager mode in the V1 engine until the upcoming CANN and torch_npu releases add support for NPU compilation.

### Does this PR introduce _any_ user-facing change?

After this change, users will no longer need to manually set `enforce_eager=True`.

### How was this patch tested?

Tested with the regular offline inference examples.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
This commit is contained in:
@@ -47,7 +47,6 @@ def test_models_distributed(model: str,
|
|||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
tensor_parallel_size=4,
|
tensor_parallel_size=4,
|
||||||
distributed_executor_backend=distributed_executor_backend,
|
distributed_executor_backend=distributed_executor_backend,
|
||||||
enforce_eager=True,
|
|
||||||
) as vllm_model:
|
) as vllm_model:
|
||||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
|
|
||||||
|
|||||||
@@ -22,7 +22,6 @@ Run `pytest tests/ops/test_fused_moe.py`.
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
from vllm.config import VllmConfig, set_current_vllm_config
|
|
||||||
from vllm.model_executor.layers.activation import SiluAndMul
|
from vllm.model_executor.layers.activation import SiluAndMul
|
||||||
|
|
||||||
from vllm_ascend.ops.fused_moe import fused_experts
|
from vllm_ascend.ops.fused_moe import fused_experts
|
||||||
@@ -68,36 +67,31 @@ def test_fused_experts(
|
|||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
device: str,
|
device: str,
|
||||||
):
|
):
|
||||||
vllm_config = VllmConfig()
|
a = torch.randn((m, k), device=device, dtype=dtype) / 10
|
||||||
with set_current_vllm_config(vllm_config):
|
w1 = torch.randn((e, 2 * n, k), device=device, dtype=dtype) / 10
|
||||||
a = torch.randn((m, k), device=device, dtype=dtype) / 10
|
w2 = torch.randn((e, k, n), device=device, dtype=dtype) / 10
|
||||||
w1 = torch.randn((e, 2 * n, k), device=device, dtype=dtype) / 10
|
|
||||||
w2 = torch.randn((e, k, n), device=device, dtype=dtype) / 10
|
|
||||||
|
|
||||||
score = torch.randn((m, e), device=device, dtype=dtype)
|
score = torch.randn((m, e), device=device, dtype=dtype)
|
||||||
|
|
||||||
if ep_size > 1:
|
if ep_size > 1:
|
||||||
local_e = e // ep_size
|
local_e = e // ep_size
|
||||||
e_ids = torch.randint(0,
|
e_ids = torch.randint(0,
|
||||||
e, (local_e, ),
|
e, (local_e, ),
|
||||||
device=device,
|
device=device,
|
||||||
dtype=torch.int32)
|
dtype=torch.int32)
|
||||||
e_map = torch.full((e, ), -1, device=device, dtype=torch.int32)
|
e_map = torch.full((e, ), -1, device=device, dtype=torch.int32)
|
||||||
e_map[e_ids] = torch.arange(local_e,
|
e_map[e_ids] = torch.arange(local_e, device=device, dtype=torch.int32)
|
||||||
device=device,
|
w1 = w1[e_ids]
|
||||||
dtype=torch.int32)
|
w2 = w2[e_ids]
|
||||||
w1 = w1[e_ids]
|
else:
|
||||||
w2 = w2[e_ids]
|
e_map = None
|
||||||
else:
|
|
||||||
e_map = None
|
|
||||||
|
|
||||||
score = torch.softmax(score, dim=-1, dtype=dtype)
|
score = torch.softmax(score, dim=-1, dtype=dtype)
|
||||||
topk_weights, topk_ids = torch.topk(score, topk)
|
topk_weights, topk_ids = torch.topk(score, topk)
|
||||||
topk_ids = topk_ids.to(torch.int32)
|
topk_ids = topk_ids.to(torch.int32)
|
||||||
|
|
||||||
output = fused_experts(a, w1, w2, topk_weights, topk_ids, topk, e_map)
|
output = fused_experts(a, w1, w2, topk_weights, topk_ids, topk, e_map)
|
||||||
torch_output = torch_moe(a, w1, w2, topk_weights, topk_ids, topk,
|
torch_output = torch_moe(a, w1, w2, topk_weights, topk_ids, topk, e_map)
|
||||||
e_map)
|
# TODO: The native params are: atol=2e-2, rtol=0, maybe related to the nan problem
|
||||||
# TODO: The native params are: atol=2e-2, rtol=0, maybe related to the nan problem
|
torch.testing.assert_close(output, torch_output, atol=4e-2, rtol=1)
|
||||||
torch.testing.assert_close(output, torch_output, atol=4e-2, rtol=1)
|
|
||||||
torch.npu.empty_cache()
|
torch.npu.empty_cache()
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
|
|||||||
with VllmRunner(model,
|
with VllmRunner(model,
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
enforce_eager=True,
|
enforce_eager=False,
|
||||||
gpu_memory_utilization=0.7) as vllm_model:
|
gpu_memory_utilization=0.7) as vllm_model:
|
||||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
|
|
||||||
|
|||||||
@@ -115,29 +115,33 @@ class NPUPlatform(Platform):
|
|||||||
from vllm.config import CompilationLevel # noqa: E402
|
from vllm.config import CompilationLevel # noqa: E402
|
||||||
compilation_config = vllm_config.compilation_config
|
compilation_config = vllm_config.compilation_config
|
||||||
|
|
||||||
enforce_eager_flag = False
|
if vllm_config.model_config is None:
|
||||||
# Check whether the eager mode is configured
|
logger.warning("Model config is missing. This may indicate "
|
||||||
try:
|
"that we are running a test case")
|
||||||
enforce_eager_flag = vllm_config.model_config.enforce_eager
|
enforce_eager = False
|
||||||
except Exception:
|
else:
|
||||||
logger.warning(
|
enforce_eager = getattr(vllm_config.model_config, "enforce_eager",
|
||||||
"There is currently no enforce_eager mode configured, the default value of enforce_eager=False is used"
|
False)
|
||||||
)
|
|
||||||
|
|
||||||
if enforce_eager_flag or compilation_config.level == CompilationLevel.NO_COMPILATION:
|
# TODO(Yizhou): Override the value of enforce_eager to True before
|
||||||
logger.warning(
|
# the CANN and torch_npu support NPU compilation.
|
||||||
"Compilation level PIECEWISE is not enable on NPU now, current compilation level to NO_COMPILATION"
|
enforce_eager = True
|
||||||
)
|
logger.warning(
|
||||||
|
"NPU compilation support pending. Will be available in future CANN and "
|
||||||
|
"torch_npu releases. Using default: enforce_eager=True")
|
||||||
|
|
||||||
|
if enforce_eager or compilation_config.level == CompilationLevel.NO_COMPILATION:
|
||||||
|
logger.info("Compilation disabled, using eager mode by default")
|
||||||
compilation_config.level = CompilationLevel.NO_COMPILATION
|
compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||||
elif compilation_config.level != CompilationLevel.PIECEWISE:
|
elif compilation_config.level != CompilationLevel.PIECEWISE:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Compilation level %s is not enable on NPU now, forcing compilation level to NO_COMPILATION",
|
"NPU does not support %s compilation level. Setting level to NO_COMPILATION",
|
||||||
compilation_config.level)
|
compilation_config.level)
|
||||||
compilation_config.level = CompilationLevel.NO_COMPILATION
|
compilation_config.level = CompilationLevel.NO_COMPILATION
|
||||||
else:
|
else:
|
||||||
logger.info(
|
logger.info(
|
||||||
"Compilation level PIECEWISE is enable on NPU now, But use_inductor is no support, only use npu_graph now"
|
"PIECEWISE compilation enabled on NPU. use_inductor not supported - "
|
||||||
)
|
"using only ACL Graph mode")
|
||||||
compilation_config.use_inductor = False
|
compilation_config.use_inductor = False
|
||||||
compilation_config.splitting_ops.extend(
|
compilation_config.splitting_ops.extend(
|
||||||
["vllm.unified_ascend_attention_with_output"])
|
["vllm.unified_ascend_attention_with_output"])
|
||||||
|
|||||||
Reference in New Issue
Block a user