diff --git a/tests/multicard/test_offline_inference_distributed.py b/tests/multicard/test_offline_inference_distributed.py index a41996d..dfc6675 100644 --- a/tests/multicard/test_offline_inference_distributed.py +++ b/tests/multicard/test_offline_inference_distributed.py @@ -47,7 +47,6 @@ def test_models_distributed(model: str, dtype=dtype, tensor_parallel_size=4, distributed_executor_backend=distributed_executor_backend, - enforce_eager=True, ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/ops/test_fused_moe.py b/tests/ops/test_fused_moe.py index be13f12..7b21307 100644 --- a/tests/ops/test_fused_moe.py +++ b/tests/ops/test_fused_moe.py @@ -22,7 +22,6 @@ Run `pytest tests/ops/test_fused_moe.py`. import pytest import torch -from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm_ascend.ops.fused_moe import fused_experts @@ -68,36 +67,31 @@ def test_fused_experts( dtype: torch.dtype, device: str, ): - vllm_config = VllmConfig() - with set_current_vllm_config(vllm_config): - a = torch.randn((m, k), device=device, dtype=dtype) / 10 - w1 = torch.randn((e, 2 * n, k), device=device, dtype=dtype) / 10 - w2 = torch.randn((e, k, n), device=device, dtype=dtype) / 10 + a = torch.randn((m, k), device=device, dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device=device, dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device=device, dtype=dtype) / 10 - score = torch.randn((m, e), device=device, dtype=dtype) + score = torch.randn((m, e), device=device, dtype=dtype) - if ep_size > 1: - local_e = e // ep_size - e_ids = torch.randint(0, - e, (local_e, ), - device=device, - dtype=torch.int32) - e_map = torch.full((e, ), -1, device=device, dtype=torch.int32) - e_map[e_ids] = torch.arange(local_e, - device=device, - dtype=torch.int32) - w1 = w1[e_ids] - w2 = w2[e_ids] - else: - e_map = None + if ep_size > 1: + local_e = e // ep_size + e_ids = torch.randint(0, + e, (local_e, ), + device=device, + dtype=torch.int32) + e_map = torch.full((e, ), -1, device=device, dtype=torch.int32) + e_map[e_ids] = torch.arange(local_e, device=device, dtype=torch.int32) + w1 = w1[e_ids] + w2 = w2[e_ids] + else: + e_map = None - score = torch.softmax(score, dim=-1, dtype=dtype) - topk_weights, topk_ids = torch.topk(score, topk) - topk_ids = topk_ids.to(torch.int32) + score = torch.softmax(score, dim=-1, dtype=dtype) + topk_weights, topk_ids = torch.topk(score, topk) + topk_ids = topk_ids.to(torch.int32) - output = fused_experts(a, w1, w2, topk_weights, topk_ids, topk, e_map) - torch_output = torch_moe(a, w1, w2, topk_weights, topk_ids, topk, - e_map) - # TODO: The native params are: atol=2e-2, rtol=0, maybe related to the nan problem - torch.testing.assert_close(output, torch_output, atol=4e-2, rtol=1) + output = fused_experts(a, w1, w2, topk_weights, topk_ids, topk, e_map) + torch_output = torch_moe(a, w1, w2, topk_weights, topk_ids, topk, e_map) + # TODO: The native params are: atol=2e-2, rtol=0, maybe related to the nan problem + torch.testing.assert_close(output, torch_output, atol=4e-2, rtol=1) torch.npu.empty_cache() diff --git a/tests/singlecard/test_offline_inference.py b/tests/singlecard/test_offline_inference.py index 5c10479..8d95556 100644 --- a/tests/singlecard/test_offline_inference.py +++ b/tests/singlecard/test_offline_inference.py @@ -52,7 +52,7 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None: with VllmRunner(model, max_model_len=8192, dtype=dtype, - enforce_eager=True, + enforce_eager=False, gpu_memory_utilization=0.7) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index c82d4e8..79e9486 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -115,29 +115,33 @@ class NPUPlatform(Platform): from vllm.config import CompilationLevel # noqa: E402 compilation_config = vllm_config.compilation_config - enforce_eager_flag = False - # Check whether the eager mode is configured - try: - enforce_eager_flag = vllm_config.model_config.enforce_eager - except Exception: - logger.warning( - "There is currently no enforce_eager mode configured, the default value of enforce_eager=False is used" - ) + if vllm_config.model_config is None: + logger.warning("Model config is missing. This may indicate " + "that we are running a test case") + enforce_eager = False + else: + enforce_eager = getattr(vllm_config.model_config, "enforce_eager", + False) - if enforce_eager_flag or compilation_config.level == CompilationLevel.NO_COMPILATION: - logger.warning( - "Compilation level PIECEWISE is not enable on NPU now, current compilation level to NO_COMPILATION" - ) + # TODO(Yizhou): Override the value of enforce_eager to True before + # the CANN and torch_npu support NPU compilation. + enforce_eager = True + logger.warning( + "NPU compilation support pending. Will be available in future CANN and " + "torch_npu releases. Using default: enforce_eager=True") + + if enforce_eager or compilation_config.level == CompilationLevel.NO_COMPILATION: + logger.info("Compilation disabled, using eager mode by default") compilation_config.level = CompilationLevel.NO_COMPILATION elif compilation_config.level != CompilationLevel.PIECEWISE: logger.warning( - "Compilation level %s is not enable on NPU now, forcing compilation level to NO_COMPILATION", + "NPU does not support %s compilation level. Setting level to NO_COMPILATION", compilation_config.level) compilation_config.level = CompilationLevel.NO_COMPILATION else: logger.info( - "Compilation level PIECEWISE is enable on NPU now, But use_inductor is no support, only use npu_graph now" - ) + "PIECEWISE compilation enabled on NPU. use_inductor not supported - " + "using only ACL Graph mode") compilation_config.use_inductor = False compilation_config.splitting_ops.extend( ["vllm.unified_ascend_attention_with_output"])