enable mm allreduce test (#2192)
### What this PR does / why we need it?
This PR adds an e2e test for the npu_mm_all_reduce_base fusion kernel.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Not applicable (test-only change; the new e2e test is the verification).
- vLLM version: v0.10.0
- vLLM main: 5d5d419ca6
Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
This commit is contained in:
@@ -24,11 +24,14 @@ import os
 import subprocess
 import sys
 from pathlib import Path
+from unittest.mock import patch
 
 import pytest
+import torch_npu
 
 MODELS = ["Qwen/Qwen3-0.6B"]
 MOE_MODELS = ["Qwen/Qwen3-30B-A3B"]
+
+DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -147,3 +150,38 @@ def test_external_launcher_and_sleepmode():
     assert "Generated text:" in output
     assert "Sleep and wake up successfully!!" in output
     assert proc.returncode == 0
+
+
# e2e smoke test for the npu_mm_all_reduce_base fusion kernel: runs the
# offline external-launcher example in a subprocess with the
# VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE flag enabled.
@pytest.mark.skipif(
    DEVICE_NAME != "Ascend910B",
    reason="This test is only for Ascend910B devices.",
)
@pytest.mark.parametrize("model", MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1"})
def test_mm_allreduce(model):
    """Run the external-launcher example with matmul all-reduce enabled.

    Launches ``examples/offline_external_launcher.py`` as a subprocess for
    *model*; the fusion env var set by ``patch.dict`` above is passed through
    via ``os.environ.copy()``.  Asserts the run generated text and exited
    with return code 0.
    """
    # Resolve the example script relative to this file: the repo root is
    # assumed to be four levels up — TODO confirm if this test file moves.
    script = Path(
        __file__
    ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py"
    # Copy the (patched) environment so the child sees the fusion flag.
    env = os.environ.copy()
    cmd = [
        sys.executable,
        str(script),
        "--model",
        model,
        "--trust-remote-code",
    ]

    print(f"Running subprocess: {' '.join(cmd)}")
    # stderr is merged into stdout so one stream captures all output;
    # the 600 s timeout bounds a hung model load/generation.
    proc = subprocess.run(
        cmd,
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        timeout=600,
    )

    # Echo the child's combined output into the pytest log for debugging.
    output = proc.stdout.decode()
    print(output)

    assert "Generated text:" in output
    assert proc.returncode == 0
@@ -25,6 +25,7 @@ from torch.nn.parameter import Parameter
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               split_tensor_along_last_dim)
 from vllm.distributed.parallel_state import get_tp_group
+from vllm.logger import logger
 from vllm.model_executor.layers.linear import RowParallelLinear
 
 from vllm_ascend import envs
@@ -142,4 +143,5 @@ class AscendRowParallelLinear(RowParallelLinear):
 
 
 if envs.VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE:
+    logger.info("AscendRowParallelLinear: Matmul all-reduce is enabled. ")
     vllm.model_executor.layers.linear.RowParallelLinear = AscendRowParallelLinear
Reference in New Issue
Block a user