forked from EngineX-Ascend/enginex-ascend-910-vllm
init v0.11.0rc0
This commit is contained in:
@@ -14,7 +14,7 @@ _err() { _red "Error: $*" && exit 1; }
|
||||
|
||||
CURL_TIMEOUT=1
|
||||
CURL_COOLDOWN=5
|
||||
CURL_MAX_TRIES=180
|
||||
CURL_MAX_TRIES=300
|
||||
|
||||
function wait_url_ready() {
|
||||
local serve_name="$1"
|
||||
|
||||
@@ -32,7 +32,14 @@ from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
|
||||
BatchEncoding, BatchFeature)
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import TaskOption, _get_and_verify_dtype
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.10.2"):
|
||||
from vllm.config import TaskOption, _get_and_verify_dtype
|
||||
else:
|
||||
from vllm.config.model import TaskOption, _get_and_verify_dtype
|
||||
|
||||
from vllm.inputs import TextPrompt
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.transformers_utils.utils import maybe_model_redirect
|
||||
|
||||
@@ -57,8 +57,8 @@ function quickstart_online_test() {
|
||||
}
|
||||
|
||||
_info "====> Start simple_test"
|
||||
simple_test
|
||||
time simple_test
|
||||
_info "====> Start quickstart_offline_test"
|
||||
quickstart_offline_test
|
||||
time quickstart_offline_test
|
||||
_info "====> Start quickstart_online_test"
|
||||
quickstart_online_test
|
||||
time quickstart_online_test
|
||||
|
||||
@@ -59,4 +59,4 @@ function install_binary_test() {
|
||||
}
|
||||
|
||||
_info "====> Start install_binary_test"
|
||||
install_binary_test
|
||||
time install_binary_test
|
||||
|
||||
@@ -19,7 +19,12 @@
|
||||
|
||||
from typing import Dict, List, Optional, Sequence, Tuple, Union
|
||||
|
||||
from vllm.sequence import PromptLogprobs, SampleLogprobs
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.10.2"):
|
||||
from vllm.sequence import PromptLogprobs, SampleLogprobs
|
||||
else:
|
||||
from vllm.logprobs import PromptLogprobs, SampleLogprobs
|
||||
|
||||
TokensText = Tuple[List[int], str]
|
||||
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
model_name: "deepseek-ai/DeepSeek-V2-Lite"
|
||||
runner: "linux-aarch64-a2-2"
|
||||
hardware: "Atlas A2 Series"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.375
|
||||
value: 0.385
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.375
|
||||
value: 0.385
|
||||
tensor_parallel_size: 2
|
||||
batch_size: 32
|
||||
gpu_memory_utilization: 0.7
|
||||
apply_chat_template: False
|
||||
fewshot_as_multiturn: False
|
||||
trust_remote_code: True
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
|
||||
runner: "linux-aarch64-a2-1"
|
||||
hardware: "Atlas A2 Series"
|
||||
model: "vllm-vlm"
|
||||
tasks:
|
||||
- name: "mmmu_val"
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
model_name: "Qwen/Qwen3-30B-A3B"
|
||||
runner: "linux-aarch64-a2-2"
|
||||
hardware: "Atlas A2 Series"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
model_name: "Qwen/Qwen3-8B-Base"
|
||||
runner: "linux-aarch64-a2-1"
|
||||
hardware: "Atlas A2 Series"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
DeepSeek-V2-Lite.yaml
|
||||
Qwen3-8B-Base.yaml
|
||||
Qwen2.5-VL-7B-Instruct.yaml
|
||||
Qwen3-30B-A3B.yaml
|
||||
@@ -2,16 +2,28 @@
|
||||
|
||||
- **vLLM Version**: vLLM: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})), **vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }}))
|
||||
- **Software Environment**: **CANN**: {{ cann_version }}, **PyTorch**: {{ torch_version }}, **torch-npu**: {{ torch_npu_version }}
|
||||
- **Hardware Environment**: Atlas A2 Series
|
||||
- **Hardware Environment**: {{ hardware }}
|
||||
- **Parallel mode**: {{ parallel_mode }}
|
||||
- **Execution mode**: ACLGraph
|
||||
- **Execution mode**: {{ execution_model }}
|
||||
|
||||
**Command**:
|
||||
|
||||
```bash
|
||||
export MODEL_ARGS={{ model_args }}
|
||||
lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \
|
||||
{% if apply_chat_template %} --apply_chat_template {{ apply_chat_template }} {% endif %} {% if fewshot_as_multiturn %} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% endif %} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} {% if limit is defined and limit != "N/A" %} --limit {{ limit }} {% endif %} --batch_size {{ batch_size}}
|
||||
{% if apply_chat_template is defined and (apply_chat_template|string|lower in ["true", "1"]) -%}
|
||||
--apply_chat_template \
|
||||
{%- endif %}
|
||||
{% if fewshot_as_multiturn is defined and (fewshot_as_multiturn|string|lower in ["true", "1"]) -%}
|
||||
--fewshot_as_multiturn \
|
||||
{%- endif %}
|
||||
{% if num_fewshot is defined and num_fewshot != "N/A" -%}
|
||||
--num_fewshot {{ num_fewshot }} \
|
||||
{%- endif %}
|
||||
{% if limit is defined and limit != "N/A" -%}
|
||||
--limit {{ limit }} \
|
||||
{%- endif %}
|
||||
--batch_size {{ batch_size }}
|
||||
```
|
||||
|
||||
| Task | Metric | Value | Stderr |
|
||||
|
||||
@@ -69,6 +69,8 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
|
||||
if model_args.get('enable_expert_parallel', False):
|
||||
parallel_mode += " + EP"
|
||||
|
||||
execution_model = f"{'Eager' if model_args.get('enforce_eager', False) else 'ACLGraph'}"
|
||||
|
||||
report_content = template.render(
|
||||
vllm_version=env_config.vllm_version,
|
||||
vllm_commit=env_config.vllm_commit,
|
||||
@@ -77,6 +79,7 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
|
||||
cann_version=env_config.cann_version,
|
||||
torch_version=env_config.torch_version,
|
||||
torch_npu_version=env_config.torch_npu_version,
|
||||
hardware=eval_config.get("hardware", "unknown"),
|
||||
model_name=eval_config["model_name"],
|
||||
model_args=f"'{','.join(f'{k}={v}' for k, v in model_args.items())}'",
|
||||
model_type=eval_config.get("model", "vllm"),
|
||||
@@ -84,10 +87,11 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
|
||||
apply_chat_template=eval_config.get("apply_chat_template", True),
|
||||
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", True),
|
||||
limit=eval_config.get("limit", "N/A"),
|
||||
batch_size="auto",
|
||||
batch_size=eval_config.get("batch_size", "auto"),
|
||||
num_fewshot=eval_config.get("num_fewshot", "N/A"),
|
||||
rows=report_data["rows"],
|
||||
parallel_mode=parallel_mode)
|
||||
parallel_mode=parallel_mode,
|
||||
execution_model=execution_model)
|
||||
|
||||
report_output = os.path.join(
|
||||
report_dir, f"{os.path.basename(eval_config['model_name'])}.md")
|
||||
@@ -110,7 +114,7 @@ def test_lm_eval_correctness_param(config_filename, tp_size, report_dir,
|
||||
"apply_chat_template": eval_config.get("apply_chat_template", True),
|
||||
"fewshot_as_multiturn": eval_config.get("fewshot_as_multiturn", True),
|
||||
"limit": eval_config.get("limit", None),
|
||||
"batch_size": "auto",
|
||||
"batch_size": eval_config.get("batch_size", "auto"),
|
||||
}
|
||||
for s in ["num_fewshot", "fewshot_as_multiturn", "apply_chat_template"]:
|
||||
val = eval_config.get(s, None)
|
||||
|
||||
@@ -14,14 +14,24 @@ def test_e2e_ep_correctness(model_name):
|
||||
]
|
||||
max_tokens = 5
|
||||
|
||||
with VllmRunner(model_name, tensor_parallel_size=2,
|
||||
enforce_eager=True) as vllm_model:
|
||||
# FIXME: Really strange that chunked prefill might lead to different results, investigate further
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
tensor_parallel_size=2,
|
||||
additional_config={"ascend_scheduler_config": {
|
||||
"enabled": True
|
||||
}},
|
||||
enforce_eager=True) as vllm_model:
|
||||
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with VllmRunner(model_name,
|
||||
tensor_parallel_size=2,
|
||||
enable_expert_parallel=True,
|
||||
enforce_eager=True) as vllm_model:
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
tensor_parallel_size=2,
|
||||
enable_expert_parallel=True,
|
||||
additional_config={"ascend_scheduler_config": {
|
||||
"enabled": True
|
||||
}},
|
||||
enforce_eager=True) as vllm_model:
|
||||
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
|
||||
@@ -23,6 +23,7 @@ Run `pytest tests/test_offline_inference.py`.
|
||||
import os
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from modelscope import snapshot_download # type: ignore
|
||||
from vllm import SamplingParams
|
||||
|
||||
@@ -30,6 +31,15 @@ from tests.e2e.conftest import VllmRunner
|
||||
|
||||
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
|
||||
|
||||
QWEN_DENSE_MODELS = [
|
||||
"vllm-ascend/Qwen3-8B-W8A8", "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"
|
||||
]
|
||||
|
||||
DEEPSEEK_W4A8_MODELS = [
|
||||
"vllm-ascend/DeepSeek-V3-W4A8-Pruing",
|
||||
"vllm-ascend/DeepSeek-V3.1-W4A8-puring"
|
||||
]
|
||||
|
||||
|
||||
def test_models_distributed_QwQ():
|
||||
example_prompts = [
|
||||
@@ -61,8 +71,8 @@ def test_models_distributed_DeepSeek_multistream_moe():
|
||||
additional_config={
|
||||
"torchair_graph_config": {
|
||||
"enabled": True,
|
||||
"enable_multistream_moe": True,
|
||||
},
|
||||
"enable_multistream_moe": True,
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
},
|
||||
@@ -104,14 +114,15 @@ def test_models_distributed_Qwen3_W4A8DYNAMIC():
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
|
||||
@patch.dict(os.environ, {"VLLM_ASCEND_MLA_PA": "1"})
|
||||
def test_models_distributed_DeepSeek_W4A8DYNAMIC():
|
||||
def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
]
|
||||
max_tokens = 5
|
||||
with VllmRunner(
|
||||
snapshot_download("vllm-ascend/DeepSeek-V3-W4A8-Pruing"),
|
||||
snapshot_download(model),
|
||||
dtype="auto",
|
||||
tensor_parallel_size=2,
|
||||
quantization="ascend",
|
||||
@@ -150,3 +161,46 @@ def test_sp_for_qwen3_moe() -> None:
|
||||
enable_expert_parallel=True,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("enforce_eager", [True, False])
|
||||
@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
|
||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"})
|
||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM": "1"})
|
||||
def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model, enforce_eager):
|
||||
example_prompts = [
|
||||
"Hello, my name is",
|
||||
]
|
||||
max_tokens = 5
|
||||
|
||||
with VllmRunner(
|
||||
snapshot_download(model),
|
||||
max_model_len=8192,
|
||||
enforce_eager=enforce_eager,
|
||||
dtype="auto",
|
||||
tensor_parallel_size=2,
|
||||
quantization="ascend",
|
||||
) as vllm_model:
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("enforce_eager", [True, False])
|
||||
@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
|
||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"})
|
||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1"})
|
||||
def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(
|
||||
model, enforce_eager):
|
||||
example_prompts = [
|
||||
"Hello, my name is",
|
||||
]
|
||||
max_tokens = 5
|
||||
|
||||
with VllmRunner(
|
||||
snapshot_download(model),
|
||||
max_model_len=8192,
|
||||
enforce_eager=enforce_eager,
|
||||
dtype="auto",
|
||||
tensor_parallel_size=2,
|
||||
quantization="ascend",
|
||||
) as vllm_model:
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
@@ -116,20 +116,22 @@ def test_prefix_cache_with_ascend_scheduler(model: str,
|
||||
prefix_cache_output = vllm_model.generate_greedy(
|
||||
INPUT_PROMPTS, max_tokens)
|
||||
|
||||
with VllmRunner(model,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True,
|
||||
'enable_prefix_caching': True,
|
||||
"enable_chunked_prefill": True,
|
||||
},
|
||||
},
|
||||
enforce_eager=True,
|
||||
max_model_len=2048,
|
||||
tensor_parallel_size=2,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
|
||||
INPUT_PROMPTS, max_tokens)
|
||||
# TODO: enabling APC and chunked prefill with the ascend scheduler leads to an accuracy problem.
|
||||
# Disable it now. Fix it or drop the ascend scheduler in the future.
|
||||
# with VllmRunner(model,
|
||||
# additional_config={
|
||||
# 'ascend_scheduler_config': {
|
||||
# 'enabled': True,
|
||||
# 'enable_prefix_caching': True,
|
||||
# "enable_chunked_prefill": True,
|
||||
# },
|
||||
# },
|
||||
# enforce_eager=True,
|
||||
# max_model_len=2048,
|
||||
# tensor_parallel_size=2,
|
||||
# gpu_memory_utilization=0.7) as vllm_model:
|
||||
# chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
|
||||
# INPUT_PROMPTS, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=vllm_output,
|
||||
@@ -138,9 +140,9 @@ def test_prefix_cache_with_ascend_scheduler(model: str,
|
||||
name_1="prefix_cache_output",
|
||||
)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=chunk_prefill_prefix_cache_output,
|
||||
outputs_1_lst=prefix_cache_output,
|
||||
name_0="chunk_prefill_prefix_cache_output",
|
||||
name_1="prefix_cache_output",
|
||||
)
|
||||
# check_outputs_equal(
|
||||
# outputs_0_lst=chunk_prefill_prefix_cache_output,
|
||||
# outputs_1_lst=prefix_cache_output,
|
||||
# name_0="chunk_prefill_prefix_cache_output",
|
||||
# name_1="prefix_cache_output",
|
||||
# )
|
||||
|
||||
@@ -66,7 +66,6 @@ def test_models_distributed_Qwen3_MOE_W8A8():
|
||||
max_model_len=8192,
|
||||
tensor_parallel_size=2,
|
||||
quantization="ascend",
|
||||
enforce_eager=True,
|
||||
) as vllm_model:
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
|
||||
@@ -22,6 +22,8 @@ Run `pytest tests/multicard/test_torchair_graph_mode.py`.
|
||||
import os
|
||||
from typing import Dict
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
|
||||
@@ -153,6 +155,7 @@ def _pangu_torchair_test_fixture(
|
||||
print(f"Generated text: {vllm_output[i][1]!r}")
|
||||
|
||||
|
||||
@pytest.mark.skip("skipping test_e2e_pangu_with_torchair")
|
||||
def test_e2e_pangu_with_torchair():
|
||||
additional_config = {
|
||||
"torchair_graph_config": {
|
||||
|
||||
188
tests/e2e/multicard/test_weight_loader.py
Normal file
188
tests/e2e/multicard/test_weight_loader.py
Normal file
@@ -0,0 +1,188 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
"""
|
||||
Launch the `test_weight_loader.py` example script in a subprocess and check its output.
|
||||
|
||||
Run `pytest tests/e2e/multicard/test_weight_loader.py`.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
import torch_npu
|
||||
|
||||
MOE_MODELS = ["Qwen/Qwen3-30B-A3B"]
|
||||
MODELS = ["Qwen/Qwen3-8B"]
|
||||
DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MOE_MODELS)
|
||||
def test_external_launcher_eager(model):
|
||||
script = script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
|
||||
env = os.environ.copy()
|
||||
# TODO: Change to 2 when ci machine has 4 cards
|
||||
cmd = [
|
||||
sys.executable,
|
||||
str(script),
|
||||
"--model",
|
||||
model,
|
||||
"--tp-size",
|
||||
"2",
|
||||
"--proc-per-node",
|
||||
"2",
|
||||
"--trust-remote-code",
|
||||
"--enforce-eager",
|
||||
"--enable-expert-parallel",
|
||||
"--enable-sleep-mode",
|
||||
"--model-weight-gib",
|
||||
"20",
|
||||
]
|
||||
|
||||
print(f"Running subprocess: {' '.join(cmd)}")
|
||||
proc = subprocess.run(
|
||||
cmd,
|
||||
env=env,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
timeout=600,
|
||||
)
|
||||
output = proc.stdout.decode()
|
||||
|
||||
print(output)
|
||||
|
||||
assert "TP RANKS: [0]" in output
|
||||
assert "TP RANKS: [1]" in output
|
||||
assert "Generated text:" in output
|
||||
assert proc.returncode == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MOE_MODELS)
|
||||
def test_external_launcher_aclgraph(model):
|
||||
script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
|
||||
env = os.environ.copy()
|
||||
# TODO: Change to 2 when ci machine has 4 cards
|
||||
cmd = [
|
||||
sys.executable,
|
||||
str(script),
|
||||
"--model",
|
||||
model,
|
||||
"--tp-size",
|
||||
"2",
|
||||
"--proc-per-node",
|
||||
"2",
|
||||
"--trust-remote-code",
|
||||
"--enable-expert-parallel",
|
||||
"--enable-sleep-mode",
|
||||
"--model-weight-gib",
|
||||
"20",
|
||||
]
|
||||
|
||||
print(f"Running subprocess: {' '.join(cmd)}")
|
||||
proc = subprocess.run(
|
||||
cmd,
|
||||
env=env,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
timeout=600,
|
||||
)
|
||||
output = proc.stdout.decode()
|
||||
|
||||
print(output)
|
||||
|
||||
assert "TP RANKS: [0]" in output
|
||||
assert "TP RANKS: [1]" in output
|
||||
assert "Generated text:" in output
|
||||
assert proc.returncode == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
def test_external_launcher_dense(model):
|
||||
script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
|
||||
env = os.environ.copy()
|
||||
# TODO: Change to 2 when ci machine has 4 cards
|
||||
cmd = [
|
||||
sys.executable,
|
||||
str(script),
|
||||
"--model",
|
||||
model,
|
||||
"--tp-size",
|
||||
"2",
|
||||
"--proc-per-node",
|
||||
"2",
|
||||
"--trust-remote-code",
|
||||
"--enable-sleep-mode",
|
||||
"--model-weight-gib",
|
||||
"20",
|
||||
]
|
||||
|
||||
print(f"Running subprocess: {' '.join(cmd)}")
|
||||
proc = subprocess.run(
|
||||
cmd,
|
||||
env=env,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
timeout=600,
|
||||
)
|
||||
output = proc.stdout.decode()
|
||||
|
||||
print(output)
|
||||
|
||||
assert "TP RANKS: [0]" in output
|
||||
assert "TP RANKS: [1]" in output
|
||||
assert "Generated text:" in output
|
||||
assert proc.returncode == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
def test_external_launcher_dense_eager(model):
|
||||
script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
|
||||
env = os.environ.copy()
|
||||
# TODO: Change to 2 when ci machine has 4 cards
|
||||
cmd = [
|
||||
sys.executable,
|
||||
str(script),
|
||||
"--model",
|
||||
model,
|
||||
"--tp-size",
|
||||
"2",
|
||||
"--proc-per-node",
|
||||
"2",
|
||||
"--trust-remote-code",
|
||||
"--enforce-eager",
|
||||
"--enable-sleep-mode",
|
||||
"--model-weight-gib",
|
||||
"20",
|
||||
]
|
||||
|
||||
print(f"Running subprocess: {' '.join(cmd)}")
|
||||
proc = subprocess.run(
|
||||
cmd,
|
||||
env=env,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
timeout=600,
|
||||
)
|
||||
output = proc.stdout.decode()
|
||||
|
||||
print(output)
|
||||
|
||||
assert "TP RANKS: [0]" in output
|
||||
assert "TP RANKS: [1]" in output
|
||||
assert "Generated text:" in output
|
||||
assert proc.returncode == 0
|
||||
@@ -70,7 +70,7 @@ run_tests_for_model() {
|
||||
# Start prefill instance
|
||||
PREFILL_PORT=8001
|
||||
|
||||
BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=0 VLLM_LLMDD_RPC_PORT=5559 vllm serve $model_name \
|
||||
BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=0 VLLM_ASCEND_LLMDD_RPC_PORT=5559 vllm serve $model_name \
|
||||
--port $PREFILL_PORT \
|
||||
--seed 1024 \
|
||||
--enforce-eager \
|
||||
@@ -90,7 +90,7 @@ run_tests_for_model() {
|
||||
DECODE_PORT=8002
|
||||
|
||||
# Build the command with or without model-specific args
|
||||
BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=1 VLLM_LLMDD_RPC_PORT=6000 vllm serve $model_name \
|
||||
BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=1 VLLM_ASCEND_LLMDD_RPC_PORT=6000 vllm serve $model_name \
|
||||
--port $DECODE_PORT \
|
||||
--seed 1024 \
|
||||
--enforce-eager \
|
||||
|
||||
@@ -22,7 +22,6 @@ set -eo errexit
|
||||
. $(dirname "$0")/common.sh
|
||||
|
||||
export VLLM_USE_MODELSCOPE=true
|
||||
export VLLM_LOGGING_LEVEL=ERROR
|
||||
|
||||
_info "====> Start Quickstart test"
|
||||
. "${SCRIPT_DIR}/doctests/001-quickstart-test.sh"
|
||||
|
||||
@@ -33,8 +33,8 @@ def test_bgmv_expand():
|
||||
y_npu = y.npu()
|
||||
|
||||
y_out = bgmv_expand_cpu_impl(x, w, indices, y, 0, 128)
|
||||
y_out_npu = torch.ops._C.bgmv_expand(x_npu, w_npu, indices_npu, y_npu, 0,
|
||||
128)
|
||||
y_out_npu = torch.ops._C_ascend.bgmv_expand(x_npu, w_npu, indices_npu,
|
||||
y_npu, 0, 128)
|
||||
|
||||
# Compare the results.
|
||||
torch.testing.assert_close(y_out_npu.cpu(),
|
||||
|
||||
@@ -33,7 +33,7 @@ def test_bgmv_shrink():
|
||||
y_npu = y.npu()
|
||||
|
||||
y = bgmv_shrink_cpu_impl(x, w, indices, y, 0.5)
|
||||
torch.ops._C.bgmv_shrink(x_npu, w_npu, indices_npu, y_npu, 0.5)
|
||||
torch.ops._C_ascend.bgmv_shrink(x_npu, w_npu, indices_npu, y_npu, 0.5)
|
||||
|
||||
# Compare the results.
|
||||
torch.testing.assert_close(y_npu.cpu(),
|
||||
|
||||
@@ -28,12 +28,12 @@ import torch
|
||||
import torch_npu
|
||||
from vllm.model_executor.layers.activation import SiluAndMul
|
||||
|
||||
from vllm_ascend.ops.layers.experts_selector import select_experts
|
||||
from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
|
||||
TokenDispatcherWithAllGather
|
||||
from vllm_ascend.ops.moe.experts_selector import select_experts
|
||||
from vllm_ascend.ops.moe.moe_mlp import unified_apply_mlp
|
||||
from vllm_ascend.ops.moe.token_dispatcher import TokenDispatcherWithAllGather
|
||||
|
||||
NUM_EXPERTS = [8, 64]
|
||||
EP_SIZE = [1, 4]
|
||||
EP_SIZE = [1]
|
||||
TOP_KS = [2, 6]
|
||||
DEVICE = ["npu"]
|
||||
|
||||
@@ -115,19 +115,6 @@ def test_token_dispatcher_with_all_gather(
|
||||
w1_local = w1
|
||||
w2_local = w2
|
||||
|
||||
if ep_size > 1:
|
||||
local_e = e // ep_size
|
||||
e_ids = torch.arange(local_e * 0,
|
||||
local_e * (0 + 1),
|
||||
device=device,
|
||||
dtype=torch.int32)
|
||||
expert_map = torch.full((e, ), -1, device=device, dtype=torch.int32)
|
||||
expert_map[e_ids] = torch.arange(local_e,
|
||||
device=device,
|
||||
dtype=torch.int32)
|
||||
w1_local = w1[e_ids]
|
||||
w2_local = w2[e_ids]
|
||||
|
||||
score = torch.softmax(score, dim=-1, dtype=dtype)
|
||||
topk_weights, topk_ids = torch.topk(score, topk)
|
||||
topk_ids = topk_ids.to(torch.int32)
|
||||
@@ -179,6 +166,87 @@ def test_token_dispatcher_with_all_gather(
|
||||
torch.npu.reset_peak_memory_stats()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("m", [1, 33, 64])
|
||||
@pytest.mark.parametrize("n", [128, 1024, 2048])
|
||||
@pytest.mark.parametrize("k", [128, 511, 1024])
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("ep_size", EP_SIZE)
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("device", DEVICE)
|
||||
def test_token_dispatcher_with_all_gather_quant(
|
||||
m: int,
|
||||
n: int,
|
||||
k: int,
|
||||
e: int,
|
||||
topk: int,
|
||||
ep_size: int,
|
||||
dtype: torch.dtype,
|
||||
device: str,
|
||||
):
|
||||
context_mock = MagicMock()
|
||||
context_mock.fused_moe_state = 0
|
||||
with patch("vllm_ascend.ops.moe.moe_mlp.get_forward_context",
|
||||
return_value=context_mock):
|
||||
a = torch.randn((m, k), device=device, dtype=dtype) / 10
|
||||
w1 = torch.randn((e, k, 2 * n), device=device, dtype=torch.int8)
|
||||
w1_scale = torch.empty((e, 2 * n), device=device, dtype=dtype)
|
||||
w2 = torch.randn((e, n, k), device=device, dtype=torch.int8)
|
||||
w2_scale = torch.empty((e, k), device=device, dtype=dtype)
|
||||
|
||||
score = torch.randn((m, e), device=device, dtype=dtype)
|
||||
expert_map = None
|
||||
local_e = e
|
||||
|
||||
score = torch.softmax(score, dim=-1, dtype=dtype)
|
||||
topk_weights, topk_ids = torch.topk(score, topk)
|
||||
topk_ids = topk_ids.to(torch.int32)
|
||||
row_idx = (torch.arange(
|
||||
0,
|
||||
m * topk,
|
||||
device=device,
|
||||
dtype=torch.int32,
|
||||
).view(topk, -1).permute(1, 0).contiguous())
|
||||
|
||||
dispatcher_kwargs = {
|
||||
"num_experts": e,
|
||||
"top_k": topk,
|
||||
"num_local_experts": local_e,
|
||||
}
|
||||
dispatcher = TokenDispatcherWithAllGather(**dispatcher_kwargs)
|
||||
|
||||
apply_router_weight_on_input = False
|
||||
dispatch_output = dispatcher.token_dispatch(
|
||||
hidden_states=a,
|
||||
topk_weights=topk_weights,
|
||||
topk_ids=topk_ids,
|
||||
row_idx=row_idx,
|
||||
expert_map=expert_map,
|
||||
apply_router_weight_on_input=apply_router_weight_on_input,
|
||||
with_quant=True)
|
||||
|
||||
sorted_hidden_states = dispatch_output["hidden_states"]
|
||||
group_list = dispatch_output["group_list"]
|
||||
group_list_type = dispatch_output.get("group_list_type", 1)
|
||||
dynamic_scale = dispatch_output["dynamic_scale"]
|
||||
|
||||
expert_output = unified_apply_mlp(hidden_states=sorted_hidden_states,
|
||||
w1=w1,
|
||||
w1_scale=w1_scale,
|
||||
w2=w2,
|
||||
w2_scale=w2_scale,
|
||||
group_list=group_list,
|
||||
group_list_type=group_list_type,
|
||||
dynamic_scale=dynamic_scale,
|
||||
with_quant=True)
|
||||
combined_output = dispatcher.token_combine(hidden_states=expert_output,
|
||||
bias=None)
|
||||
assert combined_output.shape == (m, k)
|
||||
gc.collect()
|
||||
torch.npu.empty_cache()
|
||||
torch.npu.reset_peak_memory_stats()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("m", [1, 33, 64])
|
||||
@pytest.mark.parametrize("n", [128, 1024, 2048])
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@@ -222,7 +290,7 @@ def test_select_experts(
|
||||
dtype=torch.int32)
|
||||
custom_routing_function.return_value = (mock_weights, mock_ids)
|
||||
|
||||
with patch("vllm_ascend.ops.layers.experts_selector._native_grouped_topk"
|
||||
with patch("vllm_ascend.ops.moe.experts_selector._native_grouped_topk"
|
||||
) as mock_native_grouped_topk:
|
||||
mock_native_grouped_topk.side_effect = lambda x, num_groups, k: torch.randn_like(
|
||||
x)
|
||||
|
||||
@@ -1,175 +0,0 @@
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
|
||||
import gc
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.config import ( # isort: skip
|
||||
FusedMoEConfig, FusedMoEParallelConfig)
|
||||
|
||||
from vllm_ascend.distributed.moe_comm_method import ( # isort: skip
|
||||
AllGatherCommImpl, NativeAllGatherCommImpl)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", [16, 128])
|
||||
@pytest.mark.parametrize("hidden_size", [64, 128])
|
||||
@pytest.mark.parametrize("global_num_experts", [8, 16])
|
||||
@pytest.mark.parametrize("num_local_experts", [4, 8])
|
||||
@pytest.mark.parametrize("top_k_num", [2, 4])
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
|
||||
@pytest.mark.parametrize("ep_rank", [0, 1])
|
||||
@pytest.mark.parametrize("apply_a8_quantization", [False])
|
||||
def test_all_gather_comm_impl(
|
||||
num_tokens,
|
||||
hidden_size,
|
||||
global_num_experts,
|
||||
num_local_experts,
|
||||
top_k_num,
|
||||
dtype,
|
||||
ep_rank,
|
||||
apply_a8_quantization,
|
||||
mocker,
|
||||
):
|
||||
"""
|
||||
Tests the AllGatherCommImpl against the NativeAllGatherCommImpl.
|
||||
|
||||
This test compares the outputs of the NPU-optimized AllGatherCommImpl
|
||||
with a native PyTorch implementation (NativeAllGatherCommImpl) to ensure
|
||||
correctness across various configurations.
|
||||
"""
|
||||
if top_k_num > global_num_experts:
|
||||
pytest.skip("top_k_num cannot be greater than global_num_experts")
|
||||
if num_local_experts > global_num_experts:
|
||||
pytest.skip(
|
||||
"num_local_experts cannot be greater than global_num_experts")
|
||||
|
||||
device = torch.device("npu")
|
||||
|
||||
# mock get_tensor_model_parallel_rank to return ep_rank
|
||||
mocker.patch(
|
||||
"vllm.model_executor.layers.fused_moe.config.get_tensor_model_parallel_rank",
|
||||
return_value=ep_rank,
|
||||
)
|
||||
|
||||
# make moe config
|
||||
parallel_config = SimpleNamespace(
|
||||
enable_expert_parallel=num_local_experts < global_num_experts)
|
||||
moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
|
||||
tp_size_=max(2, global_num_experts // num_local_experts),
|
||||
dp_size_=1,
|
||||
vllm_parallel_config=parallel_config,
|
||||
)
|
||||
|
||||
moe_config = FusedMoEConfig(
|
||||
num_experts=global_num_experts,
|
||||
experts_per_token=top_k_num,
|
||||
hidden_dim=hidden_size,
|
||||
num_local_experts=num_local_experts,
|
||||
moe_parallel_config=moe_parallel_config,
|
||||
in_dtype=dtype,
|
||||
quant_config=None, # No quantization in this test
|
||||
max_num_tokens=num_tokens,
|
||||
)
|
||||
|
||||
# Instantiate implementations
|
||||
native_impl = NativeAllGatherCommImpl(moe_config)
|
||||
|
||||
all_gather_impl = AllGatherCommImpl(moe_config)
|
||||
|
||||
# --- Input Data ---
|
||||
hidden_states = torch.randn(num_tokens,
|
||||
hidden_size,
|
||||
device=device,
|
||||
dtype=dtype)
|
||||
topk_ids = torch.randint(0,
|
||||
global_num_experts, (num_tokens, top_k_num),
|
||||
device=device,
|
||||
dtype=torch.int32)
|
||||
topk_weights = torch.rand(num_tokens, top_k_num, device=device).to(dtype)
|
||||
topk_weights = torch.nn.functional.softmax(topk_weights, dim=1)
|
||||
|
||||
num_experts = global_num_experts
|
||||
|
||||
expert_map = None
|
||||
if num_local_experts < global_num_experts:
|
||||
# Create a map where some experts are local and some are not
|
||||
expert_map = torch.full((global_num_experts, ), -1, device=device)
|
||||
expert_map[ep_rank * num_local_experts:(ep_rank + 1) *
|
||||
num_local_experts] = torch.arange(num_local_experts,
|
||||
device=device)
|
||||
num_experts = num_local_experts
|
||||
|
||||
# --- Run Native Implementation (Golden Reference) ---
|
||||
native_hidden_states_out = hidden_states.clone()
|
||||
(
|
||||
native_permuted_hidden,
|
||||
native_expert_tokens,
|
||||
_,
|
||||
_,
|
||||
) = native_impl.permute(hidden_states, topk_ids, topk_weights, expert_map,
|
||||
num_experts, apply_a8_quantization)
|
||||
# Simulate MLP output
|
||||
native_mlp_output = torch.randn_like(native_permuted_hidden)
|
||||
native_impl.unpermute(native_mlp_output, native_hidden_states_out)
|
||||
|
||||
# --- Run AllGather Implementation ---
|
||||
all_gather_hidden_states_out = hidden_states.clone()
|
||||
(
|
||||
all_gather_permuted_hidden,
|
||||
all_gather_expert_tokens,
|
||||
_,
|
||||
_,
|
||||
) = all_gather_impl.permute(hidden_states, topk_ids, topk_weights,
|
||||
expert_map, num_experts, apply_a8_quantization)
|
||||
|
||||
# Use the same simulated MLP output for a fair comparison
|
||||
all_gather_mlp_output = native_mlp_output.clone()
|
||||
|
||||
all_gather_impl.unpermute(all_gather_mlp_output,
|
||||
all_gather_hidden_states_out)
|
||||
|
||||
# --- Assertions ---
|
||||
# Define tolerance based on dtype
|
||||
atol = 1e-3 if dtype == torch.float16 else 1e-2
|
||||
rtol = 1e-3 if dtype == torch.float16 else 1e-2
|
||||
|
||||
# 1. Compare expert_tokens from pre_process
|
||||
assert torch.allclose(native_expert_tokens.to(
|
||||
all_gather_expert_tokens.device),
|
||||
all_gather_expert_tokens,
|
||||
atol=atol,
|
||||
rtol=rtol), "Expert tokens do not match."
|
||||
|
||||
# 2. Compare permuted_hidden_states from pre_process
|
||||
num_valid_tokens = native_expert_tokens.sum()
|
||||
assert torch.allclose(native_permuted_hidden[:num_valid_tokens].to(
|
||||
all_gather_permuted_hidden.device),
|
||||
all_gather_permuted_hidden[:num_valid_tokens],
|
||||
atol=atol,
|
||||
rtol=rtol), "Permuted hidden states do not match."
|
||||
|
||||
# 3. Compare final hidden_states from post_process
|
||||
assert torch.allclose(native_hidden_states_out.to(
|
||||
all_gather_hidden_states_out.device),
|
||||
all_gather_hidden_states_out,
|
||||
atol=atol,
|
||||
rtol=rtol), "Final hidden states do not match."
|
||||
gc.collect()
|
||||
torch.npu.empty_cache()
|
||||
torch.npu.reset_peak_memory_stats()
|
||||
@@ -182,7 +182,7 @@ def test_rotary_embedding_quant_with_leading_dim(
|
||||
)
|
||||
|
||||
ref_query, ref_key = rope.forward_native(positions, query, key)
|
||||
query, key = torch.ops._C.rotary_embedding(
|
||||
query, key = torch.ops._C_ascend.rotary_embedding(
|
||||
positions,
|
||||
query,
|
||||
key,
|
||||
@@ -239,7 +239,7 @@ class ModelwithRotaryEmbedding(nn.Module):
|
||||
# we simulated a simple attention layer to test if it can be seamlessly captured into aclgraph
|
||||
qkv = self.qkv_proj(hidden_states)
|
||||
q, k, v = qkv.chunk(3, dim=-1)
|
||||
query, key = torch.ops._C.rotary_embedding(
|
||||
query, key = torch.ops._C_ascend.rotary_embedding(
|
||||
positions,
|
||||
q,
|
||||
k,
|
||||
@@ -299,7 +299,7 @@ def test_capture_rotary_embedding_in_aclgraph(
|
||||
# Validate if the rotary_embedding custom kernel is indeed inside the graph by
|
||||
# string match
|
||||
graph = str(gm.graph)
|
||||
assert "_C.rotary_embedding" in graph
|
||||
assert "_C_ascend.rotary_embedding" in graph
|
||||
return gm
|
||||
|
||||
static_positions = torch.randint(0, max_position_embeddings,
|
||||
|
||||
@@ -72,7 +72,7 @@ def test_get_masked_input_and_mask(
|
||||
|
||||
# Get custom op result
|
||||
print("input_tensor:", input_tensor)
|
||||
custom_masked_input, custom_mask = torch.ops._C.get_masked_input_and_mask(
|
||||
custom_masked_input, custom_mask = torch.ops._C_ascend.get_masked_input_and_mask(
|
||||
input_tensor, test_case["org_start"], test_case["org_end"],
|
||||
test_case["padding"], test_case["added_start"], test_case["added_end"])
|
||||
|
||||
|
||||
@@ -1,14 +1,10 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sampling_config():
|
||||
@@ -20,9 +16,10 @@ def model_name():
|
||||
return "wemaster/deepseek_mtp_main_random_bf16"
|
||||
|
||||
|
||||
def test_mtp_correctness(
|
||||
def mtp_correctness(
|
||||
sampling_config: SamplingParams,
|
||||
model_name: str,
|
||||
num_speculative_tokens: int,
|
||||
):
|
||||
example_prompts = [
|
||||
"Hello, my name is",
|
||||
@@ -38,7 +35,7 @@ def test_mtp_correctness(
|
||||
tensor_parallel_size=1,
|
||||
gpu_memory_utilization=0.7,
|
||||
max_model_len=256,
|
||||
enforce_eager=True) as ref_llm:
|
||||
enforce_eager=False) as ref_llm:
|
||||
ref_outputs = ref_llm.generate(example_prompts, sampling_config)
|
||||
|
||||
with VllmRunner(
|
||||
@@ -50,9 +47,9 @@ def test_mtp_correctness(
|
||||
enable_expert_parallel=True,
|
||||
speculative_config={
|
||||
"method": "deepseek_mtp",
|
||||
"num_speculative_tokens": 1,
|
||||
"num_speculative_tokens": num_speculative_tokens,
|
||||
},
|
||||
enforce_eager=True,
|
||||
enforce_eager=False,
|
||||
max_model_len=2000,
|
||||
additional_config={"ascend_scheduler_config": {
|
||||
"enabled": False
|
||||
@@ -74,3 +71,18 @@ def test_mtp_correctness(
|
||||
# Heuristic: expect at least 66% of the prompts to match exactly
|
||||
# Upon failure, inspect the outputs to check for inaccuracy.
|
||||
assert matches > int(0.66 * len(ref_outputs))
|
||||
del spec_llm
|
||||
|
||||
|
||||
def test_mtp1_correctness(
|
||||
sampling_config: SamplingParams,
|
||||
model_name: str,
|
||||
):
|
||||
mtp_correctness(sampling_config, model_name, 1)
|
||||
|
||||
|
||||
def test_mtp2_correctness(
|
||||
sampling_config: SamplingParams,
|
||||
model_name: str,
|
||||
):
|
||||
mtp_correctness(sampling_config, model_name, 2)
|
||||
|
||||
@@ -1,14 +1,10 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sampling_config():
|
||||
|
||||
@@ -99,7 +99,6 @@ def test_ngram_correctness(
|
||||
assert matches > int(0.7 * len(ref_outputs))
|
||||
|
||||
|
||||
@pytest.mark.skipif(True, reason="oom in CI, fix me")
|
||||
@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
|
||||
def test_eagle_correctness(
|
||||
test_prompts: list[list[dict[str, Any]]],
|
||||
@@ -111,8 +110,6 @@ def test_eagle_correctness(
|
||||
Compare the outputs of a original LLM and a speculative LLM
|
||||
should be the same when using eagle speculative decoding.
|
||||
'''
|
||||
if not use_eagle3:
|
||||
pytest.skip("Not current support for the test.")
|
||||
|
||||
ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True)
|
||||
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
|
||||
@@ -121,7 +118,6 @@ def test_eagle_correctness(
|
||||
spec_model_name = eagle3_model_name() if use_eagle3 else eagle_model_name()
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True,
|
||||
max_num_seqs=1,
|
||||
max_num_batched_tokens=2048,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from tests.e2e.model_utils import check_outputs_equal
|
||||
@@ -86,3 +87,25 @@ def test_chunked_prefill_with_ascend_scheduler(
|
||||
name_0="vllm_output",
|
||||
name_1="chunked_prefill_output",
|
||||
)
|
||||
|
||||
|
||||
def test_async_scheduling() -> None:
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
] * 10
|
||||
sampling_params = SamplingParams(temperature=0.2,
|
||||
max_tokens=10,
|
||||
stop_token_ids=None)
|
||||
|
||||
with VllmRunner(
|
||||
"Qwen/Qwen2.5-0.5B-Instruct",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=50,
|
||||
dtype="bfloat16",
|
||||
gpu_memory_utilization=0.9,
|
||||
async_scheduling=True,
|
||||
) as vllm_model:
|
||||
vllm_model.generate(prompts, sampling_params=sampling_params)
|
||||
|
||||
@@ -17,17 +17,23 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
import json
|
||||
import os
|
||||
from typing import Any, Dict
|
||||
|
||||
import jsonschema
|
||||
import pytest
|
||||
import regex as re
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.10.2"):
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
else:
|
||||
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
|
||||
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
|
||||
GuidedDecodingBackend = ["xgrammar", "guidance", "outlines"]
|
||||
@@ -84,16 +90,29 @@ def sample_json_schema():
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
|
||||
def test_guided_json_completion(guided_decoding_backend: str,
|
||||
sample_json_schema):
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=500,
|
||||
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
|
||||
|
||||
with VllmRunner(
|
||||
MODEL_NAME,
|
||||
seed=0,
|
||||
guided_decoding_backend=guided_decoding_backend,
|
||||
) as vllm_model:
|
||||
runner_kwargs: Dict[str, Any] = {}
|
||||
if vllm_version_is("0.10.2"):
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=500,
|
||||
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
|
||||
runner_kwargs = {
|
||||
"seed": 0,
|
||||
"guided_decoding_backend": guided_decoding_backend,
|
||||
}
|
||||
else:
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=500,
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
json=sample_json_schema))
|
||||
runner_kwargs = {
|
||||
"seed": 0,
|
||||
"structured_outputs_config": {
|
||||
"backend": guided_decoding_backend
|
||||
},
|
||||
}
|
||||
with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model:
|
||||
prompts = [
|
||||
f"Give an example JSON for an employee profile "
|
||||
f"that fits this schema: {sample_json_schema}"
|
||||
@@ -121,17 +140,29 @@ def test_guided_json_completion(guided_decoding_backend: str,
|
||||
def test_guided_regex(guided_decoding_backend: str, sample_regex):
|
||||
if guided_decoding_backend == "outlines":
|
||||
pytest.skip("Outlines doesn't support regex-based guided decoding.")
|
||||
runner_kwargs: Dict[str, Any] = {}
|
||||
if vllm_version_is("0.10.2"):
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(regex=sample_regex))
|
||||
runner_kwargs = {
|
||||
"seed": 0,
|
||||
"guided_decoding_backend": guided_decoding_backend,
|
||||
}
|
||||
else:
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
structured_outputs=StructuredOutputsParams(regex=sample_regex))
|
||||
runner_kwargs = {
|
||||
"seed": 0,
|
||||
"structured_outputs_config": {
|
||||
"backend": guided_decoding_backend
|
||||
},
|
||||
}
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(regex=sample_regex))
|
||||
|
||||
with VllmRunner(
|
||||
MODEL_NAME,
|
||||
seed=0,
|
||||
guided_decoding_backend=guided_decoding_backend,
|
||||
) as vllm_model:
|
||||
with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model:
|
||||
prompts = [
|
||||
f"Give an example IPv4 address with this regex: {sample_regex}"
|
||||
] * 2
|
||||
|
||||
103
tests/e2e/singlecard/test_multistream_overlap_shared_expert.py
Normal file
103
tests/e2e/singlecard/test_multistream_overlap_shared_expert.py
Normal file
@@ -0,0 +1,103 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
"""
|
||||
Compare the outputs of vLLM with multistream_overlap_shared_expert
|
||||
enabled and disabled.
|
||||
|
||||
Run `pytest tests/e2e/singlecard/test_multistream_overlap_shared_expert.py`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from tests.e2e.model_utils import check_outputs_equal
|
||||
|
||||
MODELS = [
|
||||
"Qwen/Qwen3-0.6B",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
def test_models_with_multistream_overlap_shared_expert(
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
prompts = [
|
||||
"Hello, my name is", "The president of the United States is",
|
||||
"The capital of France is", "The future of AI is"
|
||||
]
|
||||
|
||||
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=True,
|
||||
additional_config={
|
||||
"multistream_overlap_shared_expert": True,
|
||||
},
|
||||
) as runner:
|
||||
vllm_moe_ms_eager_outputs = runner.model.generate(
|
||||
prompts, sampling_params)
|
||||
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
additional_config={
|
||||
"multistream_overlap_shared_expert": True,
|
||||
},
|
||||
) as runner:
|
||||
vllm_moe_ms_aclgraph_outputs = runner.model.generate(
|
||||
prompts, sampling_params)
|
||||
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=True,
|
||||
) as runner:
|
||||
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
|
||||
|
||||
vllm_moe_ms_eager_outputs_list = []
|
||||
for output in vllm_moe_ms_eager_outputs:
|
||||
vllm_moe_ms_eager_outputs_list.append(
|
||||
(output.outputs[0].index, output.outputs[0].text))
|
||||
|
||||
vllm_moe_ms_aclgraph_outputs_list = []
|
||||
for output in vllm_moe_ms_aclgraph_outputs:
|
||||
vllm_moe_ms_aclgraph_outputs_list.append(
|
||||
(output.outputs[0].index, output.outputs[0].text))
|
||||
|
||||
vllm_eager_outputs_list = []
|
||||
for output in vllm_eager_outputs:
|
||||
vllm_eager_outputs_list.append(
|
||||
(output.outputs[0].index, output.outputs[0].text))
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=vllm_eager_outputs_list,
|
||||
outputs_1_lst=vllm_moe_ms_eager_outputs_list,
|
||||
name_0="vllm_eager_outputs",
|
||||
name_1="vllm_moe_ms_eager_outputs",
|
||||
)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=vllm_eager_outputs_list,
|
||||
outputs_1_lst=vllm_moe_ms_aclgraph_outputs_list,
|
||||
name_0="vllm_eager_outputs",
|
||||
name_1="vllm_moe_ms_aclgraph_outputs",
|
||||
)
|
||||
@@ -20,19 +20,14 @@
|
||||
|
||||
Run `pytest tests/test_offline_inference.py`.
|
||||
"""
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.assets.image import ImageAsset
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="fix me")
|
||||
def test_multimodal_vl(prompt_template):
|
||||
image = ImageAsset("cherry_blossom") \
|
||||
.pil_image.convert("RGB")
|
||||
@@ -52,9 +47,12 @@ def test_multimodal_vl(prompt_template):
|
||||
"fps": 1,
|
||||
},
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_model.generate_greedy(prompts=prompts,
|
||||
images=images,
|
||||
max_tokens=64)
|
||||
outputs = vllm_model.generate_greedy(prompts=prompts,
|
||||
images=images,
|
||||
max_tokens=64)
|
||||
assert len(outputs) == len(prompts)
|
||||
for _, output_str in outputs:
|
||||
assert output_str, "Generated output should not be empty."
|
||||
|
||||
|
||||
def test_multimodal_audio():
|
||||
@@ -86,4 +84,7 @@ def test_multimodal_audio():
|
||||
dtype="bfloat16",
|
||||
limit_mm_per_prompt={"audio": 2},
|
||||
gpu_memory_utilization=0.9) as runner:
|
||||
runner.generate(inputs, sampling_params=sampling_params)
|
||||
outputs = runner.generate(inputs, sampling_params=sampling_params)
|
||||
|
||||
assert outputs is not None, "Generated outputs should not be None."
|
||||
assert len(outputs) > 0, "Generated outputs should not be empty."
|
||||
|
||||
36
tests/e2e/vllm_interface/singlecard/test_sampler.py
Normal file
36
tests/e2e/vllm_interface/singlecard/test_sampler.py
Normal file
@@ -0,0 +1,36 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
# Adapted from vllm/tests/entrypoints/llm/test_guided_generate.py
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
|
||||
def test_models_topk() -> None:
|
||||
example_prompts = [
|
||||
"The capital of France is",
|
||||
]
|
||||
sampling_params = SamplingParams(max_tokens=10,
|
||||
temperature=0.0,
|
||||
top_k=10,
|
||||
top_p=0.9)
|
||||
|
||||
with VllmRunner("Qwen/Qwen3-0.6B",
|
||||
max_model_len=4096,
|
||||
gpu_memory_utilization=0.7) as runner:
|
||||
runner.generate(example_prompts, sampling_params)
|
||||
2
tests/e2e/vllm_interface/vllm_test.cfg
Normal file
2
tests/e2e/vllm_interface/vllm_test.cfg
Normal file
@@ -0,0 +1,2 @@
|
||||
# Base docker image used to build the vllm-ascend e2e test image, which is built in the vLLM repository
|
||||
BASE_IMAGE_NAME="quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11"
|
||||
Reference in New Issue
Block a user