#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
"""Compare the short outputs of HF and vLLM when using greedy sampling.

Run `pytest tests/test_offline_inference.py`.
"""

import os
from unittest.mock import patch

import pytest
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner, wait_until_npu_memory_free
from tests.e2e.model_utils import check_outputs_equal
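# Environment defaults applied to every test in this module: NPU allocator
# tuning and the "spawn" start method for vLLM worker processes.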
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

QWEN_DENSE_MODELS = [
    "vllm-ascend/Qwen3-0.6B-W8A8",
]

QWEN_W4A8_MODELS = [
    "vllm-ascend/Qwen3-1.7B-W4A8-V1",
]

QWEN_W4A4_MODELS = [
    "Eco-Tech/Qwen3-32B-w4a4-LAOS",
]

DEEPSEEK_W4A8_MODELS = [
    "vllm-ascend/DeepSeek-V3.1-W4A8-puring",
]

GPT_OSS_MODELS = [
    "unsloth/gpt-oss-20b-BF16",
]
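

# Greedy smoke test for a pruned DeepSeek-V3 checkpoint with TP=2, the "mp"
# executor backend, and multistream MoE enabled via additional_config.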
def test_deepseek_multistream_moe_tp2():
    example_prompts = [
        "Hello, my name is",
    ]
    dtype = "half"
    max_tokens = 5
    with VllmRunner(
            "vllm-ascend/DeepSeek-V3-Pruning",
            dtype=dtype,
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
            distributed_executor_backend="mp",
            additional_config={
                "enable_multistream_moe": True,
                "refresh": True,
            },
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
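

# W4A8 Qwen3 checkpoint under TP=2 with graph capture sizes configured; only
# checks that greedy generation runs end to end.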
@pytest.mark.parametrize("model", QWEN_W4A8_MODELS)
def test_qwen3_w4a8_dynamic_tp2(model):
    prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    with VllmRunner(
            model,
            max_model_len=8192,
            dtype="auto",
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
            quantization="ascend",
    ) as vllm_model:
        vllm_model.generate_greedy(prompts, max_tokens)
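

# Qwen3-30B-A3B MoE with sequence parallelism enabled through the compilation
# pass_config, plus expert parallelism, eager mode, and TP=2.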
@wait_until_npu_memory_free(target_free_percentage=0.95)
def test_qwen3_moe_sp_tp2() -> None:
    example_prompts = [
        "Hello, my name is",
    ]
    sampling_params = SamplingParams(max_tokens=5,
                                     temperature=0.0,
                                     top_k=50,
                                     top_p=0.9)

    with VllmRunner(
            "Qwen/Qwen3-30B-A3B",
            dtype="auto",
            tensor_parallel_size=2,
            distributed_executor_backend="mp",
            compilation_config={"pass_config": {"enable_sp": True}},
            enable_expert_parallel=True,
            enforce_eager=True,
    ) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)
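

# Accuracy check for a W4A8 DeepSeek checkpoint: greedy outputs are compared
# token-for-token against pre-recorded reference strings.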
@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "2048"})
@wait_until_npu_memory_free(target_free_percentage=0.95)
def test_deepseek_w4a8_accuracy_tp2(model):
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs",
    ]
    vllm_ds_w4a8_answers = [
        "逍遙而至地去 accrued", "平行于我udo madreHelen", "ysteepaolis backwards Kj"
    ]
    sampling_params = SamplingParams(max_tokens=5, temperature=0.0)
    with VllmRunner(
            model,
            dtype="auto",
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
            quantization="ascend",
            enable_expert_parallel=True,
    ) as vllm_model:
        vllm_quant_outputs = vllm_model.model.generate(prompts, sampling_params)

    vllm_quant_outputs_list = []
    for output in vllm_quant_outputs:
        vllm_quant_outputs_list.append(
            ([output.outputs[0].index], output.outputs[0].text))

    vllm_answer_list = [([0], answer) for answer in vllm_ds_w4a8_answers]

    check_outputs_equal(
        outputs_0_lst=vllm_answer_list,
        outputs_1_lst=vllm_quant_outputs_list,
        name_0="vllm_answer_outputs",
        name_1="vllm_quant_outputs",
    )
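

# Qwen3 MoE with the FlashComm-related environment switches set via
# patch.dict; eager mode, expert parallelism, TP=2.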
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
@patch.dict(os.environ, {"VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": "1"})
def test_qwen3_moe_fc2_tp2() -> None:
    example_prompts = [
        "Hello, my name is",
    ]
    sampling_params = SamplingParams(max_tokens=5,
                                     temperature=0.0,
                                     top_k=50,
                                     top_p=0.9)

    with VllmRunner(
            "Qwen/Qwen3-30B-A3B",
            dtype="auto",
            tensor_parallel_size=2,
            distributed_executor_backend="mp",
            enable_expert_parallel=True,
            enforce_eager=True,
    ) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)
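

# Same FlashComm setup as above, additionally sharding o_proj layers through
# additional_config["layer_sharding"].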
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
@patch.dict(os.environ, {"VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": "1"})
def test_qwen3_moe_fc2_oshard_tp2() -> None:
    example_prompts = [
        "Hello, my name is",
    ]
    sampling_params = SamplingParams(max_tokens=5,
                                     temperature=0.0,
                                     top_k=50,
                                     top_p=0.9)

    with VllmRunner(
            "Qwen/Qwen3-30B-A3B",
            dtype="auto",
            tensor_parallel_size=2,
            distributed_executor_backend="mp",
            enable_expert_parallel=True,
            enforce_eager=True,  # TODO(Levi-JQ): support graph mode for fc2 in Qwen
            additional_config={"layer_sharding": ["o_proj"]},
    ) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)
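

# DeepSeek-V2-Lite W8A8 with FlashComm v1 enabled and a long repeated prompt.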
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
def test_deepseek_v2_lite_fc1_tp2() -> None:
    example_prompts = [
        "test" * 1001,
    ]
    sampling_params = SamplingParams(max_tokens=5,
                                     temperature=0.0,
                                     top_k=50,
                                     top_p=0.9)
    with VllmRunner(
            "vllm-ascend/DeepSeek-V2-Lite-W8A8",
            dtype="auto",
            tensor_parallel_size=2,
            distributed_executor_backend="mp",
            enable_expert_parallel=True,
            enforce_eager=True,
            quantization="ascend",
    ) as vllm_model:
        vllm_model.generate(example_prompts, sampling_params)
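

# Dense Qwen3 W8A8 models with FlashComm v1; greedy smoke test under TP=2.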
@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
def test_qwen3_dense_fc1_tp2(model):
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5

    with VllmRunner(
            model,
            max_model_len=8192,
            dtype="auto",
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
            quantization="ascend",
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
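

# Same dense models, with MLP weight prefetching turned on via
# additional_config["weight_prefetch_config"].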
@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5

    with VllmRunner(
            model,
            max_model_len=8192,
            dtype="auto",
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
            quantization="ascend",
            additional_config={"weight_prefetch_config": {"enabled": True}},
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
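

# DeepSeek-V3.2 W8A8 pruning checkpoint with MTP speculative decoding, expert
# parallelism, a full-decode-only graph mode, and a near-max-length prompt.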
@patch.dict(os.environ, {"HCCL_OP_EXPANSION_MODE": "AIV"})
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
@patch.dict(os.environ, {"ASCEND_AGGREGATE_ENABLE": "1"})
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
@wait_until_npu_memory_free()
def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
    short_example_prompts = [
        "Hello ",
    ]
    # The model config sets "max_position_embeddings": 163840, so build a
    # prompt close to that limit while leaving room for the generated tokens.
    long_example_prompts = ["Hello " * (163839 - 500) + "Hello"]
    max_tokens = 500
    with VllmRunner(
            "vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
            tensor_parallel_size=2,
            quantization="ascend",
            enable_expert_parallel=True,
            max_model_len=163840,
            compilation_config={
                "cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12],
                "cudagraph_mode": "FULL_DECODE_ONLY",
            },
            speculative_config={
                "num_speculative_tokens": 1,
                "method": "deepseek_mtp",
            },
            additional_config={"layer_sharding": ["q_b_proj", "o_proj"]},
            reasoning_parser="deepseek_v3",
            tokenizer_mode="deepseek_v32",
            gpu_memory_utilization=0.8,
    ) as vllm_model:
        vllm_model.generate_greedy(short_example_prompts, max_tokens)
        vllm_model.generate_greedy(long_example_prompts, max_tokens)
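

# Variant of the test above with "enable_sparse_c8" switched on in
# additional_config (W8A8 weights plus C8, per the test name).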
@patch.dict(os.environ, {"HCCL_OP_EXPANSION_MODE": "AIV"})
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
@patch.dict(os.environ, {"ASCEND_AGGREGATE_ENABLE": "1"})
@patch.dict(os.environ, {"HCCL_BUFFSIZE": "1024"})
@wait_until_npu_memory_free()
def test_deepseek3_2_w8a8c8_pruning_mtp_tp2_ep():
    short_example_prompts = [
        "Hello ",
    ]
    # The model config sets "max_position_embeddings": 163840, so build a
    # prompt close to that limit while leaving room for the generated tokens.
    long_example_prompts = ["Hello " * (163839 - 500) + "Hello"]
    max_tokens = 500
    with VllmRunner(
            "vllm-ascend/DeepSeek-V3.2-W8A8-Pruning",
            tensor_parallel_size=2,
            quantization="ascend",
            enable_expert_parallel=True,
            max_model_len=163840,
            compilation_config={
                "cudagraph_capture_sizes": [2, 4, 6, 8, 10, 12],
                "cudagraph_mode": "FULL_DECODE_ONLY",
            },
            speculative_config={
                "num_speculative_tokens": 1,
                "method": "deepseek_mtp",
            },
            additional_config={
                "layer_sharding": ["q_b_proj", "o_proj"],
                "enable_sparse_c8": True,
            },
            reasoning_parser="deepseek_v3",
            tokenizer_mode="deepseek_v32",
            gpu_memory_utilization=0.8,
    ) as vllm_model:
        vllm_model.generate_greedy(short_example_prompts, max_tokens)
        vllm_model.generate_greedy(long_example_prompts, max_tokens)
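

# W4A4-quantized Qwen3 under TP=2; greedy smoke test.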
@pytest.mark.parametrize("model", QWEN_W4A4_MODELS)
def test_qwen3_w4a4_distributed_tp2(model):
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    with VllmRunner(
            model,
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
            quantization="ascend",
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
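

# GPT-OSS (BF16) under TP=2 in eager mode; greedy smoke test.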
@pytest.mark.parametrize("model", GPT_OSS_MODELS)
def test_gpt_oss_distributed_tp2(model):
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    with VllmRunner(
            model,
            tensor_parallel_size=2,
            enforce_eager=True,
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)