### What this PR does / why we need it?
Adds support for the generalized FlashComm2 optimization, which reduces
communication overhead, decreases RMSNorm computation, and saves one
AllGather step by replacing the AllReduce operations in the Attention
module with pre-AlltoAll and post-AllGather operations (when used in
combination with FlashComm1). The feature is enabled during the prefill
phase, and using it together with FlashComm1 is recommended. It delivers
broad performance improvements, especially in long-sequence scenarios
with large tensor-parallel (TP) configurations. Benchmark tests show
that, under a TP16DP1 configuration, it improves the prefill performance
of the DeepSeek model by 8% on top of FlashComm1.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.11.0
- vLLM main: 83f478bb19
---------
Signed-off-by: zzhxx <2783294813@qq.com>
Signed-off-by: Levi-JQ <yujinqi2@huawei.com>
Co-authored-by: Levi-JQ <yujinqi2@huawei.com>
Co-authored-by: zzhxx <2783294813@qq.com>
269 lines
8.4 KiB
Python
269 lines
8.4 KiB
Python
#
|
|
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
|
# Copyright 2023 The vLLM team.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
# This file is a part of the vllm-ascend project.
|
|
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
|
|
#
|
|
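"""Smoke tests for distributed (tensor-parallel) offline inference.

Each test starts a two-way tensor-parallel engine and generates a few
tokens from a short prompt. No output is compared against HF; a test
passes when generation completes without raising.

The inherited run hint was `pytest tests/test_offline_inference.py`;
point pytest at this file's actual location instead.
"""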
|
|
import os
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
from modelscope import snapshot_download # type: ignore
|
|
from vllm import SamplingParams
|
|
|
|
from tests.e2e.conftest import VllmRunner
|
|
|
|
# Process-wide environment used by every test in this module. It is set at
# import time, i.e. before any engine is constructed by the tests below.
os.environ.update({
    "PYTORCH_NPU_ALLOC_CONF": "max_split_size_mb:256",
    "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
})

# Quantized dense Qwen checkpoints used by the dense-model tests.
QWEN_DENSE_MODELS = [
    "vllm-ascend/Qwen3-8B-W8A8",
    "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8",
]

# W4A8 Qwen checkpoints, grouped as "old version" and "new version" by the
# tests that consume them.
QWEN_W4A8_OLD_VERSION_MODELS = ["vllm-ascend/Qwen3-8B-W4A8"]

QWEN_W4A8_NEW_VERSION_MODELS = ["vllm-ascend/Qwen3-1.7B-W4A8-V1"]

# W4A8 DeepSeek checkpoints. The identifiers are repository names and must be
# kept exactly as spelled.
DEEPSEEK_W4A8_MODELS = [
    "vllm-ascend/DeepSeek-V3-W4A8-Pruing",
    "vllm-ascend/DeepSeek-V3.1-W4A8-puring",
]
|
|
|
|
|
|
def test_models_distributed_QwQ():
    """Generate a few greedy tokens with QwQ-32B on two tensor-parallel ranks.

    The runner is built with ``dtype="half"``, the ``mp`` executor backend
    and ``enforce_eager=False``. Nothing is asserted on the output; the test
    passes when generation finishes without raising.
    """
    prompts = ["Hello, my name is"]
    runner_kwargs = {
        "dtype": "half",
        "tensor_parallel_size": 2,
        "distributed_executor_backend": "mp",
        "enforce_eager": False,
    }
    with VllmRunner("Qwen/QwQ-32B", **runner_kwargs) as vllm_model:
        vllm_model.generate_greedy(prompts, 5)
|
|
|
|
|
|
def test_models_distributed_DeepSeek_multistream_moe():
    """Generate greedily on DeepSeek-V3-Pruning with multistream MoE enabled.

    The engine runs on two tensor-parallel ranks with the ``mp`` backend. The
    ``additional_config`` turns on the torchair graph config, the
    ``enable_multistream_moe`` flag and the ascend scheduler, and sets
    ``refresh`` to True. The output is not checked.
    """
    prompts = ["Hello, my name is"]
    ascend_options = {
        "torchair_graph_config": {
            "enabled": True,
        },
        "enable_multistream_moe": True,
        "ascend_scheduler_config": {
            "enabled": True,
        },
        "refresh": True,
    }
    with VllmRunner(
            "vllm-ascend/DeepSeek-V3-Pruning",
            dtype="half",
            tensor_parallel_size=2,
            distributed_executor_backend="mp",
            additional_config=ascend_options,
    ) as vllm_model:
        vllm_model.generate_greedy(prompts, 5)
|
|
|
|
|
|
def test_models_distributed_Qwen3_W8A8():
    """Generate greedily with the Qwen3-8B W8A8 checkpoint on two TP ranks.

    The checkpoint is resolved through ``snapshot_download`` and loaded with
    ``quantization="ascend"`` and ``max_model_len=8192``. The output is not
    checked.
    """
    prompts = ["Hello, my name is"]
    # Resolve the checkpoint before the runner is constructed.
    model_path = snapshot_download("vllm-ascend/Qwen3-8B-W8A8")
    runner_kwargs = {
        "max_model_len": 8192,
        "dtype": "auto",
        "tensor_parallel_size": 2,
        "quantization": "ascend",
    }
    with VllmRunner(model_path, **runner_kwargs) as vllm_model:
        vllm_model.generate_greedy(prompts, 5)
|
|
|
|
|
|
@pytest.mark.parametrize("model", QWEN_W4A8_OLD_VERSION_MODELS)
def test_models_distributed_Qwen3_W4A8DYNAMIC_old_version(model):
    """Generate greedily with each "old version" W4A8 Qwen checkpoint.

    Args:
        model: Model identifier supplied by the parametrization; it is
            resolved through ``snapshot_download`` before loading.

    The engine uses two tensor-parallel ranks, ``quantization="ascend"`` and
    ``max_model_len=8192``. The output is not checked.
    """
    greedy_token_budget = 5
    model_path = snapshot_download(model)
    with VllmRunner(
            model_path,
            max_model_len=8192,
            dtype="auto",
            tensor_parallel_size=2,
            quantization="ascend",
    ) as vllm_model:
        vllm_model.generate_greedy(["Hello, my name is"],
                                   greedy_token_budget)
|
|
|
|
|
|
@pytest.mark.parametrize("model", QWEN_W4A8_NEW_VERSION_MODELS)
def test_models_distributed_Qwen3_W4A8DYNAMIC_new_version(model):
    """Generate greedily with each "new version" W4A8 Qwen checkpoint.

    Args:
        model: Model identifier supplied by the parametrization; it is
            resolved through ``snapshot_download`` before loading.

    The engine uses two tensor-parallel ranks, ``quantization="ascend"`` and
    ``max_model_len=8192``. The output is not checked.
    """
    greedy_token_budget = 5
    model_path = snapshot_download(model)
    with VllmRunner(
            model_path,
            max_model_len=8192,
            dtype="auto",
            tensor_parallel_size=2,
            quantization="ascend",
    ) as vllm_model:
        vllm_model.generate_greedy(["Hello, my name is"],
                                   greedy_token_budget)
|
|
|
|
|
|
@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
@patch.dict(os.environ, {
    "HCCL_BUFFSIZE": "1024",
    "VLLM_ASCEND_MLA_PA": "1",
})
def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
    """Generate greedily with each W4A8 DeepSeek checkpoint in eager mode.

    Args:
        model: Model identifier supplied by the parametrization; it is
            resolved through ``snapshot_download`` before loading.

    While the test runs, ``HCCL_BUFFSIZE`` is set to "1024" and
    ``VLLM_ASCEND_MLA_PA`` to "1". The engine uses two tensor-parallel ranks
    with expert parallelism, ``enforce_eager=True``, the torchair graph
    config disabled and the ascend scheduler enabled. The output is not
    checked.
    """
    prompts = ["Hello, my name is"]
    ascend_options = {
        "torchair_graph_config": {
            "enabled": False,
        },
        "ascend_scheduler_config": {
            "enabled": True,
        },
    }
    model_path = snapshot_download(model)
    with VllmRunner(
            model_path,
            dtype="auto",
            tensor_parallel_size=2,
            quantization="ascend",
            enforce_eager=True,
            enable_expert_parallel=True,
            additional_config=ascend_options,
    ) as vllm_model:
        vllm_model.generate_greedy(prompts, 5)
|
|
|
|
|
|
def test_sp_for_qwen3_moe() -> None:
    """Run Qwen3-30B-A3B with the sequence-parallelism compilation pass on.

    The engine uses two tensor-parallel ranks, the ``mp`` backend, expert
    parallelism and ``enforce_eager=True``. ``enable_sequence_parallelism``
    is switched on through ``compilation_config["pass_config"]``. The output
    is not checked.
    """
    decoding = SamplingParams(
        max_tokens=5,
        temperature=0.0,
        top_k=50,
        top_p=0.9,
    )
    sp_compilation = {
        "pass_config": {
            "enable_sequence_parallelism": True
        }
    }
    model_path = snapshot_download("Qwen/Qwen3-30B-A3B")
    with VllmRunner(
            model_path,
            dtype="auto",
            tensor_parallel_size=2,
            distributed_executor_backend="mp",
            compilation_config=sp_compilation,
            enable_expert_parallel=True,
            enforce_eager=True,
    ) as vllm_model:
        vllm_model.generate(["Hello, my name is"], decoding)
|
|
|
|
|
|
@patch.dict(
    os.environ, {
        "VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": "1",
        "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1",
    })
def test_fc2_for_qwen3_moe() -> None:
    """Run Qwen3-30B-A3B with the FlashComm1 and FlashComm2 env vars set.

    While the test runs, ``VLLM_ASCEND_ENABLE_FLASHCOMM1`` and
    ``VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE`` are both "1". The engine uses
    two tensor-parallel ranks, the ``mp`` backend, expert parallelism and
    ``enforce_eager=True``. The output is not checked.
    """
    decoding = SamplingParams(
        max_tokens=5,
        temperature=0.0,
        top_k=50,
        top_p=0.9,
    )
    model_path = snapshot_download("Qwen/Qwen3-30B-A3B")
    with VllmRunner(
            model_path,
            dtype="auto",
            tensor_parallel_size=2,
            distributed_executor_backend="mp",
            enable_expert_parallel=True,
            enforce_eager=True,
    ) as vllm_model:
        vllm_model.generate(["Hello, my name is"], decoding)
|
|
|
|
|
|
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
def test_models_distributed_deepseek_v2_lite_with_flashcomm_v1() -> None:
    """Run DeepSeek-V2-Lite W8A8 on a long prompt with FlashComm1's env var set.

    The single prompt is the word "test" repeated 1001 times.
    ``VLLM_ASCEND_ENABLE_FLASHCOMM1`` is "1" while the test runs. The engine
    uses two tensor-parallel ranks, the ``mp`` backend, expert parallelism,
    ``enforce_eager=True`` and ``quantization="ascend"``. The output is not
    checked.
    """
    long_prompt = "test" * 1001
    decoding = SamplingParams(
        max_tokens=5,
        temperature=0.0,
        top_k=50,
        top_p=0.9,
    )
    model_path = snapshot_download("vllm-ascend/DeepSeek-V2-Lite-W8A8")
    with VllmRunner(
            model_path,
            dtype="auto",
            tensor_parallel_size=2,
            distributed_executor_backend="mp",
            enable_expert_parallel=True,
            enforce_eager=True,
            quantization="ascend",
    ) as vllm_model:
        vllm_model.generate([long_prompt], decoding)
|
|
|
|
|
|
@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
@patch.dict(
    os.environ, {
        "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1",
        "VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1",
    })
def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model):
    """Generate greedily on each dense Qwen model with FlashComm1's env var set.

    Args:
        model: Model identifier supplied by the parametrization; it is
            resolved through ``snapshot_download`` before loading.

    While the test runs, ``VLLM_ASCEND_ENABLE_FLASHCOMM1`` and
    ``VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE`` are both "1". The engine uses two
    tensor-parallel ranks, ``quantization="ascend"``,
    ``max_model_len=8192`` and ``enforce_eager=False``. The output is not
    checked.
    """
    prompts = ["Hello, my name is"]
    model_path = snapshot_download(model)
    runner_kwargs = {
        "max_model_len": 8192,
        "enforce_eager": False,
        "dtype": "auto",
        "tensor_parallel_size": 2,
        "quantization": "ascend",
    }
    with VllmRunner(model_path, **runner_kwargs) as vllm_model:
        vllm_model.generate_greedy(prompts, 5)
|
|
|
|
|
|
@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
@patch.dict(
    os.environ, {
        "VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1",
        "VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1",
    })
def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(model):
    """Generate greedily on each dense Qwen model with MLP prefetch's env var set.

    Args:
        model: Model identifier supplied by the parametrization; it is
            resolved through ``snapshot_download`` before loading.

    While the test runs, ``VLLM_ASCEND_ENABLE_PREFETCH_MLP`` and
    ``VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE`` are both "1". The engine uses two
    tensor-parallel ranks, ``quantization="ascend"``,
    ``max_model_len=8192`` and ``enforce_eager=False``. The output is not
    checked.
    """
    prompts = ["Hello, my name is"]
    model_path = snapshot_download(model)
    runner_kwargs = {
        "max_model_len": 8192,
        "enforce_eager": False,
        "dtype": "auto",
        "tensor_parallel_size": 2,
        "quantization": "ascend",
    }
    with VllmRunner(model_path, **runner_kwargs) as vllm_model:
        vllm_model.generate_greedy(prompts, 5)
|