### What this PR does / why we need it?
Adds support for the generalized FlashComm2 optimization, which reduces
communication overhead, decreases RMSNorm computation, and saves one
AllGather step by replacing AllReduce operations in the Attention module
with pre-AlltoAll and post-AllGather operations (used in combination
with FlashComm1). The feature is enabled during the prefill phase, and
using it together with FlashComm1 is recommended. It delivers broad
performance improvements, especially in long-sequence scenarios with
large tensor-parallel (TP) configurations. Benchmark tests show that
under a TP16DP1 configuration, it improves the prefill performance of
the DeepSeek model by 8% on top of FlashComm1.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.11.0
- vLLM main:
83f478bb19
---------
Signed-off-by: zzhxx <2783294813@qq.com>
Signed-off-by: Levi-JQ <yujinqi2@huawei.com>
Co-authored-by: Levi-JQ <yujinqi2@huawei.com>
Co-authored-by: zzhxx <2783294813@qq.com>
102 lines
4.2 KiB
Python
102 lines
4.2 KiB
Python
#
|
|
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
|
# Copyright 2023 The vLLM team.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
import multiprocessing
|
|
import os
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
import torch
|
|
from modelscope import snapshot_download # type: ignore
|
|
from vllm import SamplingParams
|
|
|
|
from tests.e2e.conftest import VllmRunner
|
|
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
|
|
|
# Model identifiers that the parametrized test in this file is run against.
MODELS = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
|
|
|
|
|
|
@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
                    reason="aclgraph only support on v1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [4])
@patch.dict(os.environ, {"VLLM_ASCEND_FLASHCOMM2_PARALLEL_SIZE": "0"})
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"})
def test_aclgraph_mem_use(model: str, max_tokens: int) -> None:
    """Check that ``NPUModelRunner._capture_model`` stays within a memory budget.

    ``_capture_model`` is wrapped so that the free NPU memory reported by
    ``torch.npu.mem_get_info()`` is sampled immediately before and after the
    original method runs. The difference is compared against an empirical
    per-model baseline multiplied by a tolerance factor.

    Args:
        model: Model identifier passed to ``snapshot_download``.
        max_tokens: Number of tokens to generate for each prompt.
    """
    # Drop the variable if it is set. ``pop`` with a default avoids the
    # KeyError that ``del`` raises when the variable is absent.
    # NOTE(review): presumably removed so workers are forked and therefore
    # inherit the patched method and the shared counters below — confirm.
    os.environ.pop("VLLM_WORKER_MULTIPROC_METHOD", None)

    # Shared values so measurements taken inside the wrapped method can be
    # read back by this test afterwards.
    capture_called = multiprocessing.Value("i", 0)  # int, 0 or 1
    capture_mem_before = multiprocessing.Value("q", -1)  # long long (64-bit)
    capture_mem_after = multiprocessing.Value("q", -1)  # long long

    def capture_model_wrapper(original_method):
        """Return a replacement that records free memory around the call."""

        def wrapped(self):
            mem_before = torch.npu.mem_get_info()[0]  # free memory
            result = original_method(self)
            mem_after = torch.npu.mem_get_info()[0]
            with capture_called.get_lock():
                capture_called.value = 1
                capture_mem_before.value = mem_before
                capture_mem_after.value = mem_after
            return result

        return wrapped

    is_deepseek = model == "vllm-ascend/DeepSeek-V2-Lite-W8A8"
    original_capture = NPUModelRunner._capture_model

    with patch.object(NPUModelRunner,
                      '_capture_model',
                      new=capture_model_wrapper(original_capture)):
        prompts = [
            "Hello, my name is", "The president of the United States is",
            "The capital of France is", "The future of AI is"
        ]
        sampling_params = SamplingParams(max_tokens=max_tokens,
                                         temperature=0.0)
        if is_deepseek:
            vllm_model = VllmRunner(snapshot_download(model),
                                    max_model_len=1024,
                                    quantization="ascend")
        else:
            vllm_model = VllmRunner(snapshot_download(model))
        _ = vllm_model.generate(prompts, sampling_params)

    assert capture_called.value == 1, "_capture_model was not called during test"
    assert capture_mem_before.value != -1, "capture_mem_before not set"
    assert capture_mem_after.value != -1, "capture_mem_after not set"

    print("capture_mem_before =", capture_mem_before.value)
    print("capture_mem_after =", capture_mem_after.value)

    # Free memory drops while graphs are captured, so before - after is the
    # number of bytes consumed by the capture.
    mem_used_by_capture = capture_mem_before.value - capture_mem_after.value
    # Empirical observation: capturing ACL graphs for Qwen3-0.6B uses ~0.20 GiB of NPU memory.
    # DeepSeek-V2-Lite-W8A8 uses ~0.68 GiB of NPU memory.
    # A tolerance factor (1.3x for Qwen3-0.6B, 1.5x for DeepSeek-V2-Lite-W8A8)
    # is applied to account for runtime variance.
    if is_deepseek:
        baseline_capture_mem = 0.68
        capture_mem_tolerance = 1.5
    else:
        baseline_capture_mem = 0.20
        capture_mem_tolerance = 1.3
    max_capture_mem_gib = baseline_capture_mem * capture_mem_tolerance
    max_mem_expected = max_capture_mem_gib * (1024**3)
    assert mem_used_by_capture < max_mem_expected, (
        f"_capture_model used more memory than expected. "
        f"Used: {mem_used_by_capture / (1024**3):.2f} GiB, "
        f"Expected: < {max_capture_mem_gib:.2f} GiB")
    # NOTE(review): the patch.dict decorators restore os.environ when the test
    # exits, so this reset is likely redundant — confirm before removing.
    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = 'spawn'
|