### What this PR does / why we need it?
Adds support for the generalized FlashComm2 optimization. FlashComm2
replaces the AllReduce operations in the Attention module with a
pre-AlltoAll and a post-AllGather operation (used in combination with
FlashComm1), which reduces communication overhead, decreases RMSNorm
computation, and saves one AllGather step. The feature is enabled during
the prefill phase and is recommended to be used together with FlashComm1.
It delivers broad performance improvements, especially in long-sequence
scenarios with large tensor-parallel (TP) configurations. Benchmarks show
that under a TP16DP1 configuration it improves the prefill performance of
the DeepSeek model by 8% on top of FlashComm1.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.11.0
- vLLM main: 83f478bb19
---------
Signed-off-by: zzhxx <2783294813@qq.com>
Signed-off-by: Levi-JQ <yujinqi2@huawei.com>
Co-authored-by: Levi-JQ <yujinqi2@huawei.com>
Co-authored-by: zzhxx <2783294813@qq.com>
206 lines
8.4 KiB
YAML
206 lines
8.4 KiB
YAML
# Reusable end-to-end test workflow; it is only triggered through
# `workflow_call` from another workflow.
name: 'e2e test'

on:
  workflow_call:
    inputs:
      # Git ref of vllm-project/vllm that is checked out and installed.
      vllm:
        required: true
        type: string
      # Runner label prefix; the jobs append `-1` (singlecard) or `-2` (multicard).
      runner:
        required: true
        type: string
      # Container image both jobs run in.
      image:
        required: true
        type: string
      # Test scope; the test steps are gated on the values 'light' and 'full'.
      type:
        required: true
        type: string

jobs:
  # Single-card e2e tests.
  e2e:
    name: singlecard
    runs-on: ${{ inputs.runner }}-1
    container:
      image: ${{ inputs.image }}
      # NOTE(review): `env` is assumed to belong to `container` because it
      # directly follows `image` — confirm against the original workflow.
      env:
        VLLM_LOGGING_LEVEL: ERROR
        VLLM_USE_MODELSCOPE: True
    steps:
      - name: Check npu and CANN info
        run: |
          npu-smi info
          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info

      - name: Config mirrors
        run: |
          sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
          pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
          pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
          apt-get update -y
          # apt-get rather than apt: the apt CLI is not stable for scripted use.
          apt-get install -y git

      - name: Checkout vllm-project/vllm-ascend repo
        uses: actions/checkout@v4

      - name: Install system dependencies
        run: |
          # Word splitting of the file contents is intentional: one package per word.
          apt-get -y install $(cat packages.txt)
          apt-get -y install gcc g++ cmake libnuma-dev

      - name: Checkout vllm-project/vllm repo
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          ref: ${{ inputs.vllm }}
          path: ./vllm-empty
          fetch-depth: 1

      - name: Install vllm-project/vllm from source
        working-directory: ./vllm-empty
        run: |
          VLLM_TARGET_DEVICE=empty pip install -e .

      - name: Install vllm-project/vllm-ascend
        env:
          PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
        run: |
          pip install -r requirements-dev.txt
          pip install -v -e .

      # Reduced test set, run only when the caller passes type == 'light'.
      - name: Run vllm-project/vllm-ascend test
        if: ${{ inputs.type == 'light' }}
        env:
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          VLLM_USE_MODELSCOPE: True
          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
        run: |
          # pytest -sv tests/e2e/singlecard/test_aclgraph.py
          # pytest -sv tests/e2e/singlecard/test_quantization.py
          pytest -sv tests/e2e/singlecard/test_vlm.py::test_multimodal_vl

      # Full test set, run only when the caller passes type == 'full'.
      - name: Run e2e test
        if: ${{ inputs.type == 'full' }}
        env:
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          VLLM_USE_MODELSCOPE: True
          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
        run: |
          # Running the aclgraph tests in one batch causes an AclmdlRICaptureBegin
          # error, so each test file is run separately.

          pytest -sv tests/e2e/singlecard/test_completion_with_prompt_embeds.py
          pytest -sv tests/e2e/singlecard/test_aclgraph.py
          pytest -sv tests/e2e/singlecard/test_aclgraph_mem.py
          pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
          pytest -sv tests/e2e/singlecard/test_bge_model.py
          pytest -sv tests/e2e/singlecard/test_camem.py
          pytest -sv tests/e2e/singlecard/test_chunked.py
          pytest -sv tests/e2e/singlecard/test_embedding.py
          # pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
          pytest -sv tests/e2e/singlecard/test_guided_decoding.py
          pytest -sv tests/e2e/singlecard/test_ilama_lora.py
          pytest -sv tests/e2e/singlecard/test_profile_execute_duration.py
          pytest -sv tests/e2e/singlecard/test_quantization.py
          pytest -sv tests/e2e/singlecard/test_sampler.py
          pytest -sv tests/e2e/singlecard/test_vlm.py
          pytest -sv tests/e2e/singlecard/multi-modal/test_internvl.py

          # ------------------------------------ v1 spec decode test ------------------------------------ #
          pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
          pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py
          # FIXME: disabled because of an OOM error
          # pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py

          # TODO: Move ops test to nightly test
          # pytest -sv tests/e2e/singlecard/ops/

  # Two-card e2e tests; setup steps mirror the singlecard job.
  e2e-2-cards:
    name: multicard
    runs-on: ${{ inputs.runner }}-2
    container:
      image: ${{ inputs.image }}
      # NOTE(review): `env` is assumed to belong to `container` because it
      # directly follows `image` — confirm against the original workflow.
      env:
        VLLM_LOGGING_LEVEL: ERROR
        VLLM_USE_MODELSCOPE: True
    steps:
      - name: Check npu and CANN info
        run: |
          npu-smi info
          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info

      - name: Config mirrors
        run: |
          sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
          pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
          pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
          apt-get update -y
          # apt-get rather than apt: the apt CLI is not stable for scripted use.
          apt-get install -y git

      - name: Checkout vllm-project/vllm-ascend repo
        uses: actions/checkout@v4

      - name: Install system dependencies
        run: |
          # Word splitting of the file contents is intentional: one package per word.
          apt-get -y install $(cat packages.txt)
          apt-get -y install gcc g++ cmake libnuma-dev

      - name: Checkout vllm-project/vllm repo
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          ref: ${{ inputs.vllm }}
          path: ./vllm-empty
          fetch-depth: 1

      - name: Install vllm-project/vllm from source
        working-directory: ./vllm-empty
        run: |
          VLLM_TARGET_DEVICE=empty pip install -e .

      - name: Install vllm-project/vllm-ascend
        env:
          PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
        run: |
          pip install -r requirements-dev.txt
          pip install -v -e .

      # Reduced test set, run only when the caller passes type == 'light'.
      - name: Run vllm-project/vllm-ascend test (light)
        if: ${{ inputs.type == 'light' }}
        env:
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          VLLM_USE_MODELSCOPE: True
        run: |
          pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_TP2_WITH_EP
          pytest -sv tests/e2e/multicard/test_torchair_graph_mode.py::test_e2e_deepseekv2lite_with_torchair

      # Full test set, run only when the caller passes type == 'full'.
      - name: Run vllm-project/vllm-ascend test (full)
        if: ${{ inputs.type == 'full' }}
        env:
          VLLM_WORKER_MULTIPROC_METHOD: spawn
          VLLM_USE_MODELSCOPE: True
        run: |
          pytest -sv tests/e2e/multicard/test_torchair_graph_mode.py
          pytest -sv tests/e2e/multicard/test_data_parallel.py
          pytest -sv tests/e2e/multicard/test_expert_parallel.py
          pytest -sv tests/e2e/multicard/test_external_launcher.py
          pytest -sv tests/e2e/multicard/test_single_request_aclgraph.py
          pytest -sv tests/e2e/multicard/test_fused_moe_allgather_ep.py
          pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py

          # To avoid OOM, each of these tests is run in its own pytest process.
          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W8A8
          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC_new_version
          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC_old_version
          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe
          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_fc2_for_qwen3_moe
          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_flashcomm_v1
          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight

          pytest -sv tests/e2e/multicard/test_pipeline_parallel.py
          pytest -sv tests/e2e/multicard/test_prefix_caching.py
          pytest -sv tests/e2e/multicard/test_qwen3_moe.py