[CI] Add DeepSeek-V3.2-W8A8 nightly ci test (#4633)
### What this PR does / why we need it?
Add a DeepSeek-V3.2-W8A8 nightly CI test:
DeepSeek-V3.2-W8A8, 1 node, DP2+TP8:
tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py
### Does this PR introduce _any_ user-facing change?
- vLLM version: v0.12.0
- vLLM main: ad32e3e19c
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
13
.github/workflows/_e2e_nightly_single_node.yaml
vendored
13
.github/workflows/_e2e_nightly_single_node.yaml
vendored
@@ -110,19 +110,6 @@ jobs:
|
||||
fi
|
||||
cd ..
|
||||
|
||||
- name: Install custom-ops (for DeepSeek-V3.2-Exp)
|
||||
if: ${{ inputs.name == 'deepseek3_2-exp-w8a8' }}
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/CANN-custom_ops-sfa-linux.aarch64.run
|
||||
chmod +x ./CANN-custom_ops-sfa-linux.aarch64.run
|
||||
./CANN-custom_ops-sfa-linux.aarch64.run --quiet
|
||||
export ASCEND_CUSTOM_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize:${ASCEND_CUSTOM_OPP_PATH}
|
||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH}
|
||||
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/a3/custom_ops-1.0-cp311-cp311-linux_aarch64.whl
|
||||
pip install custom_ops-1.0-cp311-cp311-linux_aarch64.whl
|
||||
. /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||
|
||||
- name: Install Ascend toolkit & triton_ascend
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
|
||||
11
.github/workflows/nightly_test_a3.yaml
vendored
11
.github/workflows/nightly_test_a3.yaml
vendored
@@ -150,13 +150,12 @@ jobs:
|
||||
- name: kimi-k2-thinking
|
||||
os: linux-aarch64-a3-16
|
||||
tests: tests/e2e/nightly/single_node/models/test_kimi_k2_thinking.py
|
||||
# TODO: Replace deepseek3.2-exp with deepseek3.2 after nightly tests pass
|
||||
# - name: deepseek3_2-exp-w8a8
|
||||
# os: linux-aarch64-a3-16
|
||||
# tests: tests/e2e/nightly/single_node/models/test_deepseek_v3_2_exp_w8a8.py
|
||||
- name: deepseek-r1-w8a8-hmb
|
||||
- name: deepseek-r1-w8a8-hbm
|
||||
os: linux-aarch64-a3-16
|
||||
tests: tests/e2e/nightly/single_node/models/test_deepseek_r1_w8a8_hmb.py
|
||||
tests: tests/e2e/nightly/single_node/models/test_deepseek_r1_w8a8_hbm.py
|
||||
- name: deepseek3_2-w8a8
|
||||
os: linux-aarch64-a3-16
|
||||
tests: tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py
|
||||
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
|
||||
with:
|
||||
vllm: v0.13.0
|
||||
|
||||
@@ -1,51 +0,0 @@
|
||||
test_name: "test DeepSeek-V3.2-Exp-bf16 multi-dp"
|
||||
model: "Yanguan/DeepSeek-V3.2-Exp-bf16"
|
||||
num_nodes: 2
|
||||
npu_per_node: 16
|
||||
env_common:
|
||||
VLLM_USE_MODELSCOPE: true
|
||||
OMP_PROC_BIND: false
|
||||
OMP_NUM_THREADS: 100
|
||||
HCCL_BUFFSIZE: 1024
|
||||
SERVER_PORT: 8080
|
||||
VLLM_ASCEND_ENABLE_MLAPO: 0
|
||||
|
||||
deployment:
|
||||
-
|
||||
server_cmd: >
|
||||
vllm serve "Yanguan/DeepSeek-V3.2-Exp-bf16"
|
||||
--host 0.0.0.0
|
||||
--port $SERVER_PORT
|
||||
--data-parallel-address $LOCAL_IP
|
||||
--data-parallel-size 2
|
||||
--data-parallel-size-local 1
|
||||
--data-parallel-rpc-port 13389
|
||||
--tensor-parallel-size 16
|
||||
--seed 1024
|
||||
--enable-expert-parallel
|
||||
--max-num-seqs 16
|
||||
--max-model-len 17450
|
||||
--max-num-batched-tokens 17450
|
||||
--trust-remote-code
|
||||
--no-enable-prefix-caching
|
||||
--gpu-memory-utilization 0.9
|
||||
|
||||
-
|
||||
server_cmd: >
|
||||
vllm serve "Yanguan/DeepSeek-V3.2-Exp-bf16"
|
||||
--headless
|
||||
--data-parallel-size 2
|
||||
--data-parallel-size-local 1
|
||||
--data-parallel-start-rank 1
|
||||
--data-parallel-address $MASTER_IP
|
||||
--data-parallel-rpc-port 13389
|
||||
--tensor-parallel-size 16
|
||||
--seed 1024
|
||||
--max-num-seqs 16
|
||||
--max-model-len 17450
|
||||
--max-num-batched-tokens 17450
|
||||
--enable-expert-parallel
|
||||
--trust-remote-code
|
||||
--no-enable-prefix-caching
|
||||
--gpu-memory-utilization 0.92
|
||||
benchmarks:
|
||||
@@ -23,13 +23,10 @@ from vllm.utils.network_utils import get_open_port
|
||||
from tests.e2e.conftest import RemoteOpenAIServer
|
||||
from tools.aisbench import run_aisbench_cases
|
||||
|
||||
MODELS = [
|
||||
"vllm-ascend/DeepSeek-V3.2-Exp-W8A8",
|
||||
]
|
||||
MODELS = ["vllm-ascend/DeepSeek-V3.2-W8A8"]
|
||||
|
||||
TENSOR_PARALLELS = [8]
|
||||
DATA_PARALLELS = [2]
|
||||
FULL_GRAPH = [True, False]
|
||||
|
||||
prompts = [
|
||||
"San Francisco is a",
|
||||
@@ -53,11 +50,11 @@ aisbench_cases = [{
|
||||
"dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
|
||||
"request_conf": "vllm_api_stream_chat",
|
||||
"dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
|
||||
"num_prompts": 16,
|
||||
"num_prompts": 100,
|
||||
"max_out_len": 1500,
|
||||
"batch_size": 8,
|
||||
"request_rate": 0,
|
||||
"baseline": 1,
|
||||
"batch_size": 4,
|
||||
"request_rate": 11.2,
|
||||
"baseline": 120,
|
||||
"threshold": 0.97
|
||||
}]
|
||||
|
||||
@@ -66,24 +63,29 @@ aisbench_cases = [{
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
|
||||
@pytest.mark.parametrize("dp_size", DATA_PARALLELS)
|
||||
@pytest.mark.parametrize("full_graph", FULL_GRAPH)
|
||||
async def test_models(model: str, tp_size: int, dp_size: int,
|
||||
full_graph: bool) -> None:
|
||||
async def test_models(model: str, tp_size: int, dp_size: int) -> None:
|
||||
port = get_open_port()
|
||||
env_dict = {"HCCL_BUFFSIZE": "1024", "VLLM_ASCEND_ENABLE_MLAPO": "0"}
|
||||
env_dict = {
|
||||
"HCCL_OP_EXPANSION_MODE": "AIV",
|
||||
"OMP_PROC_BIND": "false",
|
||||
"OMP_NUM_THREADS": "1",
|
||||
"HCCL_BUFFSIZE": "1024",
|
||||
"VLLM_ASCEND_ENABLE_MLAPO": "1",
|
||||
"PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
|
||||
"VLLM_ASCEND_ENABLE_FLASHCOMM1": "0",
|
||||
}
|
||||
|
||||
server_args = [
|
||||
"--no-enable-prefix-caching", "--enable-expert-parallel",
|
||||
"--tensor-parallel-size",
|
||||
"--enable-expert-parallel", "--tensor-parallel-size",
|
||||
str(tp_size), "--data-parallel-size",
|
||||
str(dp_size), "--port",
|
||||
str(port), "--max-model-len", "16384", "--max-num-batched-tokens",
|
||||
"16384", "--block-size", "16", "--trust-remote-code", "--quantization",
|
||||
"ascend", "--gpu-memory-utilization", "0.9"
|
||||
]
|
||||
if full_graph:
|
||||
server_args += [
|
||||
"--compilation-config",
|
||||
'{"cudagraph_capture": [16], "cudagraph_model":"FULL_DECODE_ONLY"}'
|
||||
str(port), "--max-model-len", "8192", "--max-num-batched-tokens",
|
||||
"8192", "--max-num-seqs", "4", "--trust-remote-code", "--quantization",
|
||||
"ascend", "--gpu-memory-utilization", "0.92", "--compilation-config",
|
||||
'{"cudagraph_capture_sizes":[3, 6, 9, 12], "cudagraph_mode":"FULL_DECODE_ONLY"}',
|
||||
"--speculative-config",
|
||||
'{"num_speculative_tokens": 2, "method":"deepseek_mtp"}',
|
||||
"--reasoning-parser", "deepseek_v3", "--tokenizer_mode", "deepseek_v32"
|
||||
]
|
||||
request_keyword_args: dict[str, Any] = {
|
||||
**api_keyword_args,
|
||||
Reference in New Issue
Block a user