### What this PR does / why we need it?
Add MTP (multi-token prediction) speculative decoding support for GLM-4.6 with full-graph capture (`cudagraph_mode: FULL_DECODE_ONLY`) to improve decode performance.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
```bash
export HCCL_BUFFSIZE=1024
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_OP_EXPANSION_MODE=AIV
vllm serve /weight/glm4.6_w8a8_with_float_mtp \
--data-parallel-size 1 \
--tensor-parallel-size 16 \
--seed 1024 \
--served-model-name glm \
--max-model-len 35000 \
--max-num-batched-tokens 16384 \
--max-num-seqs 16 \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--speculative-config '{"num_speculative_tokens": 1,
"model":"/weight/glm4.6_w8a8_with_float_mtp", "method":"mtp"}' \
--compilation-config '{"cudagraph_capture_sizes": [1,2,4,8,16,32],
"cudagraph_mode": "FULL_DECODE_ONLY"}' \
--async-scheduling
```
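Once the server is up, it can be smoke-tested directly (a minimal sketch; the port 8000 default, the endpoint `/v1/completions`, and the served model name `glm` come from the commands in this section, while the prompt and `max_tokens` here are arbitrary):

```bash
# Query the OpenAI-compatible completions endpoint of the server above.
curl -s http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "glm", "prompt": "San Francisco is a", "max_tokens": 16}'
```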
Benchmark command:
```bash
vllm bench serve \
--backend vllm \
--dataset-name prefix_repetition \
--prefix-repetition-prefix-len 22400 \
--prefix-repetition-suffix-len 9600 \
--prefix-repetition-output-len 1024 \
--num-prompts 1 \
--prefix-repetition-num-prefixes 1 \
--ignore-eos \
--model glm \
--tokenizer /weight/glm4.6_w8a8_with_float_mtp \
--seed 1000 \
--host 0.0.0.0 \
--port 8000 \
--endpoint /v1/completions \
--max-concurrency 1 \
--request-rate 1
```
- vLLM version: v0.13.0
- vLLM main: 5326c89803
Signed-off-by: 1092626063 <1092626063@qq.com>
Accompanying e2e test:
```python
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from typing import Any

import openai
import pytest
from vllm.utils.network_utils import get_open_port

from tests.e2e.conftest import RemoteOpenAIServer
from tools.aisbench import run_aisbench_cases

MODELS = [
    "ZhipuAI/GLM-4.5",
]

TENSOR_PARALLELS = [8]
DATA_PARALLELS = [2]
# Exercise both eager decode and full-graph decode capture.
FULL_GRAPH = [True, False]

prompts = [
    "San Francisco is a",
]

api_keyword_args = {
    "max_tokens": 10,
}

# Accuracy and performance gates, evaluated via aisbench.
aisbench_cases = [{
    "case_type": "accuracy",
    "dataset_path": "vllm-ascend/gsm8k-lite",
    "request_conf": "vllm_api_general_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt",
    "max_out_len": 4096,
    "batch_size": 8,
    "baseline": 95,
    "threshold": 5
}, {
    "case_type": "performance",
    "dataset_path": "vllm-ascend/GSM8K-in3500-bs400",
    "request_conf": "vllm_api_stream_chat",
    "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf",
    "num_prompts": 16,
    "max_out_len": 1500,
    "batch_size": 8,
    "request_rate": 0,
    "baseline": 1,
    "threshold": 0.97
}]


@pytest.mark.asyncio
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
@pytest.mark.parametrize("dp_size", DATA_PARALLELS)
@pytest.mark.parametrize("full_graph", FULL_GRAPH)
async def test_models(model: str, tp_size: int, dp_size: int,
                      full_graph: bool) -> None:
    port = get_open_port()
    env_dict = {"HCCL_BUFFSIZE": "1024"}
    server_args = [
        "--no-enable-prefix-caching",
        "--enable-expert-parallel",
        "--tensor-parallel-size",
        str(tp_size),
        "--data-parallel-size",
        str(dp_size),
        "--port",
        str(port),
        "--max-model-len",
        "8192",
        "--max-num-batched-tokens",
        "8192",
        "--block-size",
        "16",
        "--trust-remote-code",
        "--gpu-memory-utilization",
        "0.9",
    ]
    if full_graph:
        # Capture full decode graphs at these batch sizes.
        server_args += [
            "--compilation-config",
            '{"cudagraph_capture_sizes": [1,2,4,8,16], "cudagraph_mode": "FULL_DECODE_ONLY"}'
        ]
    request_keyword_args: dict[str, Any] = {
        **api_keyword_args,
    }
    with RemoteOpenAIServer(model,
                            server_args,
                            server_port=port,
                            env_dict=env_dict,
                            auto_port=False) as server:
        client = server.get_async_client()
        batch = await client.completions.create(
            model=model,
            prompt=prompts,
            **request_keyword_args,
        )
        choices: list[openai.types.CompletionChoice] = batch.choices
        assert choices[0].text, "empty response"
        # aisbench test
        run_aisbench_cases(model, port, aisbench_cases)
```
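The test is parametrized over TP/DP sizes and the full-graph toggle. A typical invocation might look like the following (the file path is a placeholder; the test's actual location in the repo is not shown here):

```bash
# Run the GLM e2e test; substitute the real path to the test file.
pytest -sv tests/e2e/path/to/test_glm_mtp.py -k "test_models"
```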