[Feature] GLM4.6 support mtp with fullgraph (#5460)
### What this PR does / why we need it?
Enable MTP (multi-token prediction) speculative decoding with full-graph capture mode for GLM4.6 to improve performance.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
`
export HCCL_BUFFSIZE=1024
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_OP_EXPANSION_MODE=AIV
vllm serve /weight/glm4.6_w8a8_with_float_mtp \
--data-parallel-size 1 \
--tensor-parallel-size 16 \
--seed 1024 \
--served-model-name glm \
--max-model-len 35000 \
--max-num-batched-tokens 16384 \
--max-num-seqs 16 \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--speculative-config '{"num_speculative_tokens": 1,
"model":"/weight/glm4.6_w8a8_with_float_mtp", "method":"mtp"}' \
--compilation-config '{"cudagraph_capture_sizes": [1,2,4,8,16,32],
"cudagraph_mode": "FULL_DECODE_ONLY"}' \
--async-scheduling \
`
test case:
`
vllm bench serve \
--backend vllm \
--dataset-name prefix_repetition \
--prefix-repetition-prefix-len 22400 \
--prefix-repetition-suffix-len 9600 \
--prefix-repetition-output-len 1024 \
--num-prompts 1 \
--prefix-repetition-num-prefixes 1 \
--ignore-eos \
--model glm \
--tokenizer /weight/glm4.6_w8a8_with_float_mtp \
--seed 1000 \
--host 0.0.0.0 \
--port 8000 \
--endpoint /v1/completions \
--max-concurrency 1 \
--request-rate 1
`
- vLLM version: v0.13.0
- vLLM main:
5326c89803
Signed-off-by: 1092626063 <1092626063@qq.com>
This commit is contained in:
@@ -29,6 +29,7 @@ MODELS = [
|
|||||||
|
|
||||||
TENSOR_PARALLELS = [8]
|
TENSOR_PARALLELS = [8]
|
||||||
DATA_PARALLELS = [2]
|
DATA_PARALLELS = [2]
|
||||||
|
FULL_GRAPH = [True, False]
|
||||||
|
|
||||||
prompts = [
|
prompts = [
|
||||||
"San Francisco is a",
|
"San Francisco is a",
|
||||||
@@ -65,11 +66,9 @@ aisbench_cases = [{
|
|||||||
@pytest.mark.parametrize("model", MODELS)
|
@pytest.mark.parametrize("model", MODELS)
|
||||||
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
|
@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
|
||||||
@pytest.mark.parametrize("dp_size", DATA_PARALLELS)
|
@pytest.mark.parametrize("dp_size", DATA_PARALLELS)
|
||||||
async def test_models(
|
@pytest.mark.parametrize("full_graph", FULL_GRAPH)
|
||||||
model: str,
|
async def test_models(model: str, tp_size: int, dp_size: int,
|
||||||
tp_size: int,
|
full_graph: bool) -> None:
|
||||||
dp_size: int,
|
|
||||||
) -> None:
|
|
||||||
port = get_open_port()
|
port = get_open_port()
|
||||||
env_dict = {"HCCL_BUFFSIZE": "1024"}
|
env_dict = {"HCCL_BUFFSIZE": "1024"}
|
||||||
server_args = [
|
server_args = [
|
||||||
@@ -91,6 +90,11 @@ async def test_models(
|
|||||||
"--gpu-memory-utilization",
|
"--gpu-memory-utilization",
|
||||||
"0.9",
|
"0.9",
|
||||||
]
|
]
|
||||||
|
if full_graph:
|
||||||
|
server_args += [
|
||||||
|
"--compilation-config",
|
||||||
|
'{"cudagraph_capture_sizes": [1,2,4,8,16], "cudagraph_mode":"FULL_DECODE_ONLY"}'
|
||||||
|
]
|
||||||
request_keyword_args: dict[str, Any] = {
|
request_keyword_args: dict[str, Any] = {
|
||||||
**api_keyword_args,
|
**api_keyword_args,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -173,7 +173,15 @@ class AscendQuantConfig(QuantizationConfig):
|
|||||||
"are quantized. All shards of fused layers "
|
"are quantized. All shards of fused layers "
|
||||||
"to have the same precision.")
|
"to have the same precision.")
|
||||||
else:
|
else:
|
||||||
is_skipped = self.quant_description[prefix + '.weight'] == "FLOAT"
|
# NOTE: In GLM4.6, the MTP draft model shares the same LM head weights
|
||||||
|
# with the main model. Therefore, before `load_weights()` runs, some parameter
|
||||||
|
# names may not include the expected prefix and may appear only with the
|
||||||
|
# ".head" suffix. This can trigger a load-time error, so here we replace the
|
||||||
|
# key with "lm_head.weight".
|
||||||
|
key = prefix + '.weight'
|
||||||
|
if key not in self.quant_description and ".head" in prefix:
|
||||||
|
key = 'lm_head.weight'
|
||||||
|
is_skipped = self.quant_description[key] == "FLOAT"
|
||||||
|
|
||||||
assert is_skipped is not None
|
assert is_skipped is not None
|
||||||
return is_skipped
|
return is_skipped
|
||||||
|
|||||||
Reference in New Issue
Block a user