[v0.18.0][Test][Misc] Update CI for GLM-5 configuration on vllm-ascend/releases/v0.18.0 branch (#8322)
<!-- Thanks for sending a pull request! BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html --> ### What this PR does / why we need it? Update the CI configuration for GLM-5 on the vllm-ascend/releases/v0.18.0 branch. 在0.18.0版本上对glm5-w4a8做测试 (i.e. test the GLM-5 W4A8 quantized configuration on the 0.18.0 release branch). ### Does this PR introduce _any_ user-facing change? <!-- Note that it means *any* user-facing change including all aspects such as API, interface or other behavior changes. Documentation-only updates are not considered user-facing changes. --> ### How was this patch tested? <!-- CI passed with new added/existing test. If it was tested in a way different from regular unit tests, please clarify how you tested step by step, ideally copy and paste-able, so that other reviewers can test and check, and descendants can verify in the future. If tests were not added, please describe why they were not added and/or why it was difficult to add. --> --------- Signed-off-by: yangjiuhua <y00845194@china.huawei.com> Co-authored-by: yangjiuhua <y00845194@china.huawei.com>
This commit is contained in:
@@ -9,7 +9,6 @@ _envs: &envs
   OMP_PROC_BIND: "false"
   OMP_NUM_THREADS: "1"
   PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"

 _server_cmd: &server_cmd
   - "--enable-expert-parallel"
   - "--tensor-parallel-size"

[NOTE(review): the hunk header (-9,7 +9,6) implies one line was deleted in this hunk, but the rendered side-by-side view only preserved the unchanged pairs above — the deleted line is not recoverable from this extraction; confirm against the raw diff.]
@@ -19,7 +18,7 @@ _server_cmd: &server_cmd
   - "--port"
   - "$SERVER_PORT"
   - "--max-model-len"
-  - "8192"
+  - "16384"
   - "--max-num-batched-tokens"
   - "4096"
   - "--trust-remote-code"
@@ -31,7 +30,7 @@ _server_cmd: &server_cmd
   - "ascend"
   - "--async-scheduling"
   - "--additional-config"
-  - '{"enable_npugraph_ex": true,"fuse_muls_add":true,"multistream_overlap_shared_expert":true}'
+  - '{"fuse_muls_add": true, "multistream_overlap_shared_expert": false, "ascend_compilation_config": {"enable_npugraph_ex": true}}'
   - "--speculative-config"
   - '{"num_speculative_tokens": 3, "method": "deepseek_mtp"}'

@@ -44,7 +43,7 @@ _benchmarks: &benchmarks
   dataset_path: vllm-ascend/gsm8k-lite
   request_conf: vllm_api_general_chat
   dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
-  max_out_len: 4096
+  max_out_len: 8192
   batch_size: 8
   baseline: 95
   threshold: 5
Reference in New Issue
Block a user