[CI] Enable linux-aarch64-a2 (64GB) and tp2 * 2 max-parallel to speed up CI (#2065)

### What this PR does / why we need it? Currently our workflow run time takes about 3 hours in total, which seriously affects the developer experience, so it is urgent to have a optimization, after this pr, It is expected that the running time of the full CI can be shortened to 1h40min. - Enable linux-aarch64-a2 (64GB) to replace linux-arm64-npu (32GB) - Change TP4 ---> TP2 * 2 max-parallel - Move DeepSeek-V2-Lite-W8A8 to single card test ### Does this PR introduce _any_ user-facing change? No - vLLM version: v0.10.0 - vLLM main: a2480251ec --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2025-07-29 18:59:05 +08:00
parent ca8007f584
commit f60bb474f9
14 changed files with 75 additions and 75 deletions
--- a/benchmarks/scripts/run_accuracy.py
+++ b/benchmarks/scripts/run_accuracy.py
@@ -50,17 +50,17 @@ MODEL_TYPE = {
 # Command templates for running evaluations
 MODEL_RUN_INFO = {
    "Qwen/Qwen3-30B-A3B": (
-        "export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=4,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n"
+        "export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True'\n"
        "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
        "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
    ),
    "Qwen/Qwen3-8B-Base": (
-        "export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
+        "export MODEL_ARGS='pretrained={model},max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6'\n"
        "lm_eval --model vllm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
        "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
    ),
    "Qwen/Qwen2.5-VL-7B-Instruct": (
-        "export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2'\n"
+        "export MODEL_ARGS='pretrained={model},max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2'\n"
        "lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks {datasets} \ \n"
        "--apply_chat_template --fewshot_as_multiturn  --batch_size 1"
    ),
@@ -94,9 +94,9 @@ EXECUTION_MODE = {

 # Model arguments for evaluation
 MODEL_ARGS = {
-    "Qwen/Qwen3-8B-Base": "pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6",
-    "Qwen/Qwen2.5-VL-7B-Instruct": "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=2,max_images=2",
-    "Qwen/Qwen3-30B-A3B": "pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=4,gpu_memory_utilization=0.6,enable_expert_parallel=True",
+    "Qwen/Qwen3-8B-Base": "pretrained=Qwen/Qwen3-8B-Base,max_model_len=4096,dtype=auto,tensor_parallel_size=1,gpu_memory_utilization=0.6",
+    "Qwen/Qwen2.5-VL-7B-Instruct": "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_model_len=8192,dtype=auto,tensor_parallel_size=1,max_images=2",
+    "Qwen/Qwen3-30B-A3B": "pretrained=Qwen/Qwen3-30B-A3B,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6,enable_expert_parallel=True",
 }

 # Whether to apply chat template formatting