---
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
  - name: "Qwen2.5-VL-32B-Instruct-a3"
    model: "Qwen/Qwen2.5-VL-32B-Instruct"
    # Environment variables for the server process; values are strings on purpose
    # (env vars are always text — do not unquote "1"/"0").
    envs:
      TASK_QUEUE_ENABLE: "1"
      VLLM_ASCEND_ENABLE_NZ: "0"
      HCCL_OP_EXPANSION_MODE: "AIV"
      SERVER_PORT: "DEFAULT_PORT"
    # Argument vector appended to the vLLM server launch command.
    # Flag and value are separate list items; "$SERVER_PORT" is expanded by
    # the harness (presumably from envs above — confirm against the runner).
    server_cmd:
      - "--no-enable-prefix-caching"
      - "--mm-processor-cache-gb"
      - "0"
      - "--tensor-parallel-size"
      - "4"
      - "--port"
      - "$SERVER_PORT"
      - "--max-model-len"
      - "30000"
      - "--max-num-batched-tokens"
      - "40000"
      - "--max-num-seqs"
      - "400"
      - "--trust-remote-code"
      - "--gpu-memory-utilization"
      - "0.8"
      - "--compilation_config"
      # JSON payload passed verbatim; single quotes keep the inner double
      # quotes literal.
      - '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
    test_content:
      - "completion"
      - "image"
    benchmarks:
      # Accuracy run: score is compared against `baseline` within `threshold`.
      acc:
        case_type: accuracy
        dataset_path: vllm-ascend/textvqa-lite
        request_conf: vllm_api_stream_chat
        dataset_conf: textvqa/textvqa_gen_base64
        max_out_len: 2048
        batch_size: 128
        baseline: 76.22
        temperature: 0
        top_k: -1
        top_p: 1
        repetition_penalty: 1
        threshold: 5
      # Performance run: relative throughput vs `baseline`, must stay >= threshold.
      perf:
        case_type: performance
        dataset_path: vllm-ascend/textvqa-perf-1080p
        request_conf: vllm_api_stream_chat
        dataset_conf: textvqa/textvqa_gen_base64
        num_prompts: 512
        max_out_len: 256
        batch_size: 128
        temperature: 0
        top_k: -1
        top_p: 1
        repetition_penalty: 1
        request_rate: 0
        baseline: 1
        threshold: 0.97