---
# ==========================================
# ACTUAL TEST CASES
# ==========================================
test_cases:
  - name: "Qwen2.5-VL-32B-Instruct-a3"
    model: "Qwen/Qwen2.5-VL-32B-Instruct"
    # Environment variables for the server process; values are strings on purpose
    # (env vars are always text — do not unquote "1"/"0").
    envs:
      TASK_QUEUE_ENABLE: "1"
      VLLM_ASCEND_ENABLE_NZ: "0"
      HCCL_OP_EXPANSION_MODE: "AIV"
      SERVER_PORT: "DEFAULT_PORT"
    # Argument vector appended to the vLLM server launch command.
    # Flag and value are separate list items; "$SERVER_PORT" is expanded by
    # the harness (presumably from envs above — confirm against the runner).
    server_cmd:
      - "--no-enable-prefix-caching"
      - "--mm-processor-cache-gb"
      - "0"
      - "--tensor-parallel-size"
      - "4"
      - "--port"
      - "$SERVER_PORT"
      - "--max-model-len"
      - "30000"
      - "--max-num-batched-tokens"
      - "40000"
      - "--max-num-seqs"
      - "400"
      - "--trust-remote-code"
      - "--gpu-memory-utilization"
      - "0.8"
      - "--compilation_config"
      # JSON payload passed verbatim; single quotes keep the inner double
      # quotes literal.
      - '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
    test_content:
      - "completion"
      - "image"
    benchmarks:
      # Accuracy run: score is compared against `baseline` within `threshold`.
      acc:
        case_type: accuracy
        dataset_path: vllm-ascend/textvqa-lite
        request_conf: vllm_api_stream_chat
        dataset_conf: textvqa/textvqa_gen_base64
        max_out_len: 2048
        batch_size: 128
        baseline: 76.22
        temperature: 0
        top_k: -1
        top_p: 1
        repetition_penalty: 1
        threshold: 5
      # Performance run: relative throughput vs `baseline`, must stay >= threshold.
      perf:
        case_type: performance
        dataset_path: vllm-ascend/textvqa-perf-1080p
        request_conf: vllm_api_stream_chat
        dataset_conf: textvqa/textvqa_gen_base64
        num_prompts: 512
        max_out_len: 256
        batch_size: 128
        temperature: 0
        top_k: -1
        top_p: 1
        repetition_penalty: 1
        request_rate: 0
        baseline: 1
        threshold: 0.97