xc-llm-ascend/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-DeepSeek-R1-0528-W8A8.yaml

# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  - name: "prefix-cache-deepseek-r1-0528-w8a8"
    model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
    envs:
      OMP_NUM_THREADS: "10"
      OMP_PROC_BIND: "false"
      HCCL_BUFFSIZE: "1024"
      PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
      SERVER_PORT: "DEFAULT_PORT"
    server_cmd:
      - "--quantization"
      - "ascend"
      - "--data-parallel-size"
      - "2"
      - "--tensor-parallel-size"
      - "8"
      - "--enable-expert-parallel"
      - "--port"
      - "$SERVER_PORT"
      - "--seed"
      - "1024"
      - "--max-model-len"
      - "5200"
      - "--max-num-batched-tokens"
      - "4096"
      - "--max-num-seqs"
      - "16"
      - "--trust-remote-code"
      - "--gpu-memory-utilization"
      - "0.9"
      - "--additional-config"
      - '{"enable_weight_nz_layout": true}'
      - "--speculative-config"
      - '{"num_speculative_tokens": 1, "method": "mtp"}'
    test_content:
      - "benchmark_comparisons"
    benchmark_comparisons_args:
      - metric: "TTFT"
        baseline: "prefix0"
        target: "prefix75"
        ratio: 0.4
        operator: "<"
    benchmarks:
      warm_up:
        case_type: performance
        dataset_path: vllm-ascend/GSM8K-in1024-bs210
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 210
        max_out_len: 1
        batch_size: 1000
        baseline: 0
        threshold: 0.97
      prefix0:
        case_type: performance
        dataset_path: vllm-ascend/prefix0-in3500-bs210
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 210
        max_out_len: 1
        batch_size: 18
        baseline: 1
        threshold: 0.97
      prefix75:
        case_type: performance
        dataset_path: vllm-ascend/prefix75-in3500-bs210
        request_conf: vllm_api_stream_chat
        dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
        num_prompts: 210
        max_out_len: 1
        batch_size: 18
        baseline: 1
        threshold: 0.97