# xc-llm-ascend/tests/e2e/nightly/single_node/models/configs/MiniMax-M2.5-A3.yaml

# ==========================================
# ACTUAL TEST CASES
# ==========================================

test_cases:
  # One chat-completion case against MiniMax-M2.5, served with
  # tensor-parallel size 16 and the minimax_m2 tool/reasoning parsers.
  - name: 'MiniMax-M2.5-TP16-Reasoning-Tool'
    model: 'MiniMax/MiniMax-M2.5'

    # Environment variables for this case. Every value is quoted so it is
    # loaded as a string rather than a YAML bool or int.
    envs:
      HCCL_BUFFSIZE: '1024'
      OMP_PROC_BIND: 'false'
      HCCL_OP_EXPANSION_MODE: 'AIV'
      PYTORCH_NPU_ALLOC_CONF: 'expandable_segments:True'
      VLLM_ASCEND_ENABLE_FLASHCOMM1: '1'
      # NOTE(review): DEFAULT_PORT looks like a placeholder that the test
      # harness substitutes with a real port - confirm against the runner.
      SERVER_PORT: 'DEFAULT_PORT'

    # Prompt(s) sent for each entry listed under test_content.
    prompts:
      - 'Hello. Please introduce yourself briefly.'

    # Keyword arguments for the API request; these stay numeric (unquoted).
    api_keyword_args:
      max_tokens: 128
      temperature: 0

    test_content:
      - chat_completion

    # Server command-line arguments, one token per list item. Order matters:
    # each flag is immediately followed by its value, where it takes one.
    server_cmd:
      - '--tensor-parallel-size'
      - '16'
      # Refers to the SERVER_PORT entry under envs; presumably expanded by
      # the harness or shell before launch - confirm.
      - '--port'
      - '$SERVER_PORT'
      - '--trust-remote-code'
      - '--dtype'
      - 'bfloat16'
      - '--enable-expert-parallel'
      - '--max-num-seqs'
      - '32'
      - '--max-num-batched-tokens'
      - '32768'
      # Context length is capped at 32768 for nightly stability. To run with
      # the full context, drop this flag and its value so the model config
      # (196608) applies instead.
      - '--max-model-len'
      - '32768'
      # The compilation config is passed as a single JSON string argument.
      - '--compilation-config'
      - '{"cudagraph_mode":"FULL_DECODE_ONLY"}'
      - '--enable-auto-tool-choice'
      - '--tool-call-parser'
      - 'minimax_m2'
      - '--reasoning-parser'
      - 'minimax_m2_append_think'