diff --git a/.github/workflows/nightly_test_a2.yaml b/.github/workflows/nightly_test_a2.yaml index bef88138..03aa1ed6 100644 --- a/.github/workflows/nightly_test_a2.yaml +++ b/.github/workflows/nightly_test_a2.yaml @@ -49,7 +49,7 @@ jobs: fail-fast: false matrix: test_config: - - name: qwen3next + - name: qwen3-next os: linux-aarch64-a2-4 tests: tests/e2e/nightly/single_node/models/test_qwen3_next.py - name: qwen3-32b diff --git a/tests/e2e/nightly/single_node/models/test_qwen3_next.py b/tests/e2e/nightly/single_node/models/test_qwen3_next.py index 5fd9d183..7d178727 100644 --- a/tests/e2e/nightly/single_node/models/test_qwen3_next.py +++ b/tests/e2e/nightly/single_node/models/test_qwen3_next.py @@ -16,7 +16,7 @@ MODELS = [ MODES = ["aclgraph"] TENSOR_PARALLELS = [4] -MAX_NUM_BATCHED_TOKENS = [1024, 4096, 8192, 32768] +MAX_NUM_BATCHED_TOKENS = [8192, 32768] prompts = [ "San Francisco is a", @@ -70,7 +70,6 @@ async def test_models(model: str, mode: str, tp_size: int, "HCCL_BUFFSIZE": "1024", "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True", } - compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"} server_args = [ "--tensor-parallel-size", str(tp_size), @@ -81,15 +80,14 @@ async def test_models(model: str, mode: str, tp_size: int, "--max-num-batched-tokens", str(max_num_batched_tokens), "--trust-remote-code", + "--async-scheduling", + "--no-enable-prefix-caching", + "--enable-expert-parallel", "--gpu-memory-utilization", "0.8", "--max-num-seqs", "64", ] - if mode == "aclgraph": - server_args.extend( - ["--compilation-config", - json.dumps(compilation_config)]) request_keyword_args: dict[str, Any] = { **api_keyword_args, }