[CI] Optimize nightly CI (#3898)

### What this PR does / why we need it?
This patch mainly fixes the problem of not being able to determine the
exit status of the pod's entrypoint script, and makes some other small
optimizations:
1. Shorten the wait-for-server timeout.
2. Fix a typo.
3. Fix the issue of ais_bench failing to correctly access the proxy URL
in a PD separation scenario.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.11.0
- vLLM main:
83f478bb19

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
Li Wang
2025-10-30 23:42:20 +08:00
committed by GitHub
parent 2c291bc63f
commit eb0a2ee2d0
14 changed files with 94 additions and 66 deletions

View File

@@ -42,6 +42,7 @@ concurrency:
jobs:
single-node-tests:
name: single-node
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
strategy:
fail-fast: false
@@ -85,6 +86,7 @@ jobs:
tests: ${{ matrix.test_config.tests }}
multi-node-tests:
name: multi-node
needs: single-node-tests
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
strategy:
@@ -93,19 +95,19 @@ jobs:
matrix:
test_config:
- name: multi-node-deepseek-pd
config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-V3.yaml
config_file_path: DeepSeek-V3.yaml
size: 2
- name: multi-node-qwen3-dp
config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-A3B.yaml
config_file_path: Qwen3-235B-A3B.yaml
size: 2
- name: multi-node-dpsk-4node-pd
config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8.yaml
config_file_path: DeepSeek-R1-W8A8.yaml
size: 4
- name: multi-node-qwenw8a8-2node
config_file_path: tests/e2e/nightly/multi_node/config/models/Qwen3-235B-W8A8.yaml
config_file_path: Qwen3-235B-W8A8.yaml
size: 2
- name: multi-node-glm-2node
config_file_path: tests/e2e/nightly/multi_node/config/models/GLM-4_5.yaml
config_file_path: GLM-4_5.yaml
size: 2
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
@@ -117,12 +119,3 @@ jobs:
config_file_path: ${{ matrix.test_config.config_file_path }}
secrets:
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
clear_resources:
needs: multi-node-tests
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
uses: ./.github/workflows/_kill_lws_resources.yaml
with:
runner: linux-aarch64-a3-0
secrets:
KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}