Fix nightly (#5413)

### What this PR does / why we need it? This patch mainly does the following things: 1. Bugfix for multi_node_tests logs: log names must be unique when uploading logs. 2. Optimize the `get_cluster_ips` logic and increase the maximum number of retries for robustness. 3. Abandon the existing gh-proxy temporarily until it is stable enough. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: release/v0.13.0 - vLLM main: 81786c8774 --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2025-12-27 18:16:46 +08:00
parent e91e11d3b0
commit 1d81bfaed1
5 changed files with 8 additions and 9 deletions
--- a/.github/workflows/_e2e_nightly_multi_node.yaml
+++ b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -288,7 +288,7 @@ jobs:
          if: always()
          uses: actions/upload-artifact@v4
          with:
-            name: pod-logs
+            name: ${{ inputs.config_file_path }}-pod-logs
            path: /tmp/vllm*_logs.txt
            retention-days: 7

--- a/.github/workflows/nightly_test_a3.yaml
+++ b/.github/workflows/nightly_test_a3.yaml
@@ -62,10 +62,6 @@ jobs:
          - name: multi-node-qwenw8a8-2node
            config_file_path: Qwen3-235B-W8A8.yaml
            size: 2
-          # TODO: Replace deepseek3.2-exp with deepseek3.2 after nightly tests pass
-          # - name: multi-node-dpsk3.2-exp-2node
-          #   config_file_path: DeepSeek-V3_2-Exp-bf16.yaml
-          #   size: 2
          - name: multi-node-deepseek-r1-w8a8-eplb
            config_file_path: DeepSeek-R1-W8A8-EPLB.yaml
            size: 4
--- a/tests/e2e/nightly/multi_node/config/utils.py
+++ b/tests/e2e/nightly/multi_node/config/utils.py
@@ -24,7 +24,7 @@ def temp_env(env_dict):
                os.environ[k] = v


-def dns_resolver(retries: int = 20, base_delay: float = 0.5):
+def dns_resolver(retries: int = 240, base_delay: float = 0.5):
    # We should resolve DNS with retries to avoid transient network issues.
    # When the pod is just started, DNS resolution may fail.
    def resolve(dns: str):
--- a/tests/e2e/nightly/multi_node/scripts/run.sh
+++ b/tests/e2e/nightly/multi_node/scripts/run.sh
@@ -13,7 +13,8 @@ LOG_DIR="/root/.cache/tests/logs"
 OVERWRITE_LOGS=true
 export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
 export BENCHMARK_HOME=${WORKSPACE}/vllm-ascend/benchmark
-export VLLM_LOGGING_LEVEL="DEBUG"
+export VLLM_LOGGING_LEVEL="INFO"
+export TRANSFORMERS_OFFLINE="1"

 # Function to print section headers
 print_section() {
@@ -88,7 +89,9 @@ check_npu_info() {

 check_and_config() {
    echo "====> Configure mirrors and git proxy"
-    git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/"
+    # FIXME(Potabk): Currently, there are some issues accessing GitHub via https://gh-proxy.test.osinfra.cn in certain regions.
+    # Use a more stable proxy for now, until that network proxy is stable enough.
+    git config --global url."https://ghfast.top/https://github.com/".insteadOf "https://github.com/"
    pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
 }
--- a/tests/e2e/nightly/multi_node/test_multi_node.py
+++ b/tests/e2e/nightly/multi_node/test_multi_node.py
@@ -114,7 +114,7 @@ async def test_multi_node() -> None:
                proxy_port=proxy_port,
                disaggregated_prefill=disaggregated_prefill,
                nodes_info=nodes_info,
-                max_wait_seconds=1200,
+                max_wait_seconds=2800,
        ) as remote_server:
            if config.is_master:
                port = proxy_port if disaggregated_prefill else server_port