diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml index 93ad51fc..3c508e9a 100644 --- a/.github/workflows/_e2e_nightly_multi_node.yaml +++ b/.github/workflows/_e2e_nightly_multi_node.yaml @@ -288,7 +288,7 @@ jobs: if: always() uses: actions/upload-artifact@v4 with: - name: pod-logs + name: ${{ inputs.config_file_path }}-pod-logs path: /tmp/vllm*_logs.txt retention-days: 7 diff --git a/.github/workflows/nightly_test_a3.yaml b/.github/workflows/nightly_test_a3.yaml index 3038b541..2439e02a 100644 --- a/.github/workflows/nightly_test_a3.yaml +++ b/.github/workflows/nightly_test_a3.yaml @@ -62,10 +62,6 @@ jobs: - name: multi-node-qwenw8a8-2node config_file_path: Qwen3-235B-W8A8.yaml size: 2 - # TODO: Replace deepseek3.2-exp with deepseek3.2 after nightly tests pass - # - name: multi-node-dpsk3.2-exp-2node - # config_file_path: DeepSeek-V3_2-Exp-bf16.yaml - # size: 2 - name: multi-node-deepseek-r1-w8a8-eplb config_file_path: DeepSeek-R1-W8A8-EPLB.yaml size: 4 diff --git a/tests/e2e/nightly/multi_node/config/utils.py b/tests/e2e/nightly/multi_node/config/utils.py index 95fcad5b..0551ef5f 100644 --- a/tests/e2e/nightly/multi_node/config/utils.py +++ b/tests/e2e/nightly/multi_node/config/utils.py @@ -24,7 +24,7 @@ def temp_env(env_dict): os.environ[k] = v -def dns_resolver(retries: int = 20, base_delay: float = 0.5): +def dns_resolver(retries: int = 240, base_delay: float = 0.5): # We should resolve DNS with retries to avoid transient network issues. # When the pod is just started, DNS resolution may fail. 
def resolve(dns: str): diff --git a/tests/e2e/nightly/multi_node/scripts/run.sh b/tests/e2e/nightly/multi_node/scripts/run.sh index 8ef48bf4..1df6d5a7 100644 --- a/tests/e2e/nightly/multi_node/scripts/run.sh +++ b/tests/e2e/nightly/multi_node/scripts/run.sh @@ -13,7 +13,8 @@ LOG_DIR="/root/.cache/tests/logs" OVERWRITE_LOGS=true export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH export BENCHMARK_HOME=${WORKSPACE}/vllm-ascend/benchmark -export VLLM_LOGGING_LEVEL="DEBUG" +export VLLM_LOGGING_LEVEL="INFO" +export TRANSFORMERS_OFFLINE="1" # Function to print section headers print_section() { @@ -88,7 +89,9 @@ check_npu_info() { check_and_config() { echo "====> Configure mirrors and git proxy" - git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" + # Fix me(Potabk): Currently, there are some issues with accessing GitHub via https://gh-proxy.test.osinfra.cn in certain regions. + # We switch to a more stable proxy for now, until the original proxy is stable enough. + git config --global url."https://ghfast.top/https://github.com/".insteadOf "https://github.com/" pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi } diff --git a/tests/e2e/nightly/multi_node/test_multi_node.py b/tests/e2e/nightly/multi_node/test_multi_node.py index 054ea31e..57727364 100644 --- a/tests/e2e/nightly/multi_node/test_multi_node.py +++ b/tests/e2e/nightly/multi_node/test_multi_node.py @@ -114,7 +114,7 @@ async def test_multi_node() -> None: proxy_port=proxy_port, disaggregated_prefill=disaggregated_prefill, nodes_info=nodes_info, - max_wait_seconds=1200, + max_wait_seconds=2800, ) as remote_server: if config.is_master: port = proxy_port if disaggregated_prefill else server_port