Fix nightly (#5413)
### What this PR does / why we need it?
This patch mainly does the following things:
1. Fix the multi_node_tests log upload: log names must be unique when
uploading logs.
2. Optimize the `get_cluster_ips` logic by increasing the maximum number of
retries for robustness.
3. Temporarily abandon the existing gh-proxy until it is stable
enough.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: release/v0.13.0
- vLLM main:
81786c8774
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -288,7 +288,7 @@ jobs:
|
|||||||
if: always()
|
if: always()
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: pod-logs
|
name: ${{ inputs.config_file_path }}-pod-logs
|
||||||
path: /tmp/vllm*_logs.txt
|
path: /tmp/vllm*_logs.txt
|
||||||
retention-days: 7
|
retention-days: 7
|
||||||
|
|
||||||
|
|||||||
4
.github/workflows/nightly_test_a3.yaml
vendored
4
.github/workflows/nightly_test_a3.yaml
vendored
@@ -62,10 +62,6 @@ jobs:
|
|||||||
- name: multi-node-qwenw8a8-2node
|
- name: multi-node-qwenw8a8-2node
|
||||||
config_file_path: Qwen3-235B-W8A8.yaml
|
config_file_path: Qwen3-235B-W8A8.yaml
|
||||||
size: 2
|
size: 2
|
||||||
# TODO: Replace deepseek3.2-exp with deepseek3.2 after nightly tests pass
|
|
||||||
# - name: multi-node-dpsk3.2-exp-2node
|
|
||||||
# config_file_path: DeepSeek-V3_2-Exp-bf16.yaml
|
|
||||||
# size: 2
|
|
||||||
- name: multi-node-deepseek-r1-w8a8-eplb
|
- name: multi-node-deepseek-r1-w8a8-eplb
|
||||||
config_file_path: DeepSeek-R1-W8A8-EPLB.yaml
|
config_file_path: DeepSeek-R1-W8A8-EPLB.yaml
|
||||||
size: 4
|
size: 4
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ def temp_env(env_dict):
|
|||||||
os.environ[k] = v
|
os.environ[k] = v
|
||||||
|
|
||||||
|
|
||||||
def dns_resolver(retries: int = 20, base_delay: float = 0.5):
|
def dns_resolver(retries: int = 240, base_delay: float = 0.5):
|
||||||
# We should resolve DNS with retries to avoid transient network issues.
|
# We should resolve DNS with retries to avoid transient network issues.
|
||||||
# When the pod is just started, DNS resolution may fail.
|
# When the pod is just started, DNS resolution may fail.
|
||||||
def resolve(dns: str):
|
def resolve(dns: str):
|
||||||
|
|||||||
@@ -13,7 +13,8 @@ LOG_DIR="/root/.cache/tests/logs"
|
|||||||
OVERWRITE_LOGS=true
|
OVERWRITE_LOGS=true
|
||||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
||||||
export BENCHMARK_HOME=${WORKSPACE}/vllm-ascend/benchmark
|
export BENCHMARK_HOME=${WORKSPACE}/vllm-ascend/benchmark
|
||||||
export VLLM_LOGGING_LEVEL="DEBUG"
|
export VLLM_LOGGING_LEVEL="INFO"
|
||||||
|
export TRANSFORMERS_OFFLINE="1"
|
||||||
|
|
||||||
# Function to print section headers
|
# Function to print section headers
|
||||||
print_section() {
|
print_section() {
|
||||||
@@ -88,7 +89,9 @@ check_npu_info() {
|
|||||||
|
|
||||||
check_and_config() {
|
check_and_config() {
|
||||||
echo "====> Configure mirrors and git proxy"
|
echo "====> Configure mirrors and git proxy"
|
||||||
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/"
|
# Fix me(Potabk): Currently, there have some issues with accessing GitHub via https://gh-proxy.test.osinfra.cn in certain regions.
|
||||||
|
# We should switch to a more stable proxy for now until the network proxy is stable enough.
|
||||||
|
git config --global url."https://ghfast.top/https://github.com/".insteadOf "https://github.com/"
|
||||||
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||||
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ async def test_multi_node() -> None:
|
|||||||
proxy_port=proxy_port,
|
proxy_port=proxy_port,
|
||||||
disaggregated_prefill=disaggregated_prefill,
|
disaggregated_prefill=disaggregated_prefill,
|
||||||
nodes_info=nodes_info,
|
nodes_info=nodes_info,
|
||||||
max_wait_seconds=1200,
|
max_wait_seconds=2800,
|
||||||
) as remote_server:
|
) as remote_server:
|
||||||
if config.is_master:
|
if config.is_master:
|
||||||
port = proxy_port if disaggregated_prefill else server_port
|
port = proxy_port if disaggregated_prefill else server_port
|
||||||
|
|||||||
Reference in New Issue
Block a user