[CI] Fix nightly CI (#3821)
### What this PR does / why we need it? This patch fixes the nightly CI run [failure](https://github.com/vllm-project/vllm-ascend/actions/runs/18848144365) ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.1 --------- Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -88,12 +88,17 @@ jobs:
|
|||||||
- name: Install kubectl
|
- name: Install kubectl
|
||||||
run: |
|
run: |
|
||||||
# Install kubectl
|
# Install kubectl
|
||||||
|
arch=$(uname -m)
|
||||||
|
|
||||||
|
if echo "$arch" | grep -qiE "arm|aarch64"; then
|
||||||
|
echo "Detected ARM architecture: $arch"
|
||||||
|
KUBECTL="$KUBECTL"_arm
|
||||||
|
fi
|
||||||
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
|
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
|
||||||
|
|
||||||
# Verify kubectl installation
|
# Verify kubectl installation
|
||||||
kubectl version --client=true
|
kubectl version --client=true
|
||||||
|
|
||||||
# TODO: Add A2 tests
|
|
||||||
- name: Decode kubeconfig from secrets
|
- name: Decode kubeconfig from secrets
|
||||||
run: |
|
run: |
|
||||||
# Decode and save kubeconfig
|
# Decode and save kubeconfig
|
||||||
@@ -175,7 +180,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Determine is success
|
- name: Determine is success
|
||||||
run: |
|
run: |
|
||||||
TIMEOUT=600
|
TIMEOUT=300
|
||||||
ELAPSED=0
|
ELAPSED=0
|
||||||
while [ ! -f "$RESULT_FILE" ]; do
|
while [ ! -f "$RESULT_FILE" ]; do
|
||||||
sleep 5
|
sleep 5
|
||||||
|
|||||||
@@ -116,7 +116,7 @@ class RemoteOpenAIServer:
|
|||||||
model: str,
|
model: str,
|
||||||
vllm_serve_args: Union[list[str], str],
|
vllm_serve_args: Union[list[str], str],
|
||||||
*,
|
*,
|
||||||
server_host: str = "0.0.0.0",
|
server_host: str = '0.0.0.0',
|
||||||
server_port: int = 8080,
|
server_port: int = 8080,
|
||||||
env_dict: Optional[dict[str, str]] = None,
|
env_dict: Optional[dict[str, str]] = None,
|
||||||
seed: Optional[int] = None,
|
seed: Optional[int] = None,
|
||||||
|
|||||||
@@ -84,16 +84,17 @@ class MultiNodeConfig:
|
|||||||
self.envs["LOCAL_IP"] = self.cur_ip
|
self.envs["LOCAL_IP"] = self.cur_ip
|
||||||
self.envs["NIC_NAME"] = self.nic_name
|
self.envs["NIC_NAME"] = self.nic_name
|
||||||
|
|
||||||
|
master_ip = self.cluster_ips[0]
|
||||||
if self.disaggregated_prefill:
|
if self.disaggregated_prefill:
|
||||||
self.envs[
|
self.envs[
|
||||||
"DISAGGREGATED_PREFILL_RANK_TABLE_PATH"] = self.disaggregated_prefill.get(
|
"DISAGGREGATED_PREFILL_RANK_TABLE_PATH"] = self.disaggregated_prefill.get(
|
||||||
"ranktable_path")
|
"ranktable_path")
|
||||||
if self.cur_index < self.decode_start_index:
|
if self.cur_index < self.decode_start_index:
|
||||||
self.envs["MASTER_IP"] = self.cluster_ips[0]
|
master_ip = self.cluster_ips[0]
|
||||||
else:
|
else:
|
||||||
self.envs["MASTER_IP"] = self.cluster_ips[
|
master_ip = self.cluster_ips[self.decode_start_index]
|
||||||
self.decode_start_index]
|
|
||||||
|
|
||||||
|
self.envs["MASTER_IP"] = master_ip
|
||||||
ascend_path = "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages"
|
ascend_path = "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages"
|
||||||
self.envs[
|
self.envs[
|
||||||
"LD_LIBRARY_PATH"] = f"{ascend_path}:{self.envs.get('LD_LIBRARY_PATH', os.environ.get('LD_LIBRARY_PATH', ''))}"
|
"LD_LIBRARY_PATH"] = f"{ascend_path}:{self.envs.get('LD_LIBRARY_PATH', os.environ.get('LD_LIBRARY_PATH', ''))}"
|
||||||
@@ -288,8 +289,3 @@ class MultiNodeConfig:
|
|||||||
subprocess.run(cmd, env=env, check=True)
|
subprocess.run(cmd, env=env, check=True)
|
||||||
assert os.path.exists(
|
assert os.path.exists(
|
||||||
str(ranktable_path)), "failed generate ranktable.json"
|
str(ranktable_path)), "failed generate ranktable.json"
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
config = MultiNodeConfig.from_yaml()
|
|
||||||
print(config.perf_cmd)
|
|
||||||
|
|||||||
@@ -121,7 +121,7 @@ download_go() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
install_ais_bench() {
|
install_ais_bench() {
|
||||||
local AIS_BENCH="$SRC_DIR/benchmark"
|
local AIS_BENCH="$SRC_DIR/vllm-ascend/benchmark"
|
||||||
git clone https://gitee.com/aisbench/benchmark.git $AIS_BENCH
|
git clone https://gitee.com/aisbench/benchmark.git $AIS_BENCH
|
||||||
cd $AIS_BENCH
|
cd $AIS_BENCH
|
||||||
git checkout v3.0-20250930-master
|
git checkout v3.0-20250930-master
|
||||||
@@ -166,8 +166,8 @@ run_tests() {
|
|||||||
kill_npu_processes
|
kill_npu_processes
|
||||||
ret=$?
|
ret=$?
|
||||||
if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
|
if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
|
||||||
mkdir -p "$(dirname "$RESULT_PATH")"
|
mkdir -p "$(dirname "$RESULT_FILE_PATH")"
|
||||||
echo $ret > "$RESULT_PATH"
|
echo $ret > "$RESULT_FILE_PATH"
|
||||||
fi
|
fi
|
||||||
return $ret
|
return $ret
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ def get_local_model_path_with_retry(
|
|||||||
async def get_completions(url: str, model: str, prompts: Union[str, List[str]],
|
async def get_completions(url: str, model: str, prompts: Union[str, List[str]],
|
||||||
**api_kwargs: Any) -> List[str]:
|
**api_kwargs: Any) -> List[str]:
|
||||||
"""
|
"""
|
||||||
Asynchronously send HTTP requests to a /v1/completions endpoint.
|
Asynchronously send HTTP requests to endpoint.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
url: Full endpoint URL, e.g. "http://localhost:1025/v1/completions"
|
url: Full endpoint URL, e.g. "http://localhost:1025/v1/completions"
|
||||||
@@ -88,7 +88,10 @@ async def get_completions(url: str, model: str, prompts: Union[str, List[str]],
|
|||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_multi_node() -> None:
|
async def test_multi_node() -> None:
|
||||||
config = MultiNodeConfig.from_yaml()
|
config = MultiNodeConfig.from_yaml()
|
||||||
|
# To avoid modelscope 400 HttpError, we should download the model with retry
|
||||||
local_model_path = get_local_model_path_with_retry(config.model)
|
local_model_path = get_local_model_path_with_retry(config.model)
|
||||||
|
config.server_cmd = config.server_cmd.replace(config.model,
|
||||||
|
local_model_path)
|
||||||
assert local_model_path is not None, "can not find any local weight for test"
|
assert local_model_path is not None, "can not find any local weight for test"
|
||||||
env_dict = config.envs
|
env_dict = config.envs
|
||||||
perf_cmd = config.perf_cmd
|
perf_cmd = config.perf_cmd
|
||||||
@@ -113,11 +116,6 @@ async def test_multi_node() -> None:
|
|||||||
) as remote_server:
|
) as remote_server:
|
||||||
if config.is_master:
|
if config.is_master:
|
||||||
port = proxy_port if disaggregated_prefill else server_port
|
port = proxy_port if disaggregated_prefill else server_port
|
||||||
base_url = f"http://localhost:{port}/v1/completions"
|
|
||||||
_ = await get_completions(url=base_url,
|
|
||||||
model=local_model_path,
|
|
||||||
prompts=prompts,
|
|
||||||
api_kwargs=api_keyword_args)
|
|
||||||
# aisbench test
|
# aisbench test
|
||||||
if acc_cmd:
|
if acc_cmd:
|
||||||
run_aisbench_cases(local_model_path, port, acc_cmd)
|
run_aisbench_cases(local_model_path, port, acc_cmd)
|
||||||
|
|||||||
@@ -14,11 +14,16 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
# This file is a part of the vllm-ascend project.
|
# This file is a part of the vllm-ascend project.
|
||||||
#
|
#
|
||||||
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import filelock
|
||||||
|
import huggingface_hub
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from modelscope import snapshot_download # type: ignore
|
from modelscope import snapshot_download # type: ignore
|
||||||
|
|
||||||
@@ -63,10 +68,12 @@ class AisbenchRunner:
|
|||||||
port: int,
|
port: int,
|
||||||
aisbench_config: dict,
|
aisbench_config: dict,
|
||||||
verify=True):
|
verify=True):
|
||||||
self.dataset_path = snapshot_download(aisbench_config["dataset_path"],
|
|
||||||
repo_type='dataset')
|
|
||||||
self.model = model
|
self.model = model
|
||||||
self.model_path = snapshot_download(model)
|
self.dataset_path = maybe_download_from_modelscope(
|
||||||
|
aisbench_config["dataset_path"], repo_type="dataset")
|
||||||
|
self.model_path = maybe_download_from_modelscope(model)
|
||||||
|
assert self.dataset_path is not None and self.model_path is not None, \
|
||||||
|
f"Failed to download dataset or model: dataset={self.dataset_path}, model={self.model_path}"
|
||||||
self.port = port
|
self.port = port
|
||||||
self.task_type = aisbench_config["case_type"]
|
self.task_type = aisbench_config["case_type"]
|
||||||
self.request_conf = aisbench_config["request_conf"]
|
self.request_conf = aisbench_config["request_conf"]
|
||||||
@@ -254,3 +261,52 @@ def run_aisbench_cases(model, port, aisbench_cases):
|
|||||||
def get_TTFT(result):
|
def get_TTFT(result):
|
||||||
TTFT = result[0][0].loc["TTFT", "Average"][:-3]
|
TTFT = result[0][0].loc["TTFT", "Average"][:-3]
|
||||||
return float(TTFT)
|
return float(TTFT)
|
||||||
|
|
||||||
|
|
||||||
|
temp_dir = tempfile.gettempdir()
|
||||||
|
|
||||||
|
|
||||||
|
def get_lock(model_name_or_path: str | Path, cache_dir: str | None = None):
    """Build a cross-process file lock keyed on a model or dataset name.

    Args:
        model_name_or_path: Model/dataset identifier or local path. It is
            only used to derive the lock file name.
        cache_dir: Directory that holds the lock file. When empty or
            ``None`` the module-level ``temp_dir`` is used instead.

    Returns:
        A ``filelock.FileLock`` for the derived lock file; the caller is
        expected to acquire it, e.g. with a ``with`` statement.
    """
    lock_dir = cache_dir or temp_dir
    # Create the lock directory itself, not just its parent: the lock file
    # lives directly inside ``lock_dir``. Taking ``os.path.dirname`` first
    # would also yield "" for a bare relative name such as "cache", and
    # ``os.makedirs("")`` raises FileNotFoundError.
    os.makedirs(lock_dir, exist_ok=True)
    model_name = str(model_name_or_path).replace("/", "-")
    hash_name = hashlib.sha256(model_name.encode()).hexdigest()
    # add hash to avoid conflict with old users' lock files
    lock_file_name = hash_name + model_name + ".lock"
    # mode 0o666 is required for the filelock to be shared across users
    return filelock.FileLock(os.path.join(lock_dir, lock_file_name),
                             mode=0o666)
|
||||||
|
|
||||||
|
|
||||||
|
def maybe_download_from_modelscope(
    model: str,
    repo_type: str | None = None,
    revision: str | None = None,
    download_dir: str | None = None,
    ignore_patterns: str | list[str] | None = None,
    allow_patterns: list[str] | str | None = None,
) -> str | None:
    """
    Resolve a model/dataset to a local path, downloading from ModelScope
    when ``model`` is not an existing local path.

    Args:
        model: Existing local path, or the id passed to
            ``snapshot_download`` as ``model_id``.
        repo_type: Forwarded to ``snapshot_download`` as ``repo_type``.
        revision: Forwarded to ``snapshot_download`` as ``revision``.
        download_dir: Forwarded as ``cache_dir``; also the directory in
            which the download lock file is placed.
        ignore_patterns: Forwarded as ``ignore_file_pattern``.
        allow_patterns: Forwarded as ``allow_patterns``.

    Returns:
        ``model`` unchanged when it already exists on disk, otherwise the
        value returned by ``snapshot_download``. The ``| None`` in the
        annotation is kept for caller compatibility; this function has no
        code path of its own that returns ``None``.
    """
    # Use file lock to prevent multiple processes from
    # downloading the same model weights at the same time.
    with get_lock(model, download_dir):
        if os.path.exists(model):
            # Already a local path: nothing to download.
            return model
        return snapshot_download(
            model_id=model,
            repo_type=repo_type,
            cache_dir=download_dir,
            # Honour the Hugging Face offline switch for ModelScope too.
            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
            revision=revision,
            ignore_file_pattern=ignore_patterns,
            allow_patterns=allow_patterns,
        )
|
||||||
|
|||||||
Reference in New Issue
Block a user