[CI] Fix nightly CI (#3821)

### What this PR does / why we need it? This patch fixes the nightly CI run [failure](https://github.com/vllm-project/vllm-ascend/actions/runs/18848144365) ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.1 --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2025-10-28 20:40:03 +08:00
parent a7450db1bd
commit 90ae114569
6 changed files with 79 additions and 24 deletions
--- a/tools/aisbench.py
+++ b/tools/aisbench.py
@@ -14,11 +14,16 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 #
+import hashlib
 import json
 import os
 import re
 import subprocess
+import tempfile
+from pathlib import Path

+import filelock
+import huggingface_hub
 import pandas as pd
 from modelscope import snapshot_download  # type: ignore

@@ -63,10 +68,12 @@ class AisbenchRunner:
                 port: int,
                 aisbench_config: dict,
                 verify=True):
-        self.dataset_path = snapshot_download(aisbench_config["dataset_path"],
-                                              repo_type='dataset')
        self.model = model
-        self.model_path = snapshot_download(model)
+        self.dataset_path = maybe_download_from_modelscope(
+            aisbench_config["dataset_path"], repo_type="dataset")
+        self.model_path = maybe_download_from_modelscope(model)
+        assert self.dataset_path is not None and self.model_path is not None, \
+            f"Failed to download dataset or model: dataset={self.dataset_path}, model={self.model_path}"
        self.port = port
        self.task_type = aisbench_config["case_type"]
        self.request_conf = aisbench_config["request_conf"]
@@ -254,3 +261,52 @@ def run_aisbench_cases(model, port, aisbench_cases):
 def get_TTFT(result):
    """Return the average TTFT value of an aisbench result as a float.

    Reads the cell at row ``"TTFT"``, column ``"Average"`` of
    ``result[0][0]`` (accessed through ``.loc``), drops its last three
    characters and converts the remainder to ``float``.

    NOTE(review): the three dropped characters are presumably a unit
    suffix (e.g. " ms") and TTFT presumably means time-to-first-token;
    confirm against the aisbench output format.
    """
    TTFT = result[0][0].loc["TTFT", "Average"][:-3]
    return float(TTFT)
+
+
+# Default directory for lock files when the caller supplies no cache_dir.
+temp_dir = tempfile.gettempdir()
+
+
+def get_lock(model_name_or_path: str | Path, cache_dir: str | None = None):
+    """Build a file lock keyed on a model (or dataset) name or path.
+
+    The lock file lives in ``cache_dir`` when given, otherwise in the
+    system temporary directory. Its name is the SHA-256 hex digest of the
+    name (with "/" replaced by "-") followed by that name and ".lock".
+
+    Args:
+        model_name_or_path: Model id or path; converted with ``str()``.
+        cache_dir: Directory to hold the lock file. Created if missing.
+
+    Returns:
+        A ``filelock.FileLock`` that has not been acquired yet; use it as
+        a context manager to hold the lock.
+    """
+    lock_dir = cache_dir or temp_dir
+    model_name_or_path = str(model_name_or_path)
+    # Create the lock directory itself. Creating only its parent (via
+    # os.path.dirname) leaves lock_dir missing, and raises for a relative
+    # single-component cache_dir such as "cache", whose dirname is "".
+    os.makedirs(lock_dir, exist_ok=True)
+    model_name = model_name_or_path.replace("/", "-")
+    hash_name = hashlib.sha256(model_name.encode()).hexdigest()
+    # add hash to avoid conflict with old users' lock files
+    lock_file_name = hash_name + model_name + ".lock"
+    # mode 0o666 is required for the filelock to be shared across users
+    lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name),
+                             mode=0o666)
+    return lock
+
+
+def maybe_download_from_modelscope(
+    model: str,
+    repo_type: str | None = None,
+    revision: str | None = None,
+    download_dir: str | None = None,
+    ignore_patterns: str | list[str] | None = None,
+    allow_patterns: list[str] | str | None = None,
+) -> str | None:
+    """
+    Resolve a model/dataset to a local path, downloading it if necessary.
+
+    If ``model`` is an existing filesystem path it is returned unchanged.
+    Otherwise it is treated as a hub id and fetched with
+    ``snapshot_download`` while holding a per-model file lock.
+
+    Args:
+        model: Local path, or the id passed to ``snapshot_download`` as
+            ``model_id``.
+        repo_type: Forwarded to ``snapshot_download`` (callers in this
+            file pass "dataset" for datasets).
+        revision: Forwarded to ``snapshot_download``.
+        download_dir: Used as ``cache_dir`` for the download and as the
+            directory that holds the lock file.
+        ignore_patterns: Forwarded as ``ignore_file_pattern``.
+        allow_patterns: Forwarded as ``allow_patterns``.
+
+    Returns:
+        ``model`` itself when it already exists locally, otherwise the
+        value returned by ``snapshot_download``.
+    """
+    # An existing local path needs no download, so take no lock (and
+    # create no lock file) for it.
+    if os.path.exists(model):
+        return model
+    # Use file lock to prevent multiple processes from
+    # downloading the same model weights at the same time.
+    with get_lock(model, download_dir):
+        model_path = snapshot_download(
+            model_id=model,
+            repo_type=repo_type,
+            cache_dir=download_dir,
+            # Offline mode follows huggingface_hub's HF_HUB_OFFLINE flag.
+            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+            revision=revision,
+            ignore_file_pattern=ignore_patterns,
+            allow_patterns=allow_patterns,
+        )
+    return model_path