[CI] Optimize nightly CI (#3898)

### What this PR does / why we need it? This patch mainly fixes the problem of not being able to determine the exit status of the pod's entrypoint script, and includes some other small optimizations: 1. Shorten the wait-for-server timeout. 2. Fix typos. 3. Fix the issue of ais_bench failing to correctly access the proxy URL in a PD separation scenario. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0 - vLLM main: 83f478bb19 --------- Signed-off-by: wangli <wangli858794774@gmail.com>
2025-10-30 23:42:20 +08:00
parent 2c291bc63f
commit eb0a2ee2d0
14 changed files with 94 additions and 66 deletions
--- a/tools/aisbench.py
+++ b/tools/aisbench.py
@@ -68,6 +68,7 @@ class AisbenchRunner:
                 model: str,
                 port: int,
                 aisbench_config: dict,
+                 host_ip: str = "localhost",
                 verify=True):
        self.model = model
        self.dataset_path = maybe_download_from_modelscope(
@@ -76,6 +77,7 @@ class AisbenchRunner:
        assert self.dataset_path is not None and self.model_path is not None, \
            f"Failed to download dataset or model: dataset={self.dataset_path}, model={self.model_path}"
        self.port = port
+        self.host_ip = host_ip
        self.task_type = aisbench_config["case_type"]
        self.request_conf = aisbench_config["request_conf"]
        self.dataset_conf = aisbench_config.get("dataset_conf")
@@ -131,6 +133,7 @@ class AisbenchRunner:
            content = f.read()
        content = re.sub(r'model=.*', f'model="{self.model}",', content)
        content = re.sub(r'host_port.*', f'host_port = {self.port},', content)
+        content = re.sub(r'host_ip.*', f'host_ip = "{self.host_ip}",', content)
        content = re.sub(r'max_out_len.*',
                         f'max_out_len = {self.max_out_len},', content)
        content = re.sub(r'batch_size.*', f'batch_size = {self.batch_size},',
@@ -238,14 +241,21 @@ class AisbenchRunner:
        assert self.baseline - self.threshold <= acc_value <= self.baseline + self.threshold, f"Accuracy verification failed. The accuracy of {self.dataset_path} is {acc_value}, which is not within {self.threshold} relative to baseline {self.baseline}."


-def run_aisbench_cases(model, port, aisbench_cases, server_args=""):
+def run_aisbench_cases(model,
+                       port,
+                       aisbench_cases,
+                       server_args="",
+                       host_ip="localhost"):
    aisbench_results = []
    aisbench_errors = []
    for aisbench_case in aisbench_cases:
        if not aisbench_case:
            continue
        try:
-            with AisbenchRunner(model, port, aisbench_case) as aisbench:
+            with AisbenchRunner(model=model,
+                                port=port,
+                                host_ip=host_ip,
+                                aisbench_config=aisbench_case) as aisbench:
                aisbench_results.append(aisbench.result)
        except Exception as e:
            aisbench_results.append("")