[CI] Migrate e2e test runner to hk (#5344)
### What this PR does / why we need it?
This patch adds new runner labels for the HK region, and e2e single-card
testing has been migrated to this runner.
- vLLM version: release/v0.13.0
- vLLM main:
bc0a5a0c08
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -400,9 +400,14 @@ def test_llama_qwen_eagle_acceptance(
|
||||
compilation_config=compilation_config,
|
||||
async_scheduling=async_scheduling,
|
||||
) as llm:
|
||||
_ = llm.generate(prompts, sampling_params)
|
||||
outputs = llm.model.generate(prompts, sampling_params)
|
||||
metrics = llm.model.get_metrics()
|
||||
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
output_tokens = output.outputs[0].token_ids
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
print(f"Output tokens: {output_tokens}")
|
||||
num_drafts = 0
|
||||
num_accepted_tokens_per_pos = [0] * num_speculative_tokens
|
||||
for metric in metrics:
|
||||
@@ -418,7 +423,10 @@ def test_llama_qwen_eagle_acceptance(
|
||||
num_accepted_tokens / num_drafts
|
||||
for num_accepted_tokens in num_accepted_tokens_per_pos
|
||||
]
|
||||
golden = BASELINES[method]
|
||||
if method == "eagle":
|
||||
golden = [0.7313432835820896, 0.373134328358209, 0.19402985074626866]
|
||||
else:
|
||||
golden = [0.68, 0.40, 0.18]
|
||||
|
||||
match = all(abs(a - b) < 0.08 for a, b in zip(acceptance_per_pos, golden))
|
||||
if not match:
|
||||
|
||||
@@ -48,8 +48,8 @@ CASE_QWEN_FULL_DECODE_ONLY = LLMTestCase(
|
||||
prompts=PROMPTS_LONG,
|
||||
golden_answers=[
|
||||
' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
|
||||
" \n\nTo solve this problem, we can use the following approach: Let $ABCD$ be a unit square with coordinates $A(0,0), B",
|
||||
' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can'
|
||||
" \n\nTo solve this problem, we can use the following approach: Let $P$ be the perimeter of the square. Then, the expected value of the area",
|
||||
' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations $x^2 +'
|
||||
])
|
||||
|
||||
CASE_DS_FULL_DECODE_ONLY = LLMTestCase(
|
||||
@@ -67,7 +67,7 @@ CASE_QWEN_EX = LLMTestCase(
|
||||
prompts=PROMPTS_LONG,
|
||||
golden_answers=[
|
||||
' \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the',
|
||||
" \n\nTo solve this problem, we can use the fact that the expected value of the area of a triangle formed by two random points on a square's perimeter is",
|
||||
" \n\nTo solve this problem, we can use the following approach: Let $P$ be the perimeter of the square. Then, the expected value of the area",
|
||||
' \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can'
|
||||
])
|
||||
|
||||
|
||||
@@ -3,8 +3,8 @@
|
||||
|
||||
import vllm
|
||||
import vllm.config
|
||||
from modelscope import snapshot_download # type: ignore
|
||||
from vllm.lora.request import LoRARequest
|
||||
from unittest.mock import patch
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from vllm_ascend.utils import enable_custom_op
|
||||
@@ -29,8 +29,8 @@ EXPECTED_LORA_OUTPUT = [
|
||||
"SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501
|
||||
"SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501
|
||||
]
|
||||
|
||||
MODEL_PATH = "vllm-ascend/Llama-3.2-3B-Instruct"
|
||||
# For hk region, we need to use the model from hf to avoid the network issue
|
||||
MODEL_PATH = "meta-llama/Llama-3.2-3B-Instruct"
|
||||
|
||||
|
||||
def do_sample(
|
||||
@@ -105,9 +105,10 @@ def generate_and_test(llm,
|
||||
print("removing lora")
|
||||
|
||||
|
||||
@patch.dict("os.environ", {"VLLM_USE_MODELSCOPE": "False"})
|
||||
def test_llama_lora(llama32_lora_files):
|
||||
vllm_model = VllmRunner(
|
||||
snapshot_download(MODEL_PATH),
|
||||
MODEL_PATH,
|
||||
enable_lora=True,
|
||||
# also test odd max_num_seqs
|
||||
max_num_seqs=7,
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from modelscope import snapshot_download # type: ignore
|
||||
from vllm import SamplingParams
|
||||
from vllm.lora.request import LoRARequest
|
||||
from unittest.mock import patch
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from vllm_ascend.utils import enable_custom_op
|
||||
|
||||
enable_custom_op()
|
||||
|
||||
MODEL_PATH = "vllm-ascend/Qwen3-0.6B"
|
||||
MODEL_PATH = "Qwen/Qwen3-0.6B"
|
||||
LORA_NAME_PATH_MAP = {
|
||||
"Alice": "vllm-ascend/self_cognition_Alice",
|
||||
"Bob": "vllm-ascend/self_cognition_Bob",
|
||||
"Cat": "vllm-ascend/self_cognition_Bob", # same as Bob
|
||||
"Alice": "charent/self_cognition_Alice",
|
||||
"Bob": "charent/self_cognition_Bob",
|
||||
"Cat": "charent/self_cognition_Bob", # same as Bob
|
||||
}
|
||||
|
||||
LORA_RANK = 8
|
||||
@@ -37,9 +37,8 @@ def format_chatml_messages(prompt: str):
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@patch.dict("os.environ", {"VLLM_USE_MODELSCOPE": "False"})
|
||||
def test_multi_loras_with_tp_sync():
|
||||
|
||||
lora_name_id_map = {}
|
||||
increase_lora_id = 0
|
||||
|
||||
@@ -51,11 +50,11 @@ def test_multi_loras_with_tp_sync():
|
||||
return LoRARequest(
|
||||
lora_name=name,
|
||||
lora_int_id=increase_lora_id,
|
||||
lora_path=snapshot_download(path),
|
||||
lora_path=path,
|
||||
)
|
||||
|
||||
vllm_model = VllmRunner(
|
||||
snapshot_download(MODEL_PATH),
|
||||
MODEL_PATH,
|
||||
enable_lora=True,
|
||||
# dtype="half",
|
||||
max_loras=2, # ensure max_loras < max_cpu_loras
|
||||
@@ -157,4 +156,4 @@ def test_multi_loras_with_tp_sync():
|
||||
print("After reload Alice:")
|
||||
|
||||
output_text = call_llm_get_outputs(prompt, "Alice")
|
||||
check_outputs(output_text, expected_output, prompt)
|
||||
check_outputs(output_text, expected_output, prompt)
|
||||
|
||||
Reference in New Issue
Block a user