[CI] Upgrade transformers version (#6307)

Upgrade transformers to >=4.56.4. vLLM version: v0.14.1; vLLM main: dc917cceb8. Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2026-01-28 14:06:39 +08:00
parent c498cea22d
commit f8e76a49fa
14 changed files with 30 additions and 42 deletions
--- a/tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py
+++ b/tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py
@@ -1,5 +1,4 @@
 import pytest
-from modelscope import snapshot_download  # type: ignore

 from tests.e2e.conftest import VllmRunner
 from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
@@ -9,7 +8,7 @@ from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
@pytest.mark.parametrize("distributed_executor_backend", ["mp"])
 def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files):
    with VllmRunner(
-            snapshot_download(MODEL_PATH),
+            MODEL_PATH,
            enable_lora=True,
            max_loras=4,
            dtype="half",
--- a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
@@ -24,7 +24,6 @@ import os
 from unittest.mock import patch

 import pytest
-from modelscope import snapshot_download  # type: ignore
 from vllm import SamplingParams

 from tests.e2e.conftest import VllmRunner
@@ -77,7 +76,7 @@ def test_qwen3_w4a8_dynamic_tp2(model):
    ]
    max_tokens = 5
    with VllmRunner(
-            snapshot_download(model),
+            model,
            max_model_len=8192,
            dtype="auto",
            tensor_parallel_size=2,
@@ -96,7 +95,7 @@ def test_qwen3_moe_sp_tp2() -> None:
                                     top_k=50,
                                     top_p=0.9)

-    with VllmRunner(snapshot_download("Qwen/Qwen3-30B-A3B"),
+    with VllmRunner("Qwen/Qwen3-30B-A3B",
                    dtype="auto",
                    tensor_parallel_size=2,
                    distributed_executor_backend="mp",
@@ -119,7 +118,7 @@ def test_deepseek_w4a8_accuracy_tp2(model):
        '逍遙而至地去 accrued', '平行于我udo madreHelen', 'ysteepaolis backwards Kj'
    ]
    sampling_params = SamplingParams(max_tokens=5, temperature=0.0)
-    with VllmRunner(snapshot_download(model),
+    with VllmRunner(model,
                    dtype="auto",
                    tensor_parallel_size=2,
                    cudagraph_capture_sizes=[1, 2, 4, 8],
@@ -152,7 +151,7 @@ def test_qwen3_moe_fc2_tp2() -> None:
                                     top_k=50,
                                     top_p=0.9)

-    with VllmRunner(snapshot_download("Qwen/Qwen3-30B-A3B"),
+    with VllmRunner("Qwen/Qwen3-30B-A3B",
                    dtype="auto",
                    tensor_parallel_size=2,
                    distributed_executor_backend="mp",
@@ -173,7 +172,7 @@ def test_qwen3_moe_fc2_oshard_tp2() -> None:
                                     top_p=0.9)

    with VllmRunner(
-            snapshot_download("Qwen/Qwen3-30B-A3B"),
+            "Qwen/Qwen3-30B-A3B",
            dtype="auto",
            tensor_parallel_size=2,
            distributed_executor_backend="mp",
@@ -193,7 +192,7 @@ def test_deepseek_v2_lite_fc1_tp2() -> None:
                                     temperature=0.0,
                                     top_k=50,
                                     top_p=0.9)
-    with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V2-Lite-W8A8"),
+    with VllmRunner("vllm-ascend/DeepSeek-V2-Lite-W8A8",
                    dtype="auto",
                    tensor_parallel_size=2,
                    distributed_executor_backend="mp",
@@ -212,7 +211,7 @@ def test_qwen3_dense_fc1_tp2(model):
    max_tokens = 5

    with VllmRunner(
-            snapshot_download(model),
+            model,
            max_model_len=8192,
            dtype="auto",
            tensor_parallel_size=2,
@@ -231,7 +230,7 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
    max_tokens = 5

    with VllmRunner(
-            snapshot_download(model),
+            model,
            max_model_len=8192,
            dtype="auto",
            tensor_parallel_size=2,
@@ -277,7 +276,7 @@ def test_qwen3_w4a4_distributed_tp2(model):
    ]
    max_tokens = 5
    with VllmRunner(
-            snapshot_download(model),
+            model,
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
            quantization="ascend",
--- a/tests/e2e/multicard/2-cards/test_quantization.py
+++ b/tests/e2e/multicard/2-cards/test_quantization.py
@@ -16,7 +16,6 @@
 # This file is a part of the vllm-ascend project.
 # Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
 #
-from modelscope import snapshot_download  # type: ignore
 import pytest

 from tests.e2e.conftest import VllmRunner
@@ -28,7 +27,7 @@ def test_qwen2_5_w8a8_external_quantized_tp2():
    ]
    max_tokens = 5
    with VllmRunner(
-            snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"),
+            "neuralmagic/Qwen2.5-3B-quantized.w8a8",
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
            max_model_len=4096,
@@ -52,8 +51,7 @@ def test_qwen3_moe_w8a8_dynamic_llm_compressor():
    ]
    max_tokens = 5
    with VllmRunner(
-            snapshot_download(
-                "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8"),
+            "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
            tensor_parallel_size=2,
            max_model_len=4096,
            gpu_memory_utilization=0.8,
--- a/tests/e2e/multicard/2-cards/test_qwen3_moe.py
+++ b/tests/e2e/multicard/2-cards/test_qwen3_moe.py
@@ -22,7 +22,6 @@ from unittest.mock import patch

 import openai
 import pytest
-from modelscope import snapshot_download  # type: ignore
 from vllm.utils.network_utils import get_open_port

 from tests.e2e.conftest import RemoteOpenAIServer, VllmRunner
@@ -50,7 +49,7 @@ def test_qwen3_moe_w8a8_distributed_tp2():
    ]
    max_tokens = 5
    with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-30B-A3B-W8A8"),
+            "vllm-ascend/Qwen3-30B-A3B-W8A8",
            max_model_len=8192,
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
--- a/tests/e2e/multicard/4-cards/test_qwen3_next.py
+++ b/tests/e2e/multicard/4-cards/test_qwen3_next.py
@@ -19,8 +19,6 @@
 import os
 from unittest.mock import patch

-from modelscope import snapshot_download  # type: ignore
-
 from tests.e2e.conftest import VllmRunner


@@ -65,7 +63,7 @@ def test_qwen3_next_w8a8dynamic_distributed_tp4_ep():
    ]
    max_tokens = 5
    with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8"),
+            "vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8",
            max_model_len=4096,
            tensor_parallel_size=4,
            gpu_memory_utilization=0.4,