From f8e76a49facd2e7877a039634f07e1292d1ccb7e Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Wed, 28 Jan 2026 14:06:39 +0800
Subject: [PATCH] [CI] Upgrade transformers version (#6307)

Upgrade transformers to >=4.57.4.

Besides the version bump, this change:
- replaces the TRANSFORMERS_OFFLINE environment variable with
  HF_HUB_OFFLINE in the CI workflows and the multi-node run script;
- drops the explicit modelscope snapshot_download() calls from the e2e
  tests and passes model IDs straight to VllmRunner.

- vLLM version: v0.14.1
- vLLM main: https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd

Signed-off-by: wangxiyuan
---
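Note for reviewers (below the "---", so not part of the commit message): the
test-side migration follows the pattern sketched here. This is a minimal,
illustrative sketch, not lifted verbatim from any one test: the model ID and
keyword arguments are copied from the touched tests, the prompt is made up,
and it assumes the repo's e2e environment (Ascend NPU, pre-fetched model
cache).

```python
import os

from tests.e2e.conftest import VllmRunner  # repo-local e2e helper

# CI exports these: HF_HUB_OFFLINE replaces TRANSFORMERS_OFFLINE as the
# offline switch (it is honored by huggingface_hub and by transformers),
# and VLLM_USE_MODELSCOPE makes vLLM resolve model IDs via ModelScope.
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["VLLM_USE_MODELSCOPE"] = "True"

# Before: VllmRunner(snapshot_download("vllm-ascend/Qwen3-0.6B-W8A8"), ...)
# After: pass the model ID directly; vLLM resolves the weights itself.
with VllmRunner("vllm-ascend/Qwen3-0.6B-W8A8",
                max_model_len=8192,
                gpu_memory_utilization=0.7) as runner:
    runner.generate_greedy(["Hello, my name is"], 5)
```
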
 .../workflows/_e2e_nightly_single_node.yaml   |  2 +-
 .github/workflows/_e2e_test.yaml              | 10 +++++-----
 pyproject.toml                                |  2 +-
 requirements.txt                              |  2 +-
 .../multicard/2-cards/test_ilama_lora_tp2.py  |  3 +--
 .../test_offline_inference_distributed.py     | 19 +++++++++----------
 .../multicard/2-cards/test_quantization.py    |  6 ++----
 tests/e2e/multicard/2-cards/test_qwen3_moe.py |  3 +--
 .../e2e/multicard/4-cards/test_qwen3_next.py  |  4 +---
 tests/e2e/nightly/multi_node/scripts/run.sh   |  2 +-
 tests/e2e/singlecard/test_aclgraph_mem.py     |  5 ++---
 tests/e2e/singlecard/test_ilama_lora.py       |  3 +--
 tests/e2e/singlecard/test_models.py           |  5 ++---
 tests/e2e/singlecard/test_quantization.py     |  6 ++----
 14 files changed, 30 insertions(+), 42 deletions(-)

diff --git a/.github/workflows/_e2e_nightly_single_node.yaml b/.github/workflows/_e2e_nightly_single_node.yaml
index 4037bd3b..34bd4cd9 100644
--- a/.github/workflows/_e2e_nightly_single_node.yaml
+++ b/.github/workflows/_e2e_nightly_single_node.yaml
@@ -58,7 +58,7 @@ jobs:
     container:
       image: ${{ inputs.image }}
       env:
-        TRANSFORMERS_OFFLINE: 1
+        HF_HUB_OFFLINE: 1
         VLLM_USE_MODELSCOPE: True
     steps:
       - name: Check npu and CANN info
diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 62eb3983..a0c81b2e 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -28,7 +28,7 @@ jobs:
     env:
       VLLM_LOGGING_LEVEL: ERROR
       VLLM_USE_MODELSCOPE: True
-      TRANSFORMERS_OFFLINE: 1
+      HF_HUB_OFFLINE: 1
     steps:
       - name: Check npu and CANN info
         run: |
@@ -137,7 +137,7 @@
       VLLM_LOGGING_LEVEL: ERROR
       VLLM_USE_MODELSCOPE: True
       HCCL_BUFFSIZE: 1024
-      TRANSFORMERS_OFFLINE: 1
+      HF_HUB_OFFLINE: 1
     steps:
       - name: Check npu and CANN info
         run: |
@@ -245,7 +245,7 @@
     env:
       VLLM_LOGGING_LEVEL: ERROR
       VLLM_USE_MODELSCOPE: True
-      TRANSFORMERS_OFFLINE: 1
+      HF_HUB_OFFLINE: 1
     steps:
       - name: Check npu and CANN info
         run: |
@@ -322,7 +322,7 @@
     env:
       VLLM_LOGGING_LEVEL: ERROR
       VLLM_USE_MODELSCOPE: True
-      TRANSFORMERS_OFFLINE: 1
+      HF_HUB_OFFLINE: 1
     steps:
       - name: Check npu and CANN info
         run: |
@@ -380,7 +380,7 @@
     env:
       VLLM_LOGGING_LEVEL: ERROR
       VLLM_USE_MODELSCOPE: True
-      TRANSFORMERS_OFFLINE: 1
+      HF_HUB_OFFLINE: 1
     steps:
       - name: Check npu and CANN info
         run: |
diff --git a/pyproject.toml b/pyproject.toml
index 08ce30f2..fca3e0f3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,7 @@ requires = [
     "psutil",
     "setuptools>=64",
     "setuptools-scm>=8",
-    "transformers<=4.57.1",
+    "transformers>=4.57.4",
     "torch-npu==2.9.0",
     "torch==2.9.0",
     "torchvision",
diff --git a/requirements.txt b/requirements.txt
index f14d584f..82cca9ea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -32,6 +32,6 @@
 numba
 torch-npu==2.9.0
 arctic-inference==0.1.1
-transformers>=4.57.3
+transformers>=4.57.4
 fastapi<0.124.0
 triton-ascend==3.2.0
diff --git a/tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py b/tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py
index e0174e18..fc4866ec 100644
--- a/tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py
+++ b/tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py
@@ -1,5 +1,4 @@
 import pytest
-from modelscope import snapshot_download  # type: ignore

 from tests.e2e.conftest import VllmRunner
 from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
@@ -9,7 +8,7 @@ from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
 @pytest.mark.parametrize("distributed_executor_backend", ["mp"])
 def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files):
     with VllmRunner(
-            snapshot_download(MODEL_PATH),
+            MODEL_PATH,
             enable_lora=True,
             max_loras=4,
             dtype="half",
diff --git a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
index e9032e99..77e28ece 100644
--- a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
@@ -24,7 +24,6 @@ import os
 from unittest.mock import patch

 import pytest
-from modelscope import snapshot_download  # type: ignore
 from vllm import SamplingParams

 from tests.e2e.conftest import VllmRunner
@@ -77,7 +76,7 @@ def test_qwen3_w4a8_dynamic_tp2(model):
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download(model),
+            model,
             max_model_len=8192,
             dtype="auto",
             tensor_parallel_size=2,
@@ -96,7 +95,7 @@ def test_qwen3_moe_sp_tp2() -> None:
                                      top_k=50,
                                      top_p=0.9)

-    with VllmRunner(snapshot_download("Qwen/Qwen3-30B-A3B"),
+    with VllmRunner("Qwen/Qwen3-30B-A3B",
                     dtype="auto",
                     tensor_parallel_size=2,
                     distributed_executor_backend="mp",
@@ -119,7 +118,7 @@ def test_deepseek_w4a8_accuracy_tp2(model):
         '逍遙而至地去 accrued', '平行于我udo madreHelen', 'ysteepaolis backwards Kj'
     ]
     sampling_params = SamplingParams(max_tokens=5, temperature=0.0)
-    with VllmRunner(snapshot_download(model),
+    with VllmRunner(model,
                     dtype="auto",
                     tensor_parallel_size=2,
                     cudagraph_capture_sizes=[1, 2, 4, 8],
@@ -152,7 +151,7 @@ def test_qwen3_moe_fc2_tp2() -> None:
                                      top_k=50,
                                      top_p=0.9)

-    with VllmRunner(snapshot_download("Qwen/Qwen3-30B-A3B"),
+    with VllmRunner("Qwen/Qwen3-30B-A3B",
                     dtype="auto",
                     tensor_parallel_size=2,
                     distributed_executor_backend="mp",
@@ -173,7 +172,7 @@ def test_qwen3_moe_fc2_oshard_tp2() -> None:
                                      top_p=0.9)

     with VllmRunner(
-            snapshot_download("Qwen/Qwen3-30B-A3B"),
+            "Qwen/Qwen3-30B-A3B",
             dtype="auto",
             tensor_parallel_size=2,
             distributed_executor_backend="mp",
@@ -193,7 +192,7 @@ def test_deepseek_v2_lite_fc1_tp2() -> None:
                                      temperature=0.0,
                                      top_k=50,
                                      top_p=0.9)
-    with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V2-Lite-W8A8"),
+    with VllmRunner("vllm-ascend/DeepSeek-V2-Lite-W8A8",
                     dtype="auto",
                     tensor_parallel_size=2,
                     distributed_executor_backend="mp",
@@ -212,7 +211,7 @@ def test_qwen3_dense_fc1_tp2(model):
     max_tokens = 5

     with VllmRunner(
-            snapshot_download(model),
+            model,
             max_model_len=8192,
             dtype="auto",
             tensor_parallel_size=2,
@@ -231,7 +230,7 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
     max_tokens = 5

     with VllmRunner(
-            snapshot_download(model),
+            model,
             max_model_len=8192,
             dtype="auto",
             tensor_parallel_size=2,
@@ -277,7 +276,7 @@ def test_qwen3_w4a4_distributed_tp2(model):
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download(model),
+            model,
             tensor_parallel_size=2,
             cudagraph_capture_sizes=[1, 2, 4, 8],
             quantization="ascend",
diff --git a/tests/e2e/multicard/2-cards/test_quantization.py b/tests/e2e/multicard/2-cards/test_quantization.py
index 36d9ea0d..245693ff 100644
--- a/tests/e2e/multicard/2-cards/test_quantization.py
+++ b/tests/e2e/multicard/2-cards/test_quantization.py
@@ -16,7 +16,6 @@
 # This file is a part of the vllm-ascend project.
 # Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
 #
-from modelscope import snapshot_download  # type: ignore
 import pytest

 from tests.e2e.conftest import VllmRunner
@@ -28,7 +27,7 @@ def test_qwen2_5_w8a8_external_quantized_tp2():
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"),
+            "neuralmagic/Qwen2.5-3B-quantized.w8a8",
             tensor_parallel_size=2,
             cudagraph_capture_sizes=[1, 2, 4, 8],
             max_model_len=4096,
@@ -52,8 +51,7 @@ def test_qwen3_moe_w8a8_dynamic_llm_compressor():
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download(
-                "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8"),
+            "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
             tensor_parallel_size=2,
             max_model_len=4096,
             gpu_memory_utilization=0.8,
diff --git a/tests/e2e/multicard/2-cards/test_qwen3_moe.py b/tests/e2e/multicard/2-cards/test_qwen3_moe.py
index 4dcc3f29..7a8cf77d 100644
--- a/tests/e2e/multicard/2-cards/test_qwen3_moe.py
+++ b/tests/e2e/multicard/2-cards/test_qwen3_moe.py
@@ -22,7 +22,6 @@ from unittest.mock import patch

 import openai
 import pytest
-from modelscope import snapshot_download  # type: ignore
 from vllm.utils.network_utils import get_open_port

 from tests.e2e.conftest import RemoteOpenAIServer, VllmRunner
@@ -50,7 +49,7 @@ def test_qwen3_moe_w8a8_distributed_tp2():
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-30B-A3B-W8A8"),
+            "vllm-ascend/Qwen3-30B-A3B-W8A8",
             max_model_len=8192,
             tensor_parallel_size=2,
             cudagraph_capture_sizes=[1, 2, 4, 8],
diff --git a/tests/e2e/multicard/4-cards/test_qwen3_next.py b/tests/e2e/multicard/4-cards/test_qwen3_next.py
index e5c54ff5..445cd36e 100644
--- a/tests/e2e/multicard/4-cards/test_qwen3_next.py
+++ b/tests/e2e/multicard/4-cards/test_qwen3_next.py
@@ -19,8 +19,6 @@
 import os
 from unittest.mock import patch

-from modelscope import snapshot_download  # type: ignore
-
 from tests.e2e.conftest import VllmRunner


@@ -65,7 +63,7 @@ def test_qwen3_next_w8a8dynamic_distributed_tp4_ep():
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8"),
+            "vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8",
             max_model_len=4096,
             tensor_parallel_size=4,
             gpu_memory_utilization=0.4,
diff --git a/tests/e2e/nightly/multi_node/scripts/run.sh b/tests/e2e/nightly/multi_node/scripts/run.sh
index f42b325e..a2e68f02 100644
--- a/tests/e2e/nightly/multi_node/scripts/run.sh
+++ b/tests/e2e/nightly/multi_node/scripts/run.sh
@@ -18,7 +18,7 @@ export VLLM_LOGGING_LEVEL="INFO"
 # Reduce glog verbosity for mooncake
 export GLOG_minloglevel=1
 # Set transformers to offline mode to avoid downloading models during tests
-export TRANSFORMERS_OFFLINE="1"
+export HF_HUB_OFFLINE="1"

 # Function to print section headers
 print_section() {
diff --git a/tests/e2e/singlecard/test_aclgraph_mem.py b/tests/e2e/singlecard/test_aclgraph_mem.py
index 877d40c9..25d09786 100644
--- a/tests/e2e/singlecard/test_aclgraph_mem.py
+++ b/tests/e2e/singlecard/test_aclgraph_mem.py
@@ -21,7 +21,6 @@ from unittest.mock import patch

 import pytest
 import torch
-from modelscope import snapshot_download  # type: ignore
 from vllm import SamplingParams

 from tests.e2e.conftest import VllmRunner
@@ -66,11 +65,11 @@ def test_aclgraph_mem_use(model: str, max_tokens: int) -> None:
     sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)

     if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
-        vllm_model = VllmRunner(snapshot_download(model),
+        vllm_model = VllmRunner(model,
                                 max_model_len=1024,
                                 quantization="ascend")
     else:
-        vllm_model = VllmRunner(snapshot_download(model))
+        vllm_model = VllmRunner(model)
     _ = vllm_model.generate(prompts, sampling_params)

     assert capture_called.value == 1, "capture_model was not called during test"
diff --git a/tests/e2e/singlecard/test_ilama_lora.py b/tests/e2e/singlecard/test_ilama_lora.py
index d9c4814e..d59acd30 100644
--- a/tests/e2e/singlecard/test_ilama_lora.py
+++ b/tests/e2e/singlecard/test_ilama_lora.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 import vllm
-from modelscope import snapshot_download  # type: ignore
 from vllm.lora.request import LoRARequest

 from tests.e2e.conftest import VllmRunner
@@ -46,7 +45,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:

 def test_ilama_lora(ilama_lora_files):
     with VllmRunner(
-            snapshot_download(MODEL_PATH),
+            MODEL_PATH,
             enable_lora=True,
             dtype="half",
             max_loras=4,
diff --git a/tests/e2e/singlecard/test_models.py b/tests/e2e/singlecard/test_models.py
index e0464a55..659b5f69 100644
--- a/tests/e2e/singlecard/test_models.py
+++ b/tests/e2e/singlecard/test_models.py
@@ -20,7 +20,6 @@
 import os

 import pytest
-from modelscope import snapshot_download  # type: ignore
 from vllm import SamplingParams
 from vllm.assets.audio import AudioAsset

@@ -46,7 +45,7 @@ def test_minicpm(model) -> None:
     ]
     max_tokens = 5

-    with VllmRunner(snapshot_download(model),
+    with VllmRunner(model,
                     max_model_len=512,
                     gpu_memory_utilization=0.7) as runner:
         runner.generate_greedy(example_prompts, max_tokens)
@@ -61,7 +60,7 @@ def test_whisper(model) -> None:
                                      max_tokens=10,
                                      stop_token_ids=None)

-    with VllmRunner(snapshot_download(model),
+    with VllmRunner(model,
                     max_model_len=448,
                     max_num_seqs=5,
                     dtype="bfloat16",
diff --git a/tests/e2e/singlecard/test_quantization.py b/tests/e2e/singlecard/test_quantization.py
index 93776410..4457a05f 100644
--- a/tests/e2e/singlecard/test_quantization.py
+++ b/tests/e2e/singlecard/test_quantization.py
@@ -15,8 +15,6 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 #
-from modelscope import snapshot_download  # type: ignore[import-untyped]
-
 from tests.e2e.conftest import VllmRunner
 from tests.e2e.model_utils import check_outputs_equal

@@ -33,7 +31,7 @@ def test_qwen3_w8a8_quant():
     )]

     with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-0.6B-W8A8"),
+            "vllm-ascend/Qwen3-0.6B-W8A8",
             max_model_len=8192,
             gpu_memory_utilization=0.7,
             cudagraph_capture_sizes=[1, 2, 4, 8],
@@ -62,7 +60,7 @@ def test_qwen3_dense_w8a16():
     )]

     with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-0.6B-W8A16"),
+            "vllm-ascend/Qwen3-0.6B-W8A16",
             max_model_len=8192,
             enforce_eager=False,
             gpu_memory_utilization=0.7,