From f8e76a49facd2e7877a039634f07e1292d1ccb7e Mon Sep 17 00:00:00 2001
From: wangxiyuan
Date: Wed, 28 Jan 2026 14:06:39 +0800
Subject: [PATCH] [CI] Upgrade transformers version (#6307)

Upgrade transformers to >=4.57.4.

Besides the version bump, this change:
- replaces the TRANSFORMERS_OFFLINE environment variable with
  HF_HUB_OFFLINE in the CI workflows and the multi-node run script;
- drops the explicit modelscope snapshot_download() calls from the e2e
  tests and passes model IDs straight to VllmRunner.

- vLLM version: v0.14.1
- vLLM main: https://github.com/vllm-project/vllm/commit/dc917cceb877dfd13f98c538c4c96158047d98bd

Signed-off-by: wangxiyuan
---
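Note for reviewers (below the "---", so not part of the commit message): the
test-side migration follows the pattern sketched here. This is a minimal,
illustrative sketch, not lifted verbatim from any one test: the model ID and
keyword arguments are copied from the touched tests, the prompt is made up,
and it assumes the repo's e2e environment (Ascend NPU, pre-fetched model
cache).

```python
import os

from tests.e2e.conftest import VllmRunner  # repo-local e2e helper

# CI exports these: HF_HUB_OFFLINE replaces TRANSFORMERS_OFFLINE as the
# offline switch (it is honored by huggingface_hub and by transformers),
# and VLLM_USE_MODELSCOPE makes vLLM resolve model IDs via ModelScope.
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["VLLM_USE_MODELSCOPE"] = "True"

# Before: VllmRunner(snapshot_download("vllm-ascend/Qwen3-0.6B-W8A8"), ...)
# After: pass the model ID directly; vLLM resolves the weights itself.
with VllmRunner("vllm-ascend/Qwen3-0.6B-W8A8",
                max_model_len=8192,
                gpu_memory_utilization=0.7) as runner:
    runner.generate_greedy(["Hello, my name is"], 5)
```
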
 .../workflows/_e2e_nightly_single_node.yaml   |  2 +-
 .github/workflows/_e2e_test.yaml              | 10 +++++-----
 pyproject.toml                                |  2 +-
 requirements.txt                              |  2 +-
 .../multicard/2-cards/test_ilama_lora_tp2.py  |  3 +--
 .../test_offline_inference_distributed.py     | 19 +++++++++----------
 .../multicard/2-cards/test_quantization.py    |  6 ++----
 tests/e2e/multicard/2-cards/test_qwen3_moe.py |  3 +--
 .../e2e/multicard/4-cards/test_qwen3_next.py  |  4 +---
 tests/e2e/nightly/multi_node/scripts/run.sh   |  2 +-
 tests/e2e/singlecard/test_aclgraph_mem.py     |  5 ++---
 tests/e2e/singlecard/test_ilama_lora.py       |  3 +--
 tests/e2e/singlecard/test_models.py           |  5 ++---
 tests/e2e/singlecard/test_quantization.py     |  6 ++----
 14 files changed, 30 insertions(+), 42 deletions(-)

diff --git a/.github/workflows/_e2e_nightly_single_node.yaml b/.github/workflows/_e2e_nightly_single_node.yaml
index 4037bd3b..34bd4cd9 100644
--- a/.github/workflows/_e2e_nightly_single_node.yaml
+++ b/.github/workflows/_e2e_nightly_single_node.yaml
@@ -58,7 +58,7 @@ jobs:
     container:
       image: ${{ inputs.image }}
       env:
-        TRANSFORMERS_OFFLINE: 1
+        HF_HUB_OFFLINE: 1
         VLLM_USE_MODELSCOPE: True
     steps:
       - name: Check npu and CANN info
diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 62eb3983..a0c81b2e 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -28,7 +28,7 @@ jobs:
     env:
       VLLM_LOGGING_LEVEL: ERROR
       VLLM_USE_MODELSCOPE: True
-      TRANSFORMERS_OFFLINE: 1
+      HF_HUB_OFFLINE: 1
     steps:
       - name: Check npu and CANN info
         run: |
@@ -137,7 +137,7 @@
       VLLM_LOGGING_LEVEL: ERROR
       VLLM_USE_MODELSCOPE: True
       HCCL_BUFFSIZE: 1024
-      TRANSFORMERS_OFFLINE: 1
+      HF_HUB_OFFLINE: 1
     steps:
       - name: Check npu and CANN info
         run: |
@@ -245,7 +245,7 @@
     env:
       VLLM_LOGGING_LEVEL: ERROR
       VLLM_USE_MODELSCOPE: True
-      TRANSFORMERS_OFFLINE: 1
+      HF_HUB_OFFLINE: 1
     steps:
       - name: Check npu and CANN info
         run: |
@@ -322,7 +322,7 @@
     env:
       VLLM_LOGGING_LEVEL: ERROR
       VLLM_USE_MODELSCOPE: True
-      TRANSFORMERS_OFFLINE: 1
+      HF_HUB_OFFLINE: 1
     steps:
       - name: Check npu and CANN info
         run: |
@@ -380,7 +380,7 @@
     env:
       VLLM_LOGGING_LEVEL: ERROR
       VLLM_USE_MODELSCOPE: True
-      TRANSFORMERS_OFFLINE: 1
+      HF_HUB_OFFLINE: 1
     steps:
       - name: Check npu and CANN info
         run: |
diff --git a/pyproject.toml b/pyproject.toml
index 08ce30f2..fca3e0f3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,7 @@ requires = [
     "psutil",
     "setuptools>=64",
     "setuptools-scm>=8",
-    "transformers<=4.57.1",
+    "transformers>=4.57.4",
     "torch-npu==2.9.0",
     "torch==2.9.0",
     "torchvision",
diff --git a/requirements.txt b/requirements.txt
index f14d584f..82cca9ea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -32,6 +32,6 @@
 numba
 torch-npu==2.9.0
 arctic-inference==0.1.1
-transformers>=4.57.3
+transformers>=4.57.4
 fastapi<0.124.0
 triton-ascend==3.2.0
diff --git a/tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py b/tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py
index e0174e18..fc4866ec 100644
--- a/tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py
+++ b/tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py
@@ -1,5 +1,4 @@
 import pytest
-from modelscope import snapshot_download  # type: ignore

 from tests.e2e.conftest import VllmRunner
 from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
@@ -9,7 +8,7 @@ from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
 @pytest.mark.parametrize("distributed_executor_backend", ["mp"])
 def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files):
     with VllmRunner(
-            snapshot_download(MODEL_PATH),
+            MODEL_PATH,
             enable_lora=True,
             max_loras=4,
             dtype="half",
diff --git a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
index e9032e99..77e28ece 100644
--- a/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
@@ -24,7 +24,6 @@ import os
 from unittest.mock import patch

 import pytest
-from modelscope import snapshot_download  # type: ignore
 from vllm import SamplingParams

 from tests.e2e.conftest import VllmRunner
@@ -77,7 +76,7 @@ def test_qwen3_w4a8_dynamic_tp2(model):
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download(model),
+            model,
             max_model_len=8192,
             dtype="auto",
             tensor_parallel_size=2,
@@ -96,7 +95,7 @@ def test_qwen3_moe_sp_tp2() -> None:
                                      top_k=50,
                                      top_p=0.9)

-    with VllmRunner(snapshot_download("Qwen/Qwen3-30B-A3B"),
+    with VllmRunner("Qwen/Qwen3-30B-A3B",
                     dtype="auto",
                     tensor_parallel_size=2,
                     distributed_executor_backend="mp",
@@ -119,7 +118,7 @@ def test_deepseek_w4a8_accuracy_tp2(model):
         '逍遙而至地去 accrued', '平行于我udo madreHelen', 'ysteepaolis backwards Kj'
     ]
     sampling_params = SamplingParams(max_tokens=5, temperature=0.0)
-    with VllmRunner(snapshot_download(model),
+    with VllmRunner(model,
                     dtype="auto",
                     tensor_parallel_size=2,
                     cudagraph_capture_sizes=[1, 2, 4, 8],
@@ -152,7 +151,7 @@ def test_qwen3_moe_fc2_tp2() -> None:
                                      top_k=50,
                                      top_p=0.9)

-    with VllmRunner(snapshot_download("Qwen/Qwen3-30B-A3B"),
+    with VllmRunner("Qwen/Qwen3-30B-A3B",
                     dtype="auto",
                     tensor_parallel_size=2,
                     distributed_executor_backend="mp",
@@ -173,7 +172,7 @@ def test_qwen3_moe_fc2_oshard_tp2() -> None:
                                      top_p=0.9)

     with VllmRunner(
-            snapshot_download("Qwen/Qwen3-30B-A3B"),
+            "Qwen/Qwen3-30B-A3B",
             dtype="auto",
             tensor_parallel_size=2,
             distributed_executor_backend="mp",
@@ -193,7 +192,7 @@ def test_deepseek_v2_lite_fc1_tp2() -> None:
                                      temperature=0.0,
                                      top_k=50,
                                      top_p=0.9)
-    with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V2-Lite-W8A8"),
+    with VllmRunner("vllm-ascend/DeepSeek-V2-Lite-W8A8",
                     dtype="auto",
                     tensor_parallel_size=2,
                     distributed_executor_backend="mp",
@@ -212,7 +211,7 @@ def test_qwen3_dense_fc1_tp2(model):
     max_tokens = 5

     with VllmRunner(
-            snapshot_download(model),
+            model,
             max_model_len=8192,
             dtype="auto",
             tensor_parallel_size=2,
@@ -231,7 +230,7 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
     max_tokens = 5

     with VllmRunner(
-            snapshot_download(model),
+            model,
             max_model_len=8192,
             dtype="auto",
             tensor_parallel_size=2,
@@ -277,7 +276,7 @@ def test_qwen3_w4a4_distributed_tp2(model):
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download(model),
+            model,
             tensor_parallel_size=2,
             cudagraph_capture_sizes=[1, 2, 4, 8],
             quantization="ascend",
diff --git a/tests/e2e/multicard/2-cards/test_quantization.py b/tests/e2e/multicard/2-cards/test_quantization.py
index 36d9ea0d..245693ff 100644
--- a/tests/e2e/multicard/2-cards/test_quantization.py
+++ b/tests/e2e/multicard/2-cards/test_quantization.py
@@ -16,7 +16,6 @@
 # This file is a part of the vllm-ascend project.
 # Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
 #
-from modelscope import snapshot_download  # type: ignore
 import pytest

 from tests.e2e.conftest import VllmRunner
@@ -28,7 +27,7 @@ def test_qwen2_5_w8a8_external_quantized_tp2():
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"),
+            "neuralmagic/Qwen2.5-3B-quantized.w8a8",
             tensor_parallel_size=2,
             cudagraph_capture_sizes=[1, 2, 4, 8],
             max_model_len=4096,
@@ -52,8 +51,7 @@ def test_qwen3_moe_w8a8_dynamic_llm_compressor():
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download(
-                "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8"),
+            "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
             tensor_parallel_size=2,
             max_model_len=4096,
             gpu_memory_utilization=0.8,
diff --git a/tests/e2e/multicard/2-cards/test_qwen3_moe.py b/tests/e2e/multicard/2-cards/test_qwen3_moe.py
index 4dcc3f29..7a8cf77d 100644
--- a/tests/e2e/multicard/2-cards/test_qwen3_moe.py
+++ b/tests/e2e/multicard/2-cards/test_qwen3_moe.py
@@ -22,7 +22,6 @@ from unittest.mock import patch

 import openai
 import pytest
-from modelscope import snapshot_download  # type: ignore
 from vllm.utils.network_utils import get_open_port

 from tests.e2e.conftest import RemoteOpenAIServer, VllmRunner
@@ -50,7 +49,7 @@ def test_qwen3_moe_w8a8_distributed_tp2():
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-30B-A3B-W8A8"),
+            "vllm-ascend/Qwen3-30B-A3B-W8A8",
             max_model_len=8192,
             tensor_parallel_size=2,
             cudagraph_capture_sizes=[1, 2, 4, 8],
diff --git a/tests/e2e/multicard/4-cards/test_qwen3_next.py b/tests/e2e/multicard/4-cards/test_qwen3_next.py
index e5c54ff5..445cd36e 100644
--- a/tests/e2e/multicard/4-cards/test_qwen3_next.py
+++ b/tests/e2e/multicard/4-cards/test_qwen3_next.py
@@ -19,8 +19,6 @@
 import os
 from unittest.mock import patch

-from modelscope import snapshot_download  # type: ignore
-
 from tests.e2e.conftest import VllmRunner


@@ -65,7 +63,7 @@ def test_qwen3_next_w8a8dynamic_distributed_tp4_ep():
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8"),
+            "vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8",
             max_model_len=4096,
             tensor_parallel_size=4,
             gpu_memory_utilization=0.4,
diff --git a/tests/e2e/nightly/multi_node/scripts/run.sh b/tests/e2e/nightly/multi_node/scripts/run.sh
index f42b325e..a2e68f02 100644
--- a/tests/e2e/nightly/multi_node/scripts/run.sh
+++ b/tests/e2e/nightly/multi_node/scripts/run.sh
@@ -18,7 +18,7 @@ export VLLM_LOGGING_LEVEL="INFO"
 # Reduce glog verbosity for mooncake
 export GLOG_minloglevel=1
 # Set transformers to offline mode to avoid downloading models during tests
-export TRANSFORMERS_OFFLINE="1"
+export HF_HUB_OFFLINE="1"

 # Function to print section headers
 print_section() {
diff --git a/tests/e2e/singlecard/test_aclgraph_mem.py b/tests/e2e/singlecard/test_aclgraph_mem.py
index 877d40c9..25d09786 100644
--- a/tests/e2e/singlecard/test_aclgraph_mem.py
+++ b/tests/e2e/singlecard/test_aclgraph_mem.py
@@ -21,7 +21,6 @@ from unittest.mock import patch

 import pytest
 import torch
-from modelscope import snapshot_download  # type: ignore
 from vllm import SamplingParams

 from tests.e2e.conftest import VllmRunner
@@ -66,11 +65,11 @@ def test_aclgraph_mem_use(model: str, max_tokens: int) -> None:
     sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)

     if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
-        vllm_model = VllmRunner(snapshot_download(model),
+        vllm_model = VllmRunner(model,
                                 max_model_len=1024,
                                 quantization="ascend")
     else:
-        vllm_model = VllmRunner(snapshot_download(model))
+        vllm_model = VllmRunner(model)
     _ = vllm_model.generate(prompts, sampling_params)

     assert capture_called.value == 1, "capture_model was not called during test"
diff --git a/tests/e2e/singlecard/test_ilama_lora.py b/tests/e2e/singlecard/test_ilama_lora.py
index d9c4814e..d59acd30 100644
--- a/tests/e2e/singlecard/test_ilama_lora.py
+++ b/tests/e2e/singlecard/test_ilama_lora.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 import vllm
-from modelscope import snapshot_download  # type: ignore
 from vllm.lora.request import LoRARequest

 from tests.e2e.conftest import VllmRunner
@@ -46,7 +45,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:

 def test_ilama_lora(ilama_lora_files):
     with VllmRunner(
-            snapshot_download(MODEL_PATH),
+            MODEL_PATH,
             enable_lora=True,
             dtype="half",
             max_loras=4,
diff --git a/tests/e2e/singlecard/test_models.py b/tests/e2e/singlecard/test_models.py
index e0464a55..659b5f69 100644
--- a/tests/e2e/singlecard/test_models.py
+++ b/tests/e2e/singlecard/test_models.py
@@ -20,7 +20,6 @@
 import os

 import pytest
-from modelscope import snapshot_download  # type: ignore
 from vllm import SamplingParams
 from vllm.assets.audio import AudioAsset

@@ -46,7 +45,7 @@ def test_minicpm(model) -> None:
     ]
     max_tokens = 5

-    with VllmRunner(snapshot_download(model),
+    with VllmRunner(model,
                     max_model_len=512,
                     gpu_memory_utilization=0.7) as runner:
         runner.generate_greedy(example_prompts, max_tokens)
@@ -61,7 +60,7 @@ def test_whisper(model) -> None:
                                      max_tokens=10,
                                      stop_token_ids=None)

-    with VllmRunner(snapshot_download(model),
+    with VllmRunner(model,
                     max_model_len=448,
                     max_num_seqs=5,
                     dtype="bfloat16",
diff --git a/tests/e2e/singlecard/test_quantization.py b/tests/e2e/singlecard/test_quantization.py
index 93776410..4457a05f 100644
--- a/tests/e2e/singlecard/test_quantization.py
+++ b/tests/e2e/singlecard/test_quantization.py
@@ -15,8 +15,6 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 #
-from modelscope import snapshot_download  # type: ignore[import-untyped]
-
 from tests.e2e.conftest import VllmRunner
 from tests.e2e.model_utils import check_outputs_equal

@@ -33,7 +31,7 @@ def test_qwen3_w8a8_quant():
     )]

     with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-0.6B-W8A8"),
+            "vllm-ascend/Qwen3-0.6B-W8A8",
             max_model_len=8192,
             gpu_memory_utilization=0.7,
             cudagraph_capture_sizes=[1, 2, 4, 8],
@@ -62,7 +60,7 @@ def test_qwen3_dense_w8a16():
     )]

     with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-0.6B-W8A16"),
+            "vllm-ascend/Qwen3-0.6B-W8A16",
             max_model_len=8192,
             enforce_eager=False,
             gpu_memory_utilization=0.7,