[CI] Upgrade transformers version (#6307)
Upgrade transformers to >=4.57.4.

- vLLM version: v0.14.1
- vLLM main: dc917cceb8

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
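Most of the test-side churn below is one repeated change: drop the explicit ModelScope `snapshot_download()` pre-fetch and hand the model ID straight to `VllmRunner`, since the CI jobs already export `VLLM_USE_MODELSCOPE: True` and vLLM can resolve the ID itself at load time. A minimal sketch of the before/after pattern, assuming an environment where ModelScope is reachable on first load (the model ID is taken from the diff):

```python
import os

# The CI workflows export this; with it set, vLLM resolves bare model IDs
# through ModelScope instead of the Hugging Face Hub.
os.environ["VLLM_USE_MODELSCOPE"] = "True"

from tests.e2e.conftest import VllmRunner

# Before: weights were materialized up front.
#     from modelscope import snapshot_download
#     with VllmRunner(snapshot_download("vllm-ascend/Qwen3-0.6B-W8A8"), ...)
# After: the ModelScope ID passes straight through; vLLM fetches on demand.
with VllmRunner("vllm-ascend/Qwen3-0.6B-W8A8",
                max_model_len=8192,
                gpu_memory_utilization=0.7) as runner:
    runner.generate_greedy(["Hello, world"], 5)
```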
```diff
@@ -58,7 +58,7 @@ jobs:
     container:
       image: ${{ inputs.image }}
       env:
         TRANSFORMERS_OFFLINE: 1
         HF_HUB_OFFLINE: 1
         VLLM_USE_MODELSCOPE: True
     steps:
       - name: Check npu and CANN info
```
.github/workflows/_e2e_test.yaml (vendored, 10 lines changed)
```diff
@@ -28,7 +28,7 @@ jobs:
     env:
       VLLM_LOGGING_LEVEL: ERROR
       VLLM_USE_MODELSCOPE: True
       TRANSFORMERS_OFFLINE: 1
       HF_HUB_OFFLINE: 1
     steps:
       - name: Check npu and CANN info
         run: |
@@ -137,7 +137,7 @@ jobs:
       VLLM_LOGGING_LEVEL: ERROR
       VLLM_USE_MODELSCOPE: True
       HCCL_BUFFSIZE: 1024
       TRANSFORMERS_OFFLINE: 1
       HF_HUB_OFFLINE: 1
     steps:
       - name: Check npu and CANN info
         run: |
@@ -245,7 +245,7 @@ jobs:
     env:
       VLLM_LOGGING_LEVEL: ERROR
       VLLM_USE_MODELSCOPE: True
       TRANSFORMERS_OFFLINE: 1
       HF_HUB_OFFLINE: 1
     steps:
       - name: Check npu and CANN info
         run: |
@@ -322,7 +322,7 @@ jobs:
     env:
       VLLM_LOGGING_LEVEL: ERROR
       VLLM_USE_MODELSCOPE: True
       TRANSFORMERS_OFFLINE: 1
       HF_HUB_OFFLINE: 1
     steps:
       - name: Check npu and CANN info
         run: |
@@ -380,7 +380,7 @@ jobs:
     env:
       VLLM_LOGGING_LEVEL: ERROR
       VLLM_USE_MODELSCOPE: True
       TRANSFORMERS_OFFLINE: 1
       HF_HUB_OFFLINE: 1
     steps:
       - name: Check npu and CANN info
         run: |
```
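Every job above keeps `TRANSFORMERS_OFFLINE: 1` and `HF_HUB_OFFLINE: 1`, so nothing in the tests may fall back to downloading from the Hugging Face Hub; models must come from the local cache or through ModelScope. A minimal sketch of what those two variables mean on the transformers side, assuming the tokenizer files are already cached locally:

```python
import os

# Set before importing transformers so offline mode is picked up reliably.
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HUB_OFFLINE"] = "1"

from transformers import AutoTokenizer

# In offline mode, from_pretrained only consults the local cache; if the
# files are missing it raises an OSError instead of attempting a download.
tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-30B-A3B")
print(tok("offline smoke test")["input_ids"][:5])
```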
```diff
@@ -17,7 +17,7 @@ requires = [
     "psutil",
     "setuptools>=64",
     "setuptools-scm>=8",
-    "transformers<=4.57.1",
+    "transformers>=4.57.4",
     "torch-npu==2.9.0",
     "torch==2.9.0",
     "torchvision",
```
```diff
@@ -32,6 +32,6 @@ numba
 torch-npu==2.9.0

 arctic-inference==0.1.1
-transformers>=4.57.3
+transformers>=4.57.4
 fastapi<0.124.0
 triton-ascend==3.2.0
```
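Both the build `requires` list and the runtime pins now agree on `transformers>=4.57.4` (previously the build side capped at `<=4.57.1` while the runtime floor was `>=4.57.3`). A quick sanity check for an existing environment, sketched here with the `packaging` library (assumed installed; it ships with modern pip/setuptools setups):

```python
from packaging.version import Version

import transformers

# Mirror the new floor from this PR's dependency changes.
required = Version("4.57.4")
installed = Version(transformers.__version__)
assert installed >= required, (
    f"transformers {installed} is older than required {required}")
print(f"transformers {installed} satisfies >={required}")
```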
```diff
@@ -1,5 +1,4 @@
 import pytest
-from modelscope import snapshot_download  # type: ignore

 from tests.e2e.conftest import VllmRunner
 from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
@@ -9,7 +8,7 @@ from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT,
 @pytest.mark.parametrize("distributed_executor_backend", ["mp"])
 def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files):
     with VllmRunner(
-            snapshot_download(MODEL_PATH),
+            MODEL_PATH,
             enable_lora=True,
             max_loras=4,
             dtype="half",
```
```diff
@@ -24,7 +24,6 @@ import os
 from unittest.mock import patch

 import pytest
-from modelscope import snapshot_download  # type: ignore
 from vllm import SamplingParams

 from tests.e2e.conftest import VllmRunner
@@ -77,7 +76,7 @@ def test_qwen3_w4a8_dynamic_tp2(model):
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download(model),
+            model,
             max_model_len=8192,
             dtype="auto",
             tensor_parallel_size=2,
@@ -96,7 +95,7 @@ def test_qwen3_moe_sp_tp2() -> None:
                                      top_k=50,
                                      top_p=0.9)

-    with VllmRunner(snapshot_download("Qwen/Qwen3-30B-A3B"),
+    with VllmRunner("Qwen/Qwen3-30B-A3B",
                     dtype="auto",
                     tensor_parallel_size=2,
                     distributed_executor_backend="mp",
@@ -119,7 +118,7 @@ def test_deepseek_w4a8_accuracy_tp2(model):
         '逍遙而至地去 accrued', '平行于我udo madreHelen', 'ysteepaolis backwards Kj'
     ]
     sampling_params = SamplingParams(max_tokens=5, temperature=0.0)
-    with VllmRunner(snapshot_download(model),
+    with VllmRunner(model,
                     dtype="auto",
                     tensor_parallel_size=2,
                     cudagraph_capture_sizes=[1, 2, 4, 8],
@@ -152,7 +151,7 @@ def test_qwen3_moe_fc2_tp2() -> None:
                                      top_k=50,
                                      top_p=0.9)

-    with VllmRunner(snapshot_download("Qwen/Qwen3-30B-A3B"),
+    with VllmRunner("Qwen/Qwen3-30B-A3B",
                     dtype="auto",
                     tensor_parallel_size=2,
                     distributed_executor_backend="mp",
@@ -173,7 +172,7 @@ def test_qwen3_moe_fc2_oshard_tp2() -> None:
                                      top_p=0.9)

     with VllmRunner(
-            snapshot_download("Qwen/Qwen3-30B-A3B"),
+            "Qwen/Qwen3-30B-A3B",
             dtype="auto",
             tensor_parallel_size=2,
             distributed_executor_backend="mp",
@@ -193,7 +192,7 @@ def test_deepseek_v2_lite_fc1_tp2() -> None:
                                      temperature=0.0,
                                      top_k=50,
                                      top_p=0.9)
-    with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V2-Lite-W8A8"),
+    with VllmRunner("vllm-ascend/DeepSeek-V2-Lite-W8A8",
                     dtype="auto",
                     tensor_parallel_size=2,
                     distributed_executor_backend="mp",
@@ -212,7 +211,7 @@ def test_qwen3_dense_fc1_tp2(model):
     max_tokens = 5

     with VllmRunner(
-            snapshot_download(model),
+            model,
             max_model_len=8192,
             dtype="auto",
             tensor_parallel_size=2,
@@ -231,7 +230,7 @@ def test_qwen3_dense_prefetch_mlp_weight_tp2(model):
     max_tokens = 5

     with VllmRunner(
-            snapshot_download(model),
+            model,
             max_model_len=8192,
             dtype="auto",
             tensor_parallel_size=2,
@@ -277,7 +276,7 @@ def test_qwen3_w4a4_distributed_tp2(model):
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download(model),
+            model,
             tensor_parallel_size=2,
             cudagraph_capture_sizes=[1, 2, 4, 8],
             quantization="ascend",
```
```diff
@@ -16,7 +16,6 @@
 # This file is a part of the vllm-ascend project.
 # Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
 #
-from modelscope import snapshot_download  # type: ignore
 import pytest

 from tests.e2e.conftest import VllmRunner
@@ -28,7 +27,7 @@ def test_qwen2_5_w8a8_external_quantized_tp2():
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"),
+            "neuralmagic/Qwen2.5-3B-quantized.w8a8",
             tensor_parallel_size=2,
             cudagraph_capture_sizes=[1, 2, 4, 8],
             max_model_len=4096,
@@ -52,8 +51,7 @@ def test_qwen3_moe_w8a8_dynamic_llm_compressor():
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download(
-                "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8"),
+            "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
             tensor_parallel_size=2,
             max_model_len=4096,
             gpu_memory_utilization=0.8,
```
```diff
@@ -22,7 +22,6 @@ from unittest.mock import patch

 import openai
 import pytest
-from modelscope import snapshot_download  # type: ignore
 from vllm.utils.network_utils import get_open_port

 from tests.e2e.conftest import RemoteOpenAIServer, VllmRunner
@@ -50,7 +49,7 @@ def test_qwen3_moe_w8a8_distributed_tp2():
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-30B-A3B-W8A8"),
+            "vllm-ascend/Qwen3-30B-A3B-W8A8",
             max_model_len=8192,
             tensor_parallel_size=2,
             cudagraph_capture_sizes=[1, 2, 4, 8],
```
```diff
@@ -19,8 +19,6 @@
 import os
 from unittest.mock import patch

-from modelscope import snapshot_download  # type: ignore
-
 from tests.e2e.conftest import VllmRunner

@@ -65,7 +63,7 @@ def test_qwen3_next_w8a8dynamic_distributed_tp4_ep():
     ]
     max_tokens = 5
     with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8"),
+            "vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8",
             max_model_len=4096,
             tensor_parallel_size=4,
             gpu_memory_utilization=0.4,
```
```diff
@@ -18,7 +18,7 @@ export VLLM_LOGGING_LEVEL="INFO"
 # Reduce glog verbosity for mooncake
 export GLOG_minloglevel=1
 # Set transformers to offline mode to avoid downloading models during tests
 export TRANSFORMERS_OFFLINE="1"
 export HF_HUB_OFFLINE="1"

 # Function to print section headers
 print_section() {
```
```diff
@@ -21,7 +21,6 @@ from unittest.mock import patch

 import pytest
 import torch
-from modelscope import snapshot_download  # type: ignore
 from vllm import SamplingParams

 from tests.e2e.conftest import VllmRunner
@@ -66,11 +65,11 @@ def test_aclgraph_mem_use(model: str, max_tokens: int) -> None:
     sampling_params = SamplingParams(max_tokens=max_tokens,
                                      temperature=0.0)
     if model == "vllm-ascend/DeepSeek-V2-Lite-W8A8":
-        vllm_model = VllmRunner(snapshot_download(model),
+        vllm_model = VllmRunner(model,
                                 max_model_len=1024,
                                 quantization="ascend")
     else:
-        vllm_model = VllmRunner(snapshot_download(model))
+        vllm_model = VllmRunner(model)
     _ = vllm_model.generate(prompts, sampling_params)

     assert capture_called.value == 1, "capture_model was not called during test"
```
```diff
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 import vllm
-from modelscope import snapshot_download  # type: ignore
 from vllm.lora.request import LoRARequest

 from tests.e2e.conftest import VllmRunner
@@ -46,7 +45,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:

 def test_ilama_lora(ilama_lora_files):
     with VllmRunner(
-            snapshot_download(MODEL_PATH),
+            MODEL_PATH,
             enable_lora=True,
             dtype="half",
             max_loras=4,
```
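For context on what the LoRA tests exercise: `VllmRunner` wraps `vllm.LLM`, and the adapter is applied per request via `LoRARequest`. A minimal sketch of that vLLM API outside the test harness, with placeholder base-model and adapter paths (not values from this PR):

```python
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Placeholders: substitute a real base model ID and a local adapter path.
llm = LLM(model="base-model-id", enable_lora=True, max_loras=4, dtype="half")
params = SamplingParams(max_tokens=32, temperature=0.0)

outputs = llm.generate(
    ["Translate 'hello' to French."],
    params,
    # LoRARequest takes (adapter name, integer id, path to adapter weights).
    lora_request=LoRARequest("my_adapter", 1, "/path/to/lora"))
print(outputs[0].outputs[0].text)
```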
```diff
@@ -20,7 +20,6 @@
 import os

 import pytest
-from modelscope import snapshot_download  # type: ignore
 from vllm import SamplingParams
 from vllm.assets.audio import AudioAsset

@@ -46,7 +45,7 @@ def test_minicpm(model) -> None:
     ]
     max_tokens = 5

-    with VllmRunner(snapshot_download(model),
+    with VllmRunner(model,
                     max_model_len=512,
                     gpu_memory_utilization=0.7) as runner:
         runner.generate_greedy(example_prompts, max_tokens)
@@ -61,7 +60,7 @@ def test_whisper(model) -> None:
                                      max_tokens=10,
                                      stop_token_ids=None)

-    with VllmRunner(snapshot_download(model),
+    with VllmRunner(model,
                     max_model_len=448,
                     max_num_seqs=5,
                     dtype="bfloat16",
```
```diff
@@ -15,8 +15,6 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 #
-from modelscope import snapshot_download  # type: ignore[import-untyped]
-
 from tests.e2e.conftest import VllmRunner
 from tests.e2e.model_utils import check_outputs_equal

@@ -33,7 +31,7 @@ def test_qwen3_w8a8_quant():
     )]

     with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-0.6B-W8A8"),
+            "vllm-ascend/Qwen3-0.6B-W8A8",
             max_model_len=8192,
             gpu_memory_utilization=0.7,
             cudagraph_capture_sizes=[1, 2, 4, 8],
@@ -62,7 +60,7 @@ def test_qwen3_dense_w8a16():
     )]

     with VllmRunner(
-            snapshot_download("vllm-ascend/Qwen3-0.6B-W8A16"),
+            "vllm-ascend/Qwen3-0.6B-W8A16",
             max_model_len=8192,
             enforce_eager=False,
             gpu_memory_utilization=0.7,
```