[CI] Upgrade vLLM version (#3139)

Upgrade vLLM version to the newest commit.
- Fix the breaking change introduced by
969b4da3a6
- Add a patch to quickly fix torchair
de94289a98
- Fix the unit-test (UT) error introduced by
de94289a98

Close: https://github.com/vllm-project/vllm-ascend/issues/3138


- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
wangxiyuan
2025-09-25 07:36:51 +08:00
committed by GitHub
parent 464270e4ca
commit a055183821
9 changed files with 105 additions and 15 deletions

View File

@@ -36,7 +36,7 @@ jobs:
- name: Get vLLM version - name: Get vLLM version
run: | run: |
VLLM_COMMIT=f225ea7dd98e9f29752e5c032cd4a8ee1d712f16 VLLM_COMMIT=b1068903fdca26cf6b4a1a51a32c3365ce3ac636
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
- name: Checkout repository - name: Checkout repository

View File

@@ -42,7 +42,7 @@ jobs:
lint: lint:
uses: ./.github/workflows/pre-commit.yml uses: ./.github/workflows/pre-commit.yml
with: with:
vllm: f225ea7dd98e9f29752e5c032cd4a8ee1d712f16 vllm: b1068903fdca26cf6b4a1a51a32c3365ce3ac636
changes: changes:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@@ -83,7 +83,7 @@ jobs:
VLLM_USE_MODELSCOPE: True VLLM_USE_MODELSCOPE: True
strategy: strategy:
matrix: matrix:
vllm_version: [f225ea7dd98e9f29752e5c032cd4a8ee1d712f16, v0.10.2] vllm_version: [b1068903fdca26cf6b4a1a51a32c3365ce3ac636, v0.10.2]
steps: steps:
- name: Install packages - name: Install packages
run: | run: |
@@ -138,7 +138,7 @@ jobs:
name: e2e-light name: e2e-light
strategy: strategy:
matrix: matrix:
vllm_version: [f225ea7dd98e9f29752e5c032cd4a8ee1d712f16, v0.10.2] vllm_version: [b1068903fdca26cf6b4a1a51a32c3365ce3ac636, v0.10.2]
# Note (yikun): If CI resource are limited we can split job into two chain jobs # Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes] needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request. # only trigger e2e test after lint passed and the change is e2e related with pull request.

View File

@@ -68,7 +68,7 @@ jobs:
name: e2e-full name: e2e-full
strategy: strategy:
matrix: matrix:
vllm_version: [f225ea7dd98e9f29752e5c032cd4a8ee1d712f16, v0.10.2] vllm_version: [b1068903fdca26cf6b4a1a51a32c3365ce3ac636, v0.10.2]
needs: [changes] needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }} if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml uses: ./.github/workflows/_e2e_test.yaml

View File

@@ -298,6 +298,10 @@ class TestAscendQwen2_5_VisionTransformer(PytestBase):
"vllm_ascend.ops.linear_op.get_tp_group", "vllm_ascend.ops.linear_op.get_tp_group",
return_value=mock_group, return_value=mock_group,
) )
mocker.patch(
"vllm.distributed.parallel_state.get_tp_group",
return_value=mock_group,
)
vision_transformer = AscendQwen2_5_VisionTransformer( vision_transformer = AscendQwen2_5_VisionTransformer(
vision_config, vision_config,

View File

@@ -33,6 +33,10 @@ class BaseLinearTest(unittest.TestCase):
return_value=self.mock_group), return_value=self.mock_group),
patch("vllm_ascend.ops.linear_op.get_tp_group", patch("vllm_ascend.ops.linear_op.get_tp_group",
return_value=self.mock_group), return_value=self.mock_group),
patch(
"vllm.distributed.parallel_state.get_tp_group",
return_value=self.mock_group,
),
patch("vllm_ascend.utils.mlp_tp_enable", return_value=True), patch("vllm_ascend.utils.mlp_tp_enable", return_value=True),
patch("vllm_ascend.utils.oproj_tp_enable", return_value=True) patch("vllm_ascend.utils.oproj_tp_enable", return_value=True)
] ]

View File

@@ -22,6 +22,7 @@ if HAS_TRITON:
import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa
import vllm_ascend.patch.worker.patch_common.patch_logits # noqa import vllm_ascend.patch.worker.patch_common.patch_logits # noqa
import vllm_ascend.patch.worker.patch_common.patch_weight_loader # noqa
# TODO: revert me when triton import is fixed # TODO: revert me when triton import is fixed
# import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa # import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa

View File

@@ -0,0 +1,60 @@
import torch
from torch.nn.parameter import Parameter
from vllm.logger import init_logger
# yapf: disable
from vllm.model_executor.parameter import ModelWeightParameter
# yapf: enable
from vllm.model_executor.utils import set_weight_attrs
from vllm.utils import GiB_bytes
from vllm_ascend.utils import vllm_version_is
logger = init_logger(__name__)
def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
output_partition_sizes: list[int], input_size: int,
output_size: int, params_dtype: torch.dtype,
**extra_weight_attrs):
from vllm_ascend.ascend_config import get_ascend_config
ascend_config = get_ascend_config()
# This method creates unquantized linear weights.
# The weights are not quantized, and they are not sharded.
# The amount of memory allocated for the weights is
# sum(output_partition_sizes) * input_size_per_partition.
try:
if ascend_config.torchair_graph_config.enabled:
weight = Parameter(torch.empty(sum(output_partition_sizes),
input_size_per_partition,
dtype=params_dtype),
requires_grad=False)
else:
weight_loader = extra_weight_attrs.pop("weight_loader")
weight = ModelWeightParameter(data=torch.empty(
sum(output_partition_sizes),
input_size_per_partition,
dtype=params_dtype),
input_dim=1,
output_dim=0,
weight_loader=weight_loader)
except torch.cuda.OutOfMemoryError as e:
logger.error("Failed to create unquantized linear weights: %s", e)
if torch.cuda.is_available():
logger.debug("CUDA device: %s", torch.cuda.current_device())
logger.debug("Allocated: %.2f GiB",
torch.cuda.memory_allocated() / GiB_bytes)
logger.debug("Reserved: %.2f GiB",
torch.cuda.memory_reserved() / GiB_bytes)
raise RuntimeError(
"Failed to create unquantized linear weights. "
"This may be caused by insufficient memory to allocate "
"the weight.") from e
if ascend_config.torchair_graph_config.enabled:
set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
layer.register_parameter("weight", weight)
set_weight_attrs(weight, extra_weight_attrs)
if not vllm_version_is("0.10.2"):
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
UnquantizedLinearMethod.create_weights = create_weights

View File

@@ -28,6 +28,20 @@ from vllm_ascend.worker.worker_v1 import NPUWorker
class NPUTorchairWorker(NPUWorker): class NPUTorchairWorker(NPUWorker):
"""Torchair worker bases on NPUWorker. Only torchair specified code should be added in this class.""" """Torchair worker bases on NPUWorker. Only torchair specified code should be added in this class."""
def __init__(self,
vllm_config,
local_rank,
rank,
distributed_init_method,
is_driver_worker=False,
**kwargs):
super().__init__(vllm_config, local_rank, rank,
distributed_init_method, is_driver_worker, **kwargs)
from vllm.model_executor.layers.linear import \
WEIGHT_LOADER_V2_SUPPORTED
if "UnquantizedLinearMethod" in WEIGHT_LOADER_V2_SUPPORTED:
WEIGHT_LOADER_V2_SUPPORTED.remove("UnquantizedLinearMethod")
def determine_available_memory(self) -> int: def determine_available_memory(self) -> int:
"""Override determine_available_memory to use cached torchair kv_cache_bytes.""" """Override determine_available_memory to use cached torchair kv_cache_bytes."""

View File

@@ -304,17 +304,24 @@ class NPUModelRunner(LoRAModelRunnerMixin):
(self.max_num_tokens, self.model_config.get_hidden_size()), (self.max_num_tokens, self.model_config.get_hidden_size()),
dtype=self.dtype, dtype=self.dtype,
device=self.device) device=self.device)
# Set up Attention # Set up Attention
self.attn_backend = get_attn_backend( if vllm_version_is("0.10.2"):
0, self.attn_backend = get_attn_backend(
self.dtype, 0,
None, self.dtype,
self.block_size, None,
self.model_config.is_attention_free, self.block_size,
use_mla=self.model_config.use_mla, self.model_config.is_attention_free,
) use_mla=self.model_config.use_mla,
)
else:
self.attn_backend = get_attn_backend(
0,
self.dtype,
None,
self.block_size,
use_mla=self.model_config.use_mla,
)
if torch.version.cann.startswith("8.3"): if torch.version.cann.startswith("8.3"):
self.attn_mask_builder = AttentionMaskBuilder( self.attn_mask_builder = AttentionMaskBuilder(
self.scheduler_config.max_num_batched_tokens, self.dtype, self.scheduler_config.max_num_batched_tokens, self.dtype,