[CI] Upgrade vLLM version (#3139)
Upgrade vLLM version to the newest commit. - Fix the breaking change introduced by 969b4da3a6 - Add a patch to quick-fix torchair (de94289a98) - Fix the UT error introduced by de94289a98 Close: https://github.com/vllm-project/vllm-ascend/issues/3138 - vLLM version: v0.10.2 - vLLM main: f225ea7dd9 --------- Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com> Signed-off-by: MengqingCao <cmq0113@163.com> Co-authored-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
2
.github/workflows/format_pr_body.yaml
vendored
2
.github/workflows/format_pr_body.yaml
vendored
@@ -36,7 +36,7 @@ jobs:
|
||||
|
||||
- name: Get vLLM version
|
||||
run: |
|
||||
VLLM_COMMIT=f225ea7dd98e9f29752e5c032cd4a8ee1d712f16
|
||||
VLLM_COMMIT=b1068903fdca26cf6b4a1a51a32c3365ce3ac636
|
||||
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
|
||||
|
||||
- name: Checkout repository
|
||||
|
||||
6
.github/workflows/vllm_ascend_test.yaml
vendored
6
.github/workflows/vllm_ascend_test.yaml
vendored
@@ -42,7 +42,7 @@ jobs:
|
||||
lint:
|
||||
uses: ./.github/workflows/pre-commit.yml
|
||||
with:
|
||||
vllm: f225ea7dd98e9f29752e5c032cd4a8ee1d712f16
|
||||
vllm: b1068903fdca26cf6b4a1a51a32c3365ce3ac636
|
||||
|
||||
changes:
|
||||
runs-on: ubuntu-latest
|
||||
@@ -83,7 +83,7 @@ jobs:
|
||||
VLLM_USE_MODELSCOPE: True
|
||||
strategy:
|
||||
matrix:
|
||||
vllm_version: [f225ea7dd98e9f29752e5c032cd4a8ee1d712f16, v0.10.2]
|
||||
vllm_version: [b1068903fdca26cf6b4a1a51a32c3365ce3ac636, v0.10.2]
|
||||
steps:
|
||||
- name: Install packages
|
||||
run: |
|
||||
@@ -138,7 +138,7 @@ jobs:
|
||||
name: e2e-light
|
||||
strategy:
|
||||
matrix:
|
||||
vllm_version: [f225ea7dd98e9f29752e5c032cd4a8ee1d712f16, v0.10.2]
|
||||
vllm_version: [b1068903fdca26cf6b4a1a51a32c3365ce3ac636, v0.10.2]
|
||||
# Note (yikun): If CI resource are limited we can split job into two chain jobs
|
||||
needs: [lint, changes]
|
||||
# only trigger e2e test after lint passed and the change is e2e related with pull request.
|
||||
|
||||
2
.github/workflows/vllm_ascend_test_full.yaml
vendored
2
.github/workflows/vllm_ascend_test_full.yaml
vendored
@@ -68,7 +68,7 @@ jobs:
|
||||
name: e2e-full
|
||||
strategy:
|
||||
matrix:
|
||||
vllm_version: [f225ea7dd98e9f29752e5c032cd4a8ee1d712f16, v0.10.2]
|
||||
vllm_version: [b1068903fdca26cf6b4a1a51a32c3365ce3ac636, v0.10.2]
|
||||
needs: [changes]
|
||||
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
|
||||
uses: ./.github/workflows/_e2e_test.yaml
|
||||
|
||||
@@ -298,6 +298,10 @@ class TestAscendQwen2_5_VisionTransformer(PytestBase):
|
||||
"vllm_ascend.ops.linear_op.get_tp_group",
|
||||
return_value=mock_group,
|
||||
)
|
||||
mocker.patch(
|
||||
"vllm.distributed.parallel_state.get_tp_group",
|
||||
return_value=mock_group,
|
||||
)
|
||||
|
||||
vision_transformer = AscendQwen2_5_VisionTransformer(
|
||||
vision_config,
|
||||
|
||||
@@ -33,6 +33,10 @@ class BaseLinearTest(unittest.TestCase):
|
||||
return_value=self.mock_group),
|
||||
patch("vllm_ascend.ops.linear_op.get_tp_group",
|
||||
return_value=self.mock_group),
|
||||
patch(
|
||||
"vllm.distributed.parallel_state.get_tp_group",
|
||||
return_value=self.mock_group,
|
||||
),
|
||||
patch("vllm_ascend.utils.mlp_tp_enable", return_value=True),
|
||||
patch("vllm_ascend.utils.oproj_tp_enable", return_value=True)
|
||||
]
|
||||
|
||||
@@ -22,6 +22,7 @@ if HAS_TRITON:
|
||||
|
||||
import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa
|
||||
import vllm_ascend.patch.worker.patch_common.patch_logits # noqa
|
||||
import vllm_ascend.patch.worker.patch_common.patch_weight_loader # noqa
|
||||
|
||||
# TODO: revert me when triton import is fixed
|
||||
# import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa
|
||||
|
||||
60
vllm_ascend/patch/worker/patch_common/patch_weight_loader.py
Normal file
60
vllm_ascend/patch/worker/patch_common/patch_weight_loader.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import torch
|
||||
from torch.nn.parameter import Parameter
|
||||
from vllm.logger import init_logger
|
||||
# yapf: disable
|
||||
from vllm.model_executor.parameter import ModelWeightParameter
|
||||
# yapf: enable
|
||||
from vllm.model_executor.utils import set_weight_attrs
|
||||
from vllm.utils import GiB_bytes
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int,
                   output_partition_sizes: list[int], input_size: int,
                   output_size: int, params_dtype: torch.dtype,
                   **extra_weight_attrs):
    """Allocate the unquantized linear ``weight`` and register it on ``layer``.

    The weight is an uninitialised tensor with
    ``sum(output_partition_sizes)`` rows and ``input_size_per_partition``
    columns, so that is also the amount of memory requested.

    Two flavours of parameter are produced, selected by
    ``get_ascend_config().torchair_graph_config.enabled``:

    * truthy: a plain ``Parameter`` (``requires_grad=False``) that is
      afterwards tagged with ``input_dim=1`` / ``output_dim=0`` through
      ``set_weight_attrs``; ``weight_loader`` stays in
      ``extra_weight_attrs`` and is attached as an ordinary attribute.
    * falsy: a ``ModelWeightParameter`` built with ``input_dim=1``,
      ``output_dim=0`` and the ``"weight_loader"`` entry popped from
      ``extra_weight_attrs``.

    In both cases the remaining ``extra_weight_attrs`` are applied to the
    weight after it has been registered under the name ``"weight"``.

    Args:
        layer: Module that receives the ``weight`` parameter.
        input_size_per_partition: Number of columns of the weight.
        output_partition_sizes: Row counts that are summed to obtain the
            number of rows of the weight.
        input_size: Not used by this implementation.
        output_size: Not used by this implementation.
        params_dtype: dtype of the allocated tensor.
        **extra_weight_attrs: Attributes forwarded to ``set_weight_attrs``;
            must contain ``"weight_loader"`` when torchair graph mode is
            not enabled.

    Raises:
        RuntimeError: If the allocation raises
            ``torch.cuda.OutOfMemoryError``.
        KeyError: If torchair graph mode is not enabled and
            ``"weight_loader"`` is missing from ``extra_weight_attrs``.
    """
    from vllm_ascend.ascend_config import get_ascend_config

    ascend_config = get_ascend_config()

    def _empty_storage() -> torch.Tensor:
        # Uninitialised backing tensor shared by both parameter flavours.
        return torch.empty(sum(output_partition_sizes),
                           input_size_per_partition,
                           dtype=params_dtype)

    try:
        use_torchair = ascend_config.torchair_graph_config.enabled
        if use_torchair:
            weight = Parameter(_empty_storage(), requires_grad=False)
        else:
            # Pop first so the loader is not re-applied by the final
            # set_weight_attrs call below.
            loader = extra_weight_attrs.pop("weight_loader")
            weight = ModelWeightParameter(data=_empty_storage(),
                                          input_dim=1,
                                          output_dim=0,
                                          weight_loader=loader)
    except torch.cuda.OutOfMemoryError as oom:
        logger.error("Failed to create unquantized linear weights: %s", oom)
        if torch.cuda.is_available():
            logger.debug("CUDA device: %s", torch.cuda.current_device())
            memory_probes = (
                ("Allocated: %.2f GiB", torch.cuda.memory_allocated),
                ("Reserved: %.2f GiB", torch.cuda.memory_reserved),
            )
            for message, probe in memory_probes:
                logger.debug(message, probe() / GiB_bytes)
        raise RuntimeError(
            "Failed to create unquantized linear weights. This may be "
            "caused by insufficient memory to allocate the weight.") from oom

    if use_torchair:
        # A plain Parameter carries no dim metadata of its own.
        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})

    layer.register_parameter("weight", weight)
    set_weight_attrs(weight, extra_weight_attrs)
|
||||
|
||||
|
||||
# Install the Ascend-specific ``create_weights`` on vLLM's
# ``UnquantizedLinearMethod`` for every vLLM version other than 0.10.2.
# The import is kept inside the branch so that it only runs when the patch
# is actually applied.
_is_vllm_0_10_2 = vllm_version_is("0.10.2")
if not _is_vllm_0_10_2:
    from vllm.model_executor.layers.linear import UnquantizedLinearMethod

    UnquantizedLinearMethod.create_weights = create_weights
|
||||
@@ -28,6 +28,20 @@ from vllm_ascend.worker.worker_v1 import NPUWorker
|
||||
class NPUTorchairWorker(NPUWorker):
|
||||
"""Torchair worker bases on NPUWorker. Only torchair specified code should be added in this class."""
|
||||
|
||||
def __init__(self,
             vllm_config,
             local_rank,
             rank,
             distributed_init_method,
             is_driver_worker=False,
             **kwargs):
    """Initialise the worker and drop the unquantized method from weight loader v2.

    All arguments are forwarded unchanged (positionally, plus ``kwargs``)
    to the parent ``__init__``. Afterwards the entry
    ``"UnquantizedLinearMethod"`` is removed from vLLM's
    ``WEIGHT_LOADER_V2_SUPPORTED`` if it is present.

    The removal mutates the object imported from
    ``vllm.model_executor.layers.linear`` in place, so it is visible to
    every other user of that object in the process.
    NOTE(review): presumably this makes unquantized linear layers skip the
    v2 weight loader under torchair - confirm against vLLM.
    """
    super().__init__(vllm_config, local_rank, rank,
                     distributed_init_method, is_driver_worker, **kwargs)

    from vllm.model_executor.layers.linear import \
        WEIGHT_LOADER_V2_SUPPORTED

    unquantized_method = "UnquantizedLinearMethod"
    already_removed = unquantized_method not in WEIGHT_LOADER_V2_SUPPORTED
    if not already_removed:
        WEIGHT_LOADER_V2_SUPPORTED.remove(unquantized_method)
|
||||
|
||||
def determine_available_memory(self) -> int:
|
||||
"""Override determine_available_memory to use cached torchair kv_cache_bytes."""
|
||||
|
||||
|
||||
@@ -304,17 +304,24 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
(self.max_num_tokens, self.model_config.get_hidden_size()),
|
||||
dtype=self.dtype,
|
||||
device=self.device)
|
||||
|
||||
# Set up Attention
|
||||
self.attn_backend = get_attn_backend(
|
||||
0,
|
||||
self.dtype,
|
||||
None,
|
||||
self.block_size,
|
||||
self.model_config.is_attention_free,
|
||||
use_mla=self.model_config.use_mla,
|
||||
)
|
||||
|
||||
if vllm_version_is("0.10.2"):
|
||||
self.attn_backend = get_attn_backend(
|
||||
0,
|
||||
self.dtype,
|
||||
None,
|
||||
self.block_size,
|
||||
self.model_config.is_attention_free,
|
||||
use_mla=self.model_config.use_mla,
|
||||
)
|
||||
else:
|
||||
self.attn_backend = get_attn_backend(
|
||||
0,
|
||||
self.dtype,
|
||||
None,
|
||||
self.block_size,
|
||||
use_mla=self.model_config.use_mla,
|
||||
)
|
||||
if torch.version.cann.startswith("8.3"):
|
||||
self.attn_mask_builder = AttentionMaskBuilder(
|
||||
self.scheduler_config.max_num_batched_tokens, self.dtype,
|
||||
|
||||
Reference in New Issue
Block a user