Upgrade vLLM to v0.10.0 (#1927)
### What this PR does / why we need it? - Upgrade to v0.10.0 - Drop v0.9.2 version compatibility - Add patch for `vllm_ascend/patch/worker/patch_common/patch_sampler_gather_logprobs.py` as workaround of f3a683b7c9 for v0.10.0 and also add e2e test `test_models_prompt_logprobs` - Pin transformers<4.54.0 as workaround of https://github.com/vllm-project/vllm-ascend/issues/2034 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - Test locally: `VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/singlecard/test_offline_inference.py::test_models_prompt_logprobs` - CI passed - vLLM version: v0.9.2 - vLLM main: 7728dd77bb --------- Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
This commit is contained in:
4
.github/workflows/accuracy_test.yaml
vendored
4
.github/workflows/accuracy_test.yaml
vendored
@@ -37,7 +37,7 @@ on:
|
||||
# Current supported vLLM versions
|
||||
options:
|
||||
- main
|
||||
- v0.9.2
|
||||
- v0.10.0
|
||||
- v0.9.1
|
||||
- v0.7.3
|
||||
vllm-ascend-version:
|
||||
@@ -163,7 +163,7 @@ jobs:
|
||||
repository: vllm-project/vllm
|
||||
path: ./vllm-empty
|
||||
# Please also update this when bump matched version
|
||||
ref: ${{ github.event.inputs.vllm-version || 'v0.9.2' }}
|
||||
ref: ${{ github.event.inputs.vllm-version || 'v0.10.0' }}
|
||||
|
||||
- name: Install vllm-project/vllm from source
|
||||
working-directory: ./vllm-empty
|
||||
|
||||
2
.github/workflows/nightly_benchmarks.yaml
vendored
2
.github/workflows/nightly_benchmarks.yaml
vendored
@@ -51,7 +51,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- vllm_branch: v0.9.2
|
||||
- vllm_branch: v0.10.0
|
||||
vllm_ascend_branch: main
|
||||
vllm_use_v1: 1
|
||||
max-parallel: 1
|
||||
|
||||
6
.github/workflows/vllm_ascend_test.yaml
vendored
6
.github/workflows/vllm_ascend_test.yaml
vendored
@@ -81,7 +81,7 @@ jobs:
|
||||
VLLM_USE_MODELSCOPE: True
|
||||
strategy:
|
||||
matrix:
|
||||
vllm_version: [main, v0.9.2]
|
||||
vllm_version: [main, v0.10.0]
|
||||
steps:
|
||||
- name: Install packages
|
||||
run: |
|
||||
@@ -137,7 +137,7 @@ jobs:
|
||||
max-parallel: 2
|
||||
matrix:
|
||||
os: [linux-arm64-npu-1]
|
||||
vllm_version: [main, v0.9.2]
|
||||
vllm_version: [main, v0.10.0]
|
||||
name: singlecard e2e test
|
||||
runs-on: ${{ matrix.os }}
|
||||
container:
|
||||
@@ -216,7 +216,7 @@ jobs:
|
||||
max-parallel: 1
|
||||
matrix:
|
||||
os: [linux-arm64-npu-4]
|
||||
vllm_version: [main, v0.9.2]
|
||||
vllm_version: [main, v0.10.0]
|
||||
name: multicard e2e test
|
||||
runs-on: ${{ matrix.os }}
|
||||
container:
|
||||
|
||||
@@ -43,7 +43,7 @@ jobs:
|
||||
max-parallel: 2
|
||||
matrix:
|
||||
os: [linux-arm64-npu-1, linux-arm64-npu-4]
|
||||
vllm_version: [main, v0.9.2]
|
||||
vllm_version: [main, v0.10.0]
|
||||
name: vLLM Ascend long term test
|
||||
runs-on: ${{ matrix.os }}
|
||||
container:
|
||||
|
||||
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.9.2
|
||||
ARG VLLM_TAG=v0.10.0
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
|
||||
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.9.2
|
||||
ARG VLLM_TAG=v0.10.0
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
|
||||
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.9.2
|
||||
ARG VLLM_TAG=v0.10.0
|
||||
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
|
||||
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.9.2
|
||||
ARG VLLM_TAG=v0.10.0
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
|
||||
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.9.2
|
||||
ARG VLLM_TAG=v0.10.0
|
||||
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
|
||||
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.9.2
|
||||
ARG VLLM_TAG=v0.10.0
|
||||
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
|
||||
@@ -77,7 +77,7 @@ myst_substitutions = {
|
||||
# CANN image tag
|
||||
'cann_image_tag': "8.1.rc1-910b-ubuntu22.04-py3.10",
|
||||
# vllm version in ci
|
||||
'ci_vllm_version': 'v0.9.2',
|
||||
'ci_vllm_version': 'v0.10.0',
|
||||
}
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
|
||||
@@ -38,15 +38,15 @@ vllm_ascend
|
||||
|
||||
In both the **platform** and **worker** folders, there are several patch modules. They are used for patching different versions of vLLM.
|
||||
|
||||
- `patch_0_9_2`: This module is used for patching vLLM 0.9.2. The version is always the nearest version of vLLM. Once vLLM is released, we will drop this patch module and bump to a new version. For example, `patch_0_9_2` is used for patching vLLM 0.9.2.
|
||||
- `patch_0_10_0`: This module is used for patching vLLM 0.10.0. The version is always the nearest released version of vLLM. Once a new vLLM version is released, we will drop this patch module and bump to the new version. For example, `patch_0_10_0` is used for patching vLLM 0.10.0.
|
||||
- `patch_main`: This module is used for patching the code in vLLM main branch.
|
||||
- `patch_common`: This module is used for patching both vLLM 0.9.2 and vLLM main branch.
|
||||
- `patch_common`: This module is used for patching both vLLM 0.10.0 and vLLM main branch.
|
||||
|
||||
## How to write a patch
|
||||
|
||||
Before writing a patch, following the principle above, we should patch as little code as possible. If it's necessary, we can patch the code in either the **platform** or the **worker** folder. Here is an example of patching the `distributed` module in vLLM.
|
||||
|
||||
1. Decide which version of vLLM we should patch. For example, after analysis, here we want to patch both 0.9.2 and main of vLLM.
|
||||
1. Decide which version of vLLM we should patch. For example, after analysis, here we want to patch both 0.10.0 and main of vLLM.
|
||||
2. Decide which process we should patch. For example, here `distributed` belongs to the vLLM main process, so we should patch `platform`.
|
||||
3. Create the patch file in the right folder. The file should be named as `patch_{module_name}.py`. The example here is `vllm_ascend/patch/platform/patch_common/patch_distributed.py`.
|
||||
4. Write your patch code in the new file. Here is an example:
|
||||
@@ -82,4 +82,4 @@ Before writing a patch, following the principle above, we should patch the least
|
||||
|
||||
## Limitation
|
||||
1. In V1 Engine, vLLM starts three kinds of processes: the Main process, the EngineCore process and the Worker process. Currently vLLM Ascend only supports patching the code in the Main process and the Worker process by default. If you want to patch code that runs in the EngineCore process, you should patch the EngineCore process entirely during setup; the entry code is in `vllm.v1.engine.core`. Please override `EngineCoreProc` and `DPEngineCoreProc` entirely.
|
||||
2. If you are running an edited vLLM code, the version of the vLLM may be changed automatically. For example, if you runs an edited vLLM based on v0.9.n, the version of vLLM may be change to v0.9.nxxx, in this case, the patch for v0.9.n in vLLM Ascend would not work as expect, because that vLLM Ascend can't distinguish the version of vLLM you're using. In this case, you can set the environment variable `VLLM_VERSION` to specify the version of vLLM you're using, then the patch for v0.9.2 should work.
|
||||
2. If you are running an edited vLLM code base, the version of vLLM may be changed automatically. For example, if you run an edited vLLM based on v0.10.n, the version of vLLM may be changed to v0.10.nxxx. In this case, the patch for v0.10.n in vLLM Ascend would not work as expected, because vLLM Ascend can't distinguish the version of vLLM you're using. You can then set the environment variable `VLLM_VERSION` to specify the version of vLLM you're using, so that the patch for v0.10.0 works.
|
||||
|
||||
@@ -19,6 +19,8 @@ requires = [
|
||||
"msgpack",
|
||||
"quart",
|
||||
"numba",
|
||||
# Remove after https://github.com/vllm-project/vllm-ascend/issues/2034
|
||||
"transformers<4.54.0",
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
|
||||
@@ -13,6 +13,8 @@ setuptools-scm>=8
|
||||
torch>=2.5.1
|
||||
torchvision<0.21.0
|
||||
wheel
|
||||
# Remove after https://github.com/vllm-project/vllm-ascend/issues/2034
|
||||
transformers<4.54.0
|
||||
|
||||
# requirements for disaggregated prefill
|
||||
msgpack
|
||||
|
||||
@@ -127,3 +127,19 @@ def test_models_topk() -> None:
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
|
||||
def test_models_prompt_logprobs() -> None:
    """Smoke-test greedy generation with logprobs on a small Qwen model.

    Runs a single prompt through ``generate_greedy_logprobs`` asking for one
    logprob per token; the test passes as long as the call does not raise.
    """
    prompts = ["Hello, my name is"]

    # Same engine settings as the other single-card tests in this module.
    runner_kwargs = dict(
        max_model_len=8192,
        dtype="float16",
        enforce_eager=True,
        gpu_memory_utilization=0.7,
    )

    with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct",
                    **runner_kwargs) as vllm_model:
        vllm_model.generate_greedy_logprobs(
            prompts,
            max_tokens=5,
            num_logprobs=1,
        )
|
||||
|
||||
@@ -3,15 +3,12 @@ from unittest.mock import MagicMock, patch
|
||||
import torch
|
||||
|
||||
from tests.ut.base import TestBase
|
||||
from vllm_ascend.attention.attention_v1 import \
|
||||
AscendAttentionBackendImpl092 # isort: skip
|
||||
from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend,
|
||||
AscendAttentionBackendImpl,
|
||||
AscendAttentionMetadataBuilder,
|
||||
AscendAttentionState,
|
||||
AscendMetadata,
|
||||
CommonAttentionState)
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
class TestAscendAttentionBackend(TestBase):
|
||||
@@ -20,12 +17,8 @@ class TestAscendAttentionBackend(TestBase):
|
||||
self.assertEqual(AscendAttentionBackend.get_name(), "ASCEND")
|
||||
|
||||
def test_get_impl_cls(self):
|
||||
if vllm_version_is("0.9.2"):
|
||||
self.assertEqual(AscendAttentionBackend.get_impl_cls(),
|
||||
AscendAttentionBackendImpl092)
|
||||
else:
|
||||
self.assertEqual(AscendAttentionBackend.get_impl_cls(),
|
||||
AscendAttentionBackendImpl)
|
||||
self.assertEqual(AscendAttentionBackend.get_impl_cls(),
|
||||
AscendAttentionBackendImpl)
|
||||
|
||||
def test_get_metadata_cls(self):
|
||||
self.assertEqual(AscendAttentionBackend.get_metadata_cls(),
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type
|
||||
from typing import List, Optional, Tuple, Type
|
||||
|
||||
import torch
|
||||
import torch_npu
|
||||
@@ -31,7 +31,7 @@ from vllm.v1.worker.gpu_input_batch import InputBatch
|
||||
|
||||
from vllm_ascend.ops.attention import vanilla_chunked_prefill
|
||||
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
|
||||
nd_to_nz_2d, nd_to_nz_spec, vllm_version_is)
|
||||
nd_to_nz_2d, nd_to_nz_spec)
|
||||
|
||||
|
||||
class AscendAttentionBackend(AttentionBackend):
|
||||
@@ -43,8 +43,6 @@ class AscendAttentionBackend(AttentionBackend):
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls() -> Type["AscendAttentionBackendImpl"]:
|
||||
if vllm_version_is("0.9.2"):
|
||||
return AscendAttentionBackendImpl092
|
||||
return AscendAttentionBackendImpl
|
||||
|
||||
@staticmethod
|
||||
@@ -440,38 +438,6 @@ class AscendAttentionBackendImpl(AttentionImpl):
|
||||
return output.view(num_tokens, self.hidden_size)
|
||||
|
||||
|
||||
class AscendAttentionBackendImpl092(AscendAttentionBackendImpl):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
scale: float,
|
||||
num_kv_heads: int,
|
||||
alibi_slopes: Optional[List[float]],
|
||||
sliding_window: Optional[int],
|
||||
kv_cache_dtype: str,
|
||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||
logits_soft_cap: Optional[float] = None,
|
||||
attn_type: str = AttentionType.DECODER,
|
||||
kv_sharing_target_layer_name: Optional[str] = None,
|
||||
use_irope: bool = False,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
num_heads=num_heads,
|
||||
head_size=head_size,
|
||||
scale=scale,
|
||||
num_kv_heads=num_kv_heads,
|
||||
alibi_slopes=alibi_slopes,
|
||||
sliding_window=sliding_window,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
logits_soft_cap=logits_soft_cap,
|
||||
attn_type=attn_type,
|
||||
kv_sharing_target_layer_name=kv_sharing_target_layer_name,
|
||||
use_irope=use_irope,
|
||||
)
|
||||
|
||||
|
||||
def unified_ascend_attention_with_output(
|
||||
query: torch.Tensor,
|
||||
key: torch.Tensor,
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
#
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Tuple, Type
|
||||
from typing import List, Optional, Tuple, Type
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -29,7 +29,7 @@ from vllm.v1.worker.gpu_input_batch import InputBatch
|
||||
|
||||
from vllm_ascend.attention.attention_v1 import AscendAttentionState
|
||||
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
|
||||
nd_to_nz_2d, vllm_version_is)
|
||||
nd_to_nz_2d)
|
||||
|
||||
|
||||
class AscendAttentionTorchairBackend(AttentionBackend):
|
||||
@@ -41,8 +41,6 @@ class AscendAttentionTorchairBackend(AttentionBackend):
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls() -> Type["AscendAttentionTorchairBackendImpl"]:
|
||||
if vllm_version_is("0.9.2"):
|
||||
return AscendAttentionTorchairBackendImpl092
|
||||
return AscendAttentionTorchairBackendImpl
|
||||
|
||||
@staticmethod
|
||||
@@ -489,36 +487,3 @@ class AscendAttentionTorchairBackendImpl(AttentionImpl):
|
||||
"to use ascend scheduler.")
|
||||
|
||||
return output.view(num_tokens, self.hidden_size)
|
||||
|
||||
|
||||
class AscendAttentionTorchairBackendImpl092(AscendAttentionTorchairBackendImpl
|
||||
):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
scale: float,
|
||||
num_kv_heads: int,
|
||||
alibi_slopes: Optional[List[float]],
|
||||
sliding_window: Optional[int],
|
||||
kv_cache_dtype: str,
|
||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||
logits_soft_cap: Optional[float] = None,
|
||||
attn_type: str = AttentionType.DECODER,
|
||||
kv_sharing_target_layer_name: Optional[str] = None,
|
||||
use_irope: bool = False,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
num_heads=num_heads,
|
||||
head_size=head_size,
|
||||
scale=scale,
|
||||
num_kv_heads=num_kv_heads,
|
||||
alibi_slopes=alibi_slopes,
|
||||
sliding_window=sliding_window,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
logits_soft_cap=logits_soft_cap,
|
||||
attn_type=attn_type,
|
||||
kv_sharing_target_layer_name=kv_sharing_target_layer_name,
|
||||
use_irope=use_irope,
|
||||
)
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type,
|
||||
TypeVar)
|
||||
from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch_npu
|
||||
from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer,
|
||||
AttentionMetadata, AttentionType,
|
||||
AttentionMetadata,
|
||||
MLAAttentionImpl)
|
||||
from vllm.attention.backends.utils import PAD_SLOT_ID
|
||||
from vllm.config import get_current_vllm_config
|
||||
@@ -22,7 +21,7 @@ from vllm_ascend.multistream.context import get_multistream_comm_context
|
||||
from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
|
||||
from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
|
||||
from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
|
||||
from vllm_ascend.utils import npu_prefetch, vllm_version_is
|
||||
from vllm_ascend.utils import npu_prefetch
|
||||
from vllm_ascend.worker.npu_input_batch import InputBatch
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -54,8 +53,6 @@ class AscendMLABackend(AttentionBackend):
|
||||
|
||||
@staticmethod
|
||||
def get_impl_cls() -> Type["MLAAttentionImpl"]:
|
||||
if vllm_version_is("0.9.2"):
|
||||
return AscendMLAImpl092
|
||||
return AscendMLAImpl
|
||||
|
||||
|
||||
@@ -1212,34 +1209,3 @@ class AscendMLAImpl(MLAAttentionImpl):
|
||||
output[:num_decode_tokens] = output_decode
|
||||
|
||||
return output_padded
|
||||
|
||||
|
||||
class AscendMLAImpl092(AscendMLAImpl):
|
||||
|
||||
def __init__(self,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
scale: float,
|
||||
num_kv_heads: int,
|
||||
alibi_slopes: Optional[List[float]],
|
||||
sliding_window: Optional[int],
|
||||
kv_cache_dtype: str,
|
||||
blocksparse_params: Optional[Dict[str, Any]] = None,
|
||||
logits_soft_cap: Optional[float] = None,
|
||||
attn_type: str = AttentionType.DECODER,
|
||||
kv_sharing_target_layer_name: Optional[str] = None,
|
||||
use_irope: bool = False,
|
||||
**kwargs) -> None:
|
||||
super().__init__(
|
||||
num_heads=num_heads,
|
||||
head_size=head_size,
|
||||
scale=scale,
|
||||
num_kv_heads=num_kv_heads,
|
||||
alibi_slopes=alibi_slopes,
|
||||
sliding_window=sliding_window,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
logits_soft_cap=logits_soft_cap,
|
||||
attn_type=attn_type,
|
||||
kv_sharing_target_layer_name=kv_sharing_target_layer_name,
|
||||
use_irope=use_irope,
|
||||
**kwargs)
|
||||
|
||||
@@ -32,8 +32,6 @@ from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
|
||||
class AscendScheduler(Scheduler):
|
||||
"""This Scheduler extends vllm's original v1 scheduler
|
||||
@@ -283,23 +281,12 @@ class AscendScheduler(Scheduler):
|
||||
# allow the lower-priority requests to be scheduled.
|
||||
req_index += 1
|
||||
continue
|
||||
if vllm_version_is("0.9.2"):
|
||||
num_draft_tokens = max(
|
||||
num_new_tokens + request.num_computed_tokens -
|
||||
request.num_tokens, 0)
|
||||
|
||||
while True:
|
||||
if vllm_version_is("0.9.2"):
|
||||
new_blocks = self.kv_cache_manager.allocate_slots(
|
||||
request,
|
||||
num_new_tokens,
|
||||
num_draft_tokens=num_draft_tokens,
|
||||
num_lookahead_tokens=self.num_lookahead_tokens)
|
||||
else:
|
||||
new_blocks = self.kv_cache_manager.allocate_slots(
|
||||
request,
|
||||
num_new_tokens,
|
||||
num_lookahead_tokens=self.num_lookahead_tokens)
|
||||
new_blocks = self.kv_cache_manager.allocate_slots(
|
||||
request,
|
||||
num_new_tokens,
|
||||
num_lookahead_tokens=self.num_lookahead_tokens)
|
||||
if new_blocks is None:
|
||||
# The request cannot be scheduled.
|
||||
# Preempt the lowest-priority request.
|
||||
|
||||
@@ -24,9 +24,9 @@
|
||||
# each worker's `__init__` function.
|
||||
#
|
||||
# Then in each kind of patch, there are three folders:
|
||||
# - patch_0_9_2: contains the patches applied when vllm version is 0.9.2.
|
||||
# - patch_0_10_0: contains the patches applied when vllm version is 0.10.0.
|
||||
# - patch_main: contains the patches applied when vllm version is main branch.
|
||||
# - patch_common: contains the patches applied in both 0.9.2 and main branch.
|
||||
# - patch_common: contains the patches applied in both 0.10.0 and main branch.
|
||||
#
|
||||
# Once a new patch is added in vllm-ascend, please add the patch description into this file as well.
|
||||
# ----------------------------------------------------------------------------------
|
||||
@@ -101,3 +101,16 @@
|
||||
# - https://github.com/vllm-project/vllm-ascend/pull/1732
|
||||
# Future Plan:
|
||||
# Revert it when the ascend scatter performance improves.
|
||||
#
|
||||
# ** File: worker/patch_0_10_0/patch_sampler_gather_logprobs.py **
|
||||
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
# 1. `vllm.v1.sample.sampler.Sampler.gather_logprobs`
|
||||
# Why:
|
||||
# We need to patch gather_logprobs to make sure it calls batched_count_greater_than
|
||||
# with backend=current_platform.simple_compile_backend
|
||||
# How:
|
||||
# Patch gather_logprobs to call the new batched_count_greater_than
|
||||
# Related PR (if no, explain why):
|
||||
# - https://github.com/vllm-project/vllm/pull/21591
|
||||
# Future Plan:
|
||||
# Revert it when vLLM merges #21591 and releases a new version
|
||||
|
||||
@@ -17,8 +17,8 @@
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
# Import specific patches for different versions
|
||||
if vllm_version_is("0.9.2"):
|
||||
from vllm_ascend.patch.platform import patch_0_9_2 # noqa: F401
|
||||
if vllm_version_is("0.10.0"):
|
||||
from vllm_ascend.patch.platform import patch_0_10_0 # noqa: F401
|
||||
from vllm_ascend.patch.platform import patch_common # noqa: F401
|
||||
else:
|
||||
from vllm_ascend.patch.platform import patch_common # noqa: F401
|
||||
|
||||
@@ -18,8 +18,8 @@
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
# Import specific patches for different versions
|
||||
if vllm_version_is("0.9.2"):
|
||||
from vllm_ascend.patch.worker import patch_0_9_2 # noqa: F401
|
||||
if vllm_version_is("0.10.0"):
|
||||
from vllm_ascend.patch.worker import patch_0_10_0 # noqa: F401
|
||||
from vllm_ascend.patch.worker import patch_common # noqa: F401
|
||||
else:
|
||||
from vllm_ascend.patch.worker import patch_common # noqa: F401
|
||||
|
||||
@@ -14,3 +14,5 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import vllm_ascend.patch.worker.patch_0_10_0.patch_sampler_gather_logprobs # noqa
|
||||
@@ -0,0 +1,87 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import torch
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.v1.outputs import LogprobsTensors
|
||||
from vllm.v1.sample.sampler import Sampler
|
||||
|
||||
|
||||
@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
def batched_count_greater_than(x: torch.Tensor,
                               values: torch.Tensor) -> torch.Tensor:
    """
    Counts elements in each row of x that are greater than or equal to the
    corresponding value in values. Despite the function name, the comparison
    is inclusive (``>=``), so an element equal to the reference value is
    counted as well.

    Use torch.compile to generate an optimized kernel for this function.
    Otherwise, it will create additional copies of the input tensors and
    cause memory issues.

    Args:
        x (torch.Tensor): A 2D tensor of shape (batch_size, n_elements).
        values (torch.Tensor): A 2D tensor of shape (batch_size, 1).

    Returns:
        torch.Tensor: A 1D tensor of shape (batch_size,) with the counts.
    """
    # `values` is compared against every element of the matching row; the
    # boolean result is summed along the last dimension to get the count.
    return (x >= values).sum(-1)
|
||||
|
||||
|
||||
def gather_logprobs(
    self,
    logprobs: torch.Tensor,
    num_logprobs: int,
    token_ids: torch.Tensor,
) -> LogprobsTensors:
    """
    Collect the top-k logprobs together with the logprob and rank of the
    sampled (or prompt) token.

    Args:
      logprobs: (num tokens) x (vocab) tensor
      num_logprobs: minimum number of logprobs to retain per token
      token_ids: prompt tokens (if prompt logprobs) or sampled tokens
                 (if sampled logprobs); 1D token ID tensor with
                 (num tokens) elements. Must be int64.

    Returns:
      Top-k int indices tensor, (num tokens) x (num_logprobs + 1)
      Top-k float logprobs tensor, (num tokens) x (num_logprobs + 1)
      Sampled token rank tensor, (num tokens)
    """
    assert token_ids.dtype == torch.int64

    # Best `num_logprobs` entries of every row.
    best_logprobs, best_ids = torch.topk(logprobs, num_logprobs, dim=-1)

    # Logprob of the token that was actually sampled / given in the prompt.
    chosen_ids = token_ids.unsqueeze(-1)
    chosen_logprobs = logprobs.gather(-1, chosen_ids)

    # Rank of the chosen token: how many vocab entries score at least as high.
    ranks = batched_count_greater_than(logprobs, chosen_logprobs)

    # The chosen token goes in column 0, followed by the top-k columns.
    # Indices are narrowed to int32 to reduce the tensor size.
    merged_ids = torch.cat((chosen_ids, best_ids), dim=1).to(torch.int32)
    merged_logprobs = torch.cat((chosen_logprobs, best_logprobs), dim=1)

    return LogprobsTensors(merged_ids, merged_logprobs, ranks)
|
||||
|
||||
|
||||
# Monkey-patch vLLM's Sampler so that it uses the gather_logprobs defined in
# this module; the assignment takes effect as soon as this module is imported.
Sampler.gather_logprobs = gather_logprobs
|
||||
@@ -45,8 +45,9 @@ from vllm.logger import logger
|
||||
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||
from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
|
||||
from vllm.model_executor.model_loader import get_model
|
||||
from vllm.model_executor.models.interfaces_base import (VllmModelForPooling,
|
||||
is_pooling_model)
|
||||
from vllm.model_executor.models.interfaces import supports_transcription
|
||||
from vllm.model_executor.models.interfaces_base import (
|
||||
VllmModelForPooling, is_pooling_model, is_text_generation_model)
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
|
||||
from vllm.multimodal.utils import group_mm_inputs_by_modality
|
||||
@@ -66,7 +67,7 @@ from vllm.v1.sample.sampler import Sampler
|
||||
from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
|
||||
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
|
||||
from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
|
||||
from vllm.v1.worker.utils import (gather_mm_placeholders,
|
||||
from vllm.v1.worker.utils import (bind_kv_cache, gather_mm_placeholders,
|
||||
sanity_check_mm_encoder_outputs,
|
||||
scatter_mm_placeholders)
|
||||
|
||||
@@ -88,15 +89,8 @@ from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
|
||||
from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
|
||||
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
|
||||
|
||||
if vllm_version_is("0.9.2"):
|
||||
from vllm.model_executor.models.interfaces import has_step_pooler
|
||||
from vllm.v1.utils import bind_kv_cache
|
||||
else:
|
||||
from vllm.model_executor.models.interfaces import supports_transcription
|
||||
from vllm.model_executor.models.interfaces_base import \
|
||||
is_text_generation_model
|
||||
if not vllm_version_is("0.10.0"):
|
||||
from vllm.tasks import GenerationTask, SupportedTask
|
||||
from vllm.v1.worker.utils import bind_kv_cache
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import xgrammar as xgr # type: ignore[import-untyped]
|
||||
@@ -409,7 +403,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
else:
|
||||
generator = None
|
||||
|
||||
if not vllm_version_is("0.9.2") and pooling_params:
|
||||
if pooling_params:
|
||||
assert (task := pooling_params.task) is not None, (
|
||||
"You did not set `task` in the API")
|
||||
model = cast(VllmModelForPooling, self.model)
|
||||
@@ -585,10 +579,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
# OPTIMIZATION: Start copying the block table first.
|
||||
# This way, we can overlap the copy with the following CPU operations.
|
||||
if vllm_version_is("0.9.2"):
|
||||
self.input_batch.block_table.commit(num_reqs)
|
||||
else:
|
||||
self.input_batch.block_table.commit_block_table(num_reqs)
|
||||
self.input_batch.block_table.commit_block_table(num_reqs)
|
||||
|
||||
# Get the number of scheduled tokens for each request.
|
||||
req_ids = self.input_batch.req_ids
|
||||
@@ -939,10 +930,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
# OPTIMIZATION: Start copying the block table first.
|
||||
# This way, we can overlap the copy with the following CPU operations.
|
||||
if vllm_version_is("0.9.2"):
|
||||
self.input_batch.block_table.commit(num_reqs)
|
||||
else:
|
||||
self.input_batch.block_table.commit_block_table(num_reqs)
|
||||
self.input_batch.block_table.commit_block_table(num_reqs)
|
||||
|
||||
# Get the number of scheduled tokens for each request.
|
||||
# TODO: The Python loop can be slow. Optimize.
|
||||
@@ -1771,57 +1759,33 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
|
||||
req_num_tokens = num_tokens // num_reqs
|
||||
|
||||
if vllm_version_is("0.9.2"):
|
||||
dummy_metadata = PoolingMetadata(
|
||||
prompt_lens=torch.tensor(
|
||||
[h.shape[0] for h in hidden_states_list],
|
||||
device=self.device),
|
||||
prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
|
||||
dtype=torch.int32,
|
||||
device=self.device),
|
||||
pooling_params=[PoolingParams()] * num_reqs)
|
||||
try:
|
||||
pooler_output = self.model.pooler(
|
||||
hidden_states=hidden_states_list,
|
||||
pooling_metadata=dummy_metadata)
|
||||
except RuntimeError as e:
|
||||
if 'out of memory' in str(e):
|
||||
raise RuntimeError(
|
||||
"NPU out of memory occurred when warming up pooler with "
|
||||
f"{num_reqs} dummy requests. Please try lowering "
|
||||
"`max_num_seqs` or `gpu_memory_utilization` when "
|
||||
"initializing the engine.") from e
|
||||
else:
|
||||
raise e
|
||||
else:
|
||||
model = cast(VllmModelForPooling, self.model)
|
||||
dummy_task = self.get_supported_pooling_tasks()[0]
|
||||
dummy_pooling_params = PoolingParams(task=dummy_task)
|
||||
model = cast(VllmModelForPooling, self.model)
|
||||
dummy_task = self.get_supported_pooling_tasks()[0]
|
||||
dummy_pooling_params = PoolingParams(task=dummy_task)
|
||||
|
||||
to_update = model.pooler.get_pooling_updates(dummy_task)
|
||||
to_update.apply(dummy_pooling_params)
|
||||
to_update = model.pooler.get_pooling_updates(dummy_task)
|
||||
to_update.apply(dummy_pooling_params)
|
||||
|
||||
dummy_metadata = PoolingMetadata(
|
||||
prompt_lens=torch.tensor(
|
||||
[h.shape[0] for h in hidden_states_list],
|
||||
device=self.device),
|
||||
prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
|
||||
dtype=torch.int32,
|
||||
device=self.device),
|
||||
pooling_params=[dummy_pooling_params] * num_reqs)
|
||||
dummy_metadata = PoolingMetadata(
|
||||
prompt_lens=torch.tensor([h.shape[0] for h in hidden_states_list],
|
||||
device=self.device),
|
||||
prompt_token_ids=torch.zeros((num_reqs, req_num_tokens),
|
||||
dtype=torch.int32,
|
||||
device=self.device),
|
||||
pooling_params=[dummy_pooling_params] * num_reqs)
|
||||
|
||||
try:
|
||||
pooler_output = model.pooler(hidden_states=hidden_states_list,
|
||||
pooling_metadata=dummy_metadata)
|
||||
except RuntimeError as e:
|
||||
if 'out of memory' in str(e):
|
||||
raise RuntimeError(
|
||||
"NPU out of memory occurred when warming up pooler with "
|
||||
f"{num_reqs} dummy requests. Please try lowering "
|
||||
"`max_num_seqs` or `gpu_memory_utilization` when "
|
||||
"initializing the engine.") from e
|
||||
else:
|
||||
raise e
|
||||
try:
|
||||
pooler_output = model.pooler(hidden_states=hidden_states_list,
|
||||
pooling_metadata=dummy_metadata)
|
||||
except RuntimeError as e:
|
||||
if 'out of memory' in str(e):
|
||||
raise RuntimeError(
|
||||
"NPU out of memory occurred when warming up pooler with "
|
||||
f"{num_reqs} dummy requests. Please try lowering "
|
||||
"`max_num_seqs` or `gpu_memory_utilization` when "
|
||||
"initializing the engine.") from e
|
||||
else:
|
||||
raise e
|
||||
|
||||
return pooler_output
|
||||
|
||||
@@ -1841,9 +1805,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
||||
QKVParallelLinear, RowParallelLinear)):
|
||||
module.weight.data = torch_npu.npu_format_cast(
|
||||
module.weight.data, ACL_FORMAT_FRACTAL_NZ)
|
||||
|
||||
if vllm_version_is("0.9.2") and has_step_pooler(self.model):
|
||||
self.input_batch.logits_processing_needs_token_ids_bool = True
|
||||
if self.drafter:
|
||||
logger.info("Loading drafter model...")
|
||||
if isinstance(self.drafter, EagleProposer):
|
||||
|
||||
@@ -35,8 +35,6 @@ from vllm.v1.spec_decode.utils import is_spec_decode_unsupported
|
||||
from vllm.v1.utils import copy_slice
|
||||
from vllm.v1.worker.block_table import MultiGroupBlockTable
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
_SAMPLING_EPS = 1e-5
|
||||
|
||||
|
||||
@@ -246,11 +244,8 @@ class InputBatch:
|
||||
|
||||
# req_index -> bad_words_token_ids
|
||||
self.bad_words_token_ids: dict[int, list[list[int]]] = {}
|
||||
if vllm_version_is("0.9.2"):
|
||||
self.logits_processing_needs_token_ids_bool = False
|
||||
else:
|
||||
self.logits_processing_needs_token_ids = np.zeros(max_num_reqs,
|
||||
dtype=bool)
|
||||
self.logits_processing_needs_token_ids = np.zeros(max_num_reqs,
|
||||
dtype=bool)
|
||||
|
||||
self.req_output_token_ids: list[Optional[list[int]]] = []
|
||||
|
||||
@@ -387,9 +382,6 @@ class InputBatch:
|
||||
if sampling_params.bad_words_token_ids:
|
||||
self.bad_words_token_ids[
|
||||
req_index] = sampling_params.bad_words_token_ids
|
||||
elif vllm_version_is("0.9.2"):
|
||||
assert request.pooling_params is not None
|
||||
self.pooling_params[req_id] = request.pooling_params
|
||||
elif pooling_params := request.pooling_params:
|
||||
self.pooling_params[req_id] = pooling_params
|
||||
self.logits_processing_needs_token_ids[req_index] = (
|
||||
@@ -624,15 +616,10 @@ class InputBatch:
|
||||
self.presence_penalties, num_reqs)
|
||||
copy_slice(self.repetition_penalties_cpu_tensor,
|
||||
self.repetition_penalties, num_reqs)
|
||||
if vllm_version_is("0.9.2"):
|
||||
needs_prompt_token_ids = (
|
||||
not self.no_penalties
|
||||
or (self.num_reqs > 0
|
||||
and self.logits_processing_needs_token_ids_bool))
|
||||
else:
|
||||
needs_prompt_token_ids = (
|
||||
not self.no_penalties
|
||||
or self.logits_processing_needs_token_ids[:num_reqs].any())
|
||||
|
||||
needs_prompt_token_ids = (
|
||||
not self.no_penalties
|
||||
or self.logits_processing_needs_token_ids[:num_reqs].any())
|
||||
if needs_prompt_token_ids:
|
||||
# The prompt tokens are used only for applying penalties or
|
||||
# step pooling during the sampling/pooling process.
|
||||
|
||||
@@ -45,7 +45,7 @@ from vllm_ascend.utils import (sleep_mode_enabled, try_register_lib,
|
||||
vllm_version_is)
|
||||
from vllm_ascend.worker.model_runner_v1 import NPUModelRunner
|
||||
|
||||
if not vllm_version_is("0.9.2"):
|
||||
if not vllm_version_is("0.10.0"):
|
||||
from vllm.tasks import SupportedTask
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user