upgrade vLLM to main (#4608)
1. Fix https://github.com/vllm-project/vllm/pull/28542. The model structures we patch were changed upstream: Qwen2.5-VL (some patches still remain), Qwen2-VL, Qwen2, the DeepSeek series, and the Qwen-MoE series.
2. Fix https://github.com/vllm-project/vllm/pull/29121. The output token type changed from numpy arrays to `list[list[int]]`.
3. Fix https://github.com/vllm-project/vllm/pull/29262. The `xformers` backend for multimodal models has been deprecated.
4. Fix https://github.com/vllm-project/vllm/pull/29342
5. Fix https://github.com/vllm-project/vllm/pull/28579
6. Fix https://github.com/vllm-project/vllm/pull/28718
7. Fix https://github.com/vllm-project/vllm/issues/28665
8. Fix https://github.com/vllm-project/vllm/pull/26847. vLLM introduced the `optimization-level` option; some default configs changed, and the `--enforce-eager` parameter has been deprecated.
9. Fix http://github.com/vllm-project/vllm/pull/29223. The sampler now returns a tuple.
10. Fix https://github.com/vllm-project/vllm/pull/29471. The related patch is removed to avoid this kind of error.

- vLLM version: v0.11.2

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
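The most mechanical change in the diff below follows item 2: `ModelRunnerOutput.sampled_token_ids` is now built as a plain `list[list[int]]` rather than a list of numpy arrays. A minimal sketch of that migration (illustrative only, not part of the commit; the `EOS_TOKEN_ID` value here is made up):

```python
import numpy as np

EOS_TOKEN_ID = 2  # hypothetical value, for illustration only

# Before: one np.ndarray per running request
sampled_token_ids_old = [np.array([EOS_TOKEN_ID]), np.array([10, 11])]

# After: a plain list[list[int]], one inner list per running request
sampled_token_ids_new = [[EOS_TOKEN_ID], [10, 11]]

# Only the container type changes; the token values are identical.
assert [ids.tolist() for ids in sampled_token_ids_old] == sampled_token_ids_new
```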
@@ -32,7 +32,7 @@ on:
description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
vllm_version:
required: false
default: "v0.11.2"
default: "86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24"
type: string
description: vllm version to use
vllm_ascend_remote_url:
.github/workflows/format_pr_body.yaml
@@ -36,7 +36,7 @@ jobs:
- name: Get vLLM version
run: |
VLLM_COMMIT=v0.11.2
VLLM_COMMIT=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
- name: Checkout repository
.github/workflows/nightly_benchmarks.yaml
@@ -51,7 +51,7 @@ jobs:
strategy:
matrix:
include:
- vllm_branch: v0.11.2
- vllm_branch: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
vllm_ascend_branch: main
max-parallel: 1
container:
@@ -86,7 +86,7 @@ jobs:
tests: tests/e2e/nightly/ops
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
with:
vllm: v0.11.2
vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
runner: ${{ matrix.test_config.os }}
tests: ${{ matrix.test_config.tests }}
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
@@ -134,7 +134,7 @@ jobs:
- Qwen3-Next-80B-A3B-Instruct
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
with:
vllm: v0.11.2
vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
runner: ${{ matrix.test_config.os }}
model_list: ${{ toJson(matrix.test_config.model_list) }}
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
@@ -139,7 +139,7 @@ jobs:
tests: tests/e2e/nightly/models/test_glm4_5.py
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
with:
vllm: v0.11.2
vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
runner: ${{ matrix.test_config.os }}
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
tests: ${{ matrix.test_config.tests }}
@@ -69,7 +69,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [v0.11.2]
vllm_version: [86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
@@ -42,7 +42,7 @@ jobs:
lint:
uses: ./.github/workflows/pre-commit.yml
with:
vllm: v0.11.2
vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
changes:
runs-on: ubuntu-latest
outputs:
@@ -84,7 +84,7 @@ jobs:
SOC_VERSION: ascend910b1
strategy:
matrix:
vllm_version: [v0.11.2]
vllm_version: [86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24]
steps:
- name: Install packages
run: |
@@ -142,7 +142,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [v0.11.2]
vllm_version: [86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
@@ -72,7 +72,7 @@ jobs:
- DeepSeek-V2-Lite
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
with:
vllm: v0.11.2
vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
runner: ${{ matrix.runner }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
model_list: ${{ toJson(matrix.model_list) }}
@@ -48,8 +48,10 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.2
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
# Revert this change once VLLM_TAG is specified to branch or tag
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
@@ -39,8 +39,10 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.2
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
# Revert this change once VLLM_TAG is specified to branch or tag
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
@@ -36,8 +36,10 @@ COPY . /vllm-workspace/vllm-ascend/
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.2
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
# Revert this change once VLLM_TAG is specified to branch or tag
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
@@ -47,8 +47,10 @@ RUN apt-get update -y && \
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.2
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
# Revert this change once VLLM_TAG is specified to branch or tag
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
@@ -50,8 +50,10 @@ RUN yum update -y && \
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.2
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
# Revert this change once VLLM_TAG is specified to branch or tag
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
@@ -50,8 +50,10 @@ RUN yum update -y && \
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.2
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
# Revert this change once VLLM_TAG is specified to branch or tag
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
@@ -77,7 +77,7 @@ myst_substitutions = {
# CANN image tag
'cann_image_tag': "8.3.rc2-910b-ubuntu22.04-py3.11",
# vllm version in ci
'ci_vllm_version': 'v0.11.2',
'ci_vllm_version': '86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24',
}
# For cross-file header anchors
@@ -191,7 +191,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.max_num_seqs = 4
mock_vllm_config.scheduler_config.decode_max_num_seqs = 4
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_vllm_config.scheduler_config.enable_chunked_prefill = False
mock_device = 'cpu'
mock_dcp.world_size = 1
@@ -213,7 +213,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
mock_vllm_config.cache_config.block_size)
self.assertEqual(
builder.chunked_prefill_enabled,
mock_vllm_config.scheduler_config.chunked_prefill_enabled)
mock_vllm_config.scheduler_config.enable_chunked_prefill)
@patch('vllm.distributed.parallel_state.get_dcp_group')
@patch('vllm.distributed.parallel_state._DCP',
@@ -230,7 +230,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.max_num_seqs = 4
mock_vllm_config.scheduler_config.decode_max_num_seqs = 4
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_vllm_config.scheduler_config.enable_chunked_prefill = False
mock_device = 'cpu'
mock_dcp.world_size = 1
@@ -254,7 +254,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
mock_vllm_config.cache_config.block_size)
self.assertEqual(
builder.chunked_prefill_enabled,
mock_vllm_config.scheduler_config.chunked_prefill_enabled)
mock_vllm_config.scheduler_config.enable_chunked_prefill)
@patch('vllm.distributed.parallel_state.get_dcp_group')
@patch('vllm.distributed.parallel_state._DCP',
@@ -321,7 +321,7 @@ class TestAscendMLAMetadataBuilder(TestBase):
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.max_num_seqs = 4
mock_vllm_config.scheduler_config.decode_max_num_seqs = 4
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_vllm_config.scheduler_config.enable_chunked_prefill = False
mock_device = 'cpu'
mock_dcp.world_size = 1
@@ -440,8 +440,10 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
self.mock_vllm_config.model_config = ModelConfig(max_model_len=2048)
self.mock_vllm_config.model_config.hf_text_config.qk_rope_head_dim = 32
self.mock_vllm_config.cache_config = CacheConfig(block_size=32)
self.mock_vllm_config.scheduler_config = SchedulerConfig(
max_num_seqs=8, chunked_prefill_enabled=True)
mock_scheduler_config = MagicMock(spec=SchedulerConfig)
mock_scheduler_config.max_num_seqs = 8
mock_scheduler_config.chunked_prefill_enabled = True
self.mock_vllm_config.scheduler_config = mock_scheduler_config
self.mock_vllm_config.speculative_config = None
self.mock_device = torch.device("cpu")
@@ -454,12 +456,20 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
"vllm_ascend.attention.mla_v1.get_decode_context_model_parallel_world_size"
)
@patch("vllm_ascend.attention.mla_v1.get_ascend_config")
def test_build_prefix_no_cache_metadata(self, mock_get_ascend_config,
@patch("vllm_ascend.attention.mla_v1.torch.zeros", wraps=torch.zeros)
@patch("torch.Tensor.npu", new=lambda self: self)
@patch("torch.npu.is_available")
def test_build_prefix_no_cache_metadata(self, mock_npu_available,
mock_zeros, mock_get_ascend_config,
mock_dcp_world_size):
if not torch.npu.is_available():
self.skipTest("NPU not available, skipping NPU-dependent tests")
mock_npu_available.return_value = False
mock_dcp_world_size.return_value = 1
def zeros_override(*args, **kwargs):
kwargs.pop('pin_memory', None)
return mock_zeros._mock_wraps(*args, **kwargs)
mock_zeros.side_effect = zeros_override
common_attn_metadata = AscendCommonAttentionMetadata(
query_start_loc=torch.tensor([0, 3, 7]),
query_start_loc_cpu=torch.tensor([0, 3, 7]),
@@ -506,12 +516,21 @@ class TestAscendMLAMetadataBuilderBuild(TestBase):
"vllm_ascend.attention.mla_v1.get_decode_context_model_parallel_world_size"
)
@patch("vllm_ascend.attention.mla_v1.get_ascend_config")
def test_build_chunked_prefix_metadata(self, mock_get_ascend_config,
@patch("vllm_ascend.attention.mla_v1.torch.zeros", wraps=torch.zeros)
@patch("torch.Tensor.npu", new=lambda self: self)
@patch("torch.npu.is_available")
def test_build_chunked_prefix_metadata(self, mock_npu_available,
mock_zeros, mock_get_ascend_config,
mock_dcp_world_size):
if not torch.npu.is_available():
self.skipTest("NPU not available, skipping NPU-dependent tests")
mock_npu_available.return_value = False
mock_dcp_world_size.return_value = 1
def zeros_override(*args, **kwargs):
kwargs.pop('pin_memory', None)
return mock_zeros._mock_wraps(*args, **kwargs)
mock_zeros.side_effect = zeros_override
common_attn_metadata = AscendCommonAttentionMetadata(
query_start_loc=torch.tensor([0, 2, 5, 9]),
query_start_loc_cpu=torch.tensor([0, 2, 5, 9]),
@@ -32,7 +32,7 @@ class TestACLGraphEntry(TestBase):
"""Test ACLGraphEntry initialization with default values"""
batch_descriptor = BatchDescriptor(
num_tokens=30,
uniform_decode=False,
uniform=False,
)
entry = ACLGraphEntry(batch_descriptor=batch_descriptor)
@@ -46,7 +46,7 @@ class TestACLGraphEntry(TestBase):
"""Test ACLGraphEntry initialization with specified values"""
batch_descriptor = BatchDescriptor(
num_tokens=30,
uniform_decode=False,
uniform=False,
)
mock_graph = MagicMock()
@@ -89,7 +89,7 @@ class TestACLGraphWrapper(TestBase):
# Mock BatchDescriptor
self.mock_batch_descriptor = BatchDescriptor(
num_tokens=30,
uniform_decode=False,
uniform=False,
)
# Mock ForwardContext
@@ -3,7 +3,7 @@
from typing import Any, Dict, List, Optional, Tuple
from unittest.mock import MagicMock, patch
import numpy as np
import pytest
import torch
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
SchedulerConfig, SpeculativeConfig, VllmConfig)
@@ -81,9 +81,7 @@ def make_output(scheduler):
req.request_id: i
for i, req in enumerate(scheduler.running)
}
sampled_token_ids = [
np.array([1000], dtype=np.int64) for _ in scheduler.running
]
sampled_token_ids = [[1000]] * len(scheduler.running)
logprobs = None
@@ -98,6 +96,7 @@ def make_output(scheduler):
return modelrunner_output
@pytest.mark.skip("Ascend Scheduler has been deprecated")
class TestAscendScheduler(TestBase):
@patch("vllm.config.ModelConfig.__post_init__", MagicMock())
@@ -372,8 +371,7 @@ class TestAscendScheduler(TestBase):
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[np.array([EOS_TOKEN_ID]),
np.array([10, 11])
sampled_token_ids=[[EOS_TOKEN_ID], [10, 11]
], # First request hits EOS, second continues
logprobs=None,
prompt_logprobs_dict={},
@@ -424,9 +422,8 @@ class TestAscendScheduler(TestBase):
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[np.array([10, 42, 12]),
np.array([13, 14])
], # First request hits stop token
sampled_token_ids=[[10, 42, 12],
[13, 14]], # First request hits stop token
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
@@ -475,9 +472,8 @@ class TestAscendScheduler(TestBase):
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[np.array([10, 11, 12]),
np.array([13])
], # First request exceeds max_tokens
sampled_token_ids=[[10, 11, 12],
[13]], # First request exceeds max_tokens
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
@@ -516,7 +512,7 @@ class TestAscendScheduler(TestBase):
model_output = ModelRunnerOutput(
req_ids=[requests[0].request_id],
req_id_to_index={requests[0].request_id: 0},
sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])],
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
@@ -573,7 +569,7 @@ class TestAscendScheduler(TestBase):
model_runner_output = ModelRunnerOutput(
req_ids=[requests[0].request_id],
req_id_to_index={requests[0].request_id: 0},
sampled_token_ids=[np.array([0], dtype=np.int64)],
sampled_token_ids=[[0]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
@@ -589,7 +585,7 @@ class TestAscendScheduler(TestBase):
model_runner_output = ModelRunnerOutput(
req_ids=[requests[1].request_id],
req_id_to_index={requests[1].request_id: 0},
sampled_token_ids=[np.array([0], dtype=np.int64)],
sampled_token_ids=[[0]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
@@ -607,12 +603,10 @@ class TestAscendScheduler(TestBase):
spec_tokens_list: List[List[List[int]]] = [[[1, 2, 3]], [[1, 2, 3]],
[[1, 2], [3]], [[1]], [[]],
[[1, 2, 3], [4, 5, 6]]]
output_tokens_list: List[List[List[int]]] = [
[np.array([1, 2, 3, 4])], [np.array([1, 5])],
[np.array([1, 2, 5]), np.array([3, 4])], [np.array([1, 2])],
[np.array([5])], [np.array([1, 2, 7]),
np.array([4, 8])]
]
output_tokens_list: List[List[List[int]]] = [[[1, 2, 3, 4]], [[1, 5]],
[[1, 2, 5], [3, 4]],
[[1, 2]], [[5]],
[[1, 2, 7], [4, 8]]]
expected_list: List[Tuple[int, int,
int, List[int]]] = [(1, 3, 3, [1, 1, 1]),
(1, 3, 1, [1, 0, 0]),
@@ -650,9 +644,7 @@ class TestAscendScheduler(TestBase):
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_to_index,
sampled_token_ids=[
np.array([0]) for _ in range(len(requests))
],
sampled_token_ids=[[0] for _ in range(len(requests))],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
@@ -892,11 +884,13 @@ class TestSchedulerDynamicBatch(TestBase):
torch.float32, False))
],
)
kv_cache_config.hash_block_size = block_size
cache_config.num_gpu_blocks = 10000
scheduler = SchedulerDynamicBatch(
vllm_config=vllm_config,
kv_cache_config=kv_cache_config,
block_size=block_size,
log_stats=True,
structured_output_manager=MagicMock(spec=StructuredOutputManager),
)
@@ -1064,8 +1058,7 @@ class TestSchedulerDynamicBatch(TestBase):
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[np.array([EOS_TOKEN_ID]),
np.array([10, 11])
sampled_token_ids=[[EOS_TOKEN_ID], [10, 11]
], # First request hits EOS, second continues
logprobs=None,
prompt_logprobs_dict={},
@@ -1116,9 +1109,8 @@ class TestSchedulerDynamicBatch(TestBase):
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[np.array([10, 42, 12]),
np.array([13, 14])
], # First request hits stop token
sampled_token_ids=[[10, 42, 12],
[13, 14]], # First request hits stop token
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
@@ -1167,9 +1159,8 @@ class TestSchedulerDynamicBatch(TestBase):
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[np.array([10, 11, 12]),
np.array([13])
], # First request exceeds max_tokens
sampled_token_ids=[[10, 11, 12],
[13]], # First request exceeds max_tokens
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
@@ -1208,7 +1199,7 @@ class TestSchedulerDynamicBatch(TestBase):
model_output = ModelRunnerOutput(
req_ids=[requests[0].request_id],
req_id_to_index={requests[0].request_id: 0},
sampled_token_ids=[np.array([EOS_TOKEN_ID, 10, 11])],
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
@@ -1265,7 +1256,7 @@ class TestSchedulerDynamicBatch(TestBase):
model_runner_output = ModelRunnerOutput(
req_ids=[requests[0].request_id],
req_id_to_index={requests[0].request_id: 0},
sampled_token_ids=[np.array([0])],
sampled_token_ids=[[0]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
@@ -1281,7 +1272,7 @@ class TestSchedulerDynamicBatch(TestBase):
model_runner_output = ModelRunnerOutput(
req_ids=[requests[1].request_id],
req_id_to_index={requests[1].request_id: 0},
sampled_token_ids=[np.array([0])],
sampled_token_ids=[[0]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
@@ -1299,12 +1290,10 @@ class TestSchedulerDynamicBatch(TestBase):
spec_tokens_list: List[List[List[int]]] = [[[1, 2, 3]], [[1, 2, 3]],
[[1, 2], [3]], [[1]], [[]],
[[1, 2, 3], [4, 5, 6]]]
output_tokens_list: List[List[List[int]]] = [
[np.array([1, 2, 3, 4])], [np.array([1, 5])],
[np.array([1, 2, 5]), np.array([3, 4])], [np.array([1, 2])],
[np.array([5])], [np.array([1, 2, 7]),
np.array([4, 8])]
]
output_tokens_list: List[List[List[int]]] = [[[1, 2, 3, 4]], [[1, 5]],
[[1, 2, 5], [3, 4]],
[[1, 2]], [[5]],
[[1, 2, 7], [4, 8]]]
expected_list: List[Tuple[int, int,
int, List[int]]] = [(1, 3, 3, [1, 1, 1]),
(1, 3, 1, [1, 0, 0]),
@@ -1342,9 +1331,7 @@ class TestSchedulerDynamicBatch(TestBase):
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_to_index,
sampled_token_ids=[
np.array([0]) for _ in range(len(requests))
],
sampled_token_ids=[[0] for _ in range(len(requests))],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
@@ -6,7 +6,6 @@
import os
from typing import Any, Optional
import numpy as np
import torch
from vllm import SamplingParams
from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
@@ -189,7 +188,7 @@ def create_model_runner_output(
# Make sampled tokens.
sampled_token = EOS_TOKEN_ID if use_eos else 0
sampled_token_ids = [np.array([sampled_token]) for _ in req_ids]
sampled_token_ids = [[sampled_token] for _ in req_ids]
# Make output data structure.
extra_args = {}
@@ -224,7 +224,6 @@ class TestEagleProposerGenerateTokenIds(TestBase):
def test_generate_token_ids_without_metadata(self):
valid_sampled = [[20, 30, 40]]
valid_sampled = [np.array(sublist) for sublist in valid_sampled]
scheduler_output = MagicMock()
scheduler_output.num_scheduled_tokens = [2, 1, 3]
positions = torch.tensor([0, 1, 2, 3, 4, 5])
@@ -251,7 +250,6 @@ class TestEagleProposerGenerateTokenIds(TestBase):
def test_generate_token_ids_with_metadata(self):
valid_sampled = [[5], [6, 7], [8, 9, 10]]
valid_sampled = [np.array(sublist) for sublist in valid_sampled]
spec_metadata = MagicMock()
spec_metadata.num_draft_tokens = [2, 3, 4]
@@ -20,6 +20,7 @@ import torch
from transformers import PretrainedConfig
from vllm.config import CacheConfig
from vllm.distributed.parallel_state import GroupCoordinator
from vllm.transformers_utils.config import patch_rope_parameters
from vllm_ascend.torchair.models.torchair_deepseek_v2 import (
TorchairDeepseekV2DecoderLayer, TorchairDeepseekV2ForCausalLM,
@@ -59,6 +60,7 @@ def base_config():
topk_group=1,
vocab_size=10000,
)
patch_rope_parameters(config)
return config
@@ -1,5 +1,6 @@
from unittest.mock import MagicMock, patch
import pytest
import torch
from torch import nn
from vllm.distributed.parallel_state import GroupCoordinator
@@ -180,17 +181,19 @@ class TestAscendMLATorchairMetadata(TestBase):
class TestAscendMLATorchairMetadataBuilder(TestBase):
def test_ascend_mla_metadata_builder_default(self):
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 1024
mock_vllm_config.model_config.get_head_size.return_value = 64
mock_vllm_config.model_config.dtype = torch.float16
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.max_num_seqs = 4
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_model_config = MagicMock()
mock_model_config.max_model_len = 1024
mock_model_config.get_head_size.return_value = 64
mock_model_config.dtype = torch.float16
mock_vllm_config = MagicMock()
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = MagicMock(block_size=16)
mock_vllm_config.scheduler_config = MagicMock(
max_num_seqs=4, enable_chunked_prefill=False)
mock_vllm_config.speculative_config = None
mock_device = torch.device('cpu')
ascend_config = MagicMock()
ascend_config.torchair_graph_config = MagicMock()
ascend_config.torchair_graph_config.enabled = True
@@ -204,22 +207,25 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
mock_vllm_config.cache_config.block_size)
self.assertEqual(
builder.chunked_prefill_enabled,
mock_vllm_config.scheduler_config.chunked_prefill_enabled)
mock_vllm_config.scheduler_config.enable_chunked_prefill)
self.assertEqual(builder.torchair_graph_enabled, True)
@patch("vllm_ascend.torchair.torchair_mla.get_ascend_config")
def test_reorder_batch_with_torchair_graph(self, ascend_config):
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 1024
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.max_num_seqs = 4
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
ascend_config.torchair_graph_config = MagicMock()
ascend_config.torchair_graph_config.enabled = True
mock_model_config = MagicMock()
mock_model_config.max_model_len = 1024
mock_model_config.get_head_size.return_value = 64
mock_model_config.dtype = torch.float16
mock_vllm_config = MagicMock()
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = MagicMock(block_size=16)
mock_vllm_config.scheduler_config = MagicMock(
max_num_seqs=4, enable_chunked_prefill=False)
mock_vllm_config.speculative_config = None
mock_device = torch.device('cpu')
builder = AscendMLATorchairMetadataBuilder(None, None,
mock_vllm_config,
mock_device)
@@ -248,15 +254,20 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
ascend_config.torchair_graph_config = MagicMock()
ascend_config.torchair_graph_config.enabled = False
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 1024
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.max_num_seqs = 4
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_model_config = MagicMock()
mock_model_config.max_model_len = 1024
mock_model_config.get_head_size.return_value = 64
mock_model_config.dtype = torch.float16
mock_vllm_config = MagicMock()
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = MagicMock(block_size=16)
mock_vllm_config.scheduler_config = MagicMock(
max_num_seqs=4, enable_chunked_prefill=False)
mock_vllm_config.speculative_config = None
mock_device = torch.device('cpu')
with patch("vllm_ascend.torchair.torchair_mla.get_ascend_config",
return_value=ascend_config):
builder = AscendMLATorchairMetadataBuilder(None, None,
@@ -287,14 +298,21 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
ascend_config = MagicMock()
mock_ascend_config.return_value = ascend_config
ascend_config.torchair_graph_config.enabled = False
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 1024
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_model_config = MagicMock()
mock_model_config.max_model_len = 1024
mock_model_config.get_head_size.return_value = 64
mock_model_config.dtype = torch.float16
mock_vllm_config = MagicMock()
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = MagicMock(block_size=16)
mock_vllm_config.scheduler_config = MagicMock(
max_num_seqs=4, enable_chunked_prefill=False)
mock_vllm_config.speculative_config = None
mock_device = torch.device('cpu')
builder = AscendMLATorchairMetadataBuilder(None, None,
mock_vllm_config,
mock_device)
@@ -305,19 +323,26 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
self.assertEqual(result.shape[1], 64)
self.assertTrue(torch.equal(result[:, :10], block_tables))
@pytest.mark.skip(reason="Skipping this test temporarily.")
@patch("vllm_ascend.torchair.torchair_mla.get_ascend_config")
def test_get_graph_runner_block_tables_truncated(self, mock_ascend_config):
ascend_config = MagicMock()
mock_ascend_config.return_value = ascend_config
ascend_config.torchair_graph_config.enabled = False
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 64
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_model_config = MagicMock()
mock_model_config.get_head_size.return_value = 64
mock_model_config.dtype = torch.float16
mock_vllm_config = MagicMock()
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = MagicMock(block_size=16)
mock_vllm_config.scheduler_config = MagicMock(
enable_chunked_prefill=False)
mock_vllm_config.speculative_config = None
mock_device = torch.device('cpu')
builder = AscendMLATorchairMetadataBuilder(None, None,
mock_vllm_config,
mock_device)
@@ -334,14 +359,21 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
ascend_config = MagicMock()
mock_ascend_config.return_value = ascend_config
ascend_config.torchair_graph_config.enabled = False
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 1024
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_model_config = MagicMock()
mock_model_config.max_model_len = 1024
mock_model_config.get_head_size.return_value = 64
mock_model_config.dtype = torch.float16
mock_vllm_config = MagicMock()
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = MagicMock(block_size=16)
mock_vllm_config.scheduler_config = MagicMock(
max_num_seqs=4, enable_chunked_prefill=False)
mock_vllm_config.speculative_config = None
mock_device = torch.device('cpu')
builder = AscendMLATorchairMetadataBuilder(None, None,
mock_vllm_config,
mock_device)
@@ -360,16 +392,20 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
mock_ascend_config.return_value = ascend_config
ascend_config.torchair_graph_config.enabled = False
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 1024
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_vllm_config.get_head_size.return_value = 64
mock_vllm_config.model_config.dtype = torch.float16
mock_device = 'cpu'
mock_model_config = MagicMock()
mock_model_config.max_model_len = 1024
mock_model_config.get_head_size.return_value = 64
mock_model_config.dtype = torch.float16
mock_vllm_config = MagicMock()
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = MagicMock(block_size=16)
mock_vllm_config.scheduler_config = MagicMock(
max_num_seqs=4, enable_chunked_prefill=False)
mock_vllm_config.speculative_config = None
mock_device = torch.device('cpu')
builder = AscendMLATorchairMetadataBuilder(
None,
None,
@@ -427,18 +463,23 @@ class TestAscendMLATorchairMetadataBuilder(TestBase):
mock_ascend_config.return_value = ascend_config
ascend_config.torchair_graph_config.enabled = False
mock_model_config = MagicMock()
mock_model_config.max_model_len = 1024
mock_model_config.get_head_size.return_value = 64
mock_model_config.dtype = torch.float16
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 1024
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_vllm_config.get_head_size.return_value = 64
mock_vllm_config.model_config.dtype = torch.float16
mock_device = 'cpu'
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = MagicMock(block_size=16)
mock_vllm_config.scheduler_config = MagicMock(
max_num_seqs=4, enable_chunked_prefill=False)
mock_vllm_config.speculative_config = None
mock_device = torch.device('cpu')
model = MagicMock(spec=nn.Module)
model.model = MagicMock(spec=nn.Module)
mock_vllm_config.speculative_config = None
builder = AscendMLATorchairMetadataBuilder(
None,
None,
@@ -176,17 +176,19 @@ class TestAscendSFATorchairMetadata(TestBase):
class TestAscendSFATorchairMetadataBuilder(TestBase):
def test_ascend_sfa_metadata_builder_default(self):
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 1024
mock_vllm_config.model_config.get_head_size.return_value = 64
mock_vllm_config.model_config.dtype = torch.float16
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.max_num_seqs = 4
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_model_config = MagicMock()
mock_model_config.max_model_len = 1024
mock_model_config.get_head_size.return_value = 64
mock_model_config.dtype = torch.float16
mock_vllm_config = MagicMock()
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = MagicMock(block_size=16)
mock_vllm_config.scheduler_config = MagicMock(
max_num_seqs=4, enable_chunked_prefill=False)
mock_vllm_config.speculative_config = None
mock_device = torch.device('cpu')
ascend_config = MagicMock()
ascend_config.torchair_graph_config = MagicMock()
ascend_config.torchair_graph_config.enabled = True
@@ -200,7 +202,7 @@ class TestAscendSFATorchairMetadataBuilder(TestBase):
mock_vllm_config.cache_config.block_size)
self.assertEqual(
builder.chunked_prefill_enabled,
mock_vllm_config.scheduler_config.chunked_prefill_enabled)
mock_vllm_config.scheduler_config.enable_chunked_prefill)
self.assertEqual(builder.torchair_graph_enabled, True)
self.assertEqual(builder.max_blocks, (mock_vllm_config.model_config.max_model_len +
mock_vllm_config.cache_config.block_size - 1) \
@@ -208,17 +210,22 @@ class TestAscendSFATorchairMetadataBuilder(TestBase):
@patch("vllm_ascend.torchair.torchair_sfa.get_ascend_config")
def test_reorder_batch_with_torchair_graph(self, ascend_config):
mock_model_config = MagicMock()
mock_model_config.max_model_len = 1024
mock_model_config.get_head_size.return_value = 64
mock_model_config.dtype = torch.float16
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 1024
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.max_num_seqs = 4
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = MagicMock(block_size=16)
mock_vllm_config.scheduler_config = MagicMock(
max_num_seqs=4, enable_chunked_prefill=False)
mock_vllm_config.speculative_config = None
mock_device = torch.device('cpu')
ascend_config.torchair_graph_config = MagicMock()
ascend_config.torchair_graph_config.enabled = True
mock_vllm_config.speculative_config = None
builder = AscendSFATorchairMetadataBuilder(None, None,
mock_vllm_config,
mock_device)
@@ -247,13 +254,18 @@ class TestAscendSFATorchairMetadataBuilder(TestBase):
ascend_config = MagicMock()
mock_ascend_config.return_value = ascend_config
ascend_config.torchair_graph_config.enabled = False
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 1024
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_model_config = MagicMock()
mock_model_config.max_model_len = 1024
mock_model_config.get_head_size.return_value = 64
mock_model_config.dtype = torch.float16
mock_vllm_config = MagicMock()
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = MagicMock(block_size=16)
mock_vllm_config.scheduler_config = MagicMock(
max_num_seqs=4, enable_chunked_prefill=False)
mock_vllm_config.speculative_config = None
mock_device = torch.device('cpu')
builder = AscendSFATorchairMetadataBuilder(None, None,
mock_vllm_config,
@@ -270,18 +282,25 @@ class TestAscendSFATorchairMetadataBuilder(TestBase):
ascend_config = MagicMock()
mock_ascend_config.return_value = ascend_config
ascend_config.torchair_graph_config.enabled = False
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 64
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_model_config = MagicMock()
mock_model_config.max_model_len = 1024
mock_model_config.get_head_size.return_value = 64
mock_model_config.dtype = torch.float16
mock_vllm_config = MagicMock()
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = MagicMock(block_size=16)
mock_vllm_config.scheduler_config = MagicMock(
max_num_seqs=4, enable_chunked_prefill=False)
mock_vllm_config.speculative_config = None
mock_device = torch.device('cpu')
builder = AscendSFATorchairMetadataBuilder(None, None,
mock_vllm_config,
mock_device)
builder.max_blocks = 4
block_tables = torch.randint(0, 100, (3, 10), dtype=torch.int32)
result = builder._get_graph_runner_block_tables(3, block_tables)
@@ -295,14 +314,19 @@ class TestAscendSFATorchairMetadataBuilder(TestBase):
ascend_config = MagicMock()
mock_ascend_config.return_value = ascend_config
ascend_config.torchair_graph_config.enabled = False
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 1024
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_model_config = MagicMock()
mock_model_config.max_model_len = 1024
mock_model_config.get_head_size.return_value = 64
mock_model_config.dtype = torch.float16
mock_vllm_config = MagicMock()
mock_vllm_config.model_config = mock_model_config
mock_vllm_config.cache_config = MagicMock(block_size=16)
mock_vllm_config.scheduler_config = MagicMock(
max_num_seqs=4, enable_chunked_prefill=False)
mock_vllm_config.speculative_config = None
mock_device = torch.device('cpu')
builder = AscendSFATorchairMetadataBuilder(None, None,
mock_vllm_config,
mock_device)
@@ -276,7 +276,7 @@ class AscendAttentionMetadataBuilder:
AscendAttentionMetadataBuilder.reorder_batch_threshold = self.decode_threshold
scheduler_config = vllm_config.scheduler_config
self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
self.chunked_prefill_enabled = scheduler_config.enable_chunked_prefill
def reorder_batch(self, input_batch,
scheduler_output: "SchedulerOutput") -> bool:
@@ -226,7 +226,7 @@ class AscendMLAMetadataBuilder:
self.block_size = vllm_config.cache_config.block_size
self.max_blocks = (vllm_config.model_config.max_model_len +
self.block_size - 1) // self.block_size
self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
self.chunked_prefill_enabled = scheduler_config.enable_chunked_prefill
self.speculative_config = vllm_config.speculative_config
self.decode_threshold = 1
@@ -456,7 +456,7 @@ class RecomputeScheduler(SchedulerInterface):
# chunked prefill has to be enabled explicitly to allow
# pooling requests to be chunked
if not self.scheduler_config.chunked_prefill_enabled and \
if not self.scheduler_config.enable_chunked_prefill and \
num_new_tokens > token_budget:
self.waiting.pop_request()
skipped_waiting_requests.prepend_request(request)
@@ -70,7 +70,7 @@ class AscendScheduler(Scheduler):
self._initialize_common()
def schedule(self) -> SchedulerOutput:
if self.scheduler_config.chunked_prefill_enabled:
if self.scheduler_config.enable_chunked_prefill:
return super().schedule()
scheduled_new_reqs: list[Request] = []
scheduled_resumed_reqs: list[Request] = []
@@ -534,7 +534,7 @@ class AscendScheduler(Scheduler):
return True
def _get_prompt_limit(self, request: Request) -> int:
if (self.scheduler_config.chunked_prefill_enabled
if (self.scheduler_config.enable_chunked_prefill
and not self.scheduler_config.is_multi_step):
prompt_limit = self.vllm_config.model_config.max_model_len
else:
@@ -404,7 +404,7 @@ class SchedulerDynamicBatch(Scheduler):
# chunked prefill has to be enabled explicitly to allow
# pooling requests to be chunked
if not self.scheduler_config.chunked_prefill_enabled and \
if not self.scheduler_config.enable_chunked_prefill and \
num_new_tokens > token_budget:
self.waiting.pop_request()
skipped_waiting_requests.prepend_request(request)
@@ -9,14 +9,14 @@ from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Optional, Sequence
import torch
from vllm.attention import AttentionType
from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
from vllm.distributed.parallel_state import get_pp_group, get_tp_group
from vllm.logger import logger
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.utils import logger
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheSpec,
MLAAttentionSpec)
@@ -2,7 +2,8 @@ import time
from collections import defaultdict
from typing import Optional
from vllm.utils import logger, sha256
from vllm.logger import logger
from vllm.utils.hashing import sha256
from vllm.v1.core.block_pool import BlockPool
from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock,
PrefixCachingMetrics)
@@ -9,7 +9,7 @@ import torch
import vllm.envs as envs
import zmq
from vllm.config import KVTransferConfig, VllmConfig
from vllm.utils import logger
from vllm.logger import logger
from vllm.utils.network_utils import make_zmq_socket
from vllm.utils.torch_utils import get_dtype_size
from vllm.v1.kv_cache_interface import AttentionSpec
@@ -8,7 +8,7 @@ from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
from vllm.forward_context import ForwardContext
from vllm.utils import logger
from vllm.logger import logger
from vllm.utils.network_utils import make_zmq_socket
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.sched.output import SchedulerOutput
@@ -3,7 +3,7 @@ from enum import Enum
import torch
from vllm.config import ParallelConfig
from vllm.utils import logger
from vllm.logger import logger
from vllm_ascend.distributed.kvpool.backend.backend import Backend
@@ -7,7 +7,7 @@ from typing import Union
# Third Party
from vllm.config import ParallelConfig
from vllm.utils import logger
from vllm.logger import logger
from vllm.utils.network_utils import get_ip
from vllm_ascend.distributed.kvpool.backend.backend import Backend
@@ -3,7 +3,7 @@ from typing import Iterable, List, Optional, Tuple, Union
from vllm.distributed.kv_transfer.kv_connector.v1.base import \
KVConnectorMetadata
from vllm.utils import logger
from vllm.logger import logger
from vllm.utils.math_utils import cdiv
from vllm.v1.core.kv_cache_utils import BlockHash
from vllm.v1.core.sched.output import NewRequestData
@@ -4,7 +4,7 @@ from concurrent.futures import ThreadPoolExecutor
from typing import Any, Optional
import torch
from vllm.utils import logger
from vllm.logger import logger
from vllm.v1.core.kv_cache_utils import BlockHash
from vllm_ascend.distributed.kvpool.backend.backend import Backend
@@ -5,7 +5,7 @@ import zmq
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.base import \
KVConnectorMetadata
from vllm.utils import logger
from vllm.logger import logger
from vllm.utils.network_utils import make_zmq_socket
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.kv_cache_utils import BlockHash
@@ -8,7 +8,7 @@ from vllm.distributed import (get_decode_context_model_parallel_rank,
get_decode_context_model_parallel_world_size,
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size)
from vllm.utils import logger
from vllm.logger import logger
from vllm.v1.core.kv_cache_utils import BlockHash
from vllm_ascend.distributed.kvpool.backend.backend import Backend
@@ -25,7 +25,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
from vllm.distributed.parallel_state import (get_dcp_group, get_tp_group,
get_world_group)
from vllm.forward_context import ForwardContext
from vllm.utils import logger
from vllm.logger import logger
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import KVCacheConfig
@@ -29,7 +29,7 @@ from vllm.distributed.parallel_state import (
get_decode_context_model_parallel_rank,
get_decode_context_model_parallel_world_size,
get_tensor_model_parallel_rank, get_tp_group)
from vllm.utils import logger
from vllm.logger import logger
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.request import RequestStatus
@@ -27,7 +27,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import (
KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole)
from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank,
get_tp_group, get_world_group)
from vllm.utils import logger
from vllm.logger import logger
from vllm.utils.network_utils import get_ip, make_zmq_path, make_zmq_socket
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import KVCacheConfig
@@ -1,6 +1,6 @@
import numpy as np
import torch
from vllm.attention import AttentionBackend
from vllm.attention.backends.abstract import AttentionBackend
from vllm.logger import init_logger
from vllm.utils.platform_utils import is_pin_memory_available
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
@@ -23,7 +23,7 @@ from typing import Optional
import torch
from torch import nn
from vllm.attention import AttentionMetadata
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.attention.layer import MLAAttention
from vllm.config import CacheConfig, get_current_vllm_config
from vllm.distributed import get_tensor_model_parallel_world_size
@@ -27,8 +27,7 @@ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import \
from transformers.models.qwen2_vl.configuration_qwen2_vl import \
Qwen2VLVisionConfig
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.layer import (check_upstream_fa_availability,
maybe_get_vit_flash_attn_backend)
from vllm.attention.layer import maybe_get_vit_flash_attn_backend
from vllm.model_executor.layers.activation import get_act_and_mul_fn
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -65,7 +64,6 @@ class AscendQwen2_5_VisionAttention(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: torch.Tensor,
seqlens: torch.Tensor = None,
) -> torch.Tensor:
# [s, b, c] --> [s, b, head * 3 * head_dim]
x, _ = self.qkv(x)
@@ -141,7 +139,6 @@ class AscendQwen2VisionBlock(nn.Module):
rotary_pos_emb_cos: torch.Tensor,
rotary_pos_emb_sin: torch.Tensor,
max_seqlen: int | None = None, # Only used for Flash Attention
seqlens: list[int] | None = None, # Only used for xFormers
) -> torch.Tensor:
x = x + self.attn(
self.norm1(x),
@@ -149,7 +146,6 @@ class AscendQwen2VisionBlock(nn.Module):
rotary_pos_emb_cos=rotary_pos_emb_cos,
rotary_pos_emb_sin=rotary_pos_emb_sin,
max_seqlen=max_seqlen,
seqlens=seqlens,
)
x = x + self.mlp(self.norm2(x))
return x
@@ -198,7 +194,6 @@ class AscendQwen2VisionTransformer(nn.Module):
head_size=head_dim,
rotary_dim=head_dim // 2,
max_position=8192,
base=10000.0,
is_neox_style=True,
)
@@ -228,10 +223,6 @@ class AscendQwen2VisionTransformer(nn.Module):
attn_backend_override=attn_backend_override,
)
if (self.attn_backend != AttentionBackendEnum.FLASH_ATTN
and check_upstream_fa_availability(torch.get_default_dtype())):
self.attn_backend = AttentionBackendEnum.FLASH_ATTN
def rot_pos_emb(
self,
grid_thw: list[list[int]]) -> tuple[torch.Tensor, torch.Tensor]:
@@ -300,7 +291,7 @@ class AscendQwen2VisionTransformer(nn.Module):
|
||||
x = x.unsqueeze(1)
|
||||
|
||||
# pre-compute seqlens for attn mask to reduce cuMemcpy operations
|
||||
max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens)
|
||||
max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
|
||||
for blk in self.blocks:
|
||||
x = blk(
|
||||
x,
|
||||
@@ -308,7 +299,6 @@ class AscendQwen2VisionTransformer(nn.Module):
|
||||
rotary_pos_emb_cos=rotary_pos_emb_cos,
|
||||
rotary_pos_emb_sin=rotary_pos_emb_sin,
|
||||
max_seqlen=max_seqlen,
|
||||
seqlens=seqlens,
|
||||
)
|
||||
|
||||
# adapter
|
||||
@@ -326,7 +316,6 @@ class AscendQwen2_5_VisionBlock(nn.Module):
|
||||
rotary_pos_emb_cos: torch.Tensor,
|
||||
rotary_pos_emb_sin: torch.Tensor,
|
||||
max_seqlen: torch.Tensor, # Only used for Flash Attention
|
||||
seqlens: torch.Tensor, # Only used for xFormers
|
||||
) -> torch.Tensor:
|
||||
x_attn = self.attn(
|
||||
self.norm1(x),
|
||||
@@ -334,7 +323,6 @@ class AscendQwen2_5_VisionBlock(nn.Module):
|
||||
rotary_pos_emb_cos=rotary_pos_emb_cos,
|
||||
rotary_pos_emb_sin=rotary_pos_emb_sin,
|
||||
max_seqlen=max_seqlen,
|
||||
seqlens=seqlens,
|
||||
)
|
||||
x_fused_norm, residual = self.norm2(x, residual=x_attn)
|
||||
x = residual + self.mlp(x_fused_norm)
|
||||
@@ -388,11 +376,9 @@ class AscendQwen2_5_VisionTransformer(nn.Module):
|
||||
head_size=head_dim,
|
||||
rotary_dim=head_dim // 2,
|
||||
max_position=8192,
|
||||
base=10000.0,
|
||||
is_neox_style=True,
|
||||
)
|
||||
|
||||
use_upstream_fa = False
|
||||
self.attn_backend = get_vit_attn_backend(
|
||||
head_size=head_dim,
|
||||
dtype=torch.get_default_dtype(),
|
||||
@@ -402,7 +388,6 @@ class AscendQwen2_5_VisionTransformer(nn.Module):
|
||||
self.attn_backend, self.flash_attn_varlen_func = (
|
||||
maybe_get_vit_flash_attn_backend(
|
||||
self.attn_backend,
|
||||
use_upstream_fa,
|
||||
attn_backend_override=attn_backend_override,
|
||||
))
|
||||
|
||||
@@ -418,7 +403,6 @@ class AscendQwen2_5_VisionTransformer(nn.Module):
|
||||
prefix=f"{prefix}.blocks.{layer_idx}",
|
||||
use_data_parallel=use_data_parallel,
|
||||
attn_backend=self.attn_backend,
|
||||
use_upstream_fa=use_upstream_fa,
|
||||
attn_backend_override=attn_backend_override,
|
||||
) for layer_idx in range(depth)
|
||||
])
|
||||
@@ -553,10 +537,8 @@ class AscendQwen2_5_VisionTransformer(nn.Module):
|
||||
|
||||
# transformers
|
||||
# pre-compute seqlens for window/full attn to reduce cuMemcpy operations
|
||||
max_seqlen_full, seqlens_full = self.compute_attn_mask_seqlen(
|
||||
cu_seqlens)
|
||||
max_seqlen_window, seqlens_window = self.compute_attn_mask_seqlen(
|
||||
cu_window_seqlens)
|
||||
max_seqlen_full = self.compute_attn_mask_seqlen(cu_seqlens)
|
||||
max_seqlen_window = self.compute_attn_mask_seqlen(cu_window_seqlens)
|
||||
|
||||
cu_seqlens = cu_seqlens.to( # type: ignore[attr-defined]
|
||||
device=self.device,
|
||||
@@ -587,11 +569,9 @@ class AscendQwen2_5_VisionTransformer(nn.Module):
|
||||
if layer_num in self.fullatt_block_indexes:
|
||||
cu_seqlens_now = cu_seqlens
|
||||
max_seqlen_now = max_seqlen_full
|
||||
seqlens_now = seqlens_full
|
||||
else:
|
||||
cu_seqlens_now = cu_window_seqlens
|
||||
max_seqlen_now = max_seqlen_window
|
||||
seqlens_now = seqlens_window
|
||||
|
||||
hidden_states = blk(
|
||||
hidden_states,
|
||||
@@ -599,7 +579,6 @@ class AscendQwen2_5_VisionTransformer(nn.Module):
|
||||
rotary_pos_emb_cos=rotary_pos_emb_cos,
|
||||
rotary_pos_emb_sin=rotary_pos_emb_sin,
|
||||
max_seqlen=max_seqlen_now,
|
||||
seqlens=seqlens_now,
|
||||
)
|
||||
|
||||
# For Qwen2.5-VL-3B, float16 will overflow at last block
|
||||
|
||||
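With the xFormers path gone, compute_attn_mask_seqlen only has to return the longest sequence; the per-request seqlens list that used to feed the xFormers masks is no longer produced. A rough sketch of the reduced helper, assuming cu_seqlens is the usual 1-D cumulative-length tensor (the body shown is an assumption, not copied from this commit):

```python
import torch


def compute_attn_mask_seqlen(cu_seqlens: torch.Tensor) -> torch.Tensor:
    # Longest single sequence in the batch; enough to size the FA attention mask.
    return (cu_seqlens[1:] - cu_seqlens[:-1]).max()
```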
@@ -23,7 +23,6 @@ import torch.nn as nn
from transformers.models.qwen3_vl.configuration_qwen3_vl import \
Qwen3VLVisionConfig
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.attention.layer import check_upstream_fa_availability
from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
@@ -101,7 +100,6 @@ class AscendQwen3_VisionTransformer(nn.Module):
head_size=head_dim,
rotary_dim=head_dim // 2,
max_position=8192,
base=10000.0,
is_neox_style=True,
)

@@ -133,17 +131,10 @@ class AscendQwen3_VisionTransformer(nn.Module):
dtype=torch.get_default_dtype(),
attn_backend_override=attn_backend_override,
)
use_upstream_fa = False
if (self.attn_backend != AttentionBackendEnum.FLASH_ATTN
and self.attn_backend != AttentionBackendEnum.ROCM_AITER_FA
and check_upstream_fa_availability(torch.get_default_dtype())):
self.attn_backend = AttentionBackendEnum.FLASH_ATTN
use_upstream_fa = True

if self.attn_backend not in {
AttentionBackendEnum.FLASH_ATTN,
AttentionBackendEnum.TORCH_SDPA,
AttentionBackendEnum.XFORMERS,
AttentionBackendEnum.ROCM_AITER_FA,
}:
raise RuntimeError(
@@ -159,7 +150,6 @@ class AscendQwen3_VisionTransformer(nn.Module):
prefix=f"{prefix}.blocks.{layer_idx}",
use_data_parallel=use_data_parallel,
attn_backend=self.attn_backend,
use_upstream_fa=use_upstream_fa,
) for layer_idx in range(vision_config.depth)
])

@@ -157,6 +157,7 @@ class NPUPlatform(Platform):
compilation_config.splitting_ops = []

compilation_config.cudagraph_num_of_warmups = 1
compilation_config.pass_config.enable_fusion = False

if compilation_config.mode not in [
CompilationMode.NONE, CompilationMode.VLLM_COMPILE
@@ -310,7 +311,7 @@ class NPUPlatform(Platform):
vllm_config.scheduler_config.scheduler_cls = (
"vllm_ascend.core.scheduler_dynamic_batch.SchedulerDynamicBatch"
)
vllm_config.scheduler_config.chunked_prefill_enabled = True
vllm_config.scheduler_config.enable_chunked_prefill = True
vllm_config.scheduler_config.SLO_limits_for_dynamic_batch = ascend_config.SLO_limits_for_dynamic_batch

if vllm_config.kv_transfer_config is not None and \

@@ -138,7 +138,8 @@ class EagleProposer(Proposer):
dummy_compute_logits(self.hidden_states)

def generate_token_ids(self,
valid_sampled_token_ids: list[np.ndarray],
valid_sampled_token_ids: torch.Tensor
| list[list[int]],
sampling_metadata: SamplingMetadata = None,
scheduler_output: SchedulerOutput = None,
spec_decode_metadata: SpecDecodeMetadata = None,
@@ -151,7 +152,7 @@ class EagleProposer(Proposer):
attn_metadata = self._get_eagle_atten_dict(scheduler_output)
next_token_ids: list[int] = []
for i, token_ids in enumerate(valid_sampled_token_ids):
if token_ids.shape[0] > 0:
if token_ids:
# Common case.
next_token_id = token_ids[-1]
else:
@@ -163,7 +164,7 @@ class EagleProposer(Proposer):
scheduler_output.num_scheduled_tokens[req_id])

next_token_id = req_state.get_token_id(seq_len)
next_token_ids.append(next_token_id.item())
next_token_ids.append(next_token_id)
next_token_ids = torch.tensor(next_token_ids,
dtype=torch.int32,
device=self.device)
@@ -183,7 +184,7 @@ class EagleProposer(Proposer):
else:
num_draft_tokens = spec_decode_metadata.num_draft_tokens
num_rejected_tokens = [
n + 1 - valid_sampled_token_ids[i].shape[0] if n > 0 else 0
n + 1 - len(valid_sampled_token_ids[i]) if n > 0 else 0
for i, n in enumerate(num_draft_tokens)
]
num_rejected_tokens = torch.tensor(
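The proposer hunks above follow the upstream change of the sampler output from per-request numpy arrays to plain Python lists (list[list[int]]), so emptiness and length checks switch from .shape[0] to truthiness and len(). A tiny standalone illustration with toy data, not runner state:

```python
# Toy example of the new per-request token-id container.
valid_sampled_token_ids: list[list[int]] = [[101, 7], [], [42]]

next_token_ids: list[int] = []
for token_ids in valid_sampled_token_ids:
    if token_ids:            # was: token_ids.shape[0] > 0
        next_token_ids.append(token_ids[-1])

print(next_token_ids)        # [7, 42]
print([len(ids) for ids in valid_sampled_token_ids])  # [2, 0, 1]
```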
@@ -1,7 +1,6 @@
import enum
from typing import Optional

import numpy as np
import torch
from vllm.config import CUDAGraphMode, VllmConfig
from vllm.v1.core.sched.output import SchedulerOutput
@@ -42,7 +41,7 @@ class Proposer:
raise NotImplementedError

def generate_token_ids(self,
valid_sampled_token_ids: list[np.ndarray],
valid_sampled_token_ids: list[list[int]],
sampling_metadata: SamplingMetadata = None,
scheduler_output: SchedulerOutput = None,
spec_decode_metadata: SpecDecodeMetadata = None,

@@ -7,7 +7,7 @@ import torch.nn as nn
import torch.nn.functional as F
from vllm.config import (CUDAGraphMode, VllmConfig,
get_layers_from_vllm_config, set_current_vllm_config)
from vllm.forward_context import BatchDescriptor, get_forward_context
from vllm.forward_context import get_forward_context
from vllm.logger import init_logger
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.model_loader import get_model_loader
@@ -314,8 +314,7 @@ class MtpProposer(Proposer):
break

def generate_token_ids(self,
sampled_token_ids: Union[torch.Tensor,
list[np.ndarray]],
sampled_token_ids: torch.Tensor | list[list[int]],
sampling_metadata: SamplingMetadata = None,
scheduler_output: SchedulerOutput = None,
spec_decode_metadata: SpecDecodeMetadata = None,
@@ -392,7 +391,6 @@ class MtpProposer(Proposer):
common_attn_metadata.query_start_loc = \
query_start_loc_pcp_full[:num_reqs + 1]
if self.speculative_config.disable_padded_drafter_batch:
assert isinstance(sampled_token_ids, list)
# NOTE: Currently, MTP-fullgraph is incompatibility with pcp
token_indices_to_sample = None
common_attn_metadata, token_indices =\
@@ -451,7 +449,7 @@ class MtpProposer(Proposer):
def _prepare_inputs(
self,
common_attn_metadata: CommonAttentionMetadata,
sampled_token_ids: list[np.ndarray],
sampled_token_ids: list[list[int]],
num_draft_tokens: list[int],
) -> tuple[CommonAttentionMetadata, torch.Tensor]:
"""
@@ -695,13 +693,11 @@ class MtpProposer(Proposer):
2))) and (scheduler_output.total_num_scheduled_tokens
== self.runner.input_batch.num_reqs *
(self.num_speculative_tokens + 1))
batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
uniform_decode=uniform_decode)
else:
batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
uniform_decode=False)
uniform_decode = False
has_lora = len(self.runner.input_batch.lora_id_to_lora_request) > 0
aclgraph_runtime_mode, batch_descriptor = \
self.runner.aclgraph_dispatcher.dispatch(batch_descriptor)
self.runner.aclgraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
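Instead of constructing a BatchDescriptor at each call site, the aclgraph dispatcher is now handed the raw inputs and builds the descriptor itself. A consolidated sketch of the call-site shape, taken from the hunk above (names as they appear there):

```python
# Sketch of the new dispatch call (arguments taken from the hunk above).
has_lora = len(self.runner.input_batch.lora_id_to_lora_request) > 0
aclgraph_runtime_mode, batch_descriptor = \
    self.runner.aclgraph_dispatcher.dispatch(
        num_tokens=num_input_tokens,
        uniform_decode=uniform_decode,
        has_lora=has_lora,
    )
```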
if self.vllm_config.compilation_config.cudagraph_mode.has_full_cudagraphs(
) and aclgraph_runtime_mode == CUDAGraphMode.FULL:
@@ -929,7 +925,7 @@ class MtpProposer(Proposer):

def prepare_next_token_ids_cpu(
self,
sampled_token_ids: list[np.ndarray],
sampled_token_ids: list[list[int]],
requests: dict[str, CachedRequestState],
gpu_input_batch: InputBatch,
num_scheduled_tokens: dict[str, int],
@@ -944,7 +940,7 @@ class MtpProposer(Proposer):
req_ids = gpu_input_batch.req_ids
next_token_ids: list[int] = []
for i, token_ids in enumerate(sampled_token_ids):
if token_ids.shape[0] > 0:
if token_ids:
# Common case.
next_token_id = token_ids[-1]
else:
@@ -955,7 +951,7 @@ class MtpProposer(Proposer):
seq_len = req_state.num_computed_tokens + num_scheduled_tokens[
req_id]
next_token_id = req_state.get_token_id(seq_len)
next_token_ids.append(next_token_id.item())
next_token_ids.append(next_token_id)
next_token_ids = torch.tensor(next_token_ids,
dtype=torch.int32,
device=self.input_ids.device)

@@ -1,4 +1,3 @@
import numpy as np
import torch
from vllm.config import CUDAGraphMode
from vllm.v1.spec_decode.ngram_proposer import \
@@ -32,7 +31,7 @@ class NgramProposer(VllmNgramProposer, Proposer):
pass

def generate_token_ids(self,
valid_sampled_token_ids: list[np.ndarray],
valid_sampled_token_ids,
sampling_metadata=None,
scheduler_output=None,
spec_decode_metadata=None,
@@ -43,7 +42,7 @@ class NgramProposer(VllmNgramProposer, Proposer):
aux_hidden_states=None) -> list[list[int]]:
valid_ngram_requests = []
for i, sampled_ids in enumerate(valid_sampled_token_ids):
num_sampled_ids = sampled_ids.shape[0]
num_sampled_ids = len(sampled_ids)
if not num_sampled_ids:
continue

@@ -23,7 +23,7 @@ import torch.nn.functional as F
import vllm
from torch import nn
from transformers import Qwen2Config
from vllm.attention import AttentionMetadata, AttentionType
from vllm.attention.backends.abstract import AttentionMetadata, AttentionType
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (get_pp_group, tensor_model_parallel_all_gather,
@@ -40,6 +40,7 @@ from vllm.model_executor.models.qwen2 import Qwen2MLP, Qwen2Model
from vllm.model_executor.models.utils import (AutoWeightsLoader,
PPMissingLayer, maybe_prefix)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import set_default_rope_theta

from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.attention.attention_v1 import AscendAttentionState
@@ -72,11 +73,10 @@ class CustomQwen2Attention(Qwen2Attention):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_parameters: Optional[dict[str, Any]] = None,
max_position: int = 4096 * 32,
rope_theta: float = 10000,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
rope_scaling: Optional[tuple] = None,
prefix: str = "",
attn_type: str = AttentionType.DECODER,
dual_chunk_attention_config: Optional[dict[str, Any]] = None,
@@ -86,13 +86,13 @@ class CustomQwen2Attention(Qwen2Attention):
num_heads=num_heads,
num_kv_heads=num_kv_heads,
max_position=max_position,
rope_theta=rope_theta,
cache_config=cache_config,
quant_config=quant_config,
rope_scaling=rope_scaling,
prefix=prefix,
attn_type=attn_type,
dual_chunk_attention_config=dual_chunk_attention_config)
dual_chunk_attention_config=dual_chunk_attention_config,
rope_parameters=rope_parameters)

ascend_config = get_ascend_config()
self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled

@@ -145,9 +145,9 @@ class CustomQwen2DecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
# Requires transformers > 4.32.0
rope_theta = getattr(config, "rope_theta", 1000000)
rope_scaling = getattr(config, "rope_scaling", None)

set_default_rope_theta(config, default_theta=1000000)

dual_chunk_attention_config = getattr(config,
"dual_chunk_attention_config",
None)
@@ -166,10 +166,9 @@ class CustomQwen2DecoderLayer(nn.Module):
num_heads=config.num_attention_heads,
max_position=config.max_position_embeddings,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_parameters=config.rope_parameters,
cache_config=cache_config,
quant_config=quant_config,
rope_scaling=rope_scaling,
prefix=f"{prefix}.self_attn",
attn_type=attn_type,
dual_chunk_attention_config=dual_chunk_attention_config,
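Upstream folds rope_theta and rope_scaling into a single rope_parameters dict carried on the HF config, with set_default_rope_theta filling in the default. A condensed before/after sketch of the decoder-layer call, trimmed to the rope-related arguments only; the other arguments elided here still exist in the real code:

```python
# Before: theta and scaling travelled as separate arguments.
# self.self_attn = CustomQwen2Attention(..., rope_theta=rope_theta,
#                                        rope_scaling=rope_scaling, ...)

# After: the default theta is written into the config, then the dict is passed through.
set_default_rope_theta(config, default_theta=1000000)
self.self_attn = CustomQwen2Attention(
    hidden_size=config.hidden_size,
    num_heads=config.num_attention_heads,
    num_kv_heads=config.num_key_value_heads,
    max_position=config.max_position_embeddings,
    rope_parameters=config.rope_parameters,
    prefix=f"{prefix}.self_attn",
)
```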
@@ -21,7 +21,8 @@ from typing import Any, List, Optional, Union
import torch
from torch import nn
from transformers import PretrainedConfig
from vllm.attention import Attention, AttentionMetadata
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, CompilationMode, VllmConfig
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
@@ -137,8 +138,7 @@ class CustomQwen3MoeAttention(Qwen3MoeAttention):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: Optional[dict[str, Any]] = None,
rope_parameters: dict[str, Any],
max_position_embeddings: int = 8192,
head_dim: Optional[int] = None,
rms_norm_eps: float = 1e-06,
@@ -167,7 +167,6 @@ class CustomQwen3MoeAttention(Qwen3MoeAttention):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings

self.qkv_proj = QKVParallelLinear(hidden_size,
@@ -188,8 +187,7 @@ class CustomQwen3MoeAttention(Qwen3MoeAttention):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
)
self.attn = Attention(self.num_heads,
self.head_dim,
@@ -270,16 +268,13 @@ class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):

nn.Module.__init__(self)
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings",
8192)
self.self_attn = CustomQwen3MoeAttention(
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
rms_norm_eps=config.rms_norm_eps,
qkv_bias=getattr(config, 'attention_bias', False),

@@ -25,13 +25,13 @@
# # vllm-project/vllm/vllm/model_executor/models/deepseek_v2.py
# """Inference-only DeepseekV2/DeepseekV3 model."""

from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
from typing import Callable, Iterable, List, Optional, Tuple, Union

import torch
import torch_npu
from torch import nn
from transformers import PretrainedConfig
from vllm.attention import AttentionMetadata
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.attention.layer import MLAAttention
from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
@@ -492,8 +492,6 @@ class TorchairDeepseekV2MLAAttention(DeepseekV2MLAAttention):
v_head_dim: int,
q_lora_rank: Optional[int],
kv_lora_rank: int,
rope_theta: float = 10000,
rope_scaling: Optional[Dict[str, Any]] = None,
max_position_embeddings: int = 8192,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
@@ -518,7 +516,6 @@ class TorchairDeepseekV2MLAAttention(DeepseekV2MLAAttention):
self.first_k_dense_replace = config.first_k_dense_replace

self.scaling = self.qk_head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings

self.prefix = prefix
@@ -592,17 +589,17 @@ class TorchairDeepseekV2MLAAttention(DeepseekV2MLAAttention):
quant_config=quant_config,
prefix=f"{prefix}.o_proj")

if rope_scaling:
rope_scaling["rope_type"] = 'deepseek_yarn'
if config.rope_parameters["rope_type"] != "default":
config.rope_parameters["rope_type"] = "deepseek_yarn"
self.rotary_emb = get_rope(qk_rope_head_dim,
rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
is_neox_style=False)
if rope_scaling:
mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
scaling_factor = rope_scaling["factor"]
if config.rope_parameters["rope_type"] != "default":
mscale_all_dim = config.rope_parameters.get(
"mscale_all_dim", False)
scaling_factor = config.rope_parameters["factor"]
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
self.scaling = self.scaling * mscale * mscale
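When rope_parameters selects a non-default (DeepSeek yarn) rope, the attention scale is still corrected by the yarn mscale; only the source of the factors changes. A self-contained sketch of that arithmetic, where the numbers and the local yarn_get_mscale stand-in are assumptions for illustration rather than values from this commit:

```python
import math


def yarn_get_mscale(scale: float = 1.0, mscale: float = 1.0) -> float:
    # Local stand-in for the helper the model file already imports.
    if scale <= 1.0:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0


rope_parameters = {"rope_type": "deepseek_yarn", "factor": 40.0, "mscale_all_dim": 1.0}
scaling = 192 ** -0.5  # stands in for self.qk_head_dim ** -0.5
if rope_parameters["rope_type"] != "default":
    m = yarn_get_mscale(rope_parameters["factor"],
                        float(rope_parameters.get("mscale_all_dim", False)))
    scaling = scaling * m * m
```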
@@ -708,8 +705,6 @@ class TorchairDeepseekV2SFAAttention(DeepseekV2MLAAttention):
v_head_dim: int,
q_lora_rank: Optional[int],
kv_lora_rank: int,
rope_theta: float = 10000,
rope_scaling: Optional[Dict[str, Any]] = None,
max_position_embeddings: int = 8192,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
@@ -734,7 +729,6 @@ class TorchairDeepseekV2SFAAttention(DeepseekV2MLAAttention):
self.first_k_dense_replace = config.first_k_dense_replace

self.scaling = self.qk_head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings

self.prefix = prefix
@@ -814,17 +808,19 @@ class TorchairDeepseekV2SFAAttention(DeepseekV2MLAAttention):
return_bias=False,
)

if rope_scaling:
rope_scaling["rope_type"] = 'deepseek_yarn'
self.rotary_emb = get_rope(qk_rope_head_dim,
rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
is_neox_style=False)
if rope_scaling:
mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
scaling_factor = rope_scaling["factor"]
if config.rope_parameters["rope_type"] != "default":
config.rope_parameters["rope_type"] = "deepseek_yarn"
self.rotary_emb = get_rope(
qk_rope_head_dim,
rotary_dim=qk_rope_head_dim,
max_position=max_position_embeddings,
rope_parameters=config.rope_parameters,
is_neox_style=False,
)
if config.rope_parameters["rope_type"] != "default":
mscale_all_dim = config.rope_parameters.get(
"mscale_all_dim", False)
scaling_factor = config.rope_parameters["factor"]
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
self.scaling = self.scaling * mscale * mscale

@@ -921,8 +917,6 @@ class TorchairDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):
) -> None:
nn.Module.__init__(self)
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings",
8192)
# DecoderLayers are created with `make_layers` which passes the prefix
@@ -955,8 +949,6 @@ class TorchairDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):
q_lora_rank=config.q_lora_rank
if hasattr(config, "q_lora_rank") else None,
kv_lora_rank=config.kv_lora_rank,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,

@@ -24,7 +24,8 @@ import torch_npu
from torch import nn
from torch.nn import Parameter
from transformers import PretrainedConfig
from vllm.attention import Attention, AttentionMetadata
from vllm.attention.backends.abstract import AttentionMetadata
from vllm.attention.layer import Attention
from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import (divide, get_pp_group,
@@ -539,8 +540,7 @@ class PanguProMoEAttention(nn.Module):
hidden_size: int,
num_heads: int,
num_kv_heads: int,
rope_theta: float = 10000,
rope_scaling: Optional[Dict[str, Any]] = None,
rope_parameters: Dict[str, Any],
max_position_embeddings: int = 8192,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
@@ -566,7 +566,6 @@ class PanguProMoEAttention(nn.Module):
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5
self.rope_theta = rope_theta
self.max_position_embeddings = max_position_embeddings

self.qkv_proj = QKVParallelLinear(
@@ -600,8 +599,7 @@ class PanguProMoEAttention(nn.Module):
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=rope_parameters,
)
self.attn = Attention(
self.num_heads,
@@ -654,8 +652,6 @@ class PanguProMoEDecoderLayer(nn.Module):
) -> None:
super().__init__()
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings",
8192)

@@ -663,8 +659,7 @@ class PanguProMoEDecoderLayer(nn.Module):
hidden_size=self.hidden_size,
num_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
rope_parameters=config.rope_parameters,
max_position_embeddings=max_position_embeddings,
cache_config=cache_config,
quant_config=quant_config,

@@ -993,6 +993,7 @@ class TorchairAscendFusedMoE(FusedMoE):
tp_size=tp_size,
ep_size=ep_size,
dp_size=dp_size,
pcp_size=1,
prefix=prefix,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
@@ -1011,6 +1012,8 @@ class TorchairAscendFusedMoE(FusedMoE):
self.moe_parallel_config = FusedMoEParallelConfig.make(
tp_size_=(tp_size if tp_size is not None else
get_tensor_model_parallel_world_size()),
# TODO: support pcp
pcp_size_=1,
dp_size_=(dp_size
if dp_size is not None else get_dp_group().world_size),
vllm_parallel_config=vllm_config.parallel_config)

@@ -170,7 +170,7 @@ class AscendMLATorchairMetadataBuilder:
self.block_size = vllm_config.cache_config.block_size
self.max_blocks = (vllm_config.model_config.max_model_len +
self.block_size - 1) // self.block_size
self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
self.chunked_prefill_enabled = scheduler_config.enable_chunked_prefill
if self.chunked_prefill_enabled:
self.chunked_prefill_workspace_size = min(
# Max sure there is enough for 8 full length request or at least
@@ -1,13 +1,12 @@
import types

import numpy as np
import torch
import torch.nn as nn
import torchair
from torchair import patch_for_hcom
from vllm.config import (CUDAGraphMode, VllmConfig,
get_layers_from_vllm_config, set_current_vllm_config)
from vllm.forward_context import BatchDescriptor, get_forward_context
from vllm.forward_context import get_forward_context
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.model_loader import get_model_loader
from vllm.model_executor.model_loader.utils import \
@@ -149,7 +148,7 @@ class TorchairMtpProposer(MtpProposer):
break

def generate_token_ids(self,
valid_sampled_token_ids: list[np.ndarray],
valid_sampled_token_ids: list[list[int]],
sampling_metadata: SamplingMetadata = None,
scheduler_output: SchedulerOutput = None,
spec_decode_metadata: SpecDecodeMetadata = None,
@@ -162,7 +161,7 @@ class TorchairMtpProposer(MtpProposer):
attn_metadata = attn_metadata['model.layers.0.self_attn.attn']
next_token_ids: list[int] = []
for i, token_ids in enumerate(valid_sampled_token_ids):
if token_ids.shape[0] > 0:
if token_ids:
# Common case.
next_token_id = token_ids[-1]
else:
@@ -173,7 +172,7 @@ class TorchairMtpProposer(MtpProposer):
seq_len = (req_state.num_computed_tokens +
scheduler_output.num_scheduled_tokens[req_id])
next_token_id = req_state.get_token_id(seq_len)
next_token_ids.append(next_token_id.item())
next_token_ids.append(next_token_id)
next_token_ids = torch.tensor(next_token_ids,
dtype=torch.int32,
device=self.device)
@@ -189,7 +188,7 @@ class TorchairMtpProposer(MtpProposer):
# TODO(woosuk): Refactor this.
num_draft_tokens = spec_decode_metadata.num_draft_tokens
num_rejected_tokens = [
n + 1 - valid_sampled_token_ids[i].shape[0] if n > 0 else 0
n + 1 - len(valid_sampled_token_ids[i]) if n > 0 else 0
for i, n in enumerate(num_draft_tokens)
]
num_rejected_tokens = torch.tensor(
@@ -343,12 +342,7 @@ class TorchairMtpProposer(MtpProposer):
# torchair mode can reuse self.runner.num_tokens_across_dp
num_tokens_across_dp = self.runner.num_tokens_across_dp
with_prefill = self.runner.with_prefill

moe_comm_type = self.runner._select_moe_comm_method(num_input_tokens)
batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
uniform_decode=False)
aclgraph_runtime_mode, batch_descriptor = \
self.runner.aclgraph_dispatcher.dispatch(batch_descriptor)

for step in range(self.num_speculative_tokens):
with set_ascend_forward_context(
@@ -359,7 +353,6 @@ class TorchairMtpProposer(MtpProposer):
num_tokens_across_dp=num_tokens_across_dp,
reserved_mc2_mask=self.runner.reserved_mc2_mask,
moe_comm_type=moe_comm_type,
aclgraph_runtime_mode=aclgraph_runtime_mode,
in_profile_run=self.runner.in_profile_run,
num_actual_tokens=num_tokens):
with ProfileExecuteDuration().capture_async('mtp_forward'):

@@ -171,7 +171,7 @@ class AscendSFATorchairMetadataBuilder:
self.block_size = vllm_config.cache_config.block_size
self.max_blocks = (vllm_config.model_config.max_model_len +
self.block_size - 1) // self.block_size
self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
self.chunked_prefill_enabled = scheduler_config.enable_chunked_prefill
if self.chunked_prefill_enabled:
self.chunked_prefill_workspace_size = min(
# Max sure there is enough for 8 full length request or at least

@@ -483,6 +483,13 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
compilation_config.cudagraph_capture_sizes, None

# Calculate parallel configuration factor
if not vllm_config.model_config:
logger.warning(
"Got empty model config. This typically occurs when an empty vllm_config is "
"initialized (e.g., in unit tests), where config updates are intentionally skipped."
)

return
hf_config = vllm_config.model_config.hf_config
if hasattr(hf_config, 'num_hidden_layers'):
num_hidden_layers = hf_config.num_hidden_layers

@@ -39,9 +39,9 @@ import torch._dynamo.cache_size
import torch.distributed as dist
import torch.nn as nn
from tqdm import tqdm # type: ignore
from vllm.attention import AttentionType, get_attn_backend
from vllm.attention.backends.abstract import AttentionBackend
from vllm.attention.backends.abstract import AttentionBackend, AttentionType
from vllm.attention.layer import Attention, MLAAttention
from vllm.attention.selector import get_attn_backend
from vllm.compilation.counter import compilation_counter
from vllm.compilation.monitor import set_cudagraph_capturing_enabled
from vllm.config import (CompilationMode, CUDAGraphMode, VllmConfig,
@@ -53,7 +53,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
from vllm.distributed.parallel_state import (get_dcp_group, get_dp_group,
get_pp_group, get_tp_group,
is_global_first_rank)
from vllm.forward_context import BatchDescriptor, get_forward_context
from vllm.forward_context import get_forward_context
from vllm.logger import logger
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
from vllm.model_executor.layers.mamba.abstract import MambaBase
@@ -244,11 +244,9 @@ class AsyncNPUModelRunnerOutput(AsyncModelRunnerOutput):
# Release the device tensor once the copy has completed
del self._sampled_token_ids

valid_sampled_token_ids: list[np.ndarray] = [
row for row in self._sampled_token_ids_cpu.numpy()
]
valid_sampled_token_ids = self._sampled_token_ids_cpu.tolist()
for i in self._invalid_req_indices:
valid_sampled_token_ids[i] = np.array([])
valid_sampled_token_ids[i].clear()

output = self._model_runner_output
output.sampled_token_ids = valid_sampled_token_ids
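In the async output path the CPU copy of the sampled ids is now turned into nested Python lists with tolist(), and requests marked invalid keep their slot but have it emptied in place instead of being replaced with an empty numpy array. A minimal standalone sketch with toy data:

```python
import torch

sampled_token_ids_cpu = torch.tensor([[5, 9], [3, 3], [7, 1]])
invalid_req_indices = [1]

valid_sampled_token_ids = sampled_token_ids_cpu.tolist()  # list[list[int]]
for i in invalid_req_indices:
    valid_sampled_token_ids[i].clear()  # keep the slot, drop its tokens

print(valid_sampled_token_ids)  # [[5, 9], [], [7, 1]]
```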
@@ -332,7 +330,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# Ascend-specific configurations
self.ascend_config = get_ascend_config()
if self.ascend_config.ascend_scheduler_config.enabled:
self.chunked_prefill_enabled = self.scheduler_config.chunked_prefill_enabled
self.chunked_prefill_enabled = self.scheduler_config.enable_chunked_prefill
else:
self.chunked_prefill_enabled = True
self.weight_prefetch_method = WeightPrefetchMethod(
@@ -2130,7 +2128,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):

def propose_draft_token_ids(
self,
valid_sampled_token_ids: Union[torch.Tensor, list[np.ndarray]],
valid_sampled_token_ids: torch.Tensor | list[list[int]],
sampling_metadata: SamplingMetadata,
scheduler_output: "SchedulerOutput",
spec_decode_metadata: SpecDecodeMetadata,
@@ -2309,10 +2307,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
uniform_decode = (max_query_len == self.uniform_decode_query_len) and (
scheduler_output.total_num_scheduled_tokens
== self.input_batch.num_reqs * max_query_len)
batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
uniform_decode=uniform_decode)
has_lora = len(self.input_batch.lora_id_to_lora_request) > 0
aclgraph_runtime_mode, batch_descriptor = \
self.aclgraph_dispatcher.dispatch(batch_descriptor)
self.aclgraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)

# Run forward pass
with ProfileExecuteDuration().capture_async("forward"):
@@ -2510,18 +2507,16 @@ class NPUModelRunner(LoRAModelRunnerMixin):
max_gen_len = sampled_token_ids.shape[-1]
if max_gen_len == 1:
# No spec decode tokens. It's a tensor.
valid_sampled_token_ids: list[np.ndarray] = [
row for row in sampled_token_ids.cpu().numpy()
]
valid_sampled_token_ids = sampled_token_ids.tolist()
else:
# Includes spec decode tokens. It's a numpy array
valid_sampled_token_ids = self.rejection_sampler.parse_output(
valid_sampled_token_ids, _ = self.rejection_sampler.parse_output(
sampled_token_ids,
self.input_batch.vocab_size,
)
# Mask out the sampled tokens that should not be sampled.
for i in discard_sampled_tokens_req_indices:
valid_sampled_token_ids[int(i)] = np.array([])
valid_sampled_token_ids[int(i)].clear()
else:
valid_sampled_token_ids = []
invalid_req_indices = discard_sampled_tokens_req_indices.tolist(
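The rejection sampler's parse_output now returns a tuple, and only the first element (the per-request token-id lists) is consumed on this path; discarded requests again get their list cleared in place. The essential shape of that branch, condensed from the hunk above:

```python
# Sketch condensed from the hunk above; the second tuple element is unused here.
valid_sampled_token_ids, _ = self.rejection_sampler.parse_output(
    sampled_token_ids,
    self.input_batch.vocab_size,
)
for i in discard_sampled_tokens_req_indices:
    valid_sampled_token_ids[int(i)].clear()
```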
@@ -2547,17 +2542,16 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# the sampled tokens back, because there's no direct communication
# between the first-stage worker and the last-stage worker.
for req_idx in range(num_sampled_tokens):
sampled_ids: np.ndarray | None
if self.use_async_scheduling:
sampled_ids = (np.array([-1]) if req_idx
not in invalid_req_indices_set else None)
sampled_ids = [-1] * 1 if \
req_idx not in invalid_req_indices_set else None
else:
sampled_ids = valid_sampled_token_ids[req_idx]
if sampled_ids is None or sampled_ids.shape[0] == 0:
if not sampled_ids:
continue

start_idx = self.input_batch.num_tokens_no_spec[req_idx]
end_idx = start_idx + sampled_ids.shape[0]
end_idx = start_idx + len(sampled_ids)
assert end_idx <= self.model_config.max_model_len, (
"Sampled token IDs exceed the max model length. "
f"Total number of tokens: {end_idx} > max_model_len: "
@@ -2571,7 +2565,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
self.input_batch.num_tokens[req_idx] = end_idx
req_id = self.input_batch.req_ids[req_idx]
req_state = self.requests[req_id]
req_state.output_token_ids.extend(sampled_ids.tolist())
req_state.output_token_ids.extend(sampled_ids)

def propose_draft_token_ids(sampled_token_ids):
assert self.spec_decode_common_attn_metadata is not None
@@ -2877,7 +2871,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
assert aclgraph_runtime_mode is None or aclgraph_runtime_mode in {
CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL
}

# In multi-DP scenarios, there may be situations where all DP groups are executing dummy runs.
# If sequence parallelism is enabled, it is essential to ensure that num_tokens is divisible by tp_size.
if self.use_aclgraph and enable_sp(self.vllm_config):
@@ -2971,19 +2964,18 @@ class NPUModelRunner(LoRAModelRunnerMixin):
k: v[:num_tokens]
for k, v in self.intermediate_tensors.items()
})

has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
# filter out the valid batch descriptor
_ag_mode, batch_descriptor = \
self.aclgraph_dispatcher.dispatch(
BatchDescriptor(num_tokens=num_tokens,
uniform_decode=uniform_decode))
self.aclgraph_dispatcher.dispatch(num_tokens=num_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
if aclgraph_runtime_mode is not None:
# we allow forcing NONE when the dispatcher disagrees to support
# warm ups for aclgraph capture
assert aclgraph_runtime_mode == CUDAGraphMode.NONE or \
aclgraph_runtime_mode == _ag_mode, (
f"Aclgraph runtime mode mismatch at dummy_run. "
f"Expected {_ag_mode}, but got {aclgraph_runtime_mode}.")
if aclgraph_runtime_mode != CUDAGraphMode.NONE and aclgraph_runtime_mode != _ag_mode:
raise ValueError(
f"Aclgraph runtime mode mismatch at dummy_run. "
f"Expected {_ag_mode}, but got {aclgraph_runtime_mode}."
)
else:
aclgraph_runtime_mode = _ag_mode

@@ -4466,18 +4458,3 @@ class NPUModelRunner(LoRAModelRunnerMixin):
self.input_ids_pcp_full_cpu[:total_num_scheduled_tokens_pcp_full],
non_blocking=True,
)

def _to_list(self, sampled_token_ids: torch.Tensor) -> list[np.ndarray]:
# This is a short term mitigation for issue mentioned in
# https://github.com/vllm-project/vllm/issues/22754.
# `tolist` would trigger a cuda wise stream sync, which
# would block other copy ops from other cuda streams.
# A cuda event sync would avoid such a situation. Since
# this is in the critical path of every single model
# forward loop, this has caused perf issue for a disagg
# setup.
pinned = self.sampled_token_ids_pinned_cpu[:sampled_token_ids.shape[0]]
pinned.copy_(sampled_token_ids, non_blocking=True)
self.transfer_event.record()
self.transfer_event.synchronize()
return [row for row in pinned.numpy()]