[Main2Main] Upgrade vllm commit to 0109 (#5752)

### What this PR does / why we need it?
Upgrade vllm commit to 0109 (bde38c11df0ea066a740efe9b77fff5418be45df)

1. remove `init_cached_hf_modules` due to
https://github.com/vllm-project/vllm/pull/31786
2. fix the spec_decode e2e test broken by
https://github.com/vllm-project/vllm/pull/29821
3. fix the `vllm.v1.attention.backends.utils` import path due to
https://github.com/vllm-project/vllm/pull/31891
4. keep `self.seq_lens - query_lens` on the same device due to
https://github.com/vllm-project/vllm/pull/31773
5. skip the model_runner_v2 e2e test, which fails with `'_OpNamespace' '_C' object has
no attribute 'get_cuda_view_from_cpu_tensor'`

- vLLM version: v0.13.0
- vLLM main:
2f4e6548ef

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
zhangxinyuehfad
2026-01-13 19:14:43 +08:00
committed by GitHub
parent eed9e366a7
commit f7b904641e
21 changed files with 203 additions and 38 deletions

View File

@@ -119,7 +119,7 @@ jobs:
pytest -sv --durations=0 tests/e2e/singlecard/compile/test_norm_quant_fusion.py pytest -sv --durations=0 tests/e2e/singlecard/compile/test_norm_quant_fusion.py
# model_runner_v2 # model_runner_v2
pytest -sv --durations=0 tests/e2e/singlecard/model_runner_v2/test_basic.py # pytest -sv --durations=0 tests/e2e/singlecard/model_runner_v2/test_basic.py
# pooling # pooling
pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_classification.py pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_classification.py
@@ -309,7 +309,7 @@ jobs:
run: | run: |
pytest -sv --durations=0 tests/e2e/multicard/4-cards/test_data_parallel_tp2.py pytest -sv --durations=0 tests/e2e/multicard/4-cards/test_data_parallel_tp2.py
pytest -sv --durations=0 tests/e2e/multicard/4-cards/test_kimi_k2.py pytest -sv --durations=0 tests/e2e/multicard/4-cards/test_kimi_k2.py
pytest -sv --durations=0 tests/e2e/multicard/4-cards/test_qwen3_next.py pytest -sv --durations=0 tests/e2e/multicard/4-cards/test_qwen3_next.py
# long_sequence # long_sequence
pytest -sv --durations=0 tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py pytest -sv --durations=0 tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py

View File

@@ -37,7 +37,7 @@ jobs:
steps: steps:
- name: Get vLLM version - name: Get vLLM version
run: | run: |
VLLM_COMMIT=2f4e6548efec402b913ffddc8726230d9311948d VLLM_COMMIT=bde38c11df0ea066a740efe9b77fff5418be45df
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
- name: Checkout repository - name: Checkout repository

View File

@@ -75,7 +75,7 @@ jobs:
name: e2e-full name: e2e-full
strategy: strategy:
matrix: matrix:
vllm_version: [2f4e6548efec402b913ffddc8726230d9311948d, v0.13.0] vllm_version: [bde38c11df0ea066a740efe9b77fff5418be45df, v0.13.0]
needs: [changes] needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }} if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml uses: ./.github/workflows/_e2e_test.yaml

View File

@@ -41,7 +41,7 @@ jobs:
lint: lint:
uses: ./.github/workflows/_pre_commit.yml uses: ./.github/workflows/_pre_commit.yml
with: with:
vllm: 2f4e6548efec402b913ffddc8726230d9311948d vllm: bde38c11df0ea066a740efe9b77fff5418be45df
changes: changes:
runs-on: linux-aarch64-a2-0 runs-on: linux-aarch64-a2-0
outputs: outputs:
@@ -81,7 +81,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy: strategy:
matrix: matrix:
vllm_version: [2f4e6548efec402b913ffddc8726230d9311948d, v0.13.0] vllm_version: [bde38c11df0ea066a740efe9b77fff5418be45df, v0.13.0]
uses: ./.github/workflows/_unit_test.yaml uses: ./.github/workflows/_unit_test.yaml
with: with:
vllm: ${{ matrix.vllm_version }} vllm: ${{ matrix.vllm_version }}
@@ -93,7 +93,7 @@ jobs:
name: e2e-light name: e2e-light
strategy: strategy:
matrix: matrix:
vllm_version: [2f4e6548efec402b913ffddc8726230d9311948d, v0.13.0] vllm_version: [bde38c11df0ea066a740efe9b77fff5418be45df, v0.13.0]
# Note (yikun): If CI resource are limited we can split job into two chain jobs # Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes] needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request. # only trigger e2e test after lint passed and the change is e2e related with pull request.

View File

@@ -33,7 +33,7 @@ jobs:
name: refresh codecov name: refresh codecov
strategy: strategy:
matrix: matrix:
vllm_version: [2f4e6548efec402b913ffddc8726230d9311948d] vllm_version: [bde38c11df0ea066a740efe9b77fff5418be45df]
uses: ./.github/workflows/_unit_test.yaml uses: ./.github/workflows/_unit_test.yaml
with: with:
vllm: ${{ matrix.vllm_version }} vllm: ${{ matrix.vllm_version }}

View File

@@ -51,7 +51,7 @@ If you're using v0.7.3, don't forget to install [mindie-turbo](https://pypi.org/
For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly. For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly.
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------| |-------------|--------------|------------------|-------------|--------------------|
| main | 2f4e6548efec402b913ffddc8726230d9311948d, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 | | main | bde38c11df0ea066a740efe9b77fff5418be45df, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
## Release cadence ## Release cadence

View File

@@ -305,15 +305,16 @@ def test_rmsnorm_quant_fusion(
vllm_config = VllmConfig(model_config=ModelConfig(dtype=dtype)) vllm_config = VllmConfig(model_config=ModelConfig(dtype=dtype))
update_environment_variables({ with vllm.config.set_current_vllm_config(vllm_config):
"RANK": "0", update_environment_variables({
"LOCAL_RANK": "0", "RANK": "0",
"WORLD_SIZE": "1", "LOCAL_RANK": "0",
"MASTER_ADDR": "localhost", "WORLD_SIZE": "1",
"MASTER_PORT": "12345", "MASTER_ADDR": "localhost",
}) "MASTER_PORT": "12345",
init_distributed_environment() })
ensure_model_parallel_initialized(1, 1) init_distributed_environment()
ensure_model_parallel_initialized(1, 1)
with vllm.config.set_current_vllm_config(vllm_config): with vllm.config.set_current_vllm_config(vllm_config):
with set_ascend_forward_context(None, vllm_config): with set_ascend_forward_context(None, vllm_config):

View File

@@ -33,6 +33,11 @@ class TestAscendAttentionCPImpl(TestBase):
self.layer_no_quant.layer_name = "test_layer" self.layer_no_quant.layer_name = "test_layer"
self.layer_no_quant._k_scale_float = 1.0 self.layer_no_quant._k_scale_float = 1.0
self.layer_no_quant._v_scale_float = 1.0 self.layer_no_quant._v_scale_float = 1.0
self.mock_vllm_config = MagicMock()
self.config_patcher = patch(
'vllm_ascend.attention.attention_v1.get_current_vllm_config',
return_value=self.mock_vllm_config)
self.config_patcher.start()
self.impl = AscendAttentionCPImpl( self.impl = AscendAttentionCPImpl(
num_heads=8, num_heads=8,

View File

@@ -13,6 +13,23 @@ from vllm_ascend.utils import AscendDeviceType
class TestAscendAttentionBackend(TestBase): class TestAscendAttentionBackend(TestBase):
def setUp(self):
self.mock_config = MagicMock()
mock_parallel_config = MagicMock()
mock_parallel_config.prefill_context_parallel_size = 1
mock_parallel_config.decode_context_parallel_size = 1
self.mock_config.parallel_config = mock_parallel_config
self.utils_patcher = patch(
'vllm_ascend.attention.utils.get_current_vllm_config',
return_value=self.mock_config)
self.utils_patcher.start()
from vllm_ascend.attention.utils import enable_cp
enable_cp.cache_clear()
def test_get_name(self): def test_get_name(self):
self.assertEqual(AscendAttentionBackend.get_name(), "CUSTOM") self.assertEqual(AscendAttentionBackend.get_name(), "CUSTOM")
@@ -102,6 +119,19 @@ class TestAscendAttentionMetadataBuilder(TestBase):
class TestAscendAttentionBackendImpl(TestBase): class TestAscendAttentionBackendImpl(TestBase):
def setUp(self): def setUp(self):
self.mock_event = MagicMock()
self.mock_event.record.return_value = None
self.mock_event.wait.return_value = None
self.mock_stream = MagicMock()
self.event_patcher = patch('torch_npu.npu.Event',
return_value=self.mock_event)
self.stream_patcher = patch('torch_npu.npu.current_stream',
return_value=self.mock_stream)
self.event_patcher.start()
self.stream_patcher.start()
self.layer = MagicMock() self.layer = MagicMock()
self.layer.layer_name = "test_layer" self.layer.layer_name = "test_layer"
self.layer._k_scale_float = 1.0 self.layer._k_scale_float = 1.0
@@ -119,6 +149,11 @@ class TestAscendAttentionBackendImpl(TestBase):
self.layer_no_quant.layer_name = "test_layer" self.layer_no_quant.layer_name = "test_layer"
self.layer_no_quant._k_scale_float = 1.0 self.layer_no_quant._k_scale_float = 1.0
self.layer_no_quant._v_scale_float = 1.0 self.layer_no_quant._v_scale_float = 1.0
self.mock_vllm_config = MagicMock()
self.config_patcher = patch(
'vllm_ascend.attention.attention_v1.get_current_vllm_config',
return_value=self.mock_vllm_config)
self.config_patcher.start()
self.impl = AscendAttentionBackendImpl( self.impl = AscendAttentionBackendImpl(
num_heads=8, num_heads=8,

View File

@@ -22,6 +22,23 @@ from vllm_ascend.utils import vllm_version_is
class TestAscendMLABackend(TestBase): class TestAscendMLABackend(TestBase):
def setUp(self):
self.mock_config = MagicMock()
mock_parallel_config = MagicMock()
mock_parallel_config.prefill_context_parallel_size = 1
mock_parallel_config.decode_context_parallel_size = 1
self.mock_config.parallel_config = mock_parallel_config
self.utils_patcher = patch(
'vllm_ascend.attention.utils.get_current_vllm_config',
return_value=self.mock_config)
self.utils_patcher.start()
from vllm_ascend.attention.utils import enable_cp
enable_cp.cache_clear()
def test_get_name(self): def test_get_name(self):
self.assertEqual(AscendMLABackend.get_name(), "ASCEND_MLA") self.assertEqual(AscendMLABackend.get_name(), "ASCEND_MLA")

View File

@@ -12,6 +12,7 @@ if 'torch_npu._inductor' not in sys.modules:
from vllm_ascend.attention.sfa_v1 import (AscendSFABackend, AscendSFAImpl, from vllm_ascend.attention.sfa_v1 import (AscendSFABackend, AscendSFAImpl,
AscendSFAMetadata, AscendSFAMetadata,
AscendSFAMetadataBuilder) AscendSFAMetadataBuilder)
from vllm_ascend.utils import enable_dsa_cp
class TestAscendSFABackend(TestBase): class TestAscendSFABackend(TestBase):
@@ -83,6 +84,27 @@ class TestAscendSFAMetadata(TestBase):
class TestAscendSFAMetadataBuilder(TestBase): class TestAscendSFAMetadataBuilder(TestBase):
def setUp(self):
self.mock_cfg = MagicMock()
self.mock_cfg.parallel_config = MagicMock()
self.mock_cfg.parallel_config.tensor_parallel_size = 1
self.mock_cfg.parallel_config.prefill_context_parallel_size = 1
self.mock_cfg.parallel_config.decode_context_parallel_size = 1
self.mock_cfg.compilation_config = MagicMock()
self.mock_cfg.compilation_config.pass_config = MagicMock()
self.mock_cfg.compilation_config.pass_config.enable_sp = False
self.mock_cfg.speculative_config.num_speculative_tokens = 0
self.patcher = patch("vllm.config.get_current_vllm_config",
return_value=self.mock_cfg)
self.patcher.start()
if hasattr(enable_dsa_cp, "cache_clear"):
enable_dsa_cp.cache_clear()
def test_ascend_sfa_metadata_builder_default(self): def test_ascend_sfa_metadata_builder_default(self):
kv_cache_spec = MagicMock() kv_cache_spec = MagicMock()
layer_names = ["layer1", "layer2"] layer_names = ["layer1", "layer2"]

View File

@@ -13,10 +13,11 @@
# This file is a part of the vllm-ascend project. # This file is a part of the vllm-ascend project.
# #
from unittest.mock import patch from unittest.mock import MagicMock, patch
import pytest import pytest
import torch import torch
from vllm.config import set_current_vllm_config
from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul
from vllm_ascend.utils import AscendDeviceType from vllm_ascend.utils import AscendDeviceType
@@ -27,8 +28,20 @@ def dummy_tensor():
return torch.randn(4, 8, dtype=torch.float16) return torch.randn(4, 8, dtype=torch.float16)
@pytest.fixture
def default_vllm_config():
mock_config = MagicMock()
mock_config.compilation_config.dispatch_forward_backend = "eager"
mock_config.compilation_config.custom_ops = ["all"]
with set_current_vllm_config(mock_config):
yield mock_config
@patch("torch_npu.npu_fast_gelu", side_effect=lambda x: x + 1) @patch("torch_npu.npu_fast_gelu", side_effect=lambda x: x + 1)
def test_QuickGELU_forward(mock_gelu, dummy_tensor): def test_QuickGELU_forward(mock_gelu, dummy_tensor, default_vllm_config):
layer = QuickGELU() layer = QuickGELU()
out = layer.forward(dummy_tensor) out = layer.forward(dummy_tensor)
@@ -45,7 +58,7 @@ def test_QuickGELU_forward(mock_gelu, dummy_tensor):
side_effect=lambda x: None) side_effect=lambda x: None)
def test_SiluAndMul_forward(mock_maybe_prefetch_mlp_down_proj, def test_SiluAndMul_forward(mock_maybe_prefetch_mlp_down_proj,
mock_maybe_wait_prefetch_done, mock_swiglu, mock_maybe_wait_prefetch_done, mock_swiglu,
is_310p, dummy_tensor): is_310p, dummy_tensor, default_vllm_config):
with patch("vllm_ascend.utils.get_ascend_device_type", with patch("vllm_ascend.utils.get_ascend_device_type",
return_value=AscendDeviceType._310P return_value=AscendDeviceType._310P

View File

@@ -1,7 +1,8 @@
from unittest.mock import patch from unittest.mock import MagicMock, patch
import pytest import pytest
import torch import torch
from vllm.config import set_current_vllm_config
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm_ascend.utils import AscendDeviceType from vllm_ascend.utils import AscendDeviceType
@@ -20,13 +21,22 @@ def mock_add_rms_norm(x, residual, weight, eps):
return 2 * x, None, 2 * residual return 2 * x, None, 2 * residual
@pytest.fixture(autouse=True)
def default_vllm_config():
mock_config = MagicMock()
mock_config.compilation_config.custom_ops = ["all"]
with set_current_vllm_config(mock_config):
yield mock_config
@pytest.mark.parametrize("is_310p", [True, False]) @pytest.mark.parametrize("is_310p", [True, False])
@pytest.mark.parametrize("residual", @pytest.mark.parametrize("residual",
[None, torch.randn(4, 8, dtype=torch.float32)]) [None, torch.randn(4, 8, dtype=torch.float32)])
@patch("torch_npu.npu_rms_norm", side_effect=mock_rms_norm) @patch("torch_npu.npu_rms_norm", side_effect=mock_rms_norm)
@patch("torch_npu.npu_add_rms_norm", side_effect=mock_add_rms_norm) @patch("torch_npu.npu_add_rms_norm", side_effect=mock_add_rms_norm)
def test_RMSNorm_forward(mock_add_rmsnorm, mock_rmsnorm, is_310p, residual, def test_RMSNorm_forward(mock_add_rmsnorm, mock_rmsnorm, is_310p, residual,
dummy_tensor): dummy_tensor, default_vllm_config):
with patch("vllm_ascend.utils.get_ascend_device_type", with patch("vllm_ascend.utils.get_ascend_device_type",
return_value=AscendDeviceType._310P return_value=AscendDeviceType._310P

View File

@@ -78,6 +78,12 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
def setUp(self): def setUp(self):
# Common setup for tests # Common setup for tests
self.config_patcher = patch('vllm.config.vllm.get_current_vllm_config')
self.mock_get_config = self.config_patcher.start()
mock_config = MagicMock()
mock_config.compilation_config.custom_ops = ["all"]
self.mock_get_config.return_value = mock_config
self.positions = torch.tensor([1, 2, 3]) self.positions = torch.tensor([1, 2, 3])
self.query = torch.randn(3, 1, 32, dtype=torch.float16) self.query = torch.randn(3, 1, 32, dtype=torch.float16)
self.key = torch.randn(3, 1, 32, dtype=torch.float16) self.key = torch.randn(3, 1, 32, dtype=torch.float16)
@@ -242,6 +248,12 @@ class TestAscendDeepseekScalingRotaryEmbedding(TestBase):
def setUp(self): def setUp(self):
# Common setup for tests # Common setup for tests
self.config_patcher = patch('vllm.config.vllm.get_current_vllm_config')
self.mock_get_config = self.config_patcher.start()
mock_config = MagicMock()
mock_config.compilation_config.custom_ops = ["all"]
self.mock_get_config.return_value = mock_config
self.positions = torch.tensor([1, 2, 3]) self.positions = torch.tensor([1, 2, 3])
self.query = torch.randn(3, 1, 32, dtype=torch.float16) self.query = torch.randn(3, 1, 32, dtype=torch.float16)
self.key = torch.randn(3, 1, 32, dtype=torch.float16) self.key = torch.randn(3, 1, 32, dtype=torch.float16)
@@ -368,7 +380,11 @@ class TestAscendDeepseekScalingRotaryEmbedding(TestBase):
class TestAscendMRotaryEmbedding(unittest.TestCase): class TestAscendMRotaryEmbedding(unittest.TestCase):
def setUp(self): def setUp(self):
# Common setup for tests self.config_patcher = patch('vllm.config.vllm.get_current_vllm_config')
self.mock_get_config = self.config_patcher.start()
mock_config = MagicMock()
mock_config.compilation_config.custom_ops = ["all"]
self.mock_get_config.return_value = mock_config
self.number_tokens = 3 self.number_tokens = 3
self.num_head = 8 self.num_head = 8
self.num_kvhead = 8 self.num_kvhead = 8

View File

@@ -29,6 +29,23 @@ from vllm_ascend.ops.fused_moe.token_dispatcher import ( # isort: skip
class TestTokenDispatcherWithMC2(TestBase): class TestTokenDispatcherWithMC2(TestBase):
def setUp(self): def setUp(self):
self.config_patcher = patch(
'vllm_ascend.ops.fused_moe.token_dispatcher.get_current_vllm_config'
)
self.mock_get_config = self.config_patcher.start()
mock_config = MagicMock()
mock_config.scheduler_config.max_num_seqs = 256
mock_config.scheduler_config.decode_max_num_seqs = 256
mock_config.compilation_config.custom_ops = ["all"]
mock_config.speculative_config = None
mock_config.parallel_config.tensor_parallel_size = 1
self.mock_get_config.return_value = mock_config
self.mc2_group = MagicMock() self.mc2_group = MagicMock()
self.mc2_group.device_group.return_value._get_backend.return_value.get_hccl_comm_name.return_value = "hccl_123" self.mc2_group.device_group.return_value._get_backend.return_value.get_hccl_comm_name.return_value = "hccl_123"
self.mc2_group.rank_in_group = 0 self.mc2_group.rank_in_group = 0

View File

@@ -208,6 +208,15 @@ class TestCustomVocabParallelEmbedding(unittest.TestCase):
class TestAscendLogitsProcessor(unittest.TestCase): class TestAscendLogitsProcessor(unittest.TestCase):
def setUp(self): def setUp(self):
self.mock_vllm_config = MagicMock()
self.mock_vllm_config.compilation_config.custom_ops = ["all"]
from vllm.config.vllm import set_current_vllm_config
set_current_vllm_config(self.mock_vllm_config)
self.config_patch = patch("vllm.config.vllm.get_current_vllm_config",
return_value=self.mock_vllm_config)
self.config_patch.start()
self.vocab_size = 50 self.vocab_size = 50
self.num_embeddings = 50 self.num_embeddings = 50
self.embedding_dim = 10 self.embedding_dim = 10

View File

@@ -5,6 +5,7 @@ import torch
from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig
from tests.ut.base import TestBase from tests.ut.base import TestBase
from vllm_ascend.utils import vllm_version_is
init_cached_hf_modules_path = "vllm.utils.import_utils.init_cached_hf_modules" init_cached_hf_modules_path = "vllm.utils.import_utils.init_cached_hf_modules"
@@ -52,7 +53,7 @@ class TestNPUWorker(TestBase):
@patch("vllm_ascend.worker.worker.get_ascend_config") @patch("vllm_ascend.worker.worker.get_ascend_config")
@patch("vllm_ascend.worker.worker.init_ascend_config") @patch("vllm_ascend.worker.worker.init_ascend_config")
@patch("vllm_ascend.worker.worker.check_ascend_device_type") @patch("vllm_ascend.worker.worker.check_ascend_device_type")
@patch(init_cached_hf_modules_path) @patch(init_cached_hf_modules_path, create=True)
@patch("vllm_ascend.worker.worker.NPUWorker._init_profiler") @patch("vllm_ascend.worker.worker.NPUWorker._init_profiler")
def test_init_npu_worker_normal_case( def test_init_npu_worker_normal_case(
self, self,
@@ -106,7 +107,7 @@ class TestNPUWorker(TestBase):
@patch("vllm_ascend.worker.worker.get_ascend_config") @patch("vllm_ascend.worker.worker.get_ascend_config")
@patch("vllm_ascend.worker.worker.init_ascend_config") @patch("vllm_ascend.worker.worker.init_ascend_config")
@patch("vllm_ascend.worker.worker.check_ascend_device_type") @patch("vllm_ascend.worker.worker.check_ascend_device_type")
@patch(init_cached_hf_modules_path) @patch(init_cached_hf_modules_path, create=True)
@patch("vllm_ascend.worker.worker.NPUWorker._init_profiler") @patch("vllm_ascend.worker.worker.NPUWorker._init_profiler")
def test_init_npu_worker_with_trust_remote_code( def test_init_npu_worker_with_trust_remote_code(
self, self,
@@ -140,7 +141,10 @@ class TestNPUWorker(TestBase):
) )
# Verify init_cached_hf_modules is called (trust_remote_code=True) # Verify init_cached_hf_modules is called (trust_remote_code=True)
mock_init_cached_hf_modules.assert_called_once() if vllm_version_is('0.13.0'):
mock_init_cached_hf_modules.assert_called_once()
else:
mock_init_cached_hf_modules.assert_not_called()
@patch("vllm_ascend.utils.adapt_patch") @patch("vllm_ascend.utils.adapt_patch")
@patch("vllm_ascend.ops") @patch("vllm_ascend.ops")
@@ -149,7 +153,7 @@ class TestNPUWorker(TestBase):
@patch("vllm_ascend.worker.worker.get_ascend_config") @patch("vllm_ascend.worker.worker.get_ascend_config")
@patch("vllm_ascend.worker.worker.init_ascend_config") @patch("vllm_ascend.worker.worker.init_ascend_config")
@patch("vllm_ascend.worker.worker.check_ascend_device_type") @patch("vllm_ascend.worker.worker.check_ascend_device_type")
@patch(init_cached_hf_modules_path) @patch(init_cached_hf_modules_path, create=True)
@patch("vllm_ascend.worker.worker.NPUWorker._init_profiler") @patch("vllm_ascend.worker.worker.NPUWorker._init_profiler")
def test_init_npu_worker_with_custom_cache_dtype( def test_init_npu_worker_with_custom_cache_dtype(
self, self,
@@ -813,10 +817,11 @@ class TestNPUWorker(TestBase):
mock_scheduler_output, None) mock_scheduler_output, None)
self.assertEqual(result, mock_model_output) self.assertEqual(result, mock_model_output)
@patch("vllm_ascend.worker.worker.enable_sp", return_value=False)
@patch("vllm_ascend.worker.worker.get_pp_group") @patch("vllm_ascend.worker.worker.get_pp_group")
@patch("vllm_ascend.worker.worker.get_tp_group") @patch("vllm_ascend.worker.worker.get_tp_group")
def test_execute_model_middle_rank(self, mock_get_tp_group, def test_execute_model_middle_rank(self, mock_get_tp_group,
mock_get_pp_group): mock_get_pp_group, mock_enable_sp):
"""Test execute_model method - middle rank case""" """Test execute_model method - middle rank case"""
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
@@ -1113,12 +1118,14 @@ class TestNPUWorker(TestBase):
worker.model_runner.initialize_kv_cache.assert_called_once_with( worker.model_runner.initialize_kv_cache.assert_called_once_with(
mock_kv_cache_config) mock_kv_cache_config)
@patch("vllm_ascend.worker.worker.enable_sp", return_value=False)
@patch("vllm_ascend.worker.worker.get_pp_group") @patch("vllm_ascend.worker.worker.get_pp_group")
@patch("vllm_ascend.worker.worker.get_tp_group") @patch("vllm_ascend.worker.worker.get_tp_group")
@patch("vllm_ascend.worker.worker.EMPTY_MODEL_RUNNER_OUTPUT") @patch("vllm_ascend.worker.worker.EMPTY_MODEL_RUNNER_OUTPUT")
def test_execute_model_kv_connector_not_finished(self, mock_empty_output, def test_execute_model_kv_connector_not_finished(self, mock_empty_output,
mock_get_tp_group, mock_get_tp_group,
mock_get_pp_group): mock_get_pp_group,
mock_enable_sp):
"""Test execute_model method - kv_connector_output not finished sending/recving case""" """Test execute_model method - kv_connector_output not finished sending/recving case"""
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors

View File

@@ -6,7 +6,6 @@ import torch
import torch_npu import torch_npu
import vllm.envs as envs_vllm import vllm.envs as envs_vllm
from vllm.attention.backends.abstract import AttentionBackend, MLAAttentionImpl from vllm.attention.backends.abstract import AttentionBackend, MLAAttentionImpl
from vllm.attention.backends.utils import PAD_SLOT_ID
from vllm.config import VllmConfig, get_current_vllm_config from vllm.config import VllmConfig, get_current_vllm_config
from vllm.forward_context import ForwardContext, get_forward_context from vllm.forward_context import ForwardContext, get_forward_context
from vllm.logger import logger from vllm.logger import logger
@@ -39,12 +38,17 @@ from vllm_ascend.ops.rotary_embedding import get_cos_and_sin_mla
from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, maybe_trans_nz, from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, maybe_trans_nz,
weak_ref_tensors) vllm_version_is, weak_ref_tensors)
from vllm_ascend.worker.npu_input_batch import NPUInputBatch from vllm_ascend.worker.npu_input_batch import NPUInputBatch
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
if vllm_version_is('0.13.0'):
from vllm.attention.backends.utils import PAD_SLOT_ID # type: ignore
else:
from vllm.v1.attention.backends.utils import PAD_SLOT_ID # type: ignore
MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024 MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024
BUILD_METADATA_STEP_PREFILL = 0 BUILD_METADATA_STEP_PREFILL = 0
BUILD_METADATA_STEP_DECODE = 1 BUILD_METADATA_STEP_DECODE = 1

View File

@@ -13,7 +13,13 @@ import torch
import torch.nn.functional as F import torch.nn.functional as F
import triton import triton
import triton.language as tl import triton.language as tl
from vllm.attention.backends.utils import PAD_SLOT_ID
from vllm_ascend.utils import vllm_version_is
if vllm_version_is('0.13.0'):
from vllm.attention.backends.utils import PAD_SLOT_ID # type: ignore
else:
from vllm.v1.attention.backends.utils import PAD_SLOT_ID # type: ignore
def causal_conv1d_ref( def causal_conv1d_ref(

View File

@@ -1670,6 +1670,8 @@ class NPUModelRunner(GPUModelRunner):
attn_metadata, attn_metadata,
aux_hidden_states, aux_hidden_states,
) )
if not vllm_version_is('0.13.0'):
self._copy_draft_token_ids_to_cpu(scheduler_output)
( (
logprobs_lists, logprobs_lists,
@@ -1983,7 +1985,7 @@ class NPUModelRunner(GPUModelRunner):
query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs + query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs +
1], 1],
_seq_lens_cpu=self.seq_lens.cpu[:num_reqs], _seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
seq_lens=self.seq_lens.cpu[:num_reqs], seq_lens=self.seq_lens.gpu[:num_reqs],
num_reqs=num_reqs, num_reqs=num_reqs,
num_actual_tokens=num_tokens, num_actual_tokens=num_tokens,
block_table_tensor=block_table_tensor[:num_reqs], block_table_tensor=block_table_tensor[:num_reqs],

View File

@@ -121,11 +121,12 @@ class NPUWorker(WorkerBase):
self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
self.cache_config.cache_dtype] self.cache_config.cache_dtype]
if self.model_config.trust_remote_code: if vllm_version_is('0.13.0'):
# note: lazy import to avoid importing torch before initializing if self.model_config.trust_remote_code:
from vllm.utils.import_utils import init_cached_hf_modules # note: lazy import to avoid importing torch before initializing
from vllm.utils.import_utils import init_cached_hf_modules
init_cached_hf_modules() init_cached_hf_modules()
self.profiler = self._init_profiler() self.profiler = self._init_profiler()
if vllm_config.model_config and vllm_config.model_config.enable_sleep_mode: if vllm_config.model_config and vllm_config.model_config.enable_sleep_mode: