[main2main] upgrade vllm main 0202 (#6560)

### What this PR does / why we need it?
1. Fix `TypeError: FusedMoEParallelConfig.__init__() missing 1 required
positional argument: 'is_sequence_parallel'` due to
https://github.com/vllm-project/vllm/pull/32567
2. Fix `TypeError: '>' not supported between instances of 'MagicMock'
and 'int'` due to https://github.com/vllm-project/vllm/pull/33035
3. Fix `TypeError: Can't instantiate abstract class AscendMLAImpl with
abstract methods forward_mha, forward_mqa` and `AttributeError: 'bool'
object has no attribute 'process_weights_after_loading'` due to
https://github.com/vllm-project/vllm/pull/33284
4. Fix `'AscendSharedFusedMoE' object has no attribute
'_routed_input_transform'` due to
https://github.com/vllm-project/vllm/pull/32790
5. Fix `NPUModelRunner._dummy_run() got an unexpected keyword argument
'num_active_loras'` due to
https://github.com/vllm-project/vllm/pull/32005
6. Fix the problem caused by `'tuple' object has no attribute 'job_id'`
due to https://github.com/vllm-project/vllm/pull/27492
7. Fix the problem that `all_moe_layers` is not equal to `vllm.moe_forward`,
`vllm.moe_forward_shared` due to
https://github.com/vllm-project/vllm/pull/33184
8. Add a patch to fix the error `got multiple values for keyword
argument 'add_special_tokens'` due to
https://github.com/vllm-project/vllm/pull/32863

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.15.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.15.0

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Meihan-chen <jcccx.cmh@gmail.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
This commit is contained in:
meihanc
2026-02-05 19:31:17 +08:00
committed by GitHub
parent 2c1608265b
commit 922e5c163b
28 changed files with 246 additions and 30 deletions

View File

@@ -38,6 +38,7 @@ jobs:
repository: vllm-project/vllm repository: vllm-project/vllm
path: ./vllm-empty path: ./vllm-empty
ref: ${{ inputs.vllm }} ref: ${{ inputs.vllm }}
- uses: dorny/paths-filter@v3 - uses: dorny/paths-filter@v3
id: filter id: filter
with: with:
@@ -62,10 +63,12 @@ jobs:
run: | run: |
git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend
pre-commit run --all-files --hook-stage manual --show-diff-on-failure pre-commit run --all-files --hook-stage manual --show-diff-on-failure
- name: Run mypy - name: Run mypy
run: | run: |
PYTHONPATH="$PYTHONPATH:$(pwd)/vllm-empty" PYTHONPATH="$PYTHONPATH:$(pwd)/vllm-empty"
export PYTHONPATH export PYTHONPATH
env
git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend
# Run mypy for Python 3.10, 3.11, 3.12 manually # Run mypy for Python 3.10, 3.11, 3.12 manually
# Note: We are now separating mypy from pre-commit hooks for performance reasons. # Note: We are now separating mypy from pre-commit hooks for performance reasons.

View File

@@ -37,7 +37,7 @@ jobs:
steps: steps:
- name: Get vLLM version - name: Get vLLM version
run: | run: |
VLLM_COMMIT=v0.15.0 VLLM_COMMIT=d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV" echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"
- name: Checkout repository - name: Checkout repository

View File

@@ -27,7 +27,7 @@ RUN apt-get update -y && \
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# For lint purpose, actually we need make a main2main matching. # For lint purpose, actually we need make a main2main matching.
ARG VLLM_COMMIT=v0.15.0 ARG VLLM_COMMIT=d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a
RUN git clone $VLLM_REPO /vllm-workspace/vllm && \ RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
cd /vllm-workspace/vllm && \ cd /vllm-workspace/vllm && \
git checkout $VLLM_COMMIT git checkout $VLLM_COMMIT

View File

@@ -75,7 +75,7 @@ jobs:
name: e2e-full name: e2e-full
strategy: strategy:
matrix: matrix:
vllm_version: [v0.15.0] vllm_version: [d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a, v0.15.0]
needs: [changes] needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml uses: ./.github/workflows/_e2e_test.yaml

View File

@@ -41,7 +41,7 @@ jobs:
lint: lint:
uses: ./.github/workflows/_pre_commit.yml uses: ./.github/workflows/_pre_commit.yml
with: with:
vllm: v0.15.0 vllm: d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a
changes: changes:
runs-on: linux-aarch64-a2-0 runs-on: linux-aarch64-a2-0
outputs: outputs:
@@ -87,7 +87,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy: strategy:
matrix: matrix:
vllm_version: [v0.15.0] vllm_version: [d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a, v0.15.0]
uses: ./.github/workflows/_unit_test.yaml uses: ./.github/workflows/_unit_test.yaml
with: with:
vllm: ${{ matrix.vllm_version }} vllm: ${{ matrix.vllm_version }}
@@ -99,7 +99,7 @@ jobs:
name: e2e-light name: e2e-light
strategy: strategy:
matrix: matrix:
vllm_version: [v0.15.0] vllm_version: [d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a, v0.15.0]
# Note (yikun): If CI resource are limited we can split job into two chain jobs # Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes] needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request. # only trigger e2e test after lint passed and the change is e2e related with pull request.

View File

@@ -33,7 +33,7 @@ jobs:
name: refresh codecov name: refresh codecov
strategy: strategy:
matrix: matrix:
vllm_version: [v0.15.0] vllm_version: [d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a]
uses: ./.github/workflows/_unit_test.yaml uses: ./.github/workflows/_unit_test.yaml
with: with:
vllm: ${{ matrix.vllm_version }} vllm: ${{ matrix.vllm_version }}

View File

@@ -55,7 +55,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------| |-------------|--------------|------------------|-------------|--------------------|
| main | v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | | main | d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
## Release cadence ## Release cadence

View File

@@ -922,4 +922,7 @@ PROMPT_CONFIGS = {
@pytest.fixture(params=PROMPT_CONFIGS.keys()) @pytest.fixture(params=PROMPT_CONFIGS.keys())
def vl_config(request): def vl_config(request):
return PROMPT_CONFIGS[request.param] config = PROMPT_CONFIGS[request.param]
if "skip" in config:
pytest.skip(config["skip"])
return config

View File

@@ -9,6 +9,7 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig, FusedMoE
from vllm_ascend.ascend_config import init_ascend_config from vllm_ascend.ascend_config import init_ascend_config
from vllm_ascend.eplb.core.eplb_utils import init_eplb_config from vllm_ascend.eplb.core.eplb_utils import init_eplb_config
from vllm_ascend.utils import vllm_version_is
# isort: on # isort: on
@@ -21,7 +22,13 @@ class TestAscendConfig(unittest.TestCase):
"eplb_config": {"dynamic_eplb": True, "num_redundant_experts": 2}, "eplb_config": {"dynamic_eplb": True, "num_redundant_experts": 2},
} }
from vllm.model_executor.layers.fused_moe.config import RoutingMethodType from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
moe_parallel_config = FusedMoEParallelConfig(2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True) if vllm_version_is("0.15.0"):
moe_parallel_config = FusedMoEParallelConfig(
2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True)
else:
moe_parallel_config = FusedMoEParallelConfig(
2, 0, 1, 2, 1, 1, 1, 1, True, "hccl",
is_sequence_parallel=False, enable_eplb=True)
moe_config = FusedMoEConfig( moe_config = FusedMoEConfig(
num_experts=8, num_experts=8,
experts_per_token=8, experts_per_token=8,

View File

@@ -82,8 +82,13 @@ class TestAscendMultiHeadLatentAttention(TestBase):
@patch("vllm_ascend.ops.mla.get_tensor_model_parallel_world_size") @patch("vllm_ascend.ops.mla.get_tensor_model_parallel_world_size")
def test_initialization(self, mock_tp_size, mock_ascend_config, def test_initialization(self, mock_tp_size, mock_ascend_config,
mock_get_vllm_config): mock_get_vllm_config):
# Create a proper mock for MLAAttention that has the required attributes
mock_mla_attn = MagicMock()
mock_mla_attn.process_weights_after_loading = MagicMock()
mock_mla_attn.impl = MagicMock()
mock_mla_attn.impl.process_weights_after_loading = MagicMock()
with patch("vllm_ascend.ops.mla.MLAAttention", return_value=True): with patch("vllm_ascend.ops.mla.MLAAttention", return_value=mock_mla_attn):
mock_tp_size.return_value = 2 mock_tp_size.return_value = 2
mock_ascend_config.return_value.enable_shared_expert_dp = True mock_ascend_config.return_value.enable_shared_expert_dp = True
mock_vllm_config = MagicMock(spec=VllmConfig) mock_vllm_config = MagicMock(spec=VllmConfig)
@@ -126,7 +131,14 @@ class TestAscendMultiHeadLatentAttention(TestBase):
num_hidden_layers=32, first_k_dense_replace=False) num_hidden_layers=32, first_k_dense_replace=False)
mock_get_vllm_config.return_value = mock_vllm_config mock_get_vllm_config.return_value = mock_vllm_config
mock_vllm_config.compilation_config = CompilationConfig() mock_vllm_config.compilation_config = CompilationConfig()
with patch("vllm_ascend.ops.mla.MLAAttention", return_value=True):
# Create a proper mock for MLAAttention that has the required attributes
mock_mla_attn = MagicMock()
mock_mla_attn.process_weights_after_loading = MagicMock()
mock_mla_attn.impl = MagicMock()
mock_mla_attn.impl.process_weights_after_loading = MagicMock()
with patch("vllm_ascend.ops.mla.MLAAttention", return_value=mock_mla_attn):
attn = AscendMultiHeadLatentAttention( attn = AscendMultiHeadLatentAttention(
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=self.num_heads, num_heads=self.num_heads,

View File

@@ -1,6 +1,5 @@
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
from vllm.attention.layer import Attention
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.linear import LinearBase from vllm.model_executor.layers.linear import LinearBase
@@ -8,7 +7,12 @@ from vllm.model_executor.layers.linear import LinearBase
from tests.ut.base import TestBase from tests.ut.base import TestBase
from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod
from vllm_ascend.quantization.modelslim_config import AscendModelSlimConfig from vllm_ascend.quantization.modelslim_config import AscendModelSlimConfig
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is
if vllm_version_is("v0.15.0"):
from vllm.attention.layer import Attention # type: ignore
else:
from vllm.model_executor.layers.attention import Attention
class TestAscendModelSlimConfig(TestBase): class TestAscendModelSlimConfig(TestBase):

View File

@@ -28,12 +28,15 @@ class TestEagleProposerInitialization(TestBase):
self.vllm_config.model_config.dtype = torch.float16 self.vllm_config.model_config.dtype = torch.float16
self.vllm_config.model_config.max_model_len = 2048 self.vllm_config.model_config.max_model_len = 2048
self.vllm_config.model_config.uses_mrope = False self.vllm_config.model_config.uses_mrope = False
self.vllm_config.model_config.uses_xdrope_dim = 0
self.vllm_config.parallel_config.tensor_parallel_size = 1 self.vllm_config.parallel_config.tensor_parallel_size = 1
self.vllm_config.parallel_config.data_parallel_rank = 0
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1 self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
self.vllm_config.speculative_config.num_speculative_tokens = 2 self.vllm_config.speculative_config.num_speculative_tokens = 2
self.vllm_config.speculative_config.speculative_token_tree = str([ self.vllm_config.speculative_config.speculative_token_tree = str([
(i + 1) * (0, ) for i in range(2) (i + 1) * (0, ) for i in range(2)
]) ])
self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
self.vllm_config.additional_config = None self.vllm_config.additional_config = None
self.mock_cpugpubuffer = patch( self.mock_cpugpubuffer = patch(
@@ -141,12 +144,15 @@ class TestEagleProposerLoadModel(TestBase):
self.vllm_config.model_config.dtype = torch.float16 self.vllm_config.model_config.dtype = torch.float16
self.vllm_config.model_config.max_model_len = 2048 self.vllm_config.model_config.max_model_len = 2048
self.vllm_config.model_config.uses_mrope = False self.vllm_config.model_config.uses_mrope = False
self.vllm_config.model_config.uses_xdrope_dim = 0
self.vllm_config.parallel_config.tensor_parallel_size = 1 self.vllm_config.parallel_config.tensor_parallel_size = 1
self.vllm_config.parallel_config.data_parallel_rank = 0
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1 self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
self.vllm_config.speculative_config.num_speculative_tokens = 2 self.vllm_config.speculative_config.num_speculative_tokens = 2
self.vllm_config.speculative_config.speculative_token_tree = str([ self.vllm_config.speculative_config.speculative_token_tree = str([
(i + 1) * (0, ) for i in range(2) (i + 1) * (0, ) for i in range(2)
]) ])
self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
self.vllm_config.additional_config = None self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config) init_ascend_config(self.vllm_config)
@@ -285,12 +291,15 @@ class TestEagleProposerDummyRun(TestBase):
self.vllm_config.model_config.dtype = torch.float16 self.vllm_config.model_config.dtype = torch.float16
self.vllm_config.model_config.max_model_len = 2048 self.vllm_config.model_config.max_model_len = 2048
self.vllm_config.model_config.uses_mrope = False self.vllm_config.model_config.uses_mrope = False
self.vllm_config.model_config.uses_xdrope_dim = 0
self.vllm_config.model_config.use_mla = False self.vllm_config.model_config.use_mla = False
self.vllm_config.parallel_config.tensor_parallel_size = 1 self.vllm_config.parallel_config.tensor_parallel_size = 1
self.vllm_config.parallel_config.data_parallel_rank = 0
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1 self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
self.vllm_config.speculative_config.speculative_token_tree = str([ self.vllm_config.speculative_config.speculative_token_tree = str([
(i + 1) * (0, ) for i in range(4) (i + 1) * (0, ) for i in range(4)
]) ])
self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
self.vllm_config.additional_config = None self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config) init_ascend_config(self.vllm_config)
@@ -404,12 +413,15 @@ class TestEagleProposerHelperMethods(TestBase):
self.vllm_config.model_config.dtype = torch.float16 self.vllm_config.model_config.dtype = torch.float16
self.vllm_config.model_config.max_model_len = 2048 self.vllm_config.model_config.max_model_len = 2048
self.vllm_config.model_config.uses_mrope = False self.vllm_config.model_config.uses_mrope = False
self.vllm_config.model_config.uses_xdrope_dim = 0
self.vllm_config.parallel_config.tensor_parallel_size = 1 self.vllm_config.parallel_config.tensor_parallel_size = 1
self.vllm_config.parallel_config.data_parallel_rank = 0
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1 self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
self.vllm_config.speculative_config.num_speculative_tokens = 2 self.vllm_config.speculative_config.num_speculative_tokens = 2
self.vllm_config.speculative_config.speculative_token_tree = str([ self.vllm_config.speculative_config.speculative_token_tree = str([
(i + 1) * (0, ) for i in range(2) (i + 1) * (0, ) for i in range(2)
]) ])
self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
self.vllm_config.additional_config = None self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config) init_ascend_config(self.vllm_config)

View File

@@ -34,6 +34,7 @@ class TestMtpProposer:
config.speculative_config.draft_model_config = MagicMock() config.speculative_config.draft_model_config = MagicMock()
config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096 config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096
config.speculative_config.draft_model_config.uses_mrope = False config.speculative_config.draft_model_config.uses_mrope = False
config.speculative_config.draft_model_config.uses_xdrope_dim = 0
config.speculative_config.speculative_token_tree = str([ config.speculative_config.speculative_token_tree = str([
(i + 1) * (0, ) for i in range(2) (i + 1) * (0, ) for i in range(2)
]) ])
@@ -42,9 +43,11 @@ class TestMtpProposer:
config.model_config.dtype = torch.float16 config.model_config.dtype = torch.float16
config.model_config.max_model_len = 2048 config.model_config.max_model_len = 2048
config.model_config.uses_mrope = False config.model_config.uses_mrope = False
config.model_config.uses_xdrope_dim = 0
config.model_config.hf_text_config = None config.model_config.hf_text_config = None
config.model_config.hf_config = None config.model_config.hf_config = None
config.parallel_config.tensor_parallel_size = 1 config.parallel_config.tensor_parallel_size = 1
config.parallel_config.data_parallel_rank = 0
config.speculative_config.draft_tensor_parallel_size = 1 config.speculative_config.draft_tensor_parallel_size = 1
config.load_config = None config.load_config = None

View File

@@ -1450,6 +1450,28 @@ class AscendMLAImpl(MLAAttentionImpl):
def get_num_actual_tokens(self, attn_metadata: M): def get_num_actual_tokens(self, attn_metadata: M):
return attn_metadata.num_actual_tokens return attn_metadata.num_actual_tokens
def forward_mha(
    self,
    layer_name: str,
    hidden_states: torch.Tensor,
    kv_cache: tuple[torch.Tensor],
    attn_metadata: M,
    need_gather_q_kv: bool = False,
    output: torch.Tensor | None = None,
) -> torch.Tensor:
    """Placeholder override that always fails.

    None of the arguments are read. The method exists as a concrete
    definition only; the error message directs callers to ``forward()``.

    Raises:
        NotImplementedError: unconditionally.
    """
    # NOTE(review): this parameter list differs from the forward_mha stub
    # added to AscendSFAImpl in the same change (q, kv_c_normed, k_pe, ...).
    # Confirm which one mirrors the base-class signature.
    message = "forward_mha is not supported for MLA attention. Use forward() instead."
    raise NotImplementedError(message)
def forward_mqa(
    self,
    layer_name: str,
    hidden_states: torch.Tensor,
    kv_cache: tuple[torch.Tensor],
    attn_metadata: M,
    need_gather_q_kv: bool = False,
    output: torch.Tensor | None = None,
) -> torch.Tensor:
    """Placeholder override that always fails.

    None of the arguments are read. The method exists as a concrete
    definition only; the error message directs callers to ``forward()``.

    Raises:
        NotImplementedError: unconditionally.
    """
    # NOTE(review): this parameter list differs from the forward_mqa stub
    # added to AscendSFAImpl in the same change (q, kv_c_and_k_pe_cache,
    # attn_metadata, layer). Confirm which one mirrors the base class.
    message = "forward_mqa is not supported for MLA attention. Use forward() instead."
    raise NotImplementedError(message)
def forward( def forward(
self, self,
layer_name, layer_name,

View File

@@ -1062,3 +1062,24 @@ class AscendSFAImpl(MLAAttentionImpl):
torch.distributed.all_to_all_single(attn_output, send, group=get_tp_group().device_group) torch.distributed.all_to_all_single(attn_output, send, group=get_tp_group().device_group)
return attn_output, True return attn_output, True
def forward_mha(
    self,
    q: torch.Tensor,
    kv_c_normed: torch.Tensor,
    k_pe: torch.Tensor,
    kv_c_and_k_pe_cache: torch.Tensor,
    attn_metadata: M,
    k_scale: torch.Tensor,
    output: torch.Tensor,
) -> None:
    """Placeholder override that always fails.

    None of the arguments are read; the error message directs callers to
    ``forward()`` instead.

    Raises:
        NotImplementedError: unconditionally.
    """
    message = "forward_mha is not supported for SFA attention. Use forward() instead."
    raise NotImplementedError(message)
def forward_mqa(
    self,
    q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
    kv_c_and_k_pe_cache: torch.Tensor,
    attn_metadata: M,
    layer,
) -> tuple[torch.Tensor, torch.Tensor | None]:
    """Placeholder override that always fails.

    None of the arguments are read; the error message directs callers to
    ``forward()`` instead.

    Raises:
        NotImplementedError: unconditionally.
    """
    message = "forward_mqa is not supported for SFA attention. Use forward() instead."
    raise NotImplementedError(message)

View File

@@ -18,7 +18,6 @@
import torch import torch
import torchair import torchair
from vllm.attention.layer import Attention
from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.config.compilation import Range from vllm.config.compilation import Range
from vllm.logger import logger from vllm.logger import logger
@@ -27,6 +26,12 @@ from vllm_ascend.compilation.npugraph_ex_passes.utils.npugraph_ex_utils_check im
check_and_register_fusion_pass, check_and_register_fusion_pass,
extra_stream_scope_check, extra_stream_scope_check,
) )
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("v0.15.0"):
from vllm.attention.layer import Attention # type: ignore
else:
from vllm.model_executor.layers.attention import Attention
class GraphEXQKNormRopeFusionPattern: class GraphEXQKNormRopeFusionPattern:

View File

@@ -18,12 +18,18 @@
import torch import torch
import torch._inductor.pattern_matcher as pm import torch._inductor.pattern_matcher as pm
from torch._inductor.pattern_matcher import PatternMatcherPass, PatternPrettyPrinter from torch._inductor.pattern_matcher import PatternMatcherPass, PatternPrettyPrinter
from vllm.attention.layer import Attention
from vllm.compilation.vllm_inductor_pass import VllmInductorPass from vllm.compilation.vllm_inductor_pass import VllmInductorPass
from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.config.compilation import Range from vllm.config.compilation import Range
from vllm.logger import logger from vllm.logger import logger
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("v0.15.0"):
from vllm.attention.layer import Attention # type: ignore
else:
from vllm.model_executor.layers.attention import Attention
class QKNormRopeFusionPattern: class QKNormRopeFusionPattern:
def __init__(self, vllm_config, head_dim, num_heads, num_kv_heads, eps=1e-6): def __init__(self, vllm_config, head_dim, num_heads, num_kv_heads, eps=1e-6):

View File

@@ -10,7 +10,6 @@ from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Optional from typing import TYPE_CHECKING, Any, Optional
import torch import torch
from vllm.attention.layer import Attention, MLAAttention
from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer
from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole
@@ -27,6 +26,7 @@ from vllm_ascend.distributed.kv_transfer.kv_pool.cpu_offload.metadata import (
MetadataServerProc, MetadataServerProc,
MLAConfig, MLAConfig,
) )
from vllm_ascend.utils import vllm_version_is
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.forward_context import ForwardContext from vllm.forward_context import ForwardContext
@@ -35,6 +35,11 @@ if TYPE_CHECKING:
from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.request import Request from vllm.v1.request import Request
if vllm_version_is("v0.15.0"):
from vllm.attention.layer import Attention, MLAAttention # type: ignore
else:
from vllm.model_executor.layers.attention import Attention, MLAAttention
@dataclass @dataclass
class ReqMeta: class ReqMeta:

View File

@@ -6,6 +6,8 @@ from vllm.v1.attention.backend import AttentionBackend # type: ignore
from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.worker.worker import OffloadingHandler, TransferResult, TransferSpec from vllm.v1.kv_offload.worker.worker import OffloadingHandler, TransferResult, TransferSpec
from vllm_ascend.utils import vllm_version_is
logger = init_logger(__name__) logger = init_logger(__name__)
@@ -153,12 +155,30 @@ class CpuNpuOffloadingHandler(OffloadingHandler):
def get_finished(self) -> list[TransferResult]: def get_finished(self) -> list[TransferResult]:
results: list[TransferResult] = [] results: list[TransferResult] = []
for job_id, event in self.transfer_events.items(): if vllm_version_is("v0.15.0"):
if event.query(): for job_id, event in self.transfer_events.items():
results.append((job_id, True)) if event.query():
self.events_pool.append(event) results.append((job_id, True))
for job_id, _ in results: self.events_pool.append(event)
del self.transfer_events[job_id] for job_id, _ in results:
del self.transfer_events[job_id]
else:
finished_job_ids = []
for job_id, event in self.transfer_events.items():
if event.query():
results.append(
TransferResult(
job_id=job_id,
success=True,
transfer_size=None,
transfer_time=None,
transfer_type=None,
)
)
finished_job_ids.append(job_id)
self.events_pool.append(event)
for job_id in finished_job_ids:
del self.transfer_events[job_id]
return results return results
def wait(self, job_ids: set[int]) -> None: def wait(self, job_ids: set[int]) -> None:

View File

@@ -46,7 +46,8 @@ from vllm_ascend.ops.fused_moe.prepare_finalize import QuantType
from vllm_ascend.utils import (AscendDeviceType, enable_sp, from vllm_ascend.utils import (AscendDeviceType, enable_sp,
get_ascend_device_type, maybe_trans_nz, get_ascend_device_type, maybe_trans_nz,
npu_stream_switch, shared_expert_dp_enabled, npu_stream_switch, shared_expert_dp_enabled,
shared_experts_calculation_stream) shared_experts_calculation_stream,
vllm_version_is)
@dataclass @dataclass
class FusedMoEResult: class FusedMoEResult:
@@ -407,10 +408,13 @@ class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE):
shared_experts: torch.nn.Module, shared_experts: torch.nn.Module,
gate: Optional[torch.nn.Module] = None, gate: Optional[torch.nn.Module] = None,
use_overlapped: bool = True, use_overlapped: bool = True,
routed_input_transform: Optional[torch.nn.Module] = None,
**kwargs, **kwargs,
): ):
AscendFusedMoE.__init__(self, **kwargs) AscendFusedMoE.__init__(self, **kwargs)
if not vllm_version_is("0.15.0"):
self._routed_input_transform = routed_input_transform
self._shared_experts = shared_experts self._shared_experts = shared_experts
self.use_overlapped = use_overlapped self.use_overlapped = use_overlapped
self.shared_expert_stream = None self.shared_expert_stream = None

View File

@@ -23,7 +23,6 @@ from typing import Optional
import torch import torch
from torch import nn from torch import nn
from vllm.attention.layer import MLAAttention
from vllm.config import CacheConfig, get_current_vllm_config from vllm.config import CacheConfig, get_current_vllm_config
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.forward_context import ForwardContext, get_forward_context from vllm.forward_context import ForwardContext, get_forward_context
@@ -34,6 +33,12 @@ from vllm.utils.torch_utils import direct_register_custom_op
from vllm.v1.attention.backend import AttentionMetadata # type: ignore from vllm.v1.attention.backend import AttentionMetadata # type: ignore
from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("v0.15.0"):
from vllm.attention.layer import MLAAttention # type: ignore
else:
from vllm.model_executor.layers.attention import MLAAttention
class IndexerWrapper(nn.Module): class IndexerWrapper(nn.Module):
@@ -125,6 +130,16 @@ class AscendMultiHeadLatentAttention(MultiHeadLatentAttentionWrapper):
o_proj=mla_modules.o_proj, o_proj=mla_modules.o_proj,
) )
original_process_weights = self.mla_attn.process_weights_after_loading
def wrapped_process_weights(act_dtype: torch.dtype):
from vllm_ascend.attention.sfa_v1 import AscendSFAImpl
if not isinstance(self.mla_attn.impl, AscendSFAImpl):
original_process_weights(act_dtype)
self.mla_attn.impl.process_weights_after_loading(act_dtype)
self.mla_attn.process_weights_after_loading = wrapped_process_weights
compilation_config = get_current_vllm_config().compilation_config compilation_config = get_current_vllm_config().compilation_config
if prefix in compilation_config.static_forward_context: if prefix in compilation_config.static_forward_context:
raise ValueError(f"Duplicate layer name: {prefix}") raise ValueError(f"Duplicate layer name: {prefix}")

View File

@@ -33,3 +33,4 @@ import vllm_ascend.patch.worker.patch_qwen3_next_mtp # noqa
import vllm_ascend.patch.worker.patch_rejection_sampler # noqa import vllm_ascend.patch.worker.patch_rejection_sampler # noqa
import vllm_ascend.patch.worker.patch_qwen3_next # noqa import vllm_ascend.patch.worker.patch_qwen3_next # noqa
import vllm_ascend.patch.worker.patch_v2_egale # noqa import vllm_ascend.patch.worker.patch_v2_egale # noqa
import vllm_ascend.patch.worker.patch_huanyuan_vl # noqa

View File

@@ -0,0 +1,27 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#from collections.abc import Iterable
from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor
_original_call = HunYuanVLProcessor.__call__
def _patched_call(self, images=None, text=None, videos=None, **kwargs):
    """Delegate to the saved ``__call__`` without ``add_special_tokens``.

    Any ``add_special_tokens`` keyword supplied by the caller is discarded;
    every other keyword is forwarded untouched together with ``images``,
    ``text`` and ``videos``. Returns whatever the saved ``__call__`` returns.
    """
    # Build the forwarded keyword set, leaving out the one dropped key.
    forwarded = {
        key: value
        for key, value in kwargs.items()
        if key != "add_special_tokens"
    }
    return _original_call(self, images=images, text=text, videos=videos, **forwarded)
HunYuanVLProcessor.__call__ = _patched_call

View File

@@ -1,8 +1,12 @@
import torch import torch
import vllm.v1.worker.utils as utils import vllm.v1.worker.utils as utils
from vllm.attention.layer import Attention
from vllm.v1.worker.utils import defaultdict, extract_layer_index from vllm.v1.worker.utils import defaultdict, extract_layer_index
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("v0.15.0"):
from vllm.attention.layer import Attention # type: ignore
else:
from vllm.model_executor.layers.attention import Attention
# Without this patch, it will raise an exception when initialize kv_cache. # Without this patch, it will raise an exception when initialize kv_cache.
# TODO To remove the patch, we need check why the original bind_kv_cache raises an NotImplementedError. # TODO To remove the patch, we need check why the original bind_kv_cache raises an NotImplementedError.

View File

@@ -401,7 +401,13 @@ class AscendModelSlimConfig(QuantizationConfig):
self.packed_modules_mapping = packed_modules_model_mapping[ self.packed_modules_mapping = packed_modules_model_mapping[
model_type] model_type]
prefix = self.quant_prefix_mapper(model_type, prefix) prefix = self.quant_prefix_mapper(model_type, prefix)
from vllm.attention.layer import Attention
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("v0.15.0"):
from vllm.attention.layer import Attention # type: ignore
else:
from vllm.model_executor.layers.attention import Attention
if prefix.startswith("language_model"): if prefix.startswith("language_model"):
prefix = prefix.split('.', 1)[-1] prefix = prefix.split('.', 1)[-1]
if isinstance(layer, LinearBase): if isinstance(layer, LinearBase):

View File

@@ -41,7 +41,7 @@ from vllm_ascend.ops.rotary_embedding import update_cos_sin
from vllm_ascend.ops.triton.spec_decode.utils import \ from vllm_ascend.ops.triton.spec_decode.utils import \
prepare_inputs_padded_kernel prepare_inputs_padded_kernel
from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
from vllm_ascend.utils import enable_sp, shared_expert_dp_enabled, lmhead_tp_enable from vllm_ascend.utils import enable_sp, shared_expert_dp_enabled, lmhead_tp_enable, vllm_version_is
# Currently we will fix block size to a small one since `num_reqs` can't be too large # Currently we will fix block size to a small one since `num_reqs` can't be too large
_PREPARE_INPUTS_BLOCK_SIZE = 4 _PREPARE_INPUTS_BLOCK_SIZE = 4
@@ -400,6 +400,12 @@ class EagleProposer(VllmEagleProposer):
is_draft_model=True, is_draft_model=True,
draft_attn_metadatas=multi_steps_attn_metadata): draft_attn_metadatas=multi_steps_attn_metadata):
if not vllm_version_is("v0.15.0"):
# Reset MOE layer index before first model call
forward_context = get_forward_context()
if forward_context is not None:
forward_context.moe_layer_index = 0
self._runnable( self._runnable(
num_input_tokens=num_tokens, num_input_tokens=num_tokens,
batch_size=batch_size, batch_size=batch_size,
@@ -559,6 +565,12 @@ class EagleProposer(VllmEagleProposer):
is_draft_model=True, is_draft_model=True,
draft_attn_metadatas=multi_steps_attn_metadata): draft_attn_metadatas=multi_steps_attn_metadata):
if not vllm_version_is("v0.15.0"):
# Reset MOE layer index for forward pass
forward_context = get_forward_context()
if forward_context is not None:
forward_context.moe_layer_index = 0
draft_token_ids = self._runnable( draft_token_ids = self._runnable(
num_input_tokens=num_input_tokens, num_input_tokens=num_input_tokens,
batch_size=batch_size, batch_size=batch_size,
@@ -660,6 +672,12 @@ class EagleProposer(VllmEagleProposer):
forward_context.num_accept_tokens = batch_size forward_context.num_accept_tokens = batch_size
for draft_step in range(self.num_speculative_tokens - 1): for draft_step in range(self.num_speculative_tokens - 1):
if not vllm_version_is("v0.15.0"):
# Reset MOE layer index for each draft step iteration
forward_context = get_forward_context()
if forward_context is not None:
forward_context.moe_layer_index = 0
# Update the inputs. # Update the inputs.
# cast to int32 is crucial when eagle model is compiled. # cast to int32 is crucial when eagle model is compiled.
# tensor.argmax() returns int64 by default. # tensor.argmax() returns int64 by default.

View File

@@ -18,7 +18,7 @@ from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
from vllm_ascend.compilation.acl_graph import ACLGraphWrapper from vllm_ascend.compilation.acl_graph import ACLGraphWrapper
from vllm_ascend.ops.rotary_embedding import get_cos_and_sin_mla from vllm_ascend.ops.rotary_embedding import get_cos_and_sin_mla
from vllm_ascend.spec_decode.eagle_proposer import EagleProposer from vllm_ascend.spec_decode.eagle_proposer import EagleProposer
from vllm_ascend.utils import lmhead_tp_enable from vllm_ascend.utils import lmhead_tp_enable, vllm_version_is
class MtpProposer(EagleProposer): class MtpProposer(EagleProposer):
@@ -122,6 +122,11 @@ class MtpProposer(EagleProposer):
batch_descriptor=batch_descriptor, batch_descriptor=batch_descriptor,
is_draft_model=True, is_draft_model=True,
in_profile_run=is_profile): in_profile_run=is_profile):
if not vllm_version_is("v0.15.0"):
# Reset MOE layer index for each MTP step iteration
forward_context = get_forward_context()
if forward_context is not None:
forward_context.moe_layer_index = 0
previous_hidden_states, positions = self.maybe_pad_and_reduce( previous_hidden_states, positions = self.maybe_pad_and_reduce(
previous_hidden_states, positions) previous_hidden_states, positions)
self.model(input_ids=input_ids, self.model(input_ids=input_ids,
@@ -330,6 +335,13 @@ class MtpProposer(EagleProposer):
batch_descriptor=batch_descriptor, batch_descriptor=batch_descriptor,
num_actual_tokens=num_tokens, num_actual_tokens=num_tokens,
is_draft_model=True): is_draft_model=True):
if not vllm_version_is("v0.15.0"):
# Reset MOE layer index for each MTP step to match all_moe_layers registration
forward_context = get_forward_context()
if forward_context is not None:
forward_context.moe_layer_index = 0
with record_function_or_nullcontext('mtp_forward'): with record_function_or_nullcontext('mtp_forward'):
model_kwargs = {} model_kwargs = {}
model_kwargs["attn_metadata"] = attn_metadata model_kwargs["attn_metadata"] = attn_metadata

View File

@@ -30,7 +30,6 @@ import numpy as np
import torch import torch
import torch.distributed as dist import torch.distributed as dist
import torch.nn as nn import torch.nn as nn
from vllm.attention.layer import Attention, MLAAttention
from vllm.compilation.cuda_graph import CUDAGraphStat from vllm.compilation.cuda_graph import CUDAGraphStat
from vllm.config import CompilationMode, CUDAGraphMode, VllmConfig, get_layers_from_vllm_config from vllm.config import CompilationMode, CUDAGraphMode, VllmConfig, get_layers_from_vllm_config
from vllm.distributed import get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather from vllm.distributed import get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather
@@ -137,6 +136,12 @@ if TYPE_CHECKING:
else: else:
xgr = LazyLoader("xgr", globals(), "xgrammar") xgr = LazyLoader("xgr", globals(), "xgrammar")
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("v0.15.0"):
from vllm.attention.layer import Attention, MLAAttention # type: ignore
else:
from vllm.model_executor.layers.attention import Attention, MLAAttention
# if true, allow tensor initialization and casting with internal format (e.g., NZ) # if true, allow tensor initialization and casting with internal format (e.g., NZ)
torch.npu.config.allow_internal_format = True torch.npu.config.allow_internal_format = True
@@ -2026,6 +2031,7 @@ class NPUModelRunner(GPUModelRunner):
remove_lora: bool = True, remove_lora: bool = True,
activate_lora: bool = False, activate_lora: bool = False,
is_graph_capturing: bool = False, is_graph_capturing: bool = False,
num_active_loras: int = 0,
) -> tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
# only support eager mode and piecewise graph now # only support eager mode and piecewise graph now
assert cudagraph_runtime_mode is None or cudagraph_runtime_mode.valid_runtime_modes() assert cudagraph_runtime_mode is None or cudagraph_runtime_mode.valid_runtime_modes()