upgrade vLLM to 0.12.0 tag (#4647)

Upgrade vLLM to v0.12.0 tag

- vLLM version: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
- vLLM main: 86e178f7c4

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Committed by wangxiyuan via GitHub on 2025-12-03 23:43:05 +08:00
Parent: 26e8e58cea
Commit: 3f4c0ea0a0
22 changed files with 97 additions and 47 deletions

View File

@@ -32,7 +32,7 @@ on:
         description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
       vllm_version:
         required: false
-        default: "86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24"
+        default: "v0.12.0"
         type: string
         description: vllm version to use
       vllm_ascend_remote_url:

View File

@@ -36,7 +36,7 @@ jobs:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+          VLLM_COMMIT=v0.12.0
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
       - name: Checkout repository

View File

@@ -51,7 +51,7 @@ jobs:
     strategy:
       matrix:
         include:
-          - vllm_branch: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+          - vllm_branch: v0.12.0
             vllm_ascend_branch: main
       max-parallel: 1
     container:

View File

@@ -86,7 +86,7 @@ jobs:
           tests: tests/e2e/nightly/ops
     uses: ./.github/workflows/_e2e_nightly_single_node.yaml
     with:
-      vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+      vllm: v0.12.0
       runner: ${{ matrix.test_config.os }}
       tests: ${{ matrix.test_config.tests }}
       image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
@@ -134,7 +134,7 @@ jobs:
           - Qwen3-Next-80B-A3B-Instruct
     uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
     with:
-      vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+      vllm: v0.12.0
       runner: ${{ matrix.test_config.os }}
       model_list: ${{ toJson(matrix.test_config.model_list) }}
       image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'

View File

@@ -139,7 +139,7 @@ jobs:
           tests: tests/e2e/nightly/models/test_glm4_5.py
     uses: ./.github/workflows/_e2e_nightly_single_node.yaml
     with:
-      vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+      vllm: v0.12.0
       runner: ${{ matrix.test_config.os }}
       image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
       tests: ${{ matrix.test_config.tests }}

View File

@@ -50,7 +50,7 @@ jobs:
         with:
           filters: |
             e2e_tracker:
-              - '.github/workflows/vllm_ascend_test.yaml'
+              - '.github/workflows/vllm_ascend_test_pr_full.yaml'
              - '.github/workflows/_e2e_test.yaml'
              - 'vllm_ascend/**'
              - 'csrc/**'
@@ -69,7 +69,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24]
+        vllm_version: [v0.12.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml

View File

@@ -42,7 +42,7 @@ jobs:
   lint:
     uses: ./.github/workflows/pre-commit.yml
     with:
-      vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+      vllm: v0.12.0
   changes:
     runs-on: ubuntu-latest
     outputs:
@@ -84,7 +84,7 @@ jobs:
       SOC_VERSION: ascend910b1
     strategy:
       matrix:
-        vllm_version: [86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24]
+        vllm_version: [v0.12.0]
     steps:
       - name: Free up disk space
@@ -137,7 +137,8 @@ jobs:
             --ignore tests/ut/torchair/models/test_torchair_deepseek_v2.py \
             --ignore tests/ut/models/test_qwen2_vl.py \
             --ignore tests/ut/models/test_qwen2_5_vl.py \
-            --ignore tests/ut/models/test_qwen2_5_vl_without_padding.py
+            --ignore tests/ut/models/test_qwen2_5_vl_without_padding.py \
+            --ignore tests/ut/model_loder
       - name: Upload coverage to Codecov
         # only upload coverage when commits merged
@@ -154,7 +155,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24]
+        vllm_version: [v0.12.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.

View File

@@ -72,7 +72,7 @@ jobs:
           - DeepSeek-V2-Lite
     uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
     with:
-      vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
+      vllm: v0.12.0
       runner: ${{ matrix.runner }}
       image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
       model_list: ${{ toJson(matrix.model_list) }}

View File

@@ -48,10 +48,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
-# Revert this change once VLLM_TAG is specified to branch or tag
-# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
+ARG VLLM_TAG=v0.12.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \

View File

@@ -39,10 +39,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
-# Revert this change once VLLM_TAG is specified to branch or tag
-# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
+ARG VLLM_TAG=v0.12.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \

View File

@@ -36,10 +36,8 @@ COPY . /vllm-workspace/vllm-ascend/
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
-# Revert this change once VLLM_TAG is specified to branch or tag
-# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
+ARG VLLM_TAG=v0.12.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \

View File

@@ -47,10 +47,8 @@ RUN apt-get update -y && \
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
-# Revert this change once VLLM_TAG is specified to branch or tag
-# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
+ARG VLLM_TAG=v0.12.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \

View File

@@ -50,10 +50,8 @@ RUN yum update -y && \
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
-# Revert this change once VLLM_TAG is specified to branch or tag
-# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
+ARG VLLM_TAG=v0.12.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \

View File

@@ -50,10 +50,8 @@ RUN yum update -y && \
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
-# Revert this change once VLLM_TAG is specified to branch or tag
-# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
-RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
+ARG VLLM_TAG=v0.12.0
+RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \

View File

@@ -44,7 +44,7 @@ The table below is the release compatibility matrix for vLLM Ascend release.
 For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly.
 
 | vLLM Ascend | vLLM        | Python          | Stable CANN | PyTorch/torch_npu |
 |-------------|-------------|-----------------|-------------|-------------------|
-| main        | v0.11.2     | >= 3.10, < 3.12 | 8.3.RC1     | 2.7.1 / 2.7.1     |
+| main        | v0.12.0 tag | >= 3.10, < 3.12 | 8.3.RC2     | 2.8.0 / 2.8.0     |
 
 ## Release cadence

View File

@@ -77,7 +77,7 @@ myst_substitutions = {
     # CANN image tag
     'cann_image_tag': "8.3.rc2-910b-ubuntu22.04-py3.11",
     # vllm version in ci
-    'ci_vllm_version': '86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24',
+    'ci_vllm_version': 'v0.12.0',
 }
 # For cross-file header anchors

View File

@@ -24,7 +24,7 @@ class TestNPUPlatform(TestBase):
         mock_vllm_config.cache_config = MagicMock()
         mock_vllm_config.scheduler_config = MagicMock()
         mock_vllm_config.speculative_config = None
-        mock_vllm_config.compilation_config.pass_config.enable_sequence_parallelism = False
+        mock_vllm_config.compilation_config.pass_config.enable_sp = False
         mock_vllm_config.compilation_config.cudagraph_mode = None
         return mock_vllm_config

View File

@@ -23,6 +23,7 @@ if HAS_TRITON:
 # isort: off
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
 import vllm_ascend.patch.worker.patch_distributed  # noqa
+import vllm_ascend.patch.worker.patch_deepseek  # noqa
 import vllm_ascend.patch.worker.patch_roberta  # noqa
 import vllm_ascend.patch.worker.patch_weight_loader  # noqa
 import vllm_ascend.patch.worker.patch_multimodal_merge  # noqa

View File

@@ -0,0 +1,60 @@
from itertools import islice

import torch
from vllm.distributed import get_pp_group
from vllm.model_executor.models.deepseek_v2 import (DeepseekV2Model,
                                                    _get_llama_4_scaling)
from vllm.sequence import IntermediateTensors


def forward(
    self,
    input_ids,
    positions,
    intermediate_tensors,
    inputs_embeds,
):
    if get_pp_group().is_first_rank:
        if inputs_embeds is not None:
            hidden_states = inputs_embeds
        else:
            hidden_states = self.embed_input_ids(input_ids)
        residual = None
    else:
        assert intermediate_tensors is not None
        hidden_states = intermediate_tensors["hidden_states"]
        residual = intermediate_tensors["residual"]

    # Compute llama 4 scaling once per forward pass if enabled
    # Note(wxy): This is a hack fix to avoid graph mode error for torch 2.8.
    # We'll find a better way to remove this patch.
    try:
        llama_4_scaling_config = getattr(self.config, "llama_4_scaling")
    except AttributeError:
        llama_4_scaling_config = None
    llama_4_scaling: torch.Tensor | None
    if llama_4_scaling_config is not None:
        llama_4_scaling = _get_llama_4_scaling(
            original_max_position_embeddings=llama_4_scaling_config[
                "original_max_position_embeddings"],
            scaling_beta=llama_4_scaling_config["beta"],
            positions=positions,
        )
    else:
        llama_4_scaling = None

    for layer in islice(self.layers, self.start_layer, self.end_layer):
        hidden_states, residual = layer(positions, hidden_states, residual,
                                        llama_4_scaling)

    if not get_pp_group().is_last_rank:
        return IntermediateTensors({
            "hidden_states": hidden_states,
            "residual": residual
        })

    hidden_states, _ = self.norm(hidden_states, residual)
    return hidden_states


DeepseekV2Model.forward = forward
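
The patch above takes effect purely through import side effects: the module-level assignment at the end rebinds DeepseekV2Model.forward, which is why the only other change needed is the extra import in the patch package (see the __init__ hunk above). A minimal sketch of that activation pattern follows; the assertion is only an illustrative check and is not part of this commit.

# Sketch: importing the patch module is enough to swap in the patched forward.
from vllm.model_executor.models.deepseek_v2 import DeepseekV2Model

import vllm_ascend.patch.worker.patch_deepseek as patch_deepseek  # noqa: F401

# The import executed `DeepseekV2Model.forward = forward`, so the class method
# is now the patched function defined in the new module.
assert DeepseekV2Model.forward is patch_deepseek.forward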

View File

@@ -159,7 +159,8 @@ class NPUPlatform(Platform):
             compilation_config.splitting_ops = []
             compilation_config.cudagraph_num_of_warmups = 1
-            compilation_config.pass_config.enable_fusion = False
+            compilation_config.pass_config.fuse_norm_quant = False
+            compilation_config.pass_config.fuse_act_quant = False
 
         if compilation_config.mode not in [
                 CompilationMode.NONE, CompilationMode.VLLM_COMPILE
@@ -194,7 +195,7 @@ class NPUPlatform(Platform):
         # to ascend ops && hardwares. We update these sizes here to improve
         # default performance.
         update_default_aclgraph_sizes(vllm_config)
-        # TODO delete graph size update here when compilation_config.pass_config.enable_sequence_parallelism
+        # TODO delete graph size update here when compilation_config.pass_config.enable_sp
         # is supported by vllm-ascend.
         if vllm_config.parallel_config.tensor_parallel_size > 1 and not vllm_config.model_config.enforce_eager and \
             enable_sp(vllm_config):

View File

@@ -315,8 +315,8 @@ class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):
             eps=config.rms_norm_eps)
         self.enable_sequence_parallelism = (
-            vllm_config.compilation_config.pass_config.
-            enable_sequence_parallelism if vllm_config is not None else False)
+            vllm_config.compilation_config.pass_config.enable_sp
+            if vllm_config is not None else False)
 
     def forward(
         self,
@@ -488,7 +488,7 @@ class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
-        self.enable_sequence_parallelism = vllm_config.compilation_config.pass_config.enable_sequence_parallelism
+        self.enable_sequence_parallelism = vllm_config.compilation_config.pass_config.enable_sp
 
         # Set MoE hyperparameters
         self.expert_weights: list[torch.Tensor] = []

View File

@@ -773,8 +773,7 @@ def enable_sp(vllm_config=None, enable_shared_expert_dp: bool = False) -> bool:
         from vllm.config import get_current_vllm_config
         vllm_config = get_current_vllm_config()
     _ENABLE_SP = (
-        vllm_config.compilation_config.pass_config.
-        enable_sequence_parallelism
+        vllm_config.compilation_config.pass_config.enable_sp
         or envs_ascend.VLLM_ASCEND_ENABLE_FLASHCOMM1
         # Flash comm 1 should be enabled by env VLLM_ASCEND_ENABLE_FLASHCOMM1
         # We retain the env VLLM_ASCEND_ENABLE_FLASHCOMM here for backward compatibility.
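
For reference, the rename means callers now read pass_config.enable_sp instead of pass_config.enable_sequence_parallelism, with the VLLM_ASCEND_ENABLE_FLASHCOMM1 environment variable kept as a backward-compatible switch. A minimal sketch that mirrors the check above (reading the env var directly via os.environ rather than through envs_ascend):

import os

from vllm.config import get_current_vllm_config


def sp_requested() -> bool:
    # Mirrors the logic in enable_sp(): either the renamed pass-config flag or
    # the legacy environment variable turns sequence parallelism on.
    vllm_config = get_current_vllm_config()
    return bool(vllm_config.compilation_config.pass_config.enable_sp
                or os.environ.get("VLLM_ASCEND_ENABLE_FLASHCOMM1"))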