upgrade vLLM to 0.12.0 tag (#4647)
Upgrade vLLM to v0.12.0 tag
- vLLM version: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
- vLLM main:
86e178f7c4
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -32,7 +32,7 @@ on:
|
|||||||
description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
|
description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
|
||||||
vllm_version:
|
vllm_version:
|
||||||
required: false
|
required: false
|
||||||
default: "86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24"
|
default: "v0.12.0"
|
||||||
type: string
|
type: string
|
||||||
description: vllm version to use
|
description: vllm version to use
|
||||||
vllm_ascend_remote_url:
|
vllm_ascend_remote_url:
|
||||||
|
|||||||
2
.github/workflows/format_pr_body.yaml
vendored
2
.github/workflows/format_pr_body.yaml
vendored
@@ -36,7 +36,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Get vLLM version
|
- name: Get vLLM version
|
||||||
run: |
|
run: |
|
||||||
VLLM_COMMIT=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
|
VLLM_COMMIT=v0.12.0
|
||||||
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
|
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
|
||||||
|
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
|
|||||||
2
.github/workflows/nightly_benchmarks.yaml
vendored
2
.github/workflows/nightly_benchmarks.yaml
vendored
@@ -51,7 +51,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
- vllm_branch: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
|
- vllm_branch: v0.12.0
|
||||||
vllm_ascend_branch: main
|
vllm_ascend_branch: main
|
||||||
max-parallel: 1
|
max-parallel: 1
|
||||||
container:
|
container:
|
||||||
|
|||||||
@@ -86,7 +86,7 @@ jobs:
|
|||||||
tests: tests/e2e/nightly/ops
|
tests: tests/e2e/nightly/ops
|
||||||
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
|
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
|
||||||
with:
|
with:
|
||||||
vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
|
vllm: v0.12.0
|
||||||
runner: ${{ matrix.test_config.os }}
|
runner: ${{ matrix.test_config.os }}
|
||||||
tests: ${{ matrix.test_config.tests }}
|
tests: ${{ matrix.test_config.tests }}
|
||||||
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
|
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2'
|
||||||
@@ -134,7 +134,7 @@ jobs:
|
|||||||
- Qwen3-Next-80B-A3B-Instruct
|
- Qwen3-Next-80B-A3B-Instruct
|
||||||
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
|
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
|
||||||
with:
|
with:
|
||||||
vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
|
vllm: v0.12.0
|
||||||
runner: ${{ matrix.test_config.os }}
|
runner: ${{ matrix.test_config.os }}
|
||||||
model_list: ${{ toJson(matrix.test_config.model_list) }}
|
model_list: ${{ toJson(matrix.test_config.model_list) }}
|
||||||
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
|
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
|
||||||
|
|||||||
@@ -139,7 +139,7 @@ jobs:
|
|||||||
tests: tests/e2e/nightly/models/test_glm4_5.py
|
tests: tests/e2e/nightly/models/test_glm4_5.py
|
||||||
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
|
uses: ./.github/workflows/_e2e_nightly_single_node.yaml
|
||||||
with:
|
with:
|
||||||
vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
|
vllm: v0.12.0
|
||||||
runner: ${{ matrix.test_config.os }}
|
runner: ${{ matrix.test_config.os }}
|
||||||
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
|
image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3'
|
||||||
tests: ${{ matrix.test_config.tests }}
|
tests: ${{ matrix.test_config.tests }}
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
filters: |
|
filters: |
|
||||||
e2e_tracker:
|
e2e_tracker:
|
||||||
- '.github/workflows/vllm_ascend_test.yaml'
|
- '.github/workflows/vllm_ascend_test_pr_full.yaml'
|
||||||
- '.github/workflows/_e2e_test.yaml'
|
- '.github/workflows/_e2e_test.yaml'
|
||||||
- 'vllm_ascend/**'
|
- 'vllm_ascend/**'
|
||||||
- 'csrc/**'
|
- 'csrc/**'
|
||||||
@@ -69,7 +69,7 @@ jobs:
|
|||||||
name: e2e-full
|
name: e2e-full
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24]
|
vllm_version: [v0.12.0]
|
||||||
needs: [changes]
|
needs: [changes]
|
||||||
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
|
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
|
||||||
uses: ./.github/workflows/_e2e_test.yaml
|
uses: ./.github/workflows/_e2e_test.yaml
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ jobs:
|
|||||||
lint:
|
lint:
|
||||||
uses: ./.github/workflows/pre-commit.yml
|
uses: ./.github/workflows/pre-commit.yml
|
||||||
with:
|
with:
|
||||||
vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
|
vllm: v0.12.0
|
||||||
changes:
|
changes:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
outputs:
|
outputs:
|
||||||
@@ -84,7 +84,7 @@ jobs:
|
|||||||
SOC_VERSION: ascend910b1
|
SOC_VERSION: ascend910b1
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24]
|
vllm_version: [v0.12.0]
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Free up disk space
|
- name: Free up disk space
|
||||||
@@ -137,7 +137,8 @@ jobs:
|
|||||||
--ignore tests/ut/torchair/models/test_torchair_deepseek_v2.py \
|
--ignore tests/ut/torchair/models/test_torchair_deepseek_v2.py \
|
||||||
--ignore tests/ut/models/test_qwen2_vl.py \
|
--ignore tests/ut/models/test_qwen2_vl.py \
|
||||||
--ignore tests/ut/models/test_qwen2_5_vl.py \
|
--ignore tests/ut/models/test_qwen2_5_vl.py \
|
||||||
--ignore tests/ut/models/test_qwen2_5_vl_without_padding.py
|
--ignore tests/ut/models/test_qwen2_5_vl_without_padding.py \
|
||||||
|
--ignore tests/ut/model_loder
|
||||||
|
|
||||||
- name: Upload coverage to Codecov
|
- name: Upload coverage to Codecov
|
||||||
# Only upload coverage when commits are merged
|
# Only upload coverage when commits are merged
|
||||||
@@ -154,7 +155,7 @@ jobs:
|
|||||||
name: e2e-light
|
name: e2e-light
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24]
|
vllm_version: [v0.12.0]
|
||||||
# Note (yikun): If CI resources are limited, we can split this job into two chained jobs
|
# Note (yikun): If CI resources are limited, we can split this job into two chained jobs
|
||||||
needs: [lint, changes]
|
needs: [lint, changes]
|
||||||
# Only trigger the e2e test after lint has passed and the pull request's change is e2e-related.
|
# Only trigger the e2e test after lint has passed and the pull request's change is e2e-related.
|
||||||
|
|||||||
@@ -72,7 +72,7 @@ jobs:
|
|||||||
- DeepSeek-V2-Lite
|
- DeepSeek-V2-Lite
|
||||||
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
|
uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
|
||||||
with:
|
with:
|
||||||
vllm: 86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
|
vllm: v0.12.0
|
||||||
runner: ${{ matrix.runner }}
|
runner: ${{ matrix.runner }}
|
||||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
|
||||||
model_list: ${{ toJson(matrix.model_list) }}
|
model_list: ${{ toJson(matrix.model_list) }}
|
||||||
|
|||||||
@@ -48,10 +48,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
|
|||||||
|
|
||||||
# Install vLLM
|
# Install vLLM
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
|
ARG VLLM_TAG=v0.12.0
|
||||||
# Revert this change once VLLM_TAG is specified to branch or tag
|
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||||
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
|
||||||
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
|
|
||||||
# On x86, triton is installed by vLLM, but on Ascend triton doesn't work correctly, so we need to uninstall it.
|
# On x86, triton is installed by vLLM, but on Ascend triton doesn't work correctly, so we need to uninstall it.
|
||||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
python3 -m pip uninstall -y triton && \
|
python3 -m pip uninstall -y triton && \
|
||||||
|
|||||||
@@ -39,10 +39,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
|
|||||||
|
|
||||||
# Install vLLM
|
# Install vLLM
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
|
ARG VLLM_TAG=v0.12.0
|
||||||
# Revert this change once VLLM_TAG is specified to branch or tag
|
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||||
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
|
||||||
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
|
|
||||||
# On x86, triton is installed by vLLM, but on Ascend triton doesn't work correctly, so we need to uninstall it.
|
# On x86, triton is installed by vLLM, but on Ascend triton doesn't work correctly, so we need to uninstall it.
|
||||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
python3 -m pip uninstall -y triton && \
|
python3 -m pip uninstall -y triton && \
|
||||||
|
|||||||
@@ -36,10 +36,8 @@ COPY . /vllm-workspace/vllm-ascend/
|
|||||||
|
|
||||||
# Install vLLM
|
# Install vLLM
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
|
ARG VLLM_TAG=v0.12.0
|
||||||
# Revert this change once VLLM_TAG is specified to branch or tag
|
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||||
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
|
||||||
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
|
|
||||||
# On x86, triton is installed by vLLM, but on Ascend triton doesn't work correctly, so we need to uninstall it.
|
# On x86, triton is installed by vLLM, but on Ascend triton doesn't work correctly, so we need to uninstall it.
|
||||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
python3 -m pip uninstall -y triton && \
|
python3 -m pip uninstall -y triton && \
|
||||||
|
|||||||
@@ -47,10 +47,8 @@ RUN apt-get update -y && \
|
|||||||
|
|
||||||
# Install vLLM
|
# Install vLLM
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
|
ARG VLLM_TAG=v0.12.0
|
||||||
# Revert this change once VLLM_TAG is specified to branch or tag
|
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||||
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
|
||||||
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
|
|
||||||
# On x86, triton is installed by vLLM, but on Ascend triton doesn't work correctly, so we need to uninstall it.
|
# On x86, triton is installed by vLLM, but on Ascend triton doesn't work correctly, so we need to uninstall it.
|
||||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
python3 -m pip uninstall -y triton && \
|
python3 -m pip uninstall -y triton && \
|
||||||
|
|||||||
@@ -50,10 +50,8 @@ RUN yum update -y && \
|
|||||||
|
|
||||||
# Install vLLM
|
# Install vLLM
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
|
ARG VLLM_TAG=v0.12.0
|
||||||
# Revert this change once VLLM_TAG is specified to branch or tag
|
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||||
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
|
||||||
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
|
|
||||||
# On x86, triton is installed by vLLM, but on Ascend triton doesn't work correctly, so we need to uninstall it.
|
# On x86, triton is installed by vLLM, but on Ascend triton doesn't work correctly, so we need to uninstall it.
|
||||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
python3 -m pip uninstall -y triton && \
|
python3 -m pip uninstall -y triton && \
|
||||||
|
|||||||
@@ -50,10 +50,8 @@ RUN yum update -y && \
|
|||||||
|
|
||||||
# Install vLLM
|
# Install vLLM
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
ARG VLLM_TAG=86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24
|
ARG VLLM_TAG=v0.12.0
|
||||||
# Revert this change once VLLM_TAG is specified to branch or tag
|
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||||
# RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
|
||||||
RUN git clone $VLLM_REPO /vllm-workspace/vllm && (cd /vllm-workspace/vllm && git checkout $VLLM_TAG)
|
|
||||||
# On x86, triton is installed by vLLM, but on Ascend triton doesn't work correctly, so we need to uninstall it.
|
# On x86, triton is installed by vLLM, but on Ascend triton doesn't work correctly, so we need to uninstall it.
|
||||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
python3 -m pip uninstall -y triton && \
|
python3 -m pip uninstall -y triton && \
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ The table below is the release compatibility matrix for vLLM Ascend release.
|
|||||||
For the main branch of vLLM Ascend, we usually keep it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is updated frequently, so check it regularly.
|
For the main branch of vLLM Ascend, we usually keep it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is updated frequently, so check it regularly.
|
||||||
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|
||||||
|-------------|--------------|------------------|-------------|--------------------|
|
|-------------|--------------|------------------|-------------|--------------------|
|
||||||
| main | v0.11.2 | >= 3.10, < 3.12 | 8.3.RC1 | 2.7.1 / 2.7.1 |
|
| main | v0.12.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
|
||||||
|
|
||||||
## Release cadence
|
## Release cadence
|
||||||
|
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ myst_substitutions = {
|
|||||||
# CANN image tag
|
# CANN image tag
|
||||||
'cann_image_tag': "8.3.rc2-910b-ubuntu22.04-py3.11",
|
'cann_image_tag': "8.3.rc2-910b-ubuntu22.04-py3.11",
|
||||||
# vllm version in ci
|
# vllm version in ci
|
||||||
'ci_vllm_version': '86e178f7c4d8c3b0eaf3c8e3f810a83f63b90e24',
|
'ci_vllm_version': 'v0.12.0',
|
||||||
}
|
}
|
||||||
|
|
||||||
# For cross-file header anchors
|
# For cross-file header anchors
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ class TestNPUPlatform(TestBase):
|
|||||||
mock_vllm_config.cache_config = MagicMock()
|
mock_vllm_config.cache_config = MagicMock()
|
||||||
mock_vllm_config.scheduler_config = MagicMock()
|
mock_vllm_config.scheduler_config = MagicMock()
|
||||||
mock_vllm_config.speculative_config = None
|
mock_vllm_config.speculative_config = None
|
||||||
mock_vllm_config.compilation_config.pass_config.enable_sequence_parallelism = False
|
mock_vllm_config.compilation_config.pass_config.enable_sp = False
|
||||||
mock_vllm_config.compilation_config.cudagraph_mode = None
|
mock_vllm_config.compilation_config.cudagraph_mode = None
|
||||||
return mock_vllm_config
|
return mock_vllm_config
|
||||||
|
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ if HAS_TRITON:
|
|||||||
# isort: off
|
# isort: off
|
||||||
import vllm_ascend.patch.platform.patch_sched_yield # noqa
|
import vllm_ascend.patch.platform.patch_sched_yield # noqa
|
||||||
import vllm_ascend.patch.worker.patch_distributed # noqa
|
import vllm_ascend.patch.worker.patch_distributed # noqa
|
||||||
|
import vllm_ascend.patch.worker.patch_deepseek # noqa
|
||||||
import vllm_ascend.patch.worker.patch_roberta # noqa
|
import vllm_ascend.patch.worker.patch_roberta # noqa
|
||||||
import vllm_ascend.patch.worker.patch_weight_loader # noqa
|
import vllm_ascend.patch.worker.patch_weight_loader # noqa
|
||||||
import vllm_ascend.patch.worker.patch_multimodal_merge # noqa
|
import vllm_ascend.patch.worker.patch_multimodal_merge # noqa
|
||||||
|
|||||||
60
vllm_ascend/patch/worker/patch_deepseek.py
Normal file
60
vllm_ascend/patch/worker/patch_deepseek.py
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
from itertools import islice
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from vllm.distributed import get_pp_group
|
||||||
|
from vllm.model_executor.models.deepseek_v2 import (DeepseekV2Model,
|
||||||
|
_get_llama_4_scaling)
|
||||||
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
|
|
||||||
|
def forward(
    self,
    input_ids,
    positions,
    intermediate_tensors,
    inputs_embeds,
):
    """Forward pass that this module assigns to ``DeepseekV2Model.forward``.

    Picks the input hidden states for the current pipeline-parallel rank,
    optionally computes the llama 4 scaling tensor once, runs the layers in
    ``self.layers[self.start_layer:self.end_layer]``, and either hands the
    intermediate state to the next rank or applies the final norm.

    Args:
        input_ids: Passed to ``self.embed_input_ids`` on the first rank when
            ``inputs_embeds`` is ``None``.
        positions: Forwarded to every layer and to ``_get_llama_4_scaling``.
        intermediate_tensors: Mapping with ``"hidden_states"`` and
            ``"residual"`` entries; must not be ``None`` on non-first ranks.
        inputs_embeds: When not ``None`` on the first rank, used directly as
            the initial hidden states instead of embedding ``input_ids``.

    Returns:
        ``IntermediateTensors`` carrying ``hidden_states`` and ``residual``
        when this is not the last pipeline rank; otherwise the first element
        returned by ``self.norm(hidden_states, residual)``.
    """
    if get_pp_group().is_first_rank:
        # First pipeline stage: start from embeddings, with no residual yet.
        if inputs_embeds is not None:
            hidden_states = inputs_embeds
        else:
            hidden_states = self.embed_input_ids(input_ids)
        residual = None
    else:
        # Later stages resume from what the previous stage handed over.
        assert intermediate_tensors is not None
        hidden_states = intermediate_tensors["hidden_states"]
        residual = intermediate_tensors["residual"]

    # Compute llama 4 scaling once per forward pass if enabled.
    # NOTE(wxy): This is a hack to avoid a graph-mode error with torch 2.8.
    # Presumably the try/except below (rather than a getattr default) is the
    # workaround -- confirm against upstream vLLM before simplifying it.
    # TODO: find a better fix so that this patch can be removed.
    try:
        llama_4_scaling_config = getattr(self.config, "llama_4_scaling")
    except AttributeError:
        llama_4_scaling_config = None
    llama_4_scaling: torch.Tensor | None
    if llama_4_scaling_config is not None:
        llama_4_scaling = _get_llama_4_scaling(
            original_max_position_embeddings=llama_4_scaling_config[
                "original_max_position_embeddings"],
            scaling_beta=llama_4_scaling_config["beta"],
            positions=positions,
        )
    else:
        llama_4_scaling = None

    # Only the layers in [start_layer, end_layer) are run here.
    for layer in islice(self.layers, self.start_layer, self.end_layer):
        hidden_states, residual = layer(positions, hidden_states, residual,
                                        llama_4_scaling)

    if not get_pp_group().is_last_rank:
        # Not the last stage: pass the running state on without the final norm.
        return IntermediateTensors({
            "hidden_states": hidden_states,
            "residual": residual
        })

    # Last stage: apply the final norm; its second return value is discarded.
    hidden_states, _ = self.norm(hidden_states, residual)
    return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
# Monkey-patch: replace DeepseekV2Model.forward with the forward defined in this module.
DeepseekV2Model.forward = forward
|
||||||
@@ -159,7 +159,8 @@ class NPUPlatform(Platform):
|
|||||||
compilation_config.splitting_ops = []
|
compilation_config.splitting_ops = []
|
||||||
|
|
||||||
compilation_config.cudagraph_num_of_warmups = 1
|
compilation_config.cudagraph_num_of_warmups = 1
|
||||||
compilation_config.pass_config.enable_fusion = False
|
compilation_config.pass_config.fuse_norm_quant = False
|
||||||
|
compilation_config.pass_config.fuse_act_quant = False
|
||||||
|
|
||||||
if compilation_config.mode not in [
|
if compilation_config.mode not in [
|
||||||
CompilationMode.NONE, CompilationMode.VLLM_COMPILE
|
CompilationMode.NONE, CompilationMode.VLLM_COMPILE
|
||||||
@@ -194,7 +195,7 @@ class NPUPlatform(Platform):
|
|||||||
# to ascend ops && hardwares. We update these sizes here to improve
|
# to ascend ops && hardwares. We update these sizes here to improve
|
||||||
# default performance.
|
# default performance.
|
||||||
update_default_aclgraph_sizes(vllm_config)
|
update_default_aclgraph_sizes(vllm_config)
|
||||||
# TODO delete graph size update here when compilation_config.pass_config.enable_sequence_parallelism
|
# TODO delete graph size update here when compilation_config.pass_config.enable_sp
|
||||||
# is supported by vllm-ascend.
|
# is supported by vllm-ascend.
|
||||||
if vllm_config.parallel_config.tensor_parallel_size > 1 and not vllm_config.model_config.enforce_eager and \
|
if vllm_config.parallel_config.tensor_parallel_size > 1 and not vllm_config.model_config.enforce_eager and \
|
||||||
enable_sp(vllm_config):
|
enable_sp(vllm_config):
|
||||||
|
|||||||
@@ -315,8 +315,8 @@ class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):
|
|||||||
eps=config.rms_norm_eps)
|
eps=config.rms_norm_eps)
|
||||||
|
|
||||||
self.enable_sequence_parallelism = (
|
self.enable_sequence_parallelism = (
|
||||||
vllm_config.compilation_config.pass_config.
|
vllm_config.compilation_config.pass_config.enable_sp
|
||||||
enable_sequence_parallelism if vllm_config is not None else False)
|
if vllm_config is not None else False)
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
@@ -488,7 +488,7 @@ class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
|
|||||||
self.make_empty_intermediate_tensors = (
|
self.make_empty_intermediate_tensors = (
|
||||||
self.model.make_empty_intermediate_tensors)
|
self.model.make_empty_intermediate_tensors)
|
||||||
|
|
||||||
self.enable_sequence_parallelism = vllm_config.compilation_config.pass_config.enable_sequence_parallelism
|
self.enable_sequence_parallelism = vllm_config.compilation_config.pass_config.enable_sp
|
||||||
# Set MoE hyperparameters
|
# Set MoE hyperparameters
|
||||||
self.expert_weights: list[torch.Tensor] = []
|
self.expert_weights: list[torch.Tensor] = []
|
||||||
|
|
||||||
|
|||||||
@@ -773,8 +773,7 @@ def enable_sp(vllm_config=None, enable_shared_expert_dp: bool = False) -> bool:
|
|||||||
from vllm.config import get_current_vllm_config
|
from vllm.config import get_current_vllm_config
|
||||||
vllm_config = get_current_vllm_config()
|
vllm_config = get_current_vllm_config()
|
||||||
_ENABLE_SP = (
|
_ENABLE_SP = (
|
||||||
vllm_config.compilation_config.pass_config.
|
vllm_config.compilation_config.pass_config.enable_sp
|
||||||
enable_sequence_parallelism
|
|
||||||
or envs_ascend.VLLM_ASCEND_ENABLE_FLASHCOMM1
|
or envs_ascend.VLLM_ASCEND_ENABLE_FLASHCOMM1
|
||||||
# Flash comm 1 should be enabled by env VLLM_ASCEND_ENABLE_FLASHCOMM1
|
# Flash comm 1 should be enabled by env VLLM_ASCEND_ENABLE_FLASHCOMM1
|
||||||
# We retain the env VLLM_ASCEND_ENABLE_FLASHCOMM here for backward compatibility.
|
# We retain the env VLLM_ASCEND_ENABLE_FLASHCOMM here for backward compatibility.
|
||||||
|
|||||||
Reference in New Issue
Block a user