[CI] fix ci (#2464)
### What this PR does / why we need it?
1. use action/checkout@v5 instead of v4
2. remove dbo test case because there is an issue with it and it will be
refactored later
3. make vllm-ascend compatible with vllm v0.10.1.1 and add CI for it
4. fix sampler api changes introduced by
https://github.com/vllm-project/vllm/pull/22387
5. fix qwen3 moe config changes introduced by
https://github.com/vllm-project/vllm/pull/20562
6. fix kvcache block changes introduced by
https://github.com/vllm-project/vllm/pull/23262
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
CI passed with existing test.
- vLLM version: v0.10.0
- vLLM main:
0c6e40bbaa
---------
Signed-off-by: MengqingCao <cmq0113@163.com>
This commit is contained in:
8
.github/workflows/accuracy_test.yaml
vendored
8
.github/workflows/accuracy_test.yaml
vendored
@@ -88,7 +88,7 @@ jobs:
|
|||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v5
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Set model name as output
|
- name: Set model name as output
|
||||||
id: set_output
|
id: set_output
|
||||||
@@ -109,7 +109,7 @@ jobs:
|
|||||||
apt-get -y install gcc g++ cmake libnuma-dev
|
apt-get -y install gcc g++ cmake libnuma-dev
|
||||||
|
|
||||||
- name: Checkout vllm-project/vllm repo
|
- name: Checkout vllm-project/vllm repo
|
||||||
uses: actions/checkout@v5
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
repository: vllm-project/vllm
|
repository: vllm-project/vllm
|
||||||
ref: v0.10.0
|
ref: v0.10.0
|
||||||
@@ -138,7 +138,7 @@ jobs:
|
|||||||
echo "GHA_VLLM_ASCEND_VERSION=$RESOLVED_VERSION" >> $GITHUB_ENV
|
echo "GHA_VLLM_ASCEND_VERSION=$RESOLVED_VERSION" >> $GITHUB_ENV
|
||||||
|
|
||||||
- name: Checkout vllm-project/vllm-ascend repo
|
- name: Checkout vllm-project/vllm-ascend repo
|
||||||
uses: actions/checkout@v5
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
repository: vllm-project/vllm-ascend
|
repository: vllm-project/vllm-ascend
|
||||||
path: ./vllm-ascend
|
path: ./vllm-ascend
|
||||||
@@ -236,7 +236,7 @@ jobs:
|
|||||||
UPSTREAM_REPO: vllm-project/vllm-ascend
|
UPSTREAM_REPO: vllm-project/vllm-ascend
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v5
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
repository: vllm-ascend-ci/vllm-ascend
|
repository: vllm-ascend-ci/vllm-ascend
|
||||||
token: ${{ secrets.PAT_TOKEN }}
|
token: ${{ secrets.PAT_TOKEN }}
|
||||||
|
|||||||
2
.github/workflows/format_pr_body.yaml
vendored
2
.github/workflows/format_pr_body.yaml
vendored
@@ -34,7 +34,7 @@ jobs:
|
|||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout vllm-project/vllm repo
|
- name: Checkout vllm-project/vllm repo
|
||||||
uses: actions/checkout@v5
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
repository: vllm-project/vllm
|
repository: vllm-project/vllm
|
||||||
path: ./vllm-empty
|
path: ./vllm-empty
|
||||||
|
|||||||
2
.github/workflows/image_310p_openeuler.yml
vendored
2
.github/workflows/image_310p_openeuler.yml
vendored
@@ -53,7 +53,7 @@ jobs:
|
|||||||
'ubuntu-24.04-arm'
|
'ubuntu-24.04-arm'
|
||||||
}}
|
}}
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v5
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Print
|
- name: Print
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
2
.github/workflows/image_310p_ubuntu.yml
vendored
2
.github/workflows/image_310p_ubuntu.yml
vendored
@@ -49,7 +49,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v5
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Print
|
- name: Print
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
2
.github/workflows/image_a3_openeuler.yml
vendored
2
.github/workflows/image_a3_openeuler.yml
vendored
@@ -53,7 +53,7 @@ jobs:
|
|||||||
'ubuntu-24.04-arm'
|
'ubuntu-24.04-arm'
|
||||||
}}
|
}}
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v5
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Print
|
- name: Print
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
2
.github/workflows/image_a3_ubuntu.yml
vendored
2
.github/workflows/image_a3_ubuntu.yml
vendored
@@ -49,7 +49,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v5
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Print
|
- name: Print
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
2
.github/workflows/image_openeuler.yml
vendored
2
.github/workflows/image_openeuler.yml
vendored
@@ -52,7 +52,7 @@ jobs:
|
|||||||
'ubuntu-24.04-arm'
|
'ubuntu-24.04-arm'
|
||||||
}}
|
}}
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v5
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Print
|
- name: Print
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
2
.github/workflows/image_ubuntu.yml
vendored
2
.github/workflows/image_ubuntu.yml
vendored
@@ -49,7 +49,7 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v5
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Print
|
- name: Print
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
4
.github/workflows/nightly_benchmarks.yaml
vendored
4
.github/workflows/nightly_benchmarks.yaml
vendored
@@ -97,12 +97,12 @@ jobs:
|
|||||||
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
|
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
|
||||||
|
|
||||||
- name: Checkout vllm-project/vllm-ascend repo
|
- name: Checkout vllm-project/vllm-ascend repo
|
||||||
uses: actions/checkout@v5
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
fetch-depth: 0
|
fetch-depth: 0
|
||||||
|
|
||||||
- name: Checkout vllm-project/vllm repo
|
- name: Checkout vllm-project/vllm repo
|
||||||
uses: actions/checkout@v5
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
repository: vllm-project/vllm
|
repository: vllm-project/vllm
|
||||||
path: ./vllm-empty
|
path: ./vllm-empty
|
||||||
|
|||||||
4
.github/workflows/pre-commit.yml
vendored
4
.github/workflows/pre-commit.yml
vendored
@@ -11,14 +11,14 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout vllm-project/vllm-ascend repo
|
- name: Checkout vllm-project/vllm-ascend repo
|
||||||
uses: actions/checkout@v5
|
uses: actions/checkout@v4
|
||||||
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
|
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
|
||||||
with:
|
with:
|
||||||
python-version: "3.11"
|
python-version: "3.11"
|
||||||
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
|
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
|
||||||
- run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
|
- run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
|
||||||
- name: Checkout vllm-project/vllm repo
|
- name: Checkout vllm-project/vllm repo
|
||||||
uses: actions/checkout@v5
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
repository: vllm-project/vllm
|
repository: vllm-project/vllm
|
||||||
path: ./vllm-empty
|
path: ./vllm-empty
|
||||||
|
|||||||
2
.github/workflows/vllm_ascend_doctest.yaml
vendored
2
.github/workflows/vllm_ascend_doctest.yaml
vendored
@@ -66,7 +66,7 @@ jobs:
|
|||||||
git --no-pager log -1 || true
|
git --no-pager log -1 || true
|
||||||
|
|
||||||
- name: Checkout vllm-project/vllm-ascend repo
|
- name: Checkout vllm-project/vllm-ascend repo
|
||||||
uses: actions/checkout@v5
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Run vllm-ascend/tests/e2e/run_doctests.sh
|
- name: Run vllm-ascend/tests/e2e/run_doctests.sh
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
11
.github/workflows/vllm_ascend_test.yaml
vendored
11
.github/workflows/vllm_ascend_test.yaml
vendored
@@ -81,7 +81,7 @@ jobs:
|
|||||||
VLLM_USE_MODELSCOPE: True
|
VLLM_USE_MODELSCOPE: True
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
vllm_version: [main]
|
vllm_version: [v0.10.1.1, main]
|
||||||
steps:
|
steps:
|
||||||
- name: Install packages
|
- name: Install packages
|
||||||
run: |
|
run: |
|
||||||
@@ -89,7 +89,7 @@ jobs:
|
|||||||
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2
|
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2
|
||||||
|
|
||||||
- name: Checkout vllm-project/vllm repo
|
- name: Checkout vllm-project/vllm repo
|
||||||
uses: actions/checkout@v5
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
repository: vllm-project/vllm
|
repository: vllm-project/vllm
|
||||||
ref: ${{ matrix.vllm_version }}
|
ref: ${{ matrix.vllm_version }}
|
||||||
@@ -102,7 +102,7 @@ jobs:
|
|||||||
python3 -m pip uninstall -y triton
|
python3 -m pip uninstall -y triton
|
||||||
|
|
||||||
- name: Checkout vllm-project/vllm-ascend repo
|
- name: Checkout vllm-project/vllm-ascend repo
|
||||||
uses: actions/checkout@v5
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Install vllm-project/vllm-ascend
|
- name: Install vllm-project/vllm-ascend
|
||||||
run: |
|
run: |
|
||||||
@@ -137,7 +137,7 @@ jobs:
|
|||||||
max-parallel: 2
|
max-parallel: 2
|
||||||
matrix:
|
matrix:
|
||||||
os: [linux-aarch64-a2-1]
|
os: [linux-aarch64-a2-1]
|
||||||
vllm_version: [main]
|
vllm_version: [v0.10.1.1, main]
|
||||||
name: singlecard e2e test
|
name: singlecard e2e test
|
||||||
runs-on: ${{ matrix.os }}
|
runs-on: ${{ matrix.os }}
|
||||||
container:
|
container:
|
||||||
@@ -219,7 +219,7 @@ jobs:
|
|||||||
max-parallel: 2
|
max-parallel: 2
|
||||||
matrix:
|
matrix:
|
||||||
os: [linux-aarch64-a2-2]
|
os: [linux-aarch64-a2-2]
|
||||||
vllm_version: [main]
|
vllm_version: [v0.10.1.1, main]
|
||||||
name: multicard e2e test
|
name: multicard e2e test
|
||||||
runs-on: ${{ matrix.os }}
|
runs-on: ${{ matrix.os }}
|
||||||
container:
|
container:
|
||||||
@@ -278,7 +278,6 @@ jobs:
|
|||||||
# To avoid oom, we need to run the test in a single process.
|
# To avoid oom, we need to run the test in a single process.
|
||||||
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
|
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
|
||||||
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
|
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
|
||||||
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo
|
|
||||||
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo
|
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo
|
||||||
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_alltoallv
|
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_alltoallv
|
||||||
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
|
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
|
||||||
|
|||||||
6
.github/workflows/vllm_ascend_test_310p.yaml
vendored
6
.github/workflows/vllm_ascend_test_310p.yaml
vendored
@@ -53,7 +53,7 @@ jobs:
|
|||||||
max-parallel: 2
|
max-parallel: 2
|
||||||
matrix:
|
matrix:
|
||||||
os: [linux-aarch64-310p-1, linux-aarch64-310p-4]
|
os: [linux-aarch64-310p-1, linux-aarch64-310p-4]
|
||||||
vllm_version: [main]
|
vllm_version: [v0.10.1.1, main]
|
||||||
name: 310p e2e test
|
name: 310p e2e test
|
||||||
runs-on: ${{ matrix.os }}
|
runs-on: ${{ matrix.os }}
|
||||||
container:
|
container:
|
||||||
@@ -77,7 +77,7 @@ jobs:
|
|||||||
apt install git -y
|
apt install git -y
|
||||||
|
|
||||||
- name: Checkout vllm-project/vllm-ascend repo
|
- name: Checkout vllm-project/vllm-ascend repo
|
||||||
uses: actions/checkout@v5
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Install system dependencies
|
- name: Install system dependencies
|
||||||
run: |
|
run: |
|
||||||
@@ -85,7 +85,7 @@ jobs:
|
|||||||
apt-get -y install git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2
|
apt-get -y install git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2
|
||||||
|
|
||||||
- name: Checkout vllm-project/vllm repo
|
- name: Checkout vllm-project/vllm repo
|
||||||
uses: actions/checkout@v5
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
repository: vllm-project/vllm
|
repository: vllm-project/vllm
|
||||||
ref: ${{ matrix.vllm_version }}
|
ref: ${{ matrix.vllm_version }}
|
||||||
|
|||||||
4
.github/workflows/vllm_ascend_test_pd.yaml
vendored
4
.github/workflows/vllm_ascend_test_pd.yaml
vendored
@@ -80,7 +80,7 @@ jobs:
|
|||||||
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
|
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
|
||||||
|
|
||||||
- name: Checkout vllm-project/vllm-ascend repo
|
- name: Checkout vllm-project/vllm-ascend repo
|
||||||
uses: actions/checkout@v5
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Install system dependencies
|
- name: Install system dependencies
|
||||||
run: |
|
run: |
|
||||||
@@ -88,7 +88,7 @@ jobs:
|
|||||||
apt-get -y install gcc g++ cmake libnuma-dev
|
apt-get -y install gcc g++ cmake libnuma-dev
|
||||||
|
|
||||||
- name: Checkout vllm-project/vllm repo
|
- name: Checkout vllm-project/vllm repo
|
||||||
uses: actions/checkout@v5
|
uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
repository: vllm-project/vllm
|
repository: vllm-project/vllm
|
||||||
ref: ${{ matrix.vllm_verison }}
|
ref: ${{ matrix.vllm_verison }}
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
|
|||||||
|
|
||||||
# Install vLLM
|
# Install vLLM
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
ARG VLLM_TAG=v0.10.0
|
ARG VLLM_TAG=v0.10.1.1
|
||||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
|
|||||||
|
|
||||||
# Install vLLM
|
# Install vLLM
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
ARG VLLM_TAG=v0.10.0
|
ARG VLLM_TAG=v0.10.1.1
|
||||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
|
|||||||
|
|
||||||
# Install vLLM
|
# Install vLLM
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
ARG VLLM_TAG=v0.10.0
|
ARG VLLM_TAG=v0.10.1.1
|
||||||
|
|
||||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
|
|||||||
|
|
||||||
# Install vLLM
|
# Install vLLM
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
ARG VLLM_TAG=v0.10.0
|
ARG VLLM_TAG=v0.10.1.1
|
||||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
|
|||||||
|
|
||||||
# Install vLLM
|
# Install vLLM
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
ARG VLLM_TAG=v0.10.0
|
ARG VLLM_TAG=v0.10.1.1
|
||||||
|
|
||||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ COPY . /vllm-workspace/vllm-ascend/
|
|||||||
|
|
||||||
# Install vLLM
|
# Install vLLM
|
||||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||||
ARG VLLM_TAG=v0.10.0
|
ARG VLLM_TAG=v0.10.1.1
|
||||||
|
|
||||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||||
|
|||||||
@@ -78,26 +78,6 @@ def test_models_distributed_DeepSeek_multistream_moe():
|
|||||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||||
|
|
||||||
|
|
||||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"})
|
|
||||||
def test_models_distributed_DeepSeek_dbo():
|
|
||||||
example_prompts = ["The president of the United States is"] * 41
|
|
||||||
dtype = "half"
|
|
||||||
sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
|
|
||||||
with VllmRunner(
|
|
||||||
"deepseek-ai/DeepSeek-V2-Lite",
|
|
||||||
dtype=dtype,
|
|
||||||
tensor_parallel_size=2,
|
|
||||||
distributed_executor_backend="mp",
|
|
||||||
) as vllm_model:
|
|
||||||
model_arch = 'DeepseekV2ForCausalLM'
|
|
||||||
registed_models = ModelRegistry.models
|
|
||||||
assert registed_models[
|
|
||||||
model_arch].module_name == "vllm_ascend.models.deepseek_dbo"
|
|
||||||
assert registed_models[
|
|
||||||
model_arch].class_name == "CustomDeepseekDBOForCausalLM"
|
|
||||||
vllm_model.generate(example_prompts, sampling_params)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip(
|
@pytest.mark.skip(
|
||||||
reason=
|
reason=
|
||||||
"deepseek dbo dose not consider the support on half precision float, will enable this ut after we actually support it"
|
"deepseek dbo dose not consider the support on half precision float, will enable this ut after we actually support it"
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
|
|||||||
from vllm.v1.core.sched.output import SchedulerOutput
|
from vllm.v1.core.sched.output import SchedulerOutput
|
||||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||||
KVCacheGroupSpec)
|
KVCacheGroupSpec)
|
||||||
from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
|
from vllm.v1.outputs import ModelRunnerOutput
|
||||||
from vllm.v1.request import Request, RequestStatus
|
from vllm.v1.request import Request, RequestStatus
|
||||||
from vllm.v1.structured_output import StructuredOutputManager
|
from vllm.v1.structured_output import StructuredOutputManager
|
||||||
|
|
||||||
@@ -21,6 +21,11 @@ from tests.ut.base import TestBase
|
|||||||
from vllm_ascend.core.scheduler import AscendScheduler
|
from vllm_ascend.core.scheduler import AscendScheduler
|
||||||
from vllm_ascend.utils import vllm_version_is
|
from vllm_ascend.utils import vllm_version_is
|
||||||
|
|
||||||
|
if not vllm_version_is("0.10.1.1"):
|
||||||
|
from vllm.v1.outputs import DraftTokenIds
|
||||||
|
else:
|
||||||
|
DraftTokenIds = None
|
||||||
|
|
||||||
EOS_TOKEN_ID = 50256
|
EOS_TOKEN_ID = 50256
|
||||||
MODEL = "Qwen3-0.6B"
|
MODEL = "Qwen3-0.6B"
|
||||||
ENABLE_PREFIX_CACHING = None
|
ENABLE_PREFIX_CACHING = None
|
||||||
@@ -66,16 +71,33 @@ def create_requests(
|
|||||||
|
|
||||||
|
|
||||||
def make_output(scheduler):
|
def make_output(scheduler):
|
||||||
return ModelRunnerOutput(
|
req_ids = [req.request_id for req in scheduler.running]
|
||||||
req_ids=[req.request_id for req in scheduler.running],
|
req_id_to_index = {
|
||||||
req_id_to_index={
|
req.request_id: i
|
||||||
req.request_id: i
|
for i, req in enumerate(scheduler.running)
|
||||||
for i, req in enumerate(scheduler.running)
|
}
|
||||||
},
|
sampled_token_ids = [[1000]] * len(scheduler.running)
|
||||||
sampled_token_ids=[[1000]] * len(scheduler.running),
|
logprobs = None
|
||||||
logprobs=None,
|
if vllm_version_is("0.10.1.1"):
|
||||||
prompt_logprobs_dict={},
|
modelrunner_output = ModelRunnerOutput(
|
||||||
pooler_output=[])
|
req_ids=req_ids,
|
||||||
|
req_id_to_index=req_id_to_index,
|
||||||
|
sampled_token_ids=sampled_token_ids,
|
||||||
|
spec_token_ids=None,
|
||||||
|
logprobs=logprobs,
|
||||||
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=[],
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
modelrunner_output = ModelRunnerOutput(
|
||||||
|
req_ids=req_ids,
|
||||||
|
req_id_to_index=req_id_to_index,
|
||||||
|
sampled_token_ids=sampled_token_ids,
|
||||||
|
logprobs=logprobs,
|
||||||
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=[],
|
||||||
|
)
|
||||||
|
return modelrunner_output
|
||||||
|
|
||||||
|
|
||||||
class TestAscendScheduler(TestBase):
|
class TestAscendScheduler(TestBase):
|
||||||
@@ -271,8 +293,7 @@ class TestAscendScheduler(TestBase):
|
|||||||
req.num_computed_tokens = req.num_tokens
|
req.num_computed_tokens = req.num_tokens
|
||||||
scheduler.requests[req.request_id] = req
|
scheduler.requests[req.request_id] = req
|
||||||
scheduler.running.append(req)
|
scheduler.running.append(req)
|
||||||
if not vllm_version_is("0.9.2"):
|
req.status = RequestStatus.RUNNING
|
||||||
req.status = RequestStatus.RUNNING
|
|
||||||
|
|
||||||
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
|
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
|
||||||
scheduled_cached_reqs=[],
|
scheduled_cached_reqs=[],
|
||||||
@@ -291,18 +312,33 @@ class TestAscendScheduler(TestBase):
|
|||||||
free_encoder_input_ids=[],
|
free_encoder_input_ids=[],
|
||||||
structured_output_request_ids={},
|
structured_output_request_ids={},
|
||||||
grammar_bitmask=None)
|
grammar_bitmask=None)
|
||||||
|
if vllm_version_is("0.10.1.1"):
|
||||||
model_output = ModelRunnerOutput(
|
model_output = ModelRunnerOutput(
|
||||||
req_ids=[req.request_id for req in requests],
|
req_ids=[req.request_id for req in requests],
|
||||||
req_id_to_index={
|
req_id_to_index={
|
||||||
req.request_id: i
|
req.request_id: i
|
||||||
for i, req in enumerate(requests)
|
for i, req in enumerate(requests)
|
||||||
},
|
},
|
||||||
sampled_token_ids=[[EOS_TOKEN_ID], [10, 11]
|
sampled_token_ids=[[EOS_TOKEN_ID], [
|
||||||
], # First request hits EOS, second continues
|
10, 11
|
||||||
logprobs=None,
|
]], # First request hits EOS, second continues
|
||||||
prompt_logprobs_dict={},
|
spec_token_ids=None,
|
||||||
pooler_output=[])
|
logprobs=None,
|
||||||
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=[])
|
||||||
|
else:
|
||||||
|
model_output = ModelRunnerOutput(
|
||||||
|
req_ids=[req.request_id for req in requests],
|
||||||
|
req_id_to_index={
|
||||||
|
req.request_id: i
|
||||||
|
for i, req in enumerate(requests)
|
||||||
|
},
|
||||||
|
sampled_token_ids=[[EOS_TOKEN_ID], [
|
||||||
|
10, 11
|
||||||
|
]], # First request hits EOS, second continues
|
||||||
|
logprobs=None,
|
||||||
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=[])
|
||||||
|
|
||||||
scheduler.update_from_output(scheduler_output, model_output)
|
scheduler.update_from_output(scheduler_output, model_output)
|
||||||
|
|
||||||
@@ -325,8 +361,7 @@ class TestAscendScheduler(TestBase):
|
|||||||
req.num_computed_tokens = req.num_tokens
|
req.num_computed_tokens = req.num_tokens
|
||||||
scheduler.requests[req.request_id] = req
|
scheduler.requests[req.request_id] = req
|
||||||
scheduler.running.append(req)
|
scheduler.running.append(req)
|
||||||
if not vllm_version_is("0.9.2"):
|
req.status = RequestStatus.RUNNING
|
||||||
req.status = RequestStatus.RUNNING
|
|
||||||
|
|
||||||
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
|
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
|
||||||
scheduled_cached_reqs=[],
|
scheduled_cached_reqs=[],
|
||||||
@@ -346,18 +381,31 @@ class TestAscendScheduler(TestBase):
|
|||||||
free_encoder_input_ids=[],
|
free_encoder_input_ids=[],
|
||||||
structured_output_request_ids={},
|
structured_output_request_ids={},
|
||||||
grammar_bitmask=None)
|
grammar_bitmask=None)
|
||||||
|
if vllm_version_is("0.10.1.1"):
|
||||||
model_output = ModelRunnerOutput(
|
model_output = ModelRunnerOutput(
|
||||||
req_ids=[req.request_id for req in requests],
|
req_ids=[req.request_id for req in requests],
|
||||||
req_id_to_index={
|
req_id_to_index={
|
||||||
req.request_id: i
|
req.request_id: i
|
||||||
for i, req in enumerate(requests)
|
for i, req in enumerate(requests)
|
||||||
},
|
},
|
||||||
sampled_token_ids=[[10, 42, 12],
|
sampled_token_ids=[[10, 42, 12],
|
||||||
[13, 14]], # First request hits stop token
|
[13, 14]], # First request hits stop token
|
||||||
logprobs=None,
|
spec_token_ids=None,
|
||||||
prompt_logprobs_dict={},
|
logprobs=None,
|
||||||
pooler_output=[])
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=[])
|
||||||
|
else:
|
||||||
|
model_output = ModelRunnerOutput(
|
||||||
|
req_ids=[req.request_id for req in requests],
|
||||||
|
req_id_to_index={
|
||||||
|
req.request_id: i
|
||||||
|
for i, req in enumerate(requests)
|
||||||
|
},
|
||||||
|
sampled_token_ids=[[10, 42, 12],
|
||||||
|
[13, 14]], # First request hits stop token
|
||||||
|
logprobs=None,
|
||||||
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=[])
|
||||||
|
|
||||||
scheduler.update_from_output(scheduler_output, model_output)
|
scheduler.update_from_output(scheduler_output, model_output)
|
||||||
|
|
||||||
@@ -379,8 +427,7 @@ class TestAscendScheduler(TestBase):
|
|||||||
req.num_computed_tokens = req.num_tokens
|
req.num_computed_tokens = req.num_tokens
|
||||||
scheduler.requests[req.request_id] = req
|
scheduler.requests[req.request_id] = req
|
||||||
scheduler.running.append(req)
|
scheduler.running.append(req)
|
||||||
if not vllm_version_is("0.9.2"):
|
req.status = RequestStatus.RUNNING
|
||||||
req.status = RequestStatus.RUNNING
|
|
||||||
|
|
||||||
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
|
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
|
||||||
scheduled_cached_reqs=[],
|
scheduled_cached_reqs=[],
|
||||||
@@ -401,18 +448,31 @@ class TestAscendScheduler(TestBase):
|
|||||||
structured_output_request_ids={},
|
structured_output_request_ids={},
|
||||||
grammar_bitmask=None)
|
grammar_bitmask=None)
|
||||||
|
|
||||||
model_output = ModelRunnerOutput(
|
if vllm_version_is("0.10.1.1"):
|
||||||
req_ids=[req.request_id for req in requests],
|
model_output = ModelRunnerOutput(
|
||||||
req_id_to_index={
|
req_ids=[req.request_id for req in requests],
|
||||||
req.request_id: i
|
req_id_to_index={
|
||||||
for i, req in enumerate(requests)
|
req.request_id: i
|
||||||
},
|
for i, req in enumerate(requests)
|
||||||
sampled_token_ids=[[10, 11, 12],
|
},
|
||||||
[13]], # First request exceeds max_tokens
|
sampled_token_ids=[[10, 11, 12],
|
||||||
logprobs=None,
|
[13]], # First request exceeds max_tokens
|
||||||
prompt_logprobs_dict={},
|
spec_token_ids=None,
|
||||||
pooler_output=[])
|
logprobs=None,
|
||||||
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=[])
|
||||||
|
else:
|
||||||
|
model_output = ModelRunnerOutput(
|
||||||
|
req_ids=[req.request_id for req in requests],
|
||||||
|
req_id_to_index={
|
||||||
|
req.request_id: i
|
||||||
|
for i, req in enumerate(requests)
|
||||||
|
},
|
||||||
|
sampled_token_ids=[[10, 11, 12],
|
||||||
|
[13]], # First request exceeds max_tokens
|
||||||
|
logprobs=None,
|
||||||
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=[])
|
||||||
scheduler.update_from_output(scheduler_output, model_output)
|
scheduler.update_from_output(scheduler_output, model_output)
|
||||||
|
|
||||||
# Verify first request stopped due to length
|
# Verify first request stopped due to length
|
||||||
@@ -448,13 +508,24 @@ class TestAscendScheduler(TestBase):
|
|||||||
structured_output_request_ids={},
|
structured_output_request_ids={},
|
||||||
grammar_bitmask=None)
|
grammar_bitmask=None)
|
||||||
|
|
||||||
model_output = ModelRunnerOutput(
|
if vllm_version_is("0.10.1.1"):
|
||||||
req_ids=[requests[0].request_id],
|
model_output = ModelRunnerOutput(
|
||||||
req_id_to_index={requests[0].request_id: 0},
|
req_ids=[requests[0].request_id],
|
||||||
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
|
req_id_to_index={requests[0].request_id: 0},
|
||||||
logprobs=None,
|
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
|
||||||
prompt_logprobs_dict={},
|
spec_token_ids=None,
|
||||||
pooler_output=[])
|
logprobs=None,
|
||||||
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=[])
|
||||||
|
|
||||||
|
else:
|
||||||
|
model_output = ModelRunnerOutput(
|
||||||
|
req_ids=[requests[0].request_id],
|
||||||
|
req_id_to_index={requests[0].request_id: 0},
|
||||||
|
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
|
||||||
|
logprobs=None,
|
||||||
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=[])
|
||||||
|
|
||||||
scheduler.update_from_output(scheduler_output, model_output)
|
scheduler.update_from_output(scheduler_output, model_output)
|
||||||
|
|
||||||
@@ -505,13 +576,23 @@ class TestAscendScheduler(TestBase):
|
|||||||
512)
|
512)
|
||||||
|
|
||||||
# Model output of the first request.
|
# Model output of the first request.
|
||||||
model_runner_output = ModelRunnerOutput(
|
if vllm_version_is("0.10.1.1"):
|
||||||
req_ids=[requests[0].request_id],
|
model_runner_output = ModelRunnerOutput(
|
||||||
req_id_to_index={requests[0].request_id: 0},
|
req_ids=[requests[0].request_id],
|
||||||
sampled_token_ids=[[0]],
|
req_id_to_index={requests[0].request_id: 0},
|
||||||
logprobs=None,
|
sampled_token_ids=[[0]],
|
||||||
prompt_logprobs_dict={},
|
spec_token_ids=None,
|
||||||
pooler_output=[])
|
logprobs=None,
|
||||||
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=[])
|
||||||
|
else:
|
||||||
|
model_runner_output = ModelRunnerOutput(
|
||||||
|
req_ids=[requests[0].request_id],
|
||||||
|
req_id_to_index={requests[0].request_id: 0},
|
||||||
|
sampled_token_ids=[[0]],
|
||||||
|
logprobs=None,
|
||||||
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=[])
|
||||||
|
|
||||||
scheduler.update_from_output(scheduler_output0,
|
scheduler.update_from_output(scheduler_output0,
|
||||||
model_runner_output)
|
model_runner_output)
|
||||||
@@ -521,13 +602,23 @@ class TestAscendScheduler(TestBase):
|
|||||||
# request is still running.
|
# request is still running.
|
||||||
scheduler.schedule()
|
scheduler.schedule()
|
||||||
# Model output of the second request.
|
# Model output of the second request.
|
||||||
model_runner_output = ModelRunnerOutput(
|
if vllm_version_is("0.10.1.1"):
|
||||||
req_ids=[requests[1].request_id],
|
model_runner_output = ModelRunnerOutput(
|
||||||
req_id_to_index={requests[1].request_id: 0},
|
req_ids=[requests[1].request_id],
|
||||||
sampled_token_ids=[[0]],
|
req_id_to_index={requests[1].request_id: 0},
|
||||||
logprobs=None,
|
sampled_token_ids=[[0]],
|
||||||
prompt_logprobs_dict={},
|
spec_token_ids=None,
|
||||||
pooler_output=[])
|
logprobs=None,
|
||||||
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=[])
|
||||||
|
else:
|
||||||
|
model_runner_output = ModelRunnerOutput(
|
||||||
|
req_ids=[requests[1].request_id],
|
||||||
|
req_id_to_index={requests[1].request_id: 0},
|
||||||
|
sampled_token_ids=[[0]],
|
||||||
|
logprobs=None,
|
||||||
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=[])
|
||||||
|
|
||||||
scheduler.update_from_output(scheduler_output1,
|
scheduler.update_from_output(scheduler_output1,
|
||||||
model_runner_output)
|
model_runner_output)
|
||||||
@@ -579,19 +670,29 @@ class TestAscendScheduler(TestBase):
|
|||||||
req_id = requests[i].request_id
|
req_id = requests[i].request_id
|
||||||
self.assertEqual(output.num_scheduled_tokens[req_id], 1)
|
self.assertEqual(output.num_scheduled_tokens[req_id], 1)
|
||||||
self.assertNotIn(req_id, output.scheduled_spec_decode_tokens)
|
self.assertNotIn(req_id, output.scheduled_spec_decode_tokens)
|
||||||
|
if vllm_version_is("0.10.1.1"):
|
||||||
model_runner_output = ModelRunnerOutput(
|
model_runner_output = ModelRunnerOutput(
|
||||||
req_ids=req_ids,
|
req_ids=req_ids,
|
||||||
req_id_to_index=req_to_index,
|
req_id_to_index=req_to_index,
|
||||||
sampled_token_ids=[[0] for _ in range(len(requests))],
|
sampled_token_ids=[[0] for _ in range(len(requests))],
|
||||||
logprobs=None,
|
logprobs=None,
|
||||||
prompt_logprobs_dict={},
|
prompt_logprobs_dict={},
|
||||||
pooler_output=[])
|
spec_token_ids=spec_tokens,
|
||||||
draft_token_ids = DraftTokenIds(req_ids, spec_tokens)
|
pooler_output=[])
|
||||||
|
else:
|
||||||
|
model_runner_output = ModelRunnerOutput(
|
||||||
|
req_ids=req_ids,
|
||||||
|
req_id_to_index=req_to_index,
|
||||||
|
sampled_token_ids=[[0] for _ in range(len(requests))],
|
||||||
|
logprobs=None,
|
||||||
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=[])
|
||||||
|
draft_token_ids = DraftTokenIds(req_ids, spec_tokens)
|
||||||
|
|
||||||
engine_core_outputs = scheduler.update_from_output(
|
engine_core_outputs = scheduler.update_from_output(
|
||||||
output, model_runner_output)
|
output, model_runner_output)
|
||||||
scheduler.update_draft_token_ids(draft_token_ids)
|
if not vllm_version_is("0.10.1.1"):
|
||||||
|
scheduler.update_draft_token_ids(draft_token_ids)
|
||||||
|
|
||||||
for i in range(len(requests)):
|
for i in range(len(requests)):
|
||||||
running_req = scheduler.running[i]
|
running_req = scheduler.running[i]
|
||||||
@@ -627,14 +728,23 @@ class TestAscendScheduler(TestBase):
|
|||||||
else:
|
else:
|
||||||
self.assertNotIn(req_id,
|
self.assertNotIn(req_id,
|
||||||
output.scheduled_spec_decode_tokens)
|
output.scheduled_spec_decode_tokens)
|
||||||
|
if vllm_version_is("0.10.1.1"):
|
||||||
model_runner_output = ModelRunnerOutput(
|
model_runner_output = ModelRunnerOutput(
|
||||||
req_ids=req_ids,
|
req_ids=req_ids,
|
||||||
req_id_to_index=req_to_index,
|
req_id_to_index=req_to_index,
|
||||||
sampled_token_ids=output_tokens,
|
sampled_token_ids=output_tokens,
|
||||||
logprobs=None,
|
spec_token_ids=None,
|
||||||
prompt_logprobs_dict={},
|
logprobs=None,
|
||||||
pooler_output=[])
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=[])
|
||||||
|
else:
|
||||||
|
model_runner_output = ModelRunnerOutput(
|
||||||
|
req_ids=req_ids,
|
||||||
|
req_id_to_index=req_to_index,
|
||||||
|
sampled_token_ids=output_tokens,
|
||||||
|
logprobs=None,
|
||||||
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=[])
|
||||||
|
|
||||||
engine_core_outputs = scheduler.update_from_output(
|
engine_core_outputs = scheduler.update_from_output(
|
||||||
output, model_runner_output)
|
output, model_runner_output)
|
||||||
|
|||||||
@@ -200,12 +200,26 @@ def create_model_runner_output(
|
|||||||
kv_connector_output = KVConnectorOutput(finished_sending=finished_sending,
|
kv_connector_output = KVConnectorOutput(finished_sending=finished_sending,
|
||||||
finished_recving=finished_recving)
|
finished_recving=finished_recving)
|
||||||
extra_args = {"kv_connector_output": kv_connector_output}
|
extra_args = {"kv_connector_output": kv_connector_output}
|
||||||
return ModelRunnerOutput(
|
if vllm_version_is("0.10.1.1"):
|
||||||
req_ids=req_ids,
|
model_runner_output = ModelRunnerOutput(
|
||||||
req_id_to_index=req_id_to_index,
|
req_ids=req_ids,
|
||||||
sampled_token_ids=sampled_token_ids,
|
req_id_to_index=req_id_to_index,
|
||||||
logprobs=None,
|
sampled_token_ids=sampled_token_ids,
|
||||||
prompt_logprobs_dict={},
|
spec_token_ids=None,
|
||||||
pooler_output=[],
|
logprobs=None,
|
||||||
**extra_args,
|
prompt_logprobs_dict={},
|
||||||
)
|
pooler_output=[],
|
||||||
|
**extra_args,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
model_runner_output = ModelRunnerOutput(
|
||||||
|
req_ids=req_ids,
|
||||||
|
req_id_to_index=req_id_to_index,
|
||||||
|
sampled_token_ids=sampled_token_ids,
|
||||||
|
logprobs=None,
|
||||||
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=[],
|
||||||
|
**extra_args,
|
||||||
|
)
|
||||||
|
|
||||||
|
return model_runner_output
|
||||||
|
|||||||
@@ -31,6 +31,13 @@ from vllm.v1.outputs import ModelRunnerOutput
|
|||||||
from vllm.v1.request import Request, RequestStatus
|
from vllm.v1.request import Request, RequestStatus
|
||||||
from vllm.v1.structured_output import StructuredOutputManager
|
from vllm.v1.structured_output import StructuredOutputManager
|
||||||
|
|
||||||
|
from vllm_ascend.utils import vllm_version_is
|
||||||
|
|
||||||
|
if vllm_version_is("0.10.1.1"):
|
||||||
|
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
|
||||||
|
else:
|
||||||
|
KVCacheBlocks = None
|
||||||
|
|
||||||
|
|
||||||
class AscendScheduler(Scheduler):
|
class AscendScheduler(Scheduler):
|
||||||
"""This Scheduler extends vllm's original v1 scheduler
|
"""This Scheduler extends vllm's original v1 scheduler
|
||||||
@@ -59,7 +66,10 @@ class AscendScheduler(Scheduler):
|
|||||||
scheduled_running_reqs: list[Request] = []
|
scheduled_running_reqs: list[Request] = []
|
||||||
preempted_reqs: list[Request] = []
|
preempted_reqs: list[Request] = []
|
||||||
|
|
||||||
req_to_new_block_ids: dict[str, list[int]] = {}
|
if vllm_version_is("0.10.1.1"):
|
||||||
|
req_to_new_block_ids: dict[str, list[int]] = {}
|
||||||
|
else:
|
||||||
|
req_to_new_blocks: dict[str, KVCacheBlocks] = {}
|
||||||
num_scheduled_tokens: dict[str, int] = {}
|
num_scheduled_tokens: dict[str, int] = {}
|
||||||
token_budget = self.max_num_scheduled_tokens
|
token_budget = self.max_num_scheduled_tokens
|
||||||
# Spec decode-related.
|
# Spec decode-related.
|
||||||
@@ -217,8 +227,11 @@ class AscendScheduler(Scheduler):
|
|||||||
|
|
||||||
if self.lora_config and request.lora_request:
|
if self.lora_config and request.lora_request:
|
||||||
scheduled_loras.add(request.lora_request.lora_int_id)
|
scheduled_loras.add(request.lora_request.lora_int_id)
|
||||||
req_to_new_block_ids[request.request_id] = (
|
if vllm_version_is("0.10.1.1"):
|
||||||
self.kv_cache_manager.get_block_ids(request.request_id))
|
req_to_new_block_ids[request.request_id] = (
|
||||||
|
self.kv_cache_manager.get_block_ids(request.request_id))
|
||||||
|
else:
|
||||||
|
req_to_new_blocks[request.request_id] = new_blocks
|
||||||
# Update request info.
|
# Update request info.
|
||||||
num_scheduled_tokens[request.request_id] = num_new_tokens
|
num_scheduled_tokens[request.request_id] = num_new_tokens
|
||||||
token_budget -= num_new_tokens
|
token_budget -= num_new_tokens
|
||||||
@@ -307,8 +320,11 @@ class AscendScheduler(Scheduler):
|
|||||||
# Schedule the request.
|
# Schedule the request.
|
||||||
scheduled_running_reqs.append(request)
|
scheduled_running_reqs.append(request)
|
||||||
self.scheduled_req_ids.add(request.request_id)
|
self.scheduled_req_ids.add(request.request_id)
|
||||||
req_to_new_block_ids[request.request_id] = (
|
if vllm_version_is("0.10.1.1"):
|
||||||
new_blocks.get_block_ids())
|
req_to_new_block_ids[request.request_id] = (
|
||||||
|
new_blocks.get_block_ids())
|
||||||
|
else:
|
||||||
|
req_to_new_blocks[request.request_id] = new_blocks
|
||||||
num_scheduled_tokens[request.request_id] = num_new_tokens
|
num_scheduled_tokens[request.request_id] = num_new_tokens
|
||||||
token_budget -= num_new_tokens
|
token_budget -= num_new_tokens
|
||||||
req_index += 1
|
req_index += 1
|
||||||
@@ -346,16 +362,27 @@ class AscendScheduler(Scheduler):
|
|||||||
any_request, len(self.running)))
|
any_request, len(self.running)))
|
||||||
|
|
||||||
# Construct the scheduler output.
|
# Construct the scheduler output.
|
||||||
new_reqs_data = [
|
if vllm_version_is("0.10.1.1"):
|
||||||
NewRequestData.from_request(req,
|
new_reqs_data = [
|
||||||
req_to_new_block_ids[req.request_id])
|
NewRequestData.from_request(
|
||||||
for req in scheduled_new_reqs
|
req, req_to_new_block_ids[req.request_id])
|
||||||
]
|
for req in scheduled_new_reqs
|
||||||
|
]
|
||||||
|
cached_reqs_data = self._make_cached_request_data(
|
||||||
|
scheduled_running_reqs, scheduled_resumed_reqs,
|
||||||
|
num_scheduled_tokens, scheduled_spec_decode_tokens,
|
||||||
|
req_to_new_block_ids)
|
||||||
|
else:
|
||||||
|
new_reqs_data = [
|
||||||
|
NewRequestData.from_request(
|
||||||
|
req, req_to_new_blocks[req.request_id].get_block_ids())
|
||||||
|
for req in scheduled_new_reqs
|
||||||
|
]
|
||||||
|
|
||||||
cached_reqs_data = self._make_cached_request_data(
|
cached_reqs_data = self._make_cached_request_data(
|
||||||
scheduled_running_reqs, scheduled_resumed_reqs,
|
scheduled_running_reqs, scheduled_resumed_reqs,
|
||||||
num_scheduled_tokens, scheduled_spec_decode_tokens,
|
num_scheduled_tokens, scheduled_spec_decode_tokens,
|
||||||
req_to_new_block_ids)
|
req_to_new_blocks)
|
||||||
scheduled_cached_reqs = cached_reqs_data
|
scheduled_cached_reqs = cached_reqs_data
|
||||||
|
|
||||||
scheduler_output = SchedulerOutput(
|
scheduler_output = SchedulerOutput(
|
||||||
|
|||||||
@@ -50,6 +50,7 @@ from vllm.sequence import IntermediateTensors
|
|||||||
from vllm_ascend.ops.fused_moe import AscendFusedMoE
|
from vllm_ascend.ops.fused_moe import AscendFusedMoE
|
||||||
from vllm_ascend.ops.sequence_parallel import (MetadataForPadding,
|
from vllm_ascend.ops.sequence_parallel import (MetadataForPadding,
|
||||||
init_metadata_for_sp)
|
init_metadata_for_sp)
|
||||||
|
from vllm_ascend.utils import vllm_version_is
|
||||||
|
|
||||||
|
|
||||||
class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
|
class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
|
||||||
@@ -253,7 +254,11 @@ class CustomQwen3MoeModel(Qwen3MoeModel):
|
|||||||
quant_config = vllm_config.quant_config
|
quant_config = vllm_config.quant_config
|
||||||
|
|
||||||
parallel_config = vllm_config.parallel_config
|
parallel_config = vllm_config.parallel_config
|
||||||
self.num_redundant_experts = parallel_config.num_redundant_experts
|
if vllm_version_is("0.10.1.1"):
|
||||||
|
self.num_redundant_experts = parallel_config.num_redundant_experts
|
||||||
|
else:
|
||||||
|
eplb_config = parallel_config.eplb_config
|
||||||
|
self.num_redundant_experts = eplb_config.num_redundant_experts
|
||||||
self.padding_idx = config.pad_token_id
|
self.padding_idx = config.pad_token_id
|
||||||
self.vocab_size = config.vocab_size
|
self.vocab_size = config.vocab_size
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|||||||
@@ -3,12 +3,19 @@ import torch_npu
|
|||||||
from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample
|
from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample
|
||||||
from vllm.v1.sample.sampler import Sampler
|
from vllm.v1.sample.sampler import Sampler
|
||||||
|
|
||||||
from vllm_ascend.utils import is_310p
|
from vllm_ascend.utils import is_310p, vllm_version_is
|
||||||
|
|
||||||
|
if not vllm_version_is("0.10.1.1"):
|
||||||
|
from vllm.config import LogprobsMode
|
||||||
|
DEFAULT_LOGPROBS_MODE = LogprobsMode.RAW_LOGPROBS
|
||||||
|
else:
|
||||||
|
LogprobsMode = None
|
||||||
|
DEFAULT_LOGPROBS_MODE = "raw_logprobs"
|
||||||
|
|
||||||
|
|
||||||
class AscendSampler(Sampler):
|
class AscendSampler(Sampler):
|
||||||
|
|
||||||
def __init__(self, logprobs_mode="raw_logprobs"):
|
def __init__(self, logprobs_mode=DEFAULT_LOGPROBS_MODE):
|
||||||
# TODO: support logprobs_mode in vllm-ascend
|
# TODO: support logprobs_mode in vllm-ascend
|
||||||
super().__init__(logprobs_mode=logprobs_mode)
|
super().__init__(logprobs_mode=logprobs_mode)
|
||||||
self.topk_topp_sampler = AscendTopKTopPSampler()
|
self.topk_topp_sampler = AscendTopKTopPSampler()
|
||||||
@@ -61,5 +68,19 @@ class AscendTopKTopPSampler(TopKTopPSampler):
|
|||||||
def forward_native(self, logits, generators, k, p):
|
def forward_native(self, logits, generators, k, p):
|
||||||
"""Override pytorch native implementation to torch_npu"""
|
"""Override pytorch native implementation to torch_npu"""
|
||||||
logits = self._apply_top_k_top_p(logits, k, p)
|
logits = self._apply_top_k_top_p(logits, k, p)
|
||||||
|
if not vllm_version_is("0.10.1.1"):
|
||||||
|
|
||||||
|
logits_to_return = None
|
||||||
|
if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS:
|
||||||
|
logits_to_return = logits
|
||||||
|
elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS:
|
||||||
|
logits_to_return = logits.log_softmax(dim=-1,
|
||||||
|
dtype=torch.float32)
|
||||||
|
|
||||||
probs = logits.softmax(dim=-1, dtype=torch.float32)
|
probs = logits.softmax(dim=-1, dtype=torch.float32)
|
||||||
return random_sample(probs, generators)
|
output = None
|
||||||
|
if vllm_version_is("0.10.1.1"):
|
||||||
|
output = random_sample(probs, generators)
|
||||||
|
else:
|
||||||
|
output = (random_sample(probs, generators), logits_to_return)
|
||||||
|
return output
|
||||||
|
|||||||
@@ -64,8 +64,8 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
|
|||||||
from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
|
from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
|
||||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||||
KVCacheSpec)
|
KVCacheSpec)
|
||||||
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, DraftTokenIds,
|
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors,
|
||||||
LogprobsTensors, ModelRunnerOutput)
|
ModelRunnerOutput)
|
||||||
from vllm.v1.pool.metadata import PoolingMetadata
|
from vllm.v1.pool.metadata import PoolingMetadata
|
||||||
from vllm.v1.sample.logits_processor import build_logitsprocs
|
from vllm.v1.sample.logits_processor import build_logitsprocs
|
||||||
from vllm.v1.sample.metadata import SamplingMetadata
|
from vllm.v1.sample.metadata import SamplingMetadata
|
||||||
@@ -95,11 +95,17 @@ from vllm_ascend.torchair.torchair_attention import AscendTorchairMetadata
|
|||||||
from vllm_ascend.torchair.torchair_mla import AscendMLATorchairMetadata
|
from vllm_ascend.torchair.torchair_mla import AscendMLATorchairMetadata
|
||||||
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
|
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
|
||||||
ProfileExecuteDuration, is_310p,
|
ProfileExecuteDuration, is_310p,
|
||||||
maybe_converting_weight_acl_format)
|
maybe_converting_weight_acl_format,
|
||||||
|
vllm_version_is)
|
||||||
from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
|
from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer
|
||||||
from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
|
from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer
|
||||||
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
|
from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
|
||||||
|
|
||||||
|
if not vllm_version_is("0.10.1.1"):
|
||||||
|
from vllm.v1.outputs import DraftTokenIds
|
||||||
|
else:
|
||||||
|
DraftTokenIds = None
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
import xgrammar as xgr # type: ignore[import-untyped]
|
import xgrammar as xgr # type: ignore[import-untyped]
|
||||||
from vllm.v1.core.sched.output import SchedulerOutput
|
from vllm.v1.core.sched.output import SchedulerOutput
|
||||||
@@ -514,11 +520,13 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
|
|
||||||
# Update the block IDs.
|
# Update the block IDs.
|
||||||
if not resumed_from_preemption:
|
if not resumed_from_preemption:
|
||||||
# Append the new blocks to the existing block IDs.
|
if new_block_ids is not None:
|
||||||
for block_ids, new_ids in zip(req_state.block_ids,
|
# Append the new blocks to the existing block IDs.
|
||||||
new_block_ids):
|
for block_ids, new_ids in zip(req_state.block_ids,
|
||||||
block_ids.extend(new_ids)
|
new_block_ids):
|
||||||
|
block_ids.extend(new_ids)
|
||||||
else:
|
else:
|
||||||
|
assert new_block_ids is not None
|
||||||
# The request is resumed from preemption.
|
# The request is resumed from preemption.
|
||||||
# Replace the existing block IDs with the new ones.
|
# Replace the existing block IDs with the new ones.
|
||||||
req_state.block_ids = new_block_ids
|
req_state.block_ids = new_block_ids
|
||||||
@@ -534,7 +542,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
# Update the persistent batch.
|
# Update the persistent batch.
|
||||||
self.input_batch.num_computed_tokens_cpu[req_index] = (
|
self.input_batch.num_computed_tokens_cpu[req_index] = (
|
||||||
num_computed_tokens)
|
num_computed_tokens)
|
||||||
self.input_batch.block_table.append_row(new_block_ids, req_index)
|
if new_block_ids is not None:
|
||||||
|
self.input_batch.block_table.append_row(
|
||||||
|
new_block_ids, req_index)
|
||||||
|
|
||||||
# For the last rank, we don't need to update the token_ids_cpu
|
# For the last rank, we don't need to update the token_ids_cpu
|
||||||
# because the sampled tokens are already cached.
|
# because the sampled tokens are already cached.
|
||||||
@@ -1526,16 +1536,28 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
else:
|
else:
|
||||||
pooler_output.append(None)
|
pooler_output.append(None)
|
||||||
extra_args = ({"kv_connector_output": kv_connector_output})
|
extra_args = ({"kv_connector_output": kv_connector_output})
|
||||||
|
if vllm_version_is("0.10.1.1"):
|
||||||
return ModelRunnerOutput(
|
modelrunner_output = ModelRunnerOutput(
|
||||||
req_ids=self.input_batch.req_ids,
|
req_ids=self.input_batch.req_ids,
|
||||||
req_id_to_index=self.input_batch.req_id_to_index,
|
req_id_to_index=self.input_batch.req_id_to_index,
|
||||||
sampled_token_ids=[],
|
sampled_token_ids=[],
|
||||||
logprobs=None,
|
spec_token_ids=None,
|
||||||
prompt_logprobs_dict={},
|
logprobs=None,
|
||||||
pooler_output=pooler_output,
|
prompt_logprobs_dict={},
|
||||||
**extra_args,
|
pooler_output=pooler_output,
|
||||||
)
|
**extra_args,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
modelrunner_output = ModelRunnerOutput(
|
||||||
|
req_ids=self.input_batch.req_ids,
|
||||||
|
req_id_to_index=self.input_batch.req_id_to_index,
|
||||||
|
sampled_token_ids=[],
|
||||||
|
logprobs=None,
|
||||||
|
prompt_logprobs_dict={},
|
||||||
|
pooler_output=pooler_output,
|
||||||
|
**extra_args,
|
||||||
|
)
|
||||||
|
return modelrunner_output
|
||||||
|
|
||||||
@torch.inference_mode()
|
@torch.inference_mode()
|
||||||
def execute_model(
|
def execute_model(
|
||||||
@@ -1757,15 +1779,27 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
|
|
||||||
extra_args = ({"kv_connector_output": kv_connector_output})
|
extra_args = ({"kv_connector_output": kv_connector_output})
|
||||||
|
|
||||||
model_runner_output = ModelRunnerOutput(
|
if vllm_version_is("0.10.1.1"):
|
||||||
req_ids=self.input_batch.req_ids,
|
model_runner_output = ModelRunnerOutput(
|
||||||
req_id_to_index=self.input_batch.req_id_to_index,
|
req_ids=self.input_batch.req_ids,
|
||||||
sampled_token_ids=valid_sampled_token_ids,
|
req_id_to_index=self.input_batch.req_id_to_index,
|
||||||
logprobs=logprobs_lists,
|
sampled_token_ids=valid_sampled_token_ids,
|
||||||
prompt_logprobs_dict=prompt_logprobs_dict,
|
logprobs=logprobs_lists,
|
||||||
pooler_output=[],
|
spec_token_ids=self._draft_token_ids,
|
||||||
**extra_args,
|
prompt_logprobs_dict=prompt_logprobs_dict,
|
||||||
)
|
pooler_output=[],
|
||||||
|
**extra_args,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
model_runner_output = ModelRunnerOutput(
|
||||||
|
req_ids=self.input_batch.req_ids,
|
||||||
|
req_id_to_index=self.input_batch.req_id_to_index,
|
||||||
|
sampled_token_ids=valid_sampled_token_ids,
|
||||||
|
logprobs=logprobs_lists,
|
||||||
|
prompt_logprobs_dict=prompt_logprobs_dict,
|
||||||
|
pooler_output=[],
|
||||||
|
**extra_args,
|
||||||
|
)
|
||||||
|
|
||||||
durations = ProfileExecuteDuration().pop_captured_sync()
|
durations = ProfileExecuteDuration().pop_captured_sync()
|
||||||
if durations:
|
if durations:
|
||||||
|
|||||||
Reference in New Issue
Block a user