diff --git a/.github/doc_codespell.yaml b/.github/workflows/doc_codespell.yaml
similarity index 100%
rename from .github/doc_codespell.yaml
rename to .github/workflows/doc_codespell.yaml
diff --git a/.github/workflows/image_310p_openeuler.yml b/.github/workflows/image_310p_openeuler.yml
new file mode 100644
index 0000000..5b84a24
--- /dev/null
+++ b/.github/workflows/image_310p_openeuler.yml
@@ -0,0 +1,114 @@
+name: 'image / openEuler'
+# This is a docker build check and publish job:
+# 1. PR-triggered docker image build check
+#    - is for image build check
+#    - Enabled on main/*-dev branches
+#    - push: ${{ github.event_name != 'pull_request' }} ==> false
+# 2. branches push trigger image publish
+#    - is for branch/dev/nightly image
+#    - commits are merged into main/*-dev ==> vllm-ascend:main / vllm-ascend:*-dev
+# 3. tags push trigger image publish
+#    - is for final release image
+#    - Publish when tagged with v* (PEP 440 version) ===> vllm-ascend:v1.2.3-openeuler|latest / vllm-ascend:v1.2.3rc1-openeuler
+on:
+  pull_request:
+    branches:
+      - 'main'
+      - '*-dev'
+    paths:
+      - '.github/workflows/image_310p_openeuler.yml'
+      - 'Dockerfile.310p.openEuler'
+      - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
+  push:
+    # Publish image when tagging; the Dockerfile at the tag will be built as the tag image
+    branches:
+      - 'main'
+      - '*-dev'
+    tags:
+      - 'v*'
+    paths:
+      - '.github/workflows/image_310p_openeuler.yml'
+      - 'Dockerfile.310p.openEuler'
+      - 'vllm_ascend/**'
+
+jobs:
+  build:
+    name: vllm-ascend image build
+    runs-on: >-
+      ${{
+        github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
+        'ubuntu-latest' ||
+        'ubuntu-24.04-arm'
+      }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Print
+        run: |
+          lscpu
+
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          # TODO(yikun): add more hub image and a note on release policy for container image
+          images: |
+            quay.io/ascend/vllm-ascend
+          # Note for test case
+          # https://github.com/marketplace/actions/docker-metadata-action#typeref
+          # 1. branch job publishes per main/*-dev branch commit
+          # 2. main and dev pull_request is build-only, so the tag pr-N-openeuler is fine
+          # 3. only pep440-matched tags will be published:
+          #    - v0.7.1 --> v0.7.1-openeuler, latest
+          #    - pre/post/dev: v0.7.1rc1-openeuler/v0.7.1rc1-openeuler/v0.7.1rc1.dev1-openeuler/v0.7.1.post1-openeuler, no latest
+          #    which follows the vLLM rule, with prefix v
+          # TODO(yikun): the post release might be considered as latest release
+          tags: |
+            type=ref,event=branch,suffix=-310p-openeuler
+            type=ref,event=pr,suffix=-openeuler
+            type=pep440,pattern={{raw}},suffix=-310p-openeuler
+
+      - name: Free up disk space
+        uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
+        with:
+          tool-cache: true
+          docker-images: false
+
+      - name: Build - Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Build - Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Publish - Login to Quay Container Registry
+        if: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
+        uses: docker/login-action@v3
+        with:
+          registry: quay.io
+          username: ${{ vars.QUAY_USERNAME }}
+          password: ${{ secrets.QUAY_PASSWORD }}
+
+      - name: Build and push 310p
+        uses: docker/build-push-action@v6
+        with:
+          platforms: >-
+            ${{
+              github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
+              'linux/amd64,linux/arm64' ||
+              'linux/arm64'
+            }}
+          # use the current repo path as the build context, ensure .git is contained
+          context: .
+          # only trigger when tag, branch/main push
+          push: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
+          labels: ${{ steps.meta.outputs.labels }}
+          tags: ${{ steps.meta.outputs.tags }}
+          file: Dockerfile.310p.openEuler
+          build-args: |
+            PIP_INDEX_URL=https://pypi.org/simple
diff --git a/.github/workflows/image_310p_ubuntu.yml b/.github/workflows/image_310p_ubuntu.yml
new file mode 100644
index 0000000..02a59fb
--- /dev/null
+++ b/.github/workflows/image_310p_ubuntu.yml
@@ -0,0 +1,110 @@
+name: 'image / Ubuntu'
+# This is a docker build check and publish job:
+# 1. PR-triggered docker image build check
+#    - is for image build check
+#    - Enabled on main/*-dev branches
+#    - push: ${{ github.event_name != 'pull_request' }} ==> false
+# 2. branches push trigger image publish
+#    - is for branch/dev/nightly image
+#    - commits are merged into main/*-dev ==> vllm-ascend:main / vllm-ascend:*-dev
+# 3. tags push trigger image publish
+#    - is for final release image
+#    - Publish when tagged with v* (PEP 440 version) ===> vllm-ascend:v1.2.3|latest / vllm-ascend:v1.2.3rc1
+on:
+  pull_request:
+    branches:
+      - 'main'
+      - '*-dev'
+    paths:
+      - '.github/workflows/image_310p_ubuntu.yml'
+      - 'Dockerfile.310p'
+      - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
+  push:
+    # Publish image when tagging; the Dockerfile at the tag will be built as the tag image
+    branches:
+      - 'main'
+      - '*-dev'
+    tags:
+      - 'v*'
+    paths:
+      - '.github/workflows/image_310p_ubuntu.yml'
+      - 'Dockerfile.310p'
+      - 'vllm_ascend/**'
+jobs:
+
+  build:
+    name: vllm-ascend image build
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Print
+        run: |
+          lscpu
+
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          # TODO(yikun): add more hub image and a note on release policy for container image
+          images: |
+            quay.io/ascend/vllm-ascend
+          # Note for test case
+          # https://github.com/marketplace/actions/docker-metadata-action#typeref
+          # 1. branch job publishes per main/*-dev branch commit
+          # 2.
main and dev pull_request is build only, so the tag pr-N is fine + # 3. only pep440 matched tag will be published: + # - v0.7.1 --> v0.7.1, latest + # - pre/post/dev: v0.7.1rc1/v0.7.1rc1/v0.7.1rc1.dev1/v0.7.1.post1, no latest + # which follow the rule from vLLM with prefix v + # TODO(yikun): the post release might be considered as latest release + tags: | + type=ref,event=branch,suffix=-310p + type=ref,event=pr,suffix=-310p + type=pep440,pattern={{raw}},suffix=-310p + + - name: Free up disk space + uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 + with: + tool-cache: true + docker-images: false + + - name: Build - Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Build - Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Publish - Login to Quay Container Registry + if: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }} + uses: docker/login-action@v3 + with: + registry: quay.io + username: ${{ vars.QUAY_USERNAME }} + password: ${{ secrets.QUAY_PASSWORD }} + + - name: Build and push 310p + uses: docker/build-push-action@v6 + with: + platforms: >- + ${{ + github.event_name == 'push' && github.repository_owner == 'vllm-project' && + 'linux/amd64,linux/arm64' || + 'linux/amd64' + }} + # use the current repo path as the build context, ensure .git is contained + context: . + file: Dockerfile.310p + # only trigger when tag, branch/main push + push: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }} + labels: ${{ steps.meta.outputs.labels }} + tags: ${{ steps.meta.outputs.tags }} + build-args: | + PIP_INDEX_URL=https://pypi.org/simple diff --git a/.github/workflows/image_openeuler.yml b/.github/workflows/image_openeuler.yml index 690d814..c954e56 100644 --- a/.github/workflows/image_openeuler.yml +++ b/.github/workflows/image_openeuler.yml @@ -94,7 +94,7 @@ jobs: username: ${{ vars.QUAY_USERNAME }} password: ${{ secrets.QUAY_PASSWORD }} - - name: Build and push + - name: Build and push 910b uses: docker/build-push-action@v6 with: platforms: >- diff --git a/.github/workflows/image_ubuntu.yml b/.github/workflows/image_ubuntu.yml index a2cfbce..69fe385 100644 --- a/.github/workflows/image_ubuntu.yml +++ b/.github/workflows/image_ubuntu.yml @@ -90,7 +90,7 @@ jobs: username: ${{ vars.QUAY_USERNAME }} password: ${{ secrets.QUAY_PASSWORD }} - - name: Build and push + - name: Build and push 910b uses: docker/build-push-action@v6 with: platforms: >- @@ -101,6 +101,7 @@ jobs: }} # use the current repo path as the build context, ensure .git is contained context: . + file: Dockerfile # only trigger when tag, branch/main push push: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }} labels: ${{ steps.meta.outputs.labels }} diff --git a/Dockerfile.310p b/Dockerfile.310p new file mode 100644 index 0000000..fffe73e --- /dev/null +++ b/Dockerfile.310p @@ -0,0 +1,61 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +FROM quay.io/ascend/cann:8.1.rc1-310p-ubuntu22.04-py3.10 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" +ARG COMPILE_CUSTOM_KERNELS=1 + +# Define environments +ENV DEBIAN_FRONTEND=noninteractive +ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} + +RUN apt-get update -y && \ + apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \ + rm -rf /var/cache/apt/* && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /workspace + +COPY . /vllm-workspace/vllm-ascend/ + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +# Install vLLM +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +ARG VLLM_TAG=v0.9.1 +RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. +RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton && \ + python3 -m pip cache purge + +# Install vllm-ascend +# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH +RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ + source /usr/local/Ascend/nnal/atb/set_env.sh && \ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ + export SOC_VERSION=ASCEND310P3 && \ + python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip cache purge + +# Install modelscope (for fast download) and ray (for multinode) +RUN python3 -m pip install modelscope ray && \ + python3 -m pip cache purge + +CMD ["/bin/bash"] diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler new file mode 100644 index 0000000..da4718c --- /dev/null +++ b/Dockerfile.310p.openEuler @@ -0,0 +1,58 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +FROM quay.io/ascend/cann:8.1.rc1-310p-openeuler22.03-py3.10 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" +ARG COMPILE_CUSTOM_KERNELS=1 + +ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} + +RUN yum update -y && \ + yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \ + rm -rf /var/cache/yum + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +WORKDIR /workspace + +COPY . /vllm-workspace/vllm-ascend/ + +# Install vLLM +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +ARG VLLM_TAG=v0.9.1 + +RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +# In x86, triton will be installed by vllm. 
But in Ascend, triton doesn't work correctly. we need to uninstall it. +RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton && \ + python3 -m pip cache purge + +# Install vllm-ascend +RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ + source /usr/local/Ascend/nnal/atb/set_env.sh && \ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ + export SOC_VERSION=ASCEND310P3 && \ + python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip cache purge + +# Install modelscope (for fast download) and ray (for multinode) +RUN python3 -m pip install modelscope ray && \ + python3 -m pip cache purge + +CMD ["/bin/bash"] diff --git a/csrc/kernels/get_masked_input_and_mask_kernel.cpp b/csrc/kernels/get_masked_input_and_mask_kernel.cpp index 47ce826..25aeb60 100644 --- a/csrc/kernels/get_masked_input_and_mask_kernel.cpp +++ b/csrc/kernels/get_masked_input_and_mask_kernel.cpp @@ -54,6 +54,7 @@ public: pipe.InitBuffer(maskQueue, 1, size_ * sizeof(bool)); // Initialize calculation buffers + // NOTE: calc_buf_1 and calc_buf_2 are also used for int16 casting on older archs. pipe.InitBuffer(calc_buf_1, size_ * sizeof(float)); pipe.InitBuffer(calc_buf_2, size_ * sizeof(float)); @@ -66,7 +67,7 @@ public: // Initialize temporary buffers pipe.InitBuffer(start_buf, size_ * sizeof(float)); pipe.InitBuffer(end_buf, size_ * sizeof(float)); - pipe.InitBuffer(inputFloat_buf, size_ * sizeof(float)); + pipe.InitBuffer(inputFloat_buf, size_ * sizeof(float)); // Also used for half intermediate in casting pipe.InitBuffer(validOffset_buf, size_ * sizeof(float)); pipe.InitBuffer(vocabMask_buf_, size_ * sizeof(int8_t)); pipe.InitBuffer(ones_buf_, size_ * sizeof(float)); @@ -121,7 +122,6 @@ private: const float start_value, const float end_value) { - // Use already initialized buffers AscendC::LocalTensor start_value_tensor = start_buf.Get(); AscendC::LocalTensor end_value_tensor = end_buf.Get(); @@ -134,7 +134,35 @@ private: CompareWithValue(ge_result, start_value_tensor, input, true); CompareWithValue(lt_result, input, end_value_tensor, false); +#if (__CCE_AICORE__ >= 220) AscendC::And(range_mask, ge_result, lt_result, size_); +#else + { + // WORKAROUND for older arch + // No direct int8->int16 cast. Use half as intermediate. + // No direct int8 And. Use int16 And. + AscendC::LocalTensor ge_result_i16 = calc_buf_1.Get(); + AscendC::LocalTensor lt_result_i16 = calc_buf_2.Get(); + AscendC::LocalTensor range_mask_i16 = ge_result_i16; + + // Use a temporary buffer for half type + AscendC::LocalTensor tmp_half = inputFloat_buf.Get(); + + // 1. Cast inputs: int8_t -> half -> int16_t + AscendC::Cast(tmp_half, ge_result, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(ge_result_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_); + + AscendC::Cast(tmp_half, lt_result, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(lt_result_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_); + + // 2. Perform And on int16_t tensors + AscendC::And(range_mask_i16, ge_result_i16, lt_result_i16, size_); + + // 3. 
Cast result back: int16_t -> half -> int8_t + AscendC::Cast(tmp_half, range_mask_i16, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(range_mask, tmp_half, AscendC::RoundMode::CAST_NONE, size_); + } +#endif } __aicore__ inline void Compute() { @@ -145,24 +173,18 @@ private: AscendC::LocalTensor inputFloat = inputFloat_buf.Get(); AscendC::Cast(inputFloat, inputLocal, AscendC::RoundMode::CAST_NONE, size_); - // Calculate mask for org_vocab range - // org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index) AscendC::LocalTensor orgVocabMask = result_org_mask_que.AllocTensor(); ComputeRangeMask(orgVocabMask, inputFloat, static_cast(org_vocab_start_index_), static_cast(org_vocab_end_index_)); - // Calculate mask for added_vocab range - // added_vocab_mask = (input_ >= added_vocab_start_index) & (input_ < added_vocab_end_index) AscendC::LocalTensor addedVocabMask = result_add_mask_que.AllocTensor(); ComputeRangeMask(addedVocabMask, inputFloat, static_cast(added_vocab_start_index_), static_cast(added_vocab_end_index_)); - // Calculate validOffset - // valid_offset = (org_vocab_start_index * org_vocab_mask) + (added_offset * added_vocab_mask) AscendC::LocalTensor validOffset = validOffset_buf.Get(); AscendC::LocalTensor constOrgStartIndex = start_buf.Get(); @@ -173,10 +195,7 @@ private: AscendC::Cast(orgVocabMask_fp16, orgVocabMask, AscendC::RoundMode::CAST_NONE, size_); AscendC::Cast(orgVocabMask_fp32, orgVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_); - AscendC::Mul(validOffset, - constOrgStartIndex, - orgVocabMask_fp32, - size_); + AscendC::Mul(validOffset, constOrgStartIndex, orgVocabMask_fp32, size_); AscendC::LocalTensor addedOffset; AscendC::LocalTensor addedOffsetTensor = end_buf.Get(); @@ -187,44 +206,61 @@ private: AscendC::Cast(addedVocabMask_fp16, addedVocabMask, AscendC::RoundMode::CAST_NONE, size_); AscendC::Cast(addedVocabMask_fp32, addedVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_); - AscendC::Mul(addedOffset, - addedOffsetTensor, - addedVocabMask_fp32, - size_); - + AscendC::Mul(addedOffset, addedOffsetTensor, addedVocabMask_fp32, size_); AscendC::Add(validOffset, validOffset, addedOffset, size_); - // vocab_mask = org_vocab_mask | added_vocab_mask AscendC::LocalTensor vocabMask = vocabMask_buf_.Get(); - + +#if (__CCE_AICORE__ >= 220) AscendC::Or(vocabMask, orgVocabMask, addedVocabMask, size_); - +#else + { + // WORKAROUND for older arch + // No direct int8->int16 cast. Use half as intermediate. + // No direct int8 Or. Use int16 Or. + AscendC::LocalTensor orgVocabMask_i16 = calc_buf_1.Get(); + AscendC::LocalTensor addedVocabMask_i16 = calc_buf_2.Get(); + AscendC::LocalTensor vocabMask_i16 = orgVocabMask_i16; + + // Use a temporary buffer for half type. inputFloat_buf is free now. + AscendC::LocalTensor tmp_half = inputFloat_buf.Get(); + + // 1. Cast inputs: int8_t -> half -> int16_t + AscendC::Cast(tmp_half, orgVocabMask, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(orgVocabMask_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_); + + AscendC::Cast(tmp_half, addedVocabMask, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(addedVocabMask_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_); + + // 2. Perform Or on int16_t tensors + AscendC::Or(vocabMask_i16, orgVocabMask_i16, addedVocabMask_i16, size_); + + // 3. 
Cast result back: int16_t -> half -> int8_t + AscendC::Cast(tmp_half, vocabMask_i16, AscendC::RoundMode::CAST_NONE, size_); + AscendC::Cast(vocabMask, tmp_half, AscendC::RoundMode::CAST_NONE, size_); + } +#endif + AscendC::Sub(inputFloat, inputFloat, validOffset, size_); - // input_ = vocab_mask * (input_ - valid_offset) AscendC::LocalTensor vocabMask_fp16; AscendC::LocalTensor vocabMask_fp32; AscendC::Cast(vocabMask_fp16, vocabMask, AscendC::RoundMode::CAST_NONE, size_); AscendC::Cast(vocabMask_fp32, vocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_); - AscendC::LocalTensor inputFloat_fp32; AscendC::Mul(inputFloat, inputFloat, vocabMask_fp32, size_); AscendC::Cast(maskedLocal, inputFloat, AscendC::RoundMode::CAST_CEIL, size_); outQueue.EnQue(maskedLocal); - // ~vocab_mask AscendC::LocalTensor ones_tensor = ones_buf_.Get(); AscendC::Duplicate(ones_tensor, (float)1, size_); AscendC::LocalTensor maskLocal_fp32; - AscendC::Sub(maskLocal_fp32, - ones_tensor, - vocabMask_fp32, - size_); + AscendC::Sub(maskLocal_fp32, ones_tensor, vocabMask_fp32, size_); AscendC::LocalTensor maskLocal_fp16; AscendC::Cast(maskLocal_fp16, maskLocal_fp32, AscendC::RoundMode::CAST_NONE, size_); @@ -262,8 +298,6 @@ private: // Temporary buffers AscendC::TBuf start_buf; AscendC::TBuf end_buf; - - // Temporary buffers continued AscendC::TBuf inputFloat_buf; AscendC::TBuf validOffset_buf; AscendC::TBuf vocabMask_buf_; @@ -342,4 +376,3 @@ void get_masked_input_and_mask_impl( } } // namespace vllm_ascend - diff --git a/csrc/kernels/pos_encoding_kernels.cpp b/csrc/kernels/pos_encoding_kernels.cpp index 0b77ce8..57af050 100644 --- a/csrc/kernels/pos_encoding_kernels.cpp +++ b/csrc/kernels/pos_encoding_kernels.cpp @@ -30,7 +30,11 @@ using vllm_ascend::local_mem_copy; template class RotaryEmbedding { // NOTE(ganyi): we use 512B as load stride for pipe, need to find another way to // retrieve this size from runtime for more Soc support - static int constexpr loadSize = 512; + #if (__CCE_AICORE__ >= 220) + static int constexpr loadSize = 512; + #else + static int constexpr loadSize = 1024 * 4; + #endif using dst_t = scalar_t; using acc_t = typename AccType::type; // only half tensor have cast instruct to int8, hardcode acc_dst_t as half @@ -326,7 +330,9 @@ private: // Declare all the kernel entry here ROPE_CUSTOM_KERNEL_DECLARE(half) -ROPE_CUSTOM_KERNEL_DECLARE(bfloat16_t) +#if (__CCE_AICORE__ >= 220) + ROPE_CUSTOM_KERNEL_DECLARE(bfloat16_t) +#endif namespace vllm_ascend { @@ -342,7 +348,7 @@ namespace vllm_ascend { reinterpret_cast(cosSinCache), rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, \ numHeads, numKvHeads, headSize, numTokens, loopCnt, blockDim); -// maximum number for runtime to launch a ascendc kernel. +// maximum number for runtime to launch a ascendc kernel. // we use this to constrain the maximum number of block size static const int64_t maxParallelSize = 65535; @@ -357,9 +363,13 @@ extern void rotary_embedding_impl(AscendType type, bool isNeox, void *stream, in int blockDim = maxParallelSize > numTokens ? 
numTokens : maxParallelSize; if (type == AscendType::FP16) { ROTARY_EMBEDDING_KERNEL_CALL(half); - } else if (type == AscendType::BF16) { + } + #if (__CCE_AICORE__ >= 220) + else if (type == AscendType::BF16) { ROTARY_EMBEDDING_KERNEL_CALL(bfloat16_t); - } else { + } + #endif + else { return; } } diff --git a/csrc/kernels/utils.h b/csrc/kernels/utils.h index 8b8cf21..c2d4261 100644 --- a/csrc/kernels/utils.h +++ b/csrc/kernels/utils.h @@ -20,9 +20,11 @@ namespace vllm_ascend { template struct AccType; +#if (__CCE_AICORE__ >= 220) template <> struct AccType { - using type = float; + using type = float; }; +#endif template <> struct AccType { using type = half; diff --git a/format.sh b/format.sh index e49ac91..2a00ce3 100755 --- a/format.sh +++ b/format.sh @@ -273,7 +273,7 @@ echo 'vllm-ascend isort: Done' # Clang-format section # Exclude some files for formatting because they are vendored CLANG_FORMAT_EXCLUDES=( - 'csrc/kernels/pos_encoding_kernels.cpp' 'csrc/kernels/advance_step.cpp' 'csrc/kernels/get_masked_input_and_mask_kernel.cpp' 'csrc/torch_binding.cpp' 'csrc/ops.h' + 'csrc/kernels/utils.h' 'csrc/kernels/pos_encoding_kernels.cpp' 'csrc/kernels/advance_step.cpp' 'csrc/kernels/get_masked_input_and_mask_kernel.cpp' 'csrc/torch_binding.cpp' 'csrc/ops.h' ) # Format specified files with clang-format diff --git a/vllm_ascend/attention/attention.py b/vllm_ascend/attention/attention.py index 9c112ed..4b545a1 100644 --- a/vllm_ascend/attention/attention.py +++ b/vllm_ascend/attention/attention.py @@ -36,7 +36,8 @@ from vllm.utils import async_tensor_h2d, make_tensor_with_pad from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ops.cache import concat_and_cache_mla -from vllm_ascend.utils import enable_custom_op +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, + enable_custom_op, is_310p, nd_to_nz_2d) from vllm_ascend.worker.model_runner import ( ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata) @@ -170,7 +171,11 @@ class AscendAttentionBackend(AttentionBackend): num_kv_heads: int, head_size: int, ) -> Tuple[int, ...]: - return (2, num_blocks, block_size, num_kv_heads, head_size) + if is_310p(): + return (2, num_blocks, num_kv_heads * head_size // 16, block_size, + 16) + else: + return (2, num_blocks, block_size, num_kv_heads, head_size) @staticmethod def swap_blocks( @@ -654,6 +659,11 @@ class AscendMetadataBuilder(CommonMetadataBuilder[AscendMetadata]): # normal mask self.attn_mask = AscendMetadataBuilder._attn_mask_builder.get_attn_mask( # type: ignore max_prefill_seq_len, dtype, device) + if is_310p(): + mask_nz = nd_to_nz_2d(self.attn_mask) + mask_nz = torch_npu.npu_format_cast( + mask_nz.contiguous(), ACL_FORMAT_FRACTAL_NZ) + self.attn_mask = mask_nz elif self.num_decode_tokens == 0 and not self.input_builder.chunked_prefill_enabled: # compress mask for prefix cache self.compress_mask = AscendMetadataBuilder._attn_mask_builder.get_attn_mask( # type: ignore @@ -868,6 +878,18 @@ class AscendAttentionBackendImpl(AttentionImpl): self.seq_lens_tensor_cpu = torch.from_numpy( np.array(attn_metadata.prefill_metadata.seq_lens). 
astype(np.int32)) + if is_310p(): + # align q k v output tensors + query = aligned_16(query) + key = aligned_16(key) + value = aligned_16(value) + output = aligned_16(output) + + # do reformat in case of broadcasted tensors + mask = mask.repeat( + self.seq_lens_tensor_cpu.size(0), 1, 1, 1) + mask = torch_npu.npu_format_cast( + mask.contiguous(), ACL_FORMAT_FRACTAL_NZ) torch_npu._npu_flash_attention( query=query, key=key, @@ -878,6 +900,7 @@ class AscendAttentionBackendImpl(AttentionImpl): num_heads=self.num_heads, num_kv_heads=self.num_kv_heads, out=output) + output = output[:num_tokens, :, :] # Prefix cache only and cache hit elif attn_metadata.num_decode_tokens == 0 and not attn_metadata.chunked_prefill_enabled: assert kv_cache is not None @@ -935,6 +958,10 @@ class AscendAttentionBackendImpl(AttentionImpl): self.seq_lens_tensor_cpu = torch.from_numpy( np.array(attn_metadata.decode_metadata.seq_lens).astype( np.int32)) + if is_310p(): + # # seq_lens_tensor needs to be transferred to the device for 310P + self.seq_lens_tensor_cpu = self.seq_lens_tensor_cpu.to( + device=self.key_cache.device) block_tables = attn_metadata.decode_metadata.block_tables torch_npu._npu_paged_attention( query=query, diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index e6a2376..0aac026 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -30,6 +30,8 @@ from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.worker.gpu_input_batch import InputBatch from vllm_ascend.ops.attention import vanilla_chunked_prefill +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p, + nd_to_nz_2d, nd_to_nz_spec) class AscendAttentionBackend(AttentionBackend): @@ -62,6 +64,9 @@ class AscendAttentionBackend(AttentionBackend): num_kv_heads: int, head_size: int, ) -> Tuple[int, ...]: + if is_310p(): + return (2, num_blocks, num_kv_heads * head_size // 16, block_size, + 16) return (2, num_blocks, block_size, num_kv_heads, head_size) @staticmethod @@ -167,6 +172,16 @@ class AscendAttentionMetadataBuilder: query_start_loc = query_start_loc_cpu.to(self.runner.device, non_blocking=True) + if is_310p(): + if attn_state == AscendAttentionState.PrefillNoCache: + mask_nz = nd_to_nz_2d(attn_mask) + attn_mask = torch_npu.npu_format_cast(mask_nz.contiguous(), + ACL_FORMAT_FRACTAL_NZ) + elif attn_state == AscendAttentionState.ChunkedPrefill: + mask_nz = nd_to_nz_spec(attn_mask) + attn_mask = torch_npu.npu_format_cast(mask_nz.contiguous(), + ACL_FORMAT_FRACTAL_NZ) + attn_metadata = AscendMetadata( num_actual_tokens=num_actual_tokens, block_tables=block_table, @@ -250,6 +265,7 @@ class AscendAttentionBackendImpl(AttentionImpl): self.head_size, dtype=query.dtype, device=query.device) + ori_output = output if trace_flag: torch.ops.vllm.unified_ascend_attention_with_output( query=query, @@ -294,6 +310,18 @@ class AscendAttentionBackendImpl(AttentionImpl): assert attn_metadata is not None assert attn_metadata.attn_mask is not None mask = attn_metadata.attn_mask + if is_310p(): + # align q k v output tensors + query = aligned_16(query) + key = aligned_16(key) + value = aligned_16(value) + output = aligned_16(output) + + # do reformat in case of broadcasted tensors + mask = mask.repeat(attn_metadata.seq_lens.size(0), 1, 1, 1) + mask = torch_npu.npu_format_cast(mask.contiguous(), + ACL_FORMAT_FRACTAL_NZ) + torch_npu._npu_flash_attention(query=query, key=key, value=value, @@ -303,6 +331,7 @@ class AscendAttentionBackendImpl(AttentionImpl): 
num_heads=self.num_heads, num_kv_heads=self.num_kv_heads, out=output) + output = output[:num_tokens, :, :] elif attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit: assert attn_metadata is not None assert attn_metadata.attn_mask is not None @@ -320,6 +349,10 @@ class AscendAttentionBackendImpl(AttentionImpl): scale_value=self.scale, out=output) elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly: + if is_310p(): + # # seq_lens_tensor needs to be transferred to the device for 310P + attn_metadata.seq_lens = \ + attn_metadata.seq_lens.to(device=query.device) torch_npu._npu_paged_attention( query=query, key_cache=self.key_cache, @@ -353,6 +386,14 @@ class AscendAttentionBackendImpl(AttentionImpl): self.scale, None, True) else: # use paged attention + assert attn_metadata is not None + assert attn_metadata.attn_mask is not None + if is_310p(): + # do reformat in case of broadcasted tensors + attn_metadata.attn_mask = \ + torch_npu.npu_format_cast(attn_metadata.attn_mask.contiguous(), ACL_FORMAT_FRACTAL_NZ) + attn_metadata.seq_lens = \ + attn_metadata.seq_lens.to(device=query.device) torch_npu._npu_paged_attention_splitfuse( query=query, key_cache=self.key_cache, @@ -365,6 +406,9 @@ class AscendAttentionBackendImpl(AttentionImpl): num_heads=self.num_heads, scale_value=self.scale, out=output) + + # to make in-place change to the output tensor + ori_output[:, :, :] = output[:num_tokens, :, :] return output.view(num_tokens, self.hidden_size) diff --git a/vllm_ascend/ops/activation.py b/vllm_ascend/ops/activation.py index 13541ee..1c32643 100644 --- a/vllm_ascend/ops/activation.py +++ b/vllm_ascend/ops/activation.py @@ -18,11 +18,16 @@ import torch from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul +from vllm_ascend.utils import is_310p + def silu_and_mul_forward_oot(self, x: torch.Tensor) -> torch.Tensor: import torch_npu - out = torch_npu.npu_swiglu(x) + if is_310p(): + out = torch_npu.npu_swiglu(x.to(torch.float32)).to(torch.float16) + else: + out = torch_npu.npu_swiglu(x) return out diff --git a/vllm_ascend/ops/common_fused_moe.py b/vllm_ascend/ops/common_fused_moe.py index 285b728..3c84f23 100644 --- a/vllm_ascend/ops/common_fused_moe.py +++ b/vllm_ascend/ops/common_fused_moe.py @@ -21,7 +21,9 @@ import torch from vllm.model_executor.layers.fused_moe.layer import \ UnquantizedFusedMoEMethod -from vllm_ascend.ops.fused_moe import fused_experts, select_experts +from vllm_ascend.ops.fused_moe import (fused_experts, fused_experts_310p, + select_experts) +from vllm_ascend.utils import is_310p def forward_oot( @@ -56,6 +58,19 @@ def forward_oot( e_score_correction_bias=e_score_correction_bias, ) + if is_310p(): + assert global_num_experts is not None + return fused_experts_310p( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=top_k, + global_num_experts=global_num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input) + return fused_experts( hidden_states=x, w1=layer.w13_weight, diff --git a/vllm_ascend/ops/fused_moe.py b/vllm_ascend/ops/fused_moe.py index c282f7e..c1c865b 100644 --- a/vllm_ascend/ops/fused_moe.py +++ b/vllm_ascend/ops/fused_moe.py @@ -549,6 +549,95 @@ def fused_experts_with_all2all_buffer( return final_hidden_states +# Currently, fused_experts on 310p only supports PanguProMoE. 
+def fused_experts_310p( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + global_num_experts: int, + expert_map: torch.Tensor = None, + apply_router_weight_on_input: bool = False, +) -> torch.Tensor: + """ + + Args: + hidden_states: Hidden states of shape (num_tokens, hidden_size). + w1: Expert weights1 of shape (num_experts, intermediate_size * 2, hidden_size). + w2: Expert weights2 of shape (num_experts, hidden_size, intermediate_size). + topk_weights: Routing weights of shape (num_tokens, top_k). + topk_ids: Selected expert IDs of shape (num_tokens, top_k). + top_k: Number of experts to select. + expert_map: Expert mapping of shape (num_experts,). + + Returns: + hidden_states: Hidden states after routing. + """ + ep_size = get_ep_group().world_size + local_num_experts = global_num_experts // ep_size + local_num_group = top_k // ep_size + + if apply_router_weight_on_input: + assert (topk_weights.dim() == 2 + ), "`topk_weights` should be in shape (num_tokens, topk)" + _, topk = topk_weights.shape + assert ( + topk == 1 + ), "Only support topk=1 when `apply_router_weight_on_input` is True" + hidden_states = hidden_states * topk_weights.to(hidden_states.dtype) + + bsz, _ = hidden_states.shape + flatten_topk_ids = topk_ids.view(-1) + sorted_topk_ids = torch.argsort(flatten_topk_ids.float()) + sorted_topk_ids = sorted_topk_ids.to(torch.int32) + sorted_hidden_states = hidden_states.index_select( + 0, sorted_topk_ids // local_num_group) + + experts_id = torch.arange(0, + local_num_experts, + dtype=topk_ids.dtype, + device=topk_ids.device) + num_tokens_per_expert = (flatten_topk_ids.unsqueeze(-1) == experts_id).to( + torch.float32).sum(0) + topk_scales = topk_weights.view(-1).index_select( + 0, sorted_topk_ids).unsqueeze(-1) + group_list = num_tokens_per_expert.cumsum(dim=0).to(torch.int64) + + w1 = w1.transpose(1, 2) + gate_up_out = torch_npu.npu_grouped_matmul( + x=[sorted_hidden_states], + weight=[w1], + split_item=2, + group_list_type=0, + group_type=0, + group_list=group_list, + )[0] + + gate_up_out = torch_npu.npu_swiglu(gate_up_out.to(torch.float32)).to( + torch.float16) + gate_up_out *= topk_scales + + w2 = w2.transpose(1, 2) + down_out_list = torch_npu.npu_grouped_matmul( + x=[gate_up_out], + weight=[w2], + split_item=2, + group_list_type=0, + group_type=0, + group_list=group_list, + )[0] + + unsorted_topk_ids = torch.argsort(sorted_topk_ids.float()).to( + torch.int32) + torch.Tensor([0]).to(torch.int32).npu() + unsorted_hidden_states = down_out_list.index_select(0, unsorted_topk_ids) + final_hidden_states = unsorted_hidden_states.reshape( + bsz, top_k // ep_size, -1).sum(1) + + return final_hidden_states + + def fused_experts( hidden_states: torch.Tensor, w1: torch.Tensor, diff --git a/vllm_ascend/ops/layernorm.py b/vllm_ascend/ops/layernorm.py index 8ff4c55..7b839fe 100644 --- a/vllm_ascend/ops/layernorm.py +++ b/vllm_ascend/ops/layernorm.py @@ -20,6 +20,8 @@ from typing import Optional, Tuple, Union import torch from vllm.model_executor.layers.layernorm import RMSNorm +from vllm_ascend.utils import is_310p + def forward_oot( self, @@ -29,8 +31,15 @@ def forward_oot( import torch_npu if residual is not None: - x, _, residual = torch_npu.npu_add_rms_norm(x, residual, self.weight, - self.variance_epsilon) + if is_310p(): + orig_dtype = residual.dtype + x = x + residual.to(x.dtype) + residual = x.to(orig_dtype) + x, _ = torch_npu.npu_rms_norm(x, self.weight, + self.variance_epsilon) + else: + x, 
_, residual = torch_npu.npu_add_rms_norm( + x, residual, self.weight, self.variance_epsilon) return x, residual x, residual = torch_npu.npu_rms_norm(x, self.weight, self.variance_epsilon) diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py index 39a4c1c..2722679 100644 --- a/vllm_ascend/ops/rotary_embedding.py +++ b/vllm_ascend/ops/rotary_embedding.py @@ -22,7 +22,7 @@ import torch from vllm.model_executor.layers.rotary_embedding import ( DeepseekScalingRotaryEmbedding, RotaryEmbedding) -from vllm_ascend.utils import enable_custom_op +from vllm_ascend.utils import enable_custom_op, is_310p def custom_rotary_embedding_enabled(query, neox_style, head_size): @@ -48,7 +48,8 @@ def rope_forward_oot( if is_neox_style_override is not None: neox_style = is_neox_style_override # adopt custom kernel path for rotary_embedding - if custom_rotary_embedding_enabled(query, neox_style, self.head_size): + if custom_rotary_embedding_enabled(query, neox_style, + self.head_size) and not is_310p(): query, key = torch.ops._C.rotary_embedding( positions, query, diff --git a/vllm_ascend/patch/platform/patch_common/patch_distributed.py b/vllm_ascend/patch/platform/patch_common/patch_distributed.py index 86515df..d094886 100644 --- a/vllm_ascend/patch/platform/patch_common/patch_distributed.py +++ b/vllm_ascend/patch/platform/patch_common/patch_distributed.py @@ -17,6 +17,7 @@ # Adapted from vllm/model_executor/models/qwen2_vl.py # This file is a part of the vllm-ascend project. +import torch import vllm import vllm.distributed import vllm.envs as envs @@ -25,6 +26,8 @@ from vllm.config import ParallelConfig from vllm.distributed.utils import \ stateless_init_torch_distributed_process_group +from vllm_ascend.utils import NullHandle, is_310p + def ascend_destroy_model_parallel(): """Set the groups to none and destroy them.""" @@ -81,3 +84,70 @@ def stateless_init_dp_group(self) -> "ProcessGroup": vllm.distributed.parallel_state.destroy_model_parallel = ascend_destroy_model_parallel ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port ParallelConfig.stateless_init_dp_group = stateless_init_dp_group + + +def communication_adaptation_310p(): + + def broadcast310p(tensor, src, group=None, async_op=False): + rank = torch.distributed.get_rank(group) + world_size = torch.distributed.get_world_size(group) + tensor_list = [torch.empty_like(tensor) for _ in range(world_size)] + tensor_list[rank] = tensor + torch.distributed.all_gather(tensor_list, tensor, group=group) + tensor[...] 
= tensor_list[src] + if async_op: + return NullHandle() + else: + return None + + torch.distributed.broadcast = broadcast310p + torch.distributed.distributed_c10d.broadcast = broadcast310p + + def all_reduce_wrapper_310p(fn): + + def all_reduce( + tensor, + op=torch.distributed.ReduceOp.SUM, + group=None, + async_op=False, + ): + if tensor.dtype != torch.int64: + return fn(tensor, op, group, async_op) + rank = torch.distributed.get_rank(group) + world_size = torch.distributed.get_world_size(group) + tensor_list = [torch.empty_like(tensor) for _ in range(world_size)] + tensor_list[rank] = tensor + torch.distributed.all_gather(tensor_list, tensor, group=group) + if op == torch.distributed.ReduceOp.SUM: + return torch.stack(tensor_list).sum(0) + elif op == torch.distributed.ReduceOp.MAX: + return torch.tensor( + torch.stack(tensor_list).cpu().numpy().max(0), + device=tensor.device, + ) + else: + raise RuntimeError(f"not implement op {op}") + + return all_reduce + + torch.distributed.all_reduce = all_reduce_wrapper_310p( + torch.distributed.all_reduce) + torch.distributed.distributed_c10d.all_reduce = all_reduce_wrapper_310p( + torch.distributed.distributed_c10d.all_reduce) + + def reduce_scatter_310p(output_tensor, input_tensor, group=None): + rank = torch.distributed.get_rank(group) + world_size = torch.distributed.get_world_size(group) + torch.distributed.all_reduce(input_tensor, + torch.distributed.ReduceOp.SUM, + group, + async_op=False) + interval = input_tensor.shape[0] // world_size + output_tensor[:] = input_tensor[rank * interval:(rank + 1) * interval] + + torch.distributed._reduce_scatter_base = reduce_scatter_310p + torch.distributed.distributed_c10d._reduce_scatter_base = reduce_scatter_310p + + +if is_310p(): + communication_adaptation_310p() diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index b9233da..881b732 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -28,7 +28,8 @@ from vllm.logger import logger from vllm.platforms import Platform, PlatformEnum from vllm_ascend.ascend_config import check_ascend_config, init_ascend_config -from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes +from vllm_ascend.utils import (ASCEND_QUATIZATION_METHOD, is_310p, + update_aclgraph_sizes) if TYPE_CHECKING: from vllm.config import ModelConfig, VllmConfig @@ -205,8 +206,9 @@ class NPUPlatform(Platform): cache_config.block_size = 128 if envs.VLLM_USE_V1: - # Activate custom ops for v1. - compilation_config.custom_ops = ["all"] + # Activate custom ops for v1, except on 310P + if not is_310p(): + compilation_config.custom_ops = ["all"] # If ascend_scheduler_config is enabled, # extents original scheduler_config to use AscendScheduler. 
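Note on the 310P NZ layout used throughout the attention changes above: masks and KV caches are padded to multiples of 16 and reshaped from the plain ND layout into the fractal NZ layout before being handed to the fused NPU ops, and the `vllm_ascend/utils.py` diff below adds the helpers (`is_310p`, `aligned_16`, `nd_to_nz_2d`, `nd_to_nz_spec`) that do this. As a rough, shape-only sketch of the ND-to-NZ reshape for a 2-D mask (plain PyTorch, hypothetical sizes, mirroring the `nd_to_nz_2d` helper added below rather than calling any NPU API):

```python
import torch

def round_up_16(x: int) -> int:
    # NZ blocks are 16 elements wide, so every dim is rounded up to a multiple of 16.
    return (x + 15) // 16 * 16

def nd_to_nz_2d_sketch(nd: torch.Tensor) -> torch.Tensor:
    rows, cols = nd.shape
    rows_pad, cols_pad = round_up_16(rows), round_up_16(cols)
    # Zero-pad the columns (last dim) and the rows up to multiples of 16.
    padded = torch.nn.functional.pad(nd, (0, cols_pad - cols, 0, rows_pad - rows))
    # Split the columns into 16-wide blocks, then move the block index ahead of the rows:
    # (rows_pad, cols_pad) -> (1, rows_pad, cols_pad // 16, 16) -> (1, cols_pad // 16, rows_pad, 16)
    return padded.reshape(1, rows_pad, cols_pad // 16, 16).transpose(1, 2).contiguous()

mask = torch.ones(3, 5)                    # hypothetical attention mask
print(nd_to_nz_2d_sketch(mask).shape)      # torch.Size([1, 1, 16, 16])
```

On a real 310P device the reshaped tensor is then tagged with the NZ format via `torch_npu.npu_format_cast(..., ACL_FORMAT_FRACTAL_NZ)`, which is what the metadata builders in `attention.py` / `attention_v1.py` above do.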
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index eeab287..1a59036 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -21,11 +21,12 @@ import atexit import math from contextlib import contextmanager, nullcontext from enum import Enum +from functools import lru_cache from threading import Lock from typing import TYPE_CHECKING, List, Tuple import torch -import torch_npu # noqa: F401 +import torch_npu # noqa: F401 # noqa: F401 import torchair # type: ignore[import] # noqa: F401 from packaging.version import InvalidVersion, Version from torch_npu.npu.streams import Event @@ -57,6 +58,116 @@ ASCEND_QUATIZATION_METHOD = "ascend" CUSTOM_OP_ENABLED = None +SOC_VERSION_INFERENCE_SERIES = ["Ascend310P3"] + +ACL_FORMAT_FRACTAL_ND = 2 +ACL_FORMAT_FRACTAL_NZ = 29 + + +@lru_cache(maxsize=None) +def _get_soc_version(): + """Gets the SOC version and caches it.""" + if not torch.npu.is_available(): + return "" + device_count = torch.npu.device_count() + if device_count <= 0: + return "" + try: + return torch.npu.get_device_name(0) + except Exception: + return "" + + +_SOC_VERSION = _get_soc_version() + + +def is_310p(): + return _SOC_VERSION in SOC_VERSION_INFERENCE_SERIES + + +class NullHandle: + + def __init__(self): + pass + + def wait(self): + pass + + +def _round_up(x: int, align: int): + if align == 0: + return -1 + return (x + align - 1) // align * align + + +def _custom_pad(x, pad_dims): + return torch.nn.functional.pad(x, pad_dims) + + +def _custom_reshape(x, target_shape): + return x.reshape(target_shape) + + +def _custom_transpose(x, dim1, dim2): + return x.transpose(dim1, dim2) + + +def nd_to_nz_2d(in_tensor: torch.Tensor) -> torch.Tensor: + aux_dims = [0, 0, 0, 0] + aux_dims[0] = 1 + aux_dims[1] = _round_up(in_tensor.size(0), 16) + + pad_dims = [0, 0, 0, 0] + pad_dims[3] = _round_up(in_tensor.size(0), 16) - in_tensor.size(0) + + aux_dims[2] = _round_up(in_tensor.size(1), 16) // 16 + aux_dims[3] = 16 + pad_dims[1] = _round_up(in_tensor.size(1), 16) - in_tensor.size(1) + + return _custom_transpose( + _custom_reshape(_custom_pad(in_tensor, pad_dims), aux_dims), 1, + 2).contiguous() + + +def nd_to_nz_spec(mask_tensor: torch.Tensor) -> torch.Tensor: + num_tokens = mask_tensor.shape[0] + max_seq_len = mask_tensor.shape[1] + + tokens_pad = (num_tokens + 15) // 16 * 16 + max_seq_len_pad = (max_seq_len + 15) // 16 * 16 + + mask_tensor_pad = \ + torch.zeros((1, tokens_pad, max_seq_len_pad), dtype=mask_tensor.dtype, device=mask_tensor.device) + mask_tensor_pad[0][:num_tokens, :max_seq_len] = mask_tensor + mask = mask_tensor_pad.reshape( + (1, tokens_pad, max_seq_len_pad // 16, 16)).permute(0, 2, 1, 3) + return mask + + +def aligned_16(tensor: torch.Tensor): + """Aligned tensor for 310P""" + + # Get the size of the current 0th dimension + n = tensor.size(0) + + # Calculate the aligned size + n_aligned = ((n + 15) // 16) * 16 + + # If already aligned, return the original tensor + if n == n_aligned: + return tensor + + # Create a new tensor with shape (n_aligned, H, W) and fill it with zeros + new_tensor = torch.zeros(n_aligned, + *tensor.shape[1:], + dtype=tensor.dtype, + device=tensor.device) + + # Copy the original tensor to the first N positions of the new tensor + new_tensor[:n] = tensor + + return new_tensor + def try_register_lib(lib_name: str, lib_info: str = ""): import importlib diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 2670111..5cf6bb3 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ 
b/vllm_ascend/worker/model_runner_v1.py @@ -74,7 +74,9 @@ from vllm_ascend.attention.attention_v1 import (AscendAttentionState, from vllm_ascend.attention.mla_v1 import CommonAttentionMetadata from vllm_ascend.platform import NPUPlatform from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler -from vllm_ascend.utils import ProfileExecuteDuration, vllm_version_is +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, + ProfileExecuteDuration, is_310p, + vllm_version_is) from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer @@ -1911,6 +1913,8 @@ class NPUModelRunner(LoRAModelRunnerMixin): """ self.kv_cache_config = kv_cache_config import torch_npu + acl_format = ACL_FORMAT_FRACTAL_NZ if is_310p( + ) else ACL_FORMAT_FRACTAL_ND kv_caches: Dict[str, torch.Tensor] = {} self.input_batch = InputBatch( @@ -1968,13 +1972,18 @@ class NPUModelRunner(LoRAModelRunnerMixin): device=self.device) kv_caches[layer_name] = (layer_kv_cache_nope, layer_kv_cache_pe) - torch_npu.npu_format_cast(kv_caches[layer_name][0], 2) - torch_npu.npu_format_cast(kv_caches[layer_name][1], 2) + kv_caches[layer_name] = ( + torch_npu.npu_format_cast(kv_caches[layer_name][0], + acl_format), + torch_npu.npu_format_cast(kv_caches[layer_name][1], + acl_format), + ) else: kv_caches[layer_name] = torch.zeros(kv_cache_shape, dtype=dtype, device=self.device) - torch_npu.npu_format_cast(kv_caches[layer_name], 2) + kv_caches[layer_name] = \ + torch_npu.npu_format_cast(kv_caches[layer_name], acl_format) else: # TODO: add new branches when introducing more types of # KV cache specs. diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py index e78cc3f..bffc6a8 100644 --- a/vllm_ascend/worker/worker.py +++ b/vllm_ascend/worker/worker.py @@ -51,7 +51,8 @@ from vllm_ascend.ascend_config import init_ascend_config from vllm_ascend.device_allocator.camem import CaMemAllocator from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel from vllm_ascend.platform import NPUPlatform -from vllm_ascend.utils import try_register_lib +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, + is_310p, try_register_lib) from vllm_ascend.worker.model_runner import NPUModelRunner from vllm_ascend.worker.pooling_model_runner import NPUPoolingModelRunner @@ -342,17 +343,22 @@ class NPUWorker(LocalOrDistributedWorkerBase): for _ in range(self.parallel_config.pipeline_parallel_size) ] import torch_npu + acl_format = ACL_FORMAT_FRACTAL_NZ if is_310p( + ) else ACL_FORMAT_FRACTAL_ND for ve in range(self.parallel_config.pipeline_parallel_size): num_layers = len(self.cache_engine[ve].gpu_cache) for i in range(num_layers): if torch.is_tensor(self.cache_engine[ve].gpu_cache[i]): - torch_npu.npu_format_cast( - self.cache_engine[ve].gpu_cache[i], 2) + self.cache_engine[ve].gpu_cache[ + i] = torch_npu.npu_format_cast( + self.cache_engine[ve].gpu_cache[i], acl_format) else: - torch_npu.npu_format_cast( - self.cache_engine[ve].gpu_cache[i][0], 2) - torch_npu.npu_format_cast( - self.cache_engine[ve].gpu_cache[i][1], 2) + self.cache_engine[ve].gpu_cache[i][ + 0] = torch_npu.npu_format_cast( + self.cache_engine[ve].gpu_cache[i][0], acl_format) + self.cache_engine[ve].gpu_cache[i][ + 1] = torch_npu.npu_format_cast( + self.cache_engine[ve].gpu_cache[i][1], acl_format) self.gpu_cache = [ self.cache_engine[ve].gpu_cache for ve in range(self.parallel_config.pipeline_parallel_size)